git-server-git.apps.pok.os.sepia.ceph.com Git - rocksdb.git/commitdiff
Move functions from VersionSet to Version
authorIgor Canadi <icanadi@fb.com>
Thu, 16 Jan 2014 00:18:04 +0000 (16:18 -0800)
committerIgor Canadi <icanadi@fb.com>
Thu, 16 Jan 2014 00:18:04 +0000 (16:18 -0800)
Summary:
There were some functions in VersionSet that had no reason to be there instead of Version. Moving them to Version will make column families implementation easier.

The functions moved are:
* NumLevelBytes
* LevelSummary
* LevelFileSummary
* MaxNextLevelOverlappingBytes
* AddLiveFiles (previously AddLiveFilesCurrentVersion())
* NeedSlowdownForNumLevel0Files

The diff continues on (and depends on) D15171

Test Plan: make check

Reviewers: dhruba, haobo, kailiu, sdong, emayanke

Reviewed By: sdong

CC: leveldb
Differential Revision: https://reviews.facebook.net/D15183

271 files changed:
.arcconfig [new file with mode: 0644]
.clang-format [new file with mode: 0644]
.gitignore [new file with mode: 0644]
CONTRIBUTING.md [new file with mode: 0644]
INSTALL.md [new file with mode: 0644]
LICENSE [new file with mode: 0644]
Makefile [new file with mode: 0644]
PATENTS [new file with mode: 0644]
README [new file with mode: 0644]
README.fb [new file with mode: 0644]
build_tools/build_detect_platform [new file with mode: 0755]
build_tools/build_detect_version [new file with mode: 0755]
build_tools/fbcode.clang31.sh [new file with mode: 0644]
build_tools/fbcode.gcc471.sh [new file with mode: 0644]
build_tools/fbcode.gcc481.sh [new file with mode: 0644]
build_tools/format-diff.sh [new file with mode: 0755]
build_tools/mac-install-gflags.sh [new file with mode: 0755]
build_tools/make_new_version.sh [new file with mode: 0755]
build_tools/regression_build_test.sh [new file with mode: 0755]
build_tools/valgrind_test.sh [new file with mode: 0755]
coverage/coverage_test.sh [new file with mode: 0755]
coverage/parse_gcov_output.py [new file with mode: 0644]
db/builder.cc [new file with mode: 0644]
db/builder.h [new file with mode: 0644]
db/c.cc [new file with mode: 0644]
db/c_test.c [new file with mode: 0644]
db/corruption_test.cc [new file with mode: 0644]
db/db_bench.cc [new file with mode: 0644]
db/db_filesnapshot.cc [new file with mode: 0644]
db/db_impl.cc [new file with mode: 0644]
db/db_impl.h [new file with mode: 0644]
db/db_impl_readonly.cc [new file with mode: 0644]
db/db_impl_readonly.h [new file with mode: 0644]
db/db_iter.cc [new file with mode: 0644]
db/db_iter.h [new file with mode: 0644]
db/db_statistics.cc [new file with mode: 0644]
db/db_statistics.h [new file with mode: 0644]
db/db_stats_logger.cc [new file with mode: 0644]
db/db_test.cc [new file with mode: 0644]
db/dbformat.cc [new file with mode: 0644]
db/dbformat.h [new file with mode: 0644]
db/dbformat_test.cc [new file with mode: 0644]
db/deletefile_test.cc [new file with mode: 0644]
db/filename.cc [new file with mode: 0644]
db/filename.h [new file with mode: 0644]
db/filename_test.cc [new file with mode: 0644]
db/log_format.h [new file with mode: 0644]
db/log_reader.cc [new file with mode: 0644]
db/log_reader.h [new file with mode: 0644]
db/log_test.cc [new file with mode: 0644]
db/log_writer.cc [new file with mode: 0644]
db/log_writer.h [new file with mode: 0644]
db/memtable.cc [new file with mode: 0644]
db/memtable.h [new file with mode: 0644]
db/memtablelist.cc [new file with mode: 0644]
db/memtablelist.h [new file with mode: 0644]
db/merge_context.h [new file with mode: 0644]
db/merge_helper.cc [new file with mode: 0644]
db/merge_helper.h [new file with mode: 0644]
db/merge_operator.cc [new file with mode: 0644]
db/merge_test.cc [new file with mode: 0644]
db/perf_context_test.cc [new file with mode: 0644]
db/prefix_filter_iterator.h [new file with mode: 0644]
db/prefix_test.cc [new file with mode: 0644]
db/repair.cc [new file with mode: 0644]
db/simple_table_db_test.cc [new file with mode: 0644]
db/skiplist.h [new file with mode: 0644]
db/skiplist_test.cc [new file with mode: 0644]
db/snapshot.h [new file with mode: 0644]
db/table_cache.cc [new file with mode: 0644]
db/table_cache.h [new file with mode: 0644]
db/table_properties_collector.cc [new file with mode: 0644]
db/table_properties_collector.h [new file with mode: 0644]
db/table_properties_collector_test.cc [new file with mode: 0644]
db/transaction_log_impl.cc [new file with mode: 0644]
db/transaction_log_impl.h [new file with mode: 0644]
db/version_edit.cc [new file with mode: 0644]
db/version_edit.h [new file with mode: 0644]
db/version_edit_test.cc [new file with mode: 0644]
db/version_set.cc [new file with mode: 0644]
db/version_set.h [new file with mode: 0644]
db/version_set_reduce_num_levels.cc [new file with mode: 0644]
db/version_set_test.cc [new file with mode: 0644]
db/write_batch.cc [new file with mode: 0644]
db/write_batch_internal.h [new file with mode: 0644]
db/write_batch_test.cc [new file with mode: 0644]
doc/doc.css [new file with mode: 0644]
doc/index.html [new file with mode: 0644]
doc/log_format.txt [new file with mode: 0644]
doc/rockslogo.jpg [new file with mode: 0644]
doc/rockslogo.png [new file with mode: 0644]
hdfs/README [new file with mode: 0644]
hdfs/env_hdfs.h [new file with mode: 0644]
hdfs/hdfs.h [new file with mode: 0644]
hdfs/libhdfs.a [new file with mode: 0644]
helpers/memenv/memenv.cc [new file with mode: 0644]
helpers/memenv/memenv.h [new file with mode: 0644]
helpers/memenv/memenv_test.cc [new file with mode: 0644]
include/rocksdb/arena.h [new file with mode: 0644]
include/rocksdb/c.h [new file with mode: 0644]
include/rocksdb/cache.h [new file with mode: 0644]
include/rocksdb/compaction_filter.h [new file with mode: 0644]
include/rocksdb/comparator.h [new file with mode: 0644]
include/rocksdb/db.h [new file with mode: 0644]
include/rocksdb/env.h [new file with mode: 0644]
include/rocksdb/filter_policy.h [new file with mode: 0644]
include/rocksdb/flush_block_policy.h [new file with mode: 0644]
include/rocksdb/iterator.h [new file with mode: 0644]
include/rocksdb/ldb_tool.h [new file with mode: 0644]
include/rocksdb/memtablerep.h [new file with mode: 0644]
include/rocksdb/merge_operator.h [new file with mode: 0644]
include/rocksdb/options.h [new file with mode: 0644]
include/rocksdb/perf_context.h [new file with mode: 0644]
include/rocksdb/slice.h [new file with mode: 0644]
include/rocksdb/slice_transform.h [new file with mode: 0644]
include/rocksdb/statistics.h [new file with mode: 0644]
include/rocksdb/status.h [new file with mode: 0644]
include/rocksdb/table.h [new file with mode: 0644]
include/rocksdb/table_properties.h [new file with mode: 0644]
include/rocksdb/transaction_log.h [new file with mode: 0644]
include/rocksdb/types.h [new file with mode: 0644]
include/rocksdb/universal_compaction.h [new file with mode: 0644]
include/rocksdb/write_batch.h [new file with mode: 0644]
include/utilities/backupable_db.h [new file with mode: 0644]
include/utilities/stackable_db.h [new file with mode: 0644]
include/utilities/utility_db.h [new file with mode: 0644]
linters/src/.phutil_module_cache [new file with mode: 0644]
linters/src/__phutil_library_init__.php [new file with mode: 0644]
linters/src/__phutil_library_map__.php [new file with mode: 0644]
linters/src/cpp_linter/FbcodeCppLinter.php [new file with mode: 0644]
linters/src/cpp_linter/PfffCppLinter.php [new file with mode: 0644]
linters/src/lint_engine/FacebookFbcodeLintEngine.php [new file with mode: 0644]
port/README [new file with mode: 0644]
port/atomic_pointer.h [new file with mode: 0644]
port/port.h [new file with mode: 0644]
port/port_example.h [new file with mode: 0644]
port/port_posix.cc [new file with mode: 0644]
port/port_posix.h [new file with mode: 0644]
port/stack_trace.cc [new file with mode: 0644]
port/win/stdint.h [new file with mode: 0644]
table/block.cc [new file with mode: 0644]
table/block.h [new file with mode: 0644]
table/block_based_table_builder.cc [new file with mode: 0644]
table/block_based_table_builder.h [new file with mode: 0644]
table/block_based_table_factory.cc [new file with mode: 0644]
table/block_based_table_factory.h [new file with mode: 0644]
table/block_based_table_reader.cc [new file with mode: 0644]
table/block_based_table_reader.h [new file with mode: 0644]
table/block_builder.cc [new file with mode: 0644]
table/block_builder.h [new file with mode: 0644]
table/block_test.cc [new file with mode: 0644]
table/filter_block.cc [new file with mode: 0644]
table/filter_block.h [new file with mode: 0644]
table/filter_block_test.cc [new file with mode: 0644]
table/flush_block_policy.cc [new file with mode: 0644]
table/format.cc [new file with mode: 0644]
table/format.h [new file with mode: 0644]
table/iter_heap.h [new file with mode: 0644]
table/iterator.cc [new file with mode: 0644]
table/iterator_wrapper.h [new file with mode: 0644]
table/merger.cc [new file with mode: 0644]
table/merger.h [new file with mode: 0644]
table/table_reader_bench.cc [new file with mode: 0644]
table/table_test.cc [new file with mode: 0644]
table/two_level_iterator.cc [new file with mode: 0644]
table/two_level_iterator.h [new file with mode: 0644]
tools/blob_store_bench.cc [new file with mode: 0644]
tools/db_crashtest.py [new file with mode: 0644]
tools/db_crashtest2.py [new file with mode: 0644]
tools/db_repl_stress.cc [new file with mode: 0644]
tools/db_stress.cc [new file with mode: 0644]
tools/ldb.cc [new file with mode: 0644]
tools/ldb_test.py [new file with mode: 0644]
tools/reduce_levels_test.cc [new file with mode: 0644]
tools/shell/DBClientProxy.cpp [new file with mode: 0644]
tools/shell/DBClientProxy.h [new file with mode: 0644]
tools/shell/LeveldbShell.cpp [new file with mode: 0644]
tools/shell/ShellContext.cpp [new file with mode: 0644]
tools/shell/ShellContext.h [new file with mode: 0644]
tools/shell/ShellState.cpp [new file with mode: 0644]
tools/shell/ShellState.h [new file with mode: 0644]
tools/shell/test/DBClientProxyTest.cpp [new file with mode: 0644]
tools/sst_dump.cc [new file with mode: 0644]
util/arena_impl.cc [new file with mode: 0644]
util/arena_impl.h [new file with mode: 0644]
util/arena_test.cc [new file with mode: 0644]
util/auto_roll_logger.cc [new file with mode: 0644]
util/auto_roll_logger.h [new file with mode: 0644]
util/auto_roll_logger_test.cc [new file with mode: 0755]
util/autovector.h [new file with mode: 0644]
util/autovector_test.cc [new file with mode: 0644]
util/bit_set.h [new file with mode: 0644]
util/blob_store.cc [new file with mode: 0644]
util/blob_store.h [new file with mode: 0644]
util/blob_store_test.cc [new file with mode: 0644]
util/bloom.cc [new file with mode: 0644]
util/bloom_test.cc [new file with mode: 0644]
util/build_version.h [new file with mode: 0644]
util/cache.cc [new file with mode: 0644]
util/cache_test.cc [new file with mode: 0644]
util/coding.cc [new file with mode: 0644]
util/coding.h [new file with mode: 0644]
util/coding_test.cc [new file with mode: 0644]
util/comparator.cc [new file with mode: 0644]
util/crc32c.cc [new file with mode: 0644]
util/crc32c.h [new file with mode: 0644]
util/crc32c_test.cc [new file with mode: 0644]
util/env.cc [new file with mode: 0644]
util/env_hdfs.cc [new file with mode: 0644]
util/env_posix.cc [new file with mode: 0644]
util/env_test.cc [new file with mode: 0644]
util/filelock_test.cc [new file with mode: 0644]
util/filter_policy.cc [new file with mode: 0644]
util/hash.cc [new file with mode: 0644]
util/hash.h [new file with mode: 0644]
util/hash_skiplist_rep.cc [new file with mode: 0644]
util/hash_skiplist_rep.h [new file with mode: 0644]
util/histogram.cc [new file with mode: 0644]
util/histogram.h [new file with mode: 0644]
util/histogram_test.cc [new file with mode: 0644]
util/ldb_cmd.cc [new file with mode: 0644]
util/ldb_cmd.h [new file with mode: 0644]
util/ldb_cmd_execute_result.h [new file with mode: 0644]
util/ldb_tool.cc [new file with mode: 0644]
util/logging.cc [new file with mode: 0644]
util/logging.h [new file with mode: 0644]
util/manual_compaction_test.cc [new file with mode: 0644]
util/murmurhash.cc [new file with mode: 0644]
util/murmurhash.h [new file with mode: 0644]
util/mutexlock.h [new file with mode: 0644]
util/options.cc [new file with mode: 0644]
util/perf_context.cc [new file with mode: 0644]
util/perf_context_imp.h [new file with mode: 0644]
util/posix_logger.h [new file with mode: 0644]
util/random.h [new file with mode: 0644]
util/signal_test.cc [new file with mode: 0644]
util/skiplistrep.cc [new file with mode: 0644]
util/slice.cc [new file with mode: 0644]
util/stack_trace.h [new file with mode: 0644]
util/statistics.cc [new file with mode: 0644]
util/statistics_imp.h [new file with mode: 0644]
util/stats_logger.h [new file with mode: 0644]
util/status.cc [new file with mode: 0644]
util/stl_wrappers.h [new file with mode: 0644]
util/stop_watch.h [new file with mode: 0644]
util/string_util.cc [new file with mode: 0644]
util/string_util.h [new file with mode: 0644]
util/testharness.cc [new file with mode: 0644]
util/testharness.h [new file with mode: 0644]
util/testutil.cc [new file with mode: 0644]
util/testutil.h [new file with mode: 0644]
util/vectorrep.cc [new file with mode: 0644]
utilities/backupable/backupable_db.cc [new file with mode: 0644]
utilities/backupable/backupable_db_test.cc [new file with mode: 0644]
utilities/merge_operators.h [new file with mode: 0644]
utilities/merge_operators/put.cc [new file with mode: 0644]
utilities/merge_operators/string_append/stringappend.cc [new file with mode: 0644]
utilities/merge_operators/string_append/stringappend.h [new file with mode: 0644]
utilities/merge_operators/string_append/stringappend2.cc [new file with mode: 0644]
utilities/merge_operators/string_append/stringappend2.h [new file with mode: 0644]
utilities/merge_operators/string_append/stringappend_test.cc [new file with mode: 0644]
utilities/merge_operators/uint64add.cc [new file with mode: 0644]
utilities/redis/README [new file with mode: 0644]
utilities/redis/redis_list_exception.h [new file with mode: 0644]
utilities/redis/redis_list_iterator.h [new file with mode: 0644]
utilities/redis/redis_lists.cc [new file with mode: 0644]
utilities/redis/redis_lists.h [new file with mode: 0644]
utilities/redis/redis_lists_test.cc [new file with mode: 0644]
utilities/ttl/db_ttl.cc [new file with mode: 0644]
utilities/ttl/db_ttl.h [new file with mode: 0644]
utilities/ttl/ttl_test.cc [new file with mode: 0644]

diff --git a/.arcconfig b/.arcconfig
new file mode 100644 (file)
index 0000000..82d1771
--- /dev/null
@@ -0,0 +1,10 @@
+{
+  "project_id" : "leveldb",
+  "conduit_uri" : "https://reviews.facebook.net/",
+  "copyright_holder" : "",
+  "load" : [
+    "linters/src/"
+  ],
+  "lint.engine" : "FacebookFbcodeLintEngine",
+  "lint.engine.single.linter" : "FbcodeCppLinter"
+}
diff --git a/.clang-format b/.clang-format
new file mode 100644 (file)
index 0000000..7c27981
--- /dev/null
@@ -0,0 +1,5 @@
+# Complete list of style options can be found at: 
+# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
+---
+BasedOnStyle: Google
+...
diff --git a/.gitignore b/.gitignore
new file mode 100644 (file)
index 0000000..03a5f17
--- /dev/null
@@ -0,0 +1,22 @@
+build_config.mk
+
+*.a
+*.arc
+*.d
+*.dylib*
+*.gcda
+*.gcno
+*.o
+*.so
+*.so.*
+*_test
+*_bench
+*_stress
+
+ldb
+manifest_dump
+sst_dump
+util/build_version.cc
+build_tools/VALGRIND_LOGS/
+coverage/COVERAGE_REPORT
+.gdbhistory
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644 (file)
index 0000000..3a17a88
--- /dev/null
@@ -0,0 +1,16 @@
+# Contributing to RocksDB
+
+## Contributor License Agreement ("CLA")
+
+In order to accept your pull request, we need you to submit a CLA. You
+only need to do this once, so if you've done this for another Facebook
+open source project, you're good to go. If you are submitting a pull
+request for the first time, just let us know that you have completed
+the CLA and we can cross-check with your GitHub username.
+
+Complete your CLA here: <https://developers.facebook.com/opensource/cla>
+
+## License
+
+By contributing to RocksDB, you agree that your contributions will be
+licensed under the [BSD License](LICENSE).
diff --git a/INSTALL.md b/INSTALL.md
new file mode 100644 (file)
index 0000000..ab04603
--- /dev/null
@@ -0,0 +1,50 @@
+## Dependencies
+
+RocksDB is developed on Linux (CentOS release 5.2), with gcc 4.8.1.
+It depends on gcc with C++11 support.
+
+* RocksDB depends on the following libraries:
+  - [zlib](http://www.zlib.net/) - a library for data compression.
+  - [bzip2](http://www.bzip.org/) - a library for data compression.
+  - [snappy](https://code.google.com/p/snappy/) - a library for fast
+      data compression.
+  - [gflags](https://code.google.com/p/gflags/) - a library that handles
+      command line flags processing.
+
+RocksDB will successfully compile without the compression libraries included,
+but some things may fail. We do not support releases without the compression
+libraries. You are on your own.
+
+## Supported platforms
+
+* **Linux**
+    * Upgrade your gcc to version at least 4.7 to get C++11 support.
+    * Install gflags. First, try: `sudo apt-get install libgflags-dev`.
+      If this doesn't work and you're using Ubuntu, here's a nice tutorial:
+      (http://askubuntu.com/questions/312173/installing-gflags-12-04)
+    * Install snappy. This is usually as easy as:
+      `sudo apt-get install libsnappy-dev`.
+    * Install zlib. Try: `sudo apt-get install zlib1g-dev`.
+    * Install bzip2: `sudo apt-get install libbz2-dev`.
+* **OS X**:
+    * Install latest C++ compiler that supports C++ 11:
+        * Update XCode:  run `xcode-select --install` (or install it from XCode App's setting).
+        * Install via [homebrew](http://brew.sh/).
+            * If you're a first-time developer on MacOS, you still need to run: `xcode-select --install` in your command line.
+            * run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher).
+    * Install zlib, bzip2 and snappy libraries for compression.
+    * Install gflags. We have included a script
+    `build_tools/mac-install-gflags.sh`, which should automatically install it.
+    If you installed gflags by other means (for example, `brew install gflags`),
+    please set `LIBRARY_PATH` and `CPATH` accordingly.
+    * Please note that some of the optimizations/features are disabled in OSX.
+    We did not run any production workloads on it.
+
+## Compilation
+`make clean; make` will compile librocksdb.a (RocksDB static library) and all
+the unit tests. You can run all unit tests with `make check`.
+
+For shared library builds, exec `make librocksdb.so` instead.
+
+If you followed the above steps and your compile or unit tests fail,
+please submit an issue: (https://github.com/facebook/rocksdb/issues)
diff --git a/LICENSE b/LICENSE
new file mode 100644 (file)
index 0000000..716ad9e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,35 @@
+BSD License
+
+For rocksdb software
+
+Copyright (c) 2013, Facebook, Inc.
+All rights reserved.
+---------------------------------------------------------------------
+
+Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..572e42e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,427 @@
+# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+# Inherit some settings from environment variables, if available
+INSTALL_PATH ?= $(CURDIR)
+
+#-----------------------------------------------
+# Uncomment exactly one of the lines labelled (A), (B), and (C) below
+# to switch between compilation modes.
+
+# OPT ?= -DNDEBUG     # (A) Production use (optimized mode)
+OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer
+#-----------------------------------------------
+
+# detect what platform we're building on
+$(shell (export ROCKSDB_ROOT=$(CURDIR); $(CURDIR)/build_tools/build_detect_platform $(CURDIR)/build_config.mk))
+# this file is generated by the previous line to set build flags and sources
+include build_config.mk
+
+# ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc.
+ifdef COMPILE_WITH_ASAN
+       # ASAN compile flags
+       EXEC_LDFLAGS += -fsanitize=address
+       PLATFORM_CCFLAGS += -fsanitize=address
+       PLATFORM_CXXFLAGS += -fsanitize=address
+else
+       # if we're not compiling with ASAN, use jemalloc
+       EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS)
+       PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
+       PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
+endif
+
+WARNING_FLAGS = -Wall -Werror
+CFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
+CXXFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual
+
+LDFLAGS += $(PLATFORM_LDFLAGS)
+
+LIBOBJECTS = $(SOURCES:.cc=.o)
+LIBOBJECTS += $(SOURCESCPP:.cpp=.o)
+MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o)
+
+TESTUTIL = ./util/testutil.o
+TESTHARNESS = ./util/testharness.o $(TESTUTIL)
+VALGRIND_ERROR = 2
+VALGRIND_DIR = build_tools/VALGRIND_LOGS
+VALGRIND_VER := $(join $(VALGRIND_VER),valgrind)
+VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full
+
+TESTS = \
+       db_test \
+       autovector_test \
+       table_properties_collector_test \
+       arena_test \
+       auto_roll_logger_test \
+       block_test \
+       bloom_test \
+       c_test \
+       cache_test \
+       coding_test \
+       corruption_test \
+       crc32c_test \
+       dbformat_test \
+       env_test \
+       blob_store_test \
+       filelock_test \
+       filename_test \
+       filter_block_test \
+       histogram_test \
+       log_test \
+       manual_compaction_test \
+       memenv_test \
+       merge_test \
+       redis_test \
+       reduce_levels_test \
+       simple_table_db_test \
+       skiplist_test \
+       stringappend_test \
+       ttl_test \
+       backupable_db_test \
+       version_edit_test \
+       version_set_test \
+       write_batch_test\
+       deletefile_test \
+       table_test
+
+TOOLS = \
+        sst_dump \
+        db_stress \
+        ldb \
+       db_repl_stress \
+       blob_store_bench
+
+PROGRAMS = db_bench signal_test $(TESTS) $(TOOLS)
+BENCHMARKS = db_bench_sqlite3 db_bench_tree_db table_reader_bench
+
+# The library name is configurable since we are maintaining libraries of both
+# debug/release mode.
+LIBNAME = librocksdb
+LIBRARY = ${LIBNAME}.a
+MEMENVLIBRARY = libmemenv.a
+
+default: all
+
+#-----------------------------------------------
+# Create platform independent shared libraries.
+#-----------------------------------------------
+ifneq ($(PLATFORM_SHARED_EXT),)
+
+ifneq ($(PLATFORM_SHARED_VERSIONED),true)
+SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
+SHARED2 = $(SHARED1)
+SHARED3 = $(SHARED1)
+SHARED = $(SHARED1)
+else
+# Update db.h if you change these.
+SHARED_MAJOR = 2
+SHARED_MINOR = 0
+SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
+SHARED2 = $(SHARED1).$(SHARED_MAJOR)
+SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
+SHARED = $(SHARED1) $(SHARED2) $(SHARED3)
+$(SHARED1): $(SHARED3)
+       ln -fs $(SHARED3) $(SHARED1)
+$(SHARED2): $(SHARED3)
+       ln -fs $(SHARED3) $(SHARED2)
+endif
+
+$(SHARED3): $(LIBOBJECTS)
+       $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LDFLAGS) $(SOURCES)-o $@
+
+endif  # PLATFORM_SHARED_EXT
+
+all: $(LIBRARY) $(PROGRAMS)
+
+.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \
+       release tags valgrind_check whitebox_crash_test format
+
+# Will also generate shared libraries. 
+release:
+       $(MAKE) clean
+       OPT=-DNDEBUG $(MAKE) all -j32
+       OPT=-DNDEBUG $(MAKE) $(SHARED) -j32
+
+coverage:
+       $(MAKE) clean
+       COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check
+       (cd coverage; ./coverage_test.sh)
+       # Delete intermediate files
+       find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
+
+check: all $(PROGRAMS) $(TESTS) $(TOOLS)
+       for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
+       python tools/ldb_test.py
+
+ldb_tests: all $(PROGRAMS) $(TOOLS)
+       python tools/ldb_test.py
+
+crash_test: blackbox_crash_test whitebox_crash_test
+
+blackbox_crash_test: db_stress
+       python -u tools/db_crashtest.py
+
+whitebox_crash_test: db_stress
+       python -u tools/db_crashtest2.py
+
+asan_check:
+       $(MAKE) clean
+       COMPILE_WITH_ASAN=1 $(MAKE) check -j32
+       $(MAKE) clean
+
+asan_crash_test:
+       $(MAKE) clean
+       COMPILE_WITH_ASAN=1 $(MAKE) crash_test -j32
+       $(MAKE) clean
+
+valgrind_check: all $(PROGRAMS) $(TESTS)
+       mkdir -p $(VALGRIND_DIR)
+       echo TESTS THAT HAVE VALGRIND ERRORS > $(VALGRIND_DIR)/valgrind_failed_tests; \
+       echo TIMES in seconds TAKEN BY TESTS ON VALGRIND > $(VALGRIND_DIR)/valgrind_tests_times; \
+       for t in $(filter-out skiplist_test,$(TESTS)); do \
+               stime=`date '+%s'`; \
+               $(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \
+               if [ $$? -eq $(VALGRIND_ERROR) ] ; then \
+                       echo $$t >> $(VALGRIND_DIR)/valgrind_failed_tests; \
+               fi; \
+               etime=`date '+%s'`; \
+               echo $$t $$((etime - stime)) >> $(VALGRIND_DIR)/valgrind_tests_times; \
+       done
+
+clean:
+       -rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) build_config.mk
+       -rm -rf ios-x86/* ios-arm/*
+       -find . -name "*.[od]" -exec rm {} \;
+       -find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
+tags:
+       ctags * -R
+       cscope -b `find . -name '*.cc'` `find . -name '*.h'`
+
+format:
+       build_tools/format-diff.sh
+
+# ---------------------------------------------------------------------------
+#      Unit tests and tools
+# ---------------------------------------------------------------------------
+$(LIBRARY): $(LIBOBJECTS)
+       rm -f $@
+       $(AR) -rs $@ $(LIBOBJECTS)
+
+db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
+       $(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL)
+       $(CXX) tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL)
+       $(CXX) tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+blob_store_bench: tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL)
+       $(CXX) tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL)
+       $(CXX) doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) -lsqlite3 $(COVERAGEFLAGS)
+
+db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL)
+       $(CXX) doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) -lkyotocabinet $(COVERAGEFLAGS)
+
+signal_test: util/signal_test.o $(LIBOBJECTS)
+       $(CXX) util/signal_test.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+blob_store_test: util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(TESTUTIL)
+       $(CXX) util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+redis_test: utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+histogram_test: util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
+       rm -f $@
+       $(AR) -rs $@ $(MEMENVOBJECTS)
+
+memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS)
+       $(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+rocksdb_shell: tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o tools/shell/ShellContext.h tools/shell/ShellState.h tools/shell/DBClientProxy.h $(LIBOBJECTS)
+       $(CXX) tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+DBClientProxy_test: tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY)
+       $(CXX) tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+sst_dump: tools/sst_dump.o $(LIBOBJECTS)
+       $(CXX) tools/sst_dump.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+ldb: tools/ldb.o $(LIBOBJECTS)
+       $(CXX) tools/ldb.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+# ---------------------------------------------------------------------------
+#      Platform-specific compilation
+# ---------------------------------------------------------------------------
+
+ifeq ($(PLATFORM), IOS)
+# For iOS, create universal object files to be used on both the simulator and
+# a device.
+PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
+SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
+DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
+IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/versionCFBundleShortVersionString)
+
+.cc.o:
+       mkdir -p ios-x86/$(dir $@)
+       $(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ $(COVERAGEFLAGS)
+       mkdir -p ios-arm/$(dir $@)
+       $(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ $(COVERAGEFLAGS)
+       lipo ios-x86/$@ ios-arm/$@ -create -output $@
+
+.c.o:
+       mkdir -p ios-x86/$(dir $@)
+       $(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
+       mkdir -p ios-arm/$(dir $@)
+       $(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
+       lipo ios-x86/$@ ios-arm/$@ -create -output $@
+
+else
+.cc.o:
+       $(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS)
+
+.c.o:
+       $(CC) $(CFLAGS) -c $< -o $@
+endif
+
+# ---------------------------------------------------------------------------
+#      Source files dependencies detection
+# ---------------------------------------------------------------------------
+
+# Add proper dependency support so changing a .h file forces a .cc file to
+# rebuild.
+
+# The .d file indicates .cc file's dependencies on .h files. We generate such
+# dependency by g++'s -MM option, whose output is a make dependency rule.
+# The sed command makes sure the "target" file in the generated .d file has
+# the correct path prefix.
+%.d: %.cc
+       $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM $< -o $@
+ifeq ($(PLATFORM), OS_MACOSX)
+       @sed -i '' -e 's,.*:,$*.o:,' $@
+else
+       @sed -i -e 's,.*:,$*.o:,' $@
+endif
+
+DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d))
+
+depend: $(DEPFILES)
+
+# if the make goal is either "clean" or "format", we shouldn't
+# try to import the *.d files.
+# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly
+# working solution.
+ifneq ($(MAKECMDGOALS),clean)
+ifneq ($(MAKECMDGOALS),format)
+-include $(DEPFILES)
+endif
+endif
diff --git a/PATENTS b/PATENTS
new file mode 100644 (file)
index 0000000..8a6fca4
--- /dev/null
+++ b/PATENTS
@@ -0,0 +1,23 @@
+Additional Grant of Patent Rights
+
+“Software” means the rocksdb software distributed by Facebook, Inc.
+
+Facebook hereby grants you a perpetual, worldwide, royalty-free,
+non-exclusive, irrevocable (subject to the termination provision below)
+license under any rights in any patent claims owned by Facebook, to make,
+have made, use, sell, offer to sell, import, and otherwise transfer the
+Software. For avoidance of doubt, no license is granted under Facebook’s
+rights in any patent claims that are infringed by (i) modifications to the
+Software made by you or a third party, or (ii) the Software in combination
+with any software or other technology provided by you or a third party.
+
+The license granted hereunder will terminate, automatically and without
+notice, for anyone that makes any claim (including by filing any lawsuit,
+assertion or other action) alleging (a) direct, indirect, or contributory
+infringement or inducement to infringe any patent: (i) by Facebook or any
+of its subsidiaries or affiliates, whether or not such claim is related
+to the Software, (ii) by any party if such claim arises in whole or in
+part from any software, product or service of Facebook or any of its
+subsidiaries or affiliates, whether or not such claim is related to the
+Software, or (iii) by any party relating to the Software; or (b) that
+any right in any patent claim of Facebook is invalid or unenforceable.
diff --git a/README b/README
new file mode 100644 (file)
index 0000000..473e414
--- /dev/null
+++ b/README
@@ -0,0 +1,82 @@
+rocksdb: A persistent key-value store for flash storage
+Authors: * The Facebook Database Engineering Team
+         * Build on earlier work on leveldb by Sanjay Ghemawat
+           (sanjay@google.com) and Jeff Dean (jeff@google.com)
+
+This code is a library that forms the core building block for a fast
+key value server, especially suited for storing data on flash drives.
+It has a Log-Structured-Merge-Database (LSM) design with flexible tradeoffs
+between Write-Amplification-Factor(WAF), Read-Amplification-Factor (RAF)
+and Space-Amplification-Factor(SAF). It has multi-threaded compactions,
+making it especially suitable for storing multiple terabytes of data in a
+single database.
+
+The core of this code has been derived from open-source leveldb.
+
+The code under this directory implements a system for maintaining a
+persistent key/value store.
+
+See doc/index.html and github wiki (https://github.com/facebook/rocksdb/wiki)
+for more explanation.
+
+The public interface is in include/*.  Callers should not include or
+rely on the details of any other header files in this package.  Those
+internal APIs may be changed without warning.
+
+Guide to header files:
+
+include/rocksdb/db.h
+    Main interface to the DB: Start here
+
+include/rocksdb/options.h
+    Control over the behavior of an entire database, and also
+    control over the behavior of individual reads and writes.
+
+include/rocksdb/comparator.h
+    Abstraction for user-specified comparison function.  If you want
+    just bytewise comparison of keys, you can use the default comparator,
+    but clients can write their own comparator implementations if they
+    want custom ordering (e.g. to handle different character
+    encodings, etc.)
+
+include/rocksdb/iterator.h
+    Interface for iterating over data. You can get an iterator
+    from a DB object.
+
+include/rocksdb/write_batch.h
+    Interface for atomically applying multiple updates to a database.
+
+include/rocksdb/slice.h
+    A simple module for maintaining a pointer and a length into some
+    other byte array.
+
+include/rocksdb/status.h
+    Status is returned from many of the public interfaces and is used
+    to report success and various kinds of errors.
+
+include/rocksdb/env.h
+    Abstraction of the OS environment.  A posix implementation of
+    this interface is in util/env_posix.cc
+
+include/rocksdb/table_builder.h
+    Lower-level modules that most clients probably won't use directly
+
+include/rocksdb/cache.h
+    An API for the block cache.
+
+include/rocksdb/compaction_filter.h
+    An API for an application filter invoked on every compaction.
+
+include/rocksdb/filter_policy.h
+    An API for configuring a bloom filter.
+
+include/rocksdb/memtablerep.h
+    An API for implementing a memtable.
+
+include/rocksdb/statistics.h
+    An API to retrieve various database statistics.
+
+include/rocksdb/transaction_log.h
+    An API to retrieve transaction logs from a database.
+
+Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/
diff --git a/README.fb b/README.fb
new file mode 100644 (file)
index 0000000..d3cc411
--- /dev/null
+++ b/README.fb
@@ -0,0 +1,3 @@
+* Detailed instructions on how to compile using fbcode and jemalloc
+
+* Latest release is 2.7.fb
diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform
new file mode 100755 (executable)
index 0000000..8e83ae4
--- /dev/null
@@ -0,0 +1,292 @@
+#!/bin/sh
+#
+# Detects OS we're compiling on and outputs a file specified by the first
+# argument, which in turn gets read while processing Makefile.
+#
+# The output will set the following variables:
+#   CC                          C Compiler path
+#   CXX                         C++ Compiler path
+#   PLATFORM_LDFLAGS            Linker flags
+#   PLATFORM_SHARED_EXT         Extension for shared libraries
+#   PLATFORM_SHARED_LDFLAGS     Flags for building shared library
+#   PLATFORM_SHARED_CFLAGS      Flags for compiling objects for shared library
+#   PLATFORM_CCFLAGS            C compiler flags
+#   PLATFORM_CXXFLAGS           C++ compiler flags.  Will contain:
+#   PLATFORM_SHARED_VERSIONED   Set to 'true' if platform supports versioned
+#                               shared libraries, empty otherwise.
+#
+# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
+#
+#       -DLEVELDB_PLATFORM_POSIX if cstdatomic is present
+#       -DLEVELDB_PLATFORM_NOATOMIC if it is not
+#       -DSNAPPY                     if the Snappy library is present
+#
+# Using gflags in rocksdb:
+# Our project depends on gflags, which requires users to take some extra steps
+# before they can compile the whole repository:
+#   1. Install gflags. You may download it from here:
+#      https://code.google.com/p/gflags/
+#   2. Once install, add the include path/lib path for gflags to CPATH and
+#      LIBRARY_PATH respectively. If installed with default mode, the
+#      lib and include path will be /usr/local/lib and /usr/local/include
+# Mac user can do this by running build_tools/mac-install-gflags.sh
+
+OUTPUT=$1
+if test -z "$OUTPUT"; then
+  echo "usage: $0 <output-filename>" >&2
+  exit 1
+fi
+
+# we depend on C++11
+PLATFORM_CXXFLAGS="-std=gnu++11"
+# we currently depend on POSIX platform
+COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX"
+
+# Default to fbcode gcc on internal fb machines
+if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
+    FBCODE_BUILD="true"
+    if [ -z "$USE_CLANG" ]; then
+        CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \
+          $(rpm -q --whatprovides redhat-release)`
+        if [ "$CENTOS_VERSION" = "6" ]; then
+          source $PWD/build_tools/fbcode.gcc481.sh
+        else
+          source $PWD/build_tools/fbcode.gcc471.sh
+        fi
+    else
+        source $PWD/build_tools/fbcode.clang31.sh
+    fi
+fi
+
+# Delete existing output, if it exists
+rm -f $OUTPUT
+touch $OUTPUT
+
+if test -z "$CC"; then
+   CC=cc
+fi
+
+if test -z "$CXX"; then
+    CXX=g++
+fi
+
+# Detect OS
+if test -z "$TARGET_OS"; then
+    TARGET_OS=`uname -s`
+fi
+
+COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}"
+CROSS_COMPILE=
+PLATFORM_CCFLAGS=
+PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}"
+PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS"
+PLATFORM_SHARED_EXT="so"
+PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
+PLATFORM_SHARED_CFLAGS="-fPIC"
+PLATFORM_SHARED_VERSIONED=false
+
+# generic port files (working on all platform by #ifdef) go directly in /port
+GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "`
+
+# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
+case "$TARGET_OS" in
+    Darwin)
+        PLATFORM=OS_MACOSX
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX"
+        PLATFORM_SHARED_EXT=dylib
+        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
+        # PORT_FILES=port/darwin/darwin_specific.cc
+        ;;
+    Linux)
+        PLATFORM=OS_LINUX
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX"
+        if [ -z "$USE_CLANG" ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
+        fi
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
+        # PORT_FILES=port/linux/linux_specific.cc
+        ;;
+    SunOS)
+        PLATFORM=OS_SOLARIS
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
+        # PORT_FILES=port/sunos/sunos_specific.cc
+        ;;
+    FreeBSD)
+        PLATFORM=OS_FREEBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
+        # PORT_FILES=port/freebsd/freebsd_specific.cc
+        ;;
+    NetBSD)
+        PLATFORM=OS_NETBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lgcc_s"
+        # PORT_FILES=port/netbsd/netbsd_specific.cc
+        ;;
+    OpenBSD)
+        PLATFORM=OS_OPENBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread"
+        # PORT_FILES=port/openbsd/openbsd_specific.cc
+        ;;
+    DragonFly)
+        PLATFORM=OS_DRAGONFLYBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
+        # PORT_FILES=port/dragonfly/dragonfly_specific.cc
+        ;;
+    OS_ANDROID_CROSSCOMPILE)
+        PLATFORM=OS_ANDROID
+       COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
+       PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS "  # All pthread features are in the Android C library
+        # PORT_FILES=port/android/android.cc
+        CROSS_COMPILE=true
+        ;;
+    *)
+        echo "Unknown platform!" >&2
+        exit 1
+esac
+
+$PWD/build_tools/build_detect_version
+
+# We want to make a list of all cc files within util, db, table, and helpers
+# except for the test and benchmark files. By default, find will output a list
+# of all files matching either rule, so we need to append -print to make the
+# prune take effect.
+DIRS="util db table utilities"
+
+set -f # temporarily disable globbing so that our patterns arent expanded
+PRUNE_TEST="-name *test*.cc -prune"
+PRUNE_BENCH="-name *_bench.cc -prune"
+PORTABLE_FILES=`cd $ROCKSDB_ROOT; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "`
+PORTABLE_CPP=`cd $ROCKSDB_ROOT; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cpp' -print | sort | tr "\n" " "`
+set +f # re-enable globbing
+
+# The sources consist of the portable files, plus the platform-specific port
+# file.
+echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> $OUTPUT
+echo "SOURCESCPP=$PORTABLE_CPP" >> $OUTPUT
+echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
+
+if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then
+    # Cross-compiling; do not try any compilation tests.
+    # Also don't need any compilation tests if compiling on fbcode
+    true
+else
+    # do fPIC on 64 bit in non-fbcode environment
+    # NOTE: TARGET_OS comes from `uname -s` (e.g. Linux, Darwin) and can never
+    # equal x86_64, so the old test never fired; switch on the machine
+    # architecture (`uname -m`) so -fPIC is actually applied on 64-bit hosts.
+    case "`uname -m`" in
+        x86_64)
+            PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS -fPIC"
+    esac
+
+    # If -std=c++0x works, use <atomic>.  Otherwise use port_posix.h.
+    $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <atomic>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT"
+    fi
+
+    # Test whether fallocate is available
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <fcntl.h>
+      int main() {
+       int fd = open("/dev/null", 0);
+       fallocate(fd, 0, 0, 1024);
+      }
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT"
+    fi
+
+    # Test whether Snappy library is installed
+    # http://code.google.com/p/snappy/
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <snappy.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy"
+    fi
+
+
+    # Test whether gflags library is installed
+    # http://code.google.com/p/gflags/
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <gflags/gflags.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
+    fi
+
+    # Test whether zlib library is installed
+    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <zlib.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DZLIB"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz"
+    fi
+
+    # Test whether bzip library is installed
+    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <bzlib.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DBZIP2"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2"
+    fi
+
+    # Test whether tcmalloc is available
+    $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null  <<EOF
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
+    fi
+fi
+
+# shall we use HDFS?
+
+if test "$USE_HDFS"; then
+  if test -z "$JAVA_HOME"; then
+    echo "JAVA_HOME has to be set for HDFS usage."
+    exit 1
+  fi
+  HDFS_CCFLAGS="$HDFS_CCFLAGS -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -DUSE_HDFS"
+  HDFS_LDFLAGS="$HDFS_LDFLAGS -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64"
+  HDFS_LDFLAGS="$HDFS_LDFLAGS -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib"
+  HDFS_LDFLAGS="$HDFS_LDFLAGS -ldl -lverify -ljava -ljvm"
+  COMMON_FLAGS="$COMMON_FLAGS $HDFS_CCFLAGS"
+  PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS $HDFS_LDFLAGS"
+fi
+
+# if Intel SSE instruction set is supported, set USE_SSE=" -msse -msse4.2 "
+COMMON_FLAGS="$COMMON_FLAGS $USE_SSE"
+
+PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
+PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
+
+VALGRIND_VER="$VALGRIND_VER"
+
+echo "CC=$CC" >> $OUTPUT
+echo "CXX=$CXX" >> $OUTPUT
+echo "PLATFORM=$PLATFORM" >> $OUTPUT
+echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
+echo "VALGRIND_VER=$VALGRIND_VER" >> $OUTPUT
+echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
+echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
+echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
+echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
+echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
+echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT
+echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> $OUTPUT
+echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> $OUTPUT
+echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> $OUTPUT
diff --git a/build_tools/build_detect_version b/build_tools/build_detect_version
new file mode 100755 (executable)
index 0000000..f7d711f
--- /dev/null
@@ -0,0 +1,22 @@
+#!/bin/sh
+#
+# Record the version of the source that we are compiling.
+# We keep a record of the git revision in util/version.cc. This source file
+# is then built as a regular source file as part of the compilation process.
+# One can run "strings executable_filename | grep _build_" to find the version of
+# the source that we used to build the executable file.
+
+# Generated file is overwritten on every build so it always reflects HEAD.
+OUTFILE="$PWD/util/build_version.cc"
+
+# Default to an empty sha so builds from a source tarball (no git binary or
+# no .git directory) still succeed; they simply embed an empty revision.
+GIT_SHA=""
+if command -v git >/dev/null 2>&1; then
+    GIT_SHA=$(git rev-parse HEAD 2>/dev/null)
+fi
+
+# __DATE__/__TIME__ are expanded by the compiler at build time, while
+# $(date) below is expanded now, when this script generates the file.
+cat > "${OUTFILE}" <<EOF
+#include "build_version.h"
+const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:${GIT_SHA}";
+const char* rocksdb_build_git_datetime = "rocksdb_build_git_datetime:$(date)";
+const char* rocksdb_build_compile_date = __DATE__;
+const char* rocksdb_build_compile_time = __TIME__;
+EOF
diff --git a/build_tools/fbcode.clang31.sh b/build_tools/fbcode.clang31.sh
new file mode 100644 (file)
index 0000000..25a2ca7
--- /dev/null
@@ -0,0 +1,74 @@
+#!/bin/sh
+#
+# Set environment variables so that we can compile leveldb using
+# fbcode settings.  It uses the latest g++ compiler and also
+# uses jemalloc
+
+TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
+TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
+TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
+TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
+GLIBC_RUNTIME_PATH=/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1
+
+# location of libgcc
+LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include"
+LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs"
+
+# location of glibc
+GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include"
+GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
+
+# location of snappy headers and libraries
+SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
+SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
+
+# location of zlib headers and libraries
+ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
+ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
+GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
+
+# location of bzip headers and libraries
+BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include"
+BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a"
+
+
+# use Intel SSE support for checksum calculations
+export USE_SSE=" -msse -msse4.2 "
+
+CC="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang $CLANG_INCLUDES"
+CXX="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang++ $CLANG_INCLUDES $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
+AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
+RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
+
+CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin -nostdlib "
+CFLAGS+=" -nostdinc -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1 "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/x86_64-facebook-linux "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/backward "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/clang/clang-3.2/0b7c69d/lib/clang/3.2/include "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include/linux "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include "
+CFLAGS+=" -Wall -Wno-sign-compare -Wno-unused-variable -Winvalid-pch -Wno-deprecated -Woverloaded-virtual"
+CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
+CXXFLAGS="$CFLAGS -nostdinc++"
+
+CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
+
+EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
+EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
+EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
+EXEC_LDFLAGS+=" -Wl,--dynamic-linker,$GLIBC_RUNTIME_PATH/lib/ld-linux-x86-64.so.2"
+EXEC_LDFLAGS+=" -B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $GFLAGS_LIBS"
+
+export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED 
diff --git a/build_tools/fbcode.gcc471.sh b/build_tools/fbcode.gcc471.sh
new file mode 100644 (file)
index 0000000..9294057
--- /dev/null
@@ -0,0 +1,70 @@
+#!/bin/sh
+#
+# Set environment variables so that we can compile leveldb using
+# fbcode settings.  It uses the latest g++ compiler and also
+# uses jemalloc
+
+TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
+TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
+TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
+TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
+
+# location of libhdfs libraries
+if test "$USE_HDFS"; then
+  JAVA_HOME="/usr/local/jdk-6u22-64"
+  JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
+  GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1"
+  HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
+  HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
+  HDFSLIB+=" -ldl -lverify -ljava -ljvm "
+fi
+
+# location of libgcc
+LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include"
+LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs"
+
+# location of glibc
+GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include"
+GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
+
+# location of snappy headers and libraries
+SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
+SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
+
+# location of zlib headers and libraries
+ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
+ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
+
+# location of bzip headers and libraries
+BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include"
+BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a"
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
+GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
+
+# use Intel SSE support for checksum calculations
+export USE_SSE=" -msse -msse4.2 "
+
+CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/gcc"
+CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
+AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
+RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
+
+CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
+CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
+CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
+CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2"
+
+EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
+EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
+EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
+
+VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/91ddd43/bin/"
+
+export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER
diff --git a/build_tools/fbcode.gcc481.sh b/build_tools/fbcode.gcc481.sh
new file mode 100644 (file)
index 0000000..e8c9f09
--- /dev/null
@@ -0,0 +1,77 @@
+#!/bin/sh
+#
+# Set environment variables so that we can compile rocksdb using
+# fbcode settings.  It uses the latest g++ compiler and also
+# uses jemalloc
+
+TOOLCHAIN_REV=53dc1fe83f84e9145b9ffb81b81aa7f6a49c87cc
+CENTOS_VERSION=`rpm -q --qf "%{VERSION}" $(rpm -q --whatprovides redhat-release)`
+if [ "$CENTOS_VERSION" = "6" ]; then
+  TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos6-native"
+else
+  TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
+fi
+TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.8.1-glibc-2.17"
+
+# location of libhdfs libraries
+if test "$USE_HDFS"; then
+  JAVA_HOME="/usr/local/jdk-6u22-64"
+  JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
+  GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.8.1-glibc-2.17"
+  HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
+  HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
+  HDFSLIB+=" -ldl -lverify -ljava -ljvm "
+fi
+
+# location of libgcc
+LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/include"
+LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/libs"
+
+# location of glibc
+GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/include"
+GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/lib"
+
+# location of snappy headers and libraries
+SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/include"
+SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/lib/libsnappy.a"
+
+# location of zlib headers and libraries
+ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/include"
+ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/lib/libz.a"
+
+# location of bzip headers and libraries
+BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/include"
+BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/lib/libbz2.a"
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/include"
+GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/lib/libgflags.a"
+
+# location of jemalloc
+JEMALLOC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/include/"
+JEMALLOC_LIB=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/lib/libjemalloc.a"
+
+# use Intel SSE support for checksum calculations
+export USE_SSE=" -msse -msse4.2 "
+
+CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/gcc"
+CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
+AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
+RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
+
+CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
+CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
+CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2"
+
+EXEC_LDFLAGS="-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so"
+EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/675d945/lib/libunwind.a"
+EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
+
+VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/c3f970a/bin/"
+
+export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE
diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh
new file mode 100755 (executable)
index 0000000..758135c
--- /dev/null
@@ -0,0 +1,83 @@
+#!/bin/bash
+set -e
+# If the clang_format_diff.py command is not specified, we assume we are able
+# to access it directly without any path.
+if [ -z $CLANG_FORMAT_DIFF ]
+then
+CLANG_FORMAT_DIFF="clang-format-diff.py"
+fi
+
+# Check clang-format-diff.py
+if ! which $CLANG_FORMAT_DIFF &> /dev/null
+then
+  echo "You didn't have clang-format-diff.py available in your computer!"
+  echo "You can download it by running: "
+  echo "    curl https://fburl.com/clang-format-diff"
+  exit 128
+fi
+
+# Check argparse, a library that clang-format-diff.py requires.
+python 2>/dev/null << EOF
+import argparse
+EOF
+
+if [ "$?" != 0 ]
+then
+  echo "To run clang-format-diff.py, we'll need the library "argparse" to be"
+  echo "installed. You can try either of the follow ways to install it:"
+  echo "  1. Manually download argparse: https://pypi.python.org/pypi/argparse"
+  echo "  2. easy_install argparse (if you have easy_install)"
+  echo "  3. pip install argparse (if you have pip)"
+  exit 129
+fi
+
+# TODO(kailiu) the following work is not complete, since we still need to
+# figure out how to add the files modified by the pre-commit hook to git's commit index.
+#
+# Check if this script has already been added to pre-commit hook.
+# Will suggest user to add this script to pre-commit hook if their pre-commit
+# is empty.
+# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit"
+# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null
+# then
+#   echo "Would you like to add this script to pre-commit hook, which will do "
+#   echo -n "the format check for all the affected lines before you check in (y/n):"
+#   read add_to_hook
+#   if [ "$add_to_hook" == "y" ]
+#   then
+#     ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH
+#   fi
+# fi
+
+# Check the format of recently changed lines,
+diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1)
+
+if [ -z "$diffs" ]
+then
+  echo "Nothing needs to be reformatted!"
+  exit 0
+fi
+
+# Highlight the insertion/deletion from the clang-format-diff.py's output
+COLOR_END="\033[0m"
+COLOR_RED="\033[0;31m" 
+COLOR_GREEN="\033[0;32m" 
+
+echo -e "Detect lines that doesn't follow the format rules:\r"
+# Add the color to the diff. lines added will be green; lines removed will be red.
+echo "$diffs" | 
+  sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" |
+  sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/"
+echo -e "Would you like to fix the format automatically (y/n): \c"
+
+# Make sure under any mode, we can read user input.
+exec < /dev/tty
+read to_fix
+
+if [ "$to_fix" != "y" ]
+then
+  exit 1
+fi
+
+# Do in-place format adjustment.
+git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1
diff --git a/build_tools/mac-install-gflags.sh b/build_tools/mac-install-gflags.sh
new file mode 100755 (executable)
index 0000000..ef0339c
--- /dev/null
@@ -0,0 +1,25 @@
+#!/bin/sh
+# Install gflags for mac developers.
+
+set -e
+
+DIR=`mktemp -d /tmp/rocksdb_gflags_XXXX`
+
+cd $DIR
+wget https://gflags.googlecode.com/files/gflags-2.0.tar.gz
+tar xvfz gflags-2.0.tar.gz
+cd gflags-2.0
+
+./configure
+make
+make install
+
+# Add include/lib path for g++
+echo 'export LIBRARY_PATH+=":/usr/local/lib"' >> ~/.bash_profile
+echo 'export CPATH+=":/usr/local/include"' >> ~/.bash_profile
+
+echo ""
+echo "-----------------------------------------------------------------------------"
+echo "|                         Installation Completed                            |"
+echo "-----------------------------------------------------------------------------"
+echo "Please run `. ~/bash_profile` to be able to compile with gflags"
diff --git a/build_tools/make_new_version.sh b/build_tools/make_new_version.sh
new file mode 100755 (executable)
index 0000000..ca8a212
--- /dev/null
@@ -0,0 +1,61 @@
+#!/bin/bash
+#  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under the BSD-style license found in the
+#  LICENSE file in the root directory of this source tree. An additional grant
+#  of patent rights can be found in the PATENTS file in the same directory.
+
+set -e
+# Print out the colored progress info so that it can be brainlessly 
+# distinguished by users.
+function title() {
+  echo -e "\033[1;32m$*\033[0m"
+}
+
+usage="Create new rocksdb version and prepare it for the release process\n"
+usage+="USAGE: ./make_new_version.sh <version>"
+
+# -- Pre-check
+if [[ $# < 1 ]]; then
+  echo -e $usage
+  exit 1
+fi
+
+ROCKSDB_VERSION=$1
+
+GIT_BRANCH=`git rev-parse --abbrev-ref HEAD`
+if [ $GIT_BRANCH != "master" ]; then
+  echo "Error: Current branch is '$GIT_BRANCH', Please switch to master branch."
+fi
+
+# --Step 1: cutting new tag
+title "Adding new tag for this release ..."
+git tag -a "$ROCKSDB_VERSION.fb" -m "Rocksdb $ROCKSDB_VERSION"
+
+# Setting up the proxy for remote repo access
+export http_proxy=http://172.31.255.99:8080
+export https_proxy="$http_proxy";
+
+title "Pushing new tag to remote repo ..."
+proxycmd.sh git push origin --tags
+
+# --Step 2: Update README.fb
+title "Updating the latest version info in README.fb ..."
+sed -i "s/Latest release is [0-9]\+.[0-9]\+.fb/Latest release is $ROCKSDB_VERSION.fb/" README.fb
+git commit README.fb -m "update the latest version in README.fb to $ROCKSDB_VERSION"
+proxycmd.sh git push
+
+# --Step 3: Prepare this repo for 3rd release
+title "Cleaning up repo ..."
+make clean
+git clean -fxd
+
+title "Generating the build info ..."
+# Comment out the call of `build_detect_version` so that the SHA number and build date of this
+# release will remain constant. Otherwise, every time we run "make", util/build_version.cc will be
+# overridden.
+sed -i 's/^\$PWD\/build_tools\/build_detect_version$//' build_tools/build_detect_platform
+
+# Generate util/build_version.cc
+build_tools/build_detect_version
+
+title "Done!"
diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh
new file mode 100755 (executable)
index 0000000..d38b67c
--- /dev/null
@@ -0,0 +1,308 @@
+#!/bin/bash
+
+set -e
+
+NUM=10000000
+
+if [ $# -eq 1 ];then
+  DATA_DIR=$1
+elif [ $# -eq 2 ];then
+  DATA_DIR=$1
+  STAT_FILE=$2
+fi
+
+# On the production build servers, set data and stat
+# files/directories not in /tmp or else the tempdir cleaning
+# scripts will make you very unhappy.
+DATA_DIR=${DATA_DIR:-$(mktemp -t -d rocksdb_XXXX)}
+STAT_FILE=${STAT_FILE:-$(mktemp -t -u rocksdb_test_stats_XXXX)}
+
+function cleanup {
+  rm -rf $DATA_DIR
+  rm -f $STAT_FILE.fillseq
+  rm -f $STAT_FILE.readrandom
+  rm -f $STAT_FILE.overwrite
+  rm -f $STAT_FILE.memtablefillreadrandom
+}
+
+trap cleanup EXIT
+
+if [ -z $GIT_BRANCH ]; then
+  git_br=`git rev-parse --abbrev-ref HEAD`
+else
+  git_br=$(basename $GIT_BRANCH)
+fi
+
+if [ $git_br == "master" ]; then
+  git_br=""
+else
+  git_br="."$git_br
+fi
+
+make release
+
+# measure fillseq + fill up the DB for overwrite benchmark
+./db_bench \
+    --benchmarks=fillseq \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$NUM \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0  > ${STAT_FILE}.fillseq
+
+# measure overwrite performance
+./db_bench \
+    --benchmarks=overwrite \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$((NUM / 10)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6  \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=8 > ${STAT_FILE}.overwrite
+
+# fill up the db for readrandom benchmark (1GB total size)
+./db_bench \
+    --benchmarks=fillseq \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$NUM \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=1 > /dev/null
+
+# measure readrandom with 6GB block cache
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandom
+
+# measure readrandom with 100MB block cache
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --cache_size=104857600 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandomsmallblockcache
+
+# measure readrandom with 8k data in memtable
+./db_bench \
+    --benchmarks=overwrite,readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --writes=512 \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --write_buffer_size=1000000000 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandom_mem_sst
+
+
+# fill up the db for readrandom benchmark with filluniquerandom (1GB total size)
+./db_bench \
+    --benchmarks=filluniquerandom \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --bloom_bits=10 \
+    --num=$((NUM / 4)) \
+    --writes=$((NUM / 4)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=1 > /dev/null
+
+# dummy test just to compact the data
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$((NUM / 1000)) \
+    --reads=$((NUM / 1000)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > /dev/null
+
+# measure readrandom after load with filluniquerandom with 6GB block cache
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$((NUM / 4)) \
+    --reads=$((NUM / 4)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --disable_auto_compactions=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandom_filluniquerandom
+
+# measure readwhilewriting after load with filluniquerandom with 6GB block cache
+./db_bench \
+    --benchmarks=readwhilewriting \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$((NUM / 4)) \
+    --reads=$((NUM / 4)) \
+    --writes_per_second=1000 \
+    --write_buffer_size=100000000 \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readwhilewriting
+
+# measure memtable performance -- none of the data gets flushed to disk
+./db_bench \
+    --benchmarks=fillrandom,readrandom, \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --num=$((NUM / 10)) \
+    --reads=$NUM \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --write_buffer_size=1000000000 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --value_size=10 \
+    --threads=16 > ${STAT_FILE}.memtablefillreadrandom
+
+# send data to ods
+function send_to_ods {
+  key="$1"
+  value="$2"
+
+  if [ -z $JENKINS_HOME ]; then
+    # running on devbox, just print out the values
+    echo $1 $2
+    return
+  fi
+
+  if [ -z "$value" ];then
+    echo >&2 "ERROR: Key $key doesn't have a value."
+    return
+  fi
+  curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_br&key=$key&value=$value" \
+    --connect-timeout 60
+}
+
+function send_benchmark_to_ods {
+  bench="$1"
+  bench_key="$2"
+  file="$3"
+
+  QPS=$(grep $bench $file | awk '{print $5}')
+  P50_MICROS=$(grep $bench $file -A 4 | tail -n1 | awk '{print $3}' )
+  P75_MICROS=$(grep $bench $file -A 4 | tail -n1 | awk '{print $5}' )
+  P99_MICROS=$(grep $bench $file -A 4 | tail -n1 | awk '{print $7}' )
+
+  send_to_ods rocksdb.build.$bench_key.qps $QPS
+  send_to_ods rocksdb.build.$bench_key.p50_micros $P50_MICROS
+  send_to_ods rocksdb.build.$bench_key.p75_micros $P75_MICROS
+  send_to_ods rocksdb.build.$bench_key.p99_micros $P99_MICROS
+}
+
+send_benchmark_to_ods overwrite overwrite $STAT_FILE.overwrite
+send_benchmark_to_ods fillseq fillseq $STAT_FILE.fillseq
+send_benchmark_to_ods readrandom readrandom $STAT_FILE.readrandom
+send_benchmark_to_ods readrandom readrandom_smallblockcache $STAT_FILE.readrandomsmallblockcache
+send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_mem_sst
+send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom
+send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom
+send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom
+send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting
diff --git a/build_tools/valgrind_test.sh b/build_tools/valgrind_test.sh
new file mode 100755 (executable)
index 0000000..8c7e521
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/bash
+# A shell script for Jenkins to run valgrind on rocksdb tests
+# Returns 0 on success when there are no failed tests
+
+VALGRIND_DIR=build_tools/VALGRIND_LOGS
+make clean
+make -j$(nproc) valgrind_check
+NUM_FAILED_TESTS=$((`wc -l $VALGRIND_DIR/valgrind_failed_tests | awk '{print $1}'` - 1))
+if [ $NUM_FAILED_TESTS -lt 1 ]; then
+  echo No tests have valgrind errors
+  exit 0
+else
+  cat $VALGRIND_DIR/valgrind_failed_tests
+  exit 1
+fi
diff --git a/coverage/coverage_test.sh b/coverage/coverage_test.sh
new file mode 100755 (executable)
index 0000000..7a8b5e0
--- /dev/null
@@ -0,0 +1,73 @@
+#!/bin/bash
+
+# Exit on error.
+set -e
+
+if [ -n "$USE_CLANG" ]; then
+  echo "Error: Coverage test is supported only for gcc."
+  exit 1
+fi
+
+ROOT=".."
+# Fetch right version of gcov
+if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
+  source $ROOT/build_tools/fbcode.gcc471.sh
+  GCOV=$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1/cc6c9dc/bin/gcov
+else
+  GCOV=$(which gcov)
+fi
+
+COVERAGE_DIR="$PWD/COVERAGE_REPORT"
+mkdir -p $COVERAGE_DIR
+
+# Find all gcno files to generate the coverage report
+
+GCNO_FILES=`find $ROOT -name "*.gcno"`
+$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
+  # Parse the raw gcov report to more human readable form.
+  python $ROOT/coverage/parse_gcov_output.py |
+  # Write the output to both stdout and report file.
+  tee $COVERAGE_DIR/coverage_report_all.txt &&
+echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n"
+
+# TODO: we also need to get the files of the latest commits.
+# Get the most recently committed files.
+LATEST_FILES=`
+  git show --pretty="format:" --name-only HEAD |
+  grep -v "^$" |
+  paste -s -d,`
+RECENT_REPORT=$COVERAGE_DIR/coverage_report_recent.txt
+
+echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT
+$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
+  python $ROOT/coverage/parse_gcov_output.py -interested-files $LATEST_FILES |
+  tee -a $RECENT_REPORT &&
+echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n"
+
+# Generate the html report. If we cannot find lcov in this machine, we'll simply
+# skip this step.
+echo "Generating the html coverage report..."
+
+LCOV=$(which lcov || true 2>/dev/null)
+if [ -z $LCOV ]
+then
+  echo "Skip: Cannot find lcov to generate the html report."
+  exit 0
+fi
+
+LCOV_VERSION=$(lcov -v | grep 1.1 || true)
+if [ $LCOV_VERSION ]
+then
+  echo "Not supported lcov version. Expect lcov 1.1."
+  exit 0
+fi
+
+(cd $ROOT; lcov --no-external \
+     --capture  \
+     --directory $PWD \
+     --gcov-tool $GCOV \
+     --output-file $COVERAGE_DIR/coverage.info)
+
+genhtml $COVERAGE_DIR/coverage.info -o $COVERAGE_DIR
+
+echo "HTML Coverage report is generated in $COVERAGE_DIR"
diff --git a/coverage/parse_gcov_output.py b/coverage/parse_gcov_output.py
new file mode 100644 (file)
index 0000000..72e8b07
--- /dev/null
@@ -0,0 +1,118 @@
+import optparse
+import re
+import sys
+
+from optparse import OptionParser
+
+# The gcov report follows a certain pattern: each file has two lines of
+# report, from which we can extract the file name, total lines and coverage
+# percentage.
+def parse_gcov_report(gcov_input):
+    per_file_coverage = {}
+    total_coverage = None
+
+    for line in sys.stdin:
+        line = line.strip()
+
+        # --First line of the coverage report (with file name in it)?
+        match_obj = re.match("^File '(.*)'$", line)
+        if match_obj:
+            # fetch the file name from the first line of the report.
+            current_file = match_obj.group(1)
+            continue
+
+        # -- Second line of the file report (with coverage percentage)
+        match_obj = re.match("^Lines executed:(.*)% of (.*)", line)
+
+        if match_obj:
+            coverage = float(match_obj.group(1))
+            lines = int(match_obj.group(2))
+
+            if current_file is not None:
+                per_file_coverage[current_file] = (coverage, lines)
+                current_file = None
+            else:
+                # If current_file is not set, we reach the last line of report,
+                # which contains the summarized coverage percentage.
+                total_coverage = (coverage, lines)
+            continue
+
+        # If the line doesn't match any of the patterns above, we can simply
+        # ignore it: it is either an empty line or one reporting that no
+        # executable lines were found for the given file.
+        current_file = None
+
+    return per_file_coverage, total_coverage
+
+def get_option_parser():
+    usage = "Parse the gcov output and generate more human-readable code " +\
+            "coverage report."
+    parser = OptionParser(usage)
+
+    parser.add_option(
+        "--interested-files", "-i",
+        dest="filenames",
+        help="Comma separated files names. if specified, we will display " +
+             "the coverage report only for interested source files. " +
+             "Otherwise we will display the coverage report for all " +
+             "source files."
+    )
+    return parser
+
+def display_file_coverage(per_file_coverage, total_coverage):
+    # To print out auto-adjustable column, we need to know the longest
+    # length of file names.
+    max_file_name_length = max(
+        len(fname) for fname in per_file_coverage.keys()
+    )
+
+    # -- Print header
+    # size of separator is determined by 3 column sizes:
+    # file name, coverage percentage and lines.
+    header_template = \
+        "%" + str(max_file_name_length) + "s\t%s\t%s"
+    separator = "-" * (max_file_name_length + 10 + 20)
+    print header_template % ("Filename", "Coverage", "Lines")
+    print separator
+
+    # -- Print body
+    # template for printing coverage report for each file.
+    record_template = "%" + str(max_file_name_length) + "s\t%5.2f%%\t%10d"
+
+    for fname, coverage_info in per_file_coverage.items():
+        coverage, lines = coverage_info
+        print record_template % (fname, coverage, lines)
+
+    # -- Print footer
+    if total_coverage:
+        print separator
+        print record_template % ("Total", total_coverage[0], total_coverage[1])
+
+def report_coverage():
+    parser = get_option_parser()
+    (options, args) = parser.parse_args()
+
+    interested_files = set()
+    if options.filenames is not None:
+        interested_files = set(f.strip() for f in options.filenames.split(','))
+
+    # To make things simple, right now we only read gcov report from the input
+    per_file_coverage, total_coverage = parse_gcov_report(sys.stdin)
+
+    # Check if we need to display coverage info for interested files.
+    if len(interested_files):
+        per_file_coverage = dict(
+            (fname, per_file_coverage[fname]) for fname in interested_files
+            if fname in per_file_coverage
+        )
+        # If we only interested in several files, it makes no sense to report
+        # the total_coverage
+        total_coverage = None
+
+    if not len(per_file_coverage):
+        print >> sys.stderr, "Cannot find coverage info for the given files."
+        return
+    display_file_coverage(per_file_coverage, total_coverage)
+
+if __name__ == "__main__":
+    report_coverage()
diff --git a/db/builder.cc b/db/builder.cc
new file mode 100644 (file)
index 0000000..61671db
--- /dev/null
@@ -0,0 +1,227 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "db/merge_helper.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "rocksdb/db.h"
+#include "rocksdb/table.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/block_based_table_builder.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+class TableFactory;
+
+TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
+                              CompressionType compression_type) {
+  return options.table_factory->GetTableBuilder(options, file,
+                                                compression_type);
+}
+
+Status BuildTable(const std::string& dbname,
+                  Env* env,
+                  const Options& options,
+                  const EnvOptions& soptions,
+                  TableCache* table_cache,
+                  Iterator* iter,
+                  FileMetaData* meta,
+                  const Comparator* user_comparator,
+                  const SequenceNumber newest_snapshot,
+                  const SequenceNumber earliest_seqno_in_memtable,
+                  const CompressionType compression) {
+  Status s;
+  meta->file_size = 0;
+  meta->smallest_seqno = meta->largest_seqno = 0;
+  iter->SeekToFirst();
+
+  // If the sequence number of the smallest entry in the memtable is
+  // smaller than the most recent snapshot, then we do not trigger
+  // removal of duplicate/deleted keys as part of this builder.
+  bool purge = options.purge_redundant_kvs_while_flush;
+  if (earliest_seqno_in_memtable <= newest_snapshot) {
+    purge = false;
+  }
+
+  std::string fname = TableFileName(dbname, meta->number);
+  if (iter->Valid()) {
+    unique_ptr<WritableFile> file;
+    s = env->NewWritableFile(fname, &file, soptions);
+    if (!s.ok()) {
+      return s;
+    }
+
+    TableBuilder* builder = GetTableBuilder(options, file.get(),
+                                            compression);
+
+    // the first key is the smallest key
+    Slice key = iter->key();
+    meta->smallest.DecodeFrom(key);
+    meta->smallest_seqno = GetInternalKeySeqno(key);
+    meta->largest_seqno = meta->smallest_seqno;
+
+    MergeHelper merge(user_comparator, options.merge_operator.get(),
+                      options.info_log.get(),
+                      true /* internal key corruption is not ok */);
+
+    if (purge) {
+      // Ugly walkaround to avoid compiler error for release build
+      bool ok __attribute__((unused)) = true;
+
+      // Will write to builder if current key != prev key
+      ParsedInternalKey prev_ikey;
+      std::string prev_key;
+      bool is_first_key = true;    // Also write if this is the very first key
+
+      while (iter->Valid()) {
+        bool iterator_at_next = false;
+
+        // Get current key
+        ParsedInternalKey this_ikey;
+        Slice key = iter->key();
+        Slice value = iter->value();
+
+        // In-memory key corruption is not ok;
+        // TODO: find a clean way to treat in memory key corruption
+        ok = ParseInternalKey(key, &this_ikey);
+        assert(ok);
+        assert(this_ikey.sequence >= earliest_seqno_in_memtable);
+
+        // If the key is the same as the previous key (and it is not the
+        // first key), then we skip it, since it is an older version.
+        // Otherwise we output the key and mark it as the "new" previous key.
+        if (!is_first_key && !user_comparator->Compare(prev_ikey.user_key,
+                                                       this_ikey.user_key)) {
+          // seqno within the same key are in decreasing order
+          assert(this_ikey.sequence < prev_ikey.sequence);
+        } else {
+          is_first_key = false;
+
+          if (this_ikey.type == kTypeMerge) {
+            // Handle merge-type keys using the MergeHelper
+            // TODO: pass statistics to MergeUntil
+            merge.MergeUntil(iter, 0 /* don't worry about snapshot */);
+            iterator_at_next = true;
+            if (merge.IsSuccess()) {
+              // Merge completed correctly.
+              // Add the resulting merge key/value and continue to next
+              builder->Add(merge.key(), merge.value());
+              prev_key.assign(merge.key().data(), merge.key().size());
+              ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
+              assert(ok);
+            } else {
+              // Merge did not find a Put/Delete.
+              // Can not compact these merges into a kValueType.
+              // Write them out one-by-one. (Proceed back() to front())
+              const std::deque<std::string>& keys = merge.keys();
+              const std::deque<std::string>& values = merge.values();
+              assert(keys.size() == values.size() && keys.size() >= 1);
+              std::deque<std::string>::const_reverse_iterator key_iter;
+              std::deque<std::string>::const_reverse_iterator value_iter;
+              for (key_iter=keys.rbegin(), value_iter = values.rbegin();
+                   key_iter != keys.rend() && value_iter != values.rend();
+                   ++key_iter, ++value_iter) {
+
+                builder->Add(Slice(*key_iter), Slice(*value_iter));
+              }
+
+              // Sanity check. Both iterators should end at the same time
+              assert(key_iter == keys.rend() && value_iter == values.rend());
+
+              prev_key.assign(keys.front());
+              ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
+              assert(ok);
+            }
+          } else {
+            // Handle Put/Delete-type keys by simply writing them
+            builder->Add(key, value);
+            prev_key.assign(key.data(), key.size());
+            ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
+            assert(ok);
+          }
+        }
+
+        if (!iterator_at_next) iter->Next();
+      }
+
+      // The last key is the largest key
+      meta->largest.DecodeFrom(Slice(prev_key));
+      SequenceNumber seqno = GetInternalKeySeqno(Slice(prev_key));
+      meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
+      meta->largest_seqno = std::max(meta->largest_seqno, seqno);
+
+    } else {
+      for (; iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        meta->largest.DecodeFrom(key);
+        builder->Add(key, iter->value());
+        SequenceNumber seqno = GetInternalKeySeqno(key);
+        meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
+        meta->largest_seqno = std::max(meta->largest_seqno, seqno);
+      }
+    }
+
+    // Finish and check for builder errors
+    if (s.ok()) {
+      s = builder->Finish();
+      if (s.ok()) {
+        meta->file_size = builder->FileSize();
+        assert(meta->file_size > 0);
+      }
+    } else {
+      builder->Abandon();
+    }
+    delete builder;
+
+    // Finish and check for file errors
+    if (s.ok() && !options.disableDataSync) {
+      if (options.use_fsync) {
+        StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
+        s = file->Fsync();
+      } else {
+        StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
+        s = file->Sync();
+      }
+    }
+    if (s.ok()) {
+      s = file->Close();
+    }
+
+    if (s.ok()) {
+      // Verify that the table is usable
+      Iterator* it = table_cache->NewIterator(ReadOptions(),
+                                              soptions,
+                                              meta->number,
+                                              meta->file_size);
+      s = it->status();
+      delete it;
+    }
+  }
+
+  // Check for input iterator errors
+  if (!iter->status().ok()) {
+    s = iter->status();
+  }
+
+  if (s.ok() && meta->file_size > 0) {
+    // Keep it
+  } else {
+    env->DeleteFile(fname);
+  }
+  return s;
+}
+
+}  // namespace rocksdb
diff --git a/db/builder.h b/db/builder.h
new file mode 100644 (file)
index 0000000..2600dc2
--- /dev/null
+++ b/db/builder.h
@@ -0,0 +1,48 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include "rocksdb/comparator.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+struct Options;
+struct FileMetaData;
+
+class Env;
+struct EnvOptions;
+class Iterator;
+class TableCache;
+class VersionEdit;
+class TableBuilder;
+class WritableFile;
+
+
+extern TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
+                                     CompressionType compression_type);
+
+// Build a Table file from the contents of *iter.  The generated file
+// will be named according to meta->number.  On success, the rest of
+// *meta will be filled with metadata about the generated table.
+// If no data is present in *iter, meta->file_size will be set to
+// zero, and no Table file will be produced.
+extern Status BuildTable(const std::string& dbname,
+                         Env* env,
+                         const Options& options,
+                         const EnvOptions& soptions,
+                         TableCache* table_cache,
+                         Iterator* iter,
+                         FileMetaData* meta,
+                         const Comparator* user_comparator,
+                         const SequenceNumber newest_snapshot,
+                         const SequenceNumber earliest_seqno_in_memtable,
+                         const CompressionType compression);
+
+}  // namespace rocksdb
diff --git a/db/c.cc b/db/c.cc
new file mode 100644 (file)
index 0000000..68f3613
--- /dev/null
+++ b/db/c.cc
@@ -0,0 +1,842 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/c.h"
+
+#include <stdlib.h>
+#include <unistd.h>
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/universal_compaction.h"
+
+using rocksdb::Cache;
+using rocksdb::Comparator;
+using rocksdb::CompressionType;
+using rocksdb::DB;
+using rocksdb::Env;
+using rocksdb::FileLock;
+using rocksdb::FilterPolicy;
+using rocksdb::Iterator;
+using rocksdb::Logger;
+using rocksdb::NewBloomFilterPolicy;
+using rocksdb::NewLRUCache;
+using rocksdb::Options;
+using rocksdb::RandomAccessFile;
+using rocksdb::Range;
+using rocksdb::ReadOptions;
+using rocksdb::SequentialFile;
+using rocksdb::Slice;
+using rocksdb::Snapshot;
+using rocksdb::Status;
+using rocksdb::WritableFile;
+using rocksdb::WriteBatch;
+using rocksdb::WriteOptions;
+
+using std::shared_ptr;
+
+extern "C" {
+
+struct rocksdb_t              { DB*               rep; };
+struct rocksdb_iterator_t     { Iterator*         rep; };
+struct rocksdb_writebatch_t   { WriteBatch        rep; };
+struct rocksdb_snapshot_t     { const Snapshot*   rep; };
+struct rocksdb_readoptions_t  { ReadOptions       rep; };
+struct rocksdb_writeoptions_t { WriteOptions      rep; };
+struct rocksdb_options_t      { Options           rep; };
+struct rocksdb_seqfile_t      { SequentialFile*   rep; };
+struct rocksdb_randomfile_t   { RandomAccessFile* rep; };
+struct rocksdb_writablefile_t { WritableFile*     rep; };
+struct rocksdb_filelock_t     { FileLock*         rep; };
+struct rocksdb_logger_t       { shared_ptr<Logger>  rep; };
+struct rocksdb_cache_t        { shared_ptr<Cache>   rep; };
+
+struct rocksdb_comparator_t : public Comparator {
+  void* state_;
+  void (*destructor_)(void*);
+  int (*compare_)(
+      void*,
+      const char* a, size_t alen,
+      const char* b, size_t blen);
+  const char* (*name_)(void*);
+
+  virtual ~rocksdb_comparator_t() {
+    (*destructor_)(state_);
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
+  }
+
+  virtual const char* Name() const {
+    return (*name_)(state_);
+  }
+
+  // No-ops since the C binding does not support key shortening methods.
+  virtual void FindShortestSeparator(std::string*, const Slice&) const { }
+  virtual void FindShortSuccessor(std::string* key) const { }
+};
+
+struct rocksdb_filterpolicy_t : public FilterPolicy {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+  char* (*create_)(
+      void*,
+      const char* const* key_array, const size_t* key_length_array,
+      int num_keys,
+      size_t* filter_length);
+  unsigned char (*key_match_)(
+      void*,
+      const char* key, size_t length,
+      const char* filter, size_t filter_length);
+
+  virtual ~rocksdb_filterpolicy_t() {
+    (*destructor_)(state_);
+  }
+
+  virtual const char* Name() const {
+    return (*name_)(state_);
+  }
+
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+    std::vector<const char*> key_pointers(n);
+    std::vector<size_t> key_sizes(n);
+    for (int i = 0; i < n; i++) {
+      key_pointers[i] = keys[i].data();
+      key_sizes[i] = keys[i].size();
+    }
+    size_t len;
+    char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len);
+    dst->append(filter, len);
+    free(filter);
+  }
+
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+    return (*key_match_)(state_, key.data(), key.size(),
+                         filter.data(), filter.size());
+  }
+};
+
+struct rocksdb_env_t {
+  Env* rep;
+  bool is_default;
+};
+
+struct rocksdb_universal_compaction_options_t {
+  rocksdb::CompactionOptionsUniversal *rep;
+};
+
+
+static bool SaveError(char** errptr, const Status& s) {
+  assert(errptr != NULL);
+  if (s.ok()) {
+    return false;
+  } else if (*errptr == NULL) {
+    *errptr = strdup(s.ToString().c_str());
+  } else {
+    // TODO(sanjay): Merge with existing error?
+    free(*errptr);
+    *errptr = strdup(s.ToString().c_str());
+  }
+  return true;
+}
+
+static char* CopyString(const std::string& str) {
+  char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
+  memcpy(result, str.data(), sizeof(char) * str.size());
+  return result;
+}
+
+rocksdb_t* rocksdb_open(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr) {
+  DB* db;
+  if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
+    return NULL;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+void rocksdb_close(rocksdb_t* db) {
+  delete db->rep;
+  delete db;
+}
+
+void rocksdb_put(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr) {
+  SaveError(errptr,
+            db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_delete(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    char** errptr) {
+  SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
+}
+
+
+void rocksdb_write(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_t* batch,
+    char** errptr) {
+  SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+char* rocksdb_get(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr) {
+  char* result = NULL;
+  std::string tmp;
+  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+rocksdb_iterator_t* rocksdb_create_iterator(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = db->rep->NewIterator(options->rep);
+  return result;
+}
+
+const rocksdb_snapshot_t* rocksdb_create_snapshot(
+    rocksdb_t* db) {
+  rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+  result->rep = db->rep->GetSnapshot();
+  return result;
+}
+
+void rocksdb_release_snapshot(
+    rocksdb_t* db,
+    const rocksdb_snapshot_t* snapshot) {
+  db->rep->ReleaseSnapshot(snapshot->rep);
+  delete snapshot;
+}
+
+char* rocksdb_property_value(
+    rocksdb_t* db,
+    const char* propname) {
+  std::string tmp;
+  if (db->rep->GetProperty(Slice(propname), &tmp)) {
+    // We use strdup() since we expect human readable output.
+    return strdup(tmp.c_str());
+  } else {
+    return NULL;
+  }
+}
+
+void rocksdb_approximate_sizes(
+    rocksdb_t* db,
+    int num_ranges,
+    const char* const* range_start_key, const size_t* range_start_key_len,
+    const char* const* range_limit_key, const size_t* range_limit_key_len,
+    uint64_t* sizes) {
+  Range* ranges = new Range[num_ranges];
+  for (int i = 0; i < num_ranges; i++) {
+    ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+    ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+  }
+  db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
+  delete[] ranges;
+}
+
+void rocksdb_compact_range(
+    rocksdb_t* db,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      // Pass NULL Slice if corresponding "const char*" is NULL
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : NULL),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : NULL));
+}
+
+void rocksdb_destroy_db(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr) {
+  SaveError(errptr, DestroyDB(name, options->rep));
+}
+
+void rocksdb_repair_db(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr) {
+  SaveError(errptr, RepairDB(name, options->rep));
+}
+
+void rocksdb_iter_destroy(rocksdb_iterator_t* iter) {
+  delete iter->rep;
+  delete iter;
+}
+
+unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) {
+  return iter->rep->Valid();
+}
+
+void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) {
+  iter->rep->SeekToFirst();
+}
+
+void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) {
+  iter->rep->SeekToLast();
+}
+
+void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) {
+  iter->rep->Seek(Slice(k, klen));
+}
+
+void rocksdb_iter_next(rocksdb_iterator_t* iter) {
+  iter->rep->Next();
+}
+
+void rocksdb_iter_prev(rocksdb_iterator_t* iter) {
+  iter->rep->Prev();
+}
+
+const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) {
+  Slice s = iter->rep->key();
+  *klen = s.size();
+  return s.data();
+}
+
+const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) {
+  Slice s = iter->rep->value();
+  *vlen = s.size();
+  return s.data();
+}
+
+void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) {
+  SaveError(errptr, iter->rep->status());
+}
+
+rocksdb_writebatch_t* rocksdb_writebatch_create() {
+  return new rocksdb_writebatch_t;
+}
+
+void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) {
+  delete b;
+}
+
+void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) {
+  b->rep.Clear();
+}
+
+void rocksdb_writebatch_put(
+    rocksdb_writebatch_t* b,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep.Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_delete(
+    rocksdb_writebatch_t* b,
+    const char* key, size_t klen) {
+  b->rep.Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_iterate(
+    rocksdb_writebatch_t* b,
+    void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen)) {
+  class H : public WriteBatch::Handler {
+   public:
+    void* state_;
+    void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+    void (*deleted_)(void*, const char* k, size_t klen);
+    virtual void Put(const Slice& key, const Slice& value) {
+      (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+    }
+    virtual void Delete(const Slice& key) {
+      (*deleted_)(state_, key.data(), key.size());
+    }
+  };
+  H handler;
+  handler.state_ = state;
+  handler.put_ = put;
+  handler.deleted_ = deleted;
+  b->rep.Iterate(&handler);
+}
+
+rocksdb_options_t* rocksdb_options_create() {
+  return new rocksdb_options_t;
+}
+
+void rocksdb_options_destroy(rocksdb_options_t* options) {
+  delete options;
+}
+
+void rocksdb_options_set_comparator(
+    rocksdb_options_t* opt,
+    rocksdb_comparator_t* cmp) {
+  opt->rep.comparator = cmp;
+}
+
+void rocksdb_options_set_filter_policy(
+    rocksdb_options_t* opt,
+    rocksdb_filterpolicy_t* policy) {
+  opt->rep.filter_policy = policy;
+}
+
+void rocksdb_options_set_create_if_missing(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.create_if_missing = v;
+}
+
+void rocksdb_options_set_error_if_exists(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.error_if_exists = v;
+}
+
+void rocksdb_options_set_paranoid_checks(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.paranoid_checks = v;
+}
+
+void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) {
+  opt->rep.env = (env ? env->rep : NULL);
+}
+
+void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) {
+  if (l) {
+    opt->rep.info_log = l->rep;
+  }
+}
+
+void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
+  opt->rep.write_buffer_size = s;
+}
+
+void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
+  opt->rep.max_open_files = n;
+}
+
+void rocksdb_options_set_cache(rocksdb_options_t* opt, rocksdb_cache_t* c) {
+  if (c) {
+    opt->rep.block_cache = c->rep;
+  }
+}
+
+void rocksdb_options_set_block_size(rocksdb_options_t* opt, size_t s) {
+  opt->rep.block_size = s;
+}
+
+void rocksdb_options_set_block_restart_interval(rocksdb_options_t* opt, int n) {
+  opt->rep.block_restart_interval = n;
+}
+
+void rocksdb_options_set_target_file_size_base(
+    rocksdb_options_t* opt, uint64_t n) {
+  opt->rep.target_file_size_base = n;
+}
+
+void rocksdb_options_set_target_file_size_multiplier(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.target_file_size_multiplier = n;
+}
+
+void rocksdb_options_set_max_bytes_for_level_base(
+    rocksdb_options_t* opt, uint64_t n) {
+  opt->rep.max_bytes_for_level_base = n;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.max_bytes_for_level_multiplier = n;
+}
+
+void rocksdb_options_set_expanded_compaction_factor(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.expanded_compaction_factor = n;
+}
+
+void rocksdb_options_set_max_grandparent_overlap_factor(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.max_grandparent_overlap_factor = n;
+}
+
+void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
+  opt->rep.num_levels = n;
+}
+
+void rocksdb_options_set_level0_file_num_compaction_trigger(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.level0_file_num_compaction_trigger = n;
+}
+
+void rocksdb_options_set_level0_slowdown_writes_trigger(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.level0_slowdown_writes_trigger = n;
+}
+
+void rocksdb_options_set_level0_stop_writes_trigger(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.level0_stop_writes_trigger = n;
+}
+
+void rocksdb_options_set_max_mem_compaction_level(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.max_mem_compaction_level = n;
+}
+
+void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) {
+  opt->rep.compression = static_cast<CompressionType>(t);
+}
+
+void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt,
+                                               int* level_values,
+                                               size_t num_levels) {
+  opt->rep.compression_per_level.resize(num_levels);
+  for (size_t i = 0; i < num_levels; ++i) {
+    opt->rep.compression_per_level[i] =
+      static_cast<CompressionType>(level_values[i]);
+  }
+}
+
+void rocksdb_options_set_compression_options(
+    rocksdb_options_t* opt, int w_bits, int level, int strategy) {
+  opt->rep.compression_opts.window_bits = w_bits;
+  opt->rep.compression_opts.level = level;
+  opt->rep.compression_opts.strategy = strategy;
+}
+
+void rocksdb_options_set_disable_data_sync(
+    rocksdb_options_t* opt, int disable_data_sync) {
+  opt->rep.disableDataSync = disable_data_sync;
+}
+
+void rocksdb_options_set_use_fsync(
+    rocksdb_options_t* opt, int use_fsync) {
+  opt->rep.use_fsync = use_fsync;
+}
+
+void rocksdb_options_set_db_stats_log_interval(
+    rocksdb_options_t* opt, int db_stats_log_interval) {
+  opt->rep.db_stats_log_interval = db_stats_log_interval;
+}
+
+void rocksdb_options_set_db_log_dir(
+    rocksdb_options_t* opt, const char* db_log_dir) {
+  opt->rep.db_log_dir = db_log_dir;
+}
+
+void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) {
+  opt->rep.WAL_ttl_seconds = ttl;
+}
+
+void rocksdb_options_set_WAL_size_limit_MB(
+    rocksdb_options_t* opt, uint64_t limit) {
+  opt->rep.WAL_size_limit_MB = limit;
+}
+
+void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, int n) {
+  opt->rep.max_write_buffer_number = n;
+}
+
+void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t* opt, int n) {
+  opt->rep.min_write_buffer_number_to_merge = n;
+}
+
+void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) {
+  opt->rep.max_background_compactions = n;
+}
+
+void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) {
+  opt->rep.max_background_flushes = n;
+}
+
+void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int disable) {
+  opt->rep.disable_auto_compactions = disable;
+}
+
+void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t* opt, int disable) {
+  opt->rep.disable_seek_compaction = disable;
+}
+
+void rocksdb_options_set_source_compaction_factor(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.source_compaction_factor = n;
+}
+
+void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) {
+  opt->rep.PrepareForBulkLoad();
+}
+
+void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t *opt) {
+  // Allocate a fresh factory per call: memtable_factory is a shared_ptr, so
+  // handing the same raw pointer (via a function-local static) to several
+  // options objects would make each shared_ptr claim ownership and delete
+  // the factory more than once. A per-call allocation keeps ownership
+  // unique to this options object.
+  opt->rep.memtable_factory.reset(new rocksdb::VectorRepFactory);
+}
+
+void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) {
+  opt->rep.compaction_style = static_cast<rocksdb::CompactionStyle>(style);
+}
+
+void rocksdb_options_set_universal_compaction_options(rocksdb_options_t *opt, rocksdb_universal_compaction_options_t *uco) {
+  opt->rep.compaction_options_universal = *(uco->rep);
+}
+
+/*
+TODO:
+merge_operator
+compaction_filter
+prefix_extractor
+whole_key_filtering
+max_bytes_for_level_multiplier_additional
+delete_obsolete_files_period_micros
+max_log_file_size
+log_file_time_to_roll
+keep_log_file_num
+soft_rate_limit
+hard_rate_limit
+rate_limit_delay_max_milliseconds
+max_manifest_file_size
+no_block_cache
+table_cache_numshardbits
+table_cache_remove_scan_count_limit
+arena_block_size
+manifest_preallocation_size
+purge_redundant_kvs_while_flush
+allow_os_buffer
+allow_mmap_reads
+allow_mmap_writes
+is_fd_close_on_exec
+skip_log_error_on_recovery
+stats_dump_period_sec
+block_size_deviation
+advise_random_on_open
+access_hint_on_compaction_start
+use_adaptive_mutex
+bytes_per_sync
+filter_deletes
+max_sequential_skip_in_iterations
+table_factory
+table_properties_collectors
+inplace_update_support
+inplace_update_num_locks
+*/
+
+rocksdb_comparator_t* rocksdb_comparator_create(
+    void* state,
+    void (*destructor)(void*),
+    int (*compare)(
+        void*,
+        const char* a, size_t alen,
+        const char* b, size_t blen),
+    const char* (*name)(void*)) {
+  rocksdb_comparator_t* result = new rocksdb_comparator_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->compare_ = compare;
+  result->name_ = name;
+  return result;
+}
+
+void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) {
+  delete cmp;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*create_filter)(
+        void*,
+        const char* const* key_array, const size_t* key_length_array,
+        int num_keys,
+        size_t* filter_length),
+    unsigned char (*key_may_match)(
+        void*,
+        const char* key, size_t length,
+        const char* filter, size_t filter_length),
+    const char* (*name)(void*)) {
+  rocksdb_filterpolicy_t* result = new rocksdb_filterpolicy_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->create_ = create_filter;
+  result->key_match_ = key_may_match;
+  result->name_ = name;
+  return result;
+}
+
+void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) {
+  delete filter;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) {
+  // Make a rocksdb_filterpolicy_t, but override all of its methods so
+  // they delegate to a NewBloomFilterPolicy() instead of user
+  // supplied C functions.
+  struct Wrapper : public rocksdb_filterpolicy_t {
+    const FilterPolicy* rep_;
+    ~Wrapper() { delete rep_; }
+    const char* Name() const { return rep_->Name(); }
+    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+      return rep_->CreateFilter(keys, n, dst);
+    }
+    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+      return rep_->KeyMayMatch(key, filter);
+    }
+    static void DoNothing(void*) { }
+  };
+  Wrapper* wrapper = new Wrapper;
+  wrapper->rep_ = NewBloomFilterPolicy(bits_per_key);
+  wrapper->state_ = NULL;
+  wrapper->destructor_ = &Wrapper::DoNothing;
+  return wrapper;
+}
+
+rocksdb_readoptions_t* rocksdb_readoptions_create() {
+  return new rocksdb_readoptions_t;
+}
+
+void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) {
+  delete opt;
+}
+
+void rocksdb_readoptions_set_verify_checksums(
+    rocksdb_readoptions_t* opt,
+    unsigned char v) {
+  opt->rep.verify_checksums = v;
+}
+
+void rocksdb_readoptions_set_fill_cache(
+    rocksdb_readoptions_t* opt, unsigned char v) {
+  opt->rep.fill_cache = v;
+}
+
+void rocksdb_readoptions_set_snapshot(
+    rocksdb_readoptions_t* opt,
+    const rocksdb_snapshot_t* snap) {
+  opt->rep.snapshot = (snap ? snap->rep : NULL);
+}
+
+rocksdb_writeoptions_t* rocksdb_writeoptions_create() {
+  return new rocksdb_writeoptions_t;
+}
+
+void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) {
+  delete opt;
+}
+
+void rocksdb_writeoptions_set_sync(
+    rocksdb_writeoptions_t* opt, unsigned char v) {
+  opt->rep.sync = v;
+}
+
+void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable) {
+  opt->rep.disableWAL = disable;
+}
+
+
+rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) {
+  rocksdb_cache_t* c = new rocksdb_cache_t;
+  c->rep = NewLRUCache(capacity);
+  return c;
+}
+
+void rocksdb_cache_destroy(rocksdb_cache_t* cache) {
+  delete cache;
+}
+
+rocksdb_env_t* rocksdb_create_default_env() {
+  rocksdb_env_t* result = new rocksdb_env_t;
+  result->rep = Env::Default();
+  result->is_default = true;
+  return result;
+}
+
+void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) {
+  env->rep->SetBackgroundThreads(n);
+}
+
+void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) {
+  env->rep->SetBackgroundThreads(n, Env::HIGH);
+}
+
+void rocksdb_env_destroy(rocksdb_env_t* env) {
+  if (!env->is_default) delete env->rep;
+  delete env;
+}
+
+rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() {
+  rocksdb_universal_compaction_options_t* result = new rocksdb_universal_compaction_options_t;
+  result->rep = new rocksdb::CompactionOptionsUniversal;
+  return result;
+}
+
+void rocksdb_universal_compaction_options_set_size_ratio(
+  rocksdb_universal_compaction_options_t* uco, int ratio) {
+  uco->rep->size_ratio = ratio;
+}
+
+void rocksdb_universal_compaction_options_set_min_merge_width(
+  rocksdb_universal_compaction_options_t* uco, int w) {
+  uco->rep->min_merge_width = w;
+}
+
+void rocksdb_universal_compaction_options_set_max_merge_width(
+  rocksdb_universal_compaction_options_t* uco, int w) {
+  uco->rep->max_merge_width = w;
+}
+
+void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+  rocksdb_universal_compaction_options_t* uco, int p) {
+  uco->rep->max_size_amplification_percent = p;
+}
+
+void rocksdb_universal_compaction_options_set_compression_size_percent(
+  rocksdb_universal_compaction_options_t* uco, int p) {
+  uco->rep->compression_size_percent = p;
+}
+
+void rocksdb_universal_compaction_options_set_stop_style(
+  rocksdb_universal_compaction_options_t* uco, int style) {
+  uco->rep->stop_style = static_cast<rocksdb::CompactionStopStyle>(style);
+}
+
+void rocksdb_universal_compaction_options_destroy(
+  rocksdb_universal_compaction_options_t* uco) {
+  delete uco->rep;
+  delete uco;
+}
+
+}  // end extern "C"
diff --git a/db/c_test.c b/db/c_test.c
new file mode 100644 (file)
index 0000000..8c5e8e5
--- /dev/null
+++ b/db/c_test.c
@@ -0,0 +1,390 @@
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+   Use of this source code is governed by a BSD-style license that can be
+   found in the LICENSE file. See the AUTHORS file for names of contributors. */
+
+#include "rocksdb/c.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+// Name of the test phase currently running; echoed in failure messages.
+const char* phase = "";
+// Filesystem path of the scratch database used by this test.
+static char dbname[200];
+
+// Announces a test phase on stderr and records it so the Check* helpers
+// can name the phase in failure output.
+static void StartPhase(const char* name) {
+  fprintf(stderr, "=== Test %s\n", name);
+  phase = name;
+}
+
+// Returns $TEST_TMPDIR when set and non-empty, otherwise "/tmp".
+static const char* GetTempDir(void) {
+    const char* ret = getenv("TEST_TMPDIR");
+    if (ret == NULL || ret[0] == '\0')
+        ret = "/tmp";
+    return ret;
+}
+
+/* Aborts (reporting file, line and current phase) if err is a non-NULL
+   error string returned by a rocksdb_* call. */
+#define CheckNoError(err)                                               \
+  if ((err) != NULL) {                                                  \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
+    abort();                                                            \
+  }
+
+/* Aborts (reporting file, line, phase and the condition text) if cond
+   evaluates to false. */
+#define CheckCondition(cond)                                            \
+  if (!(cond)) {                                                        \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
+    abort();                                                            \
+  }
+
+// Asserts that the n-byte buffer v equals the NUL-terminated string
+// `expected`; both NULL counts as equal (used for "key absent" checks).
+// Aborts with a diagnostic naming the current phase on mismatch.
+static void CheckEqual(const char* expected, const char* v, size_t n) {
+  if (expected == NULL && v == NULL) {
+    // ok: both absent
+    return;
+  } else if (expected != NULL && v != NULL && n == strlen(expected) &&
+             memcmp(expected, v, n) == 0) {
+    // ok: same length, same bytes
+    return;
+  } else {
+    fprintf(stderr, "%s: expected '%s', got '%s'\n",
+            phase,
+            (expected ? expected : "(null)"),
+            (v ? v : "(null)"));  // was "(null" — unbalanced paren in output
+    abort();
+  }
+}
+
+// free()s *ptr if non-NULL and resets it to NULL, making repeated calls
+// on the same slot safe.
+static void Free(char** ptr) {
+  if (*ptr) {
+    free(*ptr);
+    *ptr = NULL;
+  }
+}
+
+// Reads `key` via rocksdb_get and asserts the value equals `expected`.
+// expected == NULL asserts the key is absent (a NULL returned value).
+static void CheckGet(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    const char* key,
+    const char* expected) {
+  char* err = NULL;
+  size_t val_len;
+  char* val;
+  val = rocksdb_get(db, options, key, strlen(key), &val_len, &err);
+  CheckNoError(err);
+  CheckEqual(expected, val, val_len);
+  Free(&val);
+}
+
+// Asserts the iterator is currently positioned at exactly (key, val).
+static void CheckIter(rocksdb_iterator_t* iter,
+                      const char* key, const char* val) {
+  size_t len;
+  const char* str;
+  str = rocksdb_iter_key(iter, &len);
+  CheckEqual(key, str, len);
+  str = rocksdb_iter_value(iter, &len);
+  CheckEqual(val, str, len);
+}
+
+// Callback from rocksdb_writebatch_iterate()
+// Verifies the puts replay in insertion order — ("bar","b") first, then
+// ("box","c") — and advances the shared *state counter.
+static void CheckPut(void* ptr,
+                     const char* k, size_t klen,
+                     const char* v, size_t vlen) {
+  int* state = (int*) ptr;
+  CheckCondition(*state < 2);
+  switch (*state) {
+    case 0:
+      CheckEqual("bar", k, klen);
+      CheckEqual("b", v, vlen);
+      break;
+    case 1:
+      CheckEqual("box", k, klen);
+      CheckEqual("c", v, vlen);
+      break;
+  }
+  (*state)++;
+}
+
+// Callback from rocksdb_writebatch_iterate()
+// Verifies the delete of "bar" replays third (*state == 2) and advances
+// the counter.
+static void CheckDel(void* ptr, const char* k, size_t klen) {
+  int* state = (int*) ptr;
+  CheckCondition(*state == 2);
+  CheckEqual("bar", k, klen);
+  (*state)++;
+}
+
+// Custom comparator callbacks for rocksdb_comparator_create(); no state
+// is allocated, so destroy is a no-op.
+static void CmpDestroy(void* arg) { }
+
+// Plain bytewise comparison; when one key is a prefix of the other the
+// shorter key orders first.
+static int CmpCompare(void* arg, const char* a, size_t alen,
+                      const char* b, size_t blen) {
+  int n = (alen < blen) ? alen : blen;
+  int r = memcmp(a, b, n);
+  if (r == 0) {
+    if (alen < blen) r = -1;
+    else if (alen > blen) r = +1;
+  }
+  return r;
+}
+
+// Name reported for the custom comparator.
+static const char* CmpName(void* arg) {
+  return "foo";
+}
+
+// Custom filter policy
+// When set to 0, FilterKeyMatch reports a miss for every key, so reads
+// must come back not-found; destroy is a no-op (no state).
+static unsigned char fake_filter_result = 1;
+static void FilterDestroy(void* arg) { }
+// Name under which the custom filter policy is registered.
+static const char* FilterName(void* arg) {
+  return "TestFilter";
+}
+// Ignores the keys and always emits the 4-byte filter "fake"; the
+// caller takes ownership of the malloc'd buffer.
+static char* FilterCreate(
+    void* arg,
+    const char* const* key_array, const size_t* key_length_array,
+    int num_keys,
+    size_t* filter_length) {
+  *filter_length = 4;
+  char* result = malloc(4);
+  memcpy(result, "fake", 4);
+  return result;
+}
+// Validates that the filter blob built by FilterCreate() round-tripped
+// intact, then returns the canned match result (fake_filter_result).
+// Marked static for consistency with the other file-local filter
+// callbacks (FilterDestroy/FilterName/FilterCreate) — it is only ever
+// referenced by name inside this file.
+static unsigned char FilterKeyMatch(
+    void* arg,
+    const char* key, size_t length,
+    const char* filter, size_t filter_length) {
+  CheckCondition(filter_length == 4);
+  CheckCondition(memcmp(filter, "fake", 4) == 0);
+  return fake_filter_result;
+}
+
+// End-to-end smoke test of the RocksDB C API: object creation, open
+// failure and success, puts/gets, write batches, iteration, approximate
+// sizes, properties, snapshots, repair, and both custom and bloom filter
+// policies. Any failure aborts via the Check* helpers above.
+int main(int argc, char** argv) {
+  rocksdb_t* db;
+  rocksdb_comparator_t* cmp;
+  rocksdb_cache_t* cache;
+  rocksdb_env_t* env;
+  rocksdb_options_t* options;
+  rocksdb_readoptions_t* roptions;
+  rocksdb_writeoptions_t* woptions;
+  char* err = NULL;
+  int run = -1;
+
+  // Per-user scratch path under the temp dir.
+  snprintf(dbname, sizeof(dbname),
+           "%s/rocksdb_c_test-%d",
+           GetTempDir(),
+           ((int) geteuid()));
+
+  StartPhase("create_objects");
+  cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
+  env = rocksdb_create_default_env();
+  cache = rocksdb_cache_create_lru(100000);
+
+  options = rocksdb_options_create();
+  rocksdb_options_set_comparator(options, cmp);
+  rocksdb_options_set_error_if_exists(options, 1);
+  rocksdb_options_set_cache(options, cache);
+  rocksdb_options_set_env(options, env);
+  rocksdb_options_set_info_log(options, NULL);
+  rocksdb_options_set_write_buffer_size(options, 100000);
+  rocksdb_options_set_paranoid_checks(options, 1);
+  rocksdb_options_set_max_open_files(options, 10);
+  rocksdb_options_set_block_size(options, 1024);
+  rocksdb_options_set_block_restart_interval(options, 8);
+  rocksdb_options_set_compression(options, rocksdb_no_compression);
+  rocksdb_options_set_compression_options(options, -14, -1, 0);
+  int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression,
+                              rocksdb_no_compression, rocksdb_no_compression};
+  rocksdb_options_set_compression_per_level(options, compression_levels, 4);
+
+  roptions = rocksdb_readoptions_create();
+  rocksdb_readoptions_set_verify_checksums(roptions, 1);
+  rocksdb_readoptions_set_fill_cache(roptions, 0);
+
+  woptions = rocksdb_writeoptions_create();
+  rocksdb_writeoptions_set_sync(woptions, 1);
+
+  StartPhase("destroy");
+  rocksdb_destroy_db(options, dbname, &err);
+  Free(&err);
+
+  // Opening a missing db with create_if_missing unset must fail.
+  StartPhase("open_error");
+  db = rocksdb_open(options, dbname, &err);
+  CheckCondition(err != NULL);
+  Free(&err);
+
+  StartPhase("open");
+  rocksdb_options_set_create_if_missing(options, 1);
+  db = rocksdb_open(options, dbname, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "foo", NULL);
+
+  StartPhase("put");
+  rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("compactall");
+  rocksdb_compact_range(db, NULL, 0, NULL, 0);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("compactrange");
+  rocksdb_compact_range(db, "a", 1, "z", 1);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("writebatch");
+  {
+    rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+    rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
+    rocksdb_writebatch_clear(wb);
+    rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+    rocksdb_writebatch_put(wb, "box", 3, "c", 1);
+    rocksdb_writebatch_delete(wb, "bar", 3);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", "hello");
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "box", "c");
+    // Replay the batch through CheckPut/CheckDel; pos counts callbacks.
+    int pos = 0;
+    rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
+    CheckCondition(pos == 3);
+    rocksdb_writebatch_destroy(wb);
+  }
+
+  StartPhase("iter");
+  {
+    rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_next(iter);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_prev(iter);
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_prev(iter);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_last(iter);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_seek(iter, "b", 1);
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+  }
+
+  StartPhase("approximate_sizes");
+  {
+    int i;
+    int n = 20000;
+    char keybuf[100];
+    char valbuf[100];
+    uint64_t sizes[2];
+    const char* start[2] = { "a", "k00000000000000010000" };
+    size_t start_len[2] = { 1, 21 };
+    const char* limit[2] = { "k00000000000000010000", "z" };
+    size_t limit_len[2] = { 21, 1 };
+    rocksdb_writeoptions_set_sync(woptions, 0);
+    for (i = 0; i < n; i++) {
+      snprintf(keybuf, sizeof(keybuf), "k%020d", i);
+      snprintf(valbuf, sizeof(valbuf), "v%020d", i);
+      rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
+                  &err);
+      CheckNoError(err);
+    }
+    rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
+    CheckCondition(sizes[0] > 0);
+    CheckCondition(sizes[1] > 0);
+  }
+
+  StartPhase("property");
+  {
+    char* prop = rocksdb_property_value(db, "nosuchprop");
+    CheckCondition(prop == NULL);
+    prop = rocksdb_property_value(db, "rocksdb.stats");
+    CheckCondition(prop != NULL);
+    Free(&prop);
+  }
+
+  StartPhase("snapshot");
+  {
+    const rocksdb_snapshot_t* snap;
+    snap = rocksdb_create_snapshot(db);
+    rocksdb_delete(db, woptions, "foo", 3, &err);
+    CheckNoError(err);
+    // Reads through the snapshot still see the deleted key.
+    rocksdb_readoptions_set_snapshot(roptions, snap);
+    CheckGet(db, roptions, "foo", "hello");
+    rocksdb_readoptions_set_snapshot(roptions, NULL);
+    CheckGet(db, roptions, "foo", NULL);
+    rocksdb_release_snapshot(db, snap);
+  }
+
+  StartPhase("repair");
+  {
+    // If we do not compact here, then the lazy deletion of
+    // files (https://reviews.facebook.net/D6123) would leave
+    // around deleted files and the repair process will find
+    // those files and put them back into the database.
+    rocksdb_compact_range(db, NULL, 0, NULL, 0);
+    rocksdb_close(db);
+    rocksdb_options_set_create_if_missing(options, 0);
+    rocksdb_options_set_error_if_exists(options, 0);
+    rocksdb_repair_db(options, dbname, &err);
+    CheckNoError(err);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", NULL);
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "box", "c");
+    rocksdb_options_set_create_if_missing(options, 1);
+    rocksdb_options_set_error_if_exists(options, 1);
+  }
+
+  StartPhase("filter");
+  for (run = 0; run < 2; run++) {
+    // First run uses custom filter, second run uses bloom filter
+    CheckNoError(err);
+    rocksdb_filterpolicy_t* policy;
+    if (run == 0) {
+      policy = rocksdb_filterpolicy_create(
+          NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName);
+    } else {
+      policy = rocksdb_filterpolicy_create_bloom(10);
+    }
+
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    rocksdb_options_set_filter_policy(options, policy);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+    rocksdb_compact_range(db, NULL, 0, NULL, 0);
+
+    fake_filter_result = 1;
+    CheckGet(db, roptions, "foo", "foovalue");
+    CheckGet(db, roptions, "bar", "barvalue");
+    // BUG FIX: this used to read `if (phase == 0)`. `phase` is a
+    // const char* (pointing at "filter" here), so comparing it to 0
+    // was always false and the negative-filter checks below never
+    // executed. The intent is to run them only for the custom filter
+    // (run == 0), where a miss can be forced via fake_filter_result.
+    if (run == 0) {
+      // Must not find value when custom filter returns false
+      fake_filter_result = 0;
+      CheckGet(db, roptions, "foo", NULL);
+      CheckGet(db, roptions, "bar", NULL);
+      fake_filter_result = 1;
+
+      CheckGet(db, roptions, "foo", "foovalue");
+      CheckGet(db, roptions, "bar", "barvalue");
+    }
+    rocksdb_options_set_filter_policy(options, NULL);
+    rocksdb_filterpolicy_destroy(policy);
+  }
+
+  StartPhase("cleanup");
+  rocksdb_close(db);
+  rocksdb_options_destroy(options);
+  rocksdb_readoptions_destroy(roptions);
+  rocksdb_writeoptions_destroy(woptions);
+  rocksdb_cache_destroy(cache);
+  rocksdb_comparator_destroy(cmp);
+  rocksdb_env_destroy(env);
+
+  fprintf(stderr, "PASS\n");
+  return 0;
+}
diff --git a/db/corruption_test.cc b/db/corruption_test.cc
new file mode 100644 (file)
index 0000000..e7b7b4c
--- /dev/null
@@ -0,0 +1,378 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/db.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/write_batch.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+static const int kValueSize = 1000;
+
+// Fixture for the corruption tests: owns a DB opened on a
+// fault-injecting test::ErrorEnv with a tiny block cache, plus helpers
+// to populate keys, corrupt on-disk files, and count surviving records.
+class CorruptionTest {
+ public:
+  test::ErrorEnv env_;
+  std::string dbname_;
+  shared_ptr<Cache> tiny_cache_;
+  Options options_;
+  DB* db_;
+
+  CorruptionTest() {
+    tiny_cache_ = NewLRUCache(100);
+    options_.env = &env_;
+    dbname_ = test::TmpDir() + "/db_test";
+    DestroyDB(dbname_, options_);
+
+    db_ = nullptr;
+    options_.create_if_missing = true;
+    options_.block_size_deviation = 0; // make unit test pass for now
+    Reopen();
+    options_.create_if_missing = false;
+  }
+
+  ~CorruptionTest() {
+     delete db_;
+     DestroyDB(dbname_, Options());
+  }
+
+  // Closes any open DB and reopens dbname_, forcing the error env, the
+  // tiny block cache, and small block/arena settings onto `options`.
+  Status TryReopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opt = (options ? *options : options_);
+    opt.env = &env_;
+    opt.block_cache = tiny_cache_;
+    opt.block_size_deviation = 0;
+    opt.arena_block_size = 4096;
+    return DB::Open(opt, dbname_, &db_);
+  }
+
+  // Like TryReopen, but asserts that the open succeeds.
+  void Reopen(Options* options = nullptr) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  // Closes the DB and runs RepairDB over its directory.
+  void RepairDB() {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(::rocksdb::RepairDB(dbname_, options_));
+  }
+
+  // Writes keys 0..n-1 (one write batch per key) with deterministic
+  // per-key values from Value().
+  void Build(int n) {
+    std::string key_space, value_space;
+    WriteBatch batch;
+    for (int i = 0; i < n; i++) {
+      //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
+      Slice key = Key(i, &key_space);
+      batch.Clear();
+      batch.Put(key, Value(i, &value_space));
+      ASSERT_OK(db_->Write(WriteOptions(), &batch));
+    }
+  }
+
+  // Scans the whole DB, counting records whose key parses and whose
+  // value matches the expected Value(); asserts the count of intact
+  // records lies in [min_expected, max_expected] and logs the tallies.
+  void Check(int min_expected, int max_expected) {
+    unsigned int next_expected = 0;
+    int missed = 0;
+    int bad_keys = 0;
+    int bad_values = 0;
+    int correct = 0;
+    std::string value_space;
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      uint64_t key;
+      Slice in(iter->key());
+      if (!ConsumeDecimalNumber(&in, &key) ||
+          !in.empty() ||
+          key < next_expected) {
+        bad_keys++;
+        continue;
+      }
+      missed += (key - next_expected);
+      next_expected = key + 1;
+      if (iter->value() != Value(key, &value_space)) {
+        bad_values++;
+      } else {
+        correct++;
+      }
+    }
+    delete iter;
+
+    fprintf(stderr,
+            "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
+            min_expected, max_expected, correct, bad_keys, bad_values, missed);
+    ASSERT_LE(min_expected, correct);
+    ASSERT_GE(max_expected, correct);
+  }
+
+  // XORs bytes_to_corrupt bytes with 0x80 at `offset` (negative offsets
+  // count back from end-of-file) in the NEWEST file of `filetype`.
+  // NOTE(review): directory listing goes through env_ but the read/write
+  // below use Env::Default() — presumably fine because ErrorEnv wraps the
+  // default env; confirm against test::ErrorEnv.
+  void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+    // Pick file to corrupt
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_.GetChildren(dbname_, &filenames));
+    uint64_t number;
+    FileType type;
+    std::string fname;
+    int picked_number = -1;
+    for (unsigned int i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type) &&
+          type == filetype &&
+          int(number) > picked_number) {  // Pick latest file
+        fname = dbname_ + "/" + filenames[i];
+        picked_number = number;
+      }
+    }
+    ASSERT_TRUE(!fname.empty()) << filetype;
+
+    struct stat sbuf;
+    if (stat(fname.c_str(), &sbuf) != 0) {
+      const char* msg = strerror(errno);
+      ASSERT_TRUE(false) << fname << ": " << msg;
+    }
+
+    if (offset < 0) {
+      // Relative to end of file; make it absolute
+      if (-offset > sbuf.st_size) {
+        offset = 0;
+      } else {
+        offset = sbuf.st_size + offset;
+      }
+    }
+    // Clamp the corruption span to the actual file size.
+    if (offset > sbuf.st_size) {
+      offset = sbuf.st_size;
+    }
+    if (offset + bytes_to_corrupt > sbuf.st_size) {
+      bytes_to_corrupt = sbuf.st_size - offset;
+    }
+
+    // Do it
+    std::string contents;
+    Status s = ReadFileToString(Env::Default(), fname, &contents);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    for (int i = 0; i < bytes_to_corrupt; i++) {
+      contents[i + offset] ^= 0x80;
+    }
+    s = WriteStringToFile(Env::Default(), contents, fname);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+  }
+
+  // Returns the named DB property parsed as an int, or -1 on failure.
+  int Property(const std::string& name) {
+    std::string property;
+    int result;
+    if (db_->GetProperty(name, &property) &&
+        sscanf(property.c_str(), "%d", &result) == 1) {
+      return result;
+    } else {
+      return -1;
+    }
+  }
+
+  // Return the ith key
+  Slice Key(int i, std::string* storage) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%016d", i);
+    storage->assign(buf, strlen(buf));
+    return Slice(*storage);
+  }
+
+  // Return the value to associate with the specified key
+  Slice Value(int k, std::string* storage) {
+    Random r(k);
+    return test::RandomString(&r, kValueSize, storage);
+  }
+};
+
+// WAL corruption: flipping bytes in the first two log blocks must not
+// prevent reopen; records outside the damaged blocks survive.
+TEST(CorruptionTest, Recovery) {
+  Build(100);
+  Check(100, 100);
+  Corrupt(kLogFile, 19, 1);      // WriteBatch tag for first record
+  Corrupt(kLogFile, log::kBlockSize + 1000, 1);  // Somewhere in second block
+  Reopen();
+
+  // The 64 records in the first two log blocks are completely lost.
+  Check(36, 36);
+}
+
+// Injected writable-file failures must cause DB::Open to return non-ok.
+TEST(CorruptionTest, RecoverWriteError) {
+  env_.writable_file_error_ = true;
+  Status s = TryReopen();
+  ASSERT_TRUE(!s.ok());
+}
+
+// With file-creation failures injected, writes must eventually fail and,
+// once one fails, every subsequent write must fail too; clearing the
+// fault and reopening must then succeed.
+TEST(CorruptionTest, NewFileErrorDuringWrite) {
+  // Do enough writing to force minor compaction
+  env_.writable_file_error_ = true;
+  const int num = 3 + (Options().write_buffer_size / kValueSize);
+  std::string value_storage;
+  Status s;
+  bool failed = false;
+  for (int i = 0; i < num; i++) {
+    WriteBatch batch;
+    batch.Put("a", Value(100, &value_storage));
+    s = db_->Write(WriteOptions(), &batch);
+    if (!s.ok()) {
+      failed = true;
+    }
+    // Once failed, no later write may succeed.
+    ASSERT_TRUE(!failed || !s.ok());
+  }
+  ASSERT_TRUE(!s.ok());
+  ASSERT_GE(env_.num_writable_file_errors_, 1);
+  env_.writable_file_error_ = false;
+  Reopen();
+}
+
+// Flipping one byte in a fully-compacted table file loses at most one of
+// the 100 records.
+TEST(CorruptionTest, TableFile) {
+  Build(100);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+  dbi->TEST_CompactRange(0, nullptr, nullptr);
+  dbi->TEST_CompactRange(1, nullptr, nullptr);
+
+  Corrupt(kTableFile, 100, 1);
+  Check(99, 99);
+}
+
+// Corrupting 500 bytes near the end of a table file (where index data
+// lives) loses a bounded range of records, but the DB still reopens.
+TEST(CorruptionTest, TableFileIndexData) {
+  Build(10000);  // Enough to build multiple Tables
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+
+  Corrupt(kTableFile, -2000, 500);
+  Reopen();
+  Check(5000, 9999);
+}
+
+// RepairDB must rebuild the descriptor so the DB reopens with every
+// record intact.
+TEST(CorruptionTest, MissingDescriptor) {
+  Build(1000);
+  RepairDB();
+  Reopen();
+  Check(1000, 1000);
+}
+
+// After RepairDB, the last sequence number must be preserved so a new
+// write is not shadowed by an older one.
+TEST(CorruptionTest, SequenceNumberRecovery) {
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
+  RepairDB();
+  Reopen();
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v5", v);
+  // Write something.  If sequence number was not recovered properly,
+  // it will be hidden by an earlier write.
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v6", v);
+  Reopen();
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v6", v);
+}
+
+// A corrupted descriptor must fail reopen; RepairDB must restore access
+// to the previously-flushed data.
+TEST(CorruptionTest, CorruptedDescriptor) {
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+  dbi->TEST_CompactRange(0, nullptr, nullptr);
+
+  Corrupt(kDescriptorFile, 0, 1000);
+  Status s = TryReopen();
+  ASSERT_TRUE(!s.ok());
+
+  RepairDB();
+  Reopen();
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("hello", v);
+}
+
+// A corrupted record in a compaction input drops one of ten records, and
+// later compactions (forced by bulk writes) must still succeed.
+TEST(CorruptionTest, CompactionInputError) {
+  Build(10);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+  const int last = dbi->MaxMemCompactionLevel();
+  ASSERT_EQ(1, Property("rocksdb.num-files-at-level" + NumberToString(last)));
+
+  Corrupt(kTableFile, 100, 1);
+  Check(9, 9);
+
+  // Force compactions by writing lots of values
+  Build(10000);
+  Check(10000, 10000);
+}
+
+// In paranoid mode, corruption detected during compaction must put the
+// DB into a persistent error state: once one write fails, every
+// subsequent write must fail too.
+TEST(CorruptionTest, CompactionInputErrorParanoid) {
+  Options options;
+  options.paranoid_checks = true;
+  options.write_buffer_size = 1048576;
+  Reopen(&options);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+
+  // Fill levels >= 1 so memtable compaction outputs to level 1
+  for (int level = 1; level < dbi->NumberLevels(); level++) {
+    dbi->Put(WriteOptions(), "", "begin");
+    dbi->Put(WriteOptions(), "~", "end");
+    dbi->TEST_FlushMemTable();
+  }
+
+  Build(10);
+  dbi->TEST_FlushMemTable();
+  dbi->TEST_WaitForCompact();
+  ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
+
+  Corrupt(kTableFile, 100, 1);
+  Check(9, 9);
+
+  // Write must eventually fail because of corrupted table
+  Status s;
+  std::string tmp1, tmp2;
+  bool failed = false;
+  for (int i = 0; i < 10000 && s.ok(); i++) {
+    s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
+    if (!s.ok()) {
+      failed = true;
+    }
+    // if one write failed, every subsequent write must fail, too
+    ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
+  }
+  ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
+}
+
+// Corruption in one table file must not affect reads and writes of keys
+// that are unrelated to it, both before and after a flush.
+TEST(CorruptionTest, UnrelatedKeys) {
+  Build(10);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+  Corrupt(kTableFile, 100, 1);
+
+  std::string tmp1, tmp2;
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+  dbi->TEST_FlushMemTable();
+  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+}
+
+}  // namespace rocksdb
+
+// Test-harness entry point: runs every TEST registered above.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/db_bench.cc b/db/db_bench.cc
new file mode 100644 (file)
index 0000000..e0ba582
--- /dev/null
@@ -0,0 +1,2585 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cstddef>
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <gflags/gflags.h>
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "db/db_statistics.h"
+#include "rocksdb/options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/statistics.h"
+#include "port/port.h"
+#include "util/bit_set.h"
+#include "util/crc32c.h"
+#include "util/histogram.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/stack_trace.h"
+#include "util/string_util.h"
+#include "util/testutil.h"
+#include "hdfs/env_hdfs.h"
+#include "utilities/merge_operators.h"
+
+
+// --benchmarks: ordered, comma-separated list of workloads to run; the
+// help string below documents each recognized workload name.
+DEFINE_string(benchmarks,
+
+              "fillseq,"
+              "fillsync,"
+              "fillrandom,"
+              "overwrite,"
+              "readrandom,"
+              "readrandom,"
+              "readseq,"
+              "readreverse,"
+              "compact,"
+              "readrandom,"
+              "readseq,"
+              "readtocache,"
+              "readreverse,"
+              "readwhilewriting,"
+              "readrandomwriterandom,"
+              "updaterandom,"
+              "randomwithverify,"
+              "fill100K,"
+              "crc32c,"
+              "snappycomp,"
+              "snappyuncomp,"
+              "acquireload,"
+              "fillfromstdin,",
+
+              "Comma-separated list of operations to run in the specified order"
+              "Actual benchmarks:\n"
+              "\tfillseq       -- write N values in sequential key"
+              " order in async mode\n"
+              "\tfillrandom    -- write N values in random key order in async"
+              " mode\n"
+              "\toverwrite     -- overwrite N values in random key order in"
+              " async mode\n"
+              "\tfillsync      -- write N/100 values in random key order in "
+              "sync mode\n"
+              "\tfill100K      -- write N/1000 100K values in random order in"
+              " async mode\n"
+              "\tdeleteseq     -- delete N keys in sequential order\n"
+              "\tdeleterandom  -- delete N keys in random order\n"
+              "\treadseq       -- read N times sequentially\n"
+              "\treadtocache   -- 1 thread reading database sequentially\n"
+              "\treadreverse   -- read N times in reverse order\n"
+              "\treadrandom    -- read N times in random order\n"
+              "\treadmissing   -- read N missing keys in random order\n"
+              "\treadhot       -- read N times in random order from 1% section "
+              "of DB\n"
+              "\treadwhilewriting      -- 1 writer, N threads doing random "
+              "reads\n"
+              "\treadrandomwriterandom -- N threads doing random-read, "
+              "random-write\n"
+              "\tprefixscanrandom      -- prefix scan N times in random order\n"
+              "\tupdaterandom  -- N threads doing read-modify-write for random "
+              "keys\n"
+              "\tappendrandom  -- N threads doing read-modify-write with "
+              "growing values\n"
+              "\tmergerandom   -- same as updaterandom/appendrandom using merge"
+              " operator. "
+              "Must be used with merge_operator\n"
+              "\treadrandommergerandom -- perform N random read-or-merge "
+              "operations. Must be used with merge_operator\n"
+              "\tseekrandom    -- N random seeks\n"
+              "\tcrc32c        -- repeated crc32c of 4K of data\n"
+              "\tacquireload   -- load N*1000 times\n"
+              "Meta operations:\n"
+              "\tcompact     -- Compact the entire DB\n"
+              "\tstats       -- Print DB stats\n"
+              "\tlevelstats  -- Print the number of files and bytes per level\n"
+              "\tsstables    -- Print sstable info\n"
+              "\theapprofile -- Dump a heap profile (if supported by this"
+              " port)\n");
+
+// Workload sizing and shape flags; see each flag's help string for the
+// precise semantics.
+DEFINE_int64(num, 1000000, "Number of key/values to place in database");
+
+DEFINE_int64(numdistinct, 1000,
+             "Number of distinct keys to use. Used in RandomWithVerify to "
+             "read/write on fewer keys so that gets are more likely to find the"
+             " key and puts are more likely to update the same key");
+
+DEFINE_int64(merge_keys, -1,
+             "Number of distinct keys to use for MergeRandom and "
+             "ReadRandomMergeRandom. "
+             "If negative, there will be FLAGS_num keys.");
+
+DEFINE_int64(reads, -1, "Number of read operations to do.  "
+             "If negative, do FLAGS_num reads.");
+
+DEFINE_int64(read_range, 1, "When ==1 reads use ::Get, when >1 reads use"
+             " an iterator");
+
+DEFINE_bool(use_prefix_blooms, false, "Whether to place prefixes in blooms");
+
+DEFINE_bool(use_prefix_api, false, "Whether to set ReadOptions.prefix for"
+            " prefixscanrandom. If true, use_prefix_blooms must also be true.");
+
+DEFINE_int64(seed, 0, "Seed base for random number generators. "
+             "When 0 it is deterministic.");
+
+DEFINE_int32(threads, 1, "Number of concurrent threads to run.");
+
+DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run."
+             " When 0 then num & reads determine the test duration");
+
+DEFINE_int32(value_size, 100, "Size of each value");
+
+
+// the maximum size of key in bytes
+static const int kMaxKeySize = 128;
+// Flag validator for --key_size: rejects values above kMaxKeySize.
+// NOTE(review): no RegisterFlagValidator call is visible in this chunk —
+// confirm the validator is actually registered elsewhere in the file.
+static bool ValidateKeySize(const char* flagname, int32_t value) {
+  if (value > kMaxKeySize) {
+    // BUG FIX: the check admits value == kMaxKeySize, so the message
+    // must say "<=" (previously claimed "must be < %d").
+    fprintf(stderr, "Invalid value for --%s: %d, must be <= %d\n",
+            flagname, value, kMaxKeySize);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(key_size, 16, "size of each key");
+
+// General tuning flags: compression, write buffers, compaction style
+// (level vs universal plus its knobs), caches, bloom filters, and the
+// database location.
+DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink"
+              " to this fraction of their original size after compression");
+
+DEFINE_bool(histogram, false, "Print histogram of operation timings");
+
+DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size,
+             "Number of bytes to buffer in memtable before compacting");
+
+DEFINE_int32(max_write_buffer_number,
+             rocksdb::Options().max_write_buffer_number,
+             "The number of in-memory memtables. Each memtable is of size"
+             "write_buffer_size.");
+
+DEFINE_int32(min_write_buffer_number_to_merge,
+             rocksdb::Options().min_write_buffer_number_to_merge,
+             "The minimum number of write buffers that will be merged together"
+             "before writing to storage. This is cheap because it is an"
+             "in-memory merge. If this feature is not enabled, then all these"
+             "write buffers are flushed to L0 as separate files and this "
+             "increases read amplification because a get request has to check"
+             " in all of these files. Also, an in-memory merge may result in"
+             " writing less data to storage if there are duplicate records "
+             " in each of these individual write buffers.");
+
+DEFINE_int32(max_background_compactions,
+             rocksdb::Options().max_background_compactions,
+             "The maximum number of concurrent background compactions"
+             " that can occur in parallel.");
+
+// Enum shadow of --compaction_style, filled in after flag parsing.
+static rocksdb::CompactionStyle FLAGS_compaction_style_e;
+DEFINE_int32(compaction_style, (int32_t) rocksdb::Options().compaction_style,
+             "style of compaction: level-based vs universal");
+
+DEFINE_int32(universal_size_ratio, 0,
+             "Percentage flexibility while comparing file size"
+             " (for universal compaction only).");
+
+DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files in a"
+             " single compaction run (for universal compaction only).");
+
+DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
+             " in universal style compaction");
+
+DEFINE_int32(universal_max_size_amplification_percent, 0,
+             "The max size amplification for universal style compaction");
+
+DEFINE_int32(universal_compression_size_percent, -1,
+             "The percentage of the database to compress for universal "
+             "compaction. -1 means compress everything.");
+
+DEFINE_int64(cache_size, -1, "Number of bytes to use as a cache of uncompressed"
+             "data. Negative means use default settings.");
+
+DEFINE_int32(block_size, rocksdb::Options().block_size,
+             "Number of bytes in a block.");
+
+DEFINE_int64(compressed_cache_size, -1,
+             "Number of bytes to use as a cache of compressed data.");
+
+DEFINE_int32(open_files, rocksdb::Options().max_open_files,
+             "Maximum number of files to keep open at the same time"
+             " (use default if == 0)");
+
+DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means"
+             " use default settings.");
+
+DEFINE_bool(use_existing_db, false, "If true, do not destroy the existing"
+            " database.  If you set this flag and also specify a benchmark that"
+            " wants a fresh database, that benchmark will fail.");
+
+DEFINE_string(db, "", "Use the db with the following name.");
+
// gflags validator: the block-cache shard count exponent must stay below 20
// (the cache is split into 2 ** cache_numshardbits shards).
static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
  if (value < 20) {
    return true;
  }
  fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n", flagname,
          value);
  return false;
}
+DEFINE_int32(cache_numshardbits, -1, "Number of shards for the block cache"
+             " is 2 ** cache_numshardbits. Negative means use default settings."
+             " This is applied only if FLAGS_cache_size is non-negative.");
+
+DEFINE_int32(cache_remove_scan_count_limit, 32, "");
+
+DEFINE_bool(verify_checksum, false, "Verify checksum for every block read"
+            " from storage");
+
+DEFINE_bool(statistics, false, "Database statistics");
+static class std::shared_ptr<rocksdb::Statistics> dbstats;
+
+DEFINE_int64(writes, -1, "Number of write operations to do. If negative, do"
+             " --num reads.");
+
+DEFINE_int32(writes_per_second, 0, "Per-thread rate limit on writes per second."
+             " No limit when <= 0. Only for the readwhilewriting test.");
+
+DEFINE_bool(sync, false, "Sync all writes to disk");
+
+DEFINE_bool(disable_data_sync, false, "If true, do not wait until data is"
+            " synced to disk.");
+
+DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
+
+DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
+
+DEFINE_bool(use_snapshot, false, "If true, create a snapshot per query when"
+            " randomread benchmark is used");
+
+DEFINE_bool(get_approx, false, "If true, call GetApproximateSizes per query"
+            " when read_range is > 1 and randomread benchmark is used");
+
+DEFINE_int32(num_levels, 7, "The total number of levels");
+
+DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1");
+
+DEFINE_int32(target_file_size_multiplier, 1,
+             "A multiplier to compute target level-N file size (N >= 2)");
+
+DEFINE_uint64(max_bytes_for_level_base,  10 * 1048576, "Max bytes for level-1");
+
+DEFINE_int32(max_bytes_for_level_multiplier, 10,
+             "A multiplier to compute max bytes for level-N (N >= 2)");
+
+static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v;
+DEFINE_string(max_bytes_for_level_multiplier_additional, "",
+              "A vector that specifies additional fanout per level");
+
+DEFINE_int32(level0_stop_writes_trigger, 12, "Number of files in level-0"
+             " that will trigger put stop.");
+
+DEFINE_int32(level0_slowdown_writes_trigger, 8, "Number of files in level-0"
+             " that will slow down writes.");
+
+DEFINE_int32(level0_file_num_compaction_trigger, 4, "Number of files in level-0"
+             " when compactions start");
+
// gflags validator: percentage flags must lie strictly between 0 and 100.
static bool ValidateInt32Percent(const char* flagname, int32_t value) {
  const bool in_range = (value > 0) && (value < 100);
  if (!in_range) {
    fprintf(stderr, "Invalid value for --%s: %d, 0< pct <100 \n",
            flagname, value);
  }
  return in_range;
}
+DEFINE_int32(readwritepercent, 90, "Ratio of reads to reads/writes (expressed"
+             " as percentage) for the ReadRandomWriteRandom workload. The "
+             "default value 90 means 90% operations out of all reads and writes"
+             " operations are reads. In other words, 9 gets for every 1 put.");
+
+DEFINE_int32(mergereadpercent, 70, "Ratio of merges to merges&reads (expressed"
+             " as percentage) for the ReadRandomMergeRandom workload. The"
+             " default value 70 means 70% out of all read and merge operations"
+             " are merges. In other words, 7 merges for every 3 gets.");
+
+DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/"
+             "deletes (used in RandomWithVerify only). RandomWithVerify "
+             "calculates writepercent as (100 - FLAGS_readwritepercent - "
+             "deletepercent), so deletepercent must be smaller than (100 - "
+             "FLAGS_readwritepercent)");
+
+DEFINE_int32(disable_seek_compaction, false, "Option to disable compaction"
+             " triggered by read.");
+
+DEFINE_uint64(delete_obsolete_files_period_micros, 0, "Option to delete "
+              "obsolete files periodically. 0 means that obsolete files are"
+              " deleted after every compaction run.");
+
+enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
+  assert(ctype);
+
+  if (!strcasecmp(ctype, "none"))
+    return rocksdb::kNoCompression;
+  else if (!strcasecmp(ctype, "snappy"))
+    return rocksdb::kSnappyCompression;
+  else if (!strcasecmp(ctype, "zlib"))
+    return rocksdb::kZlibCompression;
+  else if (!strcasecmp(ctype, "bzip2"))
+    return rocksdb::kBZip2Compression;
+
+  fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
+  return rocksdb::kSnappyCompression; //default value
+}
DEFINE_string(compression_type, "snappy",
              "Algorithm to use to compress the database");
// Parsed enum form of --compression_type; filled in from the flag at startup
// via StringToCompressionType above.
static enum rocksdb::CompressionType FLAGS_compression_type_e =
    rocksdb::kSnappyCompression;

DEFINE_int32(compression_level, -1,
             "Compression level. For zlib this should be -1 for the "
             "default level, or between 0 and 9.");
+
// gflags validator: zlib-style compression levels run from -1 (library
// default) through 9.
static bool ValidateCompressionLevel(const char* flagname, int32_t value) {
  if (-1 <= value && value <= 9) {
    return true;
  }
  fprintf(stderr, "Invalid value for --%s: %d, must be between -1 and 9\n",
          flagname, value);
  return false;
}
+
// Register the validator above; the unused static only exists to force the
// registration to run during static initialization.
static const bool FLAGS_compression_level_dummy =
  google::RegisterFlagValidator(&FLAGS_compression_level,
                                &ValidateCompressionLevel);

DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts"
             " from this level. Levels with number < min_level_to_compress are"
             " not compressed. Otherwise, apply compression_type to "
             "all levels.");
+
// gflags validator: table-cache shard exponent must lie in (0, 20].
static bool ValidateTableCacheNumshardbits(const char* flagname,
                                           int32_t value) {
  const bool ok = (value > 0) && (value <= 20);
  if (!ok) {
    fprintf(stderr, "Invalid value for --%s: %d, must be  0 < val <= 20\n",
            flagname, value);
  }
  return ok;
}
+DEFINE_int32(table_cache_numshardbits, 4, "");
+
+DEFINE_string(hdfs, "", "Name of hdfs environment");
+// posix or hdfs environment
+static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();
+
+DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when "
+             "this is greater than zero. When 0 the interval grows over time.");
+
+DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when"
+             " this is greater than 0.");
+
// gflags validator: rate limits must be non-negative.  A small epsilon
// tolerates floating-point noise just below zero.
static bool ValidateRateLimit(const char* flagname, double value) {
  static constexpr double EPSILON = 1e-10;
  if (value < -EPSILON) {
    fprintf(stderr, "Invalid value for --%s: %12.6f, must be >= 0.0\n",
            flagname, value);
    return false;
  }
  return true;
}
// NOTE(review): help text is empty; this flag is checked by ValidateRateLimit
// (registered further below), so it must be >= 0.0.  Presumably a softer
// variant of hard_rate_limit — confirm against the code that reads it.
DEFINE_double(soft_rate_limit, 0.0, "");

DEFINE_double(hard_rate_limit, 0.0, "When not equal to 0 this make threads "
              "sleep at each stats reporting interval until the compaction"
              " score for all levels is less than or equal to this value.");

DEFINE_int32(rate_limit_delay_max_milliseconds, 1000,
             "When hard_rate_limit is set then this is the max time a put will"
             " be stalled.");

DEFINE_int32(max_grandparent_overlap_factor, 10, "Control maximum bytes of "
             "overlaps in grandparent (i.e., level+2) before we stop building a"
             " single file in a level->level+1 compaction.");

DEFINE_bool(readonly, false, "Run read only benchmarks.");

DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions");

DEFINE_int32(source_compaction_factor, 1, "Cap the size of data in level-K for"
             " a compaction run that compacts Level-K with Level-(K+1) (for"
             " K >= 1)");

DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds.");
DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files"
              " in MB.");

DEFINE_bool(bufferedio, rocksdb::EnvOptions().use_os_buffer,
            "Allow buffered io using OS buffers");

DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads,
            "Allow reads to occur via mmap-ing files");

DEFINE_bool(mmap_write, rocksdb::EnvOptions().use_mmap_writes,
            "Allow writes to occur via mmap-ing files");

DEFINE_bool(advise_random_on_open, rocksdb::Options().advise_random_on_open,
            "Advise random access on table file open");

DEFINE_string(compaction_fadvice, "NORMAL",
              "Access pattern advice when a file is compacted");
// Parsed enum form of --compaction_fadvice; filled in from the flag at
// startup.
static auto FLAGS_compaction_fadvice_e =
  rocksdb::Options().access_hint_on_compaction_start;

DEFINE_bool(use_multiget, false,
            "Use multiget to access a series of keys instead of get");

DEFINE_int64(keys_per_multiget, 90, "If use_multiget is true, determines number"
             " of keys to group per call Arbitrary default is good because it"
             " agrees with readwritepercent");

// TODO: Apply this flag to generic Get calls too. Currently only with Multiget
DEFINE_bool(warn_missing_keys, true, "Print a message to user when a key is"
            " missing in a Get/MultiGet call");

DEFINE_bool(use_adaptive_mutex, rocksdb::Options().use_adaptive_mutex,
            "Use adaptive mutex");

DEFINE_uint64(bytes_per_sync,  rocksdb::Options().bytes_per_sync,
              "Allows OS to incrementally sync files to disk while they are"
              " being written, in the background. Issue one request for every"
              " bytes_per_sync written. 0 turns it off.");
DEFINE_bool(filter_deletes, false, " On true, deletes use bloom-filter and drop"
            " the delete if key not present");

DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge"
             " operations on a key in the memtable");
+
// gflags validator: the HashSkipList prefix size must be non-negative and
// below 2000000000.
static bool ValidatePrefixSize(const char* flagname, int32_t value) {
  const bool ok = (value >= 0) && (value < 2000000000);
  if (!ok) {
    fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n",
            flagname, value);
  }
  return ok;
}
DEFINE_int32(prefix_size, 0, "Control the prefix size for HashSkipList");

// Memtable representations selectable via --memtablerep; parsed by
// StringToRepFactory below.
enum RepFactory {
  kSkipList,
  kPrefixHash,
  kVectorRep
};
+enum RepFactory StringToRepFactory(const char* ctype) {
+  assert(ctype);
+
+  if (!strcasecmp(ctype, "skip_list"))
+    return kSkipList;
+  else if (!strcasecmp(ctype, "prefix_hash"))
+    return kPrefixHash;
+  else if (!strcasecmp(ctype, "vector"))
+    return kVectorRep;
+
+  fprintf(stdout, "Cannot parse memreptable %s\n", ctype);
+  return kSkipList;
+}
// Parsed enum form of --memtablerep; filled in via StringToRepFactory at
// startup.
static enum RepFactory FLAGS_rep_factory;
// NOTE(review): help text is empty; accepted values (see StringToRepFactory)
// are skip_list, prefix_hash and vector.
DEFINE_string(memtablerep, "skip_list", "");

DEFINE_string(merge_operator, "", "The merge operator to use with the database."
              "If a new merge operator is specified, be sure to use fresh"
              " database The possible merge operators are defined in"
              " utilities/merge_operators.h");

// Attach the validators defined above to their flags.  The return value of
// RegisterFlagValidator is captured in unused statics only to force the
// registrations to run during static initialization.
static const bool FLAGS_soft_rate_limit_dummy __attribute__((unused)) =
  google::RegisterFlagValidator(&FLAGS_soft_rate_limit,
                                &ValidateRateLimit);

static const bool FLAGS_hard_rate_limit_dummy __attribute__((unused)) =
  google::RegisterFlagValidator(&FLAGS_hard_rate_limit, &ValidateRateLimit);

static const bool FLAGS_prefix_size_dummy __attribute__((unused)) =
  google::RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);

// FLAGS_key_size and ValidateKeySize are defined earlier in this file.
static const bool FLAGS_key_size_dummy __attribute__((unused)) =
  google::RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);

static const bool FLAGS_cache_numshardbits_dummy __attribute__((unused)) =
  google::RegisterFlagValidator(&FLAGS_cache_numshardbits,
                                &ValidateCacheNumshardbits);

static const bool FLAGS_readwritepercent_dummy __attribute__((unused)) =
  google::RegisterFlagValidator(&FLAGS_readwritepercent,
                                &ValidateInt32Percent);

static const bool FLAGS_deletepercent_dummy __attribute__((unused)) =
  google::RegisterFlagValidator(&FLAGS_deletepercent,
                                &ValidateInt32Percent);
static const bool
  FLAGS_table_cache_numshardbits_dummy __attribute__((unused)) =
  google::RegisterFlagValidator(&FLAGS_table_cache_numshardbits,
                                &ValidateTableCacheNumshardbits);
+
+namespace rocksdb {
+
+// Helper for quickly generating random data.
+class RandomGenerator {
+ private:
+  std::string data_;
+  unsigned int pos_;
+
+ public:
+  RandomGenerator() {
+    // We use a limited amount of data over and over again and ensure
+    // that it is larger than the compression window (32KB), and also
+    // large enough to serve all typical value sizes we want to write.
+    Random rnd(301);
+    std::string piece;
+    while (data_.size() < (unsigned)std::max(1048576, FLAGS_value_size)) {
+      // Add a short fragment that is as compressible as specified
+      // by FLAGS_compression_ratio.
+      test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
+      data_.append(piece);
+    }
+    pos_ = 0;
+  }
+
+  Slice Generate(unsigned int len) {
+    if (pos_ + len > data_.size()) {
+      pos_ = 0;
+      assert(len < data_.size());
+    }
+    pos_ += len;
+    return Slice(data_.data() + pos_ - len, len);
+  }
+};
+
+static void AppendWithSpace(std::string* str, Slice msg) {
+  if (msg.empty()) return;
+  if (!str->empty()) {
+    str->push_back(' ');
+  }
+  str->append(msg.data(), msg.size());
+}
+
+class Stats {
+ private:
+  int id_;
+  double start_;
+  double finish_;
+  double seconds_;
+  long long done_;
+  long long last_report_done_;
+  long long next_report_;
+  int64_t bytes_;
+  double last_op_finish_;
+  double last_report_finish_;
+  HistogramImpl hist_;
+  std::string message_;
+  bool exclude_from_merge_;
+
+ public:
+  Stats() { Start(-1); }
+
+  void Start(int id) {
+    id_ = id;
+    next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
+    last_op_finish_ = start_;
+    hist_.Clear();
+    done_ = 0;
+    last_report_done_ = 0;
+    bytes_ = 0;
+    seconds_ = 0;
+    start_ = FLAGS_env->NowMicros();
+    finish_ = start_;
+    last_report_finish_ = start_;
+    message_.clear();
+    // When set, stats from this thread won't be merged with others.
+    exclude_from_merge_ = false;
+  }
+
+  void Merge(const Stats& other) {
+    if (other.exclude_from_merge_)
+      return;
+
+    hist_.Merge(other.hist_);
+    done_ += other.done_;
+    bytes_ += other.bytes_;
+    seconds_ += other.seconds_;
+    if (other.start_ < start_) start_ = other.start_;
+    if (other.finish_ > finish_) finish_ = other.finish_;
+
+    // Just keep the messages from one thread
+    if (message_.empty()) message_ = other.message_;
+  }
+
+  void Stop() {
+    finish_ = FLAGS_env->NowMicros();
+    seconds_ = (finish_ - start_) * 1e-6;
+  }
+
+  void AddMessage(Slice msg) {
+    AppendWithSpace(&message_, msg);
+  }
+
+  void SetId(int id) { id_ = id; }
+  void SetExcludeFromMerge() { exclude_from_merge_ = true; }
+
+  void FinishedSingleOp(DB* db) {
+    if (FLAGS_histogram) {
+      double now = FLAGS_env->NowMicros();
+      double micros = now - last_op_finish_;
+      hist_.Add(micros);
+      if (micros > 20000 && !FLAGS_stats_interval) {
+        fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
+        fflush(stderr);
+      }
+      last_op_finish_ = now;
+    }
+
+    done_++;
+    if (done_ >= next_report_) {
+      if (!FLAGS_stats_interval) {
+        if      (next_report_ < 1000)   next_report_ += 100;
+        else if (next_report_ < 5000)   next_report_ += 500;
+        else if (next_report_ < 10000)  next_report_ += 1000;
+        else if (next_report_ < 50000)  next_report_ += 5000;
+        else if (next_report_ < 100000) next_report_ += 10000;
+        else if (next_report_ < 500000) next_report_ += 50000;
+        else                            next_report_ += 100000;
+        fprintf(stderr, "... finished %lld ops%30s\r", done_, "");
+        fflush(stderr);
+      } else {
+        double now = FLAGS_env->NowMicros();
+        fprintf(stderr,
+                "%s ... thread %d: (%lld,%lld) ops and "
+                "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
+                FLAGS_env->TimeToString((uint64_t) now/1000000).c_str(),
+                id_,
+                done_ - last_report_done_, done_,
+                (done_ - last_report_done_) /
+                ((now - last_report_finish_) / 1000000.0),
+                done_ / ((now - start_) / 1000000.0),
+                (now - last_report_finish_) / 1000000.0,
+                (now - start_) / 1000000.0);
+
+        if (FLAGS_stats_per_interval) {
+          std::string stats;
+          if (db && db->GetProperty("rocksdb.stats", &stats))
+            fprintf(stderr, "%s\n", stats.c_str());
+        }
+
+        fflush(stderr);
+        next_report_ += FLAGS_stats_interval;
+        last_report_finish_ = now;
+        last_report_done_ = done_;
+      }
+    }
+  }
+
+  void AddBytes(int64_t n) {
+    bytes_ += n;
+  }
+
+  void Report(const Slice& name) {
+    // Pretend at least one op was done in case we are running a benchmark
+    // that does not call FinishedSingleOp().
+    if (done_ < 1) done_ = 1;
+
+    std::string extra;
+    if (bytes_ > 0) {
+      // Rate is computed on actual elapsed time, not the sum of per-thread
+      // elapsed times.
+      double elapsed = (finish_ - start_) * 1e-6;
+      char rate[100];
+      snprintf(rate, sizeof(rate), "%6.1f MB/s",
+               (bytes_ / 1048576.0) / elapsed);
+      extra = rate;
+    }
+    AppendWithSpace(&extra, message_);
+    double elapsed = (finish_ - start_) * 1e-6;
+    double throughput = (double)done_/elapsed;
+
+    fprintf(stdout, "%-12s : %11.3f micros/op %ld ops/sec;%s%s\n",
+            name.ToString().c_str(),
+            elapsed * 1e6 / done_,
+            (long)throughput,
+            (extra.empty() ? "" : " "),
+            extra.c_str());
+    if (FLAGS_histogram) {
+      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
+    }
+    fflush(stdout);
+  }
+};
+
+// State shared by all concurrent executions of the same benchmark.
+struct SharedState {
+  port::Mutex mu;
+  port::CondVar cv;
+  int total;
+
+  // Each thread goes through the following states:
+  //    (1) initializing
+  //    (2) waiting for others to be initialized
+  //    (3) running
+  //    (4) done
+
+  long num_initialized;
+  long num_done;
+  bool start;
+
+  SharedState() : cv(&mu) { }
+};
+
+// Per-thread state for concurrent executions of the same benchmark.
+struct ThreadState {
+  int tid;             // 0..n-1 when running in n threads
+  Random64 rand;         // Has different seeds for different threads
+  Stats stats;
+  SharedState* shared;
+
+  /* implicit */ ThreadState(int index)
+      : tid(index),
+        rand((FLAGS_seed ? FLAGS_seed : 1000) + index) {
+  }
+};
+
+class Duration {
+ public:
+  Duration(int max_seconds, long long max_ops) {
+    max_seconds_ = max_seconds;
+    max_ops_= max_ops;
+    ops_ = 0;
+    start_at_ = FLAGS_env->NowMicros();
+  }
+
+  bool Done(int increment) {
+    if (increment <= 0) increment = 1;    // avoid Done(0) and infinite loops
+    ops_ += increment;
+
+    if (max_seconds_) {
+      // Recheck every appx 1000 ops (exact iff increment is factor of 1000)
+      if ((ops_/1000) != ((ops_-increment)/1000)) {
+        double now = FLAGS_env->NowMicros();
+        return ((now - start_at_) / 1000000.0) >= max_seconds_;
+      } else {
+        return false;
+      }
+    } else {
+      return ops_ > max_ops_;
+    }
+  }
+
+ private:
+  int max_seconds_;
+  long long max_ops_;
+  long long ops_;
+  double start_at_;
+};
+
+class Benchmark {
 private:
  shared_ptr<Cache> cache_;                 // block cache (may be nullptr)
  shared_ptr<Cache> compressed_cache_;      // compressed-block cache (may be
                                            // nullptr)
  const FilterPolicy* filter_policy_;       // owned; deleted in ~Benchmark
  const SliceTransform* prefix_extractor_;  // owned; deleted in ~Benchmark
  DB* db_;                                  // owned; deleted in ~Benchmark
  long long num_;                           // entry count (from --num)
  int value_size_;
  int key_size_;
  int entries_per_batch_;                   // keys per write batch
  WriteOptions write_options_;
  long long reads_;                         // read ops to perform
  long long writes_;                        // write ops to perform
  long long readwrites_;                    // ops for mixed workloads
  long long merge_keys_;                    // distinct keys for merge tests
  int heap_counter_;
  char keyFormat_[100]; // will contain the format of key. e.g "%016d"
  // Print the benchmark configuration banner to stdout: key/value sizes,
  // estimated data volume, and the selected compression and memtable rep.
  void PrintHeader() {
    PrintEnvironment();
    fprintf(stdout, "Keys:       %d bytes each\n", FLAGS_key_size);
    fprintf(stdout, "Values:     %d bytes each (%d bytes after compression)\n",
            FLAGS_value_size,
            static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5));
    fprintf(stdout, "Entries:    %lld\n", num_);
    fprintf(stdout, "RawSize:    %.1f MB (estimated)\n",
            ((static_cast<int64_t>(FLAGS_key_size + FLAGS_value_size) * num_)
             / 1048576.0));
    // FileSize scales only the value bytes by the compression ratio; keys
    // are treated as incompressible.
    fprintf(stdout, "FileSize:   %.1f MB (estimated)\n",
            (((FLAGS_key_size + FLAGS_value_size * FLAGS_compression_ratio)
              * num_)
             / 1048576.0));
    fprintf(stdout, "Write rate limit: %d\n", FLAGS_writes_per_second);
    switch (FLAGS_compression_type_e) {
      case rocksdb::kNoCompression:
        fprintf(stdout, "Compression: none\n");
        break;
      case rocksdb::kSnappyCompression:
        fprintf(stdout, "Compression: snappy\n");
        break;
      case rocksdb::kZlibCompression:
        fprintf(stdout, "Compression: zlib\n");
        break;
      case rocksdb::kBZip2Compression:
        fprintf(stdout, "Compression: bzip2\n");
        break;
    }

    switch (FLAGS_rep_factory) {
      case kPrefixHash:
        fprintf(stdout, "Memtablerep: prefix_hash\n");
        break;
      case kSkipList:
        fprintf(stdout, "Memtablerep: skip_list\n");
        break;
      case kVectorRep:
        fprintf(stdout, "Memtablerep: vector\n");
        break;
    }

    PrintWarnings();
    fprintf(stdout, "------------------------------------------------\n");
  }
+
+  void PrintWarnings() {
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+    fprintf(stdout,
+            "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
+            );
+#endif
+#ifndef NDEBUG
+    fprintf(stdout,
+            "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
+    if (FLAGS_compression_type_e != rocksdb::kNoCompression) {
+      // The test string should not be too small.
+      const int len = FLAGS_block_size;
+      char* text = (char*) malloc(len+1);
+      bool result = true;
+      const char* name = nullptr;
+      std::string compressed;
+
+      memset(text, (int) 'y', len);
+      text[len] = '\0';
+      switch (FLAGS_compression_type_e) {
+        case kSnappyCompression:
+          result = port::Snappy_Compress(Options().compression_opts, text,
+                                         strlen(text), &compressed);
+          name = "Snappy";
+          break;
+        case kZlibCompression:
+          result = port::Zlib_Compress(Options().compression_opts, text,
+                                       strlen(text), &compressed);
+          name = "Zlib";
+          break;
+        case kBZip2Compression:
+          result = port::BZip2_Compress(Options().compression_opts, text,
+                                        strlen(text), &compressed);
+          name = "BZip2";
+          break;
+        case kNoCompression:
+          assert(false); // cannot happen
+          break;
+      }
+
+      if (!result) {
+        fprintf(stdout, "WARNING: %s compression is not enabled\n", name);
+      } else if (name && compressed.size() >= strlen(text)) {
+        fprintf(stdout, "WARNING: %s compression is not effective\n", name);
+      }
+
+      free(text);
+    }
+  }
+
+// Current the following isn't equivalent to OS_LINUX.
+#if defined(__linux)
+  static Slice TrimSpace(Slice s) {
+    unsigned int start = 0;
+    while (start < s.size() && isspace(s[start])) {
+      start++;
+    }
+    unsigned int limit = s.size();
+    while (limit > start && isspace(s[limit-1])) {
+      limit--;
+    }
+    return Slice(s.data() + start, limit - start);
+  }
+#endif
+
  // Print version, date, and (on Linux) CPU model/cache info parsed from
  // /proc/cpuinfo to stderr.
  void PrintEnvironment() {
    // NOTE(review): the banner still says "LevelDB" although the version
    // constants are rocksdb's kMajorVersion/kMinorVersion.
    fprintf(stderr, "LevelDB:    version %d.%d\n",
            kMajorVersion, kMinorVersion);

#if defined(__linux)
    time_t now = time(nullptr);
    fprintf(stderr, "Date:       %s", ctime(&now));  // ctime() adds newline

    FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
    if (cpuinfo != nullptr) {
      char line[1000];
      int num_cpus = 0;
      std::string cpu_type;
      std::string cache_size;
      while (fgets(line, sizeof(line), cpuinfo) != nullptr) {
        const char* sep = strchr(line, ':');
        if (sep == nullptr) {
          continue;
        }
        // Key is everything before the character preceding ':'; value is
        // everything after it, both trimmed of whitespace.
        Slice key = TrimSpace(Slice(line, sep - 1 - line));
        Slice val = TrimSpace(Slice(sep + 1));
        if (key == "model name") {
          // One "model name" line per logical CPU.
          ++num_cpus;
          cpu_type = val.ToString();
        } else if (key == "cache size") {
          cache_size = val.ToString();
        }
      }
      fclose(cpuinfo);
      fprintf(stderr, "CPU:        %d * %s\n", num_cpus, cpu_type.c_str());
      fprintf(stderr, "CPUCache:   %s\n", cache_size.c_str());
    }
#endif
  }
+
+ public:
  // Build the caches, bloom filter policy and prefix extractor from the
  // flags; derive the effective read/write/readwrite op counts; delete stale
  // "heap-" profile files; and, unless --use_existing_db, destroy any
  // existing database at --db.
  Benchmark()
  : cache_(FLAGS_cache_size >= 0 ?
           (FLAGS_cache_numshardbits >= 1 ?
            NewLRUCache(FLAGS_cache_size, FLAGS_cache_numshardbits,
                        FLAGS_cache_remove_scan_count_limit) :
            NewLRUCache(FLAGS_cache_size)) : nullptr),
    compressed_cache_(FLAGS_compressed_cache_size >= 0 ?
           (FLAGS_cache_numshardbits >= 1 ?
            NewLRUCache(FLAGS_compressed_cache_size, FLAGS_cache_numshardbits) :
            NewLRUCache(FLAGS_compressed_cache_size)) : nullptr),
    filter_policy_(FLAGS_bloom_bits >= 0
                   ? NewBloomFilterPolicy(FLAGS_bloom_bits)
                   : nullptr),
    prefix_extractor_(NewFixedPrefixTransform(FLAGS_key_size-1)),
    db_(nullptr),
    num_(FLAGS_num),
    value_size_(FLAGS_value_size),
    key_size_(FLAGS_key_size),
    entries_per_batch_(1),
    // Negative --reads/--writes fall back to --num.
    reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
    writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
    // Mixed workloads use whichever explicit count is larger.
    readwrites_((FLAGS_writes < 0  && FLAGS_reads < 0)? FLAGS_num :
                ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)
               ),
    merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
    heap_counter_(0) {
    std::vector<std::string> files;
    FLAGS_env->GetChildren(FLAGS_db, &files);
    for (unsigned int i = 0; i < files.size(); i++) {
      // Remove leftover heap profiles from a previous run.
      if (Slice(files[i]).starts_with("heap-")) {
        FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
      }
    }
    if (!FLAGS_use_existing_db) {
      DestroyDB(FLAGS_db, Options());
    }
  }
+
  // Release the DB and the raw-pointer policies owned by this benchmark.
  // cache_ and compressed_cache_ are shared_ptrs and clean up themselves.
  ~Benchmark() {
    delete db_;
    delete filter_policy_;
    delete prefix_extractor_;
  }
+
+  //this function will construct string format for key. e.g "%016lld"
+  void ConstructStrFormatForKey(char* str, int keySize) {
+    str[0] = '%';
+    str[1] = '0';
+    sprintf(str+2, "%dlld%s", keySize, "%s");
+  }
+
  // Render integer |v| as a fixed-width key using keyFormat_ (built by
  // ConstructStrFormatForKey), appending |suffix|.  Returns a freshly
  // allocated buffer of kMaxKeySize + 1 bytes (kMaxKeySize is declared
  // earlier in this file, outside this chunk).
  unique_ptr<char []> GenerateKeyFromInt(long long v, const char* suffix = "") {
    unique_ptr<char []> keyInStr(new char[kMaxKeySize + 1]);
    snprintf(keyInStr.get(), kMaxKeySize + 1, keyFormat_, v, suffix);
    return keyInStr;
  }
+
+  void Run() {
+    PrintHeader();
+    Open();
+    const char* benchmarks = FLAGS_benchmarks.c_str();
+    while (benchmarks != nullptr) {
+      const char* sep = strchr(benchmarks, ',');
+      Slice name;
+      if (sep == nullptr) {
+        name = benchmarks;
+        benchmarks = nullptr;
+      } else {
+        name = Slice(benchmarks, sep - benchmarks);
+        benchmarks = sep + 1;
+      }
+
+      // Sanitize parameters
+      num_ = FLAGS_num;
+      reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
+      writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes);
+      value_size_ = FLAGS_value_size;
+      key_size_ = FLAGS_key_size;
+      ConstructStrFormatForKey(keyFormat_, key_size_);
+      entries_per_batch_ = 1;
+      write_options_ = WriteOptions();
+      if (FLAGS_sync) {
+        write_options_.sync = true;
+      }
+      write_options_.disableWAL = FLAGS_disable_wal;
+
+      void (Benchmark::*method)(ThreadState*) = nullptr;
+      bool fresh_db = false;
+      int num_threads = FLAGS_threads;
+
+      if (name == Slice("fillseq")) {
+        fresh_db = true;
+        method = &Benchmark::WriteSeq;
+      } else if (name == Slice("fillbatch")) {
+        fresh_db = true;
+        entries_per_batch_ = 1000;
+        method = &Benchmark::WriteSeq;
+      } else if (name == Slice("fillrandom")) {
+        fresh_db = true;
+        method = &Benchmark::WriteRandom;
+      } else if (name == Slice("fillfromstdin")) {
+        fresh_db = true;
+        method = &Benchmark::WriteFromStdin;
+      } else if (name == Slice("filluniquerandom")) {
+        fresh_db = true;
+        if (num_threads > 1) {
+          fprintf(stderr, "filluniquerandom multithreaded not supported"
+                           " set --threads=1");
+          exit(1);
+        }
+        method = &Benchmark::WriteUniqueRandom;
+      } else if (name == Slice("overwrite")) {
+        fresh_db = false;
+        method = &Benchmark::WriteRandom;
+      } else if (name == Slice("fillsync")) {
+        fresh_db = true;
+        num_ /= 1000;
+        write_options_.sync = true;
+        method = &Benchmark::WriteRandom;
+      } else if (name == Slice("fill100K")) {
+        fresh_db = true;
+        num_ /= 1000;
+        value_size_ = 100 * 1000;
+        method = &Benchmark::WriteRandom;
+      } else if (name == Slice("readseq")) {
+        method = &Benchmark::ReadSequential;
+      } else if (name == Slice("readtocache")) {
+        method = &Benchmark::ReadSequential;
+        num_threads = 1;
+        reads_ = num_;
+      } else if (name == Slice("readreverse")) {
+        method = &Benchmark::ReadReverse;
+      } else if (name == Slice("readrandom")) {
+        method = &Benchmark::ReadRandom;
+      } else if (name == Slice("readmissing")) {
+        method = &Benchmark::ReadMissing;
+      } else if (name == Slice("seekrandom")) {
+        method = &Benchmark::SeekRandom;
+      } else if (name == Slice("readhot")) {
+        method = &Benchmark::ReadHot;
+      } else if (name == Slice("readrandomsmall")) {
+        reads_ /= 1000;
+        method = &Benchmark::ReadRandom;
+      } else if (name == Slice("prefixscanrandom")) {
+        method = &Benchmark::PrefixScanRandom;
+      } else if (name == Slice("deleteseq")) {
+        method = &Benchmark::DeleteSeq;
+      } else if (name == Slice("deleterandom")) {
+        method = &Benchmark::DeleteRandom;
+      } else if (name == Slice("readwhilewriting")) {
+        num_threads++;  // Add extra thread for writing
+        method = &Benchmark::ReadWhileWriting;
+      } else if (name == Slice("readrandomwriterandom")) {
+        method = &Benchmark::ReadRandomWriteRandom;
+      } else if (name == Slice("readrandommergerandom")) {
+        if (FLAGS_merge_operator.empty()) {
+          fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
+                  name.ToString().c_str());
+          method = nullptr;
+        } else {
+          method = &Benchmark::ReadRandomMergeRandom;
+        }
+      } else if (name == Slice("updaterandom")) {
+        method = &Benchmark::UpdateRandom;
+      } else if (name == Slice("appendrandom")) {
+        method = &Benchmark::AppendRandom;
+      } else if (name == Slice("mergerandom")) {
+        if (FLAGS_merge_operator.empty()) {
+          fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
+                  name.ToString().c_str());
+          method = nullptr;
+        } else {
+          method = &Benchmark::MergeRandom;
+        }
+      } else if (name == Slice("randomwithverify")) {
+        method = &Benchmark::RandomWithVerify;
+      } else if (name == Slice("compact")) {
+        method = &Benchmark::Compact;
+      } else if (name == Slice("crc32c")) {
+        method = &Benchmark::Crc32c;
+      } else if (name == Slice("acquireload")) {
+        method = &Benchmark::AcquireLoad;
+      } else if (name == Slice("snappycomp")) {
+        method = &Benchmark::SnappyCompress;
+      } else if (name == Slice("snappyuncomp")) {
+        method = &Benchmark::SnappyUncompress;
+      } else if (name == Slice("heapprofile")) {
+        HeapProfile();
+      } else if (name == Slice("stats")) {
+        PrintStats("rocksdb.stats");
+      } else if (name == Slice("levelstats")) {
+        PrintStats("rocksdb.levelstats");
+      } else if (name == Slice("sstables")) {
+        PrintStats("rocksdb.sstables");
+      } else {
+        if (name != Slice()) {  // No error message for empty name
+          fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
+        }
+      }
+
+      if (fresh_db) {
+        if (FLAGS_use_existing_db) {
+          fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
+                  name.ToString().c_str());
+          method = nullptr;
+        } else {
+          delete db_;
+          db_ = nullptr;
+          DestroyDB(FLAGS_db, Options());
+          Open();
+        }
+      }
+
+      if (method != nullptr) {
+        fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
+        RunBenchmark(num_threads, name, method);
+      }
+    }
+    if (FLAGS_statistics) {
+     fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
+    }
+  }
+
+ private:
+  // Per-thread bundle handed to ThreadBody(): the owning Benchmark, the
+  // state shared by all worker threads, this thread's private state, and
+  // the benchmark method to execute.
+  struct ThreadArg {
+    Benchmark* bm;
+    SharedState* shared;
+    ThreadState* thread;
+    void (Benchmark::*method)(ThreadState*);
+  };
+
+  // Entry point for each worker thread: register with SharedState, block
+  // until the coordinator releases all threads simultaneously, run the
+  // benchmark method with per-thread timing, then report completion.
+  static void ThreadBody(void* v) {
+    ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
+    SharedState* shared = arg->shared;
+    ThreadState* thread = arg->thread;
+    {
+      MutexLock l(&shared->mu);
+      shared->num_initialized++;
+      // Wake the coordinator once the last thread has checked in.
+      if (shared->num_initialized >= shared->total) {
+        shared->cv.SignalAll();
+      }
+      // Wait for the coordinator to flip shared->start.
+      while (!shared->start) {
+        shared->cv.Wait();
+      }
+    }
+
+    thread->stats.Start(thread->tid);
+    (arg->bm->*(arg->method))(thread);
+    thread->stats.Stop();
+
+    {
+      MutexLock l(&shared->mu);
+      shared->num_done++;
+      // Wake the coordinator when the last thread finishes.
+      if (shared->num_done >= shared->total) {
+        shared->cv.SignalAll();
+      }
+    }
+  }
+
+  // Launch `n` worker threads that each run `method`, using SharedState to
+  // start them all at the same instant and to wait until every thread is
+  // done, then merge the per-thread stats and print one report for `name`.
+  void RunBenchmark(int n, Slice name,
+                    void (Benchmark::*method)(ThreadState*)) {
+    SharedState shared;
+    shared.total = n;
+    shared.num_initialized = 0;
+    shared.num_done = 0;
+    shared.start = false;
+
+    ThreadArg* arg = new ThreadArg[n];
+    for (int i = 0; i < n; i++) {
+      arg[i].bm = this;
+      arg[i].method = method;
+      arg[i].shared = &shared;
+      arg[i].thread = new ThreadState(i);
+      arg[i].thread->shared = &shared;
+      FLAGS_env->StartThread(ThreadBody, &arg[i]);
+    }
+
+    // Rendezvous: wait for every thread to initialize, then release them
+    // all at once so the measured work starts simultaneously.
+    shared.mu.Lock();
+    while (shared.num_initialized < n) {
+      shared.cv.Wait();
+    }
+
+    shared.start = true;
+    shared.cv.SignalAll();
+    while (shared.num_done < n) {
+      shared.cv.Wait();
+    }
+    shared.mu.Unlock();
+
+    // Stats for some threads can be excluded.
+    Stats merge_stats;
+    for (int i = 0; i < n; i++) {
+      merge_stats.Merge(arg[i].thread->stats);
+    }
+    merge_stats.Report(name);
+
+    for (int i = 0; i < n; i++) {
+      delete arg[i].thread;
+    }
+    delete[] arg;
+  }
+
+  void Crc32c(ThreadState* thread) {
+    // Checksum about 500MB of data total
+    const int size = 4096;
+    const char* label = "(4K per op)";
+    std::string data(size, 'x');
+    int64_t bytes = 0;
+    uint32_t crc = 0;
+    while (bytes < 500 * 1048576) {
+      crc = crc32c::Value(data.data(), size);
+      thread->stats.FinishedSingleOp(nullptr);
+      bytes += size;
+    }
+    // Print so result is not dead
+    fprintf(stderr, "... crc=0x%x\r", static_cast<unsigned int>(crc));
+
+    thread->stats.AddBytes(bytes);
+    thread->stats.AddMessage(label);
+  }
+
+  // Benchmark raw acquire-load cost on port::AtomicPointer; each reported
+  // op is 1000 loads of a dummy pointer.  No database access.
+  void AcquireLoad(ThreadState* thread) {
+    int dummy;
+    port::AtomicPointer ap(&dummy);
+    int count = 0;
+    void *ptr = nullptr;
+    thread->stats.AddMessage("(each op is 1000 loads)");
+    while (count < 100000) {
+      for (int i = 0; i < 1000; i++) {
+        ptr = ap.Acquire_Load();
+      }
+      count++;
+      thread->stats.FinishedSingleOp(nullptr);
+    }
+    if (ptr == nullptr) exit(1); // Disable unused variable warning.
+  }
+
+  // Benchmark snappy compression: repeatedly compress one generated block
+  // until 1GB of input has been processed, then report the achieved output
+  // ratio (or a failure note if snappy is not available).
+  void SnappyCompress(ThreadState* thread) {
+    RandomGenerator gen;
+    Slice input = gen.Generate(Options().block_size);
+    int64_t bytes = 0;
+    int64_t produced = 0;
+    bool ok = true;
+    std::string compressed;
+    while (ok && bytes < 1024 * 1048576) {  // Compress 1G
+      ok = port::Snappy_Compress(Options().compression_opts, input.data(),
+                                 input.size(), &compressed);
+      produced += compressed.size();
+      bytes += input.size();
+      thread->stats.FinishedSingleOp(nullptr);
+    }
+
+    if (!ok) {
+      thread->stats.AddMessage("(snappy failure)");
+    } else {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "(output: %.1f%%)",
+               (produced * 100.0) / bytes);
+      thread->stats.AddMessage(buf);
+      thread->stats.AddBytes(bytes);
+    }
+  }
+
+  // Benchmark snappy decompression: compress one block once, then
+  // repeatedly uncompress it until 1GB of original input has been covered.
+  void SnappyUncompress(ThreadState* thread) {
+    RandomGenerator gen;
+    Slice input = gen.Generate(Options().block_size);
+    std::string compressed;
+    bool ok = port::Snappy_Compress(Options().compression_opts, input.data(),
+                                    input.size(), &compressed);
+    int64_t bytes = 0;
+    char* uncompressed = new char[input.size()];
+    while (ok && bytes < 1024 * 1048576) {  // Uncompress 1G of original input
+      ok =  port::Snappy_Uncompress(compressed.data(), compressed.size(),
+                                    uncompressed);
+      bytes += input.size();
+      thread->stats.FinishedSingleOp(nullptr);
+    }
+    delete[] uncompressed;
+
+    if (!ok) {
+      thread->stats.AddMessage("(snappy failure)");
+    } else {
+      thread->stats.AddBytes(bytes);
+    }
+  }
+
+  // Open (or create) the database at FLAGS_db, translating the command-line
+  // flags into an Options struct.  Exits the process on an inconsistent
+  // flag combination or an open failure.
+  void Open() {
+    assert(db_ == nullptr);
+    Options options;
+    options.create_if_missing = !FLAGS_use_existing_db;
+    options.block_cache = cache_;
+    options.block_cache_compressed = compressed_cache_;
+    if (cache_ == nullptr) {
+      options.no_block_cache = true;
+    }
+    options.write_buffer_size = FLAGS_write_buffer_size;
+    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+    options.min_write_buffer_number_to_merge =
+      FLAGS_min_write_buffer_number_to_merge;
+    options.max_background_compactions = FLAGS_max_background_compactions;
+    options.compaction_style = FLAGS_compaction_style_e;
+    options.block_size = FLAGS_block_size;
+    options.filter_policy = filter_policy_;
+    options.prefix_extractor = FLAGS_use_prefix_blooms ? prefix_extractor_
+                                                       : nullptr;
+    options.max_open_files = FLAGS_open_files;
+    options.statistics = dbstats;
+    options.env = FLAGS_env;
+    options.disableDataSync = FLAGS_disable_data_sync;
+    options.use_fsync = FLAGS_use_fsync;
+    options.num_levels = FLAGS_num_levels;
+    options.target_file_size_base = FLAGS_target_file_size_base;
+    options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
+    options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
+    options.max_bytes_for_level_multiplier =
+        FLAGS_max_bytes_for_level_multiplier;
+    options.filter_deletes = FLAGS_filter_deletes;
+    // A prefix size must be given exactly when the prefix_hash memtable
+    // rep is selected (and only then).
+    if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kPrefixHash)) {
+      fprintf(stderr,
+            "prefix_size should be non-zero iff memtablerep == prefix_hash\n");
+      exit(1);
+    }
+    switch (FLAGS_rep_factory) {
+      case kPrefixHash:
+        options.memtable_factory.reset(NewHashSkipListRepFactory(
+            NewFixedPrefixTransform(FLAGS_prefix_size)));
+        break;
+      case kSkipList:
+        // no need to do anything
+        break;
+      case kVectorRep:
+        options.memtable_factory.reset(
+          new VectorRepFactory
+        );
+        break;
+    }
+    // Per-level fanouts, if given, must cover every level.
+    if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) {
+      if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() !=
+          (unsigned int)FLAGS_num_levels) {
+        fprintf(stderr, "Insufficient number of fanouts specified %d\n",
+                (int)FLAGS_max_bytes_for_level_multiplier_additional_v.size());
+        exit(1);
+      }
+      options.max_bytes_for_level_multiplier_additional =
+        FLAGS_max_bytes_for_level_multiplier_additional_v;
+    }
+    options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
+    options.level0_file_num_compaction_trigger =
+        FLAGS_level0_file_num_compaction_trigger;
+    options.level0_slowdown_writes_trigger =
+      FLAGS_level0_slowdown_writes_trigger;
+    options.compression = FLAGS_compression_type_e;
+    options.compression_opts.level = FLAGS_compression_level;
+    options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
+    options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
+    // Leave levels below min_level_to_compress uncompressed; compress the
+    // rest with the configured compression type.
+    if (FLAGS_min_level_to_compress >= 0) {
+      assert(FLAGS_min_level_to_compress <= FLAGS_num_levels);
+      options.compression_per_level.resize(FLAGS_num_levels);
+      for (int i = 0; i < FLAGS_min_level_to_compress; i++) {
+        options.compression_per_level[i] = kNoCompression;
+      }
+      for (int i = FLAGS_min_level_to_compress;
+           i < FLAGS_num_levels; i++) {
+        options.compression_per_level[i] = FLAGS_compression_type_e;
+      }
+    }
+    options.disable_seek_compaction = FLAGS_disable_seek_compaction;
+    options.delete_obsolete_files_period_micros =
+      FLAGS_delete_obsolete_files_period_micros;
+    options.soft_rate_limit = FLAGS_soft_rate_limit;
+    options.hard_rate_limit = FLAGS_hard_rate_limit;
+    options.rate_limit_delay_max_milliseconds =
+      FLAGS_rate_limit_delay_max_milliseconds;
+    options.table_cache_numshardbits = FLAGS_table_cache_numshardbits;
+    options.max_grandparent_overlap_factor =
+      FLAGS_max_grandparent_overlap_factor;
+    options.disable_auto_compactions = FLAGS_disable_auto_compactions;
+    options.source_compaction_factor = FLAGS_source_compaction_factor;
+
+    // fill storage options
+    options.allow_os_buffer = FLAGS_bufferedio;
+    options.allow_mmap_reads = FLAGS_mmap_read;
+    options.allow_mmap_writes = FLAGS_mmap_write;
+    options.advise_random_on_open = FLAGS_advise_random_on_open;
+    options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
+    options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;
+    options.bytes_per_sync = FLAGS_bytes_per_sync;
+
+    // merge operator options
+    options.merge_operator = MergeOperators::CreateFromStringId(
+        FLAGS_merge_operator);
+    if (options.merge_operator == nullptr && !FLAGS_merge_operator.empty()) {
+      fprintf(stderr, "invalid merge operator: %s\n",
+              FLAGS_merge_operator.c_str());
+      exit(1);
+    }
+    options.max_successive_merges = FLAGS_max_successive_merges;
+
+    // set universal style compaction configurations, if applicable
+    if (FLAGS_universal_size_ratio != 0) {
+      options.compaction_options_universal.size_ratio =
+        FLAGS_universal_size_ratio;
+    }
+    if (FLAGS_universal_min_merge_width != 0) {
+      options.compaction_options_universal.min_merge_width =
+        FLAGS_universal_min_merge_width;
+    }
+    if (FLAGS_universal_max_merge_width != 0) {
+      options.compaction_options_universal.max_merge_width =
+        FLAGS_universal_max_merge_width;
+    }
+    if (FLAGS_universal_max_size_amplification_percent != 0) {
+      options.compaction_options_universal.max_size_amplification_percent =
+        FLAGS_universal_max_size_amplification_percent;
+    }
+    if (FLAGS_universal_compression_size_percent != -1) {
+      options.compaction_options_universal.compression_size_percent =
+        FLAGS_universal_compression_size_percent;
+    }
+
+    Status s;
+    if(FLAGS_readonly) {
+      s = DB::OpenForReadOnly(options, FLAGS_db, &db_);
+    } else {
+      s = DB::Open(options, FLAGS_db, &db_);
+    }
+    if (!s.ok()) {
+      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+    // NOTE(review): `options` is a local that is discarded when this
+    // function returns, so this clear() appears to have no effect —
+    // confirm intent.
+    if (FLAGS_min_level_to_compress >= 0) {
+      options.compression_per_level.clear();
+    }
+  }
+
+  // Key-ordering policy used by DoWrite().
+  enum WriteMode {
+    RANDOM, SEQUENTIAL, UNIQUE_RANDOM
+  };
+
+  // "fillseq"/"fillbatch": write keys in sequential order.
+  void WriteSeq(ThreadState* thread) {
+    DoWrite(thread, SEQUENTIAL);
+  }
+
+  // "fillrandom"/"overwrite"/...: write keys in random order.
+  void WriteRandom(ThreadState* thread) {
+    DoWrite(thread, RANDOM);
+  }
+
+  // "filluniquerandom": write keys in random order, each key exactly once.
+  void WriteUniqueRandom(ThreadState* thread) {
+    DoWrite(thread, UNIQUE_RANDOM);
+  }
+
+  // Apply `batch` using the benchmark-wide write options; abort the
+  // process if the write fails.
+  void writeOrFail(WriteBatch& batch) {
+    const Status status = db_->Write(write_options_, &batch);
+    if (!status.ok()) {
+      fprintf(stderr, "put error: %s\n", status.ToString().c_str());
+      exit(1);
+    }
+  }
+
+  void WriteFromStdin(ThreadState* thread) {
+    size_t count = 0;
+    WriteBatch batch;
+    const size_t bufferLen = 32 << 20;
+    unique_ptr<char[]> line = unique_ptr<char[]>(new char[bufferLen]);
+    char* linep = line.get();
+    const int batchSize = 100 << 10;
+    const char columnSeparator = '\t';
+    const char lineSeparator = '\n';
+
+    while (fgets(linep, bufferLen, stdin) != nullptr) {
+      ++count;
+      char* tab = std::find(linep, linep + bufferLen, columnSeparator);
+      if (tab == linep + bufferLen) {
+        fprintf(stderr, "[Error] No Key delimiter TAB at line %ld\n", count);
+        continue;
+      }
+      Slice key(linep, tab - linep);
+      tab++;
+      char* endLine = std::find(tab, linep + bufferLen, lineSeparator);
+      if (endLine  == linep + bufferLen) {
+        fprintf(stderr, "[Error] No ENTER at end of line # %ld\n", count);
+        continue;
+      }
+      Slice value(tab, endLine - tab);
+      thread->stats.FinishedSingleOp(db_);
+      thread->stats.AddBytes(endLine - linep - 1);
+
+      if (batch.Count() < batchSize) {
+        batch.Put(key, value);
+        continue;
+      }
+      writeOrFail(batch);
+      batch.Clear();
+    }
+    if (batch.Count() > 0) {
+      writeOrFail(batch);
+    }
+  }
+
+  // Core write loop shared by the fill benchmarks: writes num_ops entries
+  // (or runs for FLAGS_duration in RANDOM mode) in batches of
+  // entries_per_batch_, choosing keys per `write_mode`.  UNIQUE_RANDOM
+  // tracks already-used keys in a bitset and probes forward then backward
+  // from a random start to find a free key.
+  void DoWrite(ThreadState* thread, WriteMode write_mode) {
+    const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
+    const int num_ops = writes_ == 0 ? num_ : writes_ ;
+    Duration duration(test_duration, num_ops);
+    unique_ptr<BitSet> bit_set;
+
+    // NOTE(review): the bitset is sized num_ops but probed with indices
+    // drawn from `% FLAGS_num`; this assumes num_ops covers the full key
+    // range in UNIQUE_RANDOM mode — confirm when writes_ < FLAGS_num.
+    if (write_mode == UNIQUE_RANDOM) {
+      bit_set.reset(new BitSet(num_ops));
+    }
+
+    if (num_ != FLAGS_num) {
+      char msg[100];
+      snprintf(msg, sizeof(msg), "(%lld ops)", num_);
+      thread->stats.AddMessage(msg);
+    }
+
+    RandomGenerator gen;
+    WriteBatch batch;
+    Status s;
+    int64_t bytes = 0;
+    int i = 0;
+    while (!duration.Done(entries_per_batch_)) {
+      batch.Clear();
+      for (int j = 0; j < entries_per_batch_; j++) {
+        long long k = 0;
+        switch(write_mode) {
+          case SEQUENTIAL:
+            k = i +j;
+            break;
+          case RANDOM:
+            k = thread->rand.Next() % FLAGS_num;
+            break;
+          case UNIQUE_RANDOM:
+            {
+              const long long t = thread->rand.Next() % FLAGS_num;
+              if (!bit_set->test(t)) {
+                // best case
+                k = t;
+              } else {
+                bool found = false;
+                // look forward
+                // (the `size_t i` below shadows the outer batch counter i)
+                for (size_t i = t + 1; i < bit_set->size(); ++i) {
+                  if (!bit_set->test(i)) {
+                    found = true;
+                    k = i;
+                    break;
+                  }
+                }
+                // nothing free above t: scan backward from t
+                if (!found) {
+                  for (size_t i = t; i-- > 0;) {
+                    if (!bit_set->test(i)) {
+                      found = true;
+                      k = i;
+                      break;
+                    }
+                  }
+                }
+              }
+              bit_set->set(k);
+              break;
+            }
+        };
+        unique_ptr<char []> key = GenerateKeyFromInt(k);
+        batch.Put(key.get(), gen.Generate(value_size_));
+        bytes += value_size_ + strlen(key.get());
+        thread->stats.FinishedSingleOp(db_);
+      }
+      s = db_->Write(write_options_, &batch);
+      if (!s.ok()) {
+        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      i += entries_per_batch_;
+    }
+    thread->stats.AddBytes(bytes);
+  }
+
+  void ReadSequential(ThreadState* thread) {
+    Iterator* iter = db_->NewIterator(ReadOptions(FLAGS_verify_checksum, true));
+    long long i = 0;
+    int64_t bytes = 0;
+    for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
+      bytes += iter->key().size() + iter->value().size();
+      thread->stats.FinishedSingleOp(db_);
+      ++i;
+    }
+    delete iter;
+    thread->stats.AddBytes(bytes);
+  }
+
+  void ReadReverse(ThreadState* thread) {
+    Iterator* iter = db_->NewIterator(ReadOptions(FLAGS_verify_checksum, true));
+    long long i = 0;
+    int64_t bytes = 0;
+    for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
+      bytes += iter->key().size() + iter->value().size();
+      thread->stats.FinishedSingleOp(db_);
+      ++i;
+    }
+    delete iter;
+    thread->stats.AddBytes(bytes);
+  }
+
+  // Calls MultiGet over `num_keys` keys drawn uniformly from [0, range),
+  // each generated with `suffix` appended.  Optionally wraps the call in a
+  // snapshot (FLAGS_use_snapshot).  Returns the number of keys found;
+  // misses are logged only when FLAGS_warn_missing_keys is set.
+  long MultiGetRandom(ReadOptions& options, int num_keys,
+                     Random64& rand, long long range, const char* suffix) {
+    assert(num_keys > 0);
+    std::vector<Slice> keys(num_keys);
+    std::vector<std::string> values(num_keys);
+    // gen_keys owns the key buffers that the Slices in `keys` point into.
+    std::vector<unique_ptr<char []> > gen_keys(num_keys);
+
+    int i;
+    long long k;
+
+    // Fill the keys vector
+    for(i=0; i<num_keys; ++i) {
+      k = rand.Next() % range;
+      gen_keys[i] = GenerateKeyFromInt(k,suffix);
+      keys[i] = gen_keys[i].get();
+    }
+
+    if (FLAGS_use_snapshot) {
+      options.snapshot = db_->GetSnapshot();
+    }
+
+    // Apply the operation
+    std::vector<Status> statuses = db_->MultiGet(options, keys, &values);
+    assert((long)statuses.size() == num_keys);
+    assert((long)keys.size() == num_keys);  // Should always be the case.
+    assert((long)values.size() == num_keys);
+
+    if (FLAGS_use_snapshot) {
+      db_->ReleaseSnapshot(options.snapshot);
+      options.snapshot = nullptr;
+    }
+
+    // Count number found
+    long found = 0;
+    for(i=0; i<num_keys; ++i) {
+      if (statuses[i].ok()){
+        ++found;
+      } else if (FLAGS_warn_missing_keys == true) {
+        // Key not found, or error.
+        fprintf(stderr, "get error: %s\n", statuses[i].ToString().c_str());
+      }
+    }
+
+    return found;
+  }
+
+  // "readrandom": look up random keys, in one of three modes — MultiGet
+  // groups (FLAGS_use_multiget), single Gets, or short iterator range
+  // scans of FLAGS_read_range keys.  Reports how many lookups hit.
+  void ReadRandom(ThreadState* thread) {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    Duration duration(FLAGS_duration, reads_);
+
+    long long found = 0;
+
+    if (FLAGS_use_multiget) {   // MultiGet
+      const long& kpg = FLAGS_keys_per_multiget;  // keys per multiget group
+      long keys_left = reads_;
+
+      // Recalculate number of keys per group, and call MultiGet until done.
+      // The comma expression computes num_keys before the Done() check.
+      long num_keys;
+      while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) {
+        found += MultiGetRandom(options, num_keys, thread->rand, FLAGS_num, "");
+        // Each MultiGet group counts as a single op in the stats.
+        thread->stats.FinishedSingleOp(db_);
+        keys_left -= num_keys;
+      }
+    } else {    // Regular case. Do one "get" at a time Get
+      // NOTE(review): the iterator is created once, before the per-op
+      // snapshot is taken below, so range reads do not observe that
+      // snapshot — confirm this is intended.
+      Iterator* iter = db_->NewIterator(options);
+      std::string value;
+      while (!duration.Done(1)) {
+        const long long k = thread->rand.Next() % FLAGS_num;
+        unique_ptr<char []> key = GenerateKeyFromInt(k);
+        if (FLAGS_use_snapshot) {
+          options.snapshot = db_->GetSnapshot();
+        }
+
+        if (FLAGS_read_range < 2) {
+          if (db_->Get(options, key.get(), &value).ok()) {
+            found++;
+          }
+        } else {
+          // Range mode: every scanned key counts toward `found`.
+          Slice skey(key.get());
+          int count = 1;
+
+          if (FLAGS_get_approx) {
+            unique_ptr<char []> key2 =
+                GenerateKeyFromInt(k + (int) FLAGS_read_range);
+            Slice skey2(key2.get());
+            Range range(skey, skey2);
+            uint64_t sizes;
+            db_->GetApproximateSizes(&range, 1, &sizes);
+          }
+
+          for (iter->Seek(skey);
+               iter->Valid() && count <= FLAGS_read_range;
+               ++count, iter->Next()) {
+            found++;
+          }
+        }
+
+        if (FLAGS_use_snapshot) {
+          db_->ReleaseSnapshot(options.snapshot);
+          options.snapshot = nullptr;
+        }
+
+        thread->stats.FinishedSingleOp(db_);
+      }
+
+      delete iter;
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%lld of %lld found)", found, reads_);
+    thread->stats.AddMessage(msg);
+  }
+
+  // "prefixscanrandom": pick a random key, derive its prefix via the
+  // configured prefix extractor, and iterate over every key sharing that
+  // prefix.  With FLAGS_use_prefix_api the prefix is passed to the
+  // iterator via ReadOptions (requires prefix blooms).
+  void PrefixScanRandom(ThreadState* thread) {
+    if (FLAGS_use_prefix_api) {
+      assert(FLAGS_use_prefix_blooms);
+      assert(FLAGS_bloom_bits >= 1);
+    }
+
+    ReadOptions options(FLAGS_verify_checksum, true);
+    Duration duration(FLAGS_duration, reads_);
+
+    long long found = 0;
+
+    while (!duration.Done(1)) {
+      std::string value;
+      const int k = thread->rand.Next() % FLAGS_num;
+      unique_ptr<char []> key = GenerateKeyFromInt(k);
+      Slice skey(key.get());
+      Slice prefix = prefix_extractor_->Transform(skey);
+      options.prefix = FLAGS_use_prefix_api ? &prefix : nullptr;
+
+      // A fresh iterator per op: `prefix` points into `key`, which is
+      // re-generated on every iteration.
+      Iterator* iter = db_->NewIterator(options);
+      for (iter->Seek(skey);
+           iter->Valid() && iter->key().starts_with(prefix);
+           iter->Next()) {
+        found++;
+      }
+      delete iter;
+
+      thread->stats.FinishedSingleOp(db_);
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%lld of %lld found)", found, reads_);
+    thread->stats.AddMessage(msg);
+  }
+
+  // "readmissing": look up keys that are guaranteed absent (every probe
+  // key gets a "." suffix that the fill benchmarks never write), and
+  // assert that nothing is ever found.
+  void ReadMissing(ThreadState* thread) {
+    FLAGS_warn_missing_keys = false;    // Never warn about missing keys
+
+    Duration duration(FLAGS_duration, reads_);
+    ReadOptions options(FLAGS_verify_checksum, true);
+
+    if (FLAGS_use_multiget) {
+      const long& kpg = FLAGS_keys_per_multiget;  // keys per multiget group
+      long keys_left = reads_;
+
+      // Recalculate number of keys per group, and call MultiGet until done
+      long num_keys;
+      long found;
+      while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) {
+        found = MultiGetRandom(options, num_keys, thread->rand, FLAGS_num, ".");
+
+        // We should not find any key since the key we try to get has a
+        // different suffix
+        if (found) {
+          assert(false);
+        }
+
+        thread->stats.FinishedSingleOp(db_);
+        keys_left -= num_keys;
+      }
+    } else {  // Regular case (not MultiGet)
+      std::string value;
+      Status s;
+      while (!duration.Done(1)) {
+        const long long k = thread->rand.Next() % FLAGS_num;
+        unique_ptr<char []> key = GenerateKeyFromInt(k, ".");
+        s = db_->Get(options, key.get(), &value);
+        assert(!s.ok() && s.IsNotFound());
+        thread->stats.FinishedSingleOp(db_);
+      }
+    }
+  }
+
+  // "readhot": like ReadRandom, but keys are confined to the first 1% of
+  // the key space to simulate a hot working set.
+  void ReadHot(ThreadState* thread) {
+    Duration duration(FLAGS_duration, reads_);
+    ReadOptions options(FLAGS_verify_checksum, true);
+    const long long range = (FLAGS_num + 99) / 100;
+    long long found = 0;
+
+    if (FLAGS_use_multiget) {
+      const long long kpg = FLAGS_keys_per_multiget;  // keys per multiget group
+      long long keys_left = reads_;
+
+      // Recalculate number of keys per group, and call MultiGet until done.
+      // NOTE(review): `long num_keys` narrows the `long long` result of
+      // std::min here — harmless for realistic kpg values, but inconsistent
+      // with the declarations above.
+      long num_keys;
+      while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) {
+        found += MultiGetRandom(options, num_keys, thread->rand, range, "");
+        thread->stats.FinishedSingleOp(db_);
+        keys_left -= num_keys;
+      }
+    } else {
+      std::string value;
+      while (!duration.Done(1)) {
+        const long long k = thread->rand.Next() % range;
+        unique_ptr<char []> key = GenerateKeyFromInt(k);
+        if (db_->Get(options, key.get(), &value).ok()){
+          ++found;
+        }
+        thread->stats.FinishedSingleOp(db_);
+      }
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%lld of %lld found)", found, reads_);
+    thread->stats.AddMessage(msg);
+  }
+
+  // "seekrandom": seek a fresh iterator to a random key and count it as
+  // found only when the iterator lands exactly on that key.
+  void SeekRandom(ThreadState* thread) {
+    Duration duration(FLAGS_duration, reads_);
+    ReadOptions options(FLAGS_verify_checksum, true);
+    std::string value;
+    long long found = 0;
+    while (!duration.Done(1)) {
+      Iterator* iter = db_->NewIterator(options);
+      const long long k = thread->rand.Next() % FLAGS_num;
+      unique_ptr<char []> key = GenerateKeyFromInt(k);
+      iter->Seek(key.get());
+      if (iter->Valid() && iter->key() == key.get()) found++;
+      delete iter;
+      thread->stats.FinishedSingleOp(db_);
+    }
+    char msg[100];
+    // NOTE(review): the denominator here is num_, whereas the other read
+    // benchmarks report reads_ — confirm which is intended.
+    snprintf(msg, sizeof(msg), "(%lld of %lld found)", found, num_);
+    thread->stats.AddMessage(msg);
+  }
+
+  void DoDelete(ThreadState* thread, bool seq) {
+    WriteBatch batch;
+    Status s;
+    Duration duration(seq ? 0 : FLAGS_duration, num_);
+    long i = 0;
+    while (!duration.Done(entries_per_batch_)) {
+      batch.Clear();
+      for (int j = 0; j < entries_per_batch_; j++) {
+        const long long k = seq ? i+j : (thread->rand.Next() % FLAGS_num);
+        unique_ptr<char []> key = GenerateKeyFromInt(k);
+        batch.Delete(key.get());
+        thread->stats.FinishedSingleOp(db_);
+      }
+      s = db_->Write(write_options_, &batch);
+      if (!s.ok()) {
+        fprintf(stderr, "del error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      ++i;
+    }
+  }
+
+  // "deleteseq": delete keys in sequential order.
+  void DeleteSeq(ThreadState* thread) {
+    DoDelete(thread, true);
+  }
+
+  // "deleterandom": delete keys at random positions.
+  void DeleteRandom(ThreadState* thread) {
+    DoDelete(thread, false);
+  }
+
+  // "readwhilewriting": thread 0 writes random keys continuously (rate
+  // limited by --writes_per_second) until every reader thread is done;
+  // all other threads run ReadRandom.  The writer's stats are excluded
+  // from the merged report.
+  void ReadWhileWriting(ThreadState* thread) {
+    if (thread->tid > 0) {
+      ReadRandom(thread);
+    } else {
+      // Special thread that keeps writing until other threads are done.
+      RandomGenerator gen;
+      double last = FLAGS_env->NowMicros();
+      int writes_per_second_by_10 = 0;
+      int num_writes = 0;
+
+      // --writes_per_second rate limit is enforced per 100 milliseconds
+      // intervals to avoid a burst of writes at the start of each second.
+
+      if (FLAGS_writes_per_second > 0)
+        writes_per_second_by_10 = FLAGS_writes_per_second / 10;
+
+      // Don't merge stats from this thread with the readers.
+      thread->stats.SetExcludeFromMerge();
+
+      while (true) {
+        {
+          MutexLock l(&thread->shared->mu);
+          // +1 accounts for this writer thread itself, which is still
+          // running and not yet counted in num_done.
+          if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
+            // Other threads have finished
+            break;
+          }
+        }
+
+        const long long k = thread->rand.Next() % FLAGS_num;
+        unique_ptr<char []> key = GenerateKeyFromInt(k);
+        Status s = db_->Put(write_options_, key.get(),
+                            gen.Generate(value_size_));
+        if (!s.ok()) {
+          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+        thread->stats.FinishedSingleOp(db_);
+
+        // After each tenth of a second's quota, sleep off whatever is
+        // left of the 100ms interval.
+        ++num_writes;
+        if (writes_per_second_by_10 && num_writes >= writes_per_second_by_10) {
+          double now = FLAGS_env->NowMicros();
+          double usecs_since_last = now - last;
+
+          num_writes = 0;
+          last = now;
+
+          if (usecs_since_last < 100000.0) {
+            FLAGS_env->SleepForMicroseconds(100000.0 - usecs_since_last);
+            last = FLAGS_env->NowMicros();
+          }
+        }
+      }
+    }
+  }
+
+  // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
+  // in DB atomically i.e in a single batch. Also refer GetMany.
+  Status PutMany(const WriteOptions& writeoptions,
+                  const Slice& key, const Slice& value) {
+    std::string suffixes[3] = {"2", "1", "0"};
+    std::string keys[3];
+
+    WriteBatch batch;
+    Status s;
+    for (int i = 0; i < 3; i++) {
+      keys[i] = key.ToString() + suffixes[i];
+      batch.Put(keys[i], value);
+    }
+
+    s = db_->Write(writeoptions, &batch);
+    return s;
+  }
+
+
+  // Given a key K, this deletes (K+"0", V), (K+"1", V), (K+"2", V)
+  // in DB atomically i.e in a single batch. Also refer GetMany.
+  Status DeleteMany(const WriteOptions& writeoptions,
+                  const Slice& key) {
+    std::string suffixes[3] = {"1", "2", "0"};
+    std::string keys[3];
+
+    WriteBatch batch;
+    Status s;
+    for (int i = 0; i < 3; i++) {
+      keys[i] = key.ToString() + suffixes[i];
+      batch.Delete(keys[i]);
+    }
+
+    s = db_->Write(writeoptions, &batch);
+    return s;
+  }
+
+  // Given a key K, read K+"0", K+"1" and K+"2" under a single snapshot and
+  // verify all three values are identical; a mismatch is logged but not
+  // fatal.  *value is left holding the last value read.  ASSUMES that
+  // PutMany was used to put (K, V) into the DB.
+  Status GetMany(const ReadOptions& readoptions,
+                  const Slice& key, std::string* value) {
+    std::string suffixes[3] = {"0", "1", "2"};
+    std::string keys[3];
+    Slice key_slices[3];
+    std::string values[3];
+    // Copy the options so all three Gets share one snapshot.
+    ReadOptions readoptionscopy = readoptions;
+    readoptionscopy.snapshot = db_->GetSnapshot();
+    Status s;
+    for (int i = 0; i < 3; i++) {
+      keys[i] = key.ToString() + suffixes[i];
+      key_slices[i] = keys[i];
+      s = db_->Get(readoptionscopy, key_slices[i], value);
+      if (!s.ok() && !s.IsNotFound()) {
+        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+        values[i] = "";
+        // we continue after error rather than exiting so that we can
+        // find more errors if any
+      } else if (s.IsNotFound()) {
+        values[i] = "";
+      } else {
+        values[i] = *value;
+      }
+    }
+    db_->ReleaseSnapshot(readoptionscopy.snapshot);
+
+    if ((values[0] != values[1]) || (values[1] != values[2])) {
+      fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n",
+              key.ToString().c_str(), values[0].c_str(), values[1].c_str(),
+              values[2].c_str());
+      // we continue after error rather than exiting so that we can
+      // find more errors if any
+    }
+
+    return s;
+  }
+
+  // Differs from readrandomwriterandom in the following ways:
+  // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs.
+  // (b) Does deletes as well (per FLAGS_deletepercent)
+  // (c) In order to achieve high % of 'found' during lookups, and to do
+  //     multiple writes (including puts and deletes), it uses up to
+  //     FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys.
+  // (d) Does not have a MultiGet option.
+  void RandomWithVerify(ThreadState* thread) {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    RandomGenerator gen;
+    std::string value;
+    long long found = 0;
+    int get_weight = 0;
+    int put_weight = 0;
+    int delete_weight = 0;
+    long long gets_done = 0;
+    long long puts_done = 0;
+    long long deletes_done = 0;
+
+    // the number of iterations is the larger of read_ or write_
+    for (long long i = 0; i < readwrites_; i++) {
+      const long long k = thread->rand.Next() % (FLAGS_numdistinct);
+      unique_ptr<char []> key = GenerateKeyFromInt(k);
+      if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
+        // one batch completed, reinitialize for next batch
+        get_weight = FLAGS_readwritepercent;
+        delete_weight = FLAGS_deletepercent;
+        put_weight = 100 - get_weight - delete_weight;
+      }
+      if (get_weight > 0) {
+        // do all the gets first
+        Status s = GetMany(options, key.get(), &value);
+        if (!s.ok() && !s.IsNotFound()) {
+          fprintf(stderr, "getmany error: %s\n", s.ToString().c_str());
+          // we continue after error rather than exiting so that we can
+          // find more errors if any
+        } else if (!s.IsNotFound()) {
+          found++;
+        }
+        get_weight--;
+        gets_done++;
+      } else if (put_weight > 0) {
+        // then do all the corresponding number of puts
+        // for all the gets we have done earlier
+        Status s = PutMany(write_options_, key.get(),
+                           gen.Generate(value_size_));
+        if (!s.ok()) {
+          fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+        put_weight--;
+        puts_done++;
+      } else if (delete_weight > 0) {
+        Status s = DeleteMany(write_options_, key.get());
+        if (!s.ok()) {
+          fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+        delete_weight--;
+        deletes_done++;
+      }
+
+      thread->stats.FinishedSingleOp(db_);
+    }
+    char msg[100];
+    snprintf(msg, sizeof(msg),
+             "( get:%lld put:%lld del:%lld total:%lld found:%lld)",
+             gets_done, puts_done, deletes_done, readwrites_, found);
+    thread->stats.AddMessage(msg);
+  }
+
+  // This is different from ReadWhileWriting because it does not use
+  // an extra thread.
+  void ReadRandomWriteRandom(ThreadState* thread) {
+    if (FLAGS_use_multiget){
+      // Separate function for multiget (for ease of reading)
+      ReadRandomWriteRandomMultiGet(thread);
+      return;
+    }
+
+    ReadOptions options(FLAGS_verify_checksum, true);
+    RandomGenerator gen;
+    std::string value;
+    long long found = 0;
+    int get_weight = 0;
+    int put_weight = 0;
+    long long reads_done = 0;
+    long long writes_done = 0;
+    Duration duration(FLAGS_duration, readwrites_);
+
+    // the number of iterations is the larger of read_ or write_
+    while (!duration.Done(1)) {
+      const long long k = thread->rand.Next() % FLAGS_num;
+      unique_ptr<char []> key = GenerateKeyFromInt(k);
+      if (get_weight == 0 && put_weight == 0) {
+        // one batch completed, reinitialize for next batch
+        get_weight = FLAGS_readwritepercent;
+        put_weight = 100 - get_weight;
+      }
+      if (get_weight > 0) {
+
+        if (FLAGS_use_snapshot) {
+          options.snapshot = db_->GetSnapshot();
+        }
+
+        if (FLAGS_get_approx) {
+          char key2[100];
+          snprintf(key2, sizeof(key2), "%016lld", k + 1);
+          Slice skey2(key2);
+          Slice skey(key2);
+          Range range(skey, skey2);
+          uint64_t sizes;
+          db_->GetApproximateSizes(&range, 1, &sizes);
+        }
+
+        // do all the gets first
+        Status s = db_->Get(options, key.get(), &value);
+        if (!s.ok() && !s.IsNotFound()) {
+          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+          // we continue after error rather than exiting so that we can
+          // find more errors if any
+        } else if (!s.IsNotFound()) {
+          found++;
+        }
+
+        get_weight--;
+        reads_done++;
+
+        if (FLAGS_use_snapshot) {
+          db_->ReleaseSnapshot(options.snapshot);
+        }
+
+      } else  if (put_weight > 0) {
+        // then do all the corresponding number of puts
+        // for all the gets we have done earlier
+        Status s = db_->Put(write_options_, key.get(),
+                            gen.Generate(value_size_));
+        if (!s.ok()) {
+          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+        put_weight--;
+        writes_done++;
+      }
+      thread->stats.FinishedSingleOp(db_);
+    }
+    char msg[100];
+    snprintf(msg, sizeof(msg),
+             "( reads:%lld writes:%lld total:%lld found:%lld)",
+             reads_done, writes_done, readwrites_, found);
+    thread->stats.AddMessage(msg);
+  }
+
+  // ReadRandomWriteRandom (with multiget)
+  // Does FLAGS_keys_per_multiget reads (per multiget), followed by some puts.
+  // FLAGS_readwritepercent will specify the ratio of gets to puts.
+  // e.g.: If FLAGS_keys_per_multiget == 100 and FLAGS_readwritepercent == 75
+  // Then each block will do 100 multigets and 33 puts
+  // So there are 133 operations in-total: 100 of them (75%) are gets, and 33
+  // of them (25%) are puts.
+  void ReadRandomWriteRandomMultiGet(ThreadState* thread) {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    RandomGenerator gen;
+
+    // For multiget
+    const long& kpg = FLAGS_keys_per_multiget;  // keys per multiget group
+
+    long keys_left = readwrites_;  // number of keys still left to read
+    long num_keys;                  // number of keys to read in current group
+    long num_put_keys;              // number of keys to put in current group
+
+    long found = 0;
+    long reads_done = 0;
+    long writes_done = 0;
+    long multigets_done = 0;
+
+    // the number of iterations is the larger of read_ or write_
+    Duration duration(FLAGS_duration, readwrites_);
+    while(true) {
+      // Read num_keys keys, then write num_put_keys keys.
+      // The ratio of num_keys to num_put_keys is always FLAGS_readwritepercent
+      // And num_keys is set to be FLAGS_keys_per_multiget (kpg)
+      // num_put_keys is calculated accordingly (to maintain the ratio)
+      // Note: On the final iteration, num_keys and num_put_keys will be smaller
+      num_keys = std::min(keys_left*(FLAGS_readwritepercent + 99)/100, kpg);
+      num_put_keys = num_keys * (100-FLAGS_readwritepercent)
+                     / FLAGS_readwritepercent;
+
+      // This will break the loop when duration is complete
+      if (duration.Done(num_keys + num_put_keys)) {
+        break;
+      }
+
+      // A quick check to make sure our formula doesn't break on edge cases
+      assert(num_keys >= 1);
+      assert(num_keys + num_put_keys <= keys_left);
+
+      // Apply the MultiGet operations
+      found += MultiGetRandom(options, num_keys, thread->rand, FLAGS_num, "");
+      ++multigets_done;
+      reads_done+=num_keys;
+      thread->stats.FinishedSingleOp(db_);
+
+      // Now do the puts
+      int i;
+      long long k;
+      for(i=0; i<num_put_keys; ++i) {
+        k = thread->rand.Next() % FLAGS_num;
+        unique_ptr<char []> key = GenerateKeyFromInt(k);
+        Status s = db_->Put(write_options_, key.get(),
+                            gen.Generate(value_size_));
+        if (!s.ok()) {
+          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+        writes_done++;
+        thread->stats.FinishedSingleOp(db_);
+      }
+
+      keys_left -= (num_keys + num_put_keys);
+    }
+    char msg[100];
+    snprintf(msg, sizeof(msg),
+             "( reads:%ld writes:%ld total:%lld multiget_ops:%ld found:%ld)",
+             reads_done, writes_done, readwrites_, multigets_done, found);
+    thread->stats.AddMessage(msg);
+  }
+
+  //
+  // Read-modify-write for random keys
+  void UpdateRandom(ThreadState* thread) {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    RandomGenerator gen;
+    std::string value;
+    long long found = 0;
+    Duration duration(FLAGS_duration, readwrites_);
+
+    // the number of iterations is the larger of read_ or write_
+    while (!duration.Done(1)) {
+      const long long k = thread->rand.Next() % FLAGS_num;
+      unique_ptr<char []> key = GenerateKeyFromInt(k);
+
+      if (FLAGS_use_snapshot) {
+        options.snapshot = db_->GetSnapshot();
+      }
+
+      if (FLAGS_get_approx) {
+        char key2[100];
+        snprintf(key2, sizeof(key2), "%016lld", k + 1);
+        Slice skey2(key2);
+        Slice skey(key2);
+        Range range(skey, skey2);
+        uint64_t sizes;
+        db_->GetApproximateSizes(&range, 1, &sizes);
+      }
+
+      if (db_->Get(options, key.get(), &value).ok()) {
+        found++;
+      }
+
+      if (FLAGS_use_snapshot) {
+        db_->ReleaseSnapshot(options.snapshot);
+      }
+
+      Status s = db_->Put(write_options_, key.get(), gen.Generate(value_size_));
+      if (!s.ok()) {
+        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      thread->stats.FinishedSingleOp(db_);
+    }
+    char msg[100];
+    snprintf(msg, sizeof(msg),
+             "( updates:%lld found:%lld)", readwrites_, found);
+    thread->stats.AddMessage(msg);
+  }
+
+  // Read-modify-write for random keys.
+  // Each operation causes the key grow by value_size (simulating an append).
+  // Generally used for benchmarking against merges of similar type
+  void AppendRandom(ThreadState* thread) {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    RandomGenerator gen;
+    std::string value;
+    long found = 0;
+
+    // The number of iterations is the larger of read_ or write_
+    Duration duration(FLAGS_duration, readwrites_);
+    while (!duration.Done(1)) {
+      const long long k = thread->rand.Next() % FLAGS_num;
+      unique_ptr<char []> key = GenerateKeyFromInt(k);
+
+      if (FLAGS_use_snapshot) {
+        options.snapshot = db_->GetSnapshot();
+      }
+
+      if (FLAGS_get_approx) {
+        char key2[100];
+        snprintf(key2, sizeof(key2), "%016lld", k + 1);
+        Slice skey2(key2);
+        Slice skey(key2);
+        Range range(skey, skey2);
+        uint64_t sizes;
+        db_->GetApproximateSizes(&range, 1, &sizes);
+      }
+
+      // Get the existing value
+      if (db_->Get(options, key.get(), &value).ok()) {
+        found++;
+      } else {
+        // If not existing, then just assume an empty string of data
+        value.clear();
+      }
+
+      if (FLAGS_use_snapshot) {
+        db_->ReleaseSnapshot(options.snapshot);
+      }
+
+      // Update the value (by appending data)
+      Slice operand = gen.Generate(value_size_);
+      if (value.size() > 0) {
+        // Use a delimiter to match the semantics for StringAppendOperator
+        value.append(1,',');
+      }
+      value.append(operand.data(), operand.size());
+
+      // Write back to the database
+      Status s = db_->Put(write_options_, key.get(), value);
+      if (!s.ok()) {
+        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      thread->stats.FinishedSingleOp(db_);
+    }
+    char msg[100];
+    snprintf(msg, sizeof(msg), "( updates:%lld found:%ld)", readwrites_, found);
+    thread->stats.AddMessage(msg);
+  }
+
+  // Read-modify-write for random keys (using MergeOperator)
+  // The merge operator to use should be defined by FLAGS_merge_operator
+  // Adjust FLAGS_value_size so that the keys are reasonable for this operator
+  // Assumes that the merge operator is non-null (i.e.: is well-defined)
+  //
+  // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8
+  // to simulate random additions over 64-bit integers using merge.
+  //
+  // The number of merges on the same key can be controlled by adjusting
+  // FLAGS_merge_keys.
+  void MergeRandom(ThreadState* thread) {
+    RandomGenerator gen;
+
+    // The number of iterations is the larger of read_ or write_
+    Duration duration(FLAGS_duration, readwrites_);
+    while (!duration.Done(1)) {
+      const long long k = thread->rand.Next() % merge_keys_;
+      unique_ptr<char []> key = GenerateKeyFromInt(k);
+
+      Status s = db_->Merge(write_options_, key.get(),
+                            gen.Generate(value_size_));
+
+      if (!s.ok()) {
+        fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      thread->stats.FinishedSingleOp(db_);
+    }
+
+    // Print some statistics
+    char msg[100];
+    snprintf(msg, sizeof(msg), "( updates:%lld)", readwrites_);
+    thread->stats.AddMessage(msg);
+  }
+
+  // Read and merge random keys. The amount of reads and merges are controlled
+  // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct
+  // keys (and thus also the number of reads and merges on the same key) can be
+  // adjusted with FLAGS_merge_keys.
+  //
+  // As with MergeRandom, the merge operator to use should be defined by
+  // FLAGS_merge_operator.
+  void ReadRandomMergeRandom(ThreadState* thread) {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    RandomGenerator gen;
+    std::string value;
+    long long num_hits = 0;
+    long long num_gets = 0;
+    long long num_merges = 0;
+    size_t max_length = 0;
+
+    // the number of iterations is the larger of read_ or write_
+    Duration duration(FLAGS_duration, readwrites_);
+
+    while (!duration.Done(1)) {
+      const long long k = thread->rand.Next() % merge_keys_;
+      unique_ptr<char []> key = GenerateKeyFromInt(k);
+
+      bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;
+
+      if (do_merge) {
+        Status s = db_->Merge(write_options_, key.get(),
+                              gen.Generate(value_size_));
+        if (!s.ok()) {
+          fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+
+        num_merges++;
+
+      } else {
+        Status s = db_->Get(options, key.get(), &value);
+        if (value.length() > max_length)
+          max_length = value.length();
+
+        if (!s.ok() && !s.IsNotFound()) {
+          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+          // we continue after error rather than exiting so that we can
+          // find more errors if any
+        } else if (!s.IsNotFound()) {
+          num_hits++;
+        }
+
+        num_gets++;
+
+      }
+
+      thread->stats.FinishedSingleOp(db_);
+    }
+    char msg[100];
+    snprintf(msg, sizeof(msg),
+             "(reads:%lld merges:%lld total:%lld hits:%lld maxlength:%zu)",
+             num_gets, num_merges, readwrites_, num_hits, max_length);
+    thread->stats.AddMessage(msg);
+  }
+
+
+  void Compact(ThreadState* thread) {
+    db_->CompactRange(nullptr, nullptr);
+  }
+
+  void PrintStats(const char* key) {
+    std::string stats;
+    if (!db_->GetProperty(key, &stats)) {
+      stats = "(failed)";
+    }
+    fprintf(stdout, "\n%s\n", stats.c_str());
+  }
+
+  static void WriteToFile(void* arg, const char* buf, int n) {
+    reinterpret_cast<WritableFile*>(arg)->Append(Slice(buf, n));
+  }
+
+  void HeapProfile() {
+    char fname[100];
+    EnvOptions soptions;
+    snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db.c_str(),
+             ++heap_counter_);
+    unique_ptr<WritableFile> file;
+    Status s = FLAGS_env->NewWritableFile(fname, &file, soptions);
+    if (!s.ok()) {
+      fprintf(stderr, "%s\n", s.ToString().c_str());
+      return;
+    }
+    bool ok = port::GetHeapProfile(WriteToFile, file.get());
+    if (!ok) {
+      fprintf(stderr, "heap profiling not supported\n");
+      FLAGS_env->DeleteFile(fname);
+    }
+  }
+};
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  rocksdb::InstallStackTraceHandler();
+  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                          " [OPTIONS]...");
+  google::ParseCommandLineFlags(&argc, &argv, true);
+
+  FLAGS_compaction_style_e = (rocksdb::CompactionStyle) FLAGS_compaction_style;
+  if (FLAGS_statistics) {
+    dbstats = rocksdb::CreateDBStatistics();
+  }
+
+  std::vector<std::string> fanout =
+    rocksdb::stringSplit(FLAGS_max_bytes_for_level_multiplier_additional, ',');
+  for (unsigned int j= 0; j < fanout.size(); j++) {
+    FLAGS_max_bytes_for_level_multiplier_additional_v.push_back(
+      std::stoi(fanout[j]));
+  }
+
+  FLAGS_compression_type_e =
+    StringToCompressionType(FLAGS_compression_type.c_str());
+
+  if (!FLAGS_hdfs.empty()) {
+    FLAGS_env  = new rocksdb::HdfsEnv(FLAGS_hdfs);
+  }
+
+  if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE"))
+    FLAGS_compaction_fadvice_e = rocksdb::Options::NONE;
+  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL"))
+    FLAGS_compaction_fadvice_e = rocksdb::Options::NORMAL;
+  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL"))
+    FLAGS_compaction_fadvice_e = rocksdb::Options::SEQUENTIAL;
+  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED"))
+    FLAGS_compaction_fadvice_e = rocksdb::Options::WILLNEED;
+  else {
+    fprintf(stdout, "Unknown compaction fadvice:%s\n",
+            FLAGS_compaction_fadvice.c_str());
+  }
+
+  FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
+
+  // The number of background threads should be at least as large as the
+  // max number of concurrent compactions.
+  FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions);
+  // Choose a location for the test database if none given with --db=<path>
+  if (FLAGS_db.empty()) {
+    std::string default_db_path;
+    rocksdb::Env::Default()->GetTestDirectory(&default_db_path);
+    default_db_path += "/dbbench";
+    FLAGS_db = default_db_path;
+  }
+
+  rocksdb::Benchmark benchmark;
+  benchmark.Run();
+  return 0;
+}
diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc
new file mode 100644 (file)
index 0000000..04d6d0e
--- /dev/null
@@ -0,0 +1,114 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <algorithm>
+#include <string>
+#include <stdint.h>
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+
+namespace rocksdb {
+
+Status DBImpl::DisableFileDeletions() {
+  MutexLock l(&mutex_);
+  ++disable_delete_obsolete_files_;
+  if (disable_delete_obsolete_files_ == 1) {
+    // if not, it has already been disabled, so don't log anything
+    Log(options_.info_log, "File Deletions Disabled");
+  }
+  return Status::OK();
+}
+
+Status DBImpl::EnableFileDeletions(bool force) {
+  DeletionState deletion_state;
+  bool should_purge_files = false;
+  {
+    MutexLock l(&mutex_);
+    if (force) {
+      // if force, we need to enable file deletions right away
+      disable_delete_obsolete_files_ = 0;
+    } else if (disable_delete_obsolete_files_ > 0) {
+      --disable_delete_obsolete_files_;
+    }
+    if (disable_delete_obsolete_files_ == 0)  {
+      Log(options_.info_log, "File Deletions Enabled");
+      should_purge_files = true;
+      FindObsoleteFiles(deletion_state, true);
+    }
+  }
+  if (should_purge_files)  {
+    PurgeObsoleteFiles(deletion_state);
+  }
+  LogFlush(options_.info_log);
+  return Status::OK();
+}
+
+Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
+                            uint64_t* manifest_file_size,
+                            bool flush_memtable) {
+
+  *manifest_file_size = 0;
+
+  if (flush_memtable) {
+    // flush all dirty data to disk.
+    Status status =  Flush(FlushOptions());
+    if (!status.ok()) {
+      Log(options_.info_log, "Cannot Flush data %s\n",
+          status.ToString().c_str());
+      return status;
+    }
+  }
+
+  MutexLock l(&mutex_);
+
+  // Make a set of all of the live *.sst files
+  std::set<uint64_t> live;
+  versions_->current()->AddLiveFiles(&live);
+
+  ret.clear();
+  ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST
+
+  // create names of the live files. The names are not absolute
+  // paths; instead they are relative to dbname_.
+  for (auto live_file : live) {
+    ret.push_back(TableFileName("", live_file));
+  }
+
+  ret.push_back(CurrentFileName(""));
+  ret.push_back(DescriptorFileName("", versions_->ManifestFileNumber()));
+
+  // find length of manifest file while holding the mutex lock
+  *manifest_file_size = versions_->ManifestFileSize();
+
+  return Status::OK();
+}
+
+Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
+  // First get sorted files in archive dir, then append sorted files from main
+  // dir to maintain sorted order
+
+  // list wal files in archive dir.
+  Status s;
+  std::string archivedir = ArchivalDirectory(options_.wal_dir);
+  if (env_->FileExists(archivedir)) {
+    s = AppendSortedWalsOfType(archivedir, files, kArchivedLogFile);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  // list wal files in main db dir.
+  return AppendSortedWalsOfType(options_.wal_dir, files, kAliveLogFile);
+}
+
+}
diff --git a/db/db_impl.cc b/db/db_impl.cc
new file mode 100644 (file)
index 0000000..e84817b
--- /dev/null
@@ -0,0 +1,3960 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+
+#include <algorithm>
+#include <climits>
+#include <cstdio>
+#include <set>
+#include <stdexcept>
+#include <stdint.h>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/dbformat.h"
+#include "db/db_iter.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/memtablelist.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/prefix_filter_iterator.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/transaction_log_impl.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "port/port.h"
+#include "table/block.h"
+#include "table/block_based_table_factory.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/auto_roll_logger.h"
+#include "util/build_version.h"
+#include "util/coding.h"
+#include "util/hash_skiplist_rep.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/perf_context_imp.h"
+#include "util/stop_watch.h"
+#include "util/autovector.h"
+
+namespace rocksdb {
+
+void dumpLeveldbBuildVersion(Logger * log);
+
+// Information kept for every waiting writer
+struct DBImpl::Writer {
+  Status status;
+  WriteBatch* batch;
+  bool sync;
+  bool disableWAL;
+  bool done;
+  port::CondVar cv;
+
+  explicit Writer(port::Mutex* mu) : cv(mu) { }
+};
+
+struct DBImpl::CompactionState {
+  Compaction* const compaction;
+
+  // If there were two snapshots with seq numbers s1 and
+  // s2 and s1 < s2, and if we find two instances of a key k1 that lies
+  // entirely within s1 and s2, then the earlier version of k1 can be safely
+  // deleted because that version is not visible in any snapshot.
+  std::vector<SequenceNumber> existing_snapshots;
+
+  // Files produced by compaction
+  struct Output {
+    uint64_t number;
+    uint64_t file_size;
+    InternalKey smallest, largest;
+    SequenceNumber smallest_seqno, largest_seqno;
+  };
+  std::vector<Output> outputs;
+  std::list<uint64_t> allocated_file_numbers;
+
+  // State kept for output being generated
+  unique_ptr<WritableFile> outfile;
+  unique_ptr<TableBuilder> builder;
+
+  uint64_t total_bytes;
+
+  Output* current_output() { return &outputs[outputs.size()-1]; }
+
+  explicit CompactionState(Compaction* c)
+      : compaction(c),
+        total_bytes(0) {
+  }
+
+  // Create a client visible context of this compaction
+  CompactionFilter::Context GetFilterContext() {
+    CompactionFilter::Context context;
+    context.is_full_compaction = compaction->IsFullCompaction();
+    return context;
+  }
+};
+
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
+Options SanitizeOptions(const std::string& dbname,
+                        const InternalKeyComparator* icmp,
+                        const InternalFilterPolicy* ipolicy,
+                        const Options& src) {
+  Options result = src;
+  result.comparator = icmp;
+  result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr;
+  ClipToRange(&result.max_open_files,            20,     1000000);
+  ClipToRange(&result.write_buffer_size,         ((size_t)64)<<10,
+                                                 ((size_t)64)<<30);
+  ClipToRange(&result.block_size,                1<<10,  4<<20);
+
+  // if the user sets arena_block_size, we trust that value. Otherwise,
+  // calculate a proper value from write_buffer_size.
+  if (result.arena_block_size <= 0) {
+    result.arena_block_size = result.write_buffer_size / 10;
+  }
+
+  result.min_write_buffer_number_to_merge = std::min(
+    result.min_write_buffer_number_to_merge, result.max_write_buffer_number-1);
+  if (result.info_log == nullptr) {
+    Status s = CreateLoggerFromOptions(dbname, result.db_log_dir, src.env,
+                                       result, &result.info_log);
+    if (!s.ok()) {
+      // No place suitable for logging
+      result.info_log = nullptr;
+    }
+  }
+  if (result.block_cache == nullptr && !result.no_block_cache) {
+    result.block_cache = NewLRUCache(8 << 20);
+  }
+  result.compression_per_level = src.compression_per_level;
+  if (result.block_size_deviation < 0 || result.block_size_deviation > 100) {
+    result.block_size_deviation = 0;
+  }
+  if (result.max_mem_compaction_level >= result.num_levels) {
+    result.max_mem_compaction_level = result.num_levels - 1;
+  }
+  if (result.soft_rate_limit > result.hard_rate_limit) {
+    result.soft_rate_limit = result.hard_rate_limit;
+  }
+  if (result.compaction_filter) {
+    Log(result.info_log, "Compaction filter specified, ignore factory");
+  }
+  if (result.prefix_extractor) {
+    // If a prefix extractor has been supplied and a HashSkipListRepFactory is
+    // being used, make sure that the latter uses the former as its transform
+    // function.
+    auto factory = dynamic_cast<HashSkipListRepFactory*>(
+      result.memtable_factory.get());
+    if (factory &&
+        factory->GetTransform() != result.prefix_extractor) {
+      Log(result.info_log, "A prefix hash representation factory was supplied "
+          "whose prefix extractor does not match options.prefix_extractor. "
+          "Falling back to skip list representation factory");
+      result.memtable_factory = std::make_shared<SkipListFactory>();
+    } else if (factory) {
+      Log(result.info_log, "Prefix hash memtable rep is in use.");
+    }
+  }
+
+  if (result.wal_dir.empty()) {
+    // Use dbname as default
+    result.wal_dir = dbname;
+  }
+
+  // -- Sanitize the table properties collector
+  // All user defined properties collectors will be wrapped by
+  // UserKeyTablePropertiesCollector since for them they only have the
+  // knowledge of the user keys; internal keys are invisible to them.
+  auto& collectors = result.table_properties_collectors;
+  for (size_t i = 0; i < result.table_properties_collectors.size(); ++i) {
+    assert(collectors[i]);
+    collectors[i] =
+      std::make_shared<UserKeyTablePropertiesCollector>(collectors[i]);
+  }
+
+  // Add collector to collect internal key statistics
+  collectors.push_back(
+      std::make_shared<InternalKeyPropertiesCollector>()
+  );
+
+  return result;
+}
+
+CompressionType GetCompressionType(const Options& options, int level,
+                                   const bool enable_compression) {
+  if (!enable_compression) {
+    // disable compression
+    return kNoCompression;
+  }
+  // If the user has specified a different compression level for each level,
+  // then pick the compression for that level.
+  if (!options.compression_per_level.empty()) {
+    const int n = options.compression_per_level.size() - 1;
+    // It is possible for level to be -1; in that case, we use level
+    // 0's compression.  This occurs mostly in backwards compatibility
+    // situations when the builder doesn't know what level the file
+    // belongs to.  Likewise, if level is beyond the end of the
+    // specified compression levels, use the last value.
+    return options.compression_per_level[std::max(0, std::min(level, n))];
+  } else {
+    return options.compression;
+  }
+}
+
+CompressionType GetCompressionFlush(const Options& options) {
+  // Compressing memtable flushes might not help unless the sequential load
+  // optimization is used for leveled compaction. Otherwise the CPU and
+  // latency overhead is not offset by saving much space.
+
+  bool can_compress;
+
+  if  (options.compaction_style == kCompactionStyleUniversal) {
+    can_compress =
+        (options.compaction_options_universal.compression_size_percent < 0);
+  } else {
+    // For leveled compress when min_level_to_compress == 0.
+    can_compress = (GetCompressionType(options, 0, true) != kNoCompression);
+  }
+
+  if (can_compress) {
+    return options.compression;
+  } else {
+    return kNoCompression;
+  }
+}
+
+// Construct the DBImpl. Sanitizes the user-supplied Options, creates the
+// first (empty) memtable, sizes the table cache, and builds the VersionSet.
+// No on-disk state is loaded here; that happens later in Recover().
+DBImpl::DBImpl(const Options& options, const std::string& dbname)
+    : env_(options.env),
+      dbname_(dbname),
+      internal_comparator_(options.comparator),
+      // NOTE(review): SanitizeOptions receives the addresses of
+      // internal_comparator_ / internal_filter_policy_. Members are
+      // initialized in declaration order (not this list's order), so this
+      // is safe only as long as nothing dereferences those pointers before
+      // the members are constructed -- confirm against db_impl.h.
+      options_(SanitizeOptions(dbname, &internal_comparator_,
+                               &internal_filter_policy_, options)),
+      internal_filter_policy_(options.filter_policy),
+      // If SanitizeOptions substituted its own info log, we own it.
+      owns_info_log_(options_.info_log != options.info_log),
+      db_lock_(nullptr),
+      mutex_(options.use_adaptive_mutex),
+      shutting_down_(nullptr),
+      bg_cv_(&mutex_),
+      mem_rep_factory_(options_.memtable_factory.get()),
+      mem_(new MemTable(internal_comparator_, options_)),
+      logfile_number_(0),
+      super_version_(nullptr),
+      tmp_batch_(),
+      bg_compaction_scheduled_(0),
+      bg_manual_only_(0),
+      bg_flush_scheduled_(0),
+      bg_logstats_scheduled_(false),
+      manual_compaction_(nullptr),
+      logger_(nullptr),
+      disable_delete_obsolete_files_(0),
+      delete_obsolete_files_last_run_(options.env->NowMicros()),
+      purge_wal_files_last_run_(0),
+      last_stats_dump_time_microsec_(0),
+      default_interval_to_delete_obsolete_WAL_(600),
+      stall_level0_slowdown_(0),
+      stall_memtable_compaction_(0),
+      stall_level0_num_files_(0),
+      stall_level0_slowdown_count_(0),
+      stall_memtable_compaction_count_(0),
+      stall_level0_num_files_count_(0),
+      started_at_(options.env->NowMicros()),
+      flush_on_destroy_(false),
+      stats_(options.num_levels),
+      delayed_writes_(0),
+      storage_options_(options),
+      bg_work_gate_closed_(false),
+      refitting_level_(false) {
+
+  mem_->Ref();
+
+  env_->GetAbsolutePath(dbname, &db_absolute_path_);
+
+  // Per-level write-stall accounting (accumulated time and event counts).
+  stall_leveln_slowdown_.resize(options.num_levels);
+  stall_leveln_slowdown_count_.resize(options.num_levels);
+  for (int i = 0; i < options.num_levels; ++i) {
+    stall_leveln_slowdown_[i] = 0;
+    stall_leveln_slowdown_count_[i] = 0;
+  }
+
+  // Reserve ten files or so for other uses and give the rest to TableCache.
+  const int table_cache_size = options_.max_open_files - 10;
+  table_cache_.reset(new TableCache(dbname_, &options_,
+                                    storage_options_, table_cache_size));
+
+  versions_.reset(new VersionSet(dbname_, &options_, storage_options_,
+                                 table_cache_.get(), &internal_comparator_));
+
+  dumpLeveldbBuildVersion(options_.info_log.get());
+  options_.Dump(options_.info_log.get());
+
+  // Cache the host name (used in log statistics); fall back to "localhost".
+  char name[100];
+  Status st = env_->GetHostName(name, 100L);
+  if (st.ok()) {
+    host_name_ = name;
+  } else {
+    Log(options_.info_log, "Can't get hostname, use localhost as host name.");
+    host_name_ = "localhost";
+  }
+  last_log_ts = 0;
+
+  LogFlush(options_.info_log);
+}
+
+// Tear down the DB: optionally flush the active memtable, signal shutdown,
+// wait for background work to drain, then release the cached SuperVersion,
+// the file lock, and all memtables.
+DBImpl::~DBImpl() {
+  std::vector<MemTable*> to_delete;
+  to_delete.reserve(options_.max_write_buffer_number);
+
+  // Flush first (before taking the mutex) if requested and the active
+  // memtable actually contains data.
+  if (flush_on_destroy_ && mem_->GetFirstSequenceNumber() != 0) {
+    FlushMemTable(FlushOptions());
+  }
+  mutex_.Lock();
+  shutting_down_.Release_Store(this);  // Any non-nullptr value is ok
+  // Wait until every scheduled background task has finished.
+  while (bg_compaction_scheduled_ ||
+         bg_flush_scheduled_ ||
+         bg_logstats_scheduled_) {
+    bg_cv_.Wait();
+  }
+  if (super_version_ != nullptr) {
+    // We must hold the last reference here; Cleanup() releases the
+    // memtables and version the SuperVersion pins.
+    bool is_last_reference __attribute__((unused));
+    is_last_reference = super_version_->Unref();
+    assert(is_last_reference);
+    super_version_->Cleanup();
+    delete super_version_;
+  }
+  mutex_.Unlock();
+
+  if (db_lock_ != nullptr) {
+    env_->UnlockFile(db_lock_);
+  }
+
+  // MemTable::Unref() returns the memtable pointer once its refcount hits
+  // zero (nullptr otherwise); deleting nullptr is a no-op.
+  if (mem_ != nullptr) {
+    delete mem_->Unref();
+  }
+
+  imm_.UnrefAll(&to_delete);
+  for (MemTable* m: to_delete) {
+    delete m;
+  }
+  LogFlush(options_.info_log);
+}
+
+// Do not flush and close database elegantly. Simulate a crash.
+// Test-only: leaves the memtables and versions un-flushed, but still waits
+// out in-flight background work and releases the file lock.
+void DBImpl::TEST_Destroy_DBImpl() {
+  // ensure that no new memtable flushes can occur
+  flush_on_destroy_ = false;
+
+  // wait till all background compactions are done.
+  mutex_.Lock();
+  while (bg_compaction_scheduled_ ||
+         bg_flush_scheduled_ ||
+         bg_logstats_scheduled_) {
+    bg_cv_.Wait();
+  }
+  // Drop the cached SuperVersion; we must hold the last reference.
+  if (super_version_ != nullptr) {
+    bool is_last_reference __attribute__((unused));
+    is_last_reference = super_version_->Unref();
+    assert(is_last_reference);
+    super_version_->Cleanup();
+    delete super_version_;
+  }
+
+  // Prevent new compactions from occurring: close the gate and make the
+  // scheduled-count look permanently busy.
+  bg_work_gate_closed_ = true;
+  const int LargeNumber = 10000000;
+  bg_compaction_scheduled_ += LargeNumber;
+
+  mutex_.Unlock();
+  LogFlush(options_.info_log);
+
+  // force release the lock file.
+  if (db_lock_ != nullptr) {
+    env_->UnlockFile(db_lock_);
+  }
+
+  log_.reset();
+  versions_.reset();
+  table_cache_.reset();
+}
+
+// Test hook: expose the file number of the current MANIFEST.
+uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
+  return versions_->ManifestFileNumber();
+}
+
+// Create a brand-new database: write an initial MANIFEST (descriptor file
+// #1) containing an empty VersionEdit, then point CURRENT at it. On
+// failure the partially-written manifest is removed.
+Status DBImpl::NewDB() {
+  VersionEdit new_db;
+  new_db.SetComparatorName(user_comparator()->Name());
+  new_db.SetLogNumber(0);
+  new_db.SetNextFile(2);  // file #1 is the manifest created below
+  new_db.SetLastSequence(0);
+
+  const std::string manifest = DescriptorFileName(dbname_, 1);
+  unique_ptr<WritableFile> file;
+  Status s = env_->NewWritableFile(manifest, &file, storage_options_);
+  if (!s.ok()) {
+    return s;
+  }
+  file->SetPreallocationBlockSize(options_.manifest_preallocation_size);
+  {
+    // Scoped so the log::Writer (which owns the file) is destroyed --
+    // and the file closed -- before CURRENT is updated.
+    log::Writer log(std::move(file));
+    std::string record;
+    new_db.EncodeTo(&record);
+    s = log.AddRecord(record);
+  }
+  if (s.ok()) {
+    // Make "CURRENT" file that points to the new manifest file.
+    s = SetCurrentFile(env_, dbname_, 1);
+  } else {
+    env_->DeleteFile(manifest);
+  }
+  return s;
+}
+
+// Downgrade a failure Status to OK (after logging it) unless paranoid
+// checks are enabled. Used during recovery to tolerate bad records.
+void DBImpl::MaybeIgnoreError(Status* s) const {
+  if (s->ok() || options_.paranoid_checks) {
+    // Either there is no error, or paranoid mode requires propagating it.
+    return;
+  }
+  Log(options_.info_log, "Ignoring error %s", s->ToString().c_str());
+  *s = Status::OK();
+}
+
+// Create the WAL archive directory when archiving is enabled via either a
+// TTL or a size limit; otherwise this is a no-op returning OK.
+const Status DBImpl::CreateArchivalDirectory() {
+  if (options_.WAL_ttl_seconds == 0 && options_.WAL_size_limit_MB == 0) {
+    return Status::OK();
+  }
+  return env_->CreateDirIfMissing(ArchivalDirectory(options_.wal_dir));
+}
+
+// Dump the accumulated Statistics object (if one was configured) to the
+// info log.
+void DBImpl::PrintStatistics() {
+  auto dbstats = options_.statistics.get();
+  if (dbstats) {
+    // Fixed typo in the log header: "STATISTCS" -> "STATISTICS".
+    Log(options_.info_log,
+        "STATISTICS:\n %s",
+        dbstats->ToString().c_str());
+  }
+}
+
+// Every stats_dump_period_sec seconds, dump the "rocksdb.stats" property
+// and the Statistics object to the info log. A period of 0 disables
+// dumping entirely.
+void DBImpl::MaybeDumpStats() {
+  if (options_.stats_dump_period_sec == 0) return;
+
+  const uint64_t now_micros = env_->NowMicros();
+
+  if (last_stats_dump_time_microsec_ +
+      options_.stats_dump_period_sec * 1000000
+      <= now_micros) {
+    // Multiple threads could race in here simultaneously.
+    // However, the last one will update last_stats_dump_time_microsec_
+    // atomically. We could see more than one dump during one dump
+    // period in rare cases.
+    last_stats_dump_time_microsec_ = now_micros;
+    std::string stats;
+    GetProperty("rocksdb.stats", &stats);
+    Log(options_.info_log, "%s", stats.c_str());
+    PrintStatistics();
+  }
+}
+
+// DBImpl::SuperVersion methods
+// A SuperVersion bundles a consistent snapshot of {active memtable,
+// immutable memtable list, current Version} under one reference count.
+DBImpl::SuperVersion::SuperVersion(const int num_memtables) {
+  // NOTE(review): resize() (not reserve()) fills to_delete with
+  // num_memtables null pointers; the destructor deletes each entry and
+  // deleting nullptr is a no-op, so this is harmless -- but reserve() may
+  // have been the intent. Confirm.
+  to_delete.resize(num_memtables);
+}
+
+// Free the memtables that Cleanup() queued for deletion. Entries may be
+// null (see the constructor); deleting nullptr is a no-op.
+DBImpl::SuperVersion::~SuperVersion() {
+  for (MemTable* dead_mem : to_delete) {
+    delete dead_mem;
+  }
+}
+
+// Take one more reference and return this, for call chaining.
+// NOTE(review): relaxed ordering assumes the refcount is only acted upon
+// under external synchronization (the DB mutex) -- confirm at call sites.
+DBImpl::SuperVersion* DBImpl::SuperVersion::Ref() {
+  refs.fetch_add(1, std::memory_order_relaxed);
+  return this;
+}
+
+// Drop one reference. Returns true iff this call released the last
+// reference; the caller is then responsible for Cleanup() and delete.
+bool DBImpl::SuperVersion::Unref() {
+  assert(refs > 0);
+  // fetch_sub returns the previous value of ref
+  return refs.fetch_sub(1, std::memory_order_relaxed) == 1;
+}
+
+// Release the references Init() took. Must only be called once the
+// refcount has reached zero. Memtables whose own refcount drops to zero
+// are queued on to_delete and freed later by the destructor.
+void DBImpl::SuperVersion::Cleanup() {
+  assert(refs.load(std::memory_order_relaxed) == 0);
+  imm.UnrefAll(&to_delete);
+  // Unref() returns the memtable itself when its refcount hits zero.
+  MemTable* m = mem->Unref();
+  if (m != nullptr) {
+    to_delete.push_back(m);
+  }
+  current->Unref();
+}
+
+// Capture a consistent snapshot: store the active memtable, the immutable
+// memtable list (copied by value), and the current Version, take a
+// reference on each, and reset this SuperVersion's refcount to 1.
+void DBImpl::SuperVersion::Init(MemTable* new_mem, const MemTableList& new_imm,
+                                Version* new_current) {
+  mem = new_mem;
+  imm = new_imm;
+  current = new_current;
+  mem->Ref();
+  imm.RefAll();
+  current->Ref();
+  refs.store(1, std::memory_order_relaxed);
+}
+
+// Returns the list of live files in 'sst_live' and the list
+// of all files in the filesystem in 'all_files'.
+// no_full_scan = true -- never do the full scan using GetChildren()
+// force = false -- don't force the full scan, except every
+//  options_.delete_obsolete_files_period_micros
+// force = true -- force the full scan
+// REQUIRES: mutex_ held.
+void DBImpl::FindObsoleteFiles(DeletionState& deletion_state,
+                               bool force,
+                               bool no_full_scan) {
+  mutex_.AssertHeld();
+
+  // if deletion is disabled, do nothing
+  if (disable_delete_obsolete_files_ > 0) {
+    return;
+  }
+
+  bool doing_the_full_scan = false;
+
+  // logic for figuring out if we're doing the full scan
+  if (no_full_scan) {
+    doing_the_full_scan = false;
+  } else if (force || options_.delete_obsolete_files_period_micros == 0) {
+    doing_the_full_scan = true;
+  } else {
+    // Rate-limit full scans to once per delete_obsolete_files_period_micros.
+    const uint64_t now_micros = env_->NowMicros();
+    if (delete_obsolete_files_last_run_ +
+        options_.delete_obsolete_files_period_micros < now_micros) {
+      doing_the_full_scan = true;
+      delete_obsolete_files_last_run_ = now_micros;
+    }
+  }
+
+  // get obsolete files
+  versions_->GetObsoleteFiles(&deletion_state.sst_delete_files);
+
+  // store the current filenum, lognum, etc
+  deletion_state.manifest_file_number = versions_->ManifestFileNumber();
+  deletion_state.log_number = versions_->LogNumber();
+  deletion_state.prev_log_number = versions_->PrevLogNumber();
+
+  if (!doing_the_full_scan && !deletion_state.HaveSomethingToDelete()) {
+    // avoid filling up sst_live if we're sure that we
+    // are not going to do the full scan and that we don't have
+    // anything to delete at the moment
+    return;
+  }
+
+  // don't delete live files: files currently being written
+  // (pending_outputs_) plus every file referenced by some version.
+  deletion_state.sst_live.assign(pending_outputs_.begin(),
+                                 pending_outputs_.end());
+  versions_->AddLiveFiles(&deletion_state.sst_live);
+
+  if (doing_the_full_scan) {
+    // set of all files in the directory
+    env_->GetChildren(dbname_, &deletion_state.all_files); // Ignore errors
+
+    // Add log files in wal_dir (only when it differs from the db dir;
+    // otherwise they were already listed above)
+    if (options_.wal_dir != dbname_) {
+      std::vector<std::string> log_files;
+      env_->GetChildren(options_.wal_dir, &log_files); // Ignore errors
+      deletion_state.all_files.insert(
+        deletion_state.all_files.end(),
+        log_files.begin(),
+        log_files.end()
+      );
+    }
+  }
+}
+
+// Diffs the files listed in filenames and those that do not
+// belong to live files are possibly removed. Also, removes all the
+// files in sst_delete_files and log_delete_files.
+// It is not necessary to hold the mutex when invoking this method.
+void DBImpl::PurgeObsoleteFiles(DeletionState& state) {
+
+  // check if there is anything to do
+  if (!state.all_files.size() &&
+      !state.sst_delete_files.size() &&
+      !state.log_delete_files.size()) {
+    return;
+  }
+
+  // this checks if FindObsoleteFiles() was run before. If not, don't do
+  // PurgeObsoleteFiles(). If FindObsoleteFiles() was run, we need to also
+  // run PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
+  if (state.manifest_file_number == 0) {
+    return;
+  }
+
+  uint64_t number;
+  FileType type;
+  std::vector<std::string> old_log_files;
+
+  // Now, convert live list to an unordered set, WITHOUT mutex held;
+  // set is slow.
+  std::unordered_set<uint64_t> live_set(state.sst_live.begin(),
+                                        state.sst_live.end());
+
+  // Append the explicitly-obsolete SSTs to all_files so the single loop
+  // below handles them too. TableFileName("", n).substr(1) yields the bare
+  // relative file name (drops the leading '/').
+  state.all_files.reserve(state.all_files.size() +
+      state.sst_delete_files.size());
+  for (auto file : state.sst_delete_files) {
+    state.all_files.push_back(TableFileName("", file->number).substr(1));
+    delete file;
+  }
+
+  state.all_files.reserve(state.all_files.size() +
+      state.log_delete_files.size());
+  for (auto filenum : state.log_delete_files) {
+    if (filenum > 0) {
+      state.all_files.push_back(LogFileName("", filenum).substr(1));
+    }
+  }
+
+  // dedup state.all_files so we don't try to delete the same
+  // file twice
+  sort(state.all_files.begin(), state.all_files.end());
+  auto unique_end = unique(state.all_files.begin(), state.all_files.end());
+
+  for (size_t i = 0; state.all_files.begin() + i < unique_end; i++) {
+    if (ParseFileName(state.all_files[i], &number, &type)) {
+      bool keep = true;
+      switch (type) {
+        case kLogFile:
+          keep = ((number >= state.log_number) ||
+                  (number == state.prev_log_number));
+          break;
+        case kDescriptorFile:
+          // Keep my manifest file, and any newer incarnations'
+          // (in case there is a race that allows other incarnations)
+          keep = (number >= state.manifest_file_number);
+          break;
+        case kTableFile:
+          keep = (live_set.find(number) != live_set.end());
+          break;
+        case kTempFile:
+          // Any temp files that are currently being written to must
+          // be recorded in pending_outputs_, which is inserted into "live"
+          keep = (live_set.find(number) != live_set.end());
+          break;
+        case kInfoLogFile:
+          keep = true;
+          if (number != 0) {
+            // Collected for the age-based info-log purge below.
+            old_log_files.push_back(state.all_files[i]);
+          }
+          break;
+        case kCurrentFile:
+        case kDBLockFile:
+        case kIdentityFile:
+        case kMetaDatabase:
+          keep = true;
+          break;
+      }
+
+      if (!keep) {
+        if (type == kTableFile) {
+          // evict from cache
+          table_cache_->Evict(number);
+        }
+        // WAL files live in wal_dir; everything else is in the db dir.
+        std::string fname = ((type == kLogFile) ? options_.wal_dir : dbname_) +
+            "/" + state.all_files[i];
+        Log(options_.info_log,
+            "Delete type=%d #%lu",
+            int(type),
+            (unsigned long)number);
+
+        Status st;
+        if (type == kLogFile && (options_.WAL_ttl_seconds > 0 ||
+              options_.WAL_size_limit_MB > 0)) {
+            // WAL archiving is enabled: move the log into the archive
+            // directory instead of deleting it outright.
+            st = env_->RenameFile(fname,
+                ArchivedLogFileName(options_.wal_dir, number));
+            if (!st.ok()) {
+              Log(options_.info_log,
+                  "RenameFile logfile #%lu FAILED -- %s\n",
+                  (unsigned long)number, st.ToString().c_str());
+            }
+        } else {
+          st = env_->DeleteFile(fname);
+          if (!st.ok()) {
+            Log(options_.info_log, "Delete type=%d #%lu FAILED -- %s\n",
+                int(type), (unsigned long)number, st.ToString().c_str());
+          }
+        }
+      }
+    }
+  }
+
+  // Delete old info log files.
+  size_t old_log_file_count = old_log_files.size();
+  // NOTE: Currently we only support log purge when options_.db_log_dir is
+  // located in `dbname` directory.
+  if (old_log_file_count >= options_.keep_log_file_num &&
+      options_.db_log_dir.empty()) {
+    std::sort(old_log_files.begin(), old_log_files.end());
+    size_t end = old_log_file_count - options_.keep_log_file_num;
+    // NOTE(review): the inclusive bound deletes end + 1 files, which
+    // leaves keep_log_file_num - 1 info logs behind -- confirm whether
+    // "i < end" was intended.
+    for (unsigned int i = 0; i <= end; i++) {
+      std::string& to_delete = old_log_files.at(i);
+      // Log(options_.info_log, "Delete type=%d %s\n",
+      //     int(kInfoLogFile), to_delete.c_str());
+      env_->DeleteFile(dbname_ + "/" + to_delete);
+    }
+  }
+  PurgeObsoleteWALFiles();
+  LogFlush(options_.info_log);
+}
+
+// Find and delete all obsolete files, forcing a full directory scan.
+// REQUIRES: mutex_ held (FindObsoleteFiles asserts it; the purge then
+// runs with the mutex still held here).
+void DBImpl::DeleteObsoleteFiles() {
+  mutex_.AssertHeld();
+  DeletionState deletion_state;
+  FindObsoleteFiles(deletion_state, true);
+  PurgeObsoleteFiles(deletion_state);
+}
+
+// 1. Go through all archived files and
+//    a. if ttl is enabled, delete outdated files
+//    b. if archive size limit is enabled, delete empty files,
+//        compute file number and size.
+// 2. If size limit is enabled:
+//    a. compute how many files should be deleted
+//    b. get sorted non-empty archived logs
+//    c. delete what should be deleted
+void DBImpl::PurgeObsoleteWALFiles() {
+  bool const ttl_enabled = options_.WAL_ttl_seconds > 0;
+  bool const size_limit_enabled =  options_.WAL_size_limit_MB > 0;
+  if (!ttl_enabled && !size_limit_enabled) {
+    return;
+  }
+
+  int64_t current_time;
+  Status s = env_->GetCurrentTime(&current_time);
+  if (!s.ok()) {
+    Log(options_.info_log, "Can't get current time: %s", s.ToString().c_str());
+    assert(false);
+    return;
+  }
+  uint64_t const now_seconds = static_cast<uint64_t>(current_time);
+  // Throttle: with only TTL configured, run every WAL_ttl_seconds / 2;
+  // otherwise every default_interval_to_delete_obsolete_WAL_ seconds.
+  uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) ?
+    options_.WAL_ttl_seconds / 2 : default_interval_to_delete_obsolete_WAL_;
+
+  if (purge_wal_files_last_run_ + time_to_check > now_seconds) {
+    return;
+  }
+
+  purge_wal_files_last_run_ = now_seconds;
+
+  std::string archival_dir = ArchivalDirectory(options_.wal_dir);
+  std::vector<std::string> files;
+  s = env_->GetChildren(archival_dir, &files);
+  if (!s.ok()) {
+    Log(options_.info_log, "Can't get archive files: %s", s.ToString().c_str());
+    assert(false);
+    return;
+  }
+
+  size_t log_files_num = 0;
+  uint64_t log_file_size = 0;
+
+  // Pass 1: delete TTL-expired logs; count non-empty logs and track the
+  // largest file size for the size-based pass below.
+  for (auto& f : files) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(f, &number, &type) && type == kLogFile) {
+      std::string const file_path = archival_dir + "/" + f;
+      if (ttl_enabled) {
+        uint64_t file_m_time;
+        Status const s = env_->GetFileModificationTime(file_path,
+          &file_m_time);
+        if (!s.ok()) {
+          Log(options_.info_log, "Can't get file mod time: %s: %s",
+              file_path.c_str(), s.ToString().c_str());
+          continue;
+        }
+        if (now_seconds - file_m_time > options_.WAL_ttl_seconds) {
+          // Past its TTL: delete and move on to the next file.
+          Status const s = env_->DeleteFile(file_path);
+          if (!s.ok()) {
+            Log(options_.info_log, "Can't delete file: %s: %s",
+                file_path.c_str(), s.ToString().c_str());
+            continue;
+          }
+          continue;
+        }
+      }
+
+      if (size_limit_enabled) {
+        uint64_t file_size;
+        Status const s = env_->GetFileSize(file_path, &file_size);
+        if (!s.ok()) {
+          Log(options_.info_log, "Can't get file size: %s: %s",
+              file_path.c_str(), s.ToString().c_str());
+          return;
+        } else {
+          if (file_size > 0) {
+            log_file_size = std::max(log_file_size, file_size);
+            ++log_files_num;
+          } else {
+            // Empty archived logs carry no data; delete them eagerly.
+            Status s = env_->DeleteFile(file_path);
+            if (!s.ok()) {
+              Log(options_.info_log, "Can't delete file: %s: %s",
+                  file_path.c_str(), s.ToString().c_str());
+              continue;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  if (0 == log_files_num || !size_limit_enabled) {
+    return;
+  }
+
+  // log_file_size holds the LARGEST archived log, so files_keep_num is a
+  // conservative (lower-bound) estimate of how many files fit the limit.
+  size_t const files_keep_num = options_.WAL_size_limit_MB *
+    1024 * 1024 / log_file_size;
+  if (log_files_num <= files_keep_num) {
+    return;
+  }
+
+  size_t files_del_num = log_files_num - files_keep_num;
+  VectorLogPtr archived_logs;
+  // Presumably returns logs sorted oldest-first so we delete the oldest
+  // ones -- confirm AppendSortedWalsOfType's ordering.
+  AppendSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile);
+
+  if (files_del_num > archived_logs.size()) {
+    Log(options_.info_log, "Trying to delete more archived log files than "
+        "exist. Deleting all");
+    files_del_num = archived_logs.size();
+  }
+
+  for (size_t i = 0; i < files_del_num; ++i) {
+    std::string const file_path = archived_logs[i]->PathName();
+    Status const s = DeleteFile(file_path);
+    if (!s.ok()) {
+      Log(options_.info_log, "Can't delete file: %s: %s",
+          file_path.c_str(), s.ToString().c_str());
+      continue;
+    }
+  }
+}
+
+// If externalTable is set, then apply recovered transactions
+// to that table. This is used for readonly mode.
+Status DBImpl::Recover(VersionEdit* edit, MemTable* external_table,
+                       bool error_if_log_file_exist) {
+  mutex_.AssertHeld();
+
+  assert(db_lock_ == nullptr);
+  if (!external_table) {
+    // We call CreateDirIfMissing() as the directory may already exist (if we
+    // are reopening a DB), when this happens we don't want creating the
+    // directory to cause an error. However, we need to check if creating the
+    // directory fails or else we may get an obscure message about the lock
+    // file not existing. One real-world example of this occurring is if
+    // env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
+    // when dbname_ is "dir/db" but when "dir" doesn't exist.
+    Status s = env_->CreateDirIfMissing(dbname_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (!env_->FileExists(CurrentFileName(dbname_))) {
+      if (options_.create_if_missing) {
+        // TODO: add merge_operator name check
+        s = NewDB();
+        if (!s.ok()) {
+          return s;
+        }
+      } else {
+        return Status::InvalidArgument(
+            dbname_, "does not exist (create_if_missing is false)");
+      }
+    } else {
+      if (options_.error_if_exists) {
+        return Status::InvalidArgument(
+            dbname_, "exists (error_if_exists is true)");
+      }
+    }
+    // Check for the IDENTITY file and create it if not there
+    if (!env_->FileExists(IdentityFileName(dbname_))) {
+      s = SetIdentityFile(env_, dbname_);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  Status s = versions_->Recover();
+  if (s.ok()) {
+    SequenceNumber max_sequence(0);
+
+    // Recover from all newer log files than the ones named in the
+    // descriptor (new log files may have been added by the previous
+    // incarnation without registering them in the descriptor).
+    //
+    // Note that PrevLogNumber() is no longer used, but we pay
+    // attention to it in case we are recovering a database
+    // produced by an older version of rocksdb.
+    const uint64_t min_log = versions_->LogNumber();
+    const uint64_t prev_log = versions_->PrevLogNumber();
+    std::vector<std::string> filenames;
+    s = env_->GetChildren(options_.wal_dir, &filenames);
+    if (!s.ok()) {
+      return s;
+    }
+    uint64_t number;
+    FileType type;
+    std::vector<uint64_t> logs;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type)
+          && type == kLogFile
+          && ((number >= min_log) || (number == prev_log))) {
+        logs.push_back(number);
+      }
+    }
+
+    if (logs.size() > 0 && error_if_log_file_exist) {
+      return Status::Corruption(""
+          "The db was opened in readonly mode with error_if_log_file_exist"
+          "flag but a log file already exists");
+    }
+
+    // Recover in the order in which the logs were generated
+    std::sort(logs.begin(), logs.end());
+    for (size_t i = 0; i < logs.size(); i++) {
+      s = RecoverLogFile(logs[i], edit, &max_sequence, external_table);
+      // The previous incarnation may not have written any MANIFEST
+      // records after allocating this log number.  So we manually
+      // update the file number allocation counter in VersionSet.
+      versions_->MarkFileNumberUsed(logs[i]);
+    }
+
+    if (s.ok()) {
+      if (versions_->LastSequence() < max_sequence) {
+        versions_->SetLastSequence(max_sequence);
+      }
+      SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER,
+                     versions_->LastSequence());
+    }
+  }
+
+  return s;
+}
+
+// Replay one WAL file into a memtable, tracking the largest sequence
+// number seen in *max_sequence. When external_table is null, memtables
+// that exceed write_buffer_size are flushed to level-0 files recorded in
+// *edit; when external_table is set (readonly mode), all updates go into
+// that table and nothing is flushed.
+// REQUIRES: mutex_ held.
+Status DBImpl::RecoverLogFile(uint64_t log_number,
+                              VersionEdit* edit,
+                              SequenceNumber* max_sequence,
+                              MemTable* external_table) {
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    const char* fname;
+    Status* status;  // nullptr if options_.paranoid_checks==false or
+                     //            options_.skip_log_error_on_recovery==true
+    // Records the first corruption into *status (when set) and logs it.
+    virtual void Corruption(size_t bytes, const Status& s) {
+      Log(info_log, "%s%s: dropping %d bytes; %s",
+          (this->status == nullptr ? "(ignoring error) " : ""),
+          fname, static_cast<int>(bytes), s.ToString().c_str());
+      if (this->status != nullptr && this->status->ok()) *this->status = s;
+    }
+  };
+
+  mutex_.AssertHeld();
+
+  // Open the log file
+  std::string fname = LogFileName(options_.wal_dir, log_number);
+  unique_ptr<SequentialFile> file;
+  Status status = env_->NewSequentialFile(fname, &file, storage_options_);
+  if (!status.ok()) {
+    MaybeIgnoreError(&status);
+    return status;
+  }
+
+  // Create the log reader.
+  LogReporter reporter;
+  reporter.env = env_;
+  reporter.info_log = options_.info_log.get();
+  reporter.fname = fname.c_str();
+  reporter.status = (options_.paranoid_checks &&
+                     !options_.skip_log_error_on_recovery ? &status : nullptr);
+  // We intentionally make log::Reader do checksumming even if
+  // paranoid_checks==false so that corruptions cause entire commits
+  // to be skipped instead of propagating bad information (like overly
+  // large sequence numbers).
+  log::Reader reader(std::move(file), &reporter, true/*checksum*/,
+                     0/*initial_offset*/);
+  Log(options_.info_log, "Recovering log #%lu",
+      (unsigned long) log_number);
+
+  // Read all the records and add to a memtable
+  std::string scratch;
+  Slice record;
+  WriteBatch batch;
+  MemTable* mem = nullptr;
+  if (external_table) {
+    mem = external_table;
+  }
+  while (reader.ReadRecord(&record, &scratch) && status.ok()) {
+    // 12 is presumably the WriteBatch header size (8-byte sequence +
+    // 4-byte count) -- confirm against WriteBatchInternal.
+    if (record.size() < 12) {
+      reporter.Corruption(
+          record.size(), Status::Corruption("log record too small"));
+      continue;
+    }
+    WriteBatchInternal::SetContents(&batch, record);
+
+    // Lazily create the recovery memtable on the first valid record.
+    if (mem == nullptr) {
+      mem = new MemTable(internal_comparator_, options_);
+      mem->Ref();
+    }
+    status = WriteBatchInternal::InsertInto(&batch, mem, &options_);
+    MaybeIgnoreError(&status);
+    if (!status.ok()) {
+      break;
+    }
+    // Advance the recovered sequence horizon to the batch's last entry.
+    const SequenceNumber last_seq =
+        WriteBatchInternal::Sequence(&batch) +
+        WriteBatchInternal::Count(&batch) - 1;
+    if (last_seq > *max_sequence) {
+      *max_sequence = last_seq;
+    }
+
+    if (!external_table &&
+        mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
+      status = WriteLevel0TableForRecovery(mem, edit);
+      if (!status.ok()) {
+        // Reflect errors immediately so that conditions like full
+        // file-systems cause the DB::Open() to fail.
+        break;
+      }
+      delete mem->Unref();
+      mem = nullptr;
+    }
+  }
+
+  // Flush whatever is left in the recovery memtable.
+  if (status.ok() && mem != nullptr && !external_table) {
+    status = WriteLevel0TableForRecovery(mem, edit);
+    // Reflect errors immediately so that conditions like full
+    // file-systems cause the DB::Open() to fail.
+  }
+
+  if (mem != nullptr && !external_table) {
+    delete mem->Unref();
+  }
+  return status;
+}
+
+// Dump a recovery memtable to a new level-0 SST. The file number is held
+// in pending_outputs_ for the duration of the build so concurrent cleanup
+// treats it as live. The mutex is released while BuildTable runs.
+// REQUIRES: mutex_ held on entry and exit.
+Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) {
+  mutex_.AssertHeld();
+  const uint64_t start_micros = env_->NowMicros();
+  FileMetaData meta;
+  meta.number = versions_->NewFileNumber();
+  pending_outputs_.insert(meta.number);
+  Iterator* iter = mem->NewIterator();
+  const SequenceNumber newest_snapshot = snapshots_.GetNewest();
+  const SequenceNumber earliest_seqno_in_memtable =
+    mem->GetFirstSequenceNumber();
+  Log(options_.info_log, "Level-0 table #%lu: started",
+      (unsigned long) meta.number);
+
+  Status s;
+  {
+    // Drop the mutex for the (slow) table build; re-acquire afterwards.
+    mutex_.Unlock();
+    s = BuildTable(dbname_, env_, options_, storage_options_,
+                   table_cache_.get(), iter, &meta,
+                   user_comparator(), newest_snapshot,
+                   earliest_seqno_in_memtable,
+                   GetCompressionFlush(options_));
+    LogFlush(options_.info_log);
+    mutex_.Lock();
+  }
+
+  Log(options_.info_log, "Level-0 table #%lu: %lu bytes %s",
+      (unsigned long) meta.number,
+      (unsigned long) meta.file_size,
+      s.ToString().c_str());
+  delete iter;
+
+  pending_outputs_.erase(meta.number);
+
+  // Note that if file_size is zero, the file has been deleted and
+  // should not be added to the manifest.
+  int level = 0;
+  if (s.ok() && meta.file_size > 0) {
+    edit->AddFile(level, meta.number, meta.file_size,
+                  meta.smallest, meta.largest,
+                  meta.smallest_seqno, meta.largest_seqno);
+  }
+
+  // Account the flush as a level-0 compaction in the stats.
+  CompactionStats stats;
+  stats.micros = env_->NowMicros() - start_micros;
+  stats.bytes_written = meta.file_size;
+  stats.files_out_levelnp1 = 1;
+  stats_[level].Add(stats);
+  RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size);
+  return s;
+}
+
+
+// Flush a batch of immutable memtables to a single new SST via a merging
+// iterator. Returns the new file's number in *filenumber. Unlike the
+// recovery variant, the output may be placed above level 0 when it is
+// safe to do so. The mutex is released while BuildTable runs.
+// REQUIRES: mutex_ held on entry and exit; mems is non-empty.
+Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
+                                uint64_t* filenumber) {
+  mutex_.AssertHeld();
+  const uint64_t start_micros = env_->NowMicros();
+  FileMetaData meta;
+  meta.number = versions_->NewFileNumber();
+  *filenumber = meta.number;
+  pending_outputs_.insert(meta.number);
+
+  const SequenceNumber newest_snapshot = snapshots_.GetNewest();
+  const SequenceNumber earliest_seqno_in_memtable =
+    mems[0]->GetFirstSequenceNumber();
+  Version* base = versions_->current();
+  base->Ref();          // it is likely that we do not need this reference
+  Status s;
+  {
+    // Drop the mutex for the (slow) table build; re-acquire afterwards.
+    mutex_.Unlock();
+    std::vector<Iterator*> list;
+    for (MemTable* m : mems) {
+      Log(options_.info_log,
+          "Flushing memtable with log file: %lu\n",
+          (unsigned long)m->GetLogNumber());
+      list.push_back(m->NewIterator());
+    }
+    Iterator* iter = NewMergingIterator(&internal_comparator_, &list[0],
+                                        list.size());
+    Log(options_.info_log,
+        "Level-0 flush table #%lu: started",
+        (unsigned long)meta.number);
+
+    s = BuildTable(dbname_, env_, options_, storage_options_,
+                   table_cache_.get(), iter, &meta,
+                   user_comparator(), newest_snapshot,
+                   earliest_seqno_in_memtable, GetCompressionFlush(options_));
+    LogFlush(options_.info_log);
+    delete iter;
+    Log(options_.info_log, "Level-0 flush table #%lu: %lu bytes %s",
+        (unsigned long) meta.number,
+        (unsigned long) meta.file_size,
+        s.ToString().c_str());
+    mutex_.Lock();
+  }
+  base->Unref();
+
+
+  // re-acquire the most current version
+  base = versions_->current();
+
+  // There could be multiple threads writing to its own level-0 file.
+  // The pending_outputs cannot be cleared here, otherwise this newly
+  // created file might not be considered as a live-file by another
+  // compaction thread that is concurrently deleting obsolete files.
+  // The pending_outputs can be cleared only after the new version is
+  // committed so that other threads can recognize this file as a
+  // valid one.
+  // pending_outputs_.erase(meta.number);
+
+  // Note that if file_size is zero, the file has been deleted and
+  // should not be added to the manifest.
+  int level = 0;
+  if (s.ok() && meta.file_size > 0) {
+    const Slice min_user_key = meta.smallest.user_key();
+    const Slice max_user_key = meta.largest.user_key();
+    // if we have more than 1 background thread, then we cannot
+    // insert files directly into higher levels because some other
+    // threads could be concurrently producing compacted files for
+    // that key range.
+    if (base != nullptr && options_.max_background_compactions <= 1 &&
+        options_.compaction_style == kCompactionStyleLevel) {
+      level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
+    }
+    edit->AddFile(level, meta.number, meta.file_size,
+                  meta.smallest, meta.largest,
+                  meta.smallest_seqno, meta.largest_seqno);
+  }
+
+  // Account the flush against the level the file landed on.
+  CompactionStats stats;
+  stats.micros = env_->NowMicros() - start_micros;
+  stats.bytes_written = meta.file_size;
+  stats_[level].Add(stats);
+  RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size);
+  return s;
+}
+
// Flushes the earliest immutable memtable(s) into a level-0 table file.
//
// REQUIRES: mutex_ held on entry; WriteLevel0Table releases and re-acquires
// it around the actual file write.
// On success: installs a fresh SuperVersion, sets *madeProgress, and queues
// the flushed memtables' WAL numbers for deletion via deletion_state.
Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
                                         DeletionState& deletion_state) {
  mutex_.AssertHeld();
  assert(imm_.size() != 0);

  // Another thread may already be flushing; IsFlushPending() is the gate.
  if (!imm_.IsFlushPending(options_.min_write_buffer_number_to_merge)) {
    Log(options_.info_log, "FlushMemTableToOutputFile already in progress");
    Status s = Status::IOError("FlushMemTableToOutputFile already in progress");
    return s;
  }

  // Save the contents of the earliest memtable as a new Table
  uint64_t file_number;
  std::vector<MemTable*> mems;
  imm_.PickMemtablesToFlush(&mems);
  if (mems.empty()) {
    Log(options_.info_log, "Nothing in memstore to flush");
    Status s = Status::IOError("Nothing in memstore to flush");
    return s;
  }

  // record the logfile_number_ before we release the mutex
  // entries mems are (implicitly) sorted in ascending order by their created
  // time. We will use the first memtable's `edit` to keep the meta info for
  // this flush.
  MemTable* m = mems[0];
  VersionEdit* edit = m->GetEdits();
  edit->SetPrevLogNumber(0);
  // SetLogNumber(log_num) indicates logs with number smaller than log_num
  // will no longer be picked up for recovery.
  edit->SetLogNumber(
      mems.back()->GetNextLogNumber()
  );

  // Remember each flushed memtable's WAL number so the logs can be deleted
  // once the flush result is durably installed (see bottom of function).
  std::vector<uint64_t> logs_to_delete;
  for (auto mem : mems) {
    logs_to_delete.push_back(mem->GetLogNumber());
  }

  // This will release and re-acquire the mutex.
  Status s = WriteLevel0Table(mems, edit, &file_number);

  if (s.ok() && shutting_down_.Acquire_Load()) {
    s = Status::IOError(
      "Database shutdown started during memtable compaction"
    );
  }

  // Replace immutable memtable with the generated Table
  s = imm_.InstallMemtableFlushResults(
    mems, versions_.get(), s, &mutex_, options_.info_log.get(),
    file_number, pending_outputs_, &deletion_state.memtables_to_free);

  if (s.ok()) {
    InstallSuperVersion(deletion_state);
    if (madeProgress) {
      *madeProgress = 1;
    }

    MaybeScheduleLogDBDeployStats();

    // If file deletion is currently disabled, the WALs must not be queued;
    // they will be picked up by a later FindObsoleteFiles() pass.
    if (disable_delete_obsolete_files_ == 0) {
      // add to deletion state
      deletion_state.log_delete_files.insert(
          deletion_state.log_delete_files.end(),
          logs_to_delete.begin(),
          logs_to_delete.end());
    }
  }
  return s;
}
+
+void DBImpl::CompactRange(const Slice* begin,
+                          const Slice* end,
+                          bool reduce_level,
+                          int target_level) {
+  FlushMemTable(FlushOptions());
+  int max_level_with_files = 1;
+  {
+    MutexLock l(&mutex_);
+    Version* base = versions_->current();
+    for (int level = 1; level < NumberLevels(); level++) {
+      if (base->OverlapInLevel(level, begin, end)) {
+        max_level_with_files = level;
+      }
+    }
+  }
+  for (int level = 0; level <= max_level_with_files; level++) {
+    // in case the compaction is unversal or if we're compacting the
+    // bottom-most level, the output level will be the same as input one
+    if (options_.compaction_style == kCompactionStyleUniversal ||
+        level == max_level_with_files) {
+      RunManualCompaction(level, level, begin, end);
+    } else {
+      RunManualCompaction(level, level + 1, begin, end);
+    }
+  }
+
+  if (reduce_level) {
+    ReFitLevel(max_level_with_files, target_level);
+  }
+  LogFlush(options_.info_log);
+}
+
+// return the same level if it cannot be moved
+int DBImpl::FindMinimumEmptyLevelFitting(int level) {
+  mutex_.AssertHeld();
+  Version* current = versions_->current();
+  int minimum_level = level;
+  for (int i = level - 1; i > 0; --i) {
+    // stop if level i is not empty
+    if (current->NumLevelFiles(i) > 0) break;
+    // stop if level i is too small (cannot fit the level files)
+    if (versions_->MaxBytesForLevel(i) < current->NumLevelBytes(level)) break;
+
+    minimum_level = i;
+  }
+  return minimum_level;
+}
+
// Moves every file of `level` down to `target_level` (or, when negative,
// to the level chosen by FindMinimumEmptyLevelFitting). Only metadata is
// rewritten, no data. Exclusive: only one thread may refit at a time, and
// all background work is drained first so no concurrent compaction can
// touch the files being moved.
void DBImpl::ReFitLevel(int level, int target_level) {
  assert(level < NumberLevels());

  // Allocated before taking the mutex; deleted below if unused.
  SuperVersion* superversion_to_free = nullptr;
  SuperVersion* new_superversion =
      new SuperVersion(options_.max_write_buffer_number);

  mutex_.Lock();

  // only allow one thread refitting
  if (refitting_level_) {
    mutex_.Unlock();
    Log(options_.info_log, "ReFitLevel: another thread is refitting");
    delete new_superversion;
    return;
  }
  refitting_level_ = true;

  // wait for all background threads to stop
  bg_work_gate_closed_ = true;
  while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_) {
    Log(options_.info_log,
        "RefitLevel: waiting for background threads to stop: %d %d",
        bg_compaction_scheduled_, bg_flush_scheduled_);
    bg_cv_.Wait();
  }

  // move to a smaller level
  int to_level = target_level;
  if (target_level < 0) {
    to_level = FindMinimumEmptyLevelFitting(level);
  }

  assert(to_level <= level);

  if (to_level < level) {
    Log(options_.info_log, "Before refitting:\n%s",
        versions_->current()->DebugString().data());

    // Re-register each file of `level` under `to_level` in a single edit.
    VersionEdit edit;
    for (const auto& f : versions_->current()->files_[level]) {
      edit.DeleteFile(level, f->number);
      edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest,
                   f->smallest_seqno, f->largest_seqno);
    }
    Log(options_.info_log, "Apply version edit:\n%s",
        edit.DebugString().data());

    auto status = versions_->LogAndApply(&edit, &mutex_);
    superversion_to_free = InstallSuperVersion(new_superversion);
    new_superversion = nullptr;  // ownership passed to InstallSuperVersion

    Log(options_.info_log, "LogAndApply: %s\n", status.ToString().data());

    if (status.ok()) {
      Log(options_.info_log, "After refitting:\n%s",
          versions_->current()->DebugString().data());
    }
  }

  // Re-open the gate for background work.
  refitting_level_ = false;
  bg_work_gate_closed_ = false;

  mutex_.Unlock();
  // Free outside the mutex; whichever pointer was not consumed above is
  // deleted here (delete on nullptr is a no-op).
  delete superversion_to_free;
  delete new_superversion;
}
+
+int DBImpl::NumberLevels() {
+  return options_.num_levels;
+}
+
+int DBImpl::MaxMemCompactionLevel() {
+  return options_.max_mem_compaction_level;
+}
+
+int DBImpl::Level0StopWriteTrigger() {
+  return options_.level0_stop_writes_trigger;
+}
+
+Status DBImpl::Flush(const FlushOptions& options) {
+  Status status = FlushMemTable(options);
+  return status;
+}
+
+SequenceNumber DBImpl::GetLatestSequenceNumber() const {
+  return versions_->LastSequence();
+}
+
+Status DBImpl::GetUpdatesSince(SequenceNumber seq,
+                               unique_ptr<TransactionLogIterator>* iter) {
+
+  RecordTick(options_.statistics.get(), GET_UPDATES_SINCE_CALLS);
+  if (seq > versions_->LastSequence()) {
+    return Status::IOError("Requested sequence not yet written in the db");
+  }
+  //  Get all sorted Wal Files.
+  //  Do binary search and open files and find the seq number.
+
+  std::unique_ptr<VectorLogPtr> wal_files(new VectorLogPtr);
+  Status s = GetSortedWalFiles(*wal_files);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = RetainProbableWalFiles(*wal_files, seq);
+  if (!s.ok()) {
+    return s;
+  }
+  iter->reset(
+    new TransactionLogIteratorImpl(options_.wal_dir,
+                                   &options_,
+                                   storage_options_,
+                                   seq,
+                                   std::move(wal_files),
+                                   this));
+  return (*iter)->status();
+}
+
+Status DBImpl::RetainProbableWalFiles(VectorLogPtr& all_logs,
+                                      const SequenceNumber target) {
+  long start = 0; // signed to avoid overflow when target is < first file.
+  long end = static_cast<long>(all_logs.size()) - 1;
+  // Binary Search. avoid opening all files.
+  while (end >= start) {
+    long mid = start + (end - start) / 2;  // Avoid overflow.
+    SequenceNumber current_seq_num = all_logs.at(mid)->StartSequence();
+    if (current_seq_num == target) {
+      end = mid;
+      break;
+    } else if (current_seq_num < target) {
+      start = mid + 1;
+    } else {
+      end = mid - 1;
+    }
+  }
+  size_t start_index = std::max(0l, end); // end could be -ve.
+  // The last wal file is always included
+  all_logs.erase(all_logs.begin(), all_logs.begin() + start_index);
+  return Status::OK();
+}
+
+bool DBImpl::CheckWalFileExistsAndEmpty(const WalFileType type,
+                                        const uint64_t number) {
+  const std::string fname = (type == kAliveLogFile) ?
+    LogFileName(options_.wal_dir, number) :
+    ArchivedLogFileName(options_.wal_dir, number);
+  uint64_t file_size;
+  Status s = env_->GetFileSize(fname, &file_size);
+  return (s.ok() && (file_size == 0));
+}
+
+Status DBImpl::ReadFirstRecord(const WalFileType type, const uint64_t number,
+                               WriteBatch* const result) {
+
+  if (type == kAliveLogFile) {
+    std::string fname = LogFileName(options_.wal_dir, number);
+    Status status = ReadFirstLine(fname, result);
+    if (!status.ok()) {
+      //  check if the file got moved to archive.
+      std::string archived_file =
+        ArchivedLogFileName(options_.wal_dir, number);
+      Status s = ReadFirstLine(archived_file, result);
+      if (!s.ok()) {
+        return Status::IOError("Log File has been deleted: " + archived_file);
+      }
+    }
+    return Status::OK();
+  } else if (type == kArchivedLogFile) {
+    std::string fname = ArchivedLogFileName(options_.wal_dir, number);
+    Status status = ReadFirstLine(fname, result);
+    return status;
+  }
+  return Status::NotSupported("File Type Not Known: " + std::to_string(type));
+}
+
// Reads the first log record of `fname` into *batch.
// Corruption of the first record is fatal here (no resync is attempted).
// With paranoid_checks the reader's corruption is propagated via `status`;
// otherwise it is only logged.
Status DBImpl::ReadFirstLine(const std::string& fname,
                             WriteBatch* const batch) {
  struct LogReporter : public log::Reader::Reporter {
    Env* env;
    Logger* info_log;
    const char* fname;
    Status* status;  // nullptr if options_.paranoid_checks==false
    virtual void Corruption(size_t bytes, const Status& s) {
      Log(info_log, "%s%s: dropping %d bytes; %s",
          (this->status == nullptr ? "(ignoring error) " : ""),
          fname, static_cast<int>(bytes), s.ToString().c_str());
      // Keep only the first error seen.
      if (this->status != nullptr && this->status->ok()) *this->status = s;
    }
  };

  unique_ptr<SequentialFile> file;
  Status status = env_->NewSequentialFile(fname, &file, storage_options_);

  if (!status.ok()) {
    return status;
  }


  LogReporter reporter;
  reporter.env = env_;
  reporter.info_log = options_.info_log.get();
  reporter.fname = fname.c_str();
  reporter.status = (options_.paranoid_checks ? &status : nullptr);
  log::Reader reader(std::move(file), &reporter, true/*checksum*/,
                     0/*initial_offset*/);
  std::string scratch;
  Slice record;

  if (reader.ReadRecord(&record, &scratch) && status.ok()) {
    // 12 bytes: presumably the minimum WriteBatch header size — confirm
    // against WriteBatchInternal.
    if (record.size() < 12) {
      reporter.Corruption(
          record.size(), Status::Corruption("log record too small"));
      return Status::IOError("Corruption noted");
      //  TODO read record's till the first no corrupt entry?
    }
    WriteBatchInternal::SetContents(batch, record);
    return Status::OK();
  }
  return Status::IOError("Error reading from file " + fname);
}
+
+struct CompareLogByPointer {
+  bool operator() (const unique_ptr<LogFile>& a,
+                   const unique_ptr<LogFile>& b) {
+    LogFileImpl* a_impl = dynamic_cast<LogFileImpl*>(a.get());
+    LogFileImpl* b_impl = dynamic_cast<LogFileImpl*>(b.get());
+    return *a_impl < *b_impl;
+  }
+};
+
// Scans `path` for WAL files, appends a LogFileImpl entry (number, type,
// start sequence, size) for each to log_files, and sorts the appended
// tail by start sequence. Empty WALs (no first record) are skipped.
Status DBImpl::AppendSortedWalsOfType(const std::string& path,
    VectorLogPtr& log_files, WalFileType log_type) {
  std::vector<std::string> all_files;
  const Status status = env_->GetChildren(path, &all_files);
  if (!status.ok()) {
    return status;
  }
  // Reserve up front so the push_back below cannot reallocate and
  // invalidate pos_start.
  log_files.reserve(log_files.size() + all_files.size());
  VectorLogPtr::iterator pos_start;
  if (!log_files.empty()) {
    // NOTE(review): this points at the last pre-existing element, so that
    // element is included in the sort below — presumably intentional so
    // the new tail is ordered relative to it; confirm with callers.
    pos_start = log_files.end() - 1;
  } else {
    pos_start = log_files.begin();
  }
  for (const auto& f : all_files) {
    uint64_t number;
    FileType type;
    if (ParseFileName(f, &number, &type) && type == kLogFile){

      WriteBatch batch;
      Status s = ReadFirstRecord(log_type, number, &batch);
      if (!s.ok()) {
        // A zero-length WAL has no first record; skip it rather than fail.
        if (CheckWalFileExistsAndEmpty(log_type, number)) {
          continue;
        }
        return s;
      }

      uint64_t size_bytes;
      s = env_->GetFileSize(LogFileName(path, number), &size_bytes);
      if (!s.ok()) {
        return s;
      }

      log_files.push_back(std::move(unique_ptr<LogFile>(new LogFileImpl(
        number, log_type, WriteBatchInternal::Sequence(&batch), size_bytes))));
    }
  }
  CompareLogByPointer compare_log_files;
  std::sort(pos_start, log_files.end(), compare_log_files);
  return status;
}
+
// Runs a blocking manual compaction of [begin, end] from input_level to
// output_level; returns only when the compaction is fully done (or the DB
// is shutting down / hit a background error). For universal compaction the
// range is ignored and everything is compacted.
void DBImpl::RunManualCompaction(int input_level,
                                 int output_level,
                                 const Slice* begin,
                                 const Slice* end) {
  assert(input_level >= 0);

  InternalKey begin_storage, end_storage;

  ManualCompaction manual;
  manual.input_level = input_level;
  manual.output_level = output_level;
  manual.done = false;
  manual.in_progress = false;
  // For universal compaction, we enforce every manual compaction to compact
  // all files.
  if (begin == nullptr ||
      options_.compaction_style == kCompactionStyleUniversal) {
    manual.begin = nullptr;
  } else {
    // kMaxSequenceNumber sorts before all entries with the same user key,
    // making the range inclusive of `begin`.
    begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
    manual.begin = &begin_storage;
  }
  if (end == nullptr ||
      options_.compaction_style == kCompactionStyleUniversal) {
    manual.end = nullptr;
  } else {
    // Sequence number 0 sorts after all entries with the same user key,
    // making the range inclusive of `end`.
    end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
    manual.end = &end_storage;
  }

  MutexLock l(&mutex_);

  // When a manual compaction arrives, temporarily disable scheduling of
  // non-manual compactions and wait until the number of scheduled compaction
  // jobs drops to zero. This is needed to ensure that this manual compaction
  // can compact any range of keys/files.
  //
  // bg_manual_only_ is non-zero when at least one thread is inside
  // RunManualCompaction(), i.e. during that time no other compaction will
  // get scheduled (see MaybeScheduleFlushOrCompaction).
  //
  // Note that the following loop doesn't stop more that one thread calling
  // RunManualCompaction() from getting to the second while loop below.
  // However, only one of them will actually schedule compaction, while
  // others will wait on a condition variable until it completes.

  ++bg_manual_only_;
  while (bg_compaction_scheduled_ > 0) {
    Log(options_.info_log,
        "Manual compaction waiting for all other scheduled background "
        "compactions to finish");
    bg_cv_.Wait();
  }

  Log(options_.info_log, "Manual compaction starting");

  // Publish `manual` as the pending manual compaction; the background
  // thread clears manual_compaction_ and sets manual.done as it proceeds.
  while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) {
    assert(bg_manual_only_ > 0);
    if (manual_compaction_ != nullptr) {
      // Running either this or some other manual compaction
      bg_cv_.Wait();
    } else {
      manual_compaction_ = &manual;
      MaybeScheduleFlushOrCompaction();
    }
  }

  assert(!manual.in_progress);
  assert(bg_manual_only_ > 0);
  --bg_manual_only_;
}
+
+void DBImpl::TEST_CompactRange(int level,
+                               const Slice* begin,
+                               const Slice* end) {
+  int output_level = (options_.compaction_style == kCompactionStyleUniversal)
+                         ? level
+                         : level + 1;
+  RunManualCompaction(level, output_level, begin, end);
+}
+
+Status DBImpl::FlushMemTable(const FlushOptions& options) {
+  // nullptr batch means just wait for earlier writes to be done
+  Status s = Write(WriteOptions(), nullptr);
+  if (s.ok() && options.wait) {
+    // Wait until the compaction completes
+    s = WaitForFlushMemTable();
+  }
+  return s;
+}
+
+Status DBImpl::WaitForFlushMemTable() {
+  Status s;
+  // Wait until the compaction completes
+  MutexLock l(&mutex_);
+  while (imm_.size() > 0 && bg_error_.ok()) {
+    bg_cv_.Wait();
+  }
+  if (imm_.size() != 0) {
+    s = bg_error_;
+  }
+  return s;
+}
+
+Status DBImpl::TEST_FlushMemTable() {
+  return FlushMemTable(FlushOptions());
+}
+
+Status DBImpl::TEST_WaitForFlushMemTable() {
+  return WaitForFlushMemTable();
+}
+
+Status DBImpl::TEST_WaitForCompact() {
+  // Wait until the compaction completes
+
+  // TODO: a bug here. This function actually does not necessarily
+  // wait for compact. It actually waits for scheduled compaction
+  // OR flush to finish.
+
+  MutexLock l(&mutex_);
+  while ((bg_compaction_scheduled_ || bg_flush_scheduled_) &&
+         bg_error_.ok()) {
+    bg_cv_.Wait();
+  }
+  return bg_error_;
+}
+
// Enqueues background flush and/or compaction work if the current state
// calls for it and the thread-pool limits allow it.
// REQUIRES: mutex_ held.
void DBImpl::MaybeScheduleFlushOrCompaction() {
  mutex_.AssertHeld();
  if (bg_work_gate_closed_) {
    // gate closed for background work (see ReFitLevel)
  } else if (shutting_down_.Acquire_Load()) {
    // DB is being deleted; no more background compactions
  } else {
    bool is_flush_pending =
      imm_.IsFlushPending(options_.min_write_buffer_number_to_merge);
    if (is_flush_pending &&
        (bg_flush_scheduled_ < options_.max_background_flushes)) {
      // memtable flush needed
      bg_flush_scheduled_++;
      env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH);
    }

    // Schedule BGWorkCompaction if there's a compaction pending (or a memtable
    // flush, but the HIGH pool is not enabled). Do it only if
    // max_background_compactions hasn't been reached and, in case
    // bg_manual_only_ > 0, if it's a manual compaction.
    if ((manual_compaction_ ||
         versions_->NeedsCompaction() ||
         (is_flush_pending && (options_.max_background_flushes <= 0))) &&
        bg_compaction_scheduled_ < options_.max_background_compactions &&
        (!bg_manual_only_ || manual_compaction_)) {

      bg_compaction_scheduled_++;
      env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW);
    }
  }
}
+
+void DBImpl::BGWorkFlush(void* db) {
+  reinterpret_cast<DBImpl*>(db)->BackgroundCallFlush();
+}
+
+void DBImpl::BGWorkCompaction(void* db) {
+  reinterpret_cast<DBImpl*>(db)->BackgroundCallCompaction();
+}
+
+Status DBImpl::BackgroundFlush(bool* madeProgress,
+                               DeletionState& deletion_state) {
+  Status stat;
+  while (stat.ok() &&
+         imm_.IsFlushPending(options_.min_write_buffer_number_to_merge)) {
+    Log(options_.info_log,
+        "BackgroundCallFlush doing FlushMemTableToOutputFile, flush slots available %d",
+        options_.max_background_flushes - bg_flush_scheduled_);
+    stat = FlushMemTableToOutputFile(madeProgress, deletion_state);
+  }
+  return stat;
+}
+
// Background-thread entry point for flushes (HIGH-priority pool): runs all
// pending flushes, purges obsolete files, reschedules work on progress,
// and wakes waiters.
void DBImpl::BackgroundCallFlush() {
  bool madeProgress = false;
  DeletionState deletion_state(options_.max_write_buffer_number, true);
  assert(bg_flush_scheduled_);
  MutexLock l(&mutex_);

  Status s;
  if (!shutting_down_.Acquire_Load()) {
    s = BackgroundFlush(&madeProgress, deletion_state);
    if (!s.ok()) {
      // Wait a little bit before retrying background compaction in
      // case this is an environmental problem and we do not want to
      // chew up resources for failed compactions for the duration of
      // the problem.
      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
      Log(options_.info_log, "Waiting after background flush error: %s",
          s.ToString().c_str());
      // Sleep without the mutex so foreground work is not blocked.
      mutex_.Unlock();
      LogFlush(options_.info_log);
      env_->SleepForMicroseconds(1000000);
      mutex_.Lock();
    }
  }

  // If !s.ok(), this means that Flush failed. In that case, we want
  // to delete all obsolete files and we force FindObsoleteFiles()
  FindObsoleteFiles(deletion_state, !s.ok());
  // delete unnecessary files if any, this is done outside the mutex
  if (deletion_state.HaveSomethingToDelete()) {
    mutex_.Unlock();
    PurgeObsoleteFiles(deletion_state);
    mutex_.Lock();
  }

  bg_flush_scheduled_--;
  if (madeProgress) {
    MaybeScheduleFlushOrCompaction();
  }
  bg_cv_.SignalAll();
}
+
+
+void DBImpl::TEST_PurgeObsoleteteWAL() {
+  PurgeObsoleteWALFiles();
+}
+
+uint64_t DBImpl::TEST_GetLevel0TotalSize() {
+  MutexLock l(&mutex_);
+  return versions_->current()->NumLevelBytes(0);
+}
+
// Background-thread entry point for compactions (LOW-priority pool): runs
// one compaction pass, purges obsolete files, reschedules on progress, and
// wakes waiters.
void DBImpl::BackgroundCallCompaction() {
  bool madeProgress = false;
  DeletionState deletion_state(options_.max_write_buffer_number, true);

  MaybeDumpStats();

  MutexLock l(&mutex_);
  // Log(options_.info_log, "XXX BG Thread %llx process new work item", pthread_self());
  assert(bg_compaction_scheduled_);
  Status s;
  if (!shutting_down_.Acquire_Load()) {
    s = BackgroundCompaction(&madeProgress, deletion_state);
    if (!s.ok()) {
      // Wait a little bit before retrying background compaction in
      // case this is an environmental problem and we do not want to
      // chew up resources for failed compactions for the duration of
      // the problem.
      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
      Log(options_.info_log, "Waiting after background compaction error: %s",
          s.ToString().c_str());
      // Sleep without the mutex so foreground work is not blocked.
      mutex_.Unlock();
      LogFlush(options_.info_log);
      env_->SleepForMicroseconds(1000000);
      mutex_.Lock();
    }
  }

  // If !s.ok(), this means that Compaction failed. In that case, we want
  // to delete all obsolete files we might have created and we force
  // FindObsoleteFiles(). This is because deletion_state does not catch
  // all created files if compaction failed.
  FindObsoleteFiles(deletion_state, !s.ok());

  // delete unnecessary files if any, this is done outside the mutex
  if (deletion_state.HaveSomethingToDelete()) {
    mutex_.Unlock();
    PurgeObsoleteFiles(deletion_state);
    mutex_.Lock();
  }

  bg_compaction_scheduled_--;

  MaybeScheduleLogDBDeployStats();

  // Previous compaction may have produced too many files in a level,
  // So reschedule another compaction if we made progress in the
  // last compaction.
  if (madeProgress) {
    MaybeScheduleFlushOrCompaction();
  }
  bg_cv_.SignalAll();

}
+
// Performs one unit of compaction work: first drains pending memtable
// flushes, then either executes the pending manual compaction, picks an
// automatic one, performs a trivial file move, or runs DoCompactionWork.
// Updates manual-compaction bookkeeping on the way out.
// REQUIRES: mutex_ held.
Status DBImpl::BackgroundCompaction(bool* madeProgress,
                                    DeletionState& deletion_state) {
  *madeProgress = false;
  mutex_.AssertHeld();

  // TODO: remove memtable flush from formal compaction
  while (imm_.IsFlushPending(options_.min_write_buffer_number_to_merge)) {
    Log(options_.info_log,
        "BackgroundCompaction doing FlushMemTableToOutputFile, compaction slots "
        "available %d",
        options_.max_background_compactions - bg_compaction_scheduled_);
    Status stat = FlushMemTableToOutputFile(madeProgress, deletion_state);
    if (!stat.ok()) {
      return stat;
    }
  }

  unique_ptr<Compaction> c;
  // A manual compaction is ours to run only if no other thread has already
  // marked it in_progress.
  bool is_manual = (manual_compaction_ != nullptr) &&
                   (manual_compaction_->in_progress == false);
  InternalKey manual_end_storage;
  InternalKey* manual_end = &manual_end_storage;
  if (is_manual) {
    ManualCompaction* m = manual_compaction_;
    assert(!m->in_progress);
    m->in_progress = true; // another thread cannot pick up the same work
    c.reset(versions_->CompactRange(
        m->input_level, m->output_level, m->begin, m->end, &manual_end));
    if (!c) {
      m->done = true;
    }
    Log(options_.info_log,
        "Manual compaction from level-%d to level-%d from %s .. %s; will stop "
        "at %s\n",
        m->input_level,
        m->output_level,
        (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
        (m->end ? m->end->DebugString().c_str() : "(end)"),
        ((m->done || manual_end == nullptr)
             ? "(end)"
             : manual_end->DebugString().c_str()));
  } else if (!options_.disable_auto_compactions) {
    c.reset(versions_->PickCompaction());
  }

  Status status;
  if (!c) {
    // Nothing to do
    Log(options_.info_log, "Compaction nothing to do");
  } else if (!is_manual && c->IsTrivialMove()) {
    // Move file to next level: only the manifest changes, no data is
    // rewritten.
    assert(c->num_input_files(0) == 1);
    FileMetaData* f = c->input(0, 0);
    c->edit()->DeleteFile(c->level(), f->number);
    c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
                       f->smallest, f->largest,
                       f->smallest_seqno, f->largest_seqno);
    status = versions_->LogAndApply(c->edit(), &mutex_);
    InstallSuperVersion(deletion_state);
    Version::LevelSummaryStorage tmp;
    Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
        static_cast<unsigned long long>(f->number), c->level() + 1,
        static_cast<unsigned long long>(f->file_size),
        status.ToString().c_str(), versions_->current()->LevelSummary(&tmp));
    versions_->ReleaseCompactionFiles(c.get(), status);
    *madeProgress = true;
  } else {
    MaybeScheduleFlushOrCompaction(); // do more compaction work in parallel.
    CompactionState* compact = new CompactionState(c.get());
    status = DoCompactionWork(compact, deletion_state);
    CleanupCompaction(compact, status);
    versions_->ReleaseCompactionFiles(c.get(), status);
    c->ReleaseInputs();
    *madeProgress = true;
  }
  c.reset();

  if (status.ok()) {
    // Done
  } else if (shutting_down_.Acquire_Load()) {
    // Ignore compaction errors found during shutting down
  } else {
    Log(options_.info_log,
        "Compaction error: %s", status.ToString().c_str());
    if (options_.paranoid_checks && bg_error_.ok()) {
      bg_error_ = status;
    }
  }

  if (is_manual) {
    ManualCompaction* m = manual_compaction_;
    if (!status.ok()) {
      m->done = true;
    }
    // For universal compaction:
    //   Because universal compaction always happens at level 0, so one
    //   compaction will pick up all overlapped files. No files will be
    //   filtered out due to size limit and left for a successive compaction.
    //   So we can safely conclude the current compaction.
    //
    //   Also note that, if we don't stop here, then the current compaction
    //   writes a new file back to level 0, which will be used in successive
    //   compaction. Hence the manual compaction will never finish.
    //
    // Stop the compaction if manual_end points to nullptr -- this means
    // that we compacted the whole range. manual_end should always point
    // to nullptr in case of universal compaction
    if (manual_end == nullptr) {
      m->done = true;
    }
    if (!m->done) {
      // We only compacted part of the requested range.  Update *m
      // to the range that is left to be compacted.
      // Universal compaction should always compact the whole range
      assert(options_.compaction_style != kCompactionStyleUniversal);
      m->tmp_storage = *manual_end;
      m->begin = &m->tmp_storage;
    }
    m->in_progress = false; // not being processed anymore
    manual_compaction_ = nullptr;
  }
  return status;
}
+
+void DBImpl::CleanupCompaction(CompactionState* compact, Status status) {
+  mutex_.AssertHeld();
+  if (compact->builder != nullptr) {
+    // May happen if we get a shutdown call in the middle of compaction
+    compact->builder->Abandon();
+    compact->builder.reset();
+  } else {
+    assert(compact->outfile == nullptr);
+  }
+  for (size_t i = 0; i < compact->outputs.size(); i++) {
+    const CompactionState::Output& out = compact->outputs[i];
+    pending_outputs_.erase(out.number);
+
+    // If this file was inserted into the table cache then remove
+    // them here because this compaction was not committed.
+    if (!status.ok()) {
+      table_cache_->Evict(out.number);
+    }
+  }
+  delete compact;
+}
+
+// Allocate the file numbers for the output file. We allocate as
+// many output file numbers as there are files in level+1 (at least one)
+// Insert them into pending_outputs so that they do not get deleted.
+void DBImpl::AllocateCompactionOutputFileNumbers(CompactionState* compact) {
+  mutex_.AssertHeld();
+  assert(compact != nullptr);
+  assert(compact->builder == nullptr);
+  int filesNeeded = compact->compaction->num_input_files(1);
+  for (int i = 0; i < std::max(filesNeeded, 1); i++) {
+    uint64_t file_number = versions_->NewFileNumber();
+    pending_outputs_.insert(file_number);
+    compact->allocated_file_numbers.push_back(file_number);
+  }
+}
+
+// Frees up unused file number.
+void DBImpl::ReleaseCompactionUnusedFileNumbers(CompactionState* compact) {
+  mutex_.AssertHeld();
+  for (const auto file_number : compact->allocated_file_numbers) {
+    pending_outputs_.erase(file_number);
+    // Log(options_.info_log, "XXX releasing unused file num %d", file_number);
+  }
+}
+
// Opens the next compaction output file and attaches a table builder to
// it. Consumes a pre-allocated file number if available; otherwise takes
// the mutex briefly to allocate one.
Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
  assert(compact != nullptr);
  assert(compact->builder == nullptr);
  uint64_t file_number;
  // If we have not yet exhausted the pre-allocated file numbers,
  // then use the one from the front. Otherwise, we have to acquire
  // the heavyweight lock and allocate a new file number.
  if (!compact->allocated_file_numbers.empty()) {
    file_number = compact->allocated_file_numbers.front();
    compact->allocated_file_numbers.pop_front();
  } else {
    mutex_.Lock();
    file_number = versions_->NewFileNumber();
    pending_outputs_.insert(file_number);
    mutex_.Unlock();
  }
  // Record the new output with empty key range / seqno bounds; they are
  // filled in as keys are added.
  CompactionState::Output out;
  out.number = file_number;
  out.smallest.Clear();
  out.largest.Clear();
  out.smallest_seqno = out.largest_seqno = 0;
  compact->outputs.push_back(out);

  // Make the output file
  std::string fname = TableFileName(dbname_, file_number);
  Status s = env_->NewWritableFile(fname, &compact->outfile, storage_options_);

  if (s.ok()) {
    // Over-estimate slightly so we don't end up just barely crossing
    // the threshold.
    compact->outfile->SetPreallocationBlockSize(
      1.1 * versions_->MaxFileSizeForLevel(compact->compaction->output_level()));

    CompressionType compression_type = GetCompressionType(
        options_, compact->compaction->output_level(),
        compact->compaction->enable_compression());

    compact->builder.reset(
        GetTableBuilder(options_, compact->outfile.get(), compression_type));
  }
  LogFlush(options_.info_log);
  return s;
}
+
// Finalizes the current compaction output: finishes (or abandons) the
// builder, syncs and closes the file, and verifies the new table is
// readable through the table cache. Iterator errors from `input` abort
// the file.
Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
                                          Iterator* input) {
  assert(compact != nullptr);
  assert(compact->outfile);
  assert(compact->builder != nullptr);

  const uint64_t output_number = compact->current_output()->number;
  assert(output_number != 0);

  // Check for iterator errors
  Status s = input->status();
  const uint64_t current_entries = compact->builder->NumEntries();
  if (s.ok()) {
    s = compact->builder->Finish();
  } else {
    compact->builder->Abandon();
  }
  // FileSize() is read before resetting the builder; it reflects the final
  // size even on the Abandon path.
  const uint64_t current_bytes = compact->builder->FileSize();
  compact->current_output()->file_size = current_bytes;
  compact->total_bytes += current_bytes;
  compact->builder.reset();

  // Finish and check for file errors
  if (s.ok() && !options_.disableDataSync) {
    if (options_.use_fsync) {
      StopWatch sw(env_, options_.statistics.get(),
                   COMPACTION_OUTFILE_SYNC_MICROS, false);
      s = compact->outfile->Fsync();
    } else {
      StopWatch sw(env_, options_.statistics.get(),
                   COMPACTION_OUTFILE_SYNC_MICROS, false);
      s = compact->outfile->Sync();
    }
  }
  if (s.ok()) {
    s = compact->outfile->Close();
  }
  compact->outfile.reset();

  if (s.ok() && current_entries > 0) {
    // Verify that the table is usable
    Iterator* iter = table_cache_->NewIterator(ReadOptions(),
                                               storage_options_,
                                               output_number,
                                               current_bytes);
    s = iter->status();
    delete iter;
    if (s.ok()) {
      Log(options_.info_log,
          "Generated table #%lu: %lu keys, %lu bytes",
          (unsigned long) output_number,
          (unsigned long) current_entries,
          (unsigned long) current_bytes);
    }
  }
  return s;
}
+
+
+// Record the finished compaction in the manifest: delete the inputs and add
+// every output file to the output level, then log-and-apply the version edit.
+// Must be called with mutex_ held (LogAndApply may release it internally).
+Status DBImpl::InstallCompactionResults(CompactionState* compact) {
+  mutex_.AssertHeld();
+
+  // paranoia: verify that the files that we started with
+  // still exist in the current version and in the same original level.
+  // This ensures that a concurrent compaction did not erroneously
+  // pick the same files to compact.
+  if (!versions_->VerifyCompactionFileConsistency(compact->compaction)) {
+    Log(options_.info_log,  "Compaction %d@%d + %d@%d files aborted",
+      compact->compaction->num_input_files(0),
+      compact->compaction->level(),
+      compact->compaction->num_input_files(1),
+      compact->compaction->level() + 1);
+    return Status::IOError("Compaction input files inconsistent");
+  }
+
+  Log(options_.info_log,  "Compacted %d@%d + %d@%d files => %lld bytes",
+      compact->compaction->num_input_files(0),
+      compact->compaction->level(),
+      compact->compaction->num_input_files(1),
+      compact->compaction->level() + 1,
+      static_cast<long long>(compact->total_bytes));
+
+  // Add compaction outputs
+  compact->compaction->AddInputDeletions(compact->compaction->edit());
+  for (size_t i = 0; i < compact->outputs.size(); i++) {
+    const CompactionState::Output& out = compact->outputs[i];
+    compact->compaction->edit()->AddFile(
+        compact->compaction->output_level(), out.number, out.file_size,
+        out.smallest, out.largest, out.smallest_seqno, out.largest_seqno);
+  }
+  return versions_->LogAndApply(compact->compaction->edit(), &mutex_);
+}
+
+//
+// Given a sequence number, return the sequence number of the
+// earliest snapshot that this sequence number is visible in.
+// On return, *prev_snapshot holds the next-smaller snapshot (0 if none).
+// The snapshots themselves are arranged in ascending order of
+// sequence numbers.
+// Employ a sequential search because the total number of
+// snapshots is typically small.
+// Precondition: 'in' is <= the largest snapshot in the list (callers append
+// the current last sequence as a virtual snapshot); otherwise this asserts.
+inline SequenceNumber DBImpl::findEarliestVisibleSnapshot(
+  SequenceNumber in, std::vector<SequenceNumber>& snapshots,
+  SequenceNumber* prev_snapshot) {
+  SequenceNumber prev __attribute__((unused)) = 0;
+  for (const auto cur : snapshots) {
+    assert(prev <= cur);
+    if (cur >= in) {
+      *prev_snapshot = prev;
+      return cur;
+    }
+    prev = cur; // assignment
+    assert(prev);
+  }
+  // Should be unreachable given the precondition above.
+  Log(options_.info_log,
+      "Looking for seqid %lu but maxseqid is %lu",
+      (unsigned long)in,
+      (unsigned long)snapshots[snapshots.size()-1]);
+  assert(0);
+  return 0;
+}
+
+// Run the actual compaction: merge-iterate over the input files, drop
+// entries hidden by newer versions / obsolete deletions, apply the
+// compaction filter and merge operator, write output tables, and install
+// the results. Entered and exited with mutex_ held; the mutex is released
+// for the duration of the I/O-heavy merge loop.
+Status DBImpl::DoCompactionWork(CompactionState* compact,
+                                DeletionState& deletion_state) {
+  assert(compact);
+  int64_t imm_micros = 0;  // Micros spent doing imm_ compactions
+  Log(options_.info_log,
+      "Compacting %d@%d + %d@%d files, score %.2f slots available %d",
+      compact->compaction->num_input_files(0),
+      compact->compaction->level(),
+      compact->compaction->num_input_files(1),
+      compact->compaction->output_level(),
+      compact->compaction->score(),
+      options_.max_background_compactions - bg_compaction_scheduled_);
+  char scratch[256];
+  compact->compaction->Summary(scratch, sizeof(scratch));
+  Log(options_.info_log, "Compaction start summary: %s\n", scratch);
+
+  assert(versions_->current()->NumLevelFiles(compact->compaction->level()) > 0);
+  assert(compact->builder == nullptr);
+  assert(!compact->outfile);
+
+  // Snapshot bookkeeping: visible_at_tip != 0 is the fast path (no live
+  // snapshots), in which case every key is judged against the tip sequence.
+  SequenceNumber visible_at_tip = 0;
+  SequenceNumber earliest_snapshot;
+  SequenceNumber latest_snapshot = 0;
+  snapshots_.getAll(compact->existing_snapshots);
+  if (compact->existing_snapshots.size() == 0) {
+    // optimize for fast path if there are no snapshots
+    visible_at_tip = versions_->LastSequence();
+    earliest_snapshot = visible_at_tip;
+  } else {
+    latest_snapshot = compact->existing_snapshots.back();
+    // Add the current seqno as the 'latest' virtual
+    // snapshot to the end of this list.
+    compact->existing_snapshots.push_back(versions_->LastSequence());
+    earliest_snapshot = compact->existing_snapshots[0];
+  }
+
+  // Is this compaction producing files at the bottommost level?
+  bool bottommost_level = compact->compaction->BottomMostLevel();
+
+  // Allocate the output file numbers before we release the lock
+  AllocateCompactionOutputFileNumbers(compact);
+
+  // Release mutex while we're actually doing the compaction work
+  mutex_.Unlock();
+
+  const uint64_t start_micros = env_->NowMicros();
+  unique_ptr<Iterator> input(versions_->MakeInputIterator(compact->compaction));
+  input->SeekToFirst();
+  Status status;
+  ParsedInternalKey ikey;
+  std::string current_user_key;
+  bool has_current_user_key = false;
+  SequenceNumber last_sequence_for_key __attribute__((unused)) =
+    kMaxSequenceNumber;
+  SequenceNumber visible_in_snapshot = kMaxSequenceNumber;
+  std::string compaction_filter_value;
+  std::vector<char> delete_key; // for compaction filter
+  MergeHelper merge(user_comparator(), options_.merge_operator.get(),
+                    options_.info_log.get(),
+                    false /* internal key corruption is expected */);
+  // Prefer an explicitly-set compaction filter; otherwise create one from
+  // the factory for the lifetime of this compaction.
+  auto compaction_filter = options_.compaction_filter;
+  std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
+  if (!compaction_filter) {
+    auto context = compact->GetFilterContext();
+    compaction_filter_from_factory =
+      options_.compaction_filter_factory->CreateCompactionFilter(context);
+    compaction_filter = compaction_filter_from_factory.get();
+  }
+
+  for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
+    // Prioritize immutable compaction work
+    // TODO: remove memtable flush from normal compaction work
+    if (imm_.imm_flush_needed.NoBarrier_Load() != nullptr) {
+      const uint64_t imm_start = env_->NowMicros();
+      LogFlush(options_.info_log);
+      mutex_.Lock();
+      if (imm_.IsFlushPending(options_.min_write_buffer_number_to_merge)) {
+        FlushMemTableToOutputFile(nullptr, deletion_state);
+        bg_cv_.SignalAll();  // Wakeup MakeRoomForWrite() if necessary
+      }
+      mutex_.Unlock();
+      imm_micros += (env_->NowMicros() - imm_start);
+    }
+
+    Slice key = input->key();
+    Slice value = input->value();
+
+    // Cut the output file early if adding this key would create too much
+    // overlap with the grandparent level.
+    if (compact->compaction->ShouldStopBefore(key) &&
+        compact->builder != nullptr) {
+      status = FinishCompactionOutputFile(compact, input.get());
+      if (!status.ok()) {
+        break;
+      }
+    }
+
+    // Handle key/value, add to state, etc.
+    bool drop = false;
+    bool current_entry_is_merging = false;
+    if (!ParseInternalKey(key, &ikey)) {
+      // Do not hide error keys
+      // TODO: error key stays in db forever? Figure out the intention/rationale
+      // v10 error v8 : we cannot hide v8 even though it's pretty obvious.
+      current_user_key.clear();
+      has_current_user_key = false;
+      last_sequence_for_key = kMaxSequenceNumber;
+      visible_in_snapshot = kMaxSequenceNumber;
+    } else {
+      if (!has_current_user_key ||
+          user_comparator()->Compare(ikey.user_key,
+                                     Slice(current_user_key)) != 0) {
+        // First occurrence of this user key
+        current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
+        has_current_user_key = true;
+        last_sequence_for_key = kMaxSequenceNumber;
+        visible_in_snapshot = kMaxSequenceNumber;
+
+        // apply the compaction filter to the first occurrence of the user key
+        if (compaction_filter &&
+            ikey.type == kTypeValue &&
+            (visible_at_tip || ikey.sequence > latest_snapshot)) {
+          // If the user has specified a compaction filter and the sequence
+          // number is greater than any external snapshot, then invoke the
+          // filter.
+          // If the return value of the compaction filter is true, replace
+          // the entry with a delete marker.
+          bool value_changed = false;
+          compaction_filter_value.clear();
+          bool to_delete =
+            compaction_filter->Filter(compact->compaction->level(),
+                                               ikey.user_key, value,
+                                               &compaction_filter_value,
+                                               &value_changed);
+          if (to_delete) {
+            // make a copy of the original key
+            delete_key.assign(key.data(), key.data() + key.size());
+            // convert it to a delete
+            UpdateInternalKey(&delete_key[0], delete_key.size(),
+                              ikey.sequence, kTypeDeletion);
+            // anchor the key again
+            key = Slice(&delete_key[0], delete_key.size());
+            // needed because ikey is backed by key
+            ParseInternalKey(key, &ikey);
+            // no value associated with delete
+            value.clear();
+            RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_USER);
+          } else if (value_changed) {
+            value = compaction_filter_value;
+          }
+        }
+
+      }
+
+      // If there are no snapshots, then this kv affects visibility at tip.
+      // Otherwise, search through all existing snapshots to find
+      // the earliest snapshot that is affected by this kv.
+      SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot
+      SequenceNumber visible = visible_at_tip ?
+        visible_at_tip :
+        findEarliestVisibleSnapshot(ikey.sequence,
+                                    compact->existing_snapshots,
+                                    &prev_snapshot);
+
+      if (visible_in_snapshot == visible) {
+        // If the earliest snapshot in which this key is visible
+        // is the same as the visibility of a previous instance of the
+        // same key, then this kv is not visible in any snapshot.
+        // Hidden by a newer entry for same user key
+        // TODO: why not > ?
+        assert(last_sequence_for_key >= ikey.sequence);
+        drop = true;    // (A)
+        RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_NEWER_ENTRY);
+      } else if (ikey.type == kTypeDeletion &&
+                 ikey.sequence <= earliest_snapshot &&
+                 compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
+        // For this user key:
+        // (1) there is no data in higher levels
+        // (2) data in lower levels will have larger sequence numbers
+        // (3) data in layers that are being compacted here and have
+        //     smaller sequence numbers will be dropped in the next
+        //     few iterations of this loop (by rule (A) above).
+        // Therefore this deletion marker is obsolete and can be dropped.
+        drop = true;
+        RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_OBSOLETE);
+      } else if (ikey.type == kTypeMerge) {
+        // We know the merge type entry is not hidden, otherwise we would
+        // have hit (A)
+        // We encapsulate the merge related state machine in a different
+        // object to minimize change to the existing flow. Turns out this
+        // logic could also be nicely re-used for memtable flush purge
+        // optimization in BuildTable.
+        merge.MergeUntil(input.get(), prev_snapshot, bottommost_level,
+                         options_.statistics.get());
+        current_entry_is_merging = true;
+        if (merge.IsSuccess()) {
+          // Successfully found Put/Delete/(end-of-key-range) while merging
+          // Get the merge result
+          key = merge.key();
+          ParseInternalKey(key, &ikey);
+          value = merge.value();
+        } else {
+          // Did not find a Put/Delete/(end-of-key-range) while merging
+          // We now have some stack of merge operands to write out.
+          // NOTE: key,value, and ikey are now referring to old entries.
+          //       These will be correctly set below.
+          assert(!merge.keys().empty());
+          assert(merge.keys().size() == merge.values().size());
+
+          // Hack to make sure last_sequence_for_key is correct
+          ParseInternalKey(merge.keys().front(), &ikey);
+        }
+      }
+
+      last_sequence_for_key = ikey.sequence;
+      visible_in_snapshot = visible;
+    }
+#if 0
+    Log(options_.info_log,
+        "  Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
+        "%d smallest_snapshot: %d level: %d bottommost %d",
+        ikey.user_key.ToString().c_str(),
+        (int)ikey.sequence, ikey.type, kTypeValue, drop,
+        compact->compaction->IsBaseLevelForKey(ikey.user_key),
+        (int)last_sequence_for_key, (int)earliest_snapshot,
+        compact->compaction->level(), bottommost_level);
+#endif
+
+    if (!drop) {
+      // We may write a single key (e.g.: for Put/Delete or successful merge).
+      // Or we may instead have to write a sequence/list of keys.
+      // We have to write a sequence iff we have an unsuccessful merge
+      bool has_merge_list = current_entry_is_merging && !merge.IsSuccess();
+      const std::deque<std::string>* keys = nullptr;
+      const std::deque<std::string>* values = nullptr;
+      std::deque<std::string>::const_reverse_iterator key_iter;
+      std::deque<std::string>::const_reverse_iterator value_iter;
+      if (has_merge_list) {
+        keys = &merge.keys();
+        values = &merge.values();
+        key_iter = keys->rbegin();    // The back (*rbegin()) is the first key
+        value_iter = values->rbegin();
+
+        key = Slice(*key_iter);
+        value = Slice(*value_iter);
+      }
+
+      // If we have a list of keys to write, traverse the list.
+      // If we have a single key to write, simply write that key.
+      while (true) {
+        // Invariant: key,value,ikey will always be the next entry to write
+        char* kptr = (char*)key.data();
+        std::string kstr;
+
+        // Zeroing out the sequence number leads to better compression.
+        // If this is the bottommost level (no files in lower levels)
+        // and the earliest snapshot is larger than this seqno
+        // then we can squash the seqno to zero.
+        if (options_.compaction_style == kCompactionStyleLevel &&
+            bottommost_level && ikey.sequence < earliest_snapshot &&
+            ikey.type != kTypeMerge) {
+          assert(ikey.type != kTypeDeletion);
+          // make a copy because updating in place would cause problems
+          // with the priority queue that is managing the input key iterator
+          kstr.assign(key.data(), key.size());
+          kptr = (char *)kstr.c_str();
+          UpdateInternalKey(kptr, key.size(), (uint64_t)0, ikey.type);
+        }
+
+        Slice newkey(kptr, key.size());
+        assert((key.clear(), 1)); // we do not need 'key' anymore
+
+        // Open output file if necessary
+        if (compact->builder == nullptr) {
+          status = OpenCompactionOutputFile(compact);
+          if (!status.ok()) {
+            break;
+          }
+        }
+
+        // Track the smallest/largest key and seqno range of the output file.
+        SequenceNumber seqno = GetInternalKeySeqno(newkey);
+        if (compact->builder->NumEntries() == 0) {
+          compact->current_output()->smallest.DecodeFrom(newkey);
+          compact->current_output()->smallest_seqno = seqno;
+        } else {
+          compact->current_output()->smallest_seqno =
+            std::min(compact->current_output()->smallest_seqno, seqno);
+        }
+        compact->current_output()->largest.DecodeFrom(newkey);
+        compact->builder->Add(newkey, value);
+        compact->current_output()->largest_seqno =
+          std::max(compact->current_output()->largest_seqno, seqno);
+
+        // Close output file if it is big enough
+        if (compact->builder->FileSize() >=
+            compact->compaction->MaxOutputFileSize()) {
+          status = FinishCompactionOutputFile(compact, input.get());
+          if (!status.ok()) {
+            break;
+          }
+        }
+
+        // If we have a list of entries, move to next element
+        // If we only had one entry, then break the loop.
+        if (has_merge_list) {
+          ++key_iter;
+          ++value_iter;
+
+          // If at end of list
+          if (key_iter == keys->rend() || value_iter == values->rend()) {
+            // Sanity Check: if one ends, then both end
+            assert(key_iter == keys->rend() && value_iter == values->rend());
+            break;
+          }
+
+          // Otherwise not at end of list. Update key, value, and ikey.
+          key = Slice(*key_iter);
+          value = Slice(*value_iter);
+          ParseInternalKey(key, &ikey);
+
+        } else{
+          // Only had one item to begin with (Put/Delete)
+          break;
+        }
+      }
+    }
+
+    // MergeUntil has moved input to the next entry
+    if (!current_entry_is_merging) {
+      input->Next();
+    }
+  }
+
+  if (status.ok() && shutting_down_.Acquire_Load()) {
+    status = Status::IOError("Database shutdown started during compaction");
+  }
+  if (status.ok() && compact->builder != nullptr) {
+    status = FinishCompactionOutputFile(compact, input.get());
+  }
+  if (status.ok()) {
+    status = input->status();
+  }
+  input.reset();
+
+  // Gather per-compaction statistics (excluding time spent flushing imm_).
+  CompactionStats stats;
+  stats.micros = env_->NowMicros() - start_micros - imm_micros;
+  if (options_.statistics.get()) {
+    options_.statistics.get()->measureTime(COMPACTION_TIME, stats.micros);
+  }
+  stats.files_in_leveln = compact->compaction->num_input_files(0);
+  stats.files_in_levelnp1 = compact->compaction->num_input_files(1);
+
+  int num_output_files = compact->outputs.size();
+  if (compact->builder != nullptr) {
+    // An error occurred so ignore the last output.
+    assert(num_output_files > 0);
+    --num_output_files;
+  }
+  stats.files_out_levelnp1 = num_output_files;
+
+  for (int i = 0; i < compact->compaction->num_input_files(0); i++) {
+    stats.bytes_readn += compact->compaction->input(0, i)->file_size;
+    RecordTick(options_.statistics.get(), COMPACT_READ_BYTES,
+               compact->compaction->input(0, i)->file_size);
+  }
+
+  for (int i = 0; i < compact->compaction->num_input_files(1); i++) {
+    stats.bytes_readnp1 += compact->compaction->input(1, i)->file_size;
+    RecordTick(options_.statistics.get(), COMPACT_READ_BYTES,
+               compact->compaction->input(1, i)->file_size);
+  }
+
+  for (int i = 0; i < num_output_files; i++) {
+    stats.bytes_written += compact->outputs[i].file_size;
+    RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES,
+               compact->outputs[i].file_size);
+  }
+
+  LogFlush(options_.info_log);
+  mutex_.Lock();
+  stats_[compact->compaction->output_level()].Add(stats);
+
+  // if there were any unused file numbers (mostly in case of
+  // compaction error), free up the entries from pending_outputs
+  ReleaseCompactionUnusedFileNumbers(compact);
+
+  if (status.ok()) {
+    status = InstallCompactionResults(compact);
+    InstallSuperVersion(deletion_state);
+  }
+  Version::LevelSummaryStorage tmp;
+  Log(options_.info_log,
+      "compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) "
+      "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) "
+      "write-amplify(%.1f) %s\n",
+      versions_->current()->LevelSummary(&tmp),
+      (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) /
+          (double)stats.micros,
+      compact->compaction->output_level(), stats.files_in_leveln,
+      stats.files_in_levelnp1, stats.files_out_levelnp1,
+      stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0,
+      stats.bytes_written / 1048576.0,
+      (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) /
+          (double)stats.bytes_readn,
+      stats.bytes_written / (double)stats.bytes_readn,
+      status.ToString().c_str());
+
+  return status;
+}
+
+namespace {
+// State attached to each internal iterator so that the memtables and the
+// Version it references stay alive for the iterator's lifetime.
+struct IterState {
+  port::Mutex* mu;
+  Version* version;
+  std::vector<MemTable*> mem; // includes both mem_ and imm_
+  DBImpl *db;
+};
+
+// Iterator cleanup callback: drop the references taken in
+// NewInternalIterator, collect now-unreferenced memtables, and purge any
+// obsolete files. Takes the DB mutex; file purge runs outside the lock.
+static void CleanupIteratorState(void* arg1, void* arg2) {
+  IterState* state = reinterpret_cast<IterState*>(arg1);
+  DBImpl::DeletionState deletion_state(state->db->GetOptions().
+                                       max_write_buffer_number);
+  state->mu->Lock();
+  for (unsigned int i = 0; i < state->mem.size(); i++) {
+    MemTable* m = state->mem[i]->Unref();
+    if (m != nullptr) {
+      // Refcount hit zero; defer actual deletion until after unlock.
+      deletion_state.memtables_to_free.push_back(m);
+    }
+  }
+  state->version->Unref();
+  // fast path FindObsoleteFiles
+  state->db->FindObsoleteFiles(deletion_state, false, true);
+  state->mu->Unlock();
+  state->db->PurgeObsoleteFiles(deletion_state);
+  delete state;
+}
+}  // namespace
+
+// Build a merging iterator over the mutable memtable, all immutable
+// memtables, and the current version's files. References are taken under
+// the mutex and released by CleanupIteratorState when the iterator dies.
+// *latest_snapshot receives the last sequence number at creation time.
+Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
+                                      SequenceNumber* latest_snapshot) {
+  IterState* cleanup = new IterState;
+  MemTable* mutable_mem;
+  std::vector<MemTable*> immutables;
+  Version* version;
+
+  // Collect together all needed child iterators for mem
+  mutex_.Lock();
+  *latest_snapshot = versions_->LastSequence();
+  mem_->Ref();
+  mutable_mem = mem_;
+  // Collect together all needed child iterators for imm_
+  imm_.GetMemTables(&immutables);
+  for (unsigned int i = 0; i < immutables.size(); i++) {
+    immutables[i]->Ref();
+  }
+  // Collect iterators for files in L0 - Ln
+  versions_->current()->Ref();
+  version = versions_->current();
+  mutex_.Unlock();
+
+  // Child iterators are created outside the lock; the refs above keep the
+  // underlying data alive.
+  std::vector<Iterator*> list;
+  list.push_back(mutable_mem->NewIterator(options));
+  cleanup->mem.push_back(mutable_mem);
+  for (MemTable* m : immutables) {
+    list.push_back(m->NewIterator(options));
+    cleanup->mem.push_back(m);
+  }
+  version->AddIterators(options, storage_options_, &list);
+  Iterator* internal_iter =
+      NewMergingIterator(&internal_comparator_, &list[0], list.size());
+  cleanup->version = version;
+  cleanup->mu = &mutex_;
+  cleanup->db = this;
+  internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
+
+  return internal_iter;
+}
+
+// Test hook: internal iterator with default read options; the latest
+// snapshot value is discarded.
+Iterator* DBImpl::TEST_NewInternalIterator() {
+  SequenceNumber ignored;
+  return NewInternalIterator(ReadOptions(), &ignored);
+}
+
+// Test hook: forwards to the current Version under the DB mutex.
+int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
+  MutexLock l(&mutex_);
+  return versions_->current()->MaxNextLevelOverlappingBytes();
+}
+
+// Public point lookup; thin wrapper over GetImpl (no value_found out-param).
+Status DBImpl::Get(const ReadOptions& options,
+                   const Slice& key,
+                   std::string* value) {
+  return GetImpl(options, key, value);
+}
+
+// DeletionState gets created and destructed outside of the lock -- we
+// use this conveniently to:
+// * malloc one SuperVersion() outside of the lock -- new_superversion
+// * delete one SuperVersion() outside of the lock -- superversion_to_free
+//
+// However, if InstallSuperVersion() gets called twice with the same
+// deletion_state, we can't reuse the SuperVersion() that got malloced because
+// the first call already used it. In that rare case, we take a hit and create
+// a new SuperVersion() inside of the mutex. We do a similar thing
+// for superversion_to_free
+void DBImpl::InstallSuperVersion(DeletionState& deletion_state) {
+  // if new_superversion == nullptr, it means somebody already used it
+  SuperVersion* new_superversion =
+    (deletion_state.new_superversion != nullptr) ?
+    deletion_state.new_superversion : new SuperVersion();
+  SuperVersion* old_superversion = InstallSuperVersion(new_superversion);
+  deletion_state.new_superversion = nullptr;
+  if (deletion_state.superversion_to_free != nullptr) {
+    // somebody already put it there; delete the old one inline instead
+    delete old_superversion;
+  } else {
+    // hand the old superversion to the caller for deletion outside the lock
+    deletion_state.superversion_to_free = old_superversion;
+  }
+}
+
+// Swap in a new SuperVersion (mem_ + imm_ + current version) under the DB
+// mutex. Returns the old SuperVersion if its refcount dropped to zero so the
+// caller can delete it outside the mutex; returns nullptr otherwise.
+DBImpl::SuperVersion* DBImpl::InstallSuperVersion(
+    SuperVersion* new_superversion) {
+  mutex_.AssertHeld();
+  new_superversion->Init(mem_, imm_, versions_->current());
+  SuperVersion* old_superversion = super_version_;
+  super_version_ = new_superversion;
+  if (old_superversion != nullptr && old_superversion->Unref()) {
+    old_superversion->Cleanup();
+    return old_superversion; // will let caller delete outside of mutex
+  }
+  return nullptr;
+}
+
+// Core lookup path shared by Get() and KeyMayExist(): consult the mutable
+// memtable, then immutable memtables, then the current version's files,
+// all through a ref-counted SuperVersion so the mutex is only held briefly.
+// value_found (may be nullptr) supports block-cache-only probes.
+Status DBImpl::GetImpl(const ReadOptions& options,
+                       const Slice& key,
+                       std::string* value,
+                       bool* value_found) {
+  Status s;
+
+  StopWatch sw(env_, options_.statistics.get(), DB_GET, false);
+  SequenceNumber snapshot;
+  if (options.snapshot != nullptr) {
+    snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
+  } else {
+    snapshot = versions_->LastSequence();
+  }
+
+  // This can be replaced by using atomics and spinlock instead of big mutex
+  mutex_.Lock();
+  SuperVersion* get_version = super_version_->Ref();
+  mutex_.Unlock();
+
+  bool have_stat_update = false;
+  Version::GetStats stats;
+
+  // Prepare to store a list of merge operations if merge occurs.
+  MergeContext merge_context;
+
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+  LookupKey lkey(key, snapshot);
+  if (get_version->mem->Get(lkey, value, &s, merge_context, options_)) {
+    // Done
+    RecordTick(options_.statistics.get(), MEMTABLE_HIT);
+  } else if (get_version->imm.Get(lkey, value, &s, merge_context, options_)) {
+    // Done
+    RecordTick(options_.statistics.get(), MEMTABLE_HIT);
+  } else {
+    get_version->current->Get(options, lkey, value, &s, &merge_context, &stats,
+                              options_, value_found);
+    have_stat_update = true;
+    RecordTick(options_.statistics.get(), MEMTABLE_MISS);
+  }
+
+  // Drop our SuperVersion reference; Cleanup must run under the mutex, but
+  // the actual delete happens outside it.
+  bool delete_get_version = false;
+  if (!options_.disable_seek_compaction && have_stat_update) {
+    mutex_.Lock();
+    if (get_version->current->UpdateStats(stats)) {
+      // Seek stats suggest this file is hot; maybe trigger a compaction.
+      MaybeScheduleFlushOrCompaction();
+    }
+    if (get_version->Unref()) {
+      get_version->Cleanup();
+      delete_get_version = true;
+    }
+    mutex_.Unlock();
+  } else {
+    if (get_version->Unref()) {
+      mutex_.Lock();
+      get_version->Cleanup();
+      mutex_.Unlock();
+      delete_get_version = true;
+    }
+  }
+  if (delete_get_version) {
+    delete get_version;
+  }
+
+  // Note, tickers are atomic now - no lock protection needed any more.
+  RecordTick(options_.statistics.get(), NUMBER_KEYS_READ);
+  RecordTick(options_.statistics.get(), BYTES_READ, value->size());
+  return s;
+}
+
+// Batched lookup: pin the memtables and current version once, then run the
+// full Get process for every key outside the mutex. Returns one Status per
+// key; *values is always resized to keys.size().
+std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
+                                     const std::vector<Slice>& keys,
+                                     std::vector<std::string>* values) {
+
+  StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET, false);
+  SequenceNumber snapshot;
+  std::vector<MemTable*> to_delete;
+
+  mutex_.Lock();
+  if (options.snapshot != nullptr) {
+    snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
+  } else {
+    snapshot = versions_->LastSequence();
+  }
+
+  MemTable* mem = mem_;
+  MemTableList imm = imm_;
+  Version* current = versions_->current();
+  mem->Ref();
+  imm.RefAll();
+  current->Ref();
+
+  // Unlock while reading from files and memtables
+
+  mutex_.Unlock();
+  bool have_stat_update = false;
+  Version::GetStats stats;
+
+  // Contain a list of merge operations if merge occurs.
+  MergeContext merge_context;
+
+  // Note: this always resizes the values array
+  int numKeys = keys.size();
+  std::vector<Status> statList(numKeys);
+  values->resize(numKeys);
+
+  // Keep track of bytes that we read for statistics-recording later
+  uint64_t bytesRead = 0;
+
+  // For each of the given keys, apply the entire "get" process as follows:
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+  for (int i=0; i<numKeys; ++i) {
+    merge_context.Clear();
+    Status& s = statList[i];
+    std::string* value = &(*values)[i];
+
+    LookupKey lkey(keys[i], snapshot);
+    if (mem->Get(lkey, value, &s, merge_context, options_)) {
+      // Done
+    } else if (imm.Get(lkey, value, &s, merge_context, options_)) {
+      // Done
+    } else {
+      current->Get(options, lkey, value, &s, &merge_context, &stats, options_);
+      have_stat_update = true;
+    }
+
+    if (s.ok()) {
+      bytesRead += value->size();
+    }
+  }
+
+  // Post processing (decrement reference counts and record statistics)
+  mutex_.Lock();
+  if (!options_.disable_seek_compaction &&
+      have_stat_update && current->UpdateStats(stats)) {
+    MaybeScheduleFlushOrCompaction();
+  }
+  MemTable* m = mem->Unref();
+  imm.UnrefAll(&to_delete);
+  current->Unref();
+  mutex_.Unlock();
+
+  // free up all obsolete memtables outside the mutex
+  delete m;
+  for (MemTable* v: to_delete) delete v;
+
+  RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS);
+  RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, numKeys);
+  RecordTick(options_.statistics.get(), NUMBER_MULTIGET_BYTES_READ, bytesRead);
+
+  return statList;
+}
+
+// Cheap existence probe: performs a Get restricted to the block cache
+// (no disk reads). Returns false only when the key definitely does not
+// exist; *value_found is falsified when the key may exist but the value
+// could not be fetched from cache.
+bool DBImpl::KeyMayExist(const ReadOptions& options,
+                         const Slice& key,
+                         std::string* value,
+                         bool* value_found) {
+  if (value_found != nullptr) {
+    // falsify later if key-may-exist but can't fetch value
+    *value_found = true;
+  }
+  ReadOptions roptions = options;
+  roptions.read_tier = kBlockCacheTier; // read from block cache only
+  auto s = GetImpl(roptions, key, value, value_found);
+
+  // If options.block_cache != nullptr and the index block of the table is
+  // not present in block_cache, the return value will be Status::Incomplete.
+  // In this case, key may still exist in the table.
+  return s.ok() || s.IsIncomplete();
+}
+
+// Public iterator: wraps the internal merging iterator in a DBIter bound to
+// the requested (or latest) snapshot, and optionally in a prefix filter.
+Iterator* DBImpl::NewIterator(const ReadOptions& options) {
+  SequenceNumber latest_snapshot;
+  Iterator* iter = NewInternalIterator(options, &latest_snapshot);
+  iter = NewDBIterator(
+             &dbname_, env_, options_, user_comparator(), iter,
+             (options.snapshot != nullptr
+              ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
+              : latest_snapshot));
+  if (options.prefix) {
+    // use extra wrapper to exclude any keys from the results which
+    // don't begin with the prefix
+    iter = new PrefixFilterIterator(iter, *options.prefix,
+                                    options_.prefix_extractor);
+  }
+  return iter;
+}
+
+// Register a snapshot at the current last sequence number (under the mutex).
+const Snapshot* DBImpl::GetSnapshot() {
+  MutexLock l(&mutex_);
+  return snapshots_.New(versions_->LastSequence());
+}
+
+// Unregister a snapshot previously returned by GetSnapshot().
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+  MutexLock l(&mutex_);
+  snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s));
+}
+
+// Convenience methods
+// Put delegates to the DB base-class helper, which builds a WriteBatch.
+Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
+  return DB::Put(o, key, val);
+}
+
+// Merge is only meaningful when the DB was opened with a merge_operator;
+// reject the call up front otherwise instead of failing later on read.
+Status DBImpl::Merge(const WriteOptions& o, const Slice& key,
+                     const Slice& val) {
+  if (!options_.merge_operator) {
+    return Status::NotSupported("Provide a merge_operator when opening DB");
+  } else {
+    return DB::Merge(o, key, val);
+  }
+}
+
+// Delegates to the default DB::Delete, which writes a deletion tombstone
+// through DBImpl::Write.
+Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
+  return DB::Delete(options, key);
+}
+
+Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
+  Writer w(&mutex_);
+  w.batch = my_batch;
+  w.sync = options.sync;
+  w.disableWAL = options.disableWAL;
+  w.done = false;
+
+  StopWatch sw(env_, options_.statistics.get(), DB_WRITE, false);
+  mutex_.Lock();
+  writers_.push_back(&w);
+  while (!w.done && &w != writers_.front()) {
+    w.cv.Wait();
+  }
+
+  if (!options.disableWAL) {
+    RecordTick(options_.statistics.get(), WRITE_WITH_WAL, 1);
+  }
+
+  if (w.done) {
+    mutex_.Unlock();
+    RecordTick(options_.statistics.get(), WRITE_DONE_BY_OTHER, 1);
+    return w.status;
+  } else {
+    RecordTick(options_.statistics.get(), WRITE_DONE_BY_SELF, 1);
+  }
+
+  // May temporarily unlock and wait.
+  SuperVersion* superversion_to_free = nullptr;
+  Status status = MakeRoomForWrite(my_batch == nullptr, &superversion_to_free);
+  uint64_t last_sequence = versions_->LastSequence();
+  Writer* last_writer = &w;
+  if (status.ok() && my_batch != nullptr) {  // nullptr batch is for compactions
+    autovector<WriteBatch*> write_batch_group;
+    BuildBatchGroup(&last_writer, &write_batch_group);
+
+    // Add to log and apply to memtable.  We can release the lock
+    // during this phase since &w is currently responsible for logging
+    // and protects against concurrent loggers and concurrent writes
+    // into mem_.
+    {
+      mutex_.Unlock();
+      WriteBatch* updates = nullptr;
+      if (write_batch_group.size() == 1) {
+        updates = write_batch_group[0];
+      } else {
+        updates = &tmp_batch_;
+        for (size_t i = 0; i < write_batch_group.size(); ++i) {
+          WriteBatchInternal::Append(updates, write_batch_group[i]);
+        }
+      }
+
+      const SequenceNumber current_sequence = last_sequence + 1;
+      WriteBatchInternal::SetSequence(updates, current_sequence);
+      int my_batch_count = WriteBatchInternal::Count(updates);
+      last_sequence += my_batch_count;
+      // Record statistics
+      RecordTick(options_.statistics.get(),
+                 NUMBER_KEYS_WRITTEN, my_batch_count);
+      RecordTick(options_.statistics.get(),
+                 BYTES_WRITTEN,
+                 WriteBatchInternal::ByteSize(updates));
+      if (options.disableWAL) {
+        flush_on_destroy_ = true;
+      }
+
+      if (!options.disableWAL) {
+        StopWatchNano timer(env_);
+        StartPerfTimer(&timer);
+        Slice log_entry = WriteBatchInternal::Contents(updates);
+        status = log_->AddRecord(log_entry);
+        RecordTick(options_.statistics.get(), WAL_FILE_SYNCED, 1);
+        RecordTick(options_.statistics.get(), WAL_FILE_BYTES, log_entry.size());
+        BumpPerfTime(&perf_context.wal_write_time, &timer);
+        if (status.ok() && options.sync) {
+          if (options_.use_fsync) {
+            StopWatch(env_, options_.statistics.get(), WAL_FILE_SYNC_MICROS);
+            status = log_->file()->Fsync();
+          } else {
+            StopWatch(env_, options_.statistics.get(), WAL_FILE_SYNC_MICROS);
+            status = log_->file()->Sync();
+          }
+        }
+      }
+      if (status.ok()) {
+        status = WriteBatchInternal::InsertInto(updates, mem_, &options_, this,
+                                                options_.filter_deletes);
+        if (!status.ok()) {
+          // Panic for in-memory corruptions
+          // Note that existing logic was not sound. Any partial failure writing
+          // into the memtable would result in a state that some write ops might
+          // have succeeded in memtable but Status reports error for all writes.
+          throw std::runtime_error("In memory WriteBatch corruption!");
+        }
+        SetTickerCount(options_.statistics.get(),
+                       SEQUENCE_NUMBER, last_sequence);
+      }
+      if (updates == &tmp_batch_) tmp_batch_.Clear();
+      mutex_.Lock();
+      if (status.ok()) {
+        versions_->SetLastSequence(last_sequence);
+      }
+    }
+  }
+  if (options_.paranoid_checks && !status.ok() && bg_error_.ok()) {
+    bg_error_ = status; // stop compaction & fail any further writes
+  }
+
+  while (true) {
+    Writer* ready = writers_.front();
+    writers_.pop_front();
+    if (ready != &w) {
+      ready->status = status;
+      ready->done = true;
+      ready->cv.Signal();
+    }
+    if (ready == last_writer) break;
+  }
+
+  // Notify new head of write queue
+  if (!writers_.empty()) {
+    writers_.front()->cv.Signal();
+  }
+  mutex_.Unlock();
+  delete superversion_to_free;
+  return status;
+}
+
+// REQUIRES: Writer list must be non-empty
+// REQUIRES: First writer must have a non-nullptr batch
+// Collects the batches of queued writers that are compatible with the
+// leader (front writer) into *write_batch_group, and sets *last_writer to
+// the last writer whose batch was absorbed. A writer is skipped over the
+// group boundary if including it would violate sync/WAL expectations or
+// grow the group past a size cap.
+void DBImpl::BuildBatchGroup(Writer** last_writer,
+                             autovector<WriteBatch*>* write_batch_group) {
+  assert(!writers_.empty());
+  Writer* first = writers_.front();
+  assert(first->batch != nullptr);
+
+  size_t size = WriteBatchInternal::ByteSize(first->batch);
+  write_batch_group->push_back(first->batch);
+
+  // Allow the group to grow up to a maximum size, but if the
+  // original write is small, limit the growth so we do not slow
+  // down the small write too much.
+  size_t max_size = 1 << 20;
+  if (size <= (128<<10)) {
+    max_size = size + (128<<10);
+  }
+
+  *last_writer = first;
+  std::deque<Writer*>::iterator iter = writers_.begin();
+  ++iter;  // Advance past "first"
+  for (; iter != writers_.end(); ++iter) {
+    Writer* w = *iter;
+    if (w->sync && !first->sync) {
+      // Do not include a sync write into a batch handled by a non-sync write.
+      break;
+    }
+
+    if (!w->disableWAL && first->disableWAL) {
+      // Do not include a write that needs WAL into a batch that has
+      // WAL disabled.
+      break;
+    }
+
+    if (w->batch != nullptr) {
+      size += WriteBatchInternal::ByteSize(w->batch);
+      if (size > max_size) {
+        // Do not make batch too big
+        break;
+      }
+
+      write_batch_group->push_back(w->batch);
+    }
+    // Writers with a nullptr batch are still absorbed into the group so
+    // they get woken with the group's status.
+    *last_writer = w;
+  }
+}
+
+// This function computes the amount of time in microseconds by which a write
+// should be delayed based on the number of level-0 files according to the
+// following formula:
+// if n < bottom, return 0;
+// if n >= top, return 1000;
+// otherwise, let r = (n - bottom) /
+//                    (top - bottom)
+//  and return r^2 * 1000.
+// The goal of this formula is to gradually increase the rate at which writes
+// are slowed. We also tried linear delay (r * 1000), but it seemed to do
+// slightly worse. There is no other particular reason for choosing quadratic.
+// The result is always in [0, 1000] microseconds.
+uint64_t DBImpl::SlowdownAmount(int n, int top, int bottom) {
+  uint64_t delay;
+  if (n >= top) {
+    delay = 1000;
+  }
+  else if (n < bottom) {
+    delay = 0;
+  }
+  else {
+    // If we are here, we know that:
+    //   level0_start_slowdown <= n < level0_slowdown
+    // since the previous two conditions are false.
+    float how_much =
+      (float) (n - bottom) /
+              (top - bottom);
+    delay = how_much * how_much * 1000;
+  }
+  assert(delay <= 1000);
+  return delay;
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+// Ensures there is room in the active memtable for a write, possibly
+// stalling the writer (L0 slowdown, memtable-full wait, L0 stop, hard/soft
+// rate limits) or switching to a fresh memtable + WAL. Loops until the
+// write can proceed or a background error is encountered. May temporarily
+// release mutex_ while sleeping or doing file I/O. On a memtable switch,
+// *superversion_to_free receives the SuperVersion the caller must delete
+// after releasing the mutex.
+Status DBImpl::MakeRoomForWrite(bool force,
+                                SuperVersion** superversion_to_free) {
+  mutex_.AssertHeld();
+  assert(!writers_.empty());
+  // Each kind of delay is applied at most once per write, and not at all
+  // for forced (compaction-triggered) room-making.
+  bool allow_delay = !force;
+  bool allow_hard_rate_limit_delay = !force;
+  bool allow_soft_rate_limit_delay = !force;
+  uint64_t rate_limit_delay_millis = 0;
+  Status s;
+  double score;
+  *superversion_to_free = nullptr;
+
+  while (true) {
+    if (!bg_error_.ok()) {
+      // Yield previous error
+      s = bg_error_;
+      break;
+    } else if (allow_delay && versions_->NeedSlowdownForNumLevel0Files()) {
+      // We are getting close to hitting a hard limit on the number of
+      // L0 files.  Rather than delaying a single write by several
+      // seconds when we hit the hard limit, start delaying each
+      // individual write by 0-1ms to reduce latency variance.  Also,
+      // this delay hands over some CPU to the compaction thread in
+      // case it is sharing the same core as the writer.
+      mutex_.Unlock();
+      uint64_t delayed;
+      {
+        StopWatch sw(env_, options_.statistics.get(), STALL_L0_SLOWDOWN_COUNT);
+        env_->SleepForMicroseconds(
+          SlowdownAmount(versions_->current()->NumLevelFiles(0),
+                         options_.level0_slowdown_writes_trigger,
+                         options_.level0_stop_writes_trigger)
+        );
+        delayed = sw.ElapsedMicros();
+      }
+      RecordTick(options_.statistics.get(), STALL_L0_SLOWDOWN_MICROS, delayed);
+      stall_level0_slowdown_ += delayed;
+      stall_level0_slowdown_count_++;
+      allow_delay = false;  // Do not delay a single write more than once
+      mutex_.Lock();
+      delayed_writes_++;
+    } else if (!force &&
+               (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
+      // There is room in current memtable
+      if (allow_delay) {
+        DelayLoggingAndReset();
+      }
+      break;
+    } else if (imm_.size() == options_.max_write_buffer_number - 1) {
+      // We have filled up the current memtable, but the previous
+      // ones are still being compacted, so we wait.
+      DelayLoggingAndReset();
+      Log(options_.info_log, "wait for memtable compaction...\n");
+      uint64_t stall;
+      {
+        StopWatch sw(env_, options_.statistics.get(),
+          STALL_MEMTABLE_COMPACTION_COUNT);
+        bg_cv_.Wait();
+        stall = sw.ElapsedMicros();
+      }
+      RecordTick(options_.statistics.get(),
+                 STALL_MEMTABLE_COMPACTION_MICROS, stall);
+      stall_memtable_compaction_ += stall;
+      stall_memtable_compaction_count_++;
+    } else if (versions_->current()->NumLevelFiles(0) >=
+               options_.level0_stop_writes_trigger) {
+      // There are too many level-0 files.
+      DelayLoggingAndReset();
+      Log(options_.info_log, "wait for fewer level0 files...\n");
+      uint64_t stall;
+      {
+        StopWatch sw(env_, options_.statistics.get(),
+                     STALL_L0_NUM_FILES_COUNT);
+        bg_cv_.Wait();
+        stall = sw.ElapsedMicros();
+      }
+      RecordTick(options_.statistics.get(), STALL_L0_NUM_FILES_MICROS, stall);
+      stall_level0_num_files_ += stall;
+      stall_level0_num_files_count_++;
+    } else if (
+        allow_hard_rate_limit_delay &&
+        options_.hard_rate_limit > 1.0 &&
+        (score = versions_->MaxCompactionScore()) > options_.hard_rate_limit) {
+      // Delay a write when the compaction score for any level is too large.
+      int max_level = versions_->MaxCompactionScoreLevel();
+      mutex_.Unlock();
+      uint64_t delayed;
+      {
+        StopWatch sw(env_, options_.statistics.get(),
+                     HARD_RATE_LIMIT_DELAY_COUNT);
+        env_->SleepForMicroseconds(1000);
+        delayed = sw.ElapsedMicros();
+      }
+      stall_leveln_slowdown_[max_level] += delayed;
+      stall_leveln_slowdown_count_[max_level]++;
+      // Make sure the following value doesn't round to zero.
+      uint64_t rate_limit = std::max((delayed / 1000), (uint64_t) 1);
+      rate_limit_delay_millis += rate_limit;
+      RecordTick(options_.statistics.get(),
+                 RATE_LIMIT_DELAY_MILLIS, rate_limit);
+      if (options_.rate_limit_delay_max_milliseconds > 0 &&
+          rate_limit_delay_millis >=
+          (unsigned)options_.rate_limit_delay_max_milliseconds) {
+        allow_hard_rate_limit_delay = false;
+      }
+      mutex_.Lock();
+    } else if (
+        allow_soft_rate_limit_delay &&
+        options_.soft_rate_limit > 0.0 &&
+        (score = versions_->MaxCompactionScore()) > options_.soft_rate_limit) {
+      // Delay a write when the compaction score for any level is too large.
+      // TODO: add statistics
+      mutex_.Unlock();
+      {
+        StopWatch sw(env_, options_.statistics.get(),
+                     SOFT_RATE_LIMIT_DELAY_COUNT);
+        // NOTE(review): SlowdownAmount takes int parameters, so the double
+        // score and rate limits are truncated here — presumably intentional
+        // coarse granularity, but worth confirming.
+        env_->SleepForMicroseconds(SlowdownAmount(
+          score,
+          options_.soft_rate_limit,
+          options_.hard_rate_limit)
+        );
+        rate_limit_delay_millis += sw.ElapsedMicros();
+      }
+      allow_soft_rate_limit_delay = false;
+      mutex_.Lock();
+
+    } else {
+      unique_ptr<WritableFile> lfile;
+      MemTable* memtmp = nullptr;
+
+      // Attempt to switch to a new memtable and trigger compaction of old.
+      // Do this without holding the dbmutex lock.
+      assert(versions_->PrevLogNumber() == 0);
+      uint64_t new_log_number = versions_->NewFileNumber();
+      SuperVersion* new_superversion = nullptr;
+      mutex_.Unlock();
+      {
+        EnvOptions soptions(storage_options_);
+        soptions.use_mmap_writes = false;
+        DelayLoggingAndReset();
+        s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number),
+                                  &lfile, soptions);
+        if (s.ok()) {
+          // Our final size should be less than write_buffer_size
+          // (compression, etc) but err on the side of caution.
+          lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size);
+          memtmp = new MemTable(internal_comparator_, options_);
+          new_superversion = new SuperVersion(options_.max_write_buffer_number);
+        }
+      }
+      mutex_.Lock();
+      if (!s.ok()) {
+        // Avoid chewing through file number space in a tight loop.
+        versions_->ReuseFileNumber(new_log_number);
+        assert (!memtmp);
+        break;
+      }
+      // Swap in the new WAL and memtable; the old memtable joins the
+      // immutable list to be flushed by a background thread.
+      logfile_number_ = new_log_number;
+      log_.reset(new log::Writer(std::move(lfile)));
+      mem_->SetNextLogNumber(logfile_number_);
+      imm_.Add(mem_);
+      if (force) {
+        imm_.FlushRequested();
+      }
+      mem_ = memtmp;
+      mem_->Ref();
+      Log(options_.info_log,
+          "New memtable created with log file: #%lu\n",
+          (unsigned long)logfile_number_);
+      mem_->SetLogNumber(logfile_number_);
+      force = false;   // Do not force another compaction if have room
+      MaybeScheduleFlushOrCompaction();
+      *superversion_to_free = InstallSuperVersion(new_superversion);
+    }
+  }
+  return s;
+}
+
+// Returns the database path/name this instance was opened with.
+const std::string& DBImpl::GetName() const {
+  return dbname_;
+}
+
+// Returns the Env used by this database instance.
+Env* DBImpl::GetEnv() const {
+  return env_;
+}
+
+// Returns the (sanitized) options this database instance is running with.
+const Options& DBImpl::GetOptions() const {
+  return options_;
+}
+
+// Answers "rocksdb."-prefixed property queries. Supported properties:
+//   num-files-at-level<N>  - file count at level N
+//   levelstats             - per-level file count and size table
+//   stats                  - full compaction/write/WAL/stall report; also
+//                            updates last_stats_ so the next call can show
+//                            interval (since-last-call) figures
+//   sstables               - DebugString of the current version
+//   num-immutable-mem-table- number of immutable memtables
+// Returns false for unrecognized properties. Holds the DB mutex throughout.
+bool DBImpl::GetProperty(const Slice& property, std::string* value) {
+  value->clear();
+
+  MutexLock l(&mutex_);
+  Version* current = versions_->current();
+  Slice in = property;
+  Slice prefix("rocksdb.");
+  if (!in.starts_with(prefix)) return false;
+  in.remove_prefix(prefix.size());
+
+  if (in.starts_with("num-files-at-level")) {
+    in.remove_prefix(strlen("num-files-at-level"));
+    uint64_t level;
+    bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
+    if (!ok || (int)level >= NumberLevels()) {
+      return false;
+    } else {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%d",
+               current->NumLevelFiles(static_cast<int>(level)));
+      *value = buf;
+      return true;
+    }
+  } else if (in == "levelstats") {
+    char buf[1000];
+    snprintf(buf, sizeof(buf),
+             "Level Files Size(MB)\n"
+             "--------------------\n");
+    value->append(buf);
+
+    for (int level = 0; level < NumberLevels(); level++) {
+      snprintf(buf, sizeof(buf),
+               "%3d %8d %8.0f\n",
+               level,
+               current->NumLevelFiles(level),
+               current->NumLevelBytes(level) / 1048576.0);
+      value->append(buf);
+    }
+    return true;
+
+  } else if (in == "stats") {
+    char buf[1000];
+
+    uint64_t wal_bytes = 0;
+    uint64_t wal_synced = 0;
+    uint64_t user_bytes_written = 0;
+    uint64_t write_other = 0;
+    uint64_t write_self = 0;
+    uint64_t write_with_wal = 0;
+    uint64_t total_bytes_written = 0;
+    uint64_t total_bytes_read = 0;
+    uint64_t micros_up = env_->NowMicros() - started_at_;
+    // Add "+1" to make sure seconds_up is > 0 and avoid NaN later
+    double seconds_up = (micros_up + 1) / 1000000.0;
+    uint64_t total_slowdown = 0;
+    uint64_t total_slowdown_count = 0;
+    uint64_t interval_bytes_written = 0;
+    uint64_t interval_bytes_read = 0;
+    uint64_t interval_bytes_new = 0;
+    double   interval_seconds_up = 0;
+
+    // Ticker counters are only available when statistics were enabled;
+    // otherwise the report shows zeros for these.
+    Statistics* s = options_.statistics.get();
+    if (s) {
+      wal_bytes = s->getTickerCount(WAL_FILE_BYTES);
+      wal_synced = s->getTickerCount(WAL_FILE_SYNCED);
+      user_bytes_written = s->getTickerCount(BYTES_WRITTEN);
+      write_other = s->getTickerCount(WRITE_DONE_BY_OTHER);
+      write_self = s->getTickerCount(WRITE_DONE_BY_SELF);
+      write_with_wal = s->getTickerCount(WRITE_WITH_WAL);
+    }
+
+    // Pardon the long line but I think it is easier to read this way.
+    snprintf(buf, sizeof(buf),
+             "                               Compactions\n"
+             "Level  Files Size(MB) Score Time(sec)  Read(MB) Write(MB)    Rn(MB)  Rnp1(MB)  Wnew(MB) RW-Amplify Read(MB/s) Write(MB/s)      Rn     Rnp1     Wnp1     NewW    Count  Ln-stall Stall-cnt\n"
+             "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n"
+             );
+    value->append(buf);
+    // One row per level that has files or has seen compaction work.
+    for (int level = 0; level < current->NumberLevels(); level++) {
+      int files = current->NumLevelFiles(level);
+      if (stats_[level].micros > 0 || files > 0) {
+        int64_t bytes_read = stats_[level].bytes_readn +
+                             stats_[level].bytes_readnp1;
+        int64_t bytes_new = stats_[level].bytes_written -
+                            stats_[level].bytes_readnp1;
+        double amplify = (stats_[level].bytes_readn == 0)
+            ? 0.0
+            : (stats_[level].bytes_written +
+               stats_[level].bytes_readnp1 +
+               stats_[level].bytes_readn) /
+                (double) stats_[level].bytes_readn;
+
+        total_bytes_read += bytes_read;
+        total_bytes_written += stats_[level].bytes_written;
+
+        snprintf(
+            buf, sizeof(buf),
+            "%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f %10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %9.1f %9lu\n",
+            level,
+            files,
+            current->NumLevelBytes(level) / 1048576.0,
+            current->NumLevelBytes(level) /
+                versions_->MaxBytesForLevel(level),
+            stats_[level].micros / 1e6,
+            bytes_read / 1048576.0,
+            stats_[level].bytes_written / 1048576.0,
+            stats_[level].bytes_readn / 1048576.0,
+            stats_[level].bytes_readnp1 / 1048576.0,
+            bytes_new / 1048576.0,
+            amplify,
+            // +1 to avoid division by 0
+            (bytes_read / 1048576.0) / ((stats_[level].micros+1) / 1000000.0),
+            (stats_[level].bytes_written / 1048576.0) /
+                ((stats_[level].micros+1) / 1000000.0),
+            stats_[level].files_in_leveln,
+            stats_[level].files_in_levelnp1,
+            stats_[level].files_out_levelnp1,
+            stats_[level].files_out_levelnp1 - stats_[level].files_in_levelnp1,
+            stats_[level].count,
+            stall_leveln_slowdown_[level] / 1000000.0,
+            (unsigned long) stall_leveln_slowdown_count_[level]);
+        total_slowdown += stall_leveln_slowdown_[level];
+        total_slowdown_count += stall_leveln_slowdown_count_[level];
+        value->append(buf);
+      }
+    }
+
+    // Interval figures are deltas since the previous "stats" query.
+    interval_bytes_new = user_bytes_written - last_stats_.ingest_bytes_;
+    interval_bytes_read = total_bytes_read - last_stats_.compaction_bytes_read_;
+    interval_bytes_written =
+        total_bytes_written - last_stats_.compaction_bytes_written_;
+    interval_seconds_up = seconds_up - last_stats_.seconds_up_;
+
+    snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n",
+             seconds_up, interval_seconds_up);
+    value->append(buf);
+
+    snprintf(buf, sizeof(buf),
+             "Writes cumulative: %llu total, %llu batches, "
+             "%.1f per batch, %.2f ingest GB\n",
+             (unsigned long long) (write_other + write_self),
+             (unsigned long long) write_self,
+             (write_other + write_self) / (double) (write_self + 1),
+             user_bytes_written / (1048576.0 * 1024));
+    value->append(buf);
+
+    snprintf(buf, sizeof(buf),
+             "WAL cumulative: %llu WAL writes, %llu WAL syncs, "
+             "%.2f writes per sync, %.2f GB written\n",
+             (unsigned long long) write_with_wal,
+             (unsigned long long ) wal_synced,
+             write_with_wal / (double) (wal_synced + 1),
+             wal_bytes / (1048576.0 * 1024));
+    value->append(buf);
+
+    snprintf(buf, sizeof(buf),
+             "Compaction IO cumulative (GB): "
+             "%.2f new, %.2f read, %.2f write, %.2f read+write\n",
+             user_bytes_written / (1048576.0 * 1024),
+             total_bytes_read / (1048576.0 * 1024),
+             total_bytes_written / (1048576.0 * 1024),
+             (total_bytes_read + total_bytes_written) / (1048576.0 * 1024));
+    value->append(buf);
+
+    snprintf(buf, sizeof(buf),
+             "Compaction IO cumulative (MB/sec): "
+             "%.1f new, %.1f read, %.1f write, %.1f read+write\n",
+             user_bytes_written / 1048576.0 / seconds_up,
+             total_bytes_read / 1048576.0 / seconds_up,
+             total_bytes_written / 1048576.0 / seconds_up,
+             (total_bytes_read + total_bytes_written) / 1048576.0 / seconds_up);
+    value->append(buf);
+
+    // +1 to avoid divide by 0 and NaN
+    snprintf(buf, sizeof(buf),
+             "Amplification cumulative: %.1f write, %.1f compaction\n",
+             (double) (total_bytes_written + wal_bytes)
+                 / (user_bytes_written + 1),
+             (double) (total_bytes_written + total_bytes_read + wal_bytes)
+                 / (user_bytes_written + 1));
+    value->append(buf);
+
+    uint64_t interval_write_other = write_other - last_stats_.write_other_;
+    uint64_t interval_write_self = write_self - last_stats_.write_self_;
+
+    snprintf(buf, sizeof(buf),
+             "Writes interval: %llu total, %llu batches, "
+             "%.1f per batch, %.1f ingest MB\n",
+             (unsigned long long) (interval_write_other + interval_write_self),
+             (unsigned long long) interval_write_self,
+             (double) (interval_write_other + interval_write_self)
+                 / (interval_write_self + 1),
+             (user_bytes_written - last_stats_.ingest_bytes_) /  1048576.0);
+    value->append(buf);
+
+    uint64_t interval_write_with_wal =
+        write_with_wal - last_stats_.write_with_wal_;
+
+    uint64_t interval_wal_synced = wal_synced - last_stats_.wal_synced_;
+    uint64_t interval_wal_bytes = wal_bytes - last_stats_.wal_bytes_;
+
+    snprintf(buf, sizeof(buf),
+             "WAL interval: %llu WAL writes, %llu WAL syncs, "
+             "%.2f writes per sync, %.2f MB written\n",
+             (unsigned long long) interval_write_with_wal,
+             (unsigned long long ) interval_wal_synced,
+             interval_write_with_wal / (double) (interval_wal_synced + 1),
+             interval_wal_bytes / (1048576.0 * 1024));
+    value->append(buf);
+
+    snprintf(buf, sizeof(buf),
+             "Compaction IO interval (MB): "
+             "%.2f new, %.2f read, %.2f write, %.2f read+write\n",
+             interval_bytes_new / 1048576.0,
+             interval_bytes_read/ 1048576.0,
+             interval_bytes_written / 1048576.0,
+             (interval_bytes_read + interval_bytes_written) / 1048576.0);
+    value->append(buf);
+
+    snprintf(buf, sizeof(buf),
+             "Compaction IO interval (MB/sec): "
+             "%.1f new, %.1f read, %.1f write, %.1f read+write\n",
+             interval_bytes_new / 1048576.0 / interval_seconds_up,
+             interval_bytes_read / 1048576.0 / interval_seconds_up,
+             interval_bytes_written / 1048576.0 / interval_seconds_up,
+             (interval_bytes_read + interval_bytes_written)
+                 / 1048576.0 / interval_seconds_up);
+    value->append(buf);
+
+    // +1 to avoid divide by 0 and NaN
+    snprintf(buf, sizeof(buf),
+             "Amplification interval: %.1f write, %.1f compaction\n",
+             (double) (interval_bytes_written + wal_bytes)
+                 / (interval_bytes_new + 1),
+             (double) (interval_bytes_written + interval_bytes_read + wal_bytes)
+                 / (interval_bytes_new + 1));
+    value->append(buf);
+
+    snprintf(buf, sizeof(buf),
+            "Stalls(secs): %.3f level0_slowdown, %.3f level0_numfiles, "
+            "%.3f memtable_compaction, %.3f leveln_slowdown\n",
+            stall_level0_slowdown_ / 1000000.0,
+            stall_level0_num_files_ / 1000000.0,
+            stall_memtable_compaction_ / 1000000.0,
+            total_slowdown / 1000000.0);
+    value->append(buf);
+
+    snprintf(buf, sizeof(buf),
+            "Stalls(count): %lu level0_slowdown, %lu level0_numfiles, "
+            "%lu memtable_compaction, %lu leveln_slowdown\n",
+            (unsigned long) stall_level0_slowdown_count_,
+            (unsigned long) stall_level0_num_files_count_,
+            (unsigned long) stall_memtable_compaction_count_,
+            (unsigned long) total_slowdown_count);
+    value->append(buf);
+
+    // Remember the cumulative values so the next "stats" query can report
+    // per-interval deltas.
+    last_stats_.compaction_bytes_read_ = total_bytes_read;
+    last_stats_.compaction_bytes_written_ = total_bytes_written;
+    last_stats_.ingest_bytes_ = user_bytes_written;
+    last_stats_.seconds_up_ = seconds_up;
+    last_stats_.wal_bytes_ = wal_bytes;
+    last_stats_.wal_synced_ = wal_synced;
+    last_stats_.write_with_wal_ = write_with_wal;
+    last_stats_.write_other_ = write_other;
+    last_stats_.write_self_ = write_self;
+
+    return true;
+  } else if (in == "sstables") {
+    *value = versions_->current()->DebugString();
+    return true;
+  } else if (in == "num-immutable-mem-table") {
+    *value = std::to_string(imm_.size());
+    return true;
+  }
+
+  return false;
+}
+
+// Estimates, for each of the n ranges, the on-disk size between start and
+// limit using the current version's file offsets. The version is Ref'd
+// under the mutex so it stays alive while the (lock-free) offset lookups
+// run, and Unref'd under the mutex afterwards.
+void DBImpl::GetApproximateSizes(
+    const Range* range, int n,
+    uint64_t* sizes) {
+  // TODO(opt): better implementation
+  Version* v;
+  {
+    MutexLock l(&mutex_);
+    versions_->current()->Ref();
+    v = versions_->current();
+  }
+
+  for (int i = 0; i < n; i++) {
+    // Convert user_key into a corresponding internal key.
+    InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+    uint64_t start = versions_->ApproximateOffsetOf(v, k1);
+    uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
+    // An inverted range (limit < start) yields 0 rather than underflowing.
+    sizes[i] = (limit >= start ? limit - start : 0);
+  }
+
+  {
+    MutexLock l(&mutex_);
+    v->Unref();
+  }
+}
+
+// Logs the number of writes delayed since the last report (if any) and
+// resets the counter, so repeated slowdowns produce one aggregate log line
+// instead of one per write.
+inline void DBImpl::DelayLoggingAndReset() {
+  if (delayed_writes_ > 0) {
+    Log(options_.info_log, "delayed %d write...\n", delayed_writes_ );
+    delayed_writes_ = 0;
+  }
+}
+
+// Deletes a single file from the DB by name. Only archived WAL files and
+// SST files are eligible; an SST file may only be deleted if it is not
+// being compacted and sits in the last non-empty level (so no deletion
+// tombstones it may cover can resurface older values).
+Status DBImpl::DeleteFile(std::string name) {
+  uint64_t number;
+  FileType type;
+  WalFileType log_type;
+  if (!ParseFileName(name, &number, &type, &log_type) ||
+      (type != kTableFile && type != kLogFile)) {
+    Log(options_.info_log, "DeleteFile %s failed.\n", name.c_str());
+    return Status::InvalidArgument("Invalid file name");
+  }
+
+  Status status;
+  if (type == kLogFile) {
+    // Only allow deleting archived log files
+    if (log_type != kArchivedLogFile) {
+      Log(options_.info_log, "DeleteFile %s failed.\n", name.c_str());
+      return Status::NotSupported("Delete only supported for archived logs");
+    }
+    status = env_->DeleteFile(options_.wal_dir + "/" + name.c_str());
+    if (!status.ok()) {
+      Log(options_.info_log, "DeleteFile %s failed.\n", name.c_str());
+    }
+    return status;
+  }
+
+  // SST file path: remove it from the manifest via a VersionEdit, then
+  // physically delete it outside the lock.
+  int level;
+  FileMetaData metadata;
+  int maxlevel = NumberLevels();
+  VersionEdit edit;
+  DeletionState deletion_state(0, true);
+  {
+    MutexLock l(&mutex_);
+    status = versions_->GetMetadataForFile(number, &level, &metadata);
+    if (!status.ok()) {
+      Log(options_.info_log, "DeleteFile %s failed. File not found\n",
+                             name.c_str());
+      return Status::InvalidArgument("File not found");
+    }
+    // NOTE(review): this asserts the file is never at level 0 — presumably
+    // external deletion of L0 files is intentionally unsupported; confirm.
+    assert((level > 0) && (level < maxlevel));
+
+    // If the file is being compacted no need to delete.
+    if (metadata.being_compacted) {
+      Log(options_.info_log,
+          "DeleteFile %s Skipped. File about to be compacted\n", name.c_str());
+      return Status::OK();
+    }
+
+    // Only the files in the last level can be deleted externally.
+    // This is to make sure that any deletion tombstones are not
+    // lost. Check that the level passed is the last level.
+    for (int i = level + 1; i < maxlevel; i++) {
+      if (versions_->current()->NumLevelFiles(i) != 0) {
+        Log(options_.info_log,
+            "DeleteFile %s FAILED. File not in last level\n", name.c_str());
+        return Status::InvalidArgument("File not in last level");
+      }
+    }
+    edit.DeleteFile(level, number);
+    status = versions_->LogAndApply(&edit, &mutex_);
+    if (status.ok()) {
+      InstallSuperVersion(deletion_state);
+    }
+    FindObsoleteFiles(deletion_state, false);
+  } // lock released here
+  LogFlush(options_.info_log);
+  // remove files outside the db-lock
+  PurgeObsoleteFiles(deletion_state);
+  return status;
+}
+
+// Fills *metadata with metadata for all live SST files; delegates to the
+// version set under the DB mutex.
+void DBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData> *metadata) {
+  MutexLock l(&mutex_);
+  return versions_->GetLiveFilesMetaData(metadata);
+}
+
+Status DBImpl::GetDbIdentity(std::string& identity) {
+  std::string idfilename = IdentityFileName(dbname_);
+  unique_ptr<SequentialFile> idfile;
+  const EnvOptions soptions;
+  Status s = env_->NewSequentialFile(idfilename, &idfile, soptions);
+  if (!s.ok()) {
+    return s;
+  }
+  uint64_t file_size;
+  s = env_->GetFileSize(idfilename, &file_size);
+  if (!s.ok()) {
+    return s;
+  }
+  char buffer[file_size];
+  Slice id;
+  s = idfile->Read(file_size, &id, buffer);
+  if (!s.ok()) {
+    return s;
+  }
+  identity.assign(id.ToString());
+  // If last character is '\n' remove it from identity
+  if (identity.size() > 0 && identity.back() == '\n') {
+    identity.pop_back();
+  }
+  return s;
+}
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
+  // Pre-allocate size of write batch conservatively.
+  // 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
+  // and we allocate 11 extra bytes for key length, as well as value length.
+  WriteBatch batch(key.size() + value.size() + 24);
+  batch.Put(key, value);
+  return Write(opt, &batch);
+}
+
+// Default Delete: wrap the key in a single-entry deletion batch and route
+// it through the virtual Write().
+Status DB::Delete(const WriteOptions& opt, const Slice& key) {
+  WriteBatch batch;
+  batch.Delete(key);
+  return Write(opt, &batch);
+}
+
+// Default Merge: wrap the operands in a single-entry merge batch and route
+// it through the virtual Write().
+Status DB::Merge(const WriteOptions& opt, const Slice& key,
+                 const Slice& value) {
+  WriteBatch batch;
+  batch.Merge(key, value);
+  return Write(opt, &batch);
+}
+
+DB::~DB() { }
+
+// Opens (and, depending on 'options', creates) the database at 'dbname'.
+// On success *dbptr owns the new DBImpl; on failure *dbptr stays nullptr
+// and the partially constructed instance is deleted.
+Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
+  *dbptr = nullptr;
+  EnvOptions soptions;
+
+  // Supplying an explicit block cache while also requesting no block
+  // cache is contradictory.
+  if (options.block_cache != nullptr && options.no_block_cache) {
+    return Status::InvalidArgument(
+        "no_block_cache is true while block_cache is not nullptr");
+  }
+
+  DBImpl* impl = new DBImpl(options, dbname);
+  // The WAL directory must exist before the first log file is created.
+  Status s = impl->env_->CreateDirIfMissing(impl->options_.wal_dir);
+  if (!s.ok()) {
+    delete impl;
+    return s;
+  }
+
+  s = impl->CreateArchivalDirectory();
+  if (!s.ok()) {
+    delete impl;
+    return s;
+  }
+  impl->mutex_.Lock();
+  VersionEdit edit;
+  s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists
+  if (s.ok()) {
+    // Start a fresh WAL for this session and record its number in the
+    // manifest through 'edit'.
+    uint64_t new_log_number = impl->versions_->NewFileNumber();
+    unique_ptr<WritableFile> lfile;
+    soptions.use_mmap_writes = false;
+    s = impl->options_.env->NewWritableFile(
+      LogFileName(impl->options_.wal_dir, new_log_number),
+      &lfile,
+      soptions
+    );
+    if (s.ok()) {
+      // Preallocate slightly more than one write buffer so the growing
+      // log triggers fewer filesystem metadata updates.
+      lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size);
+      edit.SetLogNumber(new_log_number);
+      impl->logfile_number_ = new_log_number;
+      impl->log_.reset(new log::Writer(std::move(lfile)));
+      s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
+    }
+    if (s.ok()) {
+      // InstallSuperVersion returns the replaced SuperVersion (if any),
+      // which is safe to delete immediately.
+      delete impl->InstallSuperVersion(new DBImpl::SuperVersion());
+      impl->mem_->SetLogNumber(impl->logfile_number_);
+      impl->DeleteObsoleteFiles();
+      impl->MaybeScheduleFlushOrCompaction();
+      impl->MaybeScheduleLogDBDeployStats();
+    }
+  }
+
+  // Universal compaction expects all data in level 0; refuse to open a
+  // database that still has files in higher levels.
+  if (s.ok() && impl->options_.compaction_style == kCompactionStyleUniversal) {
+    Version* current = impl->versions_->current();
+    for (int i = 1; i < impl->NumberLevels(); i++) {
+      int num_files = current->NumLevelFiles(i);
+      if (num_files > 0) {
+        s = Status::InvalidArgument("Not all files are at level 0. Cannot "
+          "open with universal compaction style.");
+        break;
+      }
+    }
+  }
+
+  impl->mutex_.Unlock();
+
+  if (s.ok()) {
+    *dbptr = impl;
+  } else {
+    delete impl;
+  }
+  return s;
+}
+
+// Snapshot's destructor has no work to do; defined out-of-line here.
+Snapshot::~Snapshot() {
+}
+
+// Destroys the contents of the database at 'dbname': SST, log, manifest
+// and archived WAL files, plus nested meta-databases.  Best effort --
+// the first deletion failure is remembered and returned, but the sweep
+// continues through the remaining files.
+Status DestroyDB(const std::string& dbname, const Options& options) {
+  const InternalKeyComparator comparator(options.comparator);
+  const InternalFilterPolicy filter_policy(options.filter_policy);
+  const Options& soptions(SanitizeOptions(
+    dbname, &comparator, &filter_policy, options));
+  Env* env = soptions.env;
+  std::vector<std::string> filenames;
+  std::vector<std::string> archiveFiles;
+
+  std::string archivedir = ArchivalDirectory(dbname);
+  // Ignore error in case directory does not exist
+  env->GetChildren(dbname, &filenames);
+
+  // WAL files may live in a separate directory; include them in the
+  // sweep and archive relative to that directory instead.
+  if (dbname != soptions.wal_dir) {
+    std::vector<std::string> logfilenames;
+    env->GetChildren(soptions.wal_dir, &logfilenames);
+    filenames.insert(filenames.end(), logfilenames.begin(), logfilenames.end());
+    archivedir = ArchivalDirectory(soptions.wal_dir);
+  }
+
+  if (filenames.empty()) {
+    return Status::OK();
+  }
+
+  FileLock* lock;
+  const std::string lockname = LockFileName(dbname);
+  Status result = env->LockFile(lockname, &lock);
+  if (result.ok()) {
+    uint64_t number;
+    FileType type;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type) &&
+          type != kDBLockFile) {  // Lock file will be deleted at end
+        Status del;
+        if (type == kMetaDatabase) {
+          // Meta-databases are whole databases nested under this one.
+          del = DestroyDB(dbname + "/" + filenames[i], options);
+        } else if (type == kLogFile) {
+          del = env->DeleteFile(soptions.wal_dir + "/" + filenames[i]);
+        } else {
+          del = env->DeleteFile(dbname + "/" + filenames[i]);
+        }
+        // Remember only the first failure; keep deleting the rest.
+        if (result.ok() && !del.ok()) {
+          result = del;
+        }
+      }
+    }
+
+    env->GetChildren(archivedir, &archiveFiles);
+    // Delete archival files.
+    for (size_t i = 0; i < archiveFiles.size(); ++i) {
+      if (ParseFileName(archiveFiles[i], &number, &type) &&
+          type == kLogFile) {
+        Status del = env->DeleteFile(archivedir + "/" + archiveFiles[i]);
+        if (result.ok() && !del.ok()) {
+          result = del;
+        }
+      }
+    }
+    // ignore case where no archival directory is present.
+    env->DeleteDir(archivedir);
+
+    env->UnlockFile(lock);  // Ignore error since state is already gone
+    env->DeleteFile(lockname);
+    env->DeleteDir(dbname);  // Ignore error in case dir contains other files
+    env->DeleteDir(soptions.wal_dir);
+  }
+  return result;
+}
+
+// A global method that dumps the build version (git SHA and compile
+// time/date) to the given informational log.
+void dumpLeveldbBuildVersion(Logger * log) {
+  Log(log, "Git sha %s", rocksdb_build_git_sha);
+  Log(log, "Compile time %s %s",
+      rocksdb_build_compile_time, rocksdb_build_compile_date);
+}
+
+}  // namespace rocksdb
diff --git a/db/db_impl.h b/db/db_impl.h
new file mode 100644 (file)
index 0000000..214affa
--- /dev/null
@@ -0,0 +1,605 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include <atomic>
+#include <deque>
+#include <set>
+#include <vector>
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "db/version_edit.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "port/port.h"
+#include "util/stats_logger.h"
+#include "memtablelist.h"
+#include "util/autovector.h"
+
+namespace rocksdb {
+
+class MemTable;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+class DBImpl : public DB {
+ public:
+  DBImpl(const Options& options, const std::string& dbname);
+  virtual ~DBImpl();
+
+  // Implementations of the DB interface
+  virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
+  virtual Status Merge(const WriteOptions&, const Slice& key,
+                       const Slice& value);
+  virtual Status Delete(const WriteOptions&, const Slice& key);
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates);
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key,
+                     std::string* value);
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values);
+
+  // Returns false if key doesn't exist in the database and true if it may.
+  // If value_found is not passed in as null, then return the value if found
+  // in memory. On return, if value was found, then value_found will be set
+  // to true, otherwise false.
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           const Slice& key,
+                           std::string* value,
+                           bool* value_found = nullptr);
+  virtual Iterator* NewIterator(const ReadOptions&);
+  virtual const Snapshot* GetSnapshot();
+  virtual void ReleaseSnapshot(const Snapshot* snapshot);
+  virtual bool GetProperty(const Slice& property, std::string* value);
+  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
+  virtual void CompactRange(const Slice* begin, const Slice* end,
+                            bool reduce_level = false, int target_level = -1);
+  virtual int NumberLevels();
+  virtual int MaxMemCompactionLevel();
+  virtual int Level0StopWriteTrigger();
+  virtual const std::string& GetName() const;
+  virtual Env* GetEnv() const;
+  virtual const Options& GetOptions() const;
+  virtual Status Flush(const FlushOptions& options);
+  virtual Status DisableFileDeletions();
+  virtual Status EnableFileDeletions(bool force);
+  // All the returned filenames start with "/"
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true);
+  virtual Status GetSortedWalFiles(VectorLogPtr& files);
+  virtual SequenceNumber GetLatestSequenceNumber() const;
+  virtual Status GetUpdatesSince(SequenceNumber seq_number,
+                                 unique_ptr<TransactionLogIterator>* iter);
+  virtual Status DeleteFile(std::string name);
+
+  virtual void GetLiveFilesMetaData(
+    std::vector<LiveFileMetaData> *metadata);
+
+  virtual Status GetDbIdentity(std::string& identity);
+
+  void RunManualCompaction(int input_level,
+                           int output_level,
+                           const Slice* begin,
+                           const Slice* end);
+
+  // Extra methods (for testing) that are not in the public DB interface
+
+  // Compact any files in the named level that overlap [*begin, *end]
+  void TEST_CompactRange(int level,
+                         const Slice* begin,
+                         const Slice* end);
+
+  // Force current memtable contents to be flushed.
+  Status TEST_FlushMemTable();
+
+  // Wait for memtable compaction
+  Status TEST_WaitForFlushMemTable();
+
+  // Wait for any compaction
+  Status TEST_WaitForCompact();
+
+  // Return an internal iterator over the current state of the database.
+  // The keys of this iterator are internal keys (see format.h).
+  // The returned iterator should be deleted when no longer needed.
+  Iterator* TEST_NewInternalIterator();
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  int64_t TEST_MaxNextLevelOverlappingBytes();
+
+  // Simulate a db crash, no elegant closing of database.
+  void TEST_Destroy_DBImpl();
+
+  // Return the current manifest file no.
+  uint64_t TEST_Current_Manifest_FileNo();
+
+  // Triggers a background call for testing.
+  void TEST_PurgeObsoleteteWAL();
+
+  // get total level0 file size. Only for testing.
+  uint64_t TEST_GetLevel0TotalSize();
+
+  // Override the interval used when checking for obsolete WAL files.
+  // Only for testing.
+  void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL)
+  {
+    default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
+  }
+
+  // holds references to memtable, all immutable memtables and version
+  struct SuperVersion {
+    MemTable* mem;
+    MemTableList imm;
+    Version* current;
+    std::atomic<uint32_t> refs;
+    // We need to_delete because during Cleanup(), imm.UnrefAll() returns
+    // all memtables that we need to free through this vector. We then
+    // delete all those memtables outside of mutex, during destruction
+    std::vector<MemTable*> to_delete;
+
+    // should be called outside the mutex
+    explicit SuperVersion(const int num_memtables = 0);
+    ~SuperVersion();
+    SuperVersion* Ref();
+    // Returns true if this was the last reference and caller should
+    // call Cleanup() and delete the object
+    bool Unref();
+
+    // call these two methods with db mutex held
+    // Cleanup unrefs mem, imm and current. Also, it stores all memtables
+    // that needs to be deleted in to_delete vector. Unrefing those
+    // objects needs to be done in the mutex
+    void Cleanup();
+    void Init(MemTable* new_mem, const MemTableList& new_imm,
+              Version* new_current);
+  };
+
+  // needed for CleanupIteratorState
+  struct DeletionState {
+    inline bool HaveSomethingToDelete() const {
+      return  all_files.size() ||
+        sst_delete_files.size() ||
+        log_delete_files.size();
+    }
+
+    // a list of all files that we'll consider deleting
+    // (every once in a while this is filled up with all files
+    // in the DB directory)
+    std::vector<std::string> all_files;
+
+    // the list of all live sst files that cannot be deleted
+    std::vector<uint64_t> sst_live;
+
+    // a list of sst files that we need to delete
+    std::vector<FileMetaData*> sst_delete_files;
+
+    // a list of log files that we need to delete
+    std::vector<uint64_t> log_delete_files;
+
+    // a list of memtables to be free
+    std::vector<MemTable *> memtables_to_free;
+
+    SuperVersion* superversion_to_free; // if nullptr nothing to free
+
+    SuperVersion* new_superversion; // if nullptr no new superversion
+
+    // the current manifest_file_number, log_number and prev_log_number
+    // that corresponds to the set of files in 'live'.
+    uint64_t manifest_file_number, log_number, prev_log_number;
+
+    explicit DeletionState(const int num_memtables = 0,
+                           bool create_superversion = false) {
+      manifest_file_number = 0;
+      log_number = 0;
+      prev_log_number = 0;
+      memtables_to_free.reserve(num_memtables);
+      superversion_to_free = nullptr;
+      new_superversion =
+          create_superversion ? new SuperVersion(num_memtables) : nullptr;
+    }
+
+    ~DeletionState() {
+      // free pending memtables
+      for (auto m : memtables_to_free) {
+        delete m;
+      }
+      // free superversion. if nullptr, this will be noop
+      delete superversion_to_free;
+      // if new_superversion was not used, it will be non-nullptr and needs
+      // to be freed here
+      delete new_superversion;
+    }
+  };
+
+  // Returns the list of live files in 'live' and the list
+  // of all files in the filesystem in 'all_files'.
+  // If force == false and the last call was less than
+  // options_.delete_obsolete_files_period_micros microseconds ago,
+  // it will not fill up the deletion_state
+  void FindObsoleteFiles(DeletionState& deletion_state,
+                         bool force,
+                         bool no_full_scan = false);
+
+  // Diffs the files listed in filenames and those that do not
+  // belong to live files are possibly removed. Also, removes all the
+  // files in sst_delete_files and log_delete_files.
+  // It is not necessary to hold the mutex when invoking this method.
+  void PurgeObsoleteFiles(DeletionState& deletion_state);
+
+ protected:
+  Env* const env_;
+  const std::string dbname_;
+  unique_ptr<VersionSet> versions_;
+  const InternalKeyComparator internal_comparator_;
+  const Options options_;  // options_.comparator == &internal_comparator_
+
+  const Comparator* user_comparator() const {
+    return internal_comparator_.user_comparator();
+  }
+
+  MemTable* GetMemTable() {
+    return mem_;
+  }
+
+  Iterator* NewInternalIterator(const ReadOptions&,
+                                SequenceNumber* latest_snapshot);
+
+ private:
+  friend class DB;
+  struct CompactionState;
+  struct Writer;
+
+  Status NewDB();
+
+  // Recover the descriptor from persistent storage.  May do a significant
+  // amount of work to recover recently logged updates.  Any changes to
+  // be made to the descriptor are added to *edit.
+  Status Recover(VersionEdit* edit, MemTable* external_table = nullptr,
+      bool error_if_log_file_exist = false);
+
+  void MaybeIgnoreError(Status* s) const;
+
+  const Status CreateArchivalDirectory();
+
+  // Delete any unneeded files and stale in-memory entries.
+  void DeleteObsoleteFiles();
+
+  // Flush the in-memory write buffer to storage.  Switches to a new
+  // log-file/memtable and writes a new descriptor iff successful.
+  Status FlushMemTableToOutputFile(bool* madeProgress,
+                                   DeletionState& deletion_state);
+
+  Status RecoverLogFile(uint64_t log_number,
+                        VersionEdit* edit,
+                        SequenceNumber* max_sequence,
+                        MemTable* external_table);
+
+  // The following two methods are used to flush a memtable to
+  // storage. The first one is used at database recovery time (when the
+  // database is opened) and is heavyweight because it holds the mutex
+  // for the entire period. The second method WriteLevel0Table supports
+  // concurrent flush memtables to storage.
+  Status WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit);
+  Status WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
+                                uint64_t* filenumber);
+
+  uint64_t SlowdownAmount(int n, int top, int bottom);
+  // MakeRoomForWrite will return superversion_to_free through an argument,
+  // which the caller needs to delete. We do it because caller can delete
+  // the superversion outside of mutex
+  Status MakeRoomForWrite(bool force /* compact even if there is room? */,
+                          SuperVersion** superversion_to_free);
+  void BuildBatchGroup(Writer** last_writer,
+                       autovector<WriteBatch*>* write_batch_group);
+
+  // Force current memtable contents to be flushed.
+  Status FlushMemTable(const FlushOptions& options);
+
+  // Wait for memtable flushed
+  Status WaitForFlushMemTable();
+
+  void MaybeScheduleLogDBDeployStats();
+  static void BGLogDBDeployStats(void* db);
+  void LogDBDeployStats();
+
+  void MaybeScheduleFlushOrCompaction();
+  static void BGWorkCompaction(void* db);
+  static void BGWorkFlush(void* db);
+  void BackgroundCallCompaction();
+  void BackgroundCallFlush();
+  Status BackgroundCompaction(bool* madeProgress,DeletionState& deletion_state);
+  Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state);
+  void CleanupCompaction(CompactionState* compact, Status status);
+  Status DoCompactionWork(CompactionState* compact,
+                          DeletionState& deletion_state);
+
+  Status OpenCompactionOutputFile(CompactionState* compact);
+  Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
+  Status InstallCompactionResults(CompactionState* compact);
+  void AllocateCompactionOutputFileNumbers(CompactionState* compact);
+  void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
+
+  void PurgeObsoleteWALFiles();
+
+  Status AppendSortedWalsOfType(const std::string& path,
+                                VectorLogPtr& log_files,
+                                WalFileType type);
+
+  // Requires: all_logs should be sorted with earliest log file first
+  // Retains all log files in all_logs which contain updates with seq no.
+  // Greater Than or Equal to the requested SequenceNumber.
+  Status RetainProbableWalFiles(VectorLogPtr& all_logs,
+                                const SequenceNumber target);
+  // Returns true if the WAL file of the given type and number exists
+  // and is empty.
+  bool CheckWalFileExistsAndEmpty(const WalFileType type,
+                                  const uint64_t number);
+
+  Status ReadFirstRecord(const WalFileType type, const uint64_t number,
+                         WriteBatch* const result);
+
+  Status ReadFirstLine(const std::string& fname, WriteBatch* const batch);
+
+  void PrintStatistics();
+
+  // dump rocksdb.stats to LOG
+  void MaybeDumpStats();
+
+  // Return the minimum empty level that could hold the total data in the
+  // input level. Return the input level, if such level could not be found.
+  int FindMinimumEmptyLevelFitting(int level);
+
+  // Move the files in the input level to the target level.
+  // If target_level < 0, automatically calculate the minimum level that could
+  // hold the data set.
+  void ReFitLevel(int level, int target_level = -1);
+
+  // Constant after construction
+  const InternalFilterPolicy internal_filter_policy_;
+  bool owns_info_log_;
+
+  // table_cache_ provides its own synchronization
+  unique_ptr<TableCache> table_cache_;
+
+  // Lock over the persistent DB state.  Non-nullptr iff successfully acquired.
+  FileLock* db_lock_;
+
+  // State below is protected by mutex_
+  port::Mutex mutex_;
+  port::AtomicPointer shutting_down_;
+  port::CondVar bg_cv_;          // Signalled when background work finishes
+  MemTableRepFactory* mem_rep_factory_;
+  MemTable* mem_;
+  MemTableList imm_;             // Memtables that are not changing
+  uint64_t logfile_number_;
+  unique_ptr<log::Writer> log_;
+
+  SuperVersion* super_version_;
+
+  std::string host_name_;
+
+  // Queue of writers.
+  std::deque<Writer*> writers_;
+  WriteBatch tmp_batch_;
+
+  SnapshotList snapshots_;
+
+  // Set of table files to protect from deletion because they are
+  // part of ongoing compactions.
+  std::set<uint64_t> pending_outputs_;
+
+  // count how many background compactions are running or have been scheduled
+  int bg_compaction_scheduled_;
+
+  // If non-zero, MaybeScheduleFlushOrCompaction() will only schedule manual
+  // compactions (if manual_compaction_ is not null). This mechanism enables
+  // manual compactions to wait until all other compactions are finished.
+  int bg_manual_only_;
+
+  // number of background memtable flush jobs, submitted to the HIGH pool
+  int bg_flush_scheduled_;
+
+  // Has a background stats log thread scheduled?
+  bool bg_logstats_scheduled_;
+
+  // Information for a manual compaction
+  struct ManualCompaction {
+    int input_level;
+    int output_level;
+    bool done;
+    bool in_progress;           // compaction request being processed?
+    const InternalKey* begin;   // nullptr means beginning of key range
+    const InternalKey* end;     // nullptr means end of key range
+    InternalKey tmp_storage;    // Used to keep track of compaction progress
+  };
+  ManualCompaction* manual_compaction_;
+
+  // Have we encountered a background error in paranoid mode?
+  Status bg_error_;
+
+  std::unique_ptr<StatsLogger> logger_;
+
+  int64_t volatile last_log_ts;
+
+  // shall we disable deletion of obsolete files
+  // if 0 the deletion is enabled.
+  // if non-zero, files will not be getting deleted
+  // This enables two different threads to call
+  // EnableFileDeletions() and DisableFileDeletions()
+  // without any synchronization
+  int disable_delete_obsolete_files_;
+
+  // last time when DeleteObsoleteFiles was invoked
+  uint64_t delete_obsolete_files_last_run_;
+
+  // last time when PurgeObsoleteWALFiles ran.
+  uint64_t purge_wal_files_last_run_;
+
+  // last time stats were dumped to LOG
+  std::atomic<uint64_t> last_stats_dump_time_microsec_;
+
+  // obsolete files will be deleted every this seconds if ttl deletion is
+  // enabled and archive size_limit is disabled.
+  uint64_t default_interval_to_delete_obsolete_WAL_;
+
+  // These count the number of microseconds for which MakeRoomForWrite stalls.
+  uint64_t stall_level0_slowdown_;
+  uint64_t stall_memtable_compaction_;
+  uint64_t stall_level0_num_files_;
+  std::vector<uint64_t> stall_leveln_slowdown_;
+  uint64_t stall_level0_slowdown_count_;
+  uint64_t stall_memtable_compaction_count_;
+  uint64_t stall_level0_num_files_count_;
+  std::vector<uint64_t> stall_leveln_slowdown_count_;
+
+  // Time at which this instance was started.
+  const uint64_t started_at_;
+
+  bool flush_on_destroy_; // Used when disableWAL is true.
+
+  // Per level compaction stats.  stats_[level] stores the stats for
+  // compactions that produced data for the specified "level".
+  struct CompactionStats {
+    uint64_t micros;
+
+    // Bytes read from level N during compaction between levels N and N+1
+    int64_t bytes_readn;
+
+    // Bytes read from level N+1 during compaction between levels N and N+1
+    int64_t bytes_readnp1;
+
+    // Total bytes written during compaction between levels N and N+1
+    int64_t bytes_written;
+
+    // Files read from level N during compaction between levels N and N+1
+    int     files_in_leveln;
+
+    // Files read from level N+1 during compaction between levels N and N+1
+    int     files_in_levelnp1;
+
+    // Files written during compaction between levels N and N+1
+    int     files_out_levelnp1;
+
+    // Number of compactions done
+    int     count;
+
+    CompactionStats() : micros(0), bytes_readn(0), bytes_readnp1(0),
+                        bytes_written(0), files_in_leveln(0),
+                        files_in_levelnp1(0), files_out_levelnp1(0),
+                        count(0) { }
+
+    void Add(const CompactionStats& c) {
+      this->micros += c.micros;
+      this->bytes_readn += c.bytes_readn;
+      this->bytes_readnp1 += c.bytes_readnp1;
+      this->bytes_written += c.bytes_written;
+      this->files_in_leveln += c.files_in_leveln;
+      this->files_in_levelnp1 += c.files_in_levelnp1;
+      this->files_out_levelnp1 += c.files_out_levelnp1;
+      // NOTE(review): increments count by 1 rather than adding c.count;
+      // this is correct only if each 'c' represents exactly one compaction.
+      // Verify before using Add() to merge already-aggregated stats.
+      this->count += 1;
+    }
+  };
+
+  std::vector<CompactionStats> stats_;
+
+  // Used to compute per-interval statistics
+  struct StatsSnapshot {
+    uint64_t compaction_bytes_read_;     // Bytes read by compaction
+    uint64_t compaction_bytes_written_;  // Bytes written by compaction
+    uint64_t ingest_bytes_;              // Bytes written by user
+    uint64_t wal_bytes_;                 // Bytes written to WAL
+    uint64_t wal_synced_;                // Number of times WAL is synced
+    uint64_t write_with_wal_;            // Number of writes that request WAL
+    // These count the number of writes processed by the calling thread or
+    // another thread.
+    uint64_t write_other_;
+    uint64_t write_self_;
+    double   seconds_up_;
+
+    StatsSnapshot() : compaction_bytes_read_(0), compaction_bytes_written_(0),
+                      ingest_bytes_(0), wal_bytes_(0), wal_synced_(0),
+                      write_with_wal_(0), write_other_(0), write_self_(0),
+                      seconds_up_(0) {}
+  };
+
+  // Counters from the previous time per-interval stats were computed
+  StatsSnapshot last_stats_;
+
+  static const int KEEP_LOG_FILE_NUM = 1000;
+  std::string db_absolute_path_;
+
+  // count of the number of contiguous delaying writes
+  int delayed_writes_;
+
+  // The options to access storage files
+  const EnvOptions storage_options_;
+
+  // A value of true temporarily disables scheduling of background work
+  bool bg_work_gate_closed_;
+
+  // Guard against multiple concurrent refitting
+  bool refitting_level_;
+
+  // No copying allowed
+  DBImpl(const DBImpl&);
+  void operator=(const DBImpl&);
+
+  // dump the delayed_writes_ to the log file and reset counter.
+  void DelayLoggingAndReset();
+
+  // Return the earliest snapshot where seqno is visible.
+  // Store the snapshot right before that, if any, in prev_snapshot
+  inline SequenceNumber findEarliestVisibleSnapshot(
+    SequenceNumber in,
+    std::vector<SequenceNumber>& snapshots,
+    SequenceNumber* prev_snapshot);
+
+  // Will return a pointer to the previous SuperVersion if its reference
+  // count is zero and it needs deletion, or nullptr if not.
+  // As argument takes a pointer to allocated SuperVersion
+  // Foreground threads call this function directly (they don't carry
+  // deletion state and have to handle their own creation and deletion
+  // of SuperVersion)
+  SuperVersion* InstallSuperVersion(SuperVersion* new_superversion);
+  // Background threads call this function, which is just a wrapper around
+  // the InstallSuperVersion() function above. Background threads carry
+  // deletion_state which can have new_superversion already allocated.
+  void InstallSuperVersion(DeletionState& deletion_state);
+
+  // Function that Get and KeyMayExist call with no_io true or false
+  // Note: 'value_found' from KeyMayExist propagates here
+  Status GetImpl(const ReadOptions& options,
+                 const Slice& key,
+                 std::string* value,
+                 bool* value_found = nullptr);
+};
+
+// Sanitize db options.  The caller should delete result.info_log if
+// it is not equal to src.info_log.
+extern Options SanitizeOptions(const std::string& db,
+                               const InternalKeyComparator* icmp,
+                               const InternalFilterPolicy* ipolicy,
+                               const Options& src);
+
+
+// Determine compression type, based on user options, level of the output
+// file and whether compression is disabled.
+// If enable_compression is false, then compression is always disabled no
+// matter what the values of the other two parameters are.
+// Otherwise, the compression type is determined based on options and level.
+CompressionType GetCompressionType(const Options& options, int level,
+                                   const bool enable_compression);
+
+// Determine compression type for a level-0 file written by memtable flush.
+CompressionType GetCompressionFlush(const Options& options);
+
+}  // namespace rocksdb
diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc
new file mode 100644 (file)
index 0000000..04033b2
--- /dev/null
@@ -0,0 +1,101 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "db/db_impl_readonly.h"
+#include "db/db_impl.h"
+
+#include <algorithm>
+#include <set>
+#include <string>
+#include <stdint.h>
+#include <stdio.h>
+#include <vector>
+#include <algorithm>
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/merge_operator.h"
+#include "port/port.h"
+#include "table/block.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/build_version.h"
+
+namespace rocksdb {
+
+// All setup is done by the DBImpl base constructor; just record the mode.
+DBImplReadOnly::DBImplReadOnly(const Options& options,
+    const std::string& dbname)
+    : DBImpl(options, dbname) {
+  Log(options_.info_log, "Opening the db in read only mode");
+}
+
+DBImplReadOnly::~DBImplReadOnly() {
+}
+
+// Implementations of the DB interface
+// Read-only Get: looks up 'key' first in the memtable and then in the
+// current version, as of the current last sequence number.
+Status DBImplReadOnly::Get(const ReadOptions& options,
+                   const Slice& key,
+                   std::string* value) {
+  Status s;
+  MemTable* mem = GetMemTable();
+  Version* current = versions_->current();
+  SequenceNumber snapshot = versions_->LastSequence();
+  MergeContext merge_context;
+  LookupKey lkey(key, snapshot);
+  if (mem->Get(lkey, value, &s, merge_context, options_)) {
+    // Resolved by the memtable: s and *value are already filled in,
+    // so this branch is intentionally empty.
+  } else {
+    Version::GetStats stats;
+    current->Get(options, lkey, value, &s, &merge_context, &stats, options_);
+  }
+  return s;
+}
+
+// Builds a user-facing iterator over the read-only database, bound
+// either to the caller-supplied snapshot or, if none was given, to the
+// latest sequence number observed while building the internal iterator.
+Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options) {
+  SequenceNumber latest_snapshot;
+  Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
+  return NewDBIterator(
+    &dbname_, env_, options_,  user_comparator(),internal_iter,
+      (options.snapshot != nullptr
+      ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
+      : latest_snapshot));
+}
+
+
+// Opens the database at 'dbname' for read-only access.  Recovery runs
+// into the instance's own memtable; unlike DB::Open, no new WAL is
+// created and no background work is scheduled here.
+Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
+                DB** dbptr, bool error_if_log_file_exist) {
+  *dbptr = nullptr;
+
+  DBImplReadOnly* impl = new DBImplReadOnly(options, dbname);
+  impl->mutex_.Lock();
+  VersionEdit edit;
+  Status s = impl->Recover(&edit, impl->GetMemTable(),
+                           error_if_log_file_exist);
+  impl->mutex_.Unlock();
+  if (s.ok()) {
+    *dbptr = impl;
+  } else {
+    delete impl;
+  }
+  return s;
+}
+
+}
diff --git a/db/db_impl_readonly.h b/db/db_impl_readonly.h
new file mode 100644 (file)
index 0000000..4beaedd
--- /dev/null
@@ -0,0 +1,78 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#pragma once
+#include "db/db_impl.h"
+
+#include <deque>
+#include <set>
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/stats_logger.h"
+
+namespace rocksdb {
+
+// DBImpl variant for databases opened via DB::OpenForReadOnly. Read
+// operations (Get, NewIterator) are served normally; every mutating DB
+// operation is overridden to return Status::NotSupported.
+class DBImplReadOnly : public DBImpl {
+public:
+  DBImplReadOnly(const Options& options, const std::string& dbname);
+ virtual ~DBImplReadOnly();
+
+ // Implementations of the DB interface
+ virtual Status Get(const ReadOptions& options,
+                    const Slice& key,
+                    std::string* value);
+
+ // TODO: Implement ReadOnly MultiGet?
+
+ virtual Iterator* NewIterator(const ReadOptions&);
+
+ // All write paths are rejected in read-only mode.
+ virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value) {
+   return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status Merge(const WriteOptions&, const Slice& key,
+                      const Slice& value) {
+   return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status Delete(const WriteOptions&, const Slice& key) {
+   return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status Write(const WriteOptions& options, WriteBatch* updates) {
+   return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ // Compaction is a no-op (silently ignored rather than an error).
+ virtual void CompactRange(const Slice* begin, const Slice* end,
+                           bool reduce_level = false, int target_level = -1) {
+ }
+ virtual Status DisableFileDeletions() {
+   return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status EnableFileDeletions(bool force) {
+   return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status GetLiveFiles(std::vector<std::string>&,
+                             uint64_t* manifest_file_size,
+                             bool flush_memtable = true) {
+   return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status Flush(const FlushOptions& options) {
+   return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+private:
+ friend class DB;
+
+ // No copying allowed
+ DBImplReadOnly(const DBImplReadOnly&);
+ void operator=(const DBImplReadOnly&);
+};
+
+}
diff --git a/db/db_iter.cc b/db/db_iter.cc
new file mode 100644 (file)
index 0000000..596a9f6
--- /dev/null
@@ -0,0 +1,481 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_iter.h"
+#include <stdexcept>
+#include <deque>
+
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/perf_context_imp.h"
+
+namespace rocksdb {
+
+// Debugging helper: dump every internal key an iterator yields to stderr.
+// Compiled out (#if 0); kept for ad-hoc debugging only.
+#if 0
+static void DumpInternalIter(Iterator* iter) {
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey k;
+    if (!ParseInternalKey(iter->key(), &k)) {
+      fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
+    } else {
+      fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
+    }
+  }
+}
+#endif
+
+namespace {
+
+// Memtables and sstables that make the DB representation contain
+// (userkey,seq,type) => uservalue entries.  DBIter
+// combines multiple entries for the same userkey found in the DB
+// representation into a single entry while accounting for sequence
+// numbers, deletion markers, overwrites, etc.
+class DBIter: public Iterator {
+ public:
+  // The following is grossly complicated. TODO: clean it up
+  // Which direction is the iterator currently moving?
+  // (1) When moving forward, the internal iterator is positioned at
+  //     the exact entry that yields this->key(), this->value()
+  // (2) When moving backwards, the internal iterator is positioned
+  //     just before all entries whose user key == this->key().
+  enum Direction {
+    kForward,
+    kReverse
+  };
+
+  DBIter(const std::string* dbname, Env* env, const Options& options,
+         const Comparator* cmp, Iterator* iter, SequenceNumber s)
+      : dbname_(dbname),
+        env_(env),
+        logger_(options.info_log.get()),
+        user_comparator_(cmp),
+        user_merge_operator_(options.merge_operator.get()),
+        iter_(iter),
+        sequence_(s),
+        direction_(kForward),
+        valid_(false),
+        current_entry_is_merged_(false),
+        statistics_(options.statistics.get()) {
+    RecordTick(statistics_, NO_ITERATORS, 1);
+    max_skip_ = options.max_sequential_skip_in_iterations;
+  }
+  // Destructor takes ownership of iter_ (deleted here) and decrements the
+  // open-iterator tick.
+  virtual ~DBIter() {
+    RecordTick(statistics_, NO_ITERATORS, -1);
+    delete iter_;
+  }
+  virtual bool Valid() const { return valid_; }
+  virtual Slice key() const {
+    assert(valid_);
+    return saved_key_;
+  }
+  // Forward, non-merged entries read the value straight from iter_;
+  // otherwise the value was materialized into saved_value_.
+  virtual Slice value() const {
+    assert(valid_);
+    return (direction_ == kForward && !current_entry_is_merged_) ?
+      iter_->value() : saved_value_;
+  }
+  // An error recorded locally (e.g. key corruption) takes precedence over
+  // the underlying iterator's status.
+  virtual Status status() const {
+    if (status_.ok()) {
+      return iter_->status();
+    } else {
+      return status_;
+    }
+  }
+
+  virtual void Next();
+  virtual void Prev();
+  virtual void Seek(const Slice& target);
+  virtual void SeekToFirst();
+  virtual void SeekToLast();
+
+ private:
+  void FindNextUserEntry(bool skipping);
+  void FindPrevUserEntry();
+  bool ParseKey(ParsedInternalKey* key);
+  void MergeValuesNewToOld();
+
+  // Copy k's bytes into *dst.
+  inline void SaveKey(const Slice& k, std::string* dst) {
+    dst->assign(k.data(), k.size());
+  }
+
+  // Clear saved_value_, releasing its heap buffer if it grew beyond 1MB so
+  // a single large value does not pin memory for the iterator's lifetime.
+  inline void ClearSavedValue() {
+    if (saved_value_.capacity() > 1048576) {
+      std::string empty;
+      swap(empty, saved_value_);
+    } else {
+      saved_value_.clear();
+    }
+  }
+
+  const std::string* const dbname_;
+  Env* const env_;
+  Logger* logger_;
+  const Comparator* const user_comparator_;
+  const MergeOperator* const user_merge_operator_;
+  Iterator* const iter_;
+  SequenceNumber const sequence_;
+
+  Status status_;
+  std::string saved_key_;     // == current key when direction_==kReverse
+  std::string saved_value_;   // == current raw value when direction_==kReverse
+  // NOTE(review): skip_key_ is never referenced in this translation unit;
+  // candidate for removal.
+  std::string skip_key_;
+  Direction direction_;
+  bool valid_;
+  bool current_entry_is_merged_;
+  Statistics* statistics_;
+  // Max sequential internal keys to step over before reseeking instead
+  // (from Options::max_sequential_skip_in_iterations).
+  uint64_t max_skip_;
+
+  // No copying allowed
+  DBIter(const DBIter&);
+  void operator=(const DBIter&);
+};
+
+// Decode iter_'s current internal key into *ikey. On corruption, records
+// a Corruption status, logs the raw key, and returns false; callers treat
+// a false return as "skip this entry".
+inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
+  if (!ParseInternalKey(iter_->key(), ikey)) {
+    status_ = Status::Corruption("corrupted internal key in DBIter");
+    Log(logger_, "corrupted internal key in DBIter: %s",
+        iter_->key().ToString(true).c_str());
+    return false;
+  } else {
+    return true;
+  }
+}
+
+// Advance to the next visible user key.
+void DBIter::Next() {
+  assert(valid_);
+
+  if (direction_ == kReverse) {  // Switch directions?
+    direction_ = kForward;
+    // iter_ is pointing just before the entries for this->key(),
+    // so advance into the range of entries for this->key() and then
+    // use the normal skipping code below.
+    if (!iter_->Valid()) {
+      iter_->SeekToFirst();
+    } else {
+      iter_->Next();
+    }
+    if (!iter_->Valid()) {
+      valid_ = false;
+      saved_key_.clear();
+      return;
+    }
+  }
+
+  // If the current value is merged, we might already hit end of iter_
+  // (MergeValuesNewToOld leaves iter_ past the consumed entries).
+  if (!iter_->Valid()) {
+    valid_ = false;
+    return;
+  }
+  FindNextUserEntry(true /* skipping the current user key */);
+}
+
+
+// PRE: saved_key_ has the current user key if skipping
+// POST: saved_key_ should have the next user key if valid_,
+//       if the current entry is a result of merge
+//           current_entry_is_merged_ => true
+//           saved_value_             => the merged value
+//
+// NOTE: In between, saved_key_ can point to a user key that has
+//       a delete marker
+//
+// Scans forward over internal keys, hiding entries newer than sequence_,
+// entries shadowed by a deletion, and older duplicates of a key already
+// yielded. Stops at the first visible kTypeValue or kTypeMerge entry, or
+// runs iter_ off the end (valid_ = false).
+void DBIter::FindNextUserEntry(bool skipping) {
+  // Loop until we hit an acceptable entry to yield
+  assert(iter_->Valid());
+  assert(direction_ == kForward);
+  current_entry_is_merged_ = false;
+  uint64_t num_skipped = 0;
+  do {
+    ParsedInternalKey ikey;
+    // Entries with sequence > sequence_ are invisible to this snapshot and
+    // are stepped over without counting toward the reseek heuristic.
+    if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
+      if (skipping &&
+          user_comparator_->Compare(ikey.user_key, saved_key_) <= 0) {
+        num_skipped++; // skip this entry
+        BumpPerfCount(&perf_context.internal_key_skipped_count);
+      } else {
+        skipping = false;
+        switch (ikey.type) {
+          case kTypeDeletion:
+            // Arrange to skip all upcoming entries for this key since
+            // they are hidden by this deletion.
+            SaveKey(ikey.user_key, &saved_key_);
+            skipping = true;
+            num_skipped = 0;
+            BumpPerfCount(&perf_context.internal_delete_skipped_count);
+            break;
+          case kTypeValue:
+            valid_ = true;
+            SaveKey(ikey.user_key, &saved_key_);
+            return;
+          case kTypeMerge:
+            // By now, we are sure the current ikey is going to yield a value
+            SaveKey(ikey.user_key, &saved_key_);
+            current_entry_is_merged_ = true;
+            valid_ = true;
+            MergeValuesNewToOld();  // Go to a different state machine
+            return;
+          case kTypeLogData:
+            // Log-data records never appear in the keyspace.
+            assert(false);
+            break;
+        }
+      }
+    }
+    // If we have sequentially iterated via numerous keys and still not
+    // found the next user-key, then it is better to seek so that we can
+    // avoid too many key comparisons. We seek to the last occurence of
+    // our current key by looking for sequence number 0.
+    if (skipping && num_skipped > max_skip_) {
+      num_skipped = 0;
+      std::string last_key;
+      AppendInternalKey(&last_key,
+        ParsedInternalKey(Slice(saved_key_), 0, kValueTypeForSeek));
+      iter_->Seek(last_key);
+      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+    } else {
+      iter_->Next();
+    }
+  } while (iter_->Valid());
+  valid_ = false;
+}
+
+// Merge values of the same user key starting from the current iter_ position
+// Scan from the newer entries to older entries.
+// PRE: iter_->key() points to the first merge type entry
+//      saved_key_ stores the user key
+// POST: saved_value_ has the merged value for the user key
+//       iter_ points to the next entry (or invalid)
+//
+// Throws std::logic_error if no merge operator was configured, since a
+// kTypeMerge entry cannot be interpreted without one.
+void DBIter::MergeValuesNewToOld() {
+  if (!user_merge_operator_) {
+    Log(logger_, "Options::merge_operator is null.");
+    throw std::logic_error("DBIter::MergeValuesNewToOld() with"
+                           " Options::merge_operator null");
+  }
+
+  // Start the merge process by pushing the first operand
+  std::deque<std::string> operands;
+  operands.push_front(iter_->value().ToString());
+
+  std::string merge_result;   // Temporary string to hold merge result later
+  ParsedInternalKey ikey;
+  for (iter_->Next(); iter_->Valid(); iter_->Next()) {
+    if (!ParseKey(&ikey)) {
+      // skip corrupted key
+      continue;
+    }
+
+    if (user_comparator_->Compare(ikey.user_key, saved_key_) != 0) {
+      // hit the next user key, stop right here
+      break;
+    }
+
+    if (kTypeDeletion == ikey.type) {
+      // hit a delete with the same user key, stop right here
+      // iter_ is positioned after delete
+      iter_->Next();
+      break;
+    }
+
+    if (kTypeValue == ikey.type) {
+      // hit a put, merge the put value with operands and store the
+      // final result in saved_value_. We are done!
+      // ignore corruption if there is any.
+      const Slice value = iter_->value();
+      user_merge_operator_->FullMerge(ikey.user_key, &value, operands,
+                                      &saved_value_, logger_);
+      // iter_ is positioned after put
+      iter_->Next();
+      return;
+    }
+
+    if (kTypeMerge == ikey.type) {
+      // hit a merge, add the value as an operand and run associative merge.
+      // when complete, add result to operands and continue.
+      const Slice& value = iter_->value();
+      operands.push_front(value.ToString());
+      // Collapse adjacent operands while PartialMerge keeps succeeding; a
+      // false return means the pair must be kept for the final FullMerge.
+      while(operands.size() >= 2) {
+        // Call user associative-merge until it returns false
+        if (user_merge_operator_->PartialMerge(ikey.user_key,
+                                               Slice(operands[0]),
+                                               Slice(operands[1]),
+                                               &merge_result,
+                                               logger_)) {
+          operands.pop_front();
+          swap(operands.front(), merge_result);
+        } else {
+          // Associative merge returns false ==> stack the operands
+          break;
+        }
+      }
+
+    }
+  }
+
+  // we either exhausted all internal keys under this user key, or hit
+  // a deletion marker.
+  // feed null as the existing value to the merge operator, such that
+  // client can differentiate this scenario and do things accordingly.
+  user_merge_operator_->FullMerge(saved_key_, nullptr, operands,
+                                  &saved_value_, logger_);
+}
+
+// Step back to the previous visible user key. Not supported when a merge
+// operator is configured (backward merge is unimplemented) — throws.
+void DBIter::Prev() {
+  assert(valid_);
+
+  // Throw an exception now if merge_operator is provided
+  // TODO: support backward iteration
+  if (user_merge_operator_) {
+    Log(logger_, "Prev not supported yet if merge_operator is provided");
+    throw std::logic_error("DBIter::Prev backward iteration not supported"
+                           " if merge_operator is provided");
+  }
+
+  if (direction_ == kForward) {  // Switch directions?
+    // iter_ is pointing at the current entry.  Scan backwards until
+    // the key changes so we can use the normal reverse scanning code.
+    assert(iter_->Valid());  // Otherwise valid_ would have been false
+    SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
+    while (true) {
+      iter_->Prev();
+      if (!iter_->Valid()) {
+        valid_ = false;
+        saved_key_.clear();
+        ClearSavedValue();
+        return;
+      }
+      if (user_comparator_->Compare(ExtractUserKey(iter_->key()),
+                                    saved_key_) < 0) {
+        break;
+      }
+    }
+    direction_ = kReverse;
+  }
+
+  FindPrevUserEntry();
+}
+
+// Scan backwards for the newest visible, non-deleted entry of the nearest
+// preceding user key. value_type tracks the type of the most recent
+// visible entry seen for the key currently in saved_key_; the loop stops
+// once it steps onto an older user key while holding a live value.
+// Leaves iter_ positioned just before the yielded key's entries (the
+// kReverse invariant), or invalidates the iterator at the beginning.
+void DBIter::FindPrevUserEntry() {
+  assert(direction_ == kReverse);
+  uint64_t num_skipped = 0;
+
+  ValueType value_type = kTypeDeletion;
+  if (iter_->Valid()) {
+    do {
+      ParsedInternalKey ikey;
+      bool saved_key_cleared = false;
+      if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
+        if ((value_type != kTypeDeletion) &&
+            user_comparator_->Compare(ikey.user_key, saved_key_) < 0) {
+          // We encountered a non-deleted value in entries for previous keys,
+          break;
+        }
+        value_type = ikey.type;
+        if (value_type == kTypeDeletion) {
+          saved_key_.clear();
+          ClearSavedValue();
+          saved_key_cleared = true;
+        } else {
+          // Release saved_value_'s buffer if it is far larger than the
+          // value about to be stored.
+          Slice raw_value = iter_->value();
+          if (saved_value_.capacity() > raw_value.size() + 1048576) {
+            std::string empty;
+            swap(empty, saved_value_);
+          }
+          SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
+          saved_value_.assign(raw_value.data(), raw_value.size());
+        }
+      }
+      num_skipped++;
+      // If we have sequentially iterated via numerous keys and still not
+      // found the prev user-key, then it is better to seek so that we can
+      // avoid too many key comparisons. We seek to the first occurence of
+      // our current key by looking for max sequence number.
+      // (Skipped when saved_key_ was just cleared by a deletion — there is
+      // no key to reseek to.)
+      if (!saved_key_cleared && num_skipped > max_skip_) {
+        num_skipped = 0;
+        std::string last_key;
+        AppendInternalKey(&last_key,
+          ParsedInternalKey(Slice(saved_key_), kMaxSequenceNumber,
+                            kValueTypeForSeek));
+        iter_->Seek(last_key);
+        RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+      } else {
+        iter_->Prev();
+      }
+    } while (iter_->Valid());
+  }
+
+  if (value_type == kTypeDeletion) {
+    // End
+    valid_ = false;
+    saved_key_.clear();
+    ClearSavedValue();
+    direction_ = kForward;
+  } else {
+    valid_ = true;
+  }
+}
+
+// Position at the first entry whose user key is >= target, as seen by
+// this iterator's snapshot. Seeks the internal iterator with an internal
+// key built from (target, sequence_).
+void DBIter::Seek(const Slice& target) {
+  direction_ = kForward;
+  ClearSavedValue();
+  saved_key_.clear();
+  AppendInternalKey(
+      &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
+  iter_->Seek(saved_key_);
+  if (iter_->Valid()) {
+    FindNextUserEntry(false /*not skipping */);
+  } else {
+    valid_ = false;
+  }
+}
+
+// Position at the first visible user key in the database.
+void DBIter::SeekToFirst() {
+  direction_ = kForward;
+  ClearSavedValue();
+  iter_->SeekToFirst();
+  if (iter_->Valid()) {
+    FindNextUserEntry(false /* not skipping */);
+  } else {
+    valid_ = false;
+  }
+}
+
+// Position at the last visible user key. Like Prev(), unsupported (throws)
+// when a merge operator is configured.
+void DBIter::SeekToLast() {
+  // Throw an exception for now if merge_operator is provided
+  // TODO: support backward iteration
+  if (user_merge_operator_) {
+    Log(logger_, "SeekToLast not supported yet if merge_operator is provided");
+    throw std::logic_error("DBIter::SeekToLast: backward iteration not"
+                           " supported if merge_operator is provided");
+  }
+
+  direction_ = kReverse;
+  ClearSavedValue();
+  iter_->SeekToLast();
+  FindPrevUserEntry();
+}
+
+}  // anonymous namespace
+
+// Factory for DBIter (the class itself lives in an anonymous namespace).
+// The returned iterator takes ownership of internal_iter.
+Iterator* NewDBIterator(
+    const std::string* dbname,
+    Env* env,
+    const Options& options,
+    const Comparator *user_key_comparator,
+    Iterator* internal_iter,
+    const SequenceNumber& sequence) {
+  return new DBIter(dbname, env, options, user_key_comparator,
+                    internal_iter, sequence);
+}
+
+}  // namespace rocksdb
diff --git a/db/db_iter.h b/db/db_iter.h
new file mode 100644 (file)
index 0000000..b44e674
--- /dev/null
@@ -0,0 +1,28 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+#include "rocksdb/db.h"
+#include "db/dbformat.h"
+
+namespace rocksdb {
+
+// Return a new iterator that converts internal keys (yielded by
+// "*internal_iter") that were live at the specified "sequence" number
+// into appropriate user keys.
+extern Iterator* NewDBIterator(
+    const std::string* dbname,
+    Env* env,
+    const Options& options,
+    const Comparator *user_key_comparator,
+    Iterator* internal_iter,
+    const SequenceNumber& sequence);
+
+}  // namespace rocksdb
diff --git a/db/db_statistics.cc b/db/db_statistics.cc
new file mode 100644 (file)
index 0000000..f0cfd67
--- /dev/null
@@ -0,0 +1,14 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/db_statistics.h"
+
+namespace rocksdb {
+
+// Factory for the default Statistics implementation.
+std::shared_ptr<Statistics> CreateDBStatistics() {
+  return std::make_shared<DBStatistics>();
+}
+
+} // namespace rocksdb
diff --git a/db/db_statistics.h b/db/db_statistics.h
new file mode 100644 (file)
index 0000000..ec71e16
--- /dev/null
@@ -0,0 +1,63 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <cassert>
+#include <stdlib.h>
+#include <vector>
+#include <memory>
+
+#include "rocksdb/statistics.h"
+#include "util/histogram.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+
+
+namespace rocksdb {
+
+// Default Statistics implementation: a fixed-size vector of tickers and a
+// fixed-size vector of histograms, indexed directly by the Tickers /
+// Histograms enum values.
+class DBStatistics: public Statistics {
+ public:
+  DBStatistics() : allTickers_(TICKER_ENUM_MAX),
+                   allHistograms_(HISTOGRAM_ENUM_MAX) { }
+
+  virtual ~DBStatistics() {}
+
+  virtual long getTickerCount(Tickers tickerType) {
+    assert(tickerType < TICKER_ENUM_MAX);
+    return allTickers_[tickerType].getCount();
+  }
+
+  virtual void setTickerCount(Tickers tickerType, uint64_t count) {
+    assert(tickerType < TICKER_ENUM_MAX);
+    allTickers_[tickerType].setTickerCount(count);
+  }
+
+  virtual void recordTick(Tickers tickerType, uint64_t count) {
+    assert(tickerType < TICKER_ENUM_MAX);
+    allTickers_[tickerType].recordTick(count);
+  }
+
+  virtual void measureTime(Histograms histogramType, uint64_t value) {
+    assert(histogramType < HISTOGRAM_ENUM_MAX);
+    allHistograms_[histogramType].Add(value);
+  }
+
+  virtual void histogramData(Histograms histogramType,
+                             HistogramData * const data) {
+    assert(histogramType < HISTOGRAM_ENUM_MAX);
+    allHistograms_[histogramType].Data(data);
+  }
+
+  // NOTE(review): these are public data members; nothing in this file
+  // accesses them directly, but external code may — confirm before hiding.
+  std::vector<Ticker> allTickers_;
+  std::vector<HistogramImpl> allHistograms_;
+};
+
+std::shared_ptr<Statistics> CreateDBStatistics();
+
+} // namespace rocksdb
diff --git a/db/db_stats_logger.cc b/db/db_stats_logger.cc
new file mode 100644 (file)
index 0000000..db86865
--- /dev/null
@@ -0,0 +1,94 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+#include <string>
+#include <stdint.h>
+#include <stdio.h>
+#include "db/version_set.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+
+namespace rocksdb {
+
+// Schedule a background deploy-stats logging pass if stats logging is
+// configured, none is already scheduled, and at least
+// db_stats_log_interval seconds have passed since the last one.
+void DBImpl::MaybeScheduleLogDBDeployStats() {
+
+  // There is a lock in the actual logger.
+  // Disabled when there is no stats logger, the interval is negative, or
+  // no host name is available to tag the stats with.
+  if (!logger_ || options_.db_stats_log_interval < 0
+      || host_name_.empty()) {
+    return;
+  }
+
+  if(bg_logstats_scheduled_ || shutting_down_.Acquire_Load()) {
+    // Already scheduled
+  } else {
+    int64_t current_ts = 0;
+    Status st = env_->GetCurrentTime(&current_ts);
+    if (!st.ok()) {
+      return;
+    }
+    // Rate-limit: skip if the interval since the last log has not elapsed.
+    if ((current_ts - last_log_ts) < options_.db_stats_log_interval) {
+      return;
+    }
+    last_log_ts = current_ts;
+    bg_logstats_scheduled_ = true;
+    env_->Schedule(&DBImpl::BGLogDBDeployStats, this);
+  }
+}
+
+// Static trampoline for Env::Schedule: forwards to the instance method.
+void DBImpl::BGLogDBDeployStats(void* db) {
+  DBImpl* db_inst = reinterpret_cast<DBImpl*>(db);
+  db_inst->LogDBDeployStats();
+}
+
+// Background task: snapshot per-level file counts/sizes under the DB
+// mutex, then emit them through the stats logger with the mutex released.
+// Always clears bg_logstats_scheduled_ and signals bg_cv_ before exiting.
+void DBImpl::LogDBDeployStats() {
+  mutex_.Lock();
+
+  if (shutting_down_.Acquire_Load()) {
+    bg_logstats_scheduled_ = false;
+    bg_cv_.SignalAll();
+    mutex_.Unlock();
+    return;
+  }
+
+  char tmp_ver[100];
+  sprintf(tmp_ver, "%d.%d", kMajorVersion, kMinorVersion);
+  std::string version_info(tmp_ver);
+
+  uint64_t file_total_size = 0;
+  uint32_t file_total_num = 0;
+  Version* current = versions_->current();
+  for (int i = 0; i < current->NumberLevels(); i++) {
+    file_total_num += current->NumLevelFiles(i);
+    file_total_size += current->NumLevelBytes(i);
+  }
+
+  Version::LevelSummaryStorage scratch;
+  const char* file_num_summary = current->LevelSummary(&scratch);
+  std::string file_num_per_level(file_num_summary);
+  // NOTE(review): data_size_per_level is built from the same file-count
+  // summary string as file_num_per_level, so the logged "data size per
+  // level" duplicates the file counts rather than byte sizes. Looks
+  // unintended — confirm against the stats logger's expectations.
+  std::string data_size_per_level(file_num_summary);
+
+  mutex_.Unlock();
+
+  int64_t unix_ts;
+  env_->GetCurrentTime(&unix_ts);
+
+  // Logger call happens outside the mutex since it may block on I/O.
+  logger_->Log_Deploy_Stats(version_info, host_name_,
+      db_absolute_path_, file_total_size, file_total_num, file_num_per_level,
+      data_size_per_level, unix_ts);
+
+  mutex_.Lock();
+  bg_logstats_scheduled_ = false;
+  bg_cv_.SignalAll();
+  mutex_.Unlock();
+}
+
+}
diff --git a/db/db_test.cc b/db/db_test.cc
new file mode 100644 (file)
index 0000000..9c8a97f
--- /dev/null
@@ -0,0 +1,4991 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <set>
+#include <unistd.h>
+
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "db/db_statistics.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "utilities/merge_operators.h"
+
+namespace rocksdb {
+
+// Probe whether this build can Snappy-compress by attempting a small
+// compression; used to skip compression-dependent tests.
+static bool SnappyCompressionSupported(const CompressionOptions& options) {
+  std::string out;
+  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+  return port::Snappy_Compress(options, in.data(), in.size(), &out);
+}
+
+// Same probe as above, for zlib.
+static bool ZlibCompressionSupported(const CompressionOptions& options) {
+  std::string out;
+  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+  return port::Zlib_Compress(options, in.data(), in.size(), &out);
+}
+
+// Same probe as above, for bzip2.
+static bool BZip2CompressionSupported(const CompressionOptions& options) {
+  std::string out;
+  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+  return port::BZip2_Compress(options, in.data(), in.size(), &out);
+}
+
+// Convenience wrapper returning a random string of the given length.
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+namespace anon {
+// Mutex-protected integer counter used by SpecialEnv to count events
+// (random reads, sleeps) from multiple threads.
+class AtomicCounter {
+ private:
+  port::Mutex mu_;
+  int count_;
+ public:
+  AtomicCounter() : count_(0) { }
+  void Increment() {
+    MutexLock l(&mu_);
+    count_++;
+  }
+  int Read() {
+    MutexLock l(&mu_);
+    return count_;
+  }
+  void Reset() {
+    MutexLock l(&mu_);
+    count_ = 0;
+  }
+};
+
+}
+
+// Special Env used to delay background operations
+class SpecialEnv : public EnvWrapper {
+ public:
+  // sstable Sync() calls are blocked while this pointer is non-nullptr.
+  port::AtomicPointer delay_sstable_sync_;
+
+  // Simulate no-space errors while this pointer is non-nullptr.
+  port::AtomicPointer no_space_;
+
+  // Simulate non-writable file system while this pointer is non-nullptr
+  port::AtomicPointer non_writable_;
+
+  // Force sync of manifest files to fail while this pointer is non-nullptr
+  port::AtomicPointer manifest_sync_error_;
+
+  // Force write to manifest files to fail while this pointer is non-nullptr
+  port::AtomicPointer manifest_write_error_;
+
+  // Force write to log files to fail while this pointer is non-nullptr
+  port::AtomicPointer log_write_error_;
+
+  bool count_random_reads_;
+  anon::AtomicCounter random_read_counter_;
+
+  anon::AtomicCounter sleep_counter_;
+
+  explicit SpecialEnv(Env* base) : EnvWrapper(base) {
+    delay_sstable_sync_.Release_Store(nullptr);
+    no_space_.Release_Store(nullptr);
+    non_writable_.Release_Store(nullptr);
+    count_random_reads_ = false;
+    manifest_sync_error_.Release_Store(nullptr);
+    manifest_write_error_.Release_Store(nullptr);
+    log_write_error_.Release_Store(nullptr);
+   }
+
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& soptions) {
+    class SSTableFile : public WritableFile {
+     private:
+      SpecialEnv* env_;
+      unique_ptr<WritableFile> base_;
+
+     public:
+      SSTableFile(SpecialEnv* env, unique_ptr<WritableFile>&& base)
+          : env_(env),
+            base_(std::move(base)) {
+      }
+      Status Append(const Slice& data) {
+        if (env_->no_space_.Acquire_Load() != nullptr) {
+          // Drop writes on the floor
+          return Status::OK();
+        } else {
+          return base_->Append(data);
+        }
+      }
+      Status Close() { return base_->Close(); }
+      Status Flush() { return base_->Flush(); }
+      Status Sync() {
+        while (env_->delay_sstable_sync_.Acquire_Load() != nullptr) {
+          env_->SleepForMicroseconds(100000);
+        }
+        return base_->Sync();
+      }
+    };
+    class ManifestFile : public WritableFile {
+     private:
+      SpecialEnv* env_;
+      unique_ptr<WritableFile> base_;
+     public:
+      ManifestFile(SpecialEnv* env, unique_ptr<WritableFile>&& b)
+          : env_(env), base_(std::move(b)) { }
+      Status Append(const Slice& data) {
+        if (env_->manifest_write_error_.Acquire_Load() != nullptr) {
+          return Status::IOError("simulated writer error");
+        } else {
+          return base_->Append(data);
+        }
+      }
+      Status Close() { return base_->Close(); }
+      Status Flush() { return base_->Flush(); }
+      Status Sync() {
+        if (env_->manifest_sync_error_.Acquire_Load() != nullptr) {
+          return Status::IOError("simulated sync error");
+        } else {
+          return base_->Sync();
+        }
+      }
+    };
+    class LogFile : public WritableFile {
+     private:
+      SpecialEnv* env_;
+      unique_ptr<WritableFile> base_;
+     public:
+      LogFile(SpecialEnv* env, unique_ptr<WritableFile>&& b)
+          : env_(env), base_(std::move(b)) { }
+      Status Append(const Slice& data) {
+        if (env_->log_write_error_.Acquire_Load() != nullptr) {
+          return Status::IOError("simulated writer error");
+        } else {
+          return base_->Append(data);
+        }
+      }
+      Status Close() { return base_->Close(); }
+      Status Flush() { return base_->Flush(); }
+      Status Sync() { return base_->Sync(); }
+    };
+
+    if (non_writable_.Acquire_Load() != nullptr) {
+      return Status::IOError("simulated write error");
+    }
+
+    Status s = target()->NewWritableFile(f, r, soptions);
+    if (s.ok()) {
+      if (strstr(f.c_str(), ".sst") != nullptr) {
+        r->reset(new SSTableFile(this, std::move(*r)));
+      } else if (strstr(f.c_str(), "MANIFEST") != nullptr) {
+        r->reset(new ManifestFile(this, std::move(*r)));
+      } else if (strstr(f.c_str(), "log") != nullptr) {
+        r->reset(new LogFile(this, std::move(*r)));
+      }
+    }
+    return s;
+  }
+
+  Status NewRandomAccessFile(const std::string& f,
+                             unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& soptions) {
+    class CountingFile : public RandomAccessFile {
+     private:
+      unique_ptr<RandomAccessFile> target_;
+      anon::AtomicCounter* counter_;
+     public:
+      CountingFile(unique_ptr<RandomAccessFile>&& target,
+                   anon::AtomicCounter* counter)
+          : target_(std::move(target)), counter_(counter) {
+      }
+      virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                          char* scratch) const {
+        counter_->Increment();
+        return target_->Read(offset, n, result, scratch);
+      }
+    };
+
+    Status s = target()->NewRandomAccessFile(f, r, soptions);
+    if (s.ok() && count_random_reads_) {
+      r->reset(new CountingFile(std::move(*r), &random_read_counter_));
+    }
+    return s;
+  }
+
  // Counts each sleep request (so tests can detect that background code
  // waited) and then forwards to the real Env's sleep.
  virtual void SleepForMicroseconds(int micros) {
    sleep_counter_.Increment();
    target()->SleepForMicroseconds(micros);
  }
+};
+
+class DBTest {
+ private:
+  const FilterPolicy* filter_policy_;
+
+ protected:
+  // Sequence of option configurations to try
+  enum OptionConfig {
+    kDefault,
+    kVectorRep,
+    kMergePut,
+    kFilter,
+    kUncompressed,
+    kNumLevel_3,
+    kDBLogDir,
+    kWalDir,
+    kManifestFileSize,
+    kCompactOnFlush,
+    kPerfOptions,
+    kDeletesFilterFirst,
+    kHashSkipList,
+    kUniversalCompaction,
+    kCompressedBlockCache,
+    kEnd
+  };
+  int option_config_;
+
+ public:
+  std::string dbname_;
+  SpecialEnv* env_;
+  DB* db_;
+
+  Options last_options_;
+
+  // Skip some options, as they may not be applicable to a specific test.
+  // To add more skip constants, use values 4, 8, 16, etc.
+  enum OptionSkip {
+    kNoSkip = 0,
+    kSkipDeletesFilterFirst = 1,
+    kSkipUniversalCompaction = 2,
+    kSkipMergePut = 4
+  };
+
+  DBTest() : option_config_(kDefault),
+             env_(new SpecialEnv(Env::Default())) {
+    filter_policy_ = NewBloomFilterPolicy(10);
+    dbname_ = test::TmpDir() + "/db_test";
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+    Reopen();
+  }
+
+  ~DBTest() {
+    delete db_;
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+    delete env_;
+    delete filter_policy_;
+  }
+
+  // Switch to a fresh database with the next option configuration to
+  // test.  Return false if there are no more configurations to test.
+  bool ChangeOptions(int skip_mask = kNoSkip) {
+    option_config_++;
+
+    // skip some options
+    if (skip_mask & kSkipDeletesFilterFirst &&
+        option_config_ == kDeletesFilterFirst) {
+      option_config_++;
+    }
+    if (skip_mask & kSkipUniversalCompaction &&
+        option_config_ == kUniversalCompaction) {
+      option_config_++;
+    }
+    if (skip_mask & kSkipMergePut && option_config_ == kMergePut) {
+      option_config_++;
+    }
+    if (option_config_ >= kEnd) {
+      Destroy(&last_options_);
+      return false;
+    } else {
+      DestroyAndReopen();
+      return true;
+    }
+  }
+
+  // Switch between different compaction styles (we have only 2 now).
+  bool ChangeCompactOptions(Options* prev_options = nullptr) {
+    if (option_config_ == kDefault) {
+      option_config_ = kUniversalCompaction;
+      if (prev_options == nullptr) {
+        prev_options = &last_options_;
+      }
+      Destroy(prev_options);
+      TryReopen();
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  // Return the current option configuration.
+  Options CurrentOptions() {
+    Options options;
+    switch (option_config_) {
+      case kHashSkipList:
+        options.memtable_factory.reset(
+            NewHashSkipListRepFactory(NewFixedPrefixTransform(1)));
+        break;
+      case kMergePut:
+        options.merge_operator = MergeOperators::CreatePutOperator();
+        break;
+      case kFilter:
+        options.filter_policy = filter_policy_;
+        break;
+      case kUncompressed:
+        options.compression = kNoCompression;
+        break;
+      case kNumLevel_3:
+        options.num_levels = 3;
+        break;
+      case kDBLogDir:
+        options.db_log_dir = test::TmpDir();
+        break;
+      case kWalDir:
+        options.wal_dir = "/tmp/wal";
+        break;
+      case kManifestFileSize:
+        options.max_manifest_file_size = 50; // 50 bytes
+      case kCompactOnFlush:
+        options.purge_redundant_kvs_while_flush =
+          !options.purge_redundant_kvs_while_flush;
+        break;
+      case kPerfOptions:
+        options.hard_rate_limit = 2.0;
+        options.rate_limit_delay_max_milliseconds = 2;
+        // TODO -- test more options
+        break;
+      case kDeletesFilterFirst:
+        options.filter_deletes = true;
+        break;
+      case kVectorRep:
+        options.memtable_factory.reset(new VectorRepFactory(100));
+        break;
+      case kUniversalCompaction:
+        options.compaction_style = kCompactionStyleUniversal;
+        break;
+      case kCompressedBlockCache:
+        options.block_cache_compressed = NewLRUCache(8*1024*1024);
+        break;
+      default:
+        break;
+    }
+    return options;
+  }
+
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  void Reopen(Options* options = nullptr) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Close() {
+    delete db_;
+    db_ = nullptr;
+  }
+
+  void DestroyAndReopen(Options* options = nullptr) {
+    //Destroy using last options
+    Destroy(&last_options_);
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Destroy(Options* options) {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, *options));
+  }
+
+  Status PureReopen(Options* options, DB** db) {
+    return DB::Open(*options, dbname_, db);
+  }
+
+  Status TryReopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opts;
+    if (options != nullptr) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    last_options_ = opts;
+
+    return DB::Open(opts, dbname_, &db_);
+  }
+
+  Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) {
+    if (kMergePut == option_config_ ) {
+      return db_->Merge(wo, k, v);
+    } else {
+      return db_->Put(wo, k, v);
+    }
+  }
+
+  Status Delete(const std::string& k) {
+    return db_->Delete(WriteOptions(), k);
+  }
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  // Return a string that contains all key,value pairs in order,
+  // formatted like "(k1->v1)(k2->v2)".
+  std::string Contents() {
+    std::vector<std::string> forward;
+    std::string result;
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      std::string s = IterStatus(iter);
+      result.push_back('(');
+      result.append(s);
+      result.push_back(')');
+      forward.push_back(s);
+    }
+
+    // Check reverse iteration results are the reverse of forward results
+    unsigned int matched = 0;
+    for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+      ASSERT_LT(matched, forward.size());
+      ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
+      matched++;
+    }
+    ASSERT_EQ(matched, forward.size());
+
+    delete iter;
+    return result;
+  }
+
+  std::string AllEntriesFor(const Slice& user_key) {
+    Iterator* iter = dbfull()->TEST_NewInternalIterator();
+    InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+    iter->Seek(target.Encode());
+    std::string result;
+    if (!iter->status().ok()) {
+      result = iter->status().ToString();
+    } else {
+      result = "[ ";
+      bool first = true;
+      while (iter->Valid()) {
+        ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+        if (!ParseInternalKey(iter->key(), &ikey)) {
+          result += "CORRUPTED";
+        } else {
+          if (last_options_.comparator->Compare(ikey.user_key, user_key) != 0) {
+            break;
+          }
+          if (!first) {
+            result += ", ";
+          }
+          first = false;
+          switch (ikey.type) {
+            case kTypeValue:
+              result += iter->value().ToString();
+              break;
+            case kTypeMerge:
+              // keep it the same as kTypeValue for testing kMergePut
+              result += iter->value().ToString();
+              break;
+            case kTypeDeletion:
+              result += "DEL";
+              break;
+            case kTypeLogData:
+              assert(false);
+              break;
+          }
+        }
+        iter->Next();
+      }
+      if (!first) {
+        result += " ";
+      }
+      result += "]";
+    }
+    delete iter;
+    return result;
+  }
+
+  int NumTableFilesAtLevel(int level) {
+    std::string property;
+    ASSERT_TRUE(
+        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
+                         &property));
+    return atoi(property.c_str());
+  }
+
+  int TotalTableFiles() {
+    int result = 0;
+    for (int level = 0; level < db_->NumberLevels(); level++) {
+      result += NumTableFilesAtLevel(level);
+    }
+    return result;
+  }
+
+  // Return spread of files per level
+  std::string FilesPerLevel() {
+    std::string result;
+    int last_non_zero_offset = 0;
+    for (int level = 0; level < db_->NumberLevels(); level++) {
+      int f = NumTableFilesAtLevel(level);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+
+  int CountFiles() {
+    std::vector<std::string> files;
+    env_->GetChildren(dbname_, &files);
+
+    std::vector<std::string> logfiles;
+    if (dbname_ != last_options_.wal_dir) {
+      env_->GetChildren(last_options_.wal_dir, &logfiles);
+    }
+
+    return static_cast<int>(files.size() + logfiles.size());
+  }
+
+  int CountLiveFiles() {
+    std::vector<std::string> files;
+    uint64_t manifest_file_size;
+    db_->GetLiveFiles(files, &manifest_file_size);
+    return files.size();
+  }
+
+  uint64_t Size(const Slice& start, const Slice& limit) {
+    Range r(start, limit);
+    uint64_t size;
+    db_->GetApproximateSizes(&r, 1, &size);
+    return size;
+  }
+
+  void Compact(const Slice& start, const Slice& limit) {
+    db_->CompactRange(&start, &limit);
+  }
+
+  // Do n memtable compactions, each of which produces an sstable
+  // covering the range [small,large].
+  void MakeTables(int n, const std::string& small, const std::string& large) {
+    for (int i = 0; i < n; i++) {
+      Put(small, "begin");
+      Put(large, "end");
+      dbfull()->TEST_FlushMemTable();
+    }
+  }
+
+  // Prevent pushing of new sstables into deeper levels by adding
+  // tables that cover a specified range to all levels.
+  void FillLevels(const std::string& smallest, const std::string& largest) {
+    MakeTables(db_->NumberLevels(), smallest, largest);
+  }
+
+  void DumpFileCounts(const char* label) {
+    fprintf(stderr, "---\n%s:\n", label);
+    fprintf(stderr, "maxoverlap: %lld\n",
+            static_cast<long long>(
+                dbfull()->TEST_MaxNextLevelOverlappingBytes()));
+    for (int level = 0; level < db_->NumberLevels(); level++) {
+      int num = NumTableFilesAtLevel(level);
+      if (num > 0) {
+        fprintf(stderr, "  level %3d : %d files\n", level, num);
+      }
+    }
+  }
+
+  std::string DumpSSTableList() {
+    std::string property;
+    db_->GetProperty("rocksdb.sstables", &property);
+    return property;
+  }
+
+  std::string IterStatus(Iterator* iter) {
+    std::string result;
+    if (iter->Valid()) {
+      result = iter->key().ToString() + "->" + iter->value().ToString();
+    } else {
+      result = "(invalid)";
+    }
+    return result;
+  }
+
+  Options OptionsForLogIterTest() {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.WAL_ttl_seconds = 1000;
+    return options;
+  }
+
+  std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
+      const SequenceNumber seq) {
+    unique_ptr<TransactionLogIterator> iter;
+    Status status = dbfull()->GetUpdatesSince(seq, &iter);
+    ASSERT_OK(status);
+    ASSERT_TRUE(iter->Valid());
+    return std::move(iter);
+  }
+
+  std::string DummyString(size_t len, char c = 'a') {
+    return std::string(len, c);
+  }
+
+  void VerifyIterLast(std::string expected_key) {
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), expected_key);
+    delete iter;
+  }
+};
+
// Produce the canonical fixed-width test key for index i,
// e.g. Key(7) == "key000007".
static std::string Key(int i) {
  char formatted[100];
  std::snprintf(formatted, sizeof(formatted), "key%06d", i);
  return std::string(formatted);
}
+
// A freshly created database opens successfully and reports NOT_FOUND
// for a key that was never written, under every option configuration.
TEST(DBTest, Empty) {
  do {
    ASSERT_TRUE(db_ != nullptr);
    ASSERT_EQ("NOT_FOUND", Get("foo"));
  } while (ChangeOptions());
}
+
// Basic Put/Get: the latest write to a key wins and unrelated keys are
// unaffected, under every option configuration.
TEST(DBTest, ReadWrite) {
  do {
    ASSERT_OK(Put("foo", "v1"));
    ASSERT_EQ("v1", Get("foo"));
    ASSERT_OK(Put("bar", "v2"));
    ASSERT_OK(Put("foo", "v3"));
    ASSERT_EQ("v3", Get("foo"));
    ASSERT_EQ("v2", Get("bar"));
  } while (ChangeOptions());
}
+
+// Make sure that when options.block_cache is set, after a new table is
+// created its index/filter blocks are added to block cache.
+TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
+  Options options = CurrentOptions();
+  std::unique_ptr<const FilterPolicy> filter_policy(NewBloomFilterPolicy(20));
+  options.filter_policy = filter_policy.get();
+  options.create_if_missing = true;
+  options.statistics = rocksdb::CreateDBStatistics();
+  DestroyAndReopen(&options);
+
+  ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+  // Create a new talbe.
+  dbfull()->Flush(FlushOptions());
+
+  // index/filter blocks added to block cache right after table creation.
+  ASSERT_EQ(1,
+            options.statistics.get()->getTickerCount(BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(1,
+            options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(2, /* only index/filter were added */
+            options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD));
+  ASSERT_EQ(0,
+            options.statistics.get()->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+  // Make sure filter block is in cache.
+  std::string value;
+  ReadOptions ropt;
+  db_->KeyMayExist(ReadOptions(), "key", &value);
+
+  // Miss count should remain the same.
+  ASSERT_EQ(1,
+            options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(1,
+            options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT));
+
+  db_->KeyMayExist(ReadOptions(), "key", &value);
+  ASSERT_EQ(1,
+            options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(2,
+            options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT));
+
+  // Make sure index block is in cache.
+  auto index_block_hit =
+    options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT);
+  value = Get("key");
+  ASSERT_EQ(1,
+            options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(index_block_hit + 1,
+            options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT));
+
+  value = Get("key");
+  ASSERT_EQ(1,
+            options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(index_block_hit + 2,
+            options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT));
+}
+
// Reopening a database with fewer levels than it already uses must fail
// with InvalidArgument; raising num_levels again must succeed.
TEST(DBTest, LevelLimitReopen) {
  Options options = CurrentOptions();
  Reopen(&options);

  // Write 1MB values until data reaches level 2.
  const std::string value(1024 * 1024, ' ');
  int i = 0;
  while (NumTableFilesAtLevel(2) == 0) {
    ASSERT_OK(Put(Key(i++), value));
  }

  options.num_levels = 1;
  options.max_bytes_for_level_multiplier_additional.resize(1, 1);
  Status s = TryReopen(&options);
  ASSERT_EQ(s.IsInvalidArgument(), true);
  ASSERT_EQ(s.ToString(),
            "Invalid argument: db has more levels than options.num_levels");

  options.num_levels = 10;
  options.max_bytes_for_level_multiplier_additional.resize(10, 1);
  ASSERT_OK(TryReopen(&options));
}
+
// WritableFile preallocation: blocks are allocated lazily as data is
// appended, in units of the configured preallocation block size.
TEST(DBTest, Preallocation) {
  const std::string src = dbname_ + "/alloc_test";
  unique_ptr<WritableFile> srcfile;
  const EnvOptions soptions;
  ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions));
  srcfile->SetPreallocationBlockSize(1024 * 1024);

  // No writes should mean no preallocation
  size_t block_size, last_allocated_block;
  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
  ASSERT_EQ(last_allocated_block, 0UL);

  // Small write should preallocate one block
  srcfile->Append("test");
  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
  ASSERT_EQ(last_allocated_block, 1UL);

  // Write an entire preallocation block, make sure we increased by two.
  std::string buf(block_size, ' ');
  srcfile->Append(buf);
  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
  ASSERT_EQ(last_allocated_block, 2UL);

  // Write five more blocks at once, ensure we're where we need to be.
  buf = std::string(block_size * 5, ' ');
  srcfile->Append(buf);
  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
  ASSERT_EQ(last_allocated_block, 7UL);
}
+
// Put, overwrite, then Delete: the deleted key must read as NOT_FOUND,
// under every option configuration.
TEST(DBTest, PutDeleteGet) {
  do {
    ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
    ASSERT_EQ("v1", Get("foo"));
    ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
    ASSERT_EQ("v2", Get("foo"));
    ASSERT_OK(db_->Delete(WriteOptions(), "foo"));
    ASSERT_EQ("NOT_FOUND", Get("foo"));
  } while (ChangeOptions());
}
+
+
// A value must remain readable while its memtable has been switched to
// immutable but its flush to an sstable is stalled (sync is blocked via
// SpecialEnv::delay_sstable_sync_).
TEST(DBTest, GetFromImmutableLayer) {
  do {
    Options options = CurrentOptions();
    options.env = env_;
    options.write_buffer_size = 100000;  // Small write buffer
    Reopen(&options);

    ASSERT_OK(Put("foo", "v1"));
    ASSERT_EQ("v1", Get("foo"));

    env_->delay_sstable_sync_.Release_Store(env_);   // Block sync calls
    Put("k1", std::string(100000, 'x'));             // Fill memtable
    Put("k2", std::string(100000, 'y'));             // Trigger compaction
    ASSERT_EQ("v1", Get("foo"));
    env_->delay_sstable_sync_.Release_Store(nullptr);   // Release sync calls
  } while (ChangeOptions());
}
+
// After an explicit memtable flush, the value must be served from the
// resulting sstable (i.e. from a Version, not the memtable).
TEST(DBTest, GetFromVersions) {
  do {
    ASSERT_OK(Put("foo", "v1"));
    dbfull()->TEST_FlushMemTable();
    ASSERT_EQ("v1", Get("foo"));
  } while (ChangeOptions());
}
+
// Reads through a snapshot must see the pre-snapshot value both before
// and after the newer value has been flushed to an sstable.
TEST(DBTest, GetSnapshot) {
  do {
    // Try with both a short key and a long key
    for (int i = 0; i < 2; i++) {
      std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
      ASSERT_OK(Put(key, "v1"));
      const Snapshot* s1 = db_->GetSnapshot();
      ASSERT_OK(Put(key, "v2"));
      ASSERT_EQ("v2", Get(key));
      ASSERT_EQ("v1", Get(key, s1));
      dbfull()->TEST_FlushMemTable();
      ASSERT_EQ("v2", Get(key));
      ASSERT_EQ("v1", Get(key, s1));
      db_->ReleaseSnapshot(s1);
    }
  } while (ChangeOptions());
}
+
TEST(DBTest, GetLevel0Ordering) {
  do {
    // Check that we process level-0 files in correct order.  The code
    // below generates two level-0 files where the earlier one comes
    // before the later one in the level-0 file list since the earlier
    // one has a smaller "smallest" key.
    ASSERT_OK(Put("bar", "b"));
    ASSERT_OK(Put("foo", "v1"));
    dbfull()->TEST_FlushMemTable();
    ASSERT_OK(Put("foo", "v2"));
    dbfull()->TEST_FlushMemTable();
    // The newer file must win even though it sorts later in the list.
    ASSERT_EQ("v2", Get("foo"));
  } while (ChangeOptions());
}
+
// A fresher value in the memtable (and later in a newer file) must
// shadow an older value that was compacted into a deeper level.
TEST(DBTest, GetOrderedByLevels) {
  do {
    ASSERT_OK(Put("foo", "v1"));
    Compact("a", "z");
    ASSERT_EQ("v1", Get("foo"));
    ASSERT_OK(Put("foo", "v2"));
    ASSERT_EQ("v2", Get("foo"));
    dbfull()->TEST_FlushMemTable();
    ASSERT_EQ("v2", Get("foo"));
  } while (ChangeOptions());
}
+
// With several disjoint files in a non-level-0 level, Get must select
// the file whose key range covers the requested key.
TEST(DBTest, GetPicksCorrectFile) {
  do {
    // Arrange to have multiple files in a non-level-0 level.
    ASSERT_OK(Put("a", "va"));
    Compact("a", "b");
    ASSERT_OK(Put("x", "vx"));
    Compact("x", "y");
    ASSERT_OK(Put("f", "vf"));
    Compact("f", "g");
    ASSERT_EQ("va", Get("a"));
    ASSERT_EQ("vf", Get("f"));
    ASSERT_EQ("vx", Get("x"));
  } while (ChangeOptions());
}
+
// Regression test: a read-triggered compaction of a level-0 file must
// be attributed to level 0 even when level 1 is empty.
TEST(DBTest, GetEncountersEmptyLevel) {
  do {
    // Arrange for the following to happen:
    //   * sstable A in level 0
    //   * nothing in level 1
    //   * sstable B in level 2
    // Then do enough Get() calls to arrange for an automatic compaction
    // of sstable A.  A bug would cause the compaction to be marked as
    // occurring at level 1 (instead of the correct level 0).

    // Step 1: First place sstables in levels 0 and 2
    int compaction_count = 0;
    while (NumTableFilesAtLevel(0) == 0 ||
           NumTableFilesAtLevel(2) == 0) {
      ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2";
      compaction_count++;
      Put("a", "begin");
      Put("z", "end");
      dbfull()->TEST_FlushMemTable();
    }

    // Step 2: clear level 1 if necessary.
    dbfull()->TEST_CompactRange(1, nullptr, nullptr);
    ASSERT_EQ(NumTableFilesAtLevel(0), 1);
    ASSERT_EQ(NumTableFilesAtLevel(1), 0);
    ASSERT_EQ(NumTableFilesAtLevel(2), 1);

    // Step 3: read a bunch of times
    for (int i = 0; i < 1000; i++) {
      ASSERT_EQ("NOT_FOUND", Get("missing"));
    }

    // Step 4: Wait for compaction to finish
    env_->SleepForMicroseconds(1000000);

    // The level-0 file must still be intact (it was compacted at the
    // right level rather than being mis-attributed).
    ASSERT_EQ(NumTableFilesAtLevel(0), 1); // XXX
  } while (ChangeOptions(kSkipUniversalCompaction));
}
+
// KeyMayExist can lead to a few false positives, but not false negatives.
// To make the test deterministic, use a much larger number of bits per key
// (20) than there are bits in the key, so that false positives are eliminated.
TEST(DBTest, KeyMayExist) {
  do {
    ReadOptions ropts;
    std::string value;
    Options options = CurrentOptions();
    options.filter_policy = NewBloomFilterPolicy(20);
    options.statistics = rocksdb::CreateDBStatistics();
    Reopen(&options);

    // Never-written key: must report false.
    ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value));

    ASSERT_OK(db_->Put(WriteOptions(), "a", "b"));
    bool value_found = false;
    ASSERT_TRUE(db_->KeyMayExist(ropts, "a", &value, &value_found));
    ASSERT_TRUE(value_found);
    ASSERT_EQ("b", value);

    dbfull()->Flush(FlushOptions());
    value.clear();

    // After flush, KeyMayExist must answer from the bloom filter and
    // cache only: file-open and block-cache-add tickers stay flat.
    long numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS);
    long cache_added =
      options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD);
    ASSERT_TRUE(db_->KeyMayExist(ropts, "a", &value, &value_found));
    ASSERT_TRUE(!value_found);
    // assert that no new files were opened and no new blocks were
    // read into block cache.
    ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS));
    ASSERT_EQ(cache_added,
              options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD));

    ASSERT_OK(db_->Delete(WriteOptions(), "a"));

    // Deleted key (delete still in memtable): false, and still no I/O.
    numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS);
    cache_added =
      options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD);
    ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value));
    ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS));
    ASSERT_EQ(cache_added,
              options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD));

    dbfull()->Flush(FlushOptions());
    dbfull()->CompactRange(nullptr, nullptr);

    // After the delete has been compacted away: still false, still no
    // new I/O.
    numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS);
    cache_added =
      options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD);
    ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value));
    ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS));
    ASSERT_EQ(cache_added,
              options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD));

    ASSERT_OK(db_->Delete(WriteOptions(), "c"));

    // Delete of a never-written key: also no false positive, no I/O.
    numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS);
    cache_added =
      options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD);
    ASSERT_TRUE(!db_->KeyMayExist(ropts, "c", &value));
    ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS));
    ASSERT_EQ(cache_added,
              options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD));

    delete options.filter_policy;
  } while (ChangeOptions());
}
+
// A kBlockCacheTier (non-blocking) iterator must serve data that is in
// the memtable or block cache, report Incomplete for data that would
// require storage I/O, and never open files or populate the cache.
TEST(DBTest, NonBlockingIteration) {
  do {
    ReadOptions non_blocking_opts, regular_opts;
    Options options = CurrentOptions();
    options.statistics = rocksdb::CreateDBStatistics();
    non_blocking_opts.read_tier = kBlockCacheTier;
    Reopen(&options);
    // write one kv to the database.
    ASSERT_OK(db_->Put(WriteOptions(), "a", "b"));

    // scan using non-blocking iterator. We should find it because
    // it is in memtable.
    Iterator* iter = db_->NewIterator(non_blocking_opts);
    int count = 0;
    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
      ASSERT_OK(iter->status());
      count++;
    }
    ASSERT_EQ(count, 1);
    delete iter;

    // flush memtable to storage. Now, the key should not be in the
    // memtable neither in the block cache.
    dbfull()->Flush(FlushOptions());

    // verify that a non-blocking iterator does not find any
    // kvs. Neither does it do any IOs to storage.
    long numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS);
    long cache_added =
      options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD);
    iter = db_->NewIterator(non_blocking_opts);
    count = 0;
    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
      count++;
    }
    ASSERT_EQ(count, 0);
    ASSERT_TRUE(iter->status().IsIncomplete());
    ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS));
    ASSERT_EQ(cache_added,
              options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD));
    delete iter;

    // read in the specified block via a regular get
    ASSERT_EQ(Get("a"), "b");

    // verify that we can find it via a non-blocking scan
    numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS);
    cache_added =
      options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD);
    iter = db_->NewIterator(non_blocking_opts);
    count = 0;
    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
      ASSERT_OK(iter->status());
      count++;
    }
    ASSERT_EQ(count, 1);
    ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS));
    ASSERT_EQ(cache_added,
              options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD));
    delete iter;

  } while (ChangeOptions());
}
+
+// A delete is skipped for key if KeyMayExist(key) returns False
+// Tests Writebatch consistency and proper delete behaviour
// With filter_deletes enabled, a Delete is dropped from the write batch
// when the bloom filter says the key cannot exist; AllEntriesFor shows
// whether the tombstone was actually written.
TEST(DBTest, FilterDeletes) {
  do {
    Options options = CurrentOptions();
    options.filter_policy = NewBloomFilterPolicy(20);
    options.filter_deletes = true;
    Reopen(&options);
    WriteBatch batch;

    batch.Delete("a");
    dbfull()->Write(WriteOptions(), &batch);
    ASSERT_EQ(AllEntriesFor("a"), "[ ]"); // Delete skipped
    batch.Clear();

    batch.Put("a", "b");
    batch.Delete("a");
    dbfull()->Write(WriteOptions(), &batch);
    ASSERT_EQ(Get("a"), "NOT_FOUND");
    ASSERT_EQ(AllEntriesFor("a"), "[ DEL, b ]"); // Delete issued
    batch.Clear();

    batch.Delete("c");
    batch.Put("c", "d");
    dbfull()->Write(WriteOptions(), &batch);
    ASSERT_EQ(Get("c"), "d");
    ASSERT_EQ(AllEntriesFor("c"), "[ d ]"); // Delete skipped
    batch.Clear();

    dbfull()->Flush(FlushOptions()); // A stray Flush

    batch.Delete("c");
    dbfull()->Write(WriteOptions(), &batch);
    ASSERT_EQ(AllEntriesFor("c"), "[ DEL, d ]"); // Delete issued
    batch.Clear();

    delete options.filter_policy;
  } while (ChangeCompactOptions());
}
+
// An iterator over an empty database is invalid after every kind of
// positioning call.
TEST(DBTest, IterEmpty) {
  do {
    Iterator* iter = db_->NewIterator(ReadOptions());

    iter->SeekToFirst();
    ASSERT_EQ(IterStatus(iter), "(invalid)");

    iter->SeekToLast();
    ASSERT_EQ(IterStatus(iter), "(invalid)");

    iter->Seek("foo");
    ASSERT_EQ(IterStatus(iter), "(invalid)");

    delete iter;
  } while (ChangeCompactOptions());
}
+
// Iterator over a single-entry database: every positioning call either
// lands on the one entry or goes invalid, and stepping off either end
// invalidates the iterator.
TEST(DBTest, IterSingle) {
  do {
    ASSERT_OK(Put("a", "va"));
    Iterator* iter = db_->NewIterator(ReadOptions());

    iter->SeekToFirst();
    ASSERT_EQ(IterStatus(iter), "a->va");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "(invalid)");
    iter->SeekToFirst();
    ASSERT_EQ(IterStatus(iter), "a->va");
    iter->Prev();
    ASSERT_EQ(IterStatus(iter), "(invalid)");

    iter->SeekToLast();
    ASSERT_EQ(IterStatus(iter), "a->va");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "(invalid)");
    iter->SeekToLast();
    ASSERT_EQ(IterStatus(iter), "a->va");
    iter->Prev();
    ASSERT_EQ(IterStatus(iter), "(invalid)");

    // Seek targets before, at, and after the only key.
    iter->Seek("");
    ASSERT_EQ(IterStatus(iter), "a->va");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "(invalid)");

    iter->Seek("a");
    ASSERT_EQ(IterStatus(iter), "a->va");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "(invalid)");

    iter->Seek("b");
    ASSERT_EQ(IterStatus(iter), "(invalid)");

    delete iter;
  } while (ChangeCompactOptions());
}
+
// Iterator over three entries: forward/backward traversal, Seek
// landing rules, direction switches, and snapshot isolation (an open
// iterator must not observe writes made after its creation).
TEST(DBTest, IterMulti) {
  do {
    ASSERT_OK(Put("a", "va"));
    ASSERT_OK(Put("b", "vb"));
    ASSERT_OK(Put("c", "vc"));
    Iterator* iter = db_->NewIterator(ReadOptions());

    iter->SeekToFirst();
    ASSERT_EQ(IterStatus(iter), "a->va");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "b->vb");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "c->vc");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "(invalid)");
    iter->SeekToFirst();
    ASSERT_EQ(IterStatus(iter), "a->va");
    iter->Prev();
    ASSERT_EQ(IterStatus(iter), "(invalid)");

    iter->SeekToLast();
    ASSERT_EQ(IterStatus(iter), "c->vc");
    iter->Prev();
    ASSERT_EQ(IterStatus(iter), "b->vb");
    iter->Prev();
    ASSERT_EQ(IterStatus(iter), "a->va");
    iter->Prev();
    ASSERT_EQ(IterStatus(iter), "(invalid)");
    iter->SeekToLast();
    ASSERT_EQ(IterStatus(iter), "c->vc");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "(invalid)");

    // Seek lands on the first entry >= target.
    iter->Seek("");
    ASSERT_EQ(IterStatus(iter), "a->va");
    iter->Seek("a");
    ASSERT_EQ(IterStatus(iter), "a->va");
    iter->Seek("ax");
    ASSERT_EQ(IterStatus(iter), "b->vb");
    iter->Seek("b");
    ASSERT_EQ(IterStatus(iter), "b->vb");
    iter->Seek("z");
    ASSERT_EQ(IterStatus(iter), "(invalid)");

    // Switch from reverse to forward
    iter->SeekToLast();
    iter->Prev();
    iter->Prev();
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "b->vb");

    // Switch from forward to reverse
    iter->SeekToFirst();
    iter->Next();
    iter->Next();
    iter->Prev();
    ASSERT_EQ(IterStatus(iter), "b->vb");

    // Make sure iter stays at snapshot
    ASSERT_OK(Put("a",  "va2"));
    ASSERT_OK(Put("a2", "va3"));
    ASSERT_OK(Put("b",  "vb2"));
    ASSERT_OK(Put("c",  "vc2"));
    ASSERT_OK(Delete("b"));
    iter->SeekToFirst();
    ASSERT_EQ(IterStatus(iter), "a->va");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "b->vb");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "c->vc");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "(invalid)");
    iter->SeekToLast();
    ASSERT_EQ(IterStatus(iter), "c->vc");
    iter->Prev();
    ASSERT_EQ(IterStatus(iter), "b->vb");
    iter->Prev();
    ASSERT_EQ(IterStatus(iter), "a->va");
    iter->Prev();
    ASSERT_EQ(IterStatus(iter), "(invalid)");

    delete iter;
  } while (ChangeCompactOptions());
}
+
+// Check that we can skip over a run of user keys
+// by using reseek rather than sequential scan
+TEST(DBTest, IterReseek) {
+  Options options = CurrentOptions();
+  // With the threshold set to 3, the iterator may skip up to 3 obsolete
+  // versions of a user key sequentially; one more version forces a reseek,
+  // counted by the NUMBER_OF_RESEEKS_IN_ITERATION ticker.
+  options.max_sequential_skip_in_iterations = 3;
+  options.create_if_missing = true;
+  options.statistics = rocksdb::CreateDBStatistics();
+  DestroyAndReopen(&options);
+
+  // insert two keys with same userkey and verify that
+  // reseek is not invoked. For each of these test cases,
+  // verify that we can find the next key "b".
+  ASSERT_OK(Put("a",  "one"));
+  ASSERT_OK(Put("a",  "two"));
+  ASSERT_OK(Put("b",  "bone"));
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  iter->SeekToFirst();
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "a->two");
+  iter->Next();
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // insert a total of three keys with same userkey and verify
+  // that reseek is still not invoked.
+  ASSERT_OK(Put("a",  "three"));
+  iter = db_->NewIterator(ReadOptions());
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->three");
+  iter->Next();
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // insert a total of four keys with same userkey and verify
+  // that reseek is invoked.
+  ASSERT_OK(Put("a",  "four"));
+  iter = db_->NewIterator(ReadOptions());
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  iter->Next();
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // Testing reverse iterator
+  // At this point, we have three versions of "a" and one version of "b".
+  // The reseek statistics is already at 1.
+  int num_reseeks = (int)options.statistics.get()->getTickerCount(
+                 NUMBER_OF_RESEEKS_IN_ITERATION);
+
+  // Insert another version of b and assert that reseek is not invoked
+  ASSERT_OK(Put("b",  "btwo"));
+  iter = db_->NewIterator(ReadOptions());
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "b->btwo");
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks);
+  iter->Prev();
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks+1);
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  delete iter;
+
+  // insert two more versions of b. This makes a total of 4 versions
+  // of b and 4 versions of a.
+  ASSERT_OK(Put("b",  "bthree"));
+  ASSERT_OK(Put("b",  "bfour"));
+  iter = db_->NewIterator(ReadOptions());
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "b->bfour");
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 2);
+  iter->Prev();
+
+  // the previous Prev call should have invoked reseek
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 3);
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  delete iter;
+}
+
+// Interleave tiny values with 100KB values and verify both forward and
+// backward full scans return every entry intact.
+TEST(DBTest, IterSmallAndLargeMix) {
+  do {
+    ASSERT_OK(Put("a", "va"));
+    ASSERT_OK(Put("b", std::string(100000, 'b')));
+    ASSERT_OK(Put("c", "vc"));
+    ASSERT_OK(Put("d", std::string(100000, 'd')));
+    ASSERT_OK(Put("e", std::string(100000, 'e')));
+
+    Iterator* iter = db_->NewIterator(ReadOptions());
+
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+// A deleted key must be invisible to Get and skipped by backward iteration:
+// Prev() from "c" should land on "a", not the deleted "b".
+TEST(DBTest, IterMultiWithDelete) {
+  do {
+    ASSERT_OK(Put("a", "va"));
+    ASSERT_OK(Put("b", "vb"));
+    ASSERT_OK(Put("c", "vc"));
+    ASSERT_OK(Delete("b"));
+    ASSERT_EQ("NOT_FOUND", Get("b"));
+
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    iter->Seek("c");
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    if (!CurrentOptions().merge_operator) {
+      // TODO: merge operator does not support backward iteration yet
+      iter->Prev();
+      ASSERT_EQ(IterStatus(iter), "a->va");
+    }
+    delete iter;
+  } while (ChangeOptions());
+}
+
+// Repeatedly delete the current last key and verify SeekToLast (via
+// VerifyIterLast) steps back correctly even when many obsolete versions
+// of each key must be skipped during reverse iteration.
+TEST(DBTest, IterPrevMaxSkip) {
+  do {
+    // Two rounds of Puts give every key two versions.
+    for (int i = 0; i < 2; i++) {
+      db_->Put(WriteOptions(), "key1", "v1");
+      db_->Put(WriteOptions(), "key2", "v2");
+      db_->Put(WriteOptions(), "key3", "v3");
+      db_->Put(WriteOptions(), "key4", "v4");
+      db_->Put(WriteOptions(), "key5", "v5");
+    }
+
+    VerifyIterLast("key5->v5");
+
+    ASSERT_OK(db_->Delete(WriteOptions(), "key5"));
+    VerifyIterLast("key4->v4");
+
+    ASSERT_OK(db_->Delete(WriteOptions(), "key4"));
+    VerifyIterLast("key3->v3");
+
+    ASSERT_OK(db_->Delete(WriteOptions(), "key3"));
+    VerifyIterLast("key2->v2");
+
+    ASSERT_OK(db_->Delete(WriteOptions(), "key2"));
+    VerifyIterLast("key1->v1");
+
+    ASSERT_OK(db_->Delete(WriteOptions(), "key1"));
+    VerifyIterLast("(invalid)");
+  } while (ChangeOptions(kSkipMergePut));
+}
+
+// An iterator created with an explicit snapshot must not see keys written
+// after the snapshot was taken ("key100"/"key101"), in either direction.
+TEST(DBTest, IterWithSnapshot) {
+  do {
+    ASSERT_OK(Put("key1", "val1"));
+    ASSERT_OK(Put("key2", "val2"));
+    ASSERT_OK(Put("key3", "val3"));
+    ASSERT_OK(Put("key4", "val4"));
+    ASSERT_OK(Put("key5", "val5"));
+
+    const Snapshot *snapshot = db_->GetSnapshot();
+    ReadOptions options;
+    options.snapshot = snapshot;
+    Iterator* iter = db_->NewIterator(options);
+
+    // Put more values after the snapshot
+    ASSERT_OK(Put("key100", "val100"));
+    ASSERT_OK(Put("key101", "val101"));
+
+    iter->Seek("key5");
+    ASSERT_EQ(IterStatus(iter), "key5->val5");
+    if (!CurrentOptions().merge_operator) {
+      // TODO: merge operator does not support backward iteration yet
+      iter->Prev();
+      ASSERT_EQ(IterStatus(iter), "key4->val4");
+      iter->Prev();
+      ASSERT_EQ(IterStatus(iter), "key3->val3");
+
+      iter->Next();
+      ASSERT_EQ(IterStatus(iter), "key4->val4");
+      iter->Next();
+      ASSERT_EQ(IterStatus(iter), "key5->val5");
+      iter->Next();
+      // "key100"/"key101" are newer than the snapshot, so iteration ends here.
+      ASSERT_TRUE(!iter->Valid());
+    }
+    db_->ReleaseSnapshot(snapshot);
+    delete iter;
+  } while (ChangeOptions());
+}
+
+// Basic crash-recovery smoke test: values written before a Reopen must
+// survive it, and overwrites across reopens must win.
+TEST(DBTest, Recover) {
+  do {
+    ASSERT_OK(Put("foo", "v1"));
+    ASSERT_OK(Put("baz", "v5"));
+
+    Reopen();
+    ASSERT_EQ("v1", Get("foo"));
+
+    ASSERT_EQ("v1", Get("foo"));
+    ASSERT_EQ("v5", Get("baz"));
+    ASSERT_OK(Put("bar", "v2"));
+    ASSERT_OK(Put("foo", "v3"));
+
+    Reopen();
+    ASSERT_EQ("v3", Get("foo"));
+    ASSERT_OK(Put("foo", "v4"));
+    ASSERT_EQ("v4", Get("foo"));
+    ASSERT_EQ("v2", Get("bar"));
+    ASSERT_EQ("v5", Get("baz"));
+  } while (ChangeOptions());
+}
+
+// Repeated reopens roll the write-ahead log many times; the DB must stay
+// openable and writable throughout.
+TEST(DBTest, RollLog) {
+  do {
+    ASSERT_OK(Put("foo", "v1"));
+    ASSERT_OK(Put("baz", "v5"));
+
+    Reopen();
+    for (int i = 0; i < 10; i++) {
+      Reopen();
+    }
+    ASSERT_OK(Put("foo", "v4"));
+    for (int i = 0; i < 10; i++) {
+      Reopen();
+    }
+  } while (ChangeOptions());
+}
+
+// Mix writes with disableWAL on and off across reopens. Within a run the
+// Reopen flushes the memtable, so even WAL-less writes survive here; the
+// point is that toggling disableWAL per-write never loses acknowledged data.
+TEST(DBTest, WAL) {
+  do {
+    Options options = CurrentOptions();
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1"));
+    ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1"));
+
+    Reopen();
+    ASSERT_EQ("v1", Get("foo"));
+    ASSERT_EQ("v1", Get("bar"));
+
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v2"));
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v2"));
+
+    Reopen();
+    // Both values should be present.
+    ASSERT_EQ("v2", Get("bar"));
+    ASSERT_EQ("v2", Get("foo"));
+
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3"));
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v3"));
+
+    Reopen();
+    // again both values should be present.
+    ASSERT_EQ("v3", Get("foo"));
+    ASSERT_EQ("v3", Get("bar"));
+  } while (ChangeCompactOptions());
+}
+
+// The DB directory lock must prevent a second process/handle from opening
+// the same database while it is already open.
+TEST(DBTest, CheckLock) {
+  do {
+    DB* localdb;
+    Options options = CurrentOptions();
+    ASSERT_OK(TryReopen(&options));
+
+    // second open should fail
+    ASSERT_TRUE(!(PureReopen(&options, &localdb)).ok());
+  } while (ChangeCompactOptions());
+}
+
+// With up to 4 write buffers and a merge threshold of 3, explicit Flush
+// calls must work while multiple (immutable) memtables coexist, and reads
+// must still see data sitting in any of them.
+TEST(DBTest, FlushMultipleMemtable) {
+  do {
+    Options options = CurrentOptions();
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    options.max_write_buffer_number = 4;
+    options.min_write_buffer_number_to_merge = 3;
+    Reopen(&options);
+    ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1"));
+    dbfull()->Flush(FlushOptions());
+    ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1"));
+
+    ASSERT_EQ("v1", Get("foo"));
+    ASSERT_EQ("v1", Get("bar"));
+    dbfull()->Flush(FlushOptions());
+  } while (ChangeCompactOptions());
+}
+
+// The "rocksdb.num-immutable-mem-table" property must track how many full
+// memtables are waiting to be flushed: each ~1MB value fills the 1MB write
+// buffer and seals another memtable, and an explicit Flush drains them all.
+TEST(DBTest, NumImmutableMemTable) {
+  do {
+    Options options = CurrentOptions();
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    options.max_write_buffer_number = 4;
+    options.min_write_buffer_number_to_merge = 3;
+    options.write_buffer_size = 1000000;
+    Reopen(&options);
+
+    std::string big_value(1000000, 'x');
+    std::string num;
+
+    ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "0");
+
+    ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "1");
+
+    ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "2");
+
+    dbfull()->Flush(FlushOptions());
+    ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "0");
+  } while (ChangeCompactOptions());
+}
+
+// Explicit Flush must persist WAL-disabled writes: data flushed to an SST
+// before each Reopen survives even though it never hit the log.
+TEST(DBTest, FLUSH) {
+  do {
+    Options options = CurrentOptions();
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1"));
+    // Flush the WAL-less 'foo' write out of the memtable into an SST.
+    dbfull()->Flush(FlushOptions());
+    ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1"));
+
+    Reopen();
+    ASSERT_EQ("v1", Get("foo"));
+    ASSERT_EQ("v1", Get("bar"));
+
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v2"));
+    ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v2"));
+    dbfull()->Flush(FlushOptions());
+
+    Reopen();
+    ASSERT_EQ("v2", Get("bar"));
+    ASSERT_EQ("v2", Get("foo"));
+
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3"));
+    ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v3"));
+    dbfull()->Flush(FlushOptions());
+
+    Reopen();
+    // 'foo' should be there because its put
+    // has WAL enabled.
+    ASSERT_EQ("v3", Get("foo"));
+    ASSERT_EQ("v3", Get("bar"));
+  } while (ChangeCompactOptions());
+}
+
+// Reopening twice in a row leaves an empty log file behind; recovery must
+// handle that empty log and still apply subsequent writes correctly.
+TEST(DBTest, RecoveryWithEmptyLog) {
+  do {
+    ASSERT_OK(Put("foo", "v1"));
+    ASSERT_OK(Put("foo", "v2"));
+    Reopen();
+    Reopen();
+    ASSERT_OK(Put("foo", "v3"));
+    Reopen();
+    ASSERT_EQ("v3", Get("foo"));
+  } while (ChangeOptions());
+}
+
+// Check that writes done during a memtable compaction are recovered
+// if the database is shutdown during the memtable compaction.
+TEST(DBTest, RecoverDuringMemtableCompaction) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.write_buffer_size = 1000000;
+    Reopen(&options);
+
+    // Trigger a long memtable compaction and reopen the database during it
+    ASSERT_OK(Put("foo", "v1"));                         // Goes to 1st log file
+    ASSERT_OK(Put("big1", std::string(10000000, 'x')));  // Fills memtable
+    ASSERT_OK(Put("big2", std::string(1000, 'y')));      // Triggers compaction
+    ASSERT_OK(Put("bar", "v2"));                         // Goes to new log file
+
+    // All four writes must be recovered, whether they landed in the old log,
+    // the new log, or the in-flight flush output.
+    Reopen(&options);
+    ASSERT_EQ("v1", Get("foo"));
+    ASSERT_EQ("v2", Get("bar"));
+    ASSERT_EQ(std::string(10000000, 'x'), Get("big1"));
+    ASSERT_EQ(std::string(1000, 'y'), Get("big2"));
+  } while (ChangeOptions());
+}
+
+// With a tiny 10KB write buffer, inserting 500 ~1KB values must produce
+// multiple table files (i.e. minor compactions/flushes actually happen),
+// and every value must remain readable before and after a Reopen.
+TEST(DBTest, MinorCompactionsHappen) {
+  do {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 10000;
+    Reopen(&options);
+
+    const int N = 500;
+
+    int starting_num_tables = TotalTableFiles();
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v')));
+    }
+    int ending_num_tables = TotalTableFiles();
+    ASSERT_GT(ending_num_tables, starting_num_tables);
+
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
+    }
+
+    Reopen();
+
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
+    }
+  } while (ChangeCompactOptions());
+}
+
+// With max_manifest_file_size of only 10 bytes, every LogAndApply must roll
+// to a new MANIFEST; verify the file number advances on flush and reopen,
+// and that data survives the rollovers.
+TEST(DBTest, ManifestRollOver) {
+  do {
+    Options options = CurrentOptions();
+    options.max_manifest_file_size = 10 ;  // 10 bytes
+    Reopen(&options);
+    {
+      ASSERT_OK(Put("manifest_key1", std::string(1000, '1')));
+      ASSERT_OK(Put("manifest_key2", std::string(1000, '2')));
+      ASSERT_OK(Put("manifest_key3", std::string(1000, '3')));
+      uint64_t manifest_before_flush =
+        dbfull()->TEST_Current_Manifest_FileNo();
+      dbfull()->Flush(FlushOptions()); // This should trigger LogAndApply.
+      uint64_t manifest_after_flush =
+        dbfull()->TEST_Current_Manifest_FileNo();
+      ASSERT_GT(manifest_after_flush, manifest_before_flush);
+      Reopen(&options);
+      ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(),
+                manifest_after_flush);
+      // check if a new manifest file got inserted or not.
+      ASSERT_EQ(std::string(1000, '1'), Get("manifest_key1"));
+      ASSERT_EQ(std::string(1000, '2'), Get("manifest_key2"));
+      ASSERT_EQ(std::string(1000, '3'), Get("manifest_key3"));
+    }
+  } while (ChangeCompactOptions());
+}
+
+// The DB identity (IDENTITY file) must be stable across reopens, but must
+// be regenerated (and therefore differ) if the IDENTITY file is deleted.
+TEST(DBTest, IdentityAcrossRestarts) {
+  do {
+    std::string id1;
+    ASSERT_OK(db_->GetDbIdentity(id1));
+
+    Options options = CurrentOptions();
+    Reopen(&options);
+    std::string id2;
+    ASSERT_OK(db_->GetDbIdentity(id2));
+    // id1 should match id2 because identity was not regenerated
+    ASSERT_EQ(id1.compare(id2), 0);
+
+    std::string idfilename = IdentityFileName(dbname_);
+    ASSERT_OK(env_->DeleteFile(idfilename));
+    Reopen(&options);
+    std::string id3;
+    ASSERT_OK(db_->GetDbIdentity(id3));
+    // id1 should NOT match id3 because identity was regenerated
+    ASSERT_NE(id1.compare(id3), 0);
+  } while (ChangeCompactOptions());
+}
+
+// Recovery of a large WAL with a small write buffer must spill into several
+// level-0 table files mid-recovery instead of building one giant memtable.
+TEST(DBTest, RecoverWithLargeLog) {
+  do {
+    {
+      Options options = CurrentOptions();
+      Reopen(&options);
+      ASSERT_OK(Put("big1", std::string(200000, '1')));
+      ASSERT_OK(Put("big2", std::string(200000, '2')));
+      ASSERT_OK(Put("small3", std::string(10, '3')));
+      ASSERT_OK(Put("small4", std::string(10, '4')));
+      ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+    }
+
+    // Make sure that if we re-open with a small write buffer size that
+    // we flush table files in the middle of a large log file.
+    Options options = CurrentOptions();
+    options.write_buffer_size = 100000;
+    Reopen(&options);
+    ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+    ASSERT_EQ(std::string(200000, '1'), Get("big1"));
+    ASSERT_EQ(std::string(200000, '2'), Get("big2"));
+    ASSERT_EQ(std::string(10, '3'), Get("small3"));
+    ASSERT_EQ(std::string(10, '4'), Get("small4"));
+    // NOTE(review): redundant with the ASSERT_EQ(..., 3) above — kept as a
+    // belt-and-braces check that more than one file was produced.
+    ASSERT_GT(NumTableFilesAtLevel(0), 1);
+  } while (ChangeCompactOptions());
+}
+
+// Compacting 8MB of level-0 data must split the output into multiple
+// level-1 files, and all values must survive the compaction.
+TEST(DBTest, CompactionsGenerateMultipleFiles) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000000;        // Large write buffer
+  Reopen(&options);
+
+  Random rnd(301);
+
+  // Write 8MB (80 values, each 100K)
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  std::vector<std::string> values;
+  for (int i = 0; i < 80; i++) {
+    values.push_back(RandomString(&rnd, 100000));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+
+  // Reopening moves updates to level-0
+  Reopen(&options);
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+  for (int i = 0; i < 80; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+}
+
+// Level-style compaction: accumulating level0_file_num_compaction_trigger
+// (3) files at level 0 must kick off a compaction that empties level 0
+// into a single level-1 file.
+TEST(DBTest, CompactionTrigger) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100<<10; //100KB
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  options.level0_file_num_compaction_trigger = 3;
+  Reopen(&options);
+
+  Random rnd(301);
+
+  for (int num = 0;
+       num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    std::vector<std::string> values;
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      values.push_back(RandomString(&rnd, 10000));
+      ASSERT_OK(Put(Key(i), values[i]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+  }
+
+  // Generate one more file in level-0; this should trigger level-0 compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < 12; i++) {
+    values.push_back(RandomString(&rnd, 10000));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+}
+
+// Universal compaction: walk through five stages of file accumulation and
+// verify which file counts at level 0 do / do not trigger a compaction,
+// based on the (assumed, size-1-per-flush) file sizes noted in each stage.
+TEST(DBTest, UniversalCompactionTrigger) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100<<10; //100KB
+  // trigger compaction if there are >= 4 files
+  options.level0_file_num_compaction_trigger = 4;
+  Reopen(&options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // Stage 1:
+  //   Generate a set of files at level 0, but don't trigger level-0
+  //   compaction.
+  for (int num = 0;
+       num < options.level0_file_num_compaction_trigger-1;
+       num++) {
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 12; i++) {
+    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Suppose each file flushed from mem table has size 1. Now we compact
+  // (level0_file_num_compaction_trigger+1)=4 files and should have a big
+  // file of size 4.
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  for (int i = 1; i < options.num_levels ; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i), 0);
+  }
+
+  // Stage 2:
+  //   Now we have one file at level 0, with size 4. We also have some data in
+  //   mem table. Let's continue generating new files at level 0, but don't
+  //   trigger level-0 compaction.
+  //   First, clean up memtable before inserting new data. This will generate
+  //   a level-0 file, with size around 0.4 (according to previously written
+  //   data amount).
+  dbfull()->Flush(FlushOptions());
+  for (int num = 0;
+       num < options.level0_file_num_compaction_trigger-3;
+       num++) {
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumTableFilesAtLevel(0), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 12; i++) {
+    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+  // After compaction, we should have 2 files, with size 4, 2.4.
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+  for (int i = 1; i < options.num_levels ; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i), 0);
+  }
+
+  // Stage 3:
+  //   Now we have 2 files at level 0, with size 4 and 2.4. Continue
+  //   generating new files at level 0.
+  for (int num = 0;
+       num < options.level0_file_num_compaction_trigger-3;
+       num++) {
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumTableFilesAtLevel(0), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 12; i++) {
+    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1.
+  // After compaction, we should have 3 files, with size 4, 2.4, 2.
+  ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+  for (int i = 1; i < options.num_levels ; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i), 0);
+  }
+
+  // Stage 4:
+  //   Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a
+  //   new file of size 1.
+  for (int i = 0; i < 12; i++) {
+    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Level-0 compaction is triggered, but no file will be picked up.
+  ASSERT_EQ(NumTableFilesAtLevel(0), 4);
+  for (int i = 1; i < options.num_levels ; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i), 0);
+  }
+
+  // Stage 5:
+  //   Now we have 4 files at level 0, with size 4, 2.4, 2, 1. Let's generate
+  //   a new file of size 1.
+  for (int i = 0; i < 12; i++) {
+    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // All files at level 0 will be compacted into a single one.
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  for (int i = 1; i < options.num_levels ; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i), 0);
+  }
+}
+
+// Universal compaction driven by size amplification: with the amplification
+// limit at 110%, flushing a small third file on top of two similar-sized
+// files must trigger a full rewrite into a single level-0 file.
+TEST(DBTest, UniversalCompactionSizeAmplification) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100<<10; //100KB
+  options.level0_file_num_compaction_trigger = 3;
+
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.
+    max_size_amplification_percent = 110;
+  Reopen(&options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  //   Generate two files in Level 0. Both files are approx the same size.
+  for (int num = 0;
+       num < options.level0_file_num_compaction_trigger-1;
+       num++) {
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+  }
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+
+  // Flush whatever is remaining in memtable. This is typically
+  // small, which should not trigger size ratio based compaction
+  // but will instead trigger size amplification.
+  dbfull()->Flush(FlushOptions());
+
+  dbfull()->TEST_WaitForCompact();
+
+  // Verify that size amplification did occur
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+}
+
+// Universal compaction with a single level and compression ratio checks
+// disabled (compression_size_percent = -1): reaching the 4-file trigger
+// must compact everything at level 0 into one file.
+TEST(DBTest, UniversalCompactionOptions) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100<<10; //100KB
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = 1;
+  options.compaction_options_universal.compression_size_percent = -1;
+  Reopen(&options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  for (int num = 0;
+       num < options.level0_file_num_compaction_trigger;
+       num++) {
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+
+    if (num < options.level0_file_num_compaction_trigger - 1) {
+      ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+    }
+  }
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  for (int i = 1; i < options.num_levels ; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i), 0);
+  }
+}
+
+#if defined(SNAPPY) && defined(ZLIB) && defined(BZIP2)
+TEST(DBTest, CompressedCache) {
+  int num_iter = 80;
+
+  // Run this test three iterations.
+  // Iteration 1: only an uncompressed block cache
+  // Iteration 2: only a compressed block cache
+  // Iteration 3: both block cache and compressed cache
+  for (int iter = 0; iter < 3; iter++) {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 64*1024;        // small write buffer
+    options.statistics = rocksdb::CreateDBStatistics();
+
+    switch (iter) {
+      case 0:
+        // only uncompressed block cache
+        options.block_cache = NewLRUCache(8*1024);
+        options.block_cache_compressed = nullptr;
+        break;
+      case 1:
+        // no block cache, only compressed cache
+        options.no_block_cache = true;
+        options.block_cache = nullptr;
+        options.block_cache_compressed = NewLRUCache(8*1024);
+        break;
+      case 2:
+        // both compressed and uncompressed block cache
+        options.block_cache = NewLRUCache(1024);
+        options.block_cache_compressed = NewLRUCache(8*1024);
+        break;
+      default:
+        ASSERT_TRUE(false);
+    }
+    Reopen(&options);
+
+    Random rnd(301);
+
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+    std::vector<std::string> values;
+    std::string str;
+    for (int i = 0; i < num_iter; i++) {
+      if (i % 4 == 0) {        // high compression ratio
+        str = RandomString(&rnd, 1000);
+      }
+      values.push_back(str);
+      ASSERT_OK(Put(Key(i), values[i]));
+    }
+
+    // flush all data from memtable so that reads are from block cache
+    dbfull()->Flush(FlushOptions());
+
+    for (int i = 0; i < num_iter; i++) {
+      ASSERT_EQ(Get(Key(i)), values[i]);
+    }
+
+    // check that we triggered the appropriate code paths in the cache
+    switch (iter) {
+      case 0:
+        // only uncompressed block cache
+        ASSERT_GT(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS),
+                  0);
+        ASSERT_EQ(options.statistics.get()->getTickerCount
+                  (BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 1:
+        // no block cache, only compressed cache
+        ASSERT_EQ(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS),
+                  0);
+        ASSERT_GT(options.statistics.get()->getTickerCount
+                  (BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 2:
+        // both compressed and uncompressed block cache
+        ASSERT_GT(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS),
+                  0);
+        ASSERT_GT(options.statistics.get()->getTickerCount
+                  (BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      default:
+        ASSERT_TRUE(false);
+    }
+  }
+}
+
+// Produce a pseudo-random string of the requested length that is highly
+// compressible (0.8 compression fraction), for compression-path tests.
+static std::string CompressibleString(Random* rnd, int len) {
+  std::string result;
+  test::CompressibleString(rnd, 0.8, len, &result);
+  return result;
+}
+
+// Universal compaction with compression_size_percent = 70: early, small
+// compactions stay within the 70% budget and are compressed (total size
+// well below raw size), while later compactions exceed the budget and
+// leave the newest output uncompressed (total size near raw size).
+TEST(DBTest, UniversalCompactionCompressRatio1) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100<<10; //100KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 1;
+  options.compaction_options_universal.compression_size_percent = 70;
+  Reopen(&options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // The first compaction (2) is compressed.
+  for (int num = 0; num < 2; num++) {
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_LT((int ) dbfull()->TEST_GetLevel0TotalSize(), 120000 * 2 * 0.9);
+
+  // The second compaction (4) is compressed
+  for (int num = 0; num < 2; num++) {
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_LT((int ) dbfull()->TEST_GetLevel0TotalSize(), 120000 * 4 * 0.9);
+
+  // The third compaction (2 4) is compressed since this time it is
+  // (1 1 3.2) and 3.2/5.2 doesn't reach ratio.
+  for (int num = 0; num < 2; num++) {
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_LT((int ) dbfull()->TEST_GetLevel0TotalSize(), 120000 * 6 * 0.9);
+
+  // When the compaction grows up to (2 4 8), the newest data exceeds the
+  // 70% budget, so the latest output is left uncompressed.
+  for (int num = 0; num < 8; num++) {
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_GT((int) dbfull()->TEST_GetLevel0TotalSize(),
+            120000 * 12 * 0.8 + 110000 * 2);
+}
+
+// Counterpart to CompressRatio1: with compression_size_percent = 95 even
+// the large late compactions stay within budget, so the latest output IS
+// compressed and the total level-0 size stays below the raw data size.
+TEST(DBTest, UniversalCompactionCompressRatio2) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100<<10; //100KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 1;
+  options.compaction_options_universal.compression_size_percent = 95;
+  Reopen(&options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // When the compaction grows up to (2 4 8), the latest output is still
+  // compressed, given the generous 95% size ratio to compress.
+  for (int num = 0; num < 14; num++) {
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_LT((int ) dbfull()->TEST_GetLevel0TotalSize(),
+            120000 * 12 * 0.8 + 110000 * 2);
+}
+#endif
+
+// End-to-end check of migrating a DB from level-style to universal-style
+// compaction:
+//   Stage 1: build a multi-level DB using level compaction.
+//   Stage 2: reopening directly with universal compaction must fail with
+//            InvalidArgument (files exist outside level 0).
+//   Stage 3: compact everything into a single file and move it to level 0.
+//   Stage 4: reopen with universal compaction, keep writing, and verify that
+//            every key written in either phase is still readable.
+TEST(DBTest, ConvertCompactionStyle) {
+  Random rnd(301);
+  int max_key_level_insert = 200;
+  int max_key_universal_insert = 600;
+
+  // Stage 1: generate a db with level compaction
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100<<10; //100KB
+  options.num_levels = 4;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_bytes_for_level_base = 500<<10; // 500KB
+  options.max_bytes_for_level_multiplier = 1;
+  options.target_file_size_base = 200<<10; // 200KB
+  options.target_file_size_multiplier = 1;
+  Reopen(&options);
+
+  for (int i = 0; i <= max_key_level_insert; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 10000)));
+  }
+  dbfull()->Flush(FlushOptions());
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_GT(TotalTableFiles(), 1);
+  // Level compaction must have pushed at least one file below level 0;
+  // this is what makes the Stage 2 reopen invalid.
+  int non_level0_num_files = 0;
+  for (int i = 1; i < dbfull()->NumberLevels(); i++) {
+    non_level0_num_files += NumTableFilesAtLevel(i);
+  }
+  ASSERT_GT(non_level0_num_files, 0);
+
+  // Stage 2: reopen with universal compaction - should fail
+  options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  Status s = TryReopen(&options);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  // Stage 3: compact into a single file and move the file to level 0
+  options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = INT_MAX;
+  options.target_file_size_multiplier = 1;
+  options.max_bytes_for_level_base = INT_MAX;
+  options.max_bytes_for_level_multiplier = 1;
+  Reopen(&options);
+
+  dbfull()->CompactRange(nullptr, nullptr,
+                         true /* reduce level */,
+                         0    /* reduce to level 0 */);
+
+  // After the range compaction all data must live in exactly one L0 file.
+  for (int i = 0; i < dbfull()->NumberLevels(); i++) {
+    int num = NumTableFilesAtLevel(i);
+    if (i == 0) {
+      ASSERT_EQ(num, 1);
+    } else {
+      ASSERT_EQ(num, 0);
+    }
+  }
+
+  // Stage 4: re-open in universal compaction style and do some db operations
+  options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100<<10; //100KB
+  options.level0_file_num_compaction_trigger = 3;
+  Reopen(&options);
+
+  for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 10000)));
+  }
+  dbfull()->Flush(FlushOptions());
+  dbfull()->TEST_WaitForCompact();
+
+  // Universal compaction must keep everything in level 0.
+  for (int i = 1; i < dbfull()->NumberLevels(); i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i), 0);
+  }
+
+  // verify keys inserted in both level compaction style and universal
+  // compaction style
+  std::string keys_in_db;
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    keys_in_db.append(iter->key().ToString());
+    keys_in_db.push_back(',');
+  }
+  delete iter;
+
+  std::string expected_keys;
+  for (int i = 0; i <= max_key_universal_insert; i++) {
+    expected_keys.append(Key(i));
+    expected_keys.push_back(',');
+  }
+
+  ASSERT_EQ(keys_in_db, expected_keys);
+}
+
+// Shared driver for the MinLevelToCompress tests: writes
+// level0_file_num_compaction_trigger - 1 files of ~120KB each (checking the
+// L0 file count grows by one per flush), then writes one more file, which
+// must trigger an L0 compaction leaving 0 files in L0 and 1 file in L1.
+void MinLevelHelper(DBTest* self, Options& options) {
+  Random rnd(301);
+
+  for (int num = 0;
+    num < options.level0_file_num_compaction_trigger - 1;
+    num++)
+  {
+    std::vector<std::string> values;
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      values.push_back(RandomString(&rnd, 10000));
+      ASSERT_OK(self->Put(Key(i), values[i]));
+    }
+    self->dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1);
+  }
+
+  //generate one more file in level-0, and should trigger level-0 compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < 12; i++) {
+    values.push_back(RandomString(&rnd, 10000));
+    ASSERT_OK(self->Put(Key(i), values[i]));
+  }
+  self->dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(self->NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(self->NumTableFilesAtLevel(1), 1);
+}
+
+// Configures `options` for the MinLevelToCompress tests: a small write
+// buffer, 3 levels, L0 trigger of 3, and per-level compression that leaves
+// level 0 uncompressed while compressing all deeper levels. The first
+// available compression library (snappy, then zlib, then bzip2) is selected
+// and reported through `type`; wbits/lev/strategy are forwarded as
+// CompressionOptions when probing support.
+// returns false if the calling-Test should be skipped
+bool MinLevelToCompress(CompressionType& type, Options& options, int wbits,
+                        int lev, int strategy) {
+  fprintf(stderr, "Test with compression options : window_bits = %d, level =  %d, strategy = %d}\n", wbits, lev, strategy);
+  options.write_buffer_size = 100<<10; //100KB
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  options.level0_file_num_compaction_trigger = 3;
+  options.create_if_missing = true;
+
+  if (SnappyCompressionSupported(CompressionOptions(wbits, lev, strategy))) {
+    type = kSnappyCompression;
+    fprintf(stderr, "using snappy\n");
+  } else if (ZlibCompressionSupported(
+               CompressionOptions(wbits, lev, strategy))) {
+    type = kZlibCompression;
+    fprintf(stderr, "using zlib\n");
+  } else if (BZip2CompressionSupported(
+               CompressionOptions(wbits, lev, strategy))) {
+    type = kBZip2Compression;
+    fprintf(stderr, "using bzip2\n");
+  } else {
+    fprintf(stderr, "skipping test, compression disabled\n");
+    return false;
+  }
+  options.compression_per_level.resize(options.num_levels);
+
+  // do not compress L0
+  for (int i = 0; i < 1; i++) {
+    options.compression_per_level[i] = kNoCompression;
+  }
+  for (int i = 1; i < options.num_levels; i++) {
+    options.compression_per_level[i] = type;
+  }
+  return true;
+}
+
+// Runs MinLevelHelper twice with raw-deflate zlib settings (window_bits =
+// -14): first compressing everything below L0, then with L0 and L1 both
+// uncompressed. Skipped when no compression library is compiled in.
+TEST(DBTest, MinLevelToCompress1) {
+  Options options = CurrentOptions();
+  CompressionType type;
+  if (!MinLevelToCompress(type, options, -14, -1, 0)) {
+    return;
+  }
+  Reopen(&options);
+  MinLevelHelper(this, options);
+
+  // do not compress L0 and L1
+  for (int i = 0; i < 2; i++) {
+    options.compression_per_level[i] = kNoCompression;
+  }
+  for (int i = 2; i < options.num_levels; i++) {
+    options.compression_per_level[i] = type;
+  }
+  DestroyAndReopen(&options);
+  MinLevelHelper(this, options);
+}
+
+// Same as MinLevelToCompress1 but with positive window_bits (15, i.e. a
+// zlib-wrapped stream): compress below L0, then below L1.
+TEST(DBTest, MinLevelToCompress2) {
+  Options options = CurrentOptions();
+  CompressionType type;
+  if (!MinLevelToCompress(type, options, 15, -1, 0)) {
+    return;
+  }
+  Reopen(&options);
+  MinLevelHelper(this, options);
+
+  // do not compress L0 and L1
+  for (int i = 0; i < 2; i++) {
+    options.compression_per_level[i] = kNoCompression;
+  }
+  for (int i = 2; i < options.num_levels; i++) {
+    options.compression_per_level[i] = type;
+  }
+  DestroyAndReopen(&options);
+  MinLevelHelper(this, options);
+}
+
+// Repeatedly overwriting a single key must not let the table-file count grow
+// beyond NumberLevels() + Level0StopWriteTrigger(), i.e. obsolete versions
+// of the key are compacted away rather than accumulating.
+TEST(DBTest, RepeatedWritesToSameKey) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.write_buffer_size = 100000;  // Small write buffer
+    Reopen(&options);
+
+    // We must have at most one file per level except for level-0,
+    // which may have up to kL0_StopWritesTrigger files.
+    const int kMaxFiles = dbfull()->NumberLevels() +
+      dbfull()->Level0StopWriteTrigger();
+
+    Random rnd(301);
+    // Value is 2x the write buffer so every Put forces a flush.
+    std::string value = RandomString(&rnd, 2 * options.write_buffer_size);
+    for (int i = 0; i < 5 * kMaxFiles; i++) {
+      Put("key", value);
+      ASSERT_LE(TotalTableFiles(), kMaxFiles);
+    }
+  } while (ChangeCompactOptions());
+}
+
+// With inplace_update_support enabled, overwriting a key with values of
+// decreasing size must update the memtable entry in place: the internal
+// iterator sees exactly one entry, all updates sharing sequence number 1.
+// Overwriting with values of increasing size cannot be done in place, so
+// each update becomes a regular put with its own sequence number.
+TEST(DBTest, InPlaceUpdate) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+
+    // Update key with values of smaller size
+    Reopen(&options);
+    int numValues = 10;
+    for (int i = numValues; i > 0; i--) {
+      std::string value = DummyString(i, 'a');
+      ASSERT_OK(Put("key", value));
+      ASSERT_EQ(value, Get("key"));
+    }
+
+    int count = 0;
+    Iterator* iter = dbfull()->TEST_NewInternalIterator();
+    iter->SeekToFirst();
+    ASSERT_EQ(iter->status().ok(), true);
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      // Sentinel so a failed parse is detectable.
+      ikey.sequence = -1;
+      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+      count++;
+      // All updates with the same sequence number.
+      ASSERT_EQ(ikey.sequence, (unsigned)1);
+      iter->Next();
+    }
+    // Only 1 instance for that key.
+    ASSERT_EQ(count, 1);
+    delete iter;
+
+    // Update key with values of larger size
+    DestroyAndReopen(&options);
+    numValues = 10;
+    for (int i = 0; i < numValues; i++) {
+      std::string value = DummyString(i, 'a');
+      ASSERT_OK(Put("key", value));
+      ASSERT_EQ(value, Get("key"));
+    }
+
+    count = 0;
+    iter = dbfull()->TEST_NewInternalIterator();
+    iter->SeekToFirst();
+    ASSERT_EQ(iter->status().ok(), true);
+    // Internal iterator yields newest first, so sequences count down.
+    int seq = numValues;
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      ikey.sequence = -1;
+      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+      count++;
+      // No inplace updates. All updates are puts with new seq number
+      ASSERT_EQ(ikey.sequence, (unsigned)seq--);
+      iter->Next();
+    }
+    // All 10 updates exist in the internal iterator
+    ASSERT_EQ(count, numValues);
+    delete iter;
+
+
+  } while (ChangeCompactOptions());
+}
+
+// This is a static filter used for filtering
+// kvs during the compaction process.
+// Number of keys the active compaction filter has been invoked on; tests
+// reset it to 0 before triggering each compaction.
+static int cfilter_count;
+// Replacement value that ChangeFilter writes for every key it visits.
+static std::string NEW_VALUE = "NewValue";
+
+// Compaction filter that counts every invocation in cfilter_count and keeps
+// all key/value pairs unchanged (Filter() returns false).
+class KeepFilter : public CompactionFilter {
+ public:
+  virtual bool Filter(int level, const Slice& key,
+                      const Slice& value, std::string* new_value,
+                      bool* value_changed) const override {
+    cfilter_count++;
+    return false;
+  }
+
+  virtual const char* Name() const override {
+    return "KeepFilter";
+  }
+
+};
+
+// Compaction filter that counts every invocation in cfilter_count and drops
+// all key/value pairs (Filter() returns true).
+class DeleteFilter : public CompactionFilter {
+ public:
+  virtual bool Filter(int level, const Slice& key,
+                      const Slice& value, std::string* new_value,
+                      bool* value_changed) const override {
+    cfilter_count++;
+    return true;
+  }
+
+  virtual const char* Name() const override {
+    return "DeleteFilter";
+  }
+};
+
+// Compaction filter that keeps every key but rewrites its value to
+// NEW_VALUE. The constructor asserts its argument is 100 so tests can verify
+// that factory constructor arguments are plumbed through correctly.
+class ChangeFilter : public CompactionFilter {
+ public:
+  explicit ChangeFilter(int argv) {
+    assert(argv == 100);
+  }
+
+  virtual bool Filter(int level, const Slice& key,
+                      const Slice& value, std::string* new_value,
+                      bool* value_changed) const override {
+    assert(new_value != nullptr);
+    *new_value = NEW_VALUE;
+    *value_changed = true;
+    return false;
+  }
+
+  virtual const char* Name() const override {
+    return "ChangeFilter";
+  }
+};
+
+// Factory producing a fresh KeepFilter per compaction.
+class KeepFilterFactory : public CompactionFilterFactory {
+  public:
+    virtual std::unique_ptr<CompactionFilter>
+    CreateCompactionFilter(const CompactionFilter::Context& context) override {
+      return std::unique_ptr<CompactionFilter>(new KeepFilter());
+    }
+
+    virtual const char* Name() const override {
+      return "KeepFilterFactory";
+    }
+};
+
+// Factory producing a fresh DeleteFilter per compaction.
+class DeleteFilterFactory : public CompactionFilterFactory {
+  public:
+    virtual std::unique_ptr<CompactionFilter>
+    CreateCompactionFilter(const CompactionFilter::Context& context) override {
+      return std::unique_ptr<CompactionFilter>(new DeleteFilter());
+    }
+
+    virtual const char* Name() const override {
+      return "DeleteFilterFactory";
+    }
+};
+
+// Factory producing ChangeFilters, forwarding its constructor argument to
+// each filter instance (the filter asserts it equals 100).
+class ChangeFilterFactory : public CompactionFilterFactory {
+  public:
+    explicit ChangeFilterFactory(int argv) : argv_(argv) {}
+
+    virtual std::unique_ptr<CompactionFilter>
+    CreateCompactionFilter(const CompactionFilter::Context& context) override {
+      return std::unique_ptr<CompactionFilter>(new ChangeFilter(argv_));
+    }
+
+    virtual const char* Name() const override {
+      return "ChangeFilterFactory";
+    }
+
+  private:
+    // Value forwarded to every ChangeFilter this factory creates.
+    const int argv_;
+};
+
+// Exercises compaction filters over full manual compactions:
+//  1. KeepFilter: each of the 100K keys is handed to the filter exactly once
+//     per level compacted, and nothing is dropped.
+//  2. DeleteFilter on a fresh DB: every key is dropped, leaving the user
+//     iterator empty and (after the second pass) nothing in L0/L1.
+// Also checks sequence-number zeroing at the bottom level.
+TEST(DBTest, CompactionFilter) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+  Reopen(&options);
+
+  // Write 100K keys, these are written to a few files in L0.
+  const std::string value(10, 'x');
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    Put(key, value);
+  }
+  dbfull()->TEST_FlushMemTable();
+
+  // Push all files to the highest level L2. Verify that
+  // the compaction is each level invokes the filter for
+  // all the keys in that level.
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+  ASSERT_EQ(cfilter_count, 100000);
+
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_NE(NumTableFilesAtLevel(2), 0);
+  cfilter_count = 0;
+
+  // All the files are in the lowest level.
+  // Verify that all but the 100001st record
+  // has sequence number zero. The 100001st record
+  // is at the tip of this snapshot and cannot
+  // be zeroed out.
+  // TODO: figure out sequence number squashtoo
+  int count = 0;
+  int total = 0;
+  Iterator* iter = dbfull()->TEST_NewInternalIterator();
+  iter->SeekToFirst();
+  ASSERT_OK(iter->status());
+  while (iter->Valid()) {
+    ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+    // Sentinel so a failed parse is detectable.
+    ikey.sequence = -1;
+    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+    total++;
+    if (ikey.sequence != 0) {
+      count++;
+    }
+    iter->Next();
+  }
+  ASSERT_EQ(total, 100000);
+  ASSERT_EQ(count, 1);
+  delete iter;
+
+  // overwrite all the 100K keys once again.
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    Put(key, value);
+  }
+  dbfull()->TEST_FlushMemTable();
+
+  // push all files to the highest level L2. This
+  // means that all keys should pass at least once
+  // via the compaction filter
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+  ASSERT_EQ(cfilter_count, 100000);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_NE(NumTableFilesAtLevel(2), 0);
+
+  // create a new database with the compaction
+  // filter in such a way that it deletes all keys
+  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
+  options.create_if_missing = true;
+  DestroyAndReopen(&options);
+
+  // write all the keys once again.
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    Put(key, value);
+  }
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_NE(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+
+  // Push all files to the highest level L2. This
+  // triggers the compaction filter to delete all keys,
+  // verify that at the end of the compaction process,
+  // nothing is left.
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+  // Everything was dropped by the L0 pass, so the L1 pass sees no keys.
+  ASSERT_EQ(cfilter_count, 0);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+  // Scan the entire database to ensure that nothing is left
+  iter = db_->NewIterator(ReadOptions());
+  iter->SeekToFirst();
+  count = 0;
+  while (iter->Valid()) {
+    count++;
+    iter->Next();
+  }
+  ASSERT_EQ(count, 0);
+  delete iter;
+
+  // The sequence number of the remaining record
+  // is not zeroed out even though it is at the
+  // level Lmax because this record is at the tip
+  // TODO: remove the following or design a different
+  // test
+  count = 0;
+  iter = dbfull()->TEST_NewInternalIterator();
+  iter->SeekToFirst();
+  ASSERT_OK(iter->status());
+  while (iter->Valid()) {
+    ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+    ASSERT_NE(ikey.sequence, (unsigned)0);
+    count++;
+    iter->Next();
+  }
+  ASSERT_EQ(count, 0);
+  delete iter;
+}
+
+// Verifies that a compaction filter can rewrite values: after pushing all
+// data through ChangeFilter (twice, so even snapshot-protected entries are
+// rewritten on the second pass), every key must read back NEW_VALUE.
+TEST(DBTest, CompactionFilterWithValueChange) {
+  do {
+    Options options = CurrentOptions();
+    options.num_levels = 3;
+    options.max_mem_compaction_level = 0;
+    options.compaction_filter_factory =
+      std::make_shared<ChangeFilterFactory>(100);
+    Reopen(&options);
+
+    // Write 100K+1 keys, these are written to a few files
+    // in L0. We do this so that the current snapshot points
+    // to the 100001 key.The compaction filter is  not invoked
+    // on keys that are visible via a snapshot because we
+    // anyways cannot delete it.
+    const std::string value(10, 'x');
+    for (int i = 0; i < 100001; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%010d", i);
+      Put(key, value);
+    }
+
+    // push all files to  lower levels
+    dbfull()->TEST_FlushMemTable();
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+
+    // re-write all data again
+    for (int i = 0; i < 100001; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%010d", i);
+      Put(key, value);
+    }
+
+    // push all files to  lower levels. This should
+    // invoke the compaction filter for all 100000 keys.
+    dbfull()->TEST_FlushMemTable();
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+
+    // verify that all keys now have the new value that
+    // was set by the compaction process.
+    for (int i = 0; i < 100000; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%010d", i);
+      std::string newvalue = Get(key);
+      ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+    }
+  } while (ChangeCompactOptions());
+}
+
+// After sparse updates to a keyspace dominated by one dense prefix ("B"),
+// compactions must not create a file that overlaps more than ~20MB of data
+// in the next level (checked via TEST_MaxNextLevelOverlappingBytes).
+TEST(DBTest, SparseMerge) {
+  do {
+    Options options = CurrentOptions();
+    options.compression = kNoCompression;
+    Reopen(&options);
+
+    FillLevels("A", "Z");
+
+    // Suppose there is:
+    //    small amount of data with prefix A
+    //    large amount of data with prefix B
+    //    small amount of data with prefix C
+    // and that recent updates have made small changes to all three prefixes.
+    // Check that we do not do a compaction that merges all of B in one shot.
+    const std::string value(1000, 'x');
+    Put("A", "va");
+    // Write approximately 100MB of "B" values
+    for (int i = 0; i < 100000; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%010d", i);
+      Put(key, value);
+    }
+    Put("C", "vc");
+    dbfull()->TEST_FlushMemTable();
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+
+    // Make sparse update
+    Put("A",    "va2");
+    Put("B100", "bvalue2");
+    Put("C",    "vc2");
+    dbfull()->TEST_FlushMemTable();
+
+    // Compactions should not cause us to create a situation where
+    // a file overlaps too much data at the next level.
+    ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+    ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+    ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+  } while (ChangeCompactOptions());
+}
+
+// Returns true iff low <= val <= high; on failure prints all three values to
+// stderr so the ApproximateSizes assertions below are easy to debug.
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+  bool result = (val >= low) && (val <= high);
+  if (!result) {
+    fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+            (unsigned long long)(val),
+            (unsigned long long)(low),
+            (unsigned long long)(high));
+  }
+  return result;
+}
+
+// GetApproximateSizes (via Size()) must report 0 for an empty DB and for
+// data still in the memtable, and must report sizes proportional to the key
+// range once data is in table files — stable across reopens and across
+// incremental range compactions. Skipped for universal compaction.
+TEST(DBTest, ApproximateSizes) {
+  do {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 100000000;        // Large write buffer
+    options.compression = kNoCompression;
+    DestroyAndReopen();
+
+    ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
+    Reopen(&options);
+    ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
+
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+    const int N = 80;
+    static const int S1 = 100000;
+    static const int S2 = 105000;  // Allow some expansion from metadata
+    Random rnd(301);
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(Key(i), RandomString(&rnd, S1)));
+    }
+
+    // 0 because GetApproximateSizes() does not account for memtable space
+    ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
+
+    // Check sizes across recovery by reopening a few times
+    for (int run = 0; run < 3; run++) {
+      Reopen(&options);
+
+      for (int compact_start = 0; compact_start < N; compact_start += 10) {
+        for (int i = 0; i < N; i += 10) {
+          ASSERT_TRUE(Between(Size("", Key(i)), S1*i, S2*i));
+          ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), S1*(i+1), S2*(i+1)));
+          ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), S1*10, S2*10));
+        }
+        ASSERT_TRUE(Between(Size("", Key(50)), S1*50, S2*50));
+        ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), S1*50, S2*50));
+
+        // Compact a different 10-key slice each pass; size estimates must
+        // hold regardless of which files have been moved down a level.
+        std::string cstart_str = Key(compact_start);
+        std::string cend_str = Key(compact_start + 9);
+        Slice cstart = cstart_str;
+        Slice cend = cend_str;
+        dbfull()->TEST_CompactRange(0, &cstart, &cend);
+      }
+
+      ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+      ASSERT_GT(NumTableFilesAtLevel(1), 0);
+    }
+  } while (ChangeOptions(kSkipUniversalCompaction));
+}
+
+// Size estimates must stay accurate when the keyspace mixes ~10KB values
+// with 100KB/300KB values: prefix sizes up to each key, and a mid-range
+// query, are checked across three reopen/compact cycles.
+TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
+  do {
+    Options options = CurrentOptions();
+    options.compression = kNoCompression;
+    Reopen();
+
+    Random rnd(301);
+    std::string big1 = RandomString(&rnd, 100000);
+    ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(Key(2), big1));
+    ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(Key(4), big1));
+    ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
+    ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));
+
+    // Check sizes across recovery by reopening a few times
+    for (int run = 0; run < 3; run++) {
+      Reopen(&options);
+
+      ASSERT_TRUE(Between(Size("", Key(0)), 0, 0));
+      ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000));
+      ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000));
+      ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000));
+      ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000));
+      ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000));
+      ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000));
+      ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000));
+      ASSERT_TRUE(Between(Size("", Key(8)), 550000, 560000));
+
+      ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000));
+
+      dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+    }
+  } while (ChangeOptions());
+}
+
+// An iterator must pin the DB state it was created on: writes (and the
+// compactions they trigger) issued after NewIterator() must not be visible
+// through it — it still sees only the single original entry.
+TEST(DBTest, IteratorPinsRef) {
+  do {
+    Put("foo", "hello");
+
+    // Get iterator that will yield the current contents of the DB.
+    Iterator* iter = db_->NewIterator(ReadOptions());
+
+    // Write to force compactions
+    Put("foo", "newvalue1");
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values
+    }
+    Put("foo", "newvalue2");
+
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo", iter->key().ToString());
+    ASSERT_EQ("hello", iter->value().ToString());
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+// Each snapshot must keep seeing the value current at its creation, both
+// while later snapshots exist and after other snapshots (taken before or
+// after it) are released in arbitrary order.
+TEST(DBTest, Snapshot) {
+  do {
+    Put("foo", "v1");
+    const Snapshot* s1 = db_->GetSnapshot();
+    Put("foo", "v2");
+    const Snapshot* s2 = db_->GetSnapshot();
+    Put("foo", "v3");
+    const Snapshot* s3 = db_->GetSnapshot();
+
+    Put("foo", "v4");
+    ASSERT_EQ("v1", Get("foo", s1));
+    ASSERT_EQ("v2", Get("foo", s2));
+    ASSERT_EQ("v3", Get("foo", s3));
+    ASSERT_EQ("v4", Get("foo"));
+
+    db_->ReleaseSnapshot(s3);
+    ASSERT_EQ("v1", Get("foo", s1));
+    ASSERT_EQ("v2", Get("foo", s2));
+    ASSERT_EQ("v4", Get("foo"));
+
+    db_->ReleaseSnapshot(s1);
+    ASSERT_EQ("v2", Get("foo", s2));
+    ASSERT_EQ("v4", Get("foo"));
+
+    db_->ReleaseSnapshot(s2);
+    ASSERT_EQ("v4", Get("foo"));
+  } while (ChangeOptions());
+}
+
+// Once the snapshot protecting an overwritten value is released, compaction
+// must drop the hidden (old) value: AllEntriesFor("foo") shrinks from
+// "[ tiny, <big> ]" to "[ tiny ]" and the range size collapses accordingly.
+// Skipped for universal compaction.
+TEST(DBTest, HiddenValuesAreRemoved) {
+  do {
+    Random rnd(301);
+    FillLevels("a", "z");
+
+    std::string big = RandomString(&rnd, 50000);
+    Put("foo", big);
+    Put("pastfoo", "v");
+    const Snapshot* snapshot = db_->GetSnapshot();
+    Put("foo", "tiny");
+    Put("pastfoo2", "v2");        // Advance sequence number one more
+
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    ASSERT_GT(NumTableFilesAtLevel(0), 0);
+
+    ASSERT_EQ(big, Get("foo", snapshot));
+    ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000));
+    db_->ReleaseSnapshot(snapshot);
+    ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]");
+    Slice x("x");
+    dbfull()->TEST_CompactRange(0, nullptr, &x);
+    ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+    ASSERT_GE(NumTableFilesAtLevel(1), 1);
+    dbfull()->TEST_CompactRange(1, nullptr, &x);
+    ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
+
+    ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
+  } while (ChangeOptions(kSkipUniversalCompaction));
+}
+
+// Compaction must retain exactly one version of a key per live snapshot plus
+// the newest version: starting from six versions of "foo", each compaction
+// after a snapshot release drops the versions that snapshot alone protected.
+TEST(DBTest, CompactBetweenSnapshots) {
+  do {
+    Random rnd(301);
+    FillLevels("a", "z");
+
+    Put("foo", "first");
+    const Snapshot* snapshot1 = db_->GetSnapshot();
+    Put("foo", "second");
+    Put("foo", "third");
+    Put("foo", "fourth");
+    const Snapshot* snapshot2 = db_->GetSnapshot();
+    Put("foo", "fifth");
+    Put("foo", "sixth");
+
+    // All entries (including duplicates) exist
+    // before any compaction is triggered.
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    ASSERT_EQ("sixth", Get("foo"));
+    ASSERT_EQ("fourth", Get("foo", snapshot2));
+    ASSERT_EQ("first", Get("foo", snapshot1));
+    ASSERT_EQ(AllEntriesFor("foo"),
+              "[ sixth, fifth, fourth, third, second, first ]");
+
+    // After a compaction, "second", "third" and "fifth" should
+    // be removed
+    FillLevels("a", "z");
+    dbfull()->CompactRange(nullptr, nullptr);
+    ASSERT_EQ("sixth", Get("foo"));
+    ASSERT_EQ("fourth", Get("foo", snapshot2));
+    ASSERT_EQ("first", Get("foo", snapshot1));
+    ASSERT_EQ(AllEntriesFor("foo"), "[ sixth, fourth, first ]");
+
+    // after we release the snapshot1, only two values left
+    db_->ReleaseSnapshot(snapshot1);
+    FillLevels("a", "z");
+    dbfull()->CompactRange(nullptr, nullptr);
+
+    // We have only one valid snapshot snapshot2. Since snapshot1 is
+    // not valid anymore, "first" should be removed by a compaction.
+    ASSERT_EQ("sixth", Get("foo"));
+    ASSERT_EQ("fourth", Get("foo", snapshot2));
+    ASSERT_EQ(AllEntriesFor("foo"), "[ sixth, fourth ]");
+
+    // after we release the snapshot2, only one value should be left
+    db_->ReleaseSnapshot(snapshot2);
+    FillLevels("a", "z");
+    dbfull()->CompactRange(nullptr, nullptr);
+    ASSERT_EQ("sixth", Get("foo"));
+    ASSERT_EQ(AllEntriesFor("foo"), "[ sixth ]");
+
+  } while (ChangeOptions());
+}
+
+// A deletion marker that is hidden by a newer Put of the same key can be
+// dropped during compaction even before reaching the bottom level; the old
+// value survives until its own level is merged with the base level.
+TEST(DBTest, DeletionMarkers1) {
+  Put("foo", "v1");
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+  const int last = dbfull()->MaxMemCompactionLevel();
+  ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo => v1 is now in last level
+
+  // Place a table at level last-1 to prevent merging with preceding mutation
+  Put("a", "begin");
+  Put("z", "end");
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ(NumTableFilesAtLevel(last), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);
+
+  Delete("foo");
+  Put("foo", "v2");
+  ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());  // Moves to level last-2
+  if (CurrentOptions().purge_redundant_kvs_while_flush) {
+    // The flush itself already removed the DEL hidden by v2.
+    ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
+  } else {
+    ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
+  }
+  Slice z("z");
+  dbfull()->TEST_CompactRange(last-2, nullptr, &z);
+  // DEL eliminated, but v1 remains because we aren't compacting that level
+  // (DEL can be eliminated because v2 hides v1).
+  ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
+  dbfull()->TEST_CompactRange(last-1, nullptr, nullptr);
+  // Merging last-1 w/ last, so we are the base level for "foo", so
+  // DEL is removed.  (as is v1).
+  ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]");
+}
+
+// Unlike DeletionMarkers1, the deletion marker here is NOT hidden by a newer
+// Put, so it must be kept as long as an overlapping file exists in a deeper
+// level; only merging into the base level removes both DEL and v1.
+TEST(DBTest, DeletionMarkers2) {
+  Put("foo", "v1");
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+  const int last = dbfull()->MaxMemCompactionLevel();
+  ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo => v1 is now in last level
+
+  // Place a table at level last-1 to prevent merging with preceding mutation
+  Put("a", "begin");
+  Put("z", "end");
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ(NumTableFilesAtLevel(last), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);
+
+  Delete("foo");
+  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());  // Moves to level last-2
+  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+  dbfull()->TEST_CompactRange(last-2, nullptr, nullptr);
+  // DEL kept: "last" file overlaps
+  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+  dbfull()->TEST_CompactRange(last-1, nullptr, nullptr);
+  // Merging last-1 w/ last, so we are the base level for "foo", so
+  // DEL is removed.  (as is v1).
+  ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
+}
+
+// Regression test: a memtable flush must notice overlap with existing
+// level-0 files and keep the new deletion in L0, instead of placing it in a
+// deeper level where the L0 entries for the same key would shadow it.
+// Skipped for universal compaction.
+TEST(DBTest, OverlapInLevel0) {
+  do {
+    int tmp = dbfull()->MaxMemCompactionLevel();
+    ASSERT_EQ(tmp, 2) << "Fix test to match config";
+
+    //Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
+    ASSERT_OK(Put("100", "v100"));
+    ASSERT_OK(Put("999", "v999"));
+    dbfull()->TEST_FlushMemTable();
+    ASSERT_OK(Delete("100"));
+    ASSERT_OK(Delete("999"));
+    dbfull()->TEST_FlushMemTable();
+    ASSERT_EQ("0,1,1", FilesPerLevel());
+
+    // Make files spanning the following ranges in level-0:
+    //  files[0]  200 .. 900
+    //  files[1]  300 .. 500
+    // Note that files are sorted by smallest key.
+    ASSERT_OK(Put("300", "v300"));
+    ASSERT_OK(Put("500", "v500"));
+    dbfull()->TEST_FlushMemTable();
+    ASSERT_OK(Put("200", "v200"));
+    ASSERT_OK(Put("600", "v600"));
+    ASSERT_OK(Put("900", "v900"));
+    dbfull()->TEST_FlushMemTable();
+    ASSERT_EQ("2,1,1", FilesPerLevel());
+
+    // Compact away the placeholder files we created initially
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+    dbfull()->TEST_CompactRange(2, nullptr, nullptr);
+    ASSERT_EQ("2", FilesPerLevel());
+
+    // Do a memtable compaction.  Before bug-fix, the compaction would
+    // not detect the overlap with level-0 files and would incorrectly place
+    // the deletion in a deeper level.
+    ASSERT_OK(Delete("600"));
+    dbfull()->TEST_FlushMemTable();
+    ASSERT_EQ("3", FilesPerLevel());
+    ASSERT_EQ("NOT_FOUND", Get("600"));
+  } while (ChangeOptions(kSkipUniversalCompaction));
+}
+
+// Regression test for LevelDB issue 44 (case a): a specific sequence of
+// puts/deletes interleaved with reopens must leave exactly (a->v), and the
+// contents must not change after background compactions finish.
+TEST(DBTest, L0_CompactionBug_Issue44_a) {
+  do {
+    Reopen();
+    ASSERT_OK(Put("b", "v"));
+    Reopen();
+    ASSERT_OK(Delete("b"));
+    ASSERT_OK(Delete("a"));
+    Reopen();
+    ASSERT_OK(Delete("a"));
+    Reopen();
+    ASSERT_OK(Put("a", "v"));
+    Reopen();
+    Reopen();
+    ASSERT_EQ("(a->v)", Contents());
+    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+    ASSERT_EQ("(a->v)", Contents());
+  } while (ChangeCompactOptions());
+}
+
+// Regression test for leveldb issue 44 (part b): the exact operation and
+// reopen sequence below reproduced the original bug; do not reorder it.
+TEST(DBTest, L0_CompactionBug_Issue44_b) {
+  do {
+    Reopen();
+    Put("","");
+    Reopen();
+    Delete("e");
+    Put("","");
+    Reopen();
+    Put("c", "cv");
+    Reopen();
+    Put("","");
+    Reopen();
+    Put("","");
+    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+    Reopen();
+    Put("d","dv");
+    Reopen();
+    Put("","");
+    Reopen();
+    Delete("d");
+    Delete("b");
+    Reopen();
+    ASSERT_EQ("(->)(c->cv)", Contents());
+    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+    ASSERT_EQ("(->)(c->cv)", Contents());
+  } while (ChangeCompactOptions());
+}
+
+// Opening an existing DB with a comparator whose Name() differs from the
+// one the DB was created with must fail, and the error must mention
+// "comparator".
+TEST(DBTest, ComparatorCheck) {
+  class NewComparator : public Comparator {
+   public:
+    // Same ordering as BytewiseComparator, but a different Name() --
+    // the name mismatch alone is what triggers the open failure.
+    virtual const char* Name() const { return "rocksdb.NewComparator"; }
+    virtual int Compare(const Slice& a, const Slice& b) const {
+      return BytewiseComparator()->Compare(a, b);
+    }
+    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+      BytewiseComparator()->FindShortestSeparator(s, l);
+    }
+    virtual void FindShortSuccessor(std::string* key) const {
+      BytewiseComparator()->FindShortSuccessor(key);
+    }
+  };
+  Options new_options;
+  NewComparator cmp;
+  do {
+    new_options = CurrentOptions();
+    new_options.comparator = &cmp;
+    Status s = TryReopen(&new_options);
+    ASSERT_TRUE(!s.ok());
+    ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
+        << s.ToString();
+  } while (ChangeCompactOptions(&new_options));
+}
+
+// A custom numeric comparator: keys are "[<number>]" and are ordered by
+// numeric value, so "[10]" and "[0xa]" are equal keys.  Exercises Get,
+// overwrite, and manual compaction under the custom ordering.
+TEST(DBTest, CustomComparator) {
+  class NumberComparator : public Comparator {
+   public:
+    virtual const char* Name() const { return "test.NumberComparator"; }
+    virtual int Compare(const Slice& a, const Slice& b) const {
+      return ToNumber(a) - ToNumber(b);
+    }
+    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+      ToNumber(*s);     // Check format
+      ToNumber(l);      // Check format
+    }
+    virtual void FindShortSuccessor(std::string* key) const {
+      ToNumber(*key);   // Check format
+    }
+   private:
+    // Parses "[<number>]"; %i accepts both decimal and 0x-prefixed hex,
+    // which is why "[20]" and "[0x14]" map to the same key below.
+    static int ToNumber(const Slice& x) {
+      // Check that there are no extra characters.
+      ASSERT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size()-1] == ']')
+          << EscapeString(x);
+      int val;
+      char ignored;
+      ASSERT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
+          << EscapeString(x);
+      return val;
+    }
+  };
+  Options new_options;
+  NumberComparator cmp;
+  do {
+    new_options = CurrentOptions();
+    new_options.create_if_missing = true;
+    new_options.comparator = &cmp;
+    new_options.filter_policy = nullptr;     // Cannot use bloom filters
+    new_options.write_buffer_size = 1000;  // Compact more often
+    DestroyAndReopen(&new_options);
+    ASSERT_OK(Put("[10]", "ten"));
+    ASSERT_OK(Put("[0x14]", "twenty"));
+    for (int i = 0; i < 2; i++) {
+      ASSERT_EQ("ten", Get("[10]"));
+      ASSERT_EQ("ten", Get("[0xa]"));
+      ASSERT_EQ("twenty", Get("[20]"));
+      ASSERT_EQ("twenty", Get("[0x14]"));
+      ASSERT_EQ("NOT_FOUND", Get("[15]"));
+      ASSERT_EQ("NOT_FOUND", Get("[0xf]"));
+      Compact("[0]", "[9999]");
+    }
+
+    for (int run = 0; run < 2; run++) {
+      for (int i = 0; i < 1000; i++) {
+        char buf[100];
+        snprintf(buf, sizeof(buf), "[%d]", i*10);
+        ASSERT_OK(Put(buf, buf));
+      }
+      Compact("[0]", "[1000000]");
+    }
+  } while (ChangeCompactOptions(&new_options));
+}
+
+// Manual compaction of key ranges that fall before, after, and on top of
+// existing files; iteration 0 uses the default 7 levels, iteration 1
+// repeats with num_levels = 3.
+TEST(DBTest, ManualCompaction) {
+  ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2)
+      << "Need to update this test to match kMaxMemCompactLevel";
+
+  // iter - 0 with 7 levels
+  // iter - 1 with 3 levels
+  for (int iter = 0; iter < 2; ++iter) {
+    MakeTables(3, "p", "q");
+    ASSERT_EQ("1,1,1", FilesPerLevel());
+
+    // Compaction range falls before files
+    Compact("", "c");
+    ASSERT_EQ("1,1,1", FilesPerLevel());
+
+    // Compaction range falls after files
+    Compact("r", "z");
+    ASSERT_EQ("1,1,1", FilesPerLevel());
+
+    // Compaction range overlaps files
+    Compact("p1", "p9");
+    ASSERT_EQ("0,0,1", FilesPerLevel());
+
+    // Populate a different range
+    MakeTables(3, "c", "e");
+    ASSERT_EQ("1,1,2", FilesPerLevel());
+
+    // Compact just the new range
+    Compact("b", "f");
+    ASSERT_EQ("0,0,2", FilesPerLevel());
+
+    // Compact all
+    MakeTables(1, "a", "z");
+    ASSERT_EQ("0,1,2", FilesPerLevel());
+    db_->CompactRange(nullptr, nullptr);
+    ASSERT_EQ("0,0,1", FilesPerLevel());
+
+    if (iter == 0) {
+      // Re-create the DB with 3 levels for the second iteration.
+      Options options = CurrentOptions();
+      options.num_levels = 3;
+      options.create_if_missing = true;
+      DestroyAndReopen(&options);
+    }
+  }
+
+}
+
+TEST(DBTest, DBOpen_Options) {
+  std::string dbname = test::TmpDir() + "/db_options_test";
+  ASSERT_OK(DestroyDB(dbname, Options()));
+
+  // Does not exist, and create_if_missing == false: error
+  DB* db = nullptr;
+  Options opts;
+  opts.create_if_missing = false;
+  Status s = DB::Open(opts, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+  ASSERT_TRUE(db == nullptr);
+
+  // Does not exist, and create_if_missing == true: OK
+  opts.create_if_missing = true;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  delete db;
+  db = nullptr;
+
+  // Does exist, and error_if_exists == true: error
+  opts.create_if_missing = false;
+  opts.error_if_exists = true;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+  ASSERT_TRUE(db == nullptr);
+
+  // Does exist, and error_if_exists == false: OK
+  opts.create_if_missing = true;
+  opts.error_if_exists = false;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  delete db;
+  db = nullptr;
+}
+
+// Reopening an existing DB with a smaller num_levels than it was created
+// with must fail with Invalid argument.
+TEST(DBTest, DBOpen_Change_NumLevels) {
+  std::string dbname = test::TmpDir() + "/db_change_num_levels";
+  ASSERT_OK(DestroyDB(dbname, Options()));
+  Options opts;
+  Status s;
+  DB* db = nullptr;
+  opts.create_if_missing = true;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  db->Put(WriteOptions(), "a", "123");
+  db->Put(WriteOptions(), "b", "234");
+  // Compact so that data actually lands in a non-zero level.
+  db->CompactRange(nullptr, nullptr);
+  delete db;
+  db = nullptr;
+
+  opts.create_if_missing = false;
+  opts.num_levels = 2;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr);
+  ASSERT_TRUE(db == nullptr);
+}
+
+// DestroyDB on a database must also destroy its meta database and the
+// meta database's own meta database (two levels of MetaDatabaseName).
+TEST(DBTest, DestroyDBMetaDatabase) {
+  std::string dbname = test::TmpDir() + "/db_meta";
+  std::string metadbname = MetaDatabaseName(dbname, 0);
+  std::string metametadbname = MetaDatabaseName(metadbname, 0);
+
+  // Destroy previous versions if they exist. Using the long way.
+  ASSERT_OK(DestroyDB(metametadbname, Options()));
+  ASSERT_OK(DestroyDB(metadbname, Options()));
+  ASSERT_OK(DestroyDB(dbname, Options()));
+
+  // Setup databases
+  Options opts;
+  opts.create_if_missing = true;
+  DB* db = nullptr;
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+  delete db;
+  db = nullptr;
+  ASSERT_OK(DB::Open(opts, metadbname, &db));
+  delete db;
+  db = nullptr;
+  ASSERT_OK(DB::Open(opts, metametadbname, &db));
+  delete db;
+  db = nullptr;
+
+  // Delete databases
+  ASSERT_OK(DestroyDB(dbname, Options()));
+
+  // Check if deletion worked.
+  opts.create_if_missing = false;
+  ASSERT_TRUE(!(DB::Open(opts, dbname, &db)).ok());
+  ASSERT_TRUE(!(DB::Open(opts, metadbname, &db)).ok());
+  ASSERT_TRUE(!(DB::Open(opts, metametadbname, &db)).ok());
+}
+
+// Check that number of files does not grow when we are out of space
+TEST(DBTest, NoSpace) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    Reopen(&options);
+
+    ASSERT_OK(Put("foo", "v1"));
+    ASSERT_EQ("v1", Get("foo"));
+    Compact("a", "z");
+    const int num_files = CountFiles();
+    env_->no_space_.Release_Store(env_);   // Force out-of-space errors
+    env_->sleep_counter_.Reset();
+    // Repeatedly trigger compactions while writes fail; file count must
+    // stay roughly constant instead of accumulating garbage.
+    for (int i = 0; i < 5; i++) {
+      for (int level = 0; level < dbfull()->NumberLevels()-1; level++) {
+        dbfull()->TEST_CompactRange(level, nullptr, nullptr);
+      }
+    }
+    env_->no_space_.Release_Store(nullptr);
+    ASSERT_LT(CountFiles(), num_files + 3);
+
+    // Check that compaction attempts slept after errors
+    ASSERT_GE(env_->sleep_counter_.Read(), 5);
+  } while (ChangeCompactOptions());
+}
+
+// Writes must surface errors (not hang or crash) when the env refuses to
+// create new files, and must recover once writes are allowed again.
+TEST(DBTest, NonWritableFileSystem) {
+  do {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 1000;
+    options.env = env_;
+    Reopen(&options);
+    ASSERT_OK(Put("foo", "v1"));
+    env_->non_writable_.Release_Store(env_); // Force errors for new files
+    std::string big(100000, 'x');
+    int errors = 0;
+    for (int i = 0; i < 20; i++) {
+      if (!Put("foo", big).ok()) {
+        errors++;
+        env_->SleepForMicroseconds(100000);
+      }
+    }
+    ASSERT_GT(errors, 0);
+    env_->non_writable_.Release_Store(nullptr);
+  } while (ChangeCompactOptions());
+}
+
+// MANIFEST write/sync failures during compaction must not lose data after
+// recovery.
+TEST(DBTest, ManifestWriteError) {
+  // Test for the following problem:
+  // (a) Compaction produces file F
+  // (b) Log record containing F is written to MANIFEST file, but Sync() fails
+  // (c) GC deletes F
+  // (d) After reopening DB, reads fail since deleted F is named in log record
+
+  // We iterate twice.  In the second iteration, everything is the
+  // same except the log record never makes it to the MANIFEST file.
+  for (int iter = 0; iter < 2; iter++) {
+    // iter 0 injects a sync failure, iter 1 a write failure.
+    port::AtomicPointer* error_type = (iter == 0)
+        ? &env_->manifest_sync_error_
+        : &env_->manifest_write_error_;
+
+    // Insert foo=>bar mapping
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.create_if_missing = true;
+    options.error_if_exists = false;
+    DestroyAndReopen(&options);
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_EQ("bar", Get("foo"));
+
+    // Memtable compaction (will succeed)
+    dbfull()->TEST_FlushMemTable();
+    ASSERT_EQ("bar", Get("foo"));
+    const int last = dbfull()->MaxMemCompactionLevel();
+    ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo=>bar is now in last level
+
+    // Merging compaction (will fail)
+    error_type->Release_Store(env_);
+    dbfull()->TEST_CompactRange(last, nullptr, nullptr);  // Should fail
+    ASSERT_EQ("bar", Get("foo"));
+
+    // Recovery: should not lose data
+    error_type->Release_Store(nullptr);
+    Reopen(&options);
+    ASSERT_EQ("bar", Get("foo"));
+  }
+}
+
+// With paranoid_checks, a single WAL write failure must poison all later
+// writes (reads stay available); without paranoid_checks, writes resume
+// once the error condition clears.
+TEST(DBTest, PutFailsParanoid) {
+  // Test the following:
+  // (a) A random put fails in paranoid mode (simulate by sync fail)
+  // (b) All other puts have to fail, even if writes would succeed
+  // (c) All of that should happen ONLY if paranoid_checks = true
+
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  options.paranoid_checks = true;
+  DestroyAndReopen(&options);
+  Status s;
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("foo1", "bar1"));
+  // simulate error
+  env_->log_write_error_.Release_Store(env_);
+  s = Put("foo2", "bar2");
+  ASSERT_TRUE(!s.ok());
+  env_->log_write_error_.Release_Store(nullptr);
+  s = Put("foo3", "bar3");
+  // the next put should fail, too
+  ASSERT_TRUE(!s.ok());
+  // but we're still able to read
+  ASSERT_EQ("bar", Get("foo"));
+
+  // do the same thing with paranoid checks off
+  options.paranoid_checks = false;
+  DestroyAndReopen(&options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("foo1", "bar1"));
+  // simulate error
+  env_->log_write_error_.Release_Store(env_);
+  s = Put("foo2", "bar2");
+  ASSERT_TRUE(!s.ok());
+  env_->log_write_error_.Release_Store(nullptr);
+  s = Put("foo3", "bar3");
+  // the next put should NOT fail
+  ASSERT_TRUE(s.ok());
+}
+
+// Repeated overwrite + compact cycles must not grow the set of live
+// files: obsolete tables are deleted after each compaction.
+TEST(DBTest, FilesDeletedAfterCompaction) {
+  do {
+    ASSERT_OK(Put("foo", "v2"));
+    Compact("a", "z");
+    const int baseline = CountLiveFiles();
+    for (int round = 0; round < 10; round++) {
+      ASSERT_OK(Put("foo", "v2"));
+      Compact("a", "z");
+    }
+    ASSERT_EQ(CountLiveFiles(), baseline);
+  } while (ChangeCompactOptions());
+}
+
+// Bloom filters should keep reads for present keys near one sstable read
+// each, and reads for missing keys near zero.
+TEST(DBTest, BloomFilter) {
+  do {
+    env_->count_random_reads_ = true;
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.no_block_cache = true;
+    options.filter_policy = NewBloomFilterPolicy(10);
+    Reopen(&options);
+
+    // Populate multiple layers
+    const int N = 10000;
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(Key(i), Key(i)));
+    }
+    Compact("a", "z");
+    for (int i = 0; i < N; i += 100) {
+      ASSERT_OK(Put(Key(i), Key(i)));
+    }
+    dbfull()->TEST_FlushMemTable();
+
+    // Prevent auto compactions triggered by seeks
+    env_->delay_sstable_sync_.Release_Store(env_);
+
+    // Lookup present keys.  Should rarely read from small sstable.
+    env_->random_read_counter_.Reset();
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i), Get(Key(i)));
+    }
+    int reads = env_->random_read_counter_.Read();
+    fprintf(stderr, "%d present => %d reads\n", N, reads);
+    ASSERT_GE(reads, N);
+    ASSERT_LE(reads, N + 2*N/100);
+
+    // Lookup missing keys.  Should rarely read from either sstable.
+    env_->random_read_counter_.Reset();
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing"));
+    }
+    reads = env_->random_read_counter_.Read();
+    fprintf(stderr, "%d missing => %d reads\n", N, reads);
+    ASSERT_LE(reads, 3*N/100);
+
+    env_->delay_sstable_sync_.Release_Store(nullptr);
+    Close();
+    delete options.filter_policy;
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, SnapshotFiles) {
+  do {
+    Options options = CurrentOptions();
+    const EnvOptions soptions;
+    options.write_buffer_size = 100000000;        // Large write buffer
+    Reopen(&options);
+
+    Random rnd(301);
+
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+    std::vector<std::string> values;
+    for (int i = 0; i < 80; i++) {
+      values.push_back(RandomString(&rnd, 100000));
+      ASSERT_OK(Put(Key(i), values[i]));
+    }
+
+    // assert that nothing makes it to disk yet.
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+    // get a file snapshot
+    uint64_t manifest_number = 0;
+    uint64_t manifest_size = 0;
+    std::vector<std::string> files;
+    dbfull()->DisableFileDeletions();
+    dbfull()->GetLiveFiles(files, &manifest_size);
+
+    // CURRENT, MANIFEST, *.sst files
+    ASSERT_EQ(files.size(), 3U);
+
+    uint64_t number = 0;
+    FileType type;
+
+    // copy these files to a new snapshot directory
+    std::string snapdir = dbname_ + ".snapdir/";
+    std::string mkdir = "mkdir -p " + snapdir;
+    ASSERT_EQ(system(mkdir.c_str()), 0);
+
+    for (unsigned int i = 0; i < files.size(); i++) {
+      // our clients require that GetLiveFiles returns
+      // files with "/" as first character!
+      ASSERT_EQ(files[i][0], '/');
+      std::string src = dbname_ + files[i];
+      std::string dest = snapdir + files[i];
+
+      uint64_t size;
+      ASSERT_OK(env_->GetFileSize(src, &size));
+
+      // record the number and the size of the
+      // latest manifest file
+      if (ParseFileName(files[i].substr(1), &number, &type)) {
+        if (type == kDescriptorFile) {
+          if (number > manifest_number) {
+            manifest_number = number;
+            ASSERT_GE(size, manifest_size);
+            size = manifest_size; // copy only valid MANIFEST data
+          }
+        }
+      }
+      unique_ptr<SequentialFile> srcfile;
+      ASSERT_OK(env_->NewSequentialFile(src, &srcfile, soptions));
+      unique_ptr<WritableFile> destfile;
+      ASSERT_OK(env_->NewWritableFile(dest, &destfile, soptions));
+
+      char buffer[4096];
+      Slice slice;
+      while (size > 0) {
+        uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+        ASSERT_OK(srcfile->Read(one, &slice, buffer));
+        ASSERT_OK(destfile->Append(slice));
+        size -= slice.size();
+      }
+      ASSERT_OK(destfile->Close());
+    }
+
+    // release file snapshot
+    dbfull()->DisableFileDeletions();
+
+    // overwrite one key, this key should not appear in the snapshot
+    std::vector<std::string> extras;
+    for (unsigned int i = 0; i < 1; i++) {
+      extras.push_back(RandomString(&rnd, 100000));
+      ASSERT_OK(Put(Key(i), extras[i]));
+    }
+
+    // verify that data in the snapshot are correct
+    Options opts;
+    DB* snapdb;
+    opts.create_if_missing = false;
+    Status stat = DB::Open(opts, snapdir, &snapdb);
+    ASSERT_OK(stat);
+
+    ReadOptions roptions;
+    std::string val;
+    for (unsigned int i = 0; i < 80; i++) {
+      stat = snapdb->Get(roptions, Key(i), &val);
+      ASSERT_EQ(values[i].compare(val), 0);
+    }
+    delete snapdb;
+
+    // look at the new live files after we added an 'extra' key
+    // and after we took the first snapshot.
+    uint64_t new_manifest_number = 0;
+    uint64_t new_manifest_size = 0;
+    std::vector<std::string> newfiles;
+    dbfull()->DisableFileDeletions();
+    dbfull()->GetLiveFiles(newfiles, &new_manifest_size);
+
+    // find the new manifest file. assert that this manifest file is
+    // the same one as in the previous snapshot. But its size should be
+    // larger because we added an extra key after taking the
+    // previous shapshot.
+    for (unsigned int i = 0; i < newfiles.size(); i++) {
+      std::string src = dbname_ + "/" + newfiles[i];
+      // record the lognumber and the size of the
+      // latest manifest file
+      if (ParseFileName(newfiles[i].substr(1), &number, &type)) {
+        if (type == kDescriptorFile) {
+          if (number > new_manifest_number) {
+            uint64_t size;
+            new_manifest_number = number;
+            ASSERT_OK(env_->GetFileSize(src, &size));
+            ASSERT_GE(size, new_manifest_size);
+          }
+        }
+      }
+    }
+    ASSERT_EQ(manifest_number, new_manifest_number);
+    ASSERT_GT(new_manifest_size, manifest_size);
+
+    // release file snapshot
+    dbfull()->DisableFileDeletions();
+  } while (ChangeCompactOptions());
+}
+
+// With purge_redundant_kvs_while_flush enabled, the flush itself should
+// drop redundant entries (e.g. a DEL immediately shadowed by a PUT) while
+// respecting live snapshots.
+TEST(DBTest, CompactOnFlush) {
+  do {
+    Options options = CurrentOptions();
+    options.purge_redundant_kvs_while_flush = true;
+    options.disable_auto_compactions = true;
+    Reopen(&options);
+
+    Put("foo", "v1");
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    ASSERT_EQ(AllEntriesFor("foo"), "[ v1 ]");
+
+    // Write two new keys
+    Put("a", "begin");
+    Put("z", "end");
+    dbfull()->TEST_FlushMemTable();
+
+    // Case1: Delete followed by a put
+    Delete("foo");
+    Put("foo", "v2");
+    ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
+
+    // After the current memtable is flushed, the DEL should
+    // have been removed
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
+
+    dbfull()->CompactRange(nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]");
+
+    // Case 2: Delete followed by another delete
+    Delete("foo");
+    Delete("foo");
+    ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, DEL, v2 ]");
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v2 ]");
+    dbfull()->CompactRange(nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
+
+    // Case 3: Put followed by a delete
+    Put("foo", "v3");
+    Delete("foo");
+    ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v3 ]");
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    ASSERT_EQ(AllEntriesFor("foo"), "[ DEL ]");
+    dbfull()->CompactRange(nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
+
+    // Case 4: Put followed by another Put
+    Put("foo", "v4");
+    Put("foo", "v5");
+    ASSERT_EQ(AllEntriesFor("foo"), "[ v5, v4 ]");
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    ASSERT_EQ(AllEntriesFor("foo"), "[ v5 ]");
+    dbfull()->CompactRange(nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo"), "[ v5 ]");
+
+    // clear database
+    Delete("foo");
+    dbfull()->CompactRange(nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
+
+    // Case 5: Put followed by snapshot followed by another Put
+    // Both puts should remain.
+    Put("foo", "v6");
+    const Snapshot* snapshot = db_->GetSnapshot();
+    Put("foo", "v7");
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    ASSERT_EQ(AllEntriesFor("foo"), "[ v7, v6 ]");
+    db_->ReleaseSnapshot(snapshot);
+
+    // clear database
+    Delete("foo");
+    dbfull()->CompactRange(nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
+
+    // Case 6: snapshot followed by a put followed by another Put
+    // Only the last put should remain.
+    const Snapshot* snapshot1 = db_->GetSnapshot();
+    Put("foo", "v8");
+    Put("foo", "v9");
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    ASSERT_EQ(AllEntriesFor("foo"), "[ v9 ]");
+    db_->ReleaseSnapshot(snapshot1);
+  } while (ChangeCompactOptions());
+}
+
+std::vector<std::uint64_t> ListLogFiles(Env* env, const std::string& path) {
+  std::vector<std::string> files;
+  std::vector<uint64_t> log_files;
+  env->GetChildren(path, &files);
+  uint64_t number;
+  FileType type;
+  for (size_t i = 0; i < files.size(); ++i) {
+    if (ParseFileName(files[i], &number, &type)) {
+      if (type == kLogFile) {
+        log_files.push_back(number);
+      }
+    }
+  }
+  return std::move(log_files);
+}
+
+// With WAL_ttl_seconds set, obsolete WAL files must be moved to the
+// archive directory on reopen, and purged once the ttl elapses.
+TEST(DBTest, WALArchivalTtl) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.WAL_ttl_seconds = 1000;
+    DestroyAndReopen(&options);
+
+    //  TEST : Create DB with a ttl and no size limit.
+    //  Put some keys. Count the log files present in the DB just after insert.
+    //  Re-open db. Causes deletion/archival to take place.
+    //  Assert that the files moved under "/archive".
+    //  Reopen db with small ttl.
+    //  Assert that archive was removed.
+
+    std::string archiveDir = ArchivalDirectory(dbname_);
+
+    for (int i = 0; i < 10; ++i) {
+      for (int j = 0; j < 10; ++j) {
+        ASSERT_OK(Put(Key(10 * i + j), DummyString(1024)));
+      }
+
+      std::vector<uint64_t> log_files = ListLogFiles(env_, dbname_);
+
+      options.create_if_missing = false;
+      Reopen(&options);
+
+      std::vector<uint64_t> logs = ListLogFiles(env_, archiveDir);
+      std::set<uint64_t> archivedFiles(logs.begin(), logs.end());
+
+      // Every WAL that was live before the reopen must now be archived.
+      for (auto& log : log_files) {
+        ASSERT_TRUE(archivedFiles.find(log) != archivedFiles.end());
+      }
+    }
+
+    std::vector<uint64_t> log_files = ListLogFiles(env_, archiveDir);
+    ASSERT_TRUE(log_files.size() > 0);
+
+    options.WAL_ttl_seconds = 1;
+    env_->SleepForMicroseconds(2 * 1000 * 1000);
+    Reopen(&options);
+
+    log_files = ListLogFiles(env_, archiveDir);
+    ASSERT_TRUE(log_files.empty());
+  } while (ChangeCompactOptions());
+}
+
+uint64_t GetLogDirSize(std::string dir_path, SpecialEnv* env) {
+  uint64_t dir_size = 0;
+  std::vector<std::string> files;
+  env->GetChildren(dir_path, &files);
+  for (auto& f : files) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(f, &number, &type) && type == kLogFile) {
+      std::string const file_path = dir_path + "/" + f;
+      uint64_t file_size;
+      env->GetFileSize(file_path, &file_size);
+      dir_size += file_size;
+    }
+  }
+  return dir_size;
+}
+
+// With WAL_size_limit_MB set (and no ttl), the archive directory must be
+// trimmed down to the size limit; setting a small ttl afterwards must
+// empty it.  (TEST_PurgeObsoleteteWAL is the existing API name, typo and
+// all.)
+TEST(DBTest, WALArchivalSizeLimit) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.WAL_ttl_seconds = 0;
+    options.WAL_size_limit_MB = 1000;
+
+    // TEST : Create DB with huge size limit and no ttl.
+    // Put some keys. Count the archived log files present in the DB
+    // just after insert. Assert that there are many enough.
+    // Change size limit. Re-open db.
+    // Assert that archive is not greater than WAL_size_limit_MB.
+    // Set ttl and time_to_check_ to small values. Re-open db.
+    // Assert that there are no archived logs left.
+
+    DestroyAndReopen(&options);
+    for (int i = 0; i < 128 * 128; ++i) {
+      ASSERT_OK(Put(Key(i), DummyString(1024)));
+    }
+    Reopen(&options);
+
+    std::string archive_dir = ArchivalDirectory(dbname_);
+    std::vector<std::uint64_t> log_files = ListLogFiles(env_, archive_dir);
+    ASSERT_TRUE(log_files.size() > 2);
+
+    options.WAL_size_limit_MB = 8;
+    Reopen(&options);
+    dbfull()->TEST_PurgeObsoleteteWAL();
+
+    uint64_t archive_size = GetLogDirSize(archive_dir, env_);
+    ASSERT_TRUE(archive_size <= options.WAL_size_limit_MB * 1024 * 1024);
+
+    options.WAL_ttl_seconds = 1;
+    dbfull()->TEST_SetDefaultTimeToCheck(1);
+    env_->SleepForMicroseconds(2 * 1000 * 1000);
+    Reopen(&options);
+    dbfull()->TEST_PurgeObsoleteteWAL();
+
+    log_files = ListLogFiles(env_, archive_dir);
+    ASSERT_TRUE(log_files.empty());
+  } while (ChangeCompactOptions());
+}
+
+// Drains `iter`, asserting that sequence numbers are strictly increasing.
+// On return, `count` holds the number of batches read; the return value
+// is the sequence number of the last batch.  If the iterator was never
+// valid, `res` is a default-constructed BatchResult -- presumably
+// sequence 0; confirm before relying on it.
+SequenceNumber ReadRecords(
+    std::unique_ptr<TransactionLogIterator>& iter,
+    int& count) {
+  count = 0;
+  SequenceNumber lastSequence = 0;
+  BatchResult res;
+  while (iter->Valid()) {
+    res = iter->GetBatch();
+    ASSERT_TRUE(res.sequence > lastSequence);
+    ++count;
+    lastSequence = res.sequence;
+    ASSERT_OK(iter->status());
+    iter->Next();
+  }
+  return res.sequence;
+}
+
+// Drains `iter` and asserts that exactly `expected_no_records` batches
+// were returned.
+void ExpectRecords(
+    const int expected_no_records,
+    std::unique_ptr<TransactionLogIterator>& iter) {
+  int actual_records = 0;
+  ReadRecords(iter, actual_records);
+  ASSERT_EQ(actual_records, expected_no_records);
+}
+
+TEST(DBTest, TransactionLogIterator) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    Put("key1", DummyString(1024));
+    Put("key2", DummyString(1024));
+    Put("key2", DummyString(1024));
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U);
+    {
+      auto iter = OpenTransactionLogIter(0);
+      ExpectRecords(3, iter);
+    }
+    Reopen(&options);
+      env_->SleepForMicroseconds(2 * 1000 * 1000);{
+      Put("key4", DummyString(1024));
+      Put("key5", DummyString(1024));
+      Put("key6", DummyString(1024));
+    }
+    {
+      auto iter = OpenTransactionLogIter(0);
+      ExpectRecords(6, iter);
+    }
+  } while (ChangeCompactOptions());
+}
+
+// The iterator must skip over empty (zero-record) WAL files produced by
+// back-to-back reopens and still deliver all records.
+TEST(DBTest, TransactionLogIteratorMoveOverZeroFiles) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    // Do a plain Reopen.
+    Put("key1", DummyString(1024));
+    // Two reopens should create a zero record WAL file.
+    Reopen(&options);
+    Reopen(&options);
+
+    Put("key2", DummyString(1024));
+
+    auto iter = OpenTransactionLogIter(0);
+    ExpectRecords(2, iter);
+  } while (ChangeCompactOptions());
+}
+
+// TODO(kailiu) disable the in non-linux platforms to temporarily solve
+// the unit test failure.
+#ifdef OS_LINUX
+// The iterator must become invalid (not error) at the end of the log,
+// then become valid again once a new record is written.
+TEST(DBTest, TransactionLogIteratorStallAtLastRecord) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    Put("key1", DummyString(1024));
+    auto iter = OpenTransactionLogIter(0);
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_OK(iter->status());
+    Put("key2", DummyString(1024));
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+  } while (ChangeCompactOptions());
+}
+#endif
+
+// GetUpdatesSince on a DB with no writes must yield an invalid (empty)
+// iterator.
+TEST(DBTest, TransactionLogIteratorJustEmptyFile) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    unique_ptr<TransactionLogIterator> iter;
+    // NOTE(review): `status` is never checked; presumably GetUpdatesSince
+    // returns OK here -- consider asserting it.
+    Status status = dbfull()->GetUpdatesSince(0, &iter);
+    // Check that an empty iterator is returned
+    ASSERT_TRUE(!iter->Valid());
+  } while (ChangeCompactOptions());
+}
+
+// WAL records written and flushed before a restart must still be visible
+// through GetUpdatesSince after the DB is reopened.
+TEST(DBTest, TransactionLogIteratorCheckAfterRestart) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    Put("key1", DummyString(1024));
+    Put("key2", DummyString(1023));
+    dbfull()->Flush(FlushOptions());
+    Reopen(&options);
+    auto log_iter = OpenTransactionLogIter(0);
+    ExpectRecords(2, log_iter);
+  } while (ChangeCompactOptions());
+}
+
+// A truncated (corrupted) WAL must stop iteration at the gap rather than
+// erroring out, and seeking past the gap must still work.
+TEST(DBTest, TransactionLogIteratorCorruptedLog) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    for (int i = 0; i < 1024; i++) {
+      Put("key"+std::to_string(i), DummyString(10));
+    }
+    dbfull()->Flush(FlushOptions());
+    // Corrupt this log to create a gap
+    rocksdb::VectorLogPtr wal_files;
+    ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+    const auto logfilePath = dbname_ + "/" + wal_files.front()->PathName();
+    // Chop the first WAL in half with POSIX truncate().
+    ASSERT_EQ(
+      0,
+      truncate(logfilePath.c_str(), wal_files.front()->SizeFileBytes() / 2));
+    // Insert a new entry to a new log file
+    Put("key1025", DummyString(10));
+    // Try to read from the beginning. Should stop before the gap and read less
+    // than 1025 entries
+    auto iter = OpenTransactionLogIter(0);
+    int count;
+    int last_sequence_read = ReadRecords(iter, count);
+    ASSERT_LT(last_sequence_read, 1025);
+    // Try to read past the gap, should be able to seek to key1025
+    auto iter2 = OpenTransactionLogIter(last_sequence_read + 1);
+    ExpectRecords(1, iter2);
+  } while (ChangeCompactOptions());
+}
+
+// A multi-op WriteBatch counts as ONE record for the iterator; seeking to
+// sequence 3 lands inside/after the first batch and yields the remaining
+// two records (the batch and the later Put).
+TEST(DBTest, TransactionLogIteratorBatchOperations) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    WriteBatch batch;
+    batch.Put("key1", DummyString(1024));
+    batch.Put("key2", DummyString(1024));
+    batch.Put("key3", DummyString(1024));
+    batch.Delete("key2");
+    dbfull()->Write(WriteOptions(), &batch);
+    dbfull()->Flush(FlushOptions());
+    Reopen(&options);
+    Put("key4", DummyString(1024));
+    auto iter = OpenTransactionLogIter(3);
+    ExpectRecords(2, iter);
+  } while (ChangeCompactOptions());
+}
+
+// PutLogData blobs must round-trip through the WAL and be replayed by the
+// batch handler in their original positions relative to Put/Delete ops.
+TEST(DBTest, TransactionLogIteratorBlobs) {
+  Options options = OptionsForLogIterTest();
+  DestroyAndReopen(&options);
+  {
+    WriteBatch batch;
+    batch.Put("key1", DummyString(1024));
+    batch.Put("key2", DummyString(1024));
+    batch.PutLogData(Slice("blob1"));
+    batch.Put("key3", DummyString(1024));
+    batch.PutLogData(Slice("blob2"));
+    batch.Delete("key2");
+    dbfull()->Write(WriteOptions(), &batch);
+    Reopen(&options);
+  }
+
+  auto res = OpenTransactionLogIter(0)->GetBatch();
+  // Records each callback into `seen` so the replay order can be checked
+  // as a single string below.
+  struct Handler : public WriteBatch::Handler {
+    std::string seen;
+    virtual void Put(const Slice& key, const Slice& value) {
+      seen += "Put(" + key.ToString() + ", " + std::to_string(value.size()) +
+        ")";
+    }
+    virtual void Merge(const Slice& key, const Slice& value) {
+      seen += "Merge(" + key.ToString() + ", " + std::to_string(value.size()) +
+        ")";
+    }
+    virtual void LogData(const Slice& blob) {
+      seen += "LogData(" + blob.ToString() + ")";
+    }
+    virtual void Delete(const Slice& key) {
+      seen += "Delete(" + key.ToString() + ")";
+    }
+  } handler;
+  res.writeBatchPtr->Iterate(&handler);
+  ASSERT_EQ("Put(key1, 1024)"
+            "Put(key2, 1024)"
+            "LogData(blob1)"
+            "Put(key3, 1024)"
+            "LogData(blob2)"
+            "Delete(key2)", handler.seen);
+}
+
+TEST(DBTest, ReadCompaction) {
+  std::string value(4096, '4'); // a string of size 4K
+  {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.max_open_files = 20; // only 10 file in file-cache
+    options.target_file_size_base = 512;
+    options.write_buffer_size = 64 * 1024;
+    options.filter_policy = nullptr;
+    options.block_size = 4096;
+    options.no_block_cache = true;
+
+    Reopen(&options);
+
+    // Write 8MB (2000 values, each 4K)
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+    std::vector<std::string> values;
+    for (int i = 0; i < 2000; i++) {
+      ASSERT_OK(Put(Key(i), value));
+    }
+
+    // clear level 0 and 1 if necessary.
+    dbfull()->TEST_FlushMemTable();
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+    ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+    // write some new keys into level 0
+    for (int i = 0; i < 2000; i = i + 16) {
+      ASSERT_OK(Put(Key(i), value));
+    }
+    dbfull()->Flush(FlushOptions());
+
+    // Wait for any write compaction to finish
+    dbfull()->TEST_WaitForCompact();
+
+    // remember number of files in each level
+    int l1 = NumTableFilesAtLevel(0);
+    int l2 = NumTableFilesAtLevel(1);
+    int l3 = NumTableFilesAtLevel(3);
+    ASSERT_NE(NumTableFilesAtLevel(0), 0);
+    ASSERT_NE(NumTableFilesAtLevel(1), 0);
+    ASSERT_NE(NumTableFilesAtLevel(2), 0);
+
+    // read a bunch of times, trigger read compaction
+    for (int j = 0; j < 100; j++) {
+      for (int i = 0; i < 2000; i++) {
+        Get(Key(i));
+      }
+    }
+    // wait for read compaction to finish
+    env_->SleepForMicroseconds(1000000);
+
+    // verify that the number of files have decreased
+    // in some level, indicating that there was a compaction
+    ASSERT_TRUE(NumTableFilesAtLevel(0) < l1 ||
+                NumTableFilesAtLevel(1) < l2 ||
+                NumTableFilesAtLevel(2) < l3);
+  }
+}
+
+// Multi-threaded test:
+namespace {
+
+static const int kNumThreads = 4;
+static const int kTestSeconds = 10;
+static const int kNumKeys = 1000;
+
+struct MTState {  // shared state for the MultiThreaded test
+  DBTest* test;  // owning fixture; provides db_ and the Put helper
+  port::AtomicPointer stop;  // non-null => worker threads should exit
+  port::AtomicPointer counter[kNumThreads];  // per-thread op counter, published via Release_Store
+  port::AtomicPointer thread_done[kNumThreads];  // set non-null once the thread's loop exits
+};
+
+struct MTThread {  // per-thread argument handed to MTThreadBody
+  MTState* state;  // shared test state
+  int id;          // index into state->counter / state->thread_done
+};
+
+static void MTThreadBody(void* arg) {  // worker: 50/50 mix of writes and verified reads until told to stop
+  MTThread* t = reinterpret_cast<MTThread*>(arg);
+  int id = t->id;
+  DB* db = t->state->test->db_;
+  uintptr_t counter = 0;
+  fprintf(stderr, "... starting thread %d\n", id);
+  Random rnd(1000 + id);
+  std::string value;
+  char valbuf[1500];
+  while (t->state->stop.Acquire_Load() == nullptr) {
+    t->state->counter[id].Release_Store(reinterpret_cast<void*>(counter));  // publish progress for readers
+
+    int key = rnd.Uniform(kNumKeys);
+    char keybuf[20];
+    snprintf(keybuf, sizeof(keybuf), "%016d", key);
+
+    if (rnd.OneIn(2)) {
+      // Write values of the form <key, my id, counter>.
+      // We add some padding for force compactions.
+      snprintf(valbuf, sizeof(valbuf), "%d.%d.%-1000d",
+               key, id, static_cast<int>(counter));
+      ASSERT_OK(t->state->test->Put(Slice(keybuf), Slice(valbuf)));
+    } else {
+      // Read a value and verify that it matches the pattern written above.
+      Status s = db->Get(ReadOptions(), Slice(keybuf), &value);
+      if (s.IsNotFound()) {
+        // Key has not yet been written
+      } else {
+        // Check that the writer thread counter is >= the counter in the value
+        ASSERT_OK(s);
+        int k, w, c;
+        ASSERT_EQ(3, sscanf(value.c_str(), "%d.%d.%d", &k, &w, &c)) << value;
+        ASSERT_EQ(k, key);
+        ASSERT_GE(w, 0);
+        ASSERT_LT(w, kNumThreads);
+        ASSERT_LE((unsigned int)c, reinterpret_cast<uintptr_t>(
+            t->state->counter[w].Acquire_Load()));  // writer's published counter only moves forward
+      }
+    }
+    counter++;
+  }
+  t->state->thread_done[id].Release_Store(t);  // signal completion; any non-null value works
+  fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter));
+}
+
+}  // namespace
+
+TEST(DBTest, MultiThreaded) {  // runs kNumThreads concurrent reader/writer threads for kTestSeconds
+  do {
+    // Initialize state
+    MTState mt;
+    mt.test = this;
+    mt.stop.Release_Store(0);
+    for (int id = 0; id < kNumThreads; id++) {
+      mt.counter[id].Release_Store(0);
+      mt.thread_done[id].Release_Store(0);
+    }
+
+    // Start threads
+    MTThread thread[kNumThreads];
+    for (int id = 0; id < kNumThreads; id++) {
+      thread[id].state = &mt;
+      thread[id].id = id;
+      env_->StartThread(MTThreadBody, &thread[id]);
+    }
+
+    // Let them run for a while
+    env_->SleepForMicroseconds(kTestSeconds * 1000000);
+
+    // Stop the threads and wait for them to finish
+    mt.stop.Release_Store(&mt);  // any non-null value means "stop"
+    for (int id = 0; id < kNumThreads; id++) {
+      while (mt.thread_done[id].Acquire_Load() == nullptr) {
+        env_->SleepForMicroseconds(100000);  // poll every 100ms
+      }
+    }
+  } while (ChangeOptions());
+}
+
+// Group commit test:
+namespace {
+
+static const int kGCNumThreads = 4;
+static const int kGCNumKeys = 1000;
+
+struct GCThread {  // argument/state for one group-commit writer thread
+  DB* db;
+  int id;                  // selects this thread's disjoint key range
+  std::atomic<bool> done;  // set by GCThreadBody when all keys are written
+};
+
+static void GCThreadBody(void* arg) {  // writes kGCNumKeys keys from this thread's disjoint range
+  GCThread* t = reinterpret_cast<GCThread*>(arg);
+  int id = t->id;
+  DB* db = t->db;
+  WriteOptions wo;
+
+  for (int i = 0; i < kGCNumKeys; ++i) {
+    std::string kv(std::to_string(i + id * kGCNumKeys));  // key == value, unique across threads
+    ASSERT_OK(db->Put(wo, kv, kv));
+  }
+  t->done = true;
+}
+
+}  // namespace
+
+TEST(DBTest, GroupCommitTest) {  // concurrent writers: every write must land exactly once
+  do {
+    // Start threads
+    GCThread thread[kGCNumThreads];
+    for (int id = 0; id < kGCNumThreads; id++) {
+      thread[id].id = id;
+      thread[id].db = db_;
+      thread[id].done = false;
+      env_->StartThread(GCThreadBody, &thread[id]);
+    }
+
+    for (int id = 0; id < kGCNumThreads; id++) {
+      while (thread[id].done == false) {
+        env_->SleepForMicroseconds(100000);  // poll every 100ms until the writer finishes
+      }
+    }
+
+    std::vector<std::string> expected_db;
+    for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) {
+      expected_db.push_back(std::to_string(i));
+    }
+    sort(expected_db.begin(), expected_db.end());  // iterator below yields keys in byte order
+
+    Iterator* itr = db_->NewIterator(ReadOptions());
+    itr->SeekToFirst();
+    for (auto x : expected_db) {
+      ASSERT_TRUE(itr->Valid());
+      ASSERT_EQ(itr->key().ToString(), x);
+      ASSERT_EQ(itr->value().ToString(), x);
+      itr->Next();
+    }
+    ASSERT_TRUE(!itr->Valid());  // no extra keys beyond the expected set
+    delete itr;
+
+  } while (ChangeOptions());
+}
+
+namespace {
+typedef std::map<std::string, std::string> KVMap;  // ordered in-memory model of DB contents
+}
+
+class ModelDB: public DB {  // in-memory reference DB backed by a std::map; used by the Randomized test
+ public:
+  class ModelSnapshot : public Snapshot {  // a snapshot is simply a full copy of the map
+   public:
+    KVMap map_;
+  };
+
+  explicit ModelDB(const Options& options): options_(options) { }
+  virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
+    return DB::Put(o, k, v);  // routed through the WriteBatch default, applied in Write() below
+  }
+  virtual Status Merge(const WriteOptions& o, const Slice& k, const Slice& v) {
+    return DB::Merge(o, k, v);
+  }
+  virtual Status Delete(const WriteOptions& o, const Slice& key) {
+    return DB::Delete(o, key);
+  }
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key, std::string* value) {
+    return Status::NotSupported(key);  // the Randomized test compares iterators, not point lookups
+  }
+
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values) {
+    std::vector<Status> s(keys.size(),
+                          Status::NotSupported("Not implemented."));
+    return s;
+  }
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           const Slice& key,
+                           std::string* value,
+                           bool* value_found = nullptr) {
+    if (value_found != nullptr) {
+      *value_found = false;
+    }
+    return true; // Not Supported directly
+  }
+  virtual Iterator* NewIterator(const ReadOptions& options) {
+    if (options.snapshot == nullptr) {
+      KVMap* saved = new KVMap;  // copy of current state; ModelIter takes ownership
+      *saved = map_;
+      return new ModelIter(saved, true);
+    } else {
+      const KVMap* snapshot_state =
+          &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
+      return new ModelIter(snapshot_state, false);  // snapshot owns the map; iterator borrows it
+    }
+  }
+  virtual const Snapshot* GetSnapshot() {
+    ModelSnapshot* snapshot = new ModelSnapshot;
+    snapshot->map_ = map_;  // deep copy freezes the current state
+    return snapshot;
+  }
+
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) {
+    delete reinterpret_cast<const ModelSnapshot*>(snapshot);
+  }
+  virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
+    class Handler : public WriteBatch::Handler {  // applies each batch entry directly to the map
+     public:
+      KVMap* map_;
+      virtual void Put(const Slice& key, const Slice& value) {
+        (*map_)[key.ToString()] = value.ToString();
+      }
+      virtual void Merge(const Slice& key, const Slice& value) {
+        // ignore merge for now
+        //(*map_)[key.ToString()] = value.ToString();
+      }
+      virtual void Delete(const Slice& key) {
+        map_->erase(key.ToString());
+      }
+    };
+    Handler handler;
+    handler.map_ = &map_;
+    return batch->Iterate(&handler);
+  }
+
+  // The remaining DB entry points are stubs returning fixed values; they
+  // keep the harness happy but carry no model semantics.
+  virtual bool GetProperty(const Slice& property, std::string* value) {
+    return false;
+  }
+  virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) {
+    for (int i = 0; i < n; i++) {
+      sizes[i] = 0;
+    }
+  }
+  virtual void CompactRange(const Slice* start, const Slice* end,
+                            bool reduce_level, int target_level) {
+  }
+
+  virtual int NumberLevels()
+  {
+  return 1;
+  }
+
+  virtual int MaxMemCompactionLevel()
+  {
+  return 1;
+  }
+
+  virtual int Level0StopWriteTrigger()
+  {
+  return -1;
+  }
+
+  virtual const std::string& GetName() const {
+    return name_;
+  }
+
+  virtual Env* GetEnv() const {
+    return nullptr;
+  }
+
+  virtual const Options& GetOptions() const {
+    return options_;
+  }
+
+  virtual Status Flush(const rocksdb::FlushOptions& options) {
+    Status ret;
+    return ret;
+  }
+
+  virtual Status DisableFileDeletions() {
+    return Status::OK();
+  }
+  virtual Status EnableFileDeletions(bool force) {
+    return Status::OK();
+  }
+  virtual Status GetLiveFiles(std::vector<std::string>&, uint64_t* size,
+                              bool flush_memtable = true) {
+    return Status::OK();
+  }
+
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) {
+    return Status::OK();
+  }
+
+  virtual Status DeleteFile(std::string name) {
+    return Status::OK();
+  }
+
+  virtual Status GetDbIdentity(std::string& identity) {
+    return Status::OK();
+  }
+
+  virtual SequenceNumber GetLatestSequenceNumber() const {
+    return 0;
+  }
+  virtual Status GetUpdatesSince(rocksdb::SequenceNumber,
+                                 unique_ptr<rocksdb::TransactionLogIterator>*) {
+    return Status::NotSupported("Not supported in Model DB");
+  }
+
+ private:
+  class ModelIter: public Iterator {  // iterates over a KVMap, optionally owning it
+   public:
+    ModelIter(const KVMap* map, bool owned)
+        : map_(map), owned_(owned), iter_(map_->end()) {
+    }
+    ~ModelIter() {
+      if (owned_) delete map_;
+    }
+    virtual bool Valid() const { return iter_ != map_->end(); }
+    virtual void SeekToFirst() { iter_ = map_->begin(); }
+    virtual void SeekToLast() {
+      if (map_->empty()) {
+        iter_ = map_->end();
+      } else {
+        iter_ = map_->find(map_->rbegin()->first);
+      }
+    }
+    virtual void Seek(const Slice& k) {
+      iter_ = map_->lower_bound(k.ToString());  // first key >= k, matching DB seek semantics
+    }
+    virtual void Next() { ++iter_; }
+    virtual void Prev() { --iter_; }
+    virtual Slice key() const { return iter_->first; }
+    virtual Slice value() const { return iter_->second; }
+    virtual Status status() const { return Status::OK(); }
+   private:
+    const KVMap* const map_;
+    const bool owned_;  // Do we own map_
+    KVMap::const_iterator iter_;
+  };
+  const Options options_;
+  KVMap map_;  // current (non-snapshot) database state
+  std::string name_ = "";
+};
+
+static std::string RandomKey(Random* rnd, int minimum = 0) {  // random key with skewed length, at least |minimum| bytes
+  int len;
+  do {
+    len = (rnd->OneIn(3)
+           ? 1                // Short sometimes to encourage collisions
+           : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
+  } while (len < minimum);  // re-draw until long enough
+  return test::RandomKey(rnd, len);
+}
+
+static bool CompareIterators(int step,
+                             DB* model,
+                             DB* db,
+                             const Snapshot* model_snap,
+                             const Snapshot* db_snap) {
+  ReadOptions options;
+  options.snapshot = model_snap;
+  Iterator* miter = model->NewIterator(options);
+  options.snapshot = db_snap;
+  Iterator* dbiter = db->NewIterator(options);
+  bool ok = true;
+  int count = 0;
+  for (miter->SeekToFirst(), dbiter->SeekToFirst();
+       ok && miter->Valid() && dbiter->Valid();
+       miter->Next(), dbiter->Next()) {
+    count++;
+    if (miter->key().compare(dbiter->key()) != 0) {
+      fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n",
+              step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(dbiter->key()).c_str());
+      ok = false;
+      break;
+    }
+
+    if (miter->value().compare(dbiter->value()) != 0) {
+      fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
+              step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(miter->value()).c_str(),
+              EscapeString(miter->value()).c_str());
+      ok = false;
+    }
+  }
+
+  if (ok) {
+    if (miter->Valid() != dbiter->Valid()) {
+      fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
+              step, miter->Valid(), dbiter->Valid());
+      ok = false;
+    }
+  }
+  delete miter;
+  delete dbiter;
+  return ok;
+}
+
+TEST(DBTest, Randomized) {  // fuzz test: db_ must stay identical to the in-memory ModelDB
+  Random rnd(test::RandomSeed());
+  do {
+    ModelDB model(CurrentOptions());
+    const int N = 10000;
+    const Snapshot* model_snap = nullptr;
+    const Snapshot* db_snap = nullptr;
+    std::string k, v;
+    for (int step = 0; step < N; step++) {
+      // TODO(sanjay): Test Get() works
+      int p = rnd.Uniform(100);
+      int minimum = 0;
+      if (option_config_ == kHashSkipList) {
+        minimum = 1;  // NOTE(review): presumably empty keys don't work with prefix hashing -- confirm
+      }
+      if (p < 45) {                               // Put
+        k = RandomKey(&rnd, minimum);
+        v = RandomString(&rnd,
+                         rnd.OneIn(20)
+                         ? 100 + rnd.Uniform(100)
+                         : rnd.Uniform(8));
+        ASSERT_OK(model.Put(WriteOptions(), k, v));
+        ASSERT_OK(db_->Put(WriteOptions(), k, v));
+
+      } else if (p < 90) {                        // Delete
+        k = RandomKey(&rnd, minimum);
+        ASSERT_OK(model.Delete(WriteOptions(), k));
+        ASSERT_OK(db_->Delete(WriteOptions(), k));
+
+
+      } else {                                    // Multi-element batch
+        WriteBatch b;
+        const int num = rnd.Uniform(8);
+        for (int i = 0; i < num; i++) {
+          if (i == 0 || !rnd.OneIn(10)) {
+            k = RandomKey(&rnd, minimum);
+          } else {
+            // Periodically re-use the same key from the previous iter, so
+            // we have multiple entries in the write batch for the same key
+          }
+          if (rnd.OneIn(2)) {
+            v = RandomString(&rnd, rnd.Uniform(10));
+            b.Put(k, v);
+          } else {
+            b.Delete(k);
+          }
+        }
+        ASSERT_OK(model.Write(WriteOptions(), &b));
+        ASSERT_OK(db_->Write(WriteOptions(), &b));
+      }
+
+      if ((step % 100) == 0) {
+        ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+        ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+        // Save a snapshot from each DB this time that we'll use next
+        // time we compare things, to make sure the current state is
+        // preserved with the snapshot
+        if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+        if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+
+        Reopen();  // also verify the state survives a reopen
+        ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+
+        model_snap = model.GetSnapshot();
+        db_snap = db_->GetSnapshot();
+      }
+    }
+    if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+    if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+  } while (ChangeOptions(kSkipDeletesFilterFirst));
+}
+
+TEST(DBTest, MultiGetSimple) {  // per-key statuses: found keys, a deleted key, and a never-written key
+  do {
+    ASSERT_OK(db_->Put(WriteOptions(),"k1","v1"));
+    ASSERT_OK(db_->Put(WriteOptions(),"k2","v2"));
+    ASSERT_OK(db_->Put(WriteOptions(),"k3","v3"));
+    ASSERT_OK(db_->Put(WriteOptions(),"k4","v4"));
+    ASSERT_OK(db_->Delete(WriteOptions(),"k4"));  // k4 should report NotFound below
+    ASSERT_OK(db_->Put(WriteOptions(),"k5","v5"));
+    ASSERT_OK(db_->Delete(WriteOptions(),"no_key"));  // deleting a non-existent key succeeds
+
+    std::vector<Slice> keys(6);
+    keys[0] = "k1";
+    keys[1] = "k2";
+    keys[2] = "k3";
+    keys[3] = "k4";
+    keys[4] = "k5";
+    keys[5] = "no_key";
+
+    std::vector<std::string> values(20,"Temporary data to be overwritten");  // oversized on purpose; MultiGet must resize to keys.size()
+
+    std::vector<Status> s =  db_->MultiGet(ReadOptions(),keys,&values);
+    ASSERT_EQ(values.size(),keys.size());
+    ASSERT_EQ(values[0], "v1");
+    ASSERT_EQ(values[1], "v2");
+    ASSERT_EQ(values[2], "v3");
+    ASSERT_EQ(values[4], "v5");
+
+    ASSERT_OK(s[0]);
+    ASSERT_OK(s[1]);
+    ASSERT_OK(s[2]);
+    ASSERT_TRUE(s[3].IsNotFound());  // deleted
+    ASSERT_OK(s[4]);
+    ASSERT_TRUE(s[5].IsNotFound());  // never written
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, MultiGetEmpty) {  // MultiGet edge cases: empty key set and empty database
+  do {
+    // Empty Key Set
+    std::vector<Slice> keys;
+    std::vector<std::string> values;
+    std::vector<Status> s = db_->MultiGet(ReadOptions(),keys,&values);
+    ASSERT_EQ((int)s.size(),0);
+
+    // Empty Database, Empty Key Set
+    DestroyAndReopen();
+    s = db_->MultiGet(ReadOptions(), keys, &values);
+    ASSERT_EQ((int)s.size(),0);
+
+    // Empty Database, Search for Keys
+    keys.resize(2);
+    keys[0] = "a";
+    keys[1] = "b";
+    s = db_->MultiGet(ReadOptions(),keys,&values);
+    ASSERT_EQ((int)s.size(), 2);
+    ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound());  // both missing from an empty DB
+  } while (ChangeCompactOptions());
+}
+
+void PrefixScanInit(DBTest *dbtest) {  // lays out the 11-sst-file arrangement described below
+  char buf[100];
+  std::string keystr;
+  const int small_range_sstfiles = 5;
+  const int big_range_sstfiles = 5;
+
+  // Generate 11 sst files with the following prefix ranges.
+  // GROUP 0: [0,10]                              (level 1)
+  // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6]  (level 0)
+  // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10]  (level 0)
+  //
+  // A seek with the previous API would do 11 random I/Os (to all the
+  // files).  With the new API and a prefix filter enabled, we should
+  // only do 2 random I/O, to the 2 files containing the key.
+
+  // GROUP 0
+  snprintf(buf, sizeof(buf), "%02d______:start", 0);
+  keystr = std::string(buf);
+  ASSERT_OK(dbtest->Put(keystr, keystr));
+  snprintf(buf, sizeof(buf), "%02d______:end", 10);
+  keystr = std::string(buf);
+  ASSERT_OK(dbtest->Put(keystr, keystr));
+  dbtest->dbfull()->TEST_FlushMemTable();
+  dbtest->dbfull()->CompactRange(nullptr, nullptr); // move to level 1
+
+  // GROUP 1
+  for (int i = 1; i <= small_range_sstfiles; i++) {
+    snprintf(buf, sizeof(buf), "%02d______:start", i);
+    keystr = std::string(buf);
+    ASSERT_OK(dbtest->Put(keystr, keystr));
+    snprintf(buf, sizeof(buf), "%02d______:end", i+1);
+    keystr = std::string(buf);
+    ASSERT_OK(dbtest->Put(keystr, keystr));
+    dbtest->dbfull()->TEST_FlushMemTable();  // one sst file per [i, i+1] range
+  }
+
+  // GROUP 2
+  for (int i = 1; i <= big_range_sstfiles; i++) {
+    std::string keystr;  // NOTE(review): shadows the outer keystr; harmless but could be removed
+    snprintf(buf, sizeof(buf), "%02d______:start", 0);
+    keystr = std::string(buf);
+    ASSERT_OK(dbtest->Put(keystr, keystr));
+    snprintf(buf, sizeof(buf), "%02d______:end",
+             small_range_sstfiles+i+1);
+    keystr = std::string(buf);
+    ASSERT_OK(dbtest->Put(keystr, keystr));
+    dbtest->dbfull()->TEST_FlushMemTable();  // one widening [0, 5+i+1] file per iteration
+  }
+}
+
+TEST(DBTest, PrefixScan) {  // with a prefix bloom, a prefix scan should read only the 2 relevant files
+  ReadOptions ro = ReadOptions();
+  int count;
+  Slice prefix;
+  Slice key;
+  char buf[100];
+  Iterator* iter;
+  snprintf(buf, sizeof(buf), "03______:");
+  prefix = Slice(buf, 8);  // "03______" -- the 8-byte prefix
+  key = Slice(buf, 9);     // the prefix plus ':'
+  auto prefix_extractor = NewFixedPrefixTransform(8);
+  // db configs
+  env_->count_random_reads_ = true;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.no_block_cache = true;  // every block read must hit the env so the counter is accurate
+  options.filter_policy =  NewBloomFilterPolicy(10);  // freed at the bottom of this test
+  options.prefix_extractor = prefix_extractor;
+  options.whole_key_filtering = false;
+  options.disable_auto_compactions = true;  // keep the file layout built by PrefixScanInit intact
+  options.max_background_compactions = 2;
+  options.create_if_missing = true;
+  options.disable_seek_compaction = true;
+  options.memtable_factory.reset(NewHashSkipListRepFactory(prefix_extractor));
+
+  // prefix specified, with blooms: 2 RAND I/Os
+  // SeekToFirst
+  DestroyAndReopen(&options);
+  PrefixScanInit(this);
+  count = 0;
+  env_->random_read_counter_.Reset();
+  ro.prefix = &prefix;
+  iter = db_->NewIterator(ro);
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    assert(iter->key().starts_with(prefix));
+    count++;
+  }
+  ASSERT_OK(iter->status());
+  delete iter;
+  ASSERT_EQ(count, 2);
+  ASSERT_EQ(env_->random_read_counter_.Read(), 2);
+
+  // prefix specified, with blooms: 2 RAND I/Os
+  // Seek
+  DestroyAndReopen(&options);
+  PrefixScanInit(this);
+  count = 0;
+  env_->random_read_counter_.Reset();
+  ro.prefix = &prefix;
+  iter = db_->NewIterator(ro);
+  for (iter->Seek(key); iter->Valid(); iter->Next()) {
+    assert(iter->key().starts_with(prefix));
+    count++;
+  }
+  ASSERT_OK(iter->status());
+  delete iter;
+  ASSERT_EQ(count, 2);
+  ASSERT_EQ(env_->random_read_counter_.Read(), 2);
+
+  // no prefix specified: 11 RAND I/Os
+  DestroyAndReopen(&options);
+  PrefixScanInit(this);
+  count = 0;
+  env_->random_read_counter_.Reset();
+  iter = db_->NewIterator(ReadOptions());
+  for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
+    if (! iter->key().starts_with(prefix)) {
+      break;
+    }
+    count++;
+  }
+  ASSERT_OK(iter->status());
+  delete iter;
+  ASSERT_EQ(count, 2);
+  ASSERT_EQ(env_->random_read_counter_.Read(), 11);  // every file is touched without the prefix filter
+  Close();
+  delete options.filter_policy;  // owned by this test, not the DB
+}
+
+std::string MakeKey(unsigned int num) {  // 16-digit zero-padded decimal: numeric order == byte order
+  char buf[30];
+  snprintf(buf, sizeof(buf), "%016u", num);
+  return std::string(buf);
+}
+
+void BM_LogAndApply(int iters, int num_base_files) {  // benchmark manifest writes (VersionSet::LogAndApply)
+  std::string dbname = test::TmpDir() + "/rocksdb_test_benchmark";
+  ASSERT_OK(DestroyDB(dbname, Options()));
+
+  DB* db = nullptr;
+  Options opts;
+  opts.create_if_missing = true;
+  Status s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  delete db;  // Open was only needed to lay down a fresh DB directory
+  db = nullptr;
+
+  Env* env = Env::Default();
+
+  port::Mutex mu;
+  MutexLock l(&mu);  // held throughout; LogAndApply below takes &mu
+
+  InternalKeyComparator cmp(BytewiseComparator());
+  Options options;
+  EnvOptions sopt;
+  VersionSet vset(dbname, &options, sopt, nullptr, &cmp);
+  ASSERT_OK(vset.Recover());
+  VersionEdit vbase;
+  uint64_t fnum = 1;
+  for (int i = 0; i < num_base_files; i++) {  // seed level 2 with num_base_files disjoint files
+    InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
+    InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
+    vbase.AddFile(2, fnum++, 1 /* file size */, start, limit, 1, 1);
+  }
+  ASSERT_OK(vset.LogAndApply(&vbase, &mu));
+
+  uint64_t start_micros = env->NowMicros();
+
+  for (int i = 0; i < iters; i++) {  // timed region: one delete+add edit per iteration
+    VersionEdit vedit;
+    vedit.DeleteFile(2, fnum);
+    InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
+    InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
+    vedit.AddFile(2, fnum++, 1 /* file size */, start, limit, 1, 1);
+    vset.LogAndApply(&vedit, &mu);
+  }
+  uint64_t stop_micros = env->NowMicros();
+  unsigned int us = stop_micros - start_micros;
+  char buf[16];
+  snprintf(buf, sizeof(buf), "%d", num_base_files);
+  fprintf(stderr,
+          "BM_LogAndApply/%-6s   %8d iters : %9u us (%7.0f us / iter)\n",
+          buf, iters, us, ((float)us) / iters);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  if (argc > 1 && std::string(argv[1]) == "--benchmark") {  // benchmark mode: time LogAndApply at several base-file counts
+    rocksdb::BM_LogAndApply(1000, 1);
+    rocksdb::BM_LogAndApply(1000, 100);
+    rocksdb::BM_LogAndApply(1000, 10000);
+    rocksdb::BM_LogAndApply(100, 100000);
+    return 0;
+  }
+
+  return rocksdb::test::RunAllTests();  // default: run the full test suite
+}
diff --git a/db/dbformat.cc b/db/dbformat.cc
new file mode 100644 (file)
index 0000000..3d7e610
--- /dev/null
@@ -0,0 +1,147 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/perf_context_imp.h"
+
+namespace rocksdb {
+
+static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {  // encode (seq, type) into the 64-bit internal-key trailer
+  assert(seq <= kMaxSequenceNumber);  // seq must fit in the upper 56 bits
+  assert(t <= kValueTypeForSeek);     // type occupies the low 8 bits
+  return (seq << 8) | t;
+}
+
+void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {  // serialize as user_key + 8-byte (seq,type) trailer
+  result->append(key.user_key.data(), key.user_key.size());
+  PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
+}
+
+std::string ParsedInternalKey::DebugString(bool hex) const {  // renders as 'user_key' @ sequence : type
+  char buf[50];
+  snprintf(buf, sizeof(buf), "' @ %llu : %d",
+           (unsigned long long) sequence,
+           int(type));
+  std::string result = "'";
+  result += user_key.ToString(hex);  // hex=true prints the user key as hex bytes
+  result += buf;
+  return result;
+}
+
+std::string InternalKey::DebugString(bool hex) const {  // parsed form if rep_ is well-formed, "(bad)" + escaped bytes otherwise
+  std::string result;
+  ParsedInternalKey parsed;
+  if (ParseInternalKey(rep_, &parsed)) {
+    result = parsed.DebugString(hex);
+  } else {
+    result = "(bad)";
+    result.append(EscapeString(rep_));  // fall back to raw escaped representation
+  }
+  return result;
+}
+
+const char* InternalKeyComparator::Name() const {
+  return name_.c_str();  // returns the cached comparator name
+}
+
+int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  //    decreasing type (though sequence# should be enough to disambiguate)
+  int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+  BumpPerfCount(&perf_context.user_key_comparison_count);
+  if (r == 0) {
+    const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);  // packed (seq<<8)|type trailer
+    const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
+    if (anum > bnum) {
+      r = -1;  // larger sequence sorts earlier (newer entries first)
+    } else if (anum < bnum) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+void InternalKeyComparator::FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+  // Attempt to shorten the user portion of the key
+  Slice user_start = ExtractUserKey(*start);
+  Slice user_limit = ExtractUserKey(limit);
+  std::string tmp(user_start.data(), user_start.size());
+  user_comparator_->FindShortestSeparator(&tmp, user_limit);
+  if (tmp.size() < user_start.size() &&
+      user_comparator_->Compare(user_start, tmp) < 0) {
+    // User key has become shorter physically, but larger logically.
+    // Tack on the earliest possible number to the shortened user key.
+    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
+    assert(this->Compare(*start, tmp) < 0);  // still a valid separator: start < tmp < limit
+    assert(this->Compare(tmp, limit) < 0);
+    start->swap(tmp);
+  }
+}
+
+void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
+  Slice user_key = ExtractUserKey(*key);
+  std::string tmp(user_key.data(), user_key.size());
+  user_comparator_->FindShortSuccessor(&tmp);
+  if (tmp.size() < user_key.size() &&
+      user_comparator_->Compare(user_key, tmp) < 0) {
+    // User key has become shorter physically, but larger logically.
+    // Tack on the earliest possible number to the shortened user key.
+    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));  // max seq sorts first per Compare()
+    assert(this->Compare(*key, tmp) < 0);
+    key->swap(tmp);
+  }
+}
+
+const char* InternalFilterPolicy::Name() const {
+  return user_policy_->Name();  // expose the wrapped user policy's name
+}
+
+void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
+                                        std::string* dst) const {
+  // We rely on the fact that the code in table.cc does not mind us
+  // adjusting keys[].
+  Slice* mkey = const_cast<Slice*>(keys);  // strip the trailers in place to avoid copying
+  for (int i = 0; i < n; i++) {
+    mkey[i] = ExtractUserKey(keys[i]);
+    // TODO(sanjay): Suppress dups?
+  }
+  user_policy_->CreateFilter(keys, n, dst);  // build the filter over user keys only
+}
+
+bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
+  return user_policy_->KeyMayMatch(ExtractUserKey(key), f);  // strip the 8-byte trailer before consulting the user policy
+}
+
+LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
+  // Layout: varint32(usize+8) | user_key | 8-byte (seq, kValueTypeForSeek).
+  size_t usize = user_key.size();
+  size_t needed = usize + 13;  // A conservative estimate
+  char* dst;
+  if (needed <= sizeof(space_)) {
+    dst = space_;  // small keys use the inline buffer, avoiding an allocation
+  } else {
+    dst = new char[needed];
+  }
+  start_ = dst;
+  dst = EncodeVarint32(dst, usize + 8);
+  kstart_ = dst;  // kstart_ points just past the length prefix (start of the internal key)
+  memcpy(dst, user_key.data(), usize);
+  dst += usize;
+  EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
+  dst += 8;
+  end_ = dst;
+}
+
+}  // namespace rocksdb
diff --git a/db/dbformat.h b/db/dbformat.h
new file mode 100644 (file)
index 0000000..64a2c9f
--- /dev/null
@@ -0,0 +1,229 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdio.h>
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/table.h"
+#include "rocksdb/types.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+class InternalKey;
+
// Value types encoded as the last component of internal keys.
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
// data structures.
enum ValueType {
  kTypeDeletion = 0x0,
  kTypeValue = 0x1,
  kTypeMerge = 0x2,
  kTypeLogData = 0x3
};
// kValueTypeForSeek defines the ValueType that should be passed when
// constructing a ParsedInternalKey object for seeking to a particular
// sequence number (since we sort sequence numbers in decreasing order
// and the value type is embedded as the low 8 bits in the sequence
// number in internal keys, we need to use the highest-numbered
// ValueType, not the lowest).
// NOTE(review): ParseInternalKey (below) rejects any internal key whose
// type byte exceeds kValueTypeForSeek, so this must track the largest
// enumerator used in keys.
static const ValueType kValueTypeForSeek = kTypeMerge;

// We leave eight bits empty at the bottom so a type and sequence#
// can be packed together into 64-bits.
static const SequenceNumber kMaxSequenceNumber =
    ((0x1ull << 56) - 1);
+
// Decoded form of an internal key: (user key, sequence number, value type).
struct ParsedInternalKey {
  Slice user_key;      // points into the encoded key; not owned
  SequenceNumber sequence;
  ValueType type;

  ParsedInternalKey() { }  // Intentionally left uninitialized (for speed)
  ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
      : user_key(u), sequence(seq), type(t) { }
  std::string DebugString(bool hex = false) const;
};
+
// Return the length of the encoding of "key": the user key plus the
// 8-byte fixed64 (sequence, type) tag.
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
  return key.user_key.size() + 8;
}

// Append the serialization of "key" to *result.
extern void AppendInternalKey(std::string* result,
                              const ParsedInternalKey& key);

// Attempt to parse an internal key from "internal_key".  On success,
// stores the parsed data in "*result", and returns true.
//
// On error, returns false, leaves "*result" in an undefined state.
extern bool ParseInternalKey(const Slice& internal_key,
                             ParsedInternalKey* result);

// Returns the user key portion of an internal key (everything before
// the trailing 8-byte tag).
inline Slice ExtractUserKey(const Slice& internal_key) {
  assert(internal_key.size() >= 8);
  return Slice(internal_key.data(), internal_key.size() - 8);
}
+
+inline ValueType ExtractValueType(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  const size_t n = internal_key.size();
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  unsigned char c = num & 0xff;
+  return static_cast<ValueType>(c);
+}
+
// A comparator for internal keys that uses a specified comparator for
// the user key portion and breaks ties by decreasing sequence number.
class InternalKeyComparator : public Comparator {
 private:
  const Comparator* user_comparator_;  // not owned
  std::string name_;                   // "rocksdb.InternalKeyComparator:<user name>"
 public:
  explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c),
    name_("rocksdb.InternalKeyComparator:" +
          std::string(user_comparator_->Name())) {
  }

  // Comparator interface; defined in dbformat.cc.
  virtual const char* Name() const;
  virtual int Compare(const Slice& a, const Slice& b) const;
  virtual void FindShortestSeparator(
      std::string* start,
      const Slice& limit) const;
  virtual void FindShortSuccessor(std::string* key) const;

  const Comparator* user_comparator() const { return user_comparator_; }

  // Convenience overload for wrapped keys; forwards to the Slice overload.
  int Compare(const InternalKey& a, const InternalKey& b) const;
};
+
// Filter policy wrapper that converts from internal keys to user keys
// before delegating to the wrapped user policy.
class InternalFilterPolicy : public FilterPolicy {
 private:
  const FilterPolicy* const user_policy_;  // not owned
 public:
  explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
  virtual const char* Name() const;
  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
};
+
// Modules in this directory should keep internal keys wrapped inside
// the following class instead of plain strings so that we do not
// incorrectly use string comparisons instead of an InternalKeyComparator.
class InternalKey {
 private:
  std::string rep_;  // encoded internal key; empty means "invalid"
 public:
  InternalKey() { }   // Leave rep_ as empty to indicate it is invalid
  InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
    AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
  }

  void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
  // Returns the raw encoding; must not be called on an invalid (empty) key.
  Slice Encode() const {
    assert(!rep_.empty());
    return rep_;
  }

  Slice user_key() const { return ExtractUserKey(rep_); }

  // Replace this key with the encoding of "p".
  void SetFrom(const ParsedInternalKey& p) {
    rep_.clear();
    AppendInternalKey(&rep_, p);
  }

  // Marks the key invalid again (see default constructor).
  void Clear() { rep_.clear(); }

  std::string DebugString(bool hex = false) const;
};
+
// Compare wrapped keys by delegating to the Slice overload on their encodings.
inline int InternalKeyComparator::Compare(
    const InternalKey& a, const InternalKey& b) const {
  return Compare(a.Encode(), b.Encode());
}
+
+inline bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result) {
+  const size_t n = internal_key.size();
+  if (n < 8) return false;
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  unsigned char c = num & 0xff;
+  result->sequence = num >> 8;
+  result->type = static_cast<ValueType>(c);
+  result->user_key = Slice(internal_key.data(), n - 8);
+  return (c <= static_cast<unsigned char>(kValueTypeForSeek));
+}
+
+// Update the sequence number in the internal key
+inline void UpdateInternalKey(char* internal_key,
+                              const size_t internal_key_size,
+                              uint64_t seq, ValueType t) {
+  assert(internal_key_size >= 8);
+  char* seqtype = internal_key + internal_key_size - 8;
+  uint64_t newval = (seq << 8) | t;
+  EncodeFixed64(seqtype, newval);
+}
+
+// Get the sequence number from the internal key
+inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
+  const size_t n = internal_key.size();
+  assert(n >= 8);
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  return num >> 8;
+}
+
+
// A helper class useful for DBImpl::Get()
class LookupKey {
 public:
  // Initialize *this for looking up user_key at a snapshot with
  // the specified sequence number.
  LookupKey(const Slice& user_key, SequenceNumber sequence);

  // Frees the heap buffer if the inline space_ was too small (see below).
  ~LookupKey();

  // Return a key suitable for lookup in a MemTable.
  Slice memtable_key() const { return Slice(start_, end_ - start_); }

  // Return an internal key (suitable for passing to an internal iterator)
  Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }

  // Return the user key (the internal key minus the trailing 8-byte tag).
  Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }

 private:
  // We construct a char array of the form:
  //    klength  varint32               <-- start_
  //    userkey  char[klength]          <-- kstart_
  //    tag      uint64
  //                                    <-- end_
  // The array is a suitable MemTable key.
  // The suffix starting with "userkey" can be used as an InternalKey.
  const char* start_;
  const char* kstart_;
  const char* end_;
  char space_[200];      // Avoid allocation for short keys

  // No copying allowed
  LookupKey(const LookupKey&);
  void operator=(const LookupKey&);
};
+
// Releases the heap buffer allocated by the constructor for long keys;
// start_ == space_ means the inline buffer was used and nothing is owned.
inline LookupKey::~LookupKey() {
  if (start_ != space_) delete[] start_;
}
+
+}  // namespace rocksdb
diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc
new file mode 100644 (file)
index 0000000..b520f3c
--- /dev/null
@@ -0,0 +1,117 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/dbformat.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+static std::string IKey(const std::string& user_key,
+                        uint64_t seq,
+                        ValueType vt) {
+  std::string encoded;
+  AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
+  return encoded;
+}
+
// Apply InternalKeyComparator::FindShortestSeparator to s with limit l
// (bytewise user comparator) and return the result.
static std::string Shorten(const std::string& s, const std::string& l) {
  std::string result = s;
  InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
  return result;
}

// Apply InternalKeyComparator::FindShortSuccessor to s and return the result.
static std::string ShortSuccessor(const std::string& s) {
  std::string result = s;
  InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
  return result;
}
+
// Round-trip check: encode (key, seq, vt), parse it back, and verify all
// three components; also verify that a malformed key fails to parse.
static void TestKey(const std::string& key,
                    uint64_t seq,
                    ValueType vt) {
  std::string encoded = IKey(key, seq, vt);

  Slice in(encoded);
  ParsedInternalKey decoded("", 0, kTypeValue);

  ASSERT_TRUE(ParseInternalKey(in, &decoded));
  ASSERT_EQ(key, decoded.user_key.ToString());
  ASSERT_EQ(seq, decoded.sequence);
  ASSERT_EQ(vt, decoded.type);

  // "bar" is shorter than the 8-byte tag, so parsing must fail.
  ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
}
+
// Empty fixture required by the TEST macro.
class FormatTest { };

// Exercise encode/parse round-trips across key lengths and sequence
// numbers chosen around byte boundaries (2^8, 2^16, 2^32).
TEST(FormatTest, InternalKey_EncodeDecode) {
  const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
  const uint64_t seq[] = {
    1, 2, 3,
    (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
    (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
    (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
  };
  for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
    for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
      TestKey(keys[k], seq[s], kTypeValue);
      TestKey("hello", 1, kTypeDeletion);
    }
  }
}
+
// FindShortestSeparator must only shorten when the user keys are
// distinct, correctly ordered, and neither is a prefix of the other;
// in every other case the start key must come back unchanged.
TEST(FormatTest, InternalKeyShortSeparator) {
  // When user keys are same
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 99, kTypeValue)));
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 101, kTypeValue)));
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 100, kTypeValue)));
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 100, kTypeDeletion)));

  // When user keys are misordered
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("bar", 99, kTypeValue)));

  // When user keys are different, but correctly ordered
  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("hello", 200, kTypeValue)));

  // When start user key is prefix of limit user key
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foobar", 200, kTypeValue)));

  // When limit user key is prefix of start user key
  ASSERT_EQ(IKey("foobar", 100, kTypeValue),
            Shorten(IKey("foobar", 100, kTypeValue),
                    IKey("foo", 200, kTypeValue)));
}
+
// FindShortSuccessor shortens "foo" to "g" (with the max tag) but must
// leave an all-0xff key unchanged, since it has no short successor.
TEST(FormatTest, InternalKeyShortestSuccessor) {
  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
            ShortSuccessor(IKey("foo", 100, kTypeValue)));
  ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
            ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
}
+
+}  // namespace rocksdb
+
// Entry point: run every test registered via the TEST macro.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc
new file mode 100644 (file)
index 0000000..14f0324
--- /dev/null
@@ -0,0 +1,295 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/db.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include <vector>
+#include <stdlib.h>
+#include <map>
+#include <string>
+
+namespace rocksdb {
+
+class DeleteFileTest {
+ public:
+  std::string dbname_;
+  Options options_;
+  DB* db_;
+  Env* env_;
+  int numlevels_;
+
+  DeleteFileTest() {
+    db_ = nullptr;
+    env_ = Env::Default();
+    options_.write_buffer_size = 1024*1024*1000;
+    options_.target_file_size_base = 1024*1024*1000;
+    options_.max_bytes_for_level_base = 1024*1024*1000;
+    options_.WAL_ttl_seconds = 300; // Used to test log files
+    options_.WAL_size_limit_MB = 1024; // Used to test log files
+    dbname_ = test::TmpDir() + "/deletefile_test";
+    options_.wal_dir = dbname_ + "/wal_files";
+
+    // clean up all the files that might have been there before
+    std::vector<std::string> old_files;
+    env_->GetChildren(dbname_, &old_files);
+    for (auto file : old_files) {
+      env_->DeleteFile(dbname_ + "/" + file);
+    }
+    env_->GetChildren(options_.wal_dir, &old_files);
+    for (auto file : old_files) {
+      env_->DeleteFile(options_.wal_dir + "/" + file);
+    }
+
+    DestroyDB(dbname_, options_);
+    numlevels_ = 7;
+    ASSERT_OK(ReopenDB(true));
+  }
+
+  Status ReopenDB(bool create) {
+    delete db_;
+    if (create) {
+      DestroyDB(dbname_, options_);
+    }
+    db_ = nullptr;
+    options_.create_if_missing = create;
+    return DB::Open(options_, dbname_, &db_);
+  }
+
+  void CloseDB() {
+    delete db_;
+  }
+
+  void AddKeys(int numkeys, int startkey = 0) {
+    WriteOptions options;
+    options.sync = false;
+    ReadOptions roptions;
+    for (int i = startkey; i < (numkeys + startkey) ; i++) {
+      std::string temp = std::to_string(i);
+      Slice key(temp);
+      Slice value(temp);
+      ASSERT_OK(db_->Put(options, key, value));
+    }
+  }
+
+  int numKeysInLevels(
+    std::vector<LiveFileMetaData> &metadata,
+    std::vector<int> *keysperlevel = nullptr) {
+
+    if (keysperlevel != nullptr) {
+      keysperlevel->resize(numlevels_);
+    }
+
+    int numKeys = 0;
+    for (size_t i = 0; i < metadata.size(); i++) {
+      int startkey = atoi(metadata[i].smallestkey.c_str());
+      int endkey = atoi(metadata[i].largestkey.c_str());
+      int numkeysinfile = (endkey - startkey + 1);
+      numKeys += numkeysinfile;
+      if (keysperlevel != nullptr) {
+        (*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
+      }
+      fprintf(stderr, "level %d name %s smallest %s largest %s\n",
+              metadata[i].level, metadata[i].name.c_str(),
+              metadata[i].smallestkey.c_str(),
+              metadata[i].largestkey.c_str());
+    }
+    return numKeys;
+  }
+
+  void CreateTwoLevels() {
+    AddKeys(50000, 10000);
+    DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+    ASSERT_OK(dbi->TEST_FlushMemTable());
+    ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+
+    AddKeys(50000, 10000);
+    ASSERT_OK(dbi->TEST_FlushMemTable());
+    ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+  }
+
+  void CheckFileTypeCounts(std::string& dir,
+                            int required_log,
+                            int required_sst,
+                            int required_manifest) {
+    std::vector<std::string> filenames;
+    env_->GetChildren(dir, &filenames);
+
+    int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+    for (auto file : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(file, &number, &type)) {
+        log_cnt += (type == kLogFile);
+        sst_cnt += (type == kTableFile);
+        manifest_cnt += (type == kDescriptorFile);
+      }
+    }
+    ASSERT_EQ(required_log, log_cnt);
+    ASSERT_EQ(required_sst, sst_cnt);
+    ASSERT_EQ(required_manifest, manifest_cnt);
+  }
+
+};
+
+TEST(DeleteFileTest, AddKeysAndQueryLevels) {
+  CreateTwoLevels();
+  std::vector<LiveFileMetaData> metadata;
+  std::vector<int> keysinlevel;
+  db_->GetLiveFilesMetaData(&metadata);
+
+  std::string level1file = "";
+  int level1keycount = 0;
+  std::string level2file = "";
+  int level2keycount = 0;
+  int level1index = 0;
+  int level2index = 1;
+
+  ASSERT_EQ((int)metadata.size(), 2);
+  if (metadata[0].level == 2) {
+    level1index = 1;
+    level2index = 0;
+  }
+
+  level1file = metadata[level1index].name;
+  int startkey = atoi(metadata[level1index].smallestkey.c_str());
+  int endkey = atoi(metadata[level1index].largestkey.c_str());
+  level1keycount = (endkey - startkey + 1);
+  level2file = metadata[level2index].name;
+  startkey = atoi(metadata[level2index].smallestkey.c_str());
+  endkey = atoi(metadata[level2index].largestkey.c_str());
+  level2keycount = (endkey - startkey + 1);
+
+  // COntrolled setup. Levels 1 and 2 should both have 50K files.
+  // This is a little fragile as it depends on the current
+  // compaction heuristics.
+  ASSERT_EQ(level1keycount, 50000);
+  ASSERT_EQ(level2keycount, 50000);
+
+  Status status = db_->DeleteFile("0.sst");
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  // intermediate level files cannot be deleted.
+  status = db_->DeleteFile(level1file);
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  // Lowest level file deletion should succeed.
+  ASSERT_OK(db_->DeleteFile(level2file));
+
+  CloseDB();
+}
+
+TEST(DeleteFileTest, PurgeObsoleteFilesTest) {
+  CreateTwoLevels();
+  // there should be only one (empty) log file because CreateTwoLevels()
+  // flushes the memtables to disk
+  CheckFileTypeCounts(options_.wal_dir, 1, 0, 0);
+  // 2 ssts, 1 manifest
+  CheckFileTypeCounts(dbname_, 0, 2, 1);
+  std::string first("0"), last("999999");
+  Slice first_slice(first), last_slice(last);
+  db_->CompactRange(&first_slice, &last_slice, true, 2);
+  // 1 sst after compaction
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+  // this time, we keep an iterator alive
+  ReopenDB(true);
+  Iterator *itr = 0;
+  CreateTwoLevels();
+  itr = db_->NewIterator(ReadOptions());
+  db_->CompactRange(&first_slice, &last_slice, true, 2);
+  // 3 sst after compaction with live iterator
+  CheckFileTypeCounts(dbname_, 0, 3, 1);
+  delete itr;
+  // 1 sst after iterator deletion
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+  CloseDB();
+}
+
// Deleting the level-2 file while an iterator is open must succeed, and
// the iterator must still see all 50K keys (served by the remaining file).
TEST(DeleteFileTest, DeleteFileWithIterator) {
  CreateTwoLevels();
  ReadOptions options;
  Iterator* it = db_->NewIterator(options);
  std::vector<LiveFileMetaData> metadata;
  db_->GetLiveFilesMetaData(&metadata);

  std::string level2file = "";

  ASSERT_EQ((int)metadata.size(), 2);
  // Pick whichever entry is NOT the level-1 file.
  if (metadata[0].level == 1) {
    level2file = metadata[1].name;
  } else {
    level2file = metadata[0].name;
  }

  Status status = db_->DeleteFile(level2file);
  fprintf(stdout, "Deletion status %s: %s\n",
          level2file.c_str(), status.ToString().c_str());
  ASSERT_TRUE(status.ok());
  it->SeekToFirst();
  int numKeysIterated = 0;
  while(it->Valid()) {
    numKeysIterated++;
    it->Next();
  }
  ASSERT_EQ(numKeysIterated, 50000);
  delete it;
  CloseDB();
}
+
// Live WAL files must be protected from DeleteFile; archived WAL files
// must be deletable.
TEST(DeleteFileTest, DeleteLogFiles) {
  AddKeys(10, 0);
  VectorLogPtr logfiles;
  db_->GetSortedWalFiles(logfiles);
  ASSERT_GT(logfiles.size(), 0UL);
  // Take the last log file which is expected to be alive and try to delete it
  // Should not succeed because live logs are not allowed to be deleted
  std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
  ASSERT_EQ(alive_log->Type(), kAliveLogFile);
  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
  fprintf(stdout, "Deleting alive log file %s\n",
          alive_log->PathName().c_str());
  ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok());
  // The live log must survive the failed deletion attempt.
  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
  logfiles.clear();

  // Call Flush to bring about a new working log file and add more keys
  // Call Flush again to flush out memtable and move alive log to archived log
  // and try to delete the archived log file
  FlushOptions fopts;
  db_->Flush(fopts);
  AddKeys(10, 0);
  db_->Flush(fopts);
  db_->GetSortedWalFiles(logfiles);
  ASSERT_GT(logfiles.size(), 0UL);
  std::unique_ptr<LogFile> archived_log = std::move(logfiles.front());
  ASSERT_EQ(archived_log->Type(), kArchivedLogFile);
  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" +
        archived_log->PathName()));
  fprintf(stdout, "Deleting archived log file %s\n",
          archived_log->PathName().c_str());
  ASSERT_OK(db_->DeleteFile(archived_log->PathName()));
  // Archived logs may be deleted, so the file must now be gone.
  ASSERT_TRUE(!env_->FileExists(options_.wal_dir + "/" +
        archived_log->PathName()));
  CloseDB();
}
+
+} //namespace rocksdb
+
// Entry point: run every test registered via the TEST macro.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
+
diff --git a/db/filename.cc b/db/filename.cc
new file mode 100644 (file)
index 0000000..cdbd1bc
--- /dev/null
@@ -0,0 +1,266 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/filename.h"
+
+#include <ctype.h>
+#include <stdio.h>
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
// Given a path, flatten the path name by replacing all chars not in
// {[0-9,a-z,A-Z,-,_,.]} with _. A disallowed character at position 0 is
// dropped rather than replaced, so flattened names never begin with '_'.
// Appends '\0' at the end, truncating as needed to fit len bytes.
// Returns the number of chars stored in dest not including the trailing
// '\0'; returns 0 without touching dest when len <= 0 (the original
// unconditionally wrote the terminator, which was out of bounds for
// len == 0).
static int FlattenPath(const std::string& path, char* dest, int len) {
  if (len <= 0) {
    return 0;  // no room even for the terminating '\0'
  }
  int write_idx = 0;
  int i = 0;
  int src_len = path.size();

  while (i < src_len && write_idx < len - 1) {
    if ((path[i] >= 'a' && path[i] <= 'z') ||
        (path[i] >= '0' && path[i] <= '9') ||
        (path[i] >= 'A' && path[i] <= 'Z') ||
        path[i] == '-' ||
        path[i] == '.' ||
        path[i] == '_'){
      dest[write_idx++] = path[i];
    } else {
      if (i > 0)
        dest[write_idx++] = '_';
    }
    i++;
  }

  dest[write_idx] = '\0';
  return write_idx;
}
+
+// A utility routine: write "data" to the named file and Sync() it.
+extern Status WriteStringToFileSync(Env* env, const Slice& data,
+                                    const std::string& fname);
+
// Build "<name>/<number zero-padded to 6 digits>.<suffix>".
static std::string MakeFileName(const std::string& name, uint64_t number,
                                const char* suffix) {
  char buf[100];
  snprintf(buf, sizeof(buf), "/%06llu.%s",
           static_cast<unsigned long long>(number), suffix);
  std::string result = name;
  result += buf;
  return result;
}
+
// Name of WAL file <number> under "name": "<name>/<number>.log".
std::string LogFileName(const std::string& name, uint64_t number) {
  assert(number > 0);
  return MakeFileName(name, number, "log");
}
+
// Directory that holds archived WAL files: "<dir>/archive".
std::string ArchivalDirectory(const std::string& dir) {
  return dir + "/" + ARCHIVAL_DIR;
}
// Name of archived WAL file <number>: "<name>/archive/<number>.log".
std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
  assert(number > 0);
  return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log");
}
+
// Name of sstable <number> under "name": "<name>/<number>.sst".
std::string TableFileName(const std::string& name, uint64_t number) {
  assert(number > 0);
  return MakeFileName(name, number, "sst");
}
+
// Name of the manifest file: "<dbname>/MANIFEST-<number zero-padded to 6>".
std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
  assert(number > 0);
  char buf[100];
  snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
           static_cast<unsigned long long>(number));
  std::string result(dbname);
  result.append(buf);
  return result;
}
+
// Name of the CURRENT file, which records the active manifest.
std::string CurrentFileName(const std::string& dbname) {
  std::string result(dbname);
  result.append("/CURRENT");
  return result;
}

// Name of the database lock file.
std::string LockFileName(const std::string& dbname) {
  std::string result(dbname);
  result.append("/LOCK");
  return result;
}
+
+std::string TempFileName(const std::string& dbname, uint64_t number) {
+  assert(number >= 0);
+  return MakeFileName(dbname, number, "dbtmp");
+}
+
// Name of the info-log file. With an empty log_dir it lives beside the
// db ("<dbname>/LOG"); otherwise it is "<log_dir>/<flattened db_path>_LOG".
std::string InfoLogFileName(const std::string& dbname,
    const std::string& db_path, const std::string& log_dir) {
  if (log_dir.empty())
    return dbname + "/LOG";

  char flatten_db_path[256];
  FlattenPath(db_path, flatten_db_path, 256);
  return log_dir + "/" + flatten_db_path + "_LOG";
}
+
// Return the name of the old info log file for "dbname", suffixed with
// timestamp ts: "<dbname>/LOG.old.<ts>", or, when log_dir is set,
// "<log_dir>/<flattened db_path>_LOG.old.<ts>".
std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
    const std::string& db_path, const std::string& log_dir) {
  char buf[50];
  snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts));

  if (log_dir.empty())
    return dbname + "/LOG.old." + buf;

  char flatten_db_path[256];
  FlattenPath(db_path, flatten_db_path, 256);
  return log_dir + "/" + flatten_db_path + "_LOG.old." + buf;
}
+
// Name of meta-database <number>: "<dbname>/METADB-<number>" (no padding).
std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
  char buf[100];
  snprintf(buf, sizeof(buf), "/METADB-%llu",
           static_cast<unsigned long long>(number));
  std::string result(dbname);
  result += buf;
  return result;
}
+
// Name of the IDENTITY file, which stores the db's unique id.
std::string IdentityFileName(const std::string& dbname) {
  std::string result(dbname);
  result += "/IDENTITY";
  return result;
}
+
// Owned filenames have the form:
//    dbname/IDENTITY
//    dbname/CURRENT
//    dbname/LOCK
//    dbname/LOG
//    dbname/LOG.old.[0-9]+
//    dbname/MANIFEST-[0-9]+
//    dbname/[0-9]+.(log|sst)
//    dbname/METADB-[0-9]+
//    Disregards / at the beginning
// On success sets *number (0 for non-numbered files), *type, and -- only
// for .log files, and only when log_type is non-null -- *log_type.
bool ParseFileName(const std::string& fname,
                   uint64_t* number,
                   FileType* type,
                   WalFileType* log_type) {
  Slice rest(fname);
  if (fname.length() > 1 && fname[0] == '/') {
    rest.remove_prefix(1);
  }
  if (rest == "IDENTITY") {
    *number = 0;
    *type = kIdentityFile;
  } else if (rest == "CURRENT") {
    *number = 0;
    *type = kCurrentFile;
  } else if (rest == "LOCK") {
    *number = 0;
    *type = kDBLockFile;
  } else if (rest == "LOG" || rest == "LOG.old") {
    *number = 0;
    *type = kInfoLogFile;
  } else if (rest.starts_with("LOG.old.")) {
    uint64_t ts_suffix;
    // sizeof also counts the trailing '\0'.
    rest.remove_prefix(sizeof("LOG.old.") - 1);
    if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
      return false;
    }
    *number = ts_suffix;
    *type = kInfoLogFile;
  } else if (rest.starts_with("MANIFEST-")) {
    rest.remove_prefix(strlen("MANIFEST-"));
    uint64_t num;
    if (!ConsumeDecimalNumber(&rest, &num)) {
      return false;
    }
    if (!rest.empty()) {
      return false;
    }
    *type = kDescriptorFile;
    *number = num;
  } else if (rest.starts_with("METADB-")) {
    rest.remove_prefix(strlen("METADB-"));
    uint64_t num;
    if (!ConsumeDecimalNumber(&rest, &num)) {
      return false;
    }
    if (!rest.empty()) {
      return false;
    }
    *type = kMetaDatabase;
    *number = num;
  } else {
    // Remaining patterns are numeric: [0-9]+.(log|sst|dbtmp),
    // optionally under the archive/ subdirectory (log only).
    // Avoid strtoull() to keep filename format independent of the
    // current locale
    bool archive_dir_found = false;
    if (rest.starts_with(ARCHIVAL_DIR)) {
      if (rest.size() <= ARCHIVAL_DIR.size()) {
        return false;
      }
      rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also
      if (log_type) {
        *log_type = kArchivedLogFile;
      }
      archive_dir_found = true;
    }
    uint64_t num;
    if (!ConsumeDecimalNumber(&rest, &num)) {
      return false;
    }
    Slice suffix = rest;
    if (suffix == Slice(".log")) {
      *type = kLogFile;
      if (log_type && !archive_dir_found) {
        *log_type = kAliveLogFile;
      }
    } else if (archive_dir_found) {
      return false; // Archive dir can contain only log files
    } else if (suffix == Slice(".sst")) {
      *type = kTableFile;
    } else if (suffix == Slice(".dbtmp")) {
      *type = kTempFile;
    } else {
      return false;
    }
    *number = num;
  }
  return true;
}
+
// Atomically point CURRENT at MANIFEST-<descriptor_number>: write the
// manifest name (relative to dbname, newline-terminated) to a temp file,
// sync it, then rename over CURRENT. The temp file is removed on failure.
Status SetCurrentFile(Env* env, const std::string& dbname,
                      uint64_t descriptor_number) {
  // Remove leading "dbname/" and add newline to manifest file name
  std::string manifest = DescriptorFileName(dbname, descriptor_number);
  Slice contents = manifest;
  assert(contents.starts_with(dbname + "/"));
  contents.remove_prefix(dbname.size() + 1);
  std::string tmp = TempFileName(dbname, descriptor_number);
  Status s = WriteStringToFileSync(env, contents.ToString() + "\n", tmp);
  if (s.ok()) {
    // Rename is the atomic commit point.
    s = env->RenameFile(tmp, CurrentFileName(dbname));
  }
  if (!s.ok()) {
    env->DeleteFile(tmp);
  }
  return s;
}
+
// Write a freshly generated unique id to dbname/IDENTITY via a synced
// temp file plus rename; the temp file is removed on failure.
Status SetIdentityFile(Env* env, const std::string& dbname) {
  std::string id = env->GenerateUniqueId();
  assert(!id.empty());
  // Reserve the filename dbname/000000.dbtmp for the temporary identity file
  std::string tmp = TempFileName(dbname, 0);
  Status s = WriteStringToFileSync(env, id, tmp);
  if (s.ok()) {
    // Rename is the atomic commit point.
    s = env->RenameFile(tmp, IdentityFileName(dbname));
  }
  if (!s.ok()) {
    env->DeleteFile(tmp);
  }
  return s;
}
+
+}  // namespace rocksdb
diff --git a/db/filename.h b/db/filename.h
new file mode 100644 (file)
index 0000000..8e55f11
--- /dev/null
@@ -0,0 +1,108 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// File names used by DB code
+
+#pragma once
+#include <stdint.h>
+#include <string>
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/transaction_log.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+class Env;
+
+// The kind of file that makes up a rocksdb database, as classified from its
+// name by ParseFileName() below.
+enum FileType {
+  kLogFile,
+  kDBLockFile,
+  kTableFile,
+  kDescriptorFile,
+  kCurrentFile,
+  kTempFile,
+  kInfoLogFile,  // Either the current one, or an old one
+  kMetaDatabase,
+  kIdentityFile
+};
+
+// Return the name of the log file with the specified number
+// in the db named by "dbname".  The result will be prefixed with
+// "dbname".
+extern std::string LogFileName(const std::string& dbname, uint64_t number);
+
+// Name of the subdirectory (under the db directory) that holds archived
+// write-ahead log files.
+// NOTE(review): a non-inline std::string defined in a header yields one
+// copy per translation unit; a constexpr char array would avoid that.
+static const std::string ARCHIVAL_DIR = "archive";
+
+extern std::string ArchivalDirectory(const std::string& dbname);
+
+//  Return the name of the archived log file with the specified number
+//  in the db named by "dbname". The result will be prefixed with "dbname".
+extern std::string ArchivedLogFileName(const std::string& dbname,
+                                       uint64_t num);
+
+// Return the name of the sstable with the specified number
+// in the db named by "dbname".  The result will be prefixed with
+// "dbname".
+extern std::string TableFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the descriptor file for the db named by
+// "dbname" and the specified incarnation number.  The result will be
+// prefixed with "dbname".
+extern std::string DescriptorFileName(const std::string& dbname,
+                                      uint64_t number);
+
+// Return the name of the current file.  This file contains the name
+// of the current manifest file.  The result will be prefixed with
+// "dbname".
+extern std::string CurrentFileName(const std::string& dbname);
+
+// Return the name of the lock file for the db named by
+// "dbname".  The result will be prefixed with "dbname".
+extern std::string LockFileName(const std::string& dbname);
+
+// Return the name of a temporary file owned by the db named "dbname".
+// The result will be prefixed with "dbname".
+extern std::string TempFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the info log file for "dbname".
+extern std::string InfoLogFileName(const std::string& dbname,
+    const std::string& db_path="", const std::string& log_dir="");
+
+// Return the name of the old info log file for "dbname".
+extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+    const std::string& db_path="", const std::string& log_dir="");
+
+// Return the name to use for a metadatabase. The result will be prefixed with
+// "dbname".
+extern std::string MetaDatabaseName(const std::string& dbname,
+                                    uint64_t number);
+
+// Return the name of the Identity file which stores a unique number for the db
+// that will get regenerated if the db loses all its data and is recreated fresh
+// either from a backup-image or empty
+extern std::string IdentityFileName(const std::string& dbname);
+
+// If filename is a rocksdb file, store the type of the file in *type.
+// The number encoded in the filename is stored in *number.  If the
+// filename was successfully parsed, returns true.  Else return false.
+// If log_type is non-null and the file is a WAL, *log_type records whether
+// the file lives in the archive directory or is a live log.
+extern bool ParseFileName(const std::string& filename,
+                          uint64_t* number,
+                          FileType* type,
+                          WalFileType* log_type = nullptr);
+
+// Make the CURRENT file point to the descriptor file with the
+// specified number.
+extern Status SetCurrentFile(Env* env, const std::string& dbname,
+                             uint64_t descriptor_number);
+
+// Make the IDENTITY file for the db
+extern Status SetIdentityFile(Env* env, const std::string& dbname);
+
+}  // namespace rocksdb
diff --git a/db/filename_test.cc b/db/filename_test.cc
new file mode 100644 (file)
index 0000000..0baa7fd
--- /dev/null
@@ -0,0 +1,140 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/filename.h"
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class FileNameTest { };
+
+// Checks ParseFileName() against valid names (which must yield the expected
+// type and number) and malformed names (which must all be rejected).
+TEST(FileNameTest, Parse) {
+  Slice db;  // NOTE(review): unused local, kept as-is.
+  FileType type;
+  uint64_t number;
+
+  // Successful parses
+  static struct {
+    const char* fname;
+    uint64_t number;
+    FileType type;
+  } cases[] = {
+    { "100.log",            100,   kLogFile },
+    { "0.log",              0,     kLogFile },
+    { "0.sst",              0,     kTableFile },
+    { "CURRENT",            0,     kCurrentFile },
+    { "LOCK",               0,     kDBLockFile },
+    { "MANIFEST-2",         2,     kDescriptorFile },
+    { "MANIFEST-7",         7,     kDescriptorFile },
+    { "METADB-2",           2,     kMetaDatabase },
+    { "METADB-7",           7,     kMetaDatabase },
+    { "LOG",                0,     kInfoLogFile },
+    { "LOG.old",            0,     kInfoLogFile },
+    { "18446744073709551615.log", 18446744073709551615ull, kLogFile },
+  };
+  for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+    std::string f = cases[i].fname;
+    ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
+    ASSERT_EQ(cases[i].type, type) << f;
+    ASSERT_EQ(cases[i].number, number) << f;
+  }
+
+  // Errors
+  // NOTE(review): "" appears twice below; the duplicate adds no coverage.
+  static const char* errors[] = {
+    "",
+    "foo",
+    "foo-dx-100.log",
+    ".log",
+    "",
+    "manifest",
+    "CURREN",
+    "CURRENTX",
+    "MANIFES",
+    "MANIFEST",
+    "MANIFEST-",
+    "XMANIFEST-3",
+    "MANIFEST-3x",
+    "META",
+    "METADB",
+    "METADB-",
+    "XMETADB-3",
+    "METADB-3x",
+    "LOC",
+    "LOCKx",
+    "LO",
+    "LOGx",
+    "18446744073709551616.log",
+    "184467440737095516150.log",
+    "100",
+    "100.",
+    "100.lop"
+  };
+  for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
+    std::string f = errors[i];
+    ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
+  };  // NOTE(review): stray ';' — a harmless empty statement.
+}
+
+TEST(FileNameTest, Construction) {
+  uint64_t number;
+  FileType type;
+  std::string fname;
+
+  fname = CurrentFileName("foo");
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(0U, number);
+  ASSERT_EQ(kCurrentFile, type);
+
+  fname = LockFileName("foo");
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(0U, number);
+  ASSERT_EQ(kDBLockFile, type);
+
+  fname = LogFileName("foo", 192);
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(192U, number);
+  ASSERT_EQ(kLogFile, type);
+
+  fname = TableFileName("bar", 200);
+  ASSERT_EQ("bar/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(200U, number);
+  ASSERT_EQ(kTableFile, type);
+
+  fname = DescriptorFileName("bar", 100);
+  ASSERT_EQ("bar/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(100U, number);
+  ASSERT_EQ(kDescriptorFile, type);
+
+  fname = TempFileName("tmp", 999);
+  ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(999U, number);
+  ASSERT_EQ(kTempFile, type);
+
+  fname = MetaDatabaseName("met", 100);
+  ASSERT_EQ("met/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(100U, number);
+  ASSERT_EQ(kMetaDatabase, type);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/log_format.h b/db/log_format.h
new file mode 100644 (file)
index 0000000..10a31ba
--- /dev/null
@@ -0,0 +1,36 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#pragma once
+namespace rocksdb {
+namespace log {
+
+enum RecordType {
+  // Zero is reserved for preallocated files
+  kZeroType = 0,
+
+  // A record that fits entirely within one block
+  kFullType = 1,
+
+  // For fragments
+  kFirstType = 2,
+  kMiddleType = 3,
+  kLastType = 4
+};
+static const int kMaxRecordType = kLastType;
+
+// Log files are written and read in fixed-size blocks of this many bytes.
+static const unsigned int kBlockSize = 32768;
+
+// Header is checksum (4 bytes), length (2 bytes), type (1 byte).
+// (This matches how log_reader.cc parses it: bytes 4-5 are the little-endian
+// length and byte 6 is the record type.)
+static const int kHeaderSize = 4 + 1 + 2;
+
+}  // namespace log
+}  // namespace rocksdb
diff --git a/db/log_reader.cc b/db/log_reader.cc
new file mode 100644 (file)
index 0000000..6596cd8
--- /dev/null
@@ -0,0 +1,264 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+
+#include <stdio.h>
+#include "rocksdb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace rocksdb {
+namespace log {
+
+Reader::Reporter::~Reporter() {
+}
+
+// Takes ownership of "file".  backing_store_ holds one block's worth of
+// bytes that buffer_ points into; it is released in the destructor.
+// "reporter" and the checksum flag are borrowed configuration; reading
+// starts at the first record at or after "initial_offset".
+Reader::Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
+               bool checksum, uint64_t initial_offset)
+    : file_(std::move(file)),
+      reporter_(reporter),
+      checksum_(checksum),
+      backing_store_(new char[kBlockSize]),
+      buffer_(),
+      eof_(false),
+      last_record_offset_(0),
+      end_of_buffer_offset_(0),
+      initial_offset_(initial_offset) {
+}
+
+Reader::~Reader() {
+  delete[] backing_store_;
+}
+
+// Position the file at the start of the first block that could contain a
+// record at or after initial_offset_.  Returns false (after reporting the
+// skipped bytes as a drop) if the underlying Skip() fails.
+bool Reader::SkipToInitialBlock() {
+  size_t offset_in_block = initial_offset_ % kBlockSize;
+  uint64_t block_start_location = initial_offset_ - offset_in_block;
+
+  // Don't search a block if we'd be in the trailer
+  if (offset_in_block > kBlockSize - 6) {
+    offset_in_block = 0;  // NOTE(review): not read again after this point.
+    block_start_location += kBlockSize;
+  }
+
+  end_of_buffer_offset_ = block_start_location;
+
+  // Skip to start of first block that can contain the initial record
+  if (block_start_location > 0) {
+    Status skip_status = file_->Skip(block_start_location);
+    if (!skip_status.ok()) {
+      ReportDrop(block_start_location, skip_status);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Read the next logical record, reassembling it from physical fragments
+// (a single kFullType record, or a kFirstType..kMiddleType..kLastType
+// sequence) when it spans blocks.  On success *record points either into
+// the read buffer (full records) or into *scratch (fragmented records),
+// so it is only valid until the next read or mutation of *scratch.
+bool Reader::ReadRecord(Slice* record, std::string* scratch) {
+  if (last_record_offset_ < initial_offset_) {
+    if (!SkipToInitialBlock()) {
+      return false;
+    }
+  }
+
+  scratch->clear();
+  record->clear();
+  bool in_fragmented_record = false;
+  // Record offset of the logical record that we're reading
+  // 0 is a dummy value to make compilers happy
+  uint64_t prospective_record_offset = 0;
+
+  Slice fragment;
+  while (true) {
+    // Offset of the physical record just returned: end of the buffer minus
+    // what is still unconsumed in it.
+    uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+    const unsigned int record_type = ReadPhysicalRecord(&fragment);
+    switch (record_type) {
+      case kFullType:
+        if (in_fragmented_record) {
+          // Handle bug in earlier versions of log::Writer where
+          // it could emit an empty kFirstType record at the tail end
+          // of a block followed by a kFullType or kFirstType record
+          // at the beginning of the next block.
+          if (scratch->empty()) {
+            in_fragmented_record = false;
+          } else {
+            ReportCorruption(scratch->size(), "partial record without end(1)");
+          }
+        }
+        prospective_record_offset = physical_record_offset;
+        scratch->clear();
+        *record = fragment;
+        last_record_offset_ = prospective_record_offset;
+        return true;
+
+      case kFirstType:
+        if (in_fragmented_record) {
+          // Handle bug in earlier versions of log::Writer where
+          // it could emit an empty kFirstType record at the tail end
+          // of a block followed by a kFullType or kFirstType record
+          // at the beginning of the next block.
+          if (scratch->empty()) {
+            in_fragmented_record = false;
+          } else {
+            ReportCorruption(scratch->size(), "partial record without end(2)");
+          }
+        }
+        prospective_record_offset = physical_record_offset;
+        scratch->assign(fragment.data(), fragment.size());
+        in_fragmented_record = true;
+        break;
+
+      case kMiddleType:
+        if (!in_fragmented_record) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(1)");
+        } else {
+          scratch->append(fragment.data(), fragment.size());
+        }
+        break;
+
+      case kLastType:
+        if (!in_fragmented_record) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(2)");
+        } else {
+          scratch->append(fragment.data(), fragment.size());
+          *record = Slice(*scratch);
+          last_record_offset_ = prospective_record_offset;
+          return true;
+        }
+        break;
+
+      case kEof:
+        // End of file in the middle of a fragmented record is corruption.
+        if (in_fragmented_record) {
+          ReportCorruption(scratch->size(), "partial record without end(3)");
+          scratch->clear();
+        }
+        return false;
+
+      case kBadRecord:
+        if (in_fragmented_record) {
+          ReportCorruption(scratch->size(), "error in middle of record");
+          in_fragmented_record = false;
+          scratch->clear();
+        }
+        break;
+
+      default: {
+        char buf[40];
+        snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
+        ReportCorruption(
+            (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
+            buf);
+        in_fragmented_record = false;
+        scratch->clear();
+        break;
+      }
+    }
+  }
+  return false;  // unreachable; the loop above always returns
+}
+
+// Physical offset of the last record returned by ReadRecord().
+uint64_t Reader::LastRecordOffset() {
+  return last_record_offset_;
+}
+
+void Reader::ReportCorruption(size_t bytes, const char* reason) {
+  ReportDrop(bytes, Status::Corruption(reason));
+}
+
+// Notify the reporter that "bytes" bytes were dropped, but only when the
+// dropped region lies at or past initial_offset_ — drops before the
+// requested start position are expected and not reported.
+void Reader::ReportDrop(size_t bytes, const Status& reason) {
+  if (reporter_ != nullptr &&
+      end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
+    reporter_->Corruption(bytes, reason);
+  }
+}
+
+// Read one physical record from the file, refilling the block-sized buffer
+// as needed.  Returns the record's type, or kEof / kBadRecord.
+// Header layout as parsed here: bytes 0-3 CRC, bytes 4-5 little-endian
+// payload length, byte 6 record type.
+unsigned int Reader::ReadPhysicalRecord(Slice* result) {
+  while (true) {
+    if (buffer_.size() < (size_t)kHeaderSize) {
+      if (!eof_) {
+        // Last read was a full read, so this is a trailer to skip
+        buffer_.clear();
+        Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
+        end_of_buffer_offset_ += buffer_.size();
+        if (!status.ok()) {
+          buffer_.clear();
+          ReportDrop(kBlockSize, status);
+          eof_ = true;
+          return kEof;
+        } else if (buffer_.size() < (size_t)kBlockSize) {
+          // Short read: remember EOF, but still process what we got.
+          eof_ = true;
+        }
+        continue;
+      } else if (buffer_.size() == 0) {
+        // End of file
+        return kEof;
+      } else {
+        // Fewer than kHeaderSize bytes left at EOF: truncated record.
+        size_t drop_size = buffer_.size();
+        buffer_.clear();
+        ReportCorruption(drop_size, "truncated record at end of file");
+        return kEof;
+      }
+    }
+
+    // Parse the header
+    const char* header = buffer_.data();
+    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+    const unsigned int type = header[6];
+    const uint32_t length = a | (b << 8);
+    if (kHeaderSize + length > buffer_.size()) {
+      size_t drop_size = buffer_.size();
+      buffer_.clear();
+      ReportCorruption(drop_size, "bad record length");
+      return kBadRecord;
+    }
+
+    if (type == kZeroType && length == 0) {
+      // Skip zero length record without reporting any drops since
+      // such records are produced by the mmap based writing code in
+      // env_posix.cc that preallocates file regions.
+      buffer_.clear();
+      return kBadRecord;
+    }
+
+    // Check crc
+    if (checksum_) {
+      // The stored CRC covers the type byte plus the payload.
+      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+      uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
+      if (actual_crc != expected_crc) {
+        // Drop the rest of the buffer since "length" itself may have
+        // been corrupted and if we trust it, we could find some
+        // fragment of a real log record that just happens to look
+        // like a valid log record.
+        size_t drop_size = buffer_.size();
+        buffer_.clear();
+        ReportCorruption(drop_size, "checksum mismatch");
+        return kBadRecord;
+      }
+    }
+
+    buffer_.remove_prefix(kHeaderSize + length);
+
+    // Skip physical record that started before initial_offset_
+    if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
+        initial_offset_) {
+      result->clear();
+      return kBadRecord;
+    }
+
+    *result = Slice(header + kHeaderSize, length);
+    return type;
+  }
+}
+
+}  // namespace log
+}  // namespace rocksdb
diff --git a/db/log_reader.h b/db/log_reader.h
new file mode 100644 (file)
index 0000000..8e277c8
--- /dev/null
@@ -0,0 +1,124 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+
+#include "db/log_format.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class SequentialFile;
+using std::unique_ptr;
+
+namespace log {
+
+class Reader {
+ public:
+  // Interface for reporting errors.
+  class Reporter {
+   public:
+    virtual ~Reporter();
+
+    // Some corruption was detected.  "size" is the approximate number
+    // of bytes dropped due to the corruption.
+    virtual void Corruption(size_t bytes, const Status& status) = 0;
+  };
+
+  // Create a reader that will return log records from "*file".
+  // "*file" must remain live while this Reader is in use.
+  //
+  // If "reporter" is non-nullptr, it is notified whenever some data is
+  // dropped due to a detected corruption.  "*reporter" must remain
+  // live while this Reader is in use.
+  //
+  // If "checksum" is true, verify checksums if available.
+  //
+  // The Reader will start reading at the first record located at physical
+  // position >= initial_offset within the file.
+  Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
+         bool checksum, uint64_t initial_offset);
+
+  ~Reader();
+
+  // Read the next record into *record.  Returns true if read
+  // successfully, false if we hit end of the input.  May use
+  // "*scratch" as temporary storage.  The contents filled in *record
+  // will only be valid until the next mutating operation on this
+  // reader or the next mutation to *scratch.
+  bool ReadRecord(Slice* record, std::string* scratch);
+
+  // Returns the physical offset of the last record returned by ReadRecord.
+  //
+  // Undefined before the first call to ReadRecord.
+  uint64_t LastRecordOffset();
+
+  // returns true if the reader has encountered an eof condition.
+  bool IsEOF() {
+    return eof_;
+  }
+
+  // when we know more data has been written to the file. we can use this
+  // function to force the reader to look again in the file.
+  void UnmarkEOF() {
+    eof_ = false;
+  }
+
+  SequentialFile* file() { return file_.get(); }
+
+ private:
+  const unique_ptr<SequentialFile> file_;
+  Reporter* const reporter_;
+  bool const checksum_;
+  char* const backing_store_;
+  Slice buffer_;
+  bool eof_;   // Last Read() indicated EOF by returning < kBlockSize
+
+  // Offset of the last record returned by ReadRecord.
+  uint64_t last_record_offset_;
+  // Offset of the first location past the end of buffer_.
+  uint64_t end_of_buffer_offset_;
+
+  // Offset at which to start looking for the first record to return
+  uint64_t const initial_offset_;
+
+  // Extend record types with the following special values
+  enum {
+    kEof = kMaxRecordType + 1,
+    // Returned whenever we find an invalid physical record.
+    // Currently there are three situations in which this happens:
+    // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
+    // * The record is a 0-length record (No drop is reported)
+    // * The record is below constructor's initial_offset (No drop is reported)
+    kBadRecord = kMaxRecordType + 2
+  };
+
+  // Skips all blocks that are completely before "initial_offset_".
+  //
+  // Returns true on success. Handles reporting.
+  bool SkipToInitialBlock();
+
+  // Return type, or one of the preceding special values
+  unsigned int ReadPhysicalRecord(Slice* result);
+
+  // Reports dropped bytes to the reporter.
+  // buffer_ must be updated to remove the dropped bytes prior to invocation.
+  void ReportCorruption(size_t bytes, const char* reason);
+  void ReportDrop(size_t bytes, const Status& reason);
+
+  // No copying allowed
+  Reader(const Reader&);
+  void operator=(const Reader&);
+};
+
+}  // namespace log
+}  // namespace rocksdb
diff --git a/db/log_test.cc b/db/log_test.cc
new file mode 100644 (file)
index 0000000..dedbff0
--- /dev/null
@@ -0,0 +1,528 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "rocksdb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+namespace log {
+
+// Construct a string of the specified length made out of the supplied
+// partial string.
+static std::string BigString(const std::string& partial_string, size_t n) {
+  std::string result;
+  while (result.size() < n) {
+    result.append(partial_string);
+  }
+  result.resize(n);
+  return result;
+}
+
+// Construct a string from a number
+static std::string NumberString(int n) {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "%d.", n);
+  return std::string(buf);
+}
+
+// Return a skewed potentially long string
+static std::string RandomSkewedString(int i, Random* rnd) {
+  return BigString(NumberString(i), rnd->Skewed(17));
+}
+
+class LogTest {
+ private:
+  class StringDest : public WritableFile {
+   public:
+    std::string contents_;
+
+    virtual Status Close() { return Status::OK(); }
+    virtual Status Flush() { return Status::OK(); }
+    virtual Status Sync() { return Status::OK(); }
+    virtual Status Append(const Slice& slice) {
+      contents_.append(slice.data(), slice.size());
+      return Status::OK();
+    }
+  };
+
+  class StringSource : public SequentialFile {
+   public:
+    Slice contents_;
+    bool force_error_;
+    bool returned_partial_;
+    StringSource() : force_error_(false), returned_partial_(false) { }
+
+    virtual Status Read(size_t n, Slice* result, char* scratch) {
+      ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+
+      if (force_error_) {
+        force_error_ = false;
+        returned_partial_ = true;
+        return Status::Corruption("read error");
+      }
+
+      if (contents_.size() < n) {
+        n = contents_.size();
+        returned_partial_ = true;
+      }
+      *result = Slice(contents_.data(), n);
+      contents_.remove_prefix(n);
+      return Status::OK();
+    }
+
+    virtual Status Skip(uint64_t n) {
+      if (n > contents_.size()) {
+        contents_.clear();
+        return Status::NotFound("in-memory file skipepd past end");
+      }
+
+      contents_.remove_prefix(n);
+
+      return Status::OK();
+    }
+  };
+
+  class ReportCollector : public Reader::Reporter {
+   public:
+    size_t dropped_bytes_;
+    std::string message_;
+
+    ReportCollector() : dropped_bytes_(0) { }
+    virtual void Corruption(size_t bytes, const Status& status) {
+      dropped_bytes_ += bytes;
+      message_.append(status.ToString());
+    }
+  };
+
+  std::string& dest_contents() {
+    auto dest = dynamic_cast<StringDest*>(writer_.file());
+    assert(dest);
+    return dest->contents_;
+  }
+
+  const std::string& dest_contents() const {
+    auto dest = dynamic_cast<const StringDest*>(writer_.file());
+    assert(dest);
+    return dest->contents_;
+  }
+
+  void reset_source_contents() {
+    auto src = dynamic_cast<StringSource*>(reader_.file());
+    assert(src);
+    src->contents_ = dest_contents();
+  }
+
+  unique_ptr<StringDest> dest_holder_;
+  unique_ptr<StringSource> source_holder_;
+  ReportCollector report_;
+  bool reading_;
+  Writer writer_;
+  Reader reader_;
+
+  // Record metadata for testing initial offset functionality
+  static size_t initial_offset_record_sizes_[];
+  static uint64_t initial_offset_last_record_offsets_[];
+
+ public:
+  LogTest() : dest_holder_(new StringDest),
+              source_holder_(new StringSource),
+              reading_(false),
+              writer_(std::move(dest_holder_)),
+              reader_(std::move(source_holder_), &report_, true/*checksum*/,
+                      0/*initial_offset*/) {
+  }
+
+  void Write(const std::string& msg) {
+    ASSERT_TRUE(!reading_) << "Write() after starting to read";
+    writer_.AddRecord(Slice(msg));
+  }
+
+  size_t WrittenBytes() const {
+    return dest_contents().size();
+  }
+
+  std::string Read() {
+    if (!reading_) {
+      reading_ = true;
+      reset_source_contents();
+    }
+    std::string scratch;
+    Slice record;
+    if (reader_.ReadRecord(&record, &scratch)) {
+      return record.ToString();
+    } else {
+      return "EOF";
+    }
+  }
+
+  void IncrementByte(int offset, int delta) {
+    dest_contents()[offset] += delta;
+  }
+
+  void SetByte(int offset, char new_byte) {
+    dest_contents()[offset] = new_byte;
+  }
+
+  void ShrinkSize(int bytes) {
+    dest_contents().resize(dest_contents().size() - bytes);
+  }
+
+  void FixChecksum(int header_offset, int len) {
+    // Compute crc of type/len/data
+    uint32_t crc = crc32c::Value(&dest_contents()[header_offset+6], 1 + len);
+    crc = crc32c::Mask(crc);
+    EncodeFixed32(&dest_contents()[header_offset], crc);
+  }
+
+  void ForceError() {
+    auto src = dynamic_cast<StringSource*>(reader_.file());
+    src->force_error_ = true;
+  }
+
+  size_t DroppedBytes() const {
+    return report_.dropped_bytes_;
+  }
+
+  std::string ReportMessage() const {
+    return report_.message_;
+  }
+
+  // Returns OK iff recorded error message contains "msg"
+  std::string MatchError(const std::string& msg) const {
+    if (report_.message_.find(msg) == std::string::npos) {
+      return report_.message_;
+    } else {
+      return "OK";
+    }
+  }
+
+  void WriteInitialOffsetLog() {
+    for (int i = 0; i < 4; i++) {
+      std::string record(initial_offset_record_sizes_[i],
+                         static_cast<char>('a' + i));
+      Write(record);
+    }
+  }
+
+  void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
+    WriteInitialOffsetLog();
+    reading_ = true;
+    unique_ptr<StringSource> source(new StringSource);
+    source->contents_ = dest_contents();
+    unique_ptr<Reader> offset_reader(
+      new Reader(std::move(source), &report_, true/*checksum*/,
+                 WrittenBytes() + offset_past_end));
+    Slice record;
+    std::string scratch;
+    ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch));
+  }
+
+  void CheckInitialOffsetRecord(uint64_t initial_offset,
+                                int expected_record_offset) {
+    WriteInitialOffsetLog();
+    reading_ = true;
+    unique_ptr<StringSource> source(new StringSource);
+    source->contents_ = dest_contents();
+    unique_ptr<Reader> offset_reader(
+      new Reader(std::move(source), &report_, true/*checksum*/,
+                 initial_offset));
+    Slice record;
+    std::string scratch;
+    ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
+    ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
+              record.size());
+    ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
+              offset_reader->LastRecordOffset());
+    ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
+  }
+
+};
+
+size_t LogTest::initial_offset_record_sizes_[] =
+    {10000,  // Two sizable records in first block
+     10000,
+     2 * log::kBlockSize - 1000,  // Span three blocks
+     1};
+
+uint64_t LogTest::initial_offset_last_record_offsets_[] =
+    {0,
+     kHeaderSize + 10000,
+     2 * (kHeaderSize + 10000),
+     2 * (kHeaderSize + 10000) +
+         (2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
+
+
+// --- Round-trip tests: whatever is written must be read back in order. ---
+
+// Reading an empty log yields EOF immediately.
+TEST(LogTest, Empty) {
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, ReadWrite) {
+  Write("foo");
+  Write("bar");
+  Write("");
+  Write("xxxx");
+  ASSERT_EQ("foo", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("xxxx", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ("EOF", Read());  // Make sure reads at eof work
+}
+
+// Many small records force the writer across many block boundaries.
+TEST(LogTest, ManyBlocks) {
+  for (int i = 0; i < 100000; i++) {
+    Write(NumberString(i));
+  }
+  for (int i = 0; i < 100000; i++) {
+    ASSERT_EQ(NumberString(i), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+// Records larger than a block must be fragmented and reassembled.
+TEST(LogTest, Fragmentation) {
+  Write("small");
+  Write(BigString("medium", 50000));
+  Write(BigString("large", 100000));
+  ASSERT_EQ("small", Read());
+  ASSERT_EQ(BigString("medium", 50000), Read());
+  ASSERT_EQ(BigString("large", 100000), Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer) {
+  // Make a trailer that is exactly the same length as an empty record.
+  const int n = kBlockSize - 2*kHeaderSize;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
+  Write("");
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer2) {
+  // Make a trailer that is exactly the same length as an empty record.
+  const int n = kBlockSize - 2*kHeaderSize;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(0U, DroppedBytes());
+  ASSERT_EQ("", ReportMessage());
+}
+
+// Leave fewer than kHeaderSize bytes free so the writer must pad the block.
+TEST(LogTest, ShortTrailer) {
+  const int n = kBlockSize - 2*kHeaderSize + 4;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
+  Write("");
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, AlignedEof) {
+  const int n = kBlockSize - 2*kHeaderSize + 4;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+// Deterministically-seeded random payloads round-trip intact.
+TEST(LogTest, RandomRead) {
+  const int N = 500;
+  Random write_rnd(301);
+  for (int i = 0; i < N; i++) {
+    Write(RandomSkewedString(i, &write_rnd));
+  }
+  Random read_rnd(301);
+  for (int i = 0; i < N; i++) {
+    ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+// Tests of all the error paths in log_reader.cc follow:
+
+TEST(LogTest, ReadError) {
+  Write("foo");
+  ForceError();
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST(LogTest, BadRecordType) {
+  Write("foo");
+  // Type is stored in header[6]
+  IncrementByte(6, 100);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("unknown record type"));
+}
+
+TEST(LogTest, TruncatedTrailingRecord) {
+  Write("foo");
+  ShrinkSize(4);   // Drop all payload as well as a header byte
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ((unsigned int)(kHeaderSize - 1), DroppedBytes());
+  ASSERT_EQ("OK", MatchError("truncated record at end of file"));
+}
+
+TEST(LogTest, BadLength) {
+  Write("foo");
+  ShrinkSize(1);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ((unsigned int)(kHeaderSize + 2), DroppedBytes());
+  ASSERT_EQ("OK", MatchError("bad record length"));
+}
+
+// Corrupt a checksum byte; the record must be dropped, not returned.
+TEST(LogTest, ChecksumMismatch) {
+  Write("foo");
+  IncrementByte(0, 10);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(10U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("checksum mismatch"));
+}
+
+// A middle fragment with no preceding first fragment must be dropped.
+TEST(LogTest, UnexpectedMiddleType) {
+  Write("foo");
+  SetByte(6, kMiddleType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+// A last fragment with no preceding first fragment must be dropped.
+TEST(LogTest, UnexpectedLastType) {
+  Write("foo");
+  SetByte(6, kLastType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+// A full record interrupting a fragmented one: the partial record is
+// reported dropped, the full record is still returned.
+TEST(LogTest, UnexpectedFullType) {
+  Write("foo");
+  Write("bar");
+  SetByte(6, kFirstType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, UnexpectedFirstType) {
+  Write("foo");
+  Write(BigString("bar", 100000));
+  SetByte(6, kFirstType);
+  FixChecksum(0, 3);
+  ASSERT_EQ(BigString("bar", 100000), Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, ErrorJoinsRecords) {
+  // Consider two fragmented records:
+  //    first(R1) last(R1) first(R2) last(R2)
+  // where the middle two fragments disappear.  We do not want
+  // first(R1),last(R2) to get joined and returned as a valid record.
+
+  // Write records that span two blocks
+  Write(BigString("foo", kBlockSize));
+  Write(BigString("bar", kBlockSize));
+  Write("correct");
+
+  // Wipe the middle block
+  for (unsigned int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
+    SetByte(offset, 'x');
+  }
+
+  ASSERT_EQ("correct", Read());
+  ASSERT_EQ("EOF", Read());
+  const unsigned int dropped = DroppedBytes();
+  ASSERT_LE(dropped, 2*kBlockSize + 100);
+  ASSERT_GE(dropped, 2*kBlockSize);
+}
+
+// --- Initial-offset tests.  CheckInitialOffsetRecord(offset, i) constructs
+// a Reader positioned at `offset` and expects the next record read to be
+// record number `i` from initial_offset_record_sizes_ above. ---
+
+TEST(LogTest, ReadStart) {
+  CheckInitialOffsetRecord(0, 0);
+}
+
+TEST(LogTest, ReadSecondOneOff) {
+  CheckInitialOffsetRecord(1, 1);
+}
+
+TEST(LogTest, ReadSecondTenThousand) {
+  CheckInitialOffsetRecord(10000, 1);
+}
+
+TEST(LogTest, ReadSecondStart) {
+  CheckInitialOffsetRecord(10007, 1);
+}
+
+TEST(LogTest, ReadThirdOneOff) {
+  CheckInitialOffsetRecord(10008, 2);
+}
+
+TEST(LogTest, ReadThirdStart) {
+  CheckInitialOffsetRecord(20014, 2);
+}
+
+TEST(LogTest, ReadFourthOneOff) {
+  CheckInitialOffsetRecord(20015, 3);
+}
+
+TEST(LogTest, ReadFourthFirstBlockTrailer) {
+  CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
+}
+
+TEST(LogTest, ReadFourthMiddleBlock) {
+  CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
+}
+
+TEST(LogTest, ReadFourthLastBlock) {
+  CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
+}
+
+TEST(LogTest, ReadFourthStart) {
+  // NOTE(review): the offsets table above uses (kHeaderSize + 10000) for the
+  // first two records, so the "1000" here looks like a typo for "10000"; as
+  // written the offset lands inside the third record rather than exactly at
+  // the fourth's start.  The reader still resynchronizes to record 3, so the
+  // test passes either way -- confirm intent before changing.
+  CheckInitialOffsetRecord(
+      2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
+      3);
+}
+
+TEST(LogTest, ReadEnd) {
+  CheckOffsetPastEndReturnsNoRecords(0);
+}
+
+TEST(LogTest, ReadPastEnd) {
+  CheckOffsetPastEndReturnsNoRecords(5);
+}
+
+}  // namespace log
+}  // namespace rocksdb
+
+// Test driver: runs every TEST registered above.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/log_writer.cc b/db/log_writer.cc
new file mode 100644 (file)
index 0000000..df601a4
--- /dev/null
@@ -0,0 +1,108 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_writer.h"
+
+#include <stdint.h>
+#include "rocksdb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace rocksdb {
+namespace log {
+
+// Construct a Writer that appends to *dest; starts at block offset 0, so
+// the destination file is expected to be initially empty (see header).
+Writer::Writer(unique_ptr<WritableFile>&& dest)
+    : dest_(std::move(dest)),
+      block_offset_(0) {
+  // Pre-compute the crc32c of every record-type byte so EmitPhysicalRecord
+  // can seed the payload checksum without re-hashing the type on each call.
+  for (int i = 0; i <= kMaxRecordType; i++) {
+    char t = static_cast<char>(i);
+    type_crc_[i] = crc32c::Value(&t, 1);
+  }
+}
+
+Writer::~Writer() {
+}
+
+// Append one logical record, splitting it into First/Middle/Last physical
+// fragments wherever it crosses a block boundary.  Returns the first
+// non-OK status from the underlying file, if any.
+Status Writer::AddRecord(const Slice& slice) {
+  const char* ptr = slice.data();
+  size_t left = slice.size();
+
+  // Fragment the record if necessary and emit it.  Note that if slice
+  // is empty, we still want to iterate once to emit a single
+  // zero-length record
+  Status s;
+  bool begin = true;
+  do {
+    const int leftover = kBlockSize - block_offset_;
+    assert(leftover >= 0);
+    if (leftover < kHeaderSize) {
+      // Switch to a new block
+      if (leftover > 0) {
+        // Fill the trailer (literal below relies on kHeaderSize being 7)
+        // leftover is at most 6 here, so six zero bytes suffice.
+        assert(kHeaderSize == 7);
+        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
+      }
+      block_offset_ = 0;
+    }
+
+    // Invariant: we never leave < kHeaderSize bytes in a block.
+    assert(kBlockSize - block_offset_ - kHeaderSize >= 0);
+
+    const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
+    const size_t fragment_length = (left < avail) ? left : avail;
+
+    // Pick the record type from the fragment's position within the record.
+    RecordType type;
+    const bool end = (left == fragment_length);
+    if (begin && end) {
+      type = kFullType;
+    } else if (begin) {
+      type = kFirstType;
+    } else if (end) {
+      type = kLastType;
+    } else {
+      type = kMiddleType;
+    }
+
+    s = EmitPhysicalRecord(type, ptr, fragment_length);
+    ptr += fragment_length;
+    left -= fragment_length;
+    begin = false;
+  } while (s.ok() && left > 0);
+  return s;
+}
+
+// Write one physical record.  Layout: checksum (4 bytes) | length (2 bytes,
+// little endian) | type (1 byte) | payload (n bytes).
+Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
+  assert(n <= 0xffff);  // Must fit in two bytes
+  assert(block_offset_ + kHeaderSize + n <= kBlockSize);
+
+  // Format the header
+  char buf[kHeaderSize];
+  buf[4] = static_cast<char>(n & 0xff);
+  buf[5] = static_cast<char>(n >> 8);
+  buf[6] = static_cast<char>(t);
+
+  // Compute the crc of the record type and the payload.
+  // type_crc_[t] already covers the type byte (precomputed in the ctor).
+  uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
+  crc = crc32c::Mask(crc);                 // Adjust for storage
+  EncodeFixed32(buf, crc);
+
+  // Write the header and the payload
+  Status s = dest_->Append(Slice(buf, kHeaderSize));
+  if (s.ok()) {
+    s = dest_->Append(Slice(ptr, n));
+    if (s.ok()) {
+      s = dest_->Flush();
+    }
+  }
+  // Advance unconditionally: the offset tracks bytes handed to dest_,
+  // whether or not the write succeeded.
+  block_offset_ += kHeaderSize + n;
+  return s;
+}
+
+}  // namespace log
+}  // namespace rocksdb
diff --git a/db/log_writer.h b/db/log_writer.h
new file mode 100644 (file)
index 0000000..d7b7aff
--- /dev/null
@@ -0,0 +1,55 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+#include "db/log_format.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class WritableFile;
+
+using std::unique_ptr;
+
+namespace log {
+
+// Appends log records to a WritableFile using the block/fragment format
+// declared in db/log_format.h.
+class Writer {
+ public:
+  // Create a writer that will append data to "*dest".
+  // "*dest" must be initially empty.
+  // "*dest" must remain live while this Writer is in use.
+  explicit Writer(unique_ptr<WritableFile>&& dest);
+  ~Writer();
+
+  // Append one logical record; it may be split into several physical
+  // fragments when it crosses block boundaries.
+  Status AddRecord(const Slice& slice);
+
+  WritableFile* file() { return dest_.get(); }
+  const WritableFile* file() const { return dest_.get(); }
+
+ private:
+  unique_ptr<WritableFile> dest_;
+  int block_offset_;       // Current offset in block
+
+  // crc32c values for all supported record types.  These are
+  // pre-computed to reduce the overhead of computing the crc of the
+  // record type stored in the header.
+  uint32_t type_crc_[kMaxRecordType + 1];
+
+  Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
+
+  // No copying allowed
+  Writer(const Writer&);
+  void operator=(const Writer&);
+};
+
+}  // namespace log
+}  // namespace rocksdb
diff --git a/db/memtable.cc b/db/memtable.cc
new file mode 100644 (file)
index 0000000..baff4fb
--- /dev/null
@@ -0,0 +1,358 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/memtable.h"
+
+#include <memory>
+
+#include "db/dbformat.h"
+#include "db/merge_context.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/murmurhash.h"
+#include "util/statistics_imp.h"
+
+// Specialize std::hash for rocksdb::Slice so a Slice can be hashed directly
+// (used by MemTable::GetLock below to shard keys across RW locks).
+namespace std {
+template <>
+struct hash<rocksdb::Slice> {
+  size_t operator()(const rocksdb::Slice& slice) const {
+    return MurmurHash(slice.data(), slice.size(), 0);
+  }
+};
+}
+
+namespace rocksdb {
+
+MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
+    : comparator_(cmp),
+      refs_(0),
+      arena_impl_(options.arena_block_size),
+      table_(options.memtable_factory->CreateMemTableRep(comparator_,
+                                                         &arena_impl_)),
+      flush_in_progress_(false),
+      flush_completed_(false),
+      file_number_(0),
+      first_seqno_(0),
+      mem_next_logfile_number_(0),
+      mem_logfile_number_(0),
+      // The per-key-shard lock pool is only allocated when in-place
+      // updates are enabled; otherwise it stays empty.
+      locks_(options.inplace_update_support ? options.inplace_update_num_locks
+                                            : 0) {}
+
+MemTable::~MemTable() {
+  // Every reference must have been released (see Ref/Unref in the header).
+  assert(refs_ == 0);
+}
+
+// Memory usage is the arena (key/value storage) plus whatever bookkeeping
+// the memtable rep allocates on top of it.
+size_t MemTable::ApproximateMemoryUsage() {
+  return arena_impl_.ApproximateMemoryUsage() +
+         table_->ApproximateMemoryUsage();
+}
+
+// Compare two entries as stored in the memtable rep: each is a
+// length-prefixed internal key.
+int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
+    const {
+  // Internal keys are encoded as length-prefixed strings.
+  Slice a = GetLengthPrefixedSlice(aptr);
+  Slice b = GetLengthPrefixedSlice(bptr);
+  return comparator.Compare(a, b);
+}
+
+// Recover the user key from a length-prefixed internal key by stripping the
+// 8-byte trailer (packed sequence number and value type; see MemTable::Add).
+Slice MemTableRep::UserKey(const char* key) const {
+  Slice slice = GetLengthPrefixedSlice(key);
+  return Slice(slice.data(), slice.size() - 8);
+}
+
+// Encode a suitable internal key target for "target" and return it.
+// Uses *scratch as scratch space, and the returned pointer will point
+// into this scratch space.  The pointer is only valid until *scratch is
+// next modified.
+static const char* EncodeKey(std::string* scratch, const Slice& target) {
+  scratch->clear();
+  PutVarint32(scratch, target.size());
+  scratch->append(target.data(), target.size());
+  return scratch->data();
+}
+
+// Iterator over a MemTableRep that exposes the rep's raw length-prefixed
+// entries as internal-key / value Slices.
+class MemTableIterator: public Iterator {
+ public:
+  MemTableIterator(MemTableRep* table, const ReadOptions& options)
+    : iter_() {
+    // Pick the cheapest iterator the rep can offer for the access pattern
+    // requested via ReadOptions (fixed prefix, prefix seek, or full scan).
+    if (options.prefix) {
+      iter_ = table->GetPrefixIterator(*options.prefix);
+    } else if (options.prefix_seek) {
+      iter_ = table->GetDynamicPrefixIterator();
+    } else {
+      iter_ = table->GetIterator();
+    }
+  }
+
+  virtual bool Valid() const { return iter_->Valid(); }
+  virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); }
+  virtual void SeekToFirst() { iter_->SeekToFirst(); }
+  virtual void SeekToLast() { iter_->SeekToLast(); }
+  virtual void Next() { iter_->Next(); }
+  virtual void Prev() { iter_->Prev(); }
+  virtual Slice key() const {
+    return GetLengthPrefixedSlice(iter_->key());
+  }
+  virtual Slice value() const {
+    // The length-prefixed value is stored immediately after the key bytes.
+    Slice key_slice = GetLengthPrefixedSlice(iter_->key());
+    return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+  }
+
+  virtual Status status() const { return Status::OK(); }
+
+ private:
+  std::shared_ptr<MemTableRep::Iterator> iter_;
+  std::string tmp_;       // For passing to EncodeKey
+
+  // No copying allowed
+  MemTableIterator(const MemTableIterator&);
+  void operator=(const MemTableIterator&);
+};
+
+// Caller owns the returned iterator; the MemTable must outlive it.
+Iterator* MemTable::NewIterator(const ReadOptions& options) {
+  return new MemTableIterator(table_.get(), options);
+}
+
+// Map a user key onto one of the fixed pool of RW locks that guard
+// in-place value updates (pool size set in the constructor).
+port::RWMutex* MemTable::GetLock(const Slice& key) {
+  return &locks_[std::hash<Slice>()(key) % locks_.size()];
+}
+
+void MemTable::Add(SequenceNumber s, ValueType type,
+                   const Slice& key,
+                   const Slice& value) {
+  // Format of an entry is concatenation of:
+  //  key_size     : varint32 of internal_key.size()
+  //  key bytes    : char[internal_key.size()]
+  //  value_size   : varint32 of value.size()
+  //  value bytes  : char[value.size()]
+  size_t key_size = key.size();
+  size_t val_size = value.size();
+  // Internal key = user key + 8-byte trailer packing (sequence << 8 | type).
+  size_t internal_key_size = key_size + 8;
+  const size_t encoded_len =
+      VarintLength(internal_key_size) + internal_key_size +
+      VarintLength(val_size) + val_size;
+  char* buf = arena_impl_.Allocate(encoded_len);
+  char* p = EncodeVarint32(buf, internal_key_size);
+  memcpy(p, key.data(), key_size);
+  p += key_size;
+  EncodeFixed64(p, (s << 8) | type);
+  p += 8;
+  p = EncodeVarint32(p, val_size);
+  memcpy(p, value.data(), val_size);
+  assert((p + val_size) - buf == (unsigned)encoded_len);
+  table_->Insert(buf);
+
+  // The first sequence number inserted into the memtable
+  // (sequence numbers are expected to arrive strictly increasing).
+  assert(first_seqno_ == 0 || s > first_seqno_);
+  if (first_seqno_ == 0) {
+    first_seqno_ = s;
+  }
+}
+
+// Look up `key`; see the contract documented in memtable.h.  Walks the
+// entries for the user key newest-first, accumulating merge operands until
+// a Put/Delete resolves the value or the key run ends.
+bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
+                   MergeContext& merge_context, const Options& options) {
+  Slice memkey = key.memtable_key();
+  std::shared_ptr<MemTableRep::Iterator> iter(
+    table_->GetIterator(key.user_key()));
+  iter->Seek(memkey.data());
+
+  // A MergeInProgress status on entry means operands were presumably
+  // collected by an earlier lookup step -- confirm with callers.
+  bool merge_in_progress = s->IsMergeInProgress();
+  auto merge_operator = options.merge_operator.get();
+  auto logger = options.info_log;
+  std::string merge_result;
+
+  for (; iter->Valid(); iter->Next()) {
+    // entry format is:
+    //    klength  varint32
+    //    userkey  char[klength-8]
+    //    tag      uint64
+    //    vlength  varint32
+    //    value    char[vlength]
+    // Check that it belongs to same user key.  We do not check the
+    // sequence number since the Seek() call above should have skipped
+    // all entries with overly large sequence numbers.
+    const char* entry = iter->key();
+    uint32_t key_length;
+    // entry + 5: a varint32 occupies at most 5 bytes.
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Compare(
+        Slice(key_ptr, key_length - 8), key.user_key()) == 0) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      switch (static_cast<ValueType>(tag & 0xff)) {
+        case kTypeValue: {
+          if (options.inplace_update_support) {
+            GetLock(key.user_key())->ReadLock();
+          }
+          Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+          *s = Status::OK();
+          if (merge_in_progress) {
+            assert(merge_operator);
+          if (!merge_operator->FullMerge(key.user_key(), &v,
+                                         merge_context.GetOperands(), value,
+                                         logger.get())) {
+              RecordTick(options.statistics.get(), NUMBER_MERGE_FAILURES);
+              *s = Status::Corruption("Error: Could not perform merge.");
+            }
+          } else {
+            value->assign(v.data(), v.size());
+          }
+          if (options.inplace_update_support) {
+            GetLock(key.user_key())->Unlock();
+          }
+          return true;
+        }
+        case kTypeDeletion: {
+          if (merge_in_progress) {
+            assert(merge_operator);
+            *s = Status::OK();
+          if (!merge_operator->FullMerge(key.user_key(), nullptr,
+                                         merge_context.GetOperands(), value,
+                                         logger.get())) {
+              RecordTick(options.statistics.get(), NUMBER_MERGE_FAILURES);
+              *s = Status::Corruption("Error: Could not perform merge.");
+            }
+          } else {
+            *s = Status::NotFound();
+          }
+          return true;
+        }
+        case kTypeMerge: {
+          // NOTE(review): unlike the two cases above, this path calls
+          // merge_operator->PartialMerge without asserting merge_operator is
+          // non-null; a merge record with no configured operator would
+          // dereference null -- confirm callers guarantee it is set.
+          Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+          merge_in_progress = true;
+          merge_context.PushOperand(v);
+          while(merge_context.GetNumOperands() >= 2) {
+            // Attempt to associative merge. (Returns true if successful)
+          if (merge_operator->PartialMerge(key.user_key(),
+                                           merge_context.GetOperand(0),
+                                           merge_context.GetOperand(1),
+                                           &merge_result, logger.get())) {
+              merge_context.PushPartialMergeResult(merge_result);
+            } else {
+              // Stack them because user can't associative merge
+              break;
+            }
+          }
+          break;
+        }
+        case kTypeLogData:
+          // Log-data entries are never inserted into the memtable.
+          assert(false);
+          break;
+      }
+    } else {
+      // exit loop if user key does not match
+      break;
+    }
+  }
+
+  // No change to value, since we have not yet found a Put/Delete
+
+  if (merge_in_progress) {
+    *s = Status::MergeInProgress("");
+  }
+  return false;
+}
+
+// Try to overwrite the newest existing value for `key` in place.  Only
+// succeeds when that entry is a kTypeValue whose stored value is at least
+// as large as the new one; otherwise returns false so the caller can Add().
+bool MemTable::Update(SequenceNumber seq, ValueType type,
+                      const Slice& key,
+                      const Slice& value) {
+  LookupKey lkey(key, seq);
+  Slice memkey = lkey.memtable_key();
+
+  std::shared_ptr<MemTableRep::Iterator> iter(
+    table_->GetIterator(lkey.user_key()));
+  iter->Seek(memkey.data());
+
+  if (iter->Valid()) {
+    // entry format is:
+    //    klength  varint32
+    //    userkey  char[klength-8]
+    //    tag      uint64
+    //    vlength  varint32
+    //    value    char[vlength]
+    // Check that it belongs to same user key.  We do not check the
+    // sequence number since the Seek() call above should have skipped
+    // all entries with overly large sequence numbers.
+    const char* entry = iter->key();
+    uint32_t key_length;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Compare(
+        Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      switch (static_cast<ValueType>(tag & 0xff)) {
+        case kTypeValue: {
+          uint32_t vlength;
+          GetVarint32Ptr(key_ptr + key_length,
+                         key_ptr + key_length+5, &vlength);
+          // Update value, if newValue size  <= curValue size
+          if (value.size() <= vlength) {
+            char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
+                                     value.size());
+            WriteLock wl(GetLock(lkey.user_key()));
+            memcpy(p, value.data(), value.size());
+            assert(
+              (p + value.size()) - entry ==
+              (unsigned) (VarintLength(key_length) +
+                          key_length +
+                          VarintLength(value.size()) +
+                          value.size())
+            );
+            return true;
+          }
+        }
+        // Note: when the new value is larger than the existing slot,
+        // control intentionally falls through from kTypeValue to default.
+        default:
+          // If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData
+          // then we probably don't have enough space to update in-place
+          // Maybe do something later
+          // Return false, and do normal Add()
+          return false;
+      }
+    }
+  }
+
+  // Key doesn't exist
+  return false;
+}
+
+size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
+  Slice memkey = key.memtable_key();
+
+  // A total ordered iterator is costly for some memtablerep (prefix aware
+  // reps). By passing in the user key, we allow efficient iterator creation.
+  // The iterator only needs to be ordered within the same user key.
+  std::shared_ptr<MemTableRep::Iterator> iter(
+    table_->GetIterator(key.user_key()));
+  iter->Seek(memkey.data());
+
+  size_t num_successive_merges = 0;
+
+  for (; iter->Valid(); iter->Next()) {
+    const char* entry = iter->key();
+    uint32_t key_length;
+    const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (!comparator_.comparator.user_comparator()->Compare(
+        Slice(iter_key_ptr, key_length - 8), key.user_key()) == 0) {
+      break;
+    }
+
+    const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8);
+    if (static_cast<ValueType>(tag & 0xff) != kTypeMerge) {
+      break;
+    }
+
+    ++num_successive_merges;
+  }
+
+  return num_successive_merges;
+}
+
+}  // namespace rocksdb
diff --git a/db/memtable.h b/db/memtable.h
new file mode 100644 (file)
index 0000000..24a2c85
--- /dev/null
@@ -0,0 +1,178 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <string>
+#include <memory>
+#include <deque>
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "db/version_set.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "util/arena_impl.h"
+
+namespace rocksdb {
+
+class Mutex;
+class MemTableIterator;
+class MergeContext;
+
+class MemTable {
+ public:
+  // Adapts the InternalKeyComparator to the MemTableRep interface, which
+  // compares raw length-prefixed entries (see memtable.cc).
+  struct KeyComparator : public MemTableRep::KeyComparator {
+    const InternalKeyComparator comparator;
+    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
+    virtual int operator()(const char* a, const char* b) const;
+  };
+
+  // MemTables are reference counted.  The initial reference count
+  // is zero and the caller must call Ref() at least once.
+  explicit MemTable(const InternalKeyComparator& comparator,
+                    const Options& options = Options());
+
+  ~MemTable();
+
+  // Increase reference count.
+  // NOTE(review): refs_ is a plain int with no internal synchronization;
+  // callers presumably serialize Ref/Unref externally -- confirm.
+  void Ref() { ++refs_; }
+
+  // Drop reference count.
+  // If the refcount goes to zero return this memtable, otherwise return null
+  MemTable* Unref() {
+    --refs_;
+    assert(refs_ >= 0);
+    if (refs_ <= 0) {
+      return this;
+    }
+    return nullptr;
+  }
+
+  // Returns an estimate of the number of bytes of data in use by this
+  // data structure.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  size_t ApproximateMemoryUsage();
+
+  // Return an iterator that yields the contents of the memtable.
+  //
+  // The caller must ensure that the underlying MemTable remains live
+  // while the returned iterator is live.  The keys returned by this
+  // iterator are internal keys encoded by AppendInternalKey in the
+  // db/dbformat.{h,cc} module.
+  //
+  // If options.prefix is supplied, it is passed to the underlying MemTableRep
+  // as a hint that the iterator only need to support access to keys with that
+  // specific prefix.
+  // If options.prefix is not supplied and options.prefix_seek is set, the
+  // iterator is not bound to a specific prefix. However, the semantics of
+  // Seek is changed - the result might only include keys with the same prefix
+  // as the seek-key.
+  Iterator* NewIterator(const ReadOptions& options = ReadOptions());
+
+  // Add an entry into memtable that maps key to value at the
+  // specified sequence number and with the specified type.
+  // Typically value will be empty if type==kTypeDeletion.
+  void Add(SequenceNumber seq, ValueType type,
+           const Slice& key,
+           const Slice& value);
+
+  // If memtable contains a value for key, store it in *value and return true.
+  // If memtable contains a deletion for key, store a NotFound() error
+  // in *status and return true.
+  // If memtable contains Merge operation as the most recent entry for a key,
+  //   and the merge process does not stop (not reaching a value or delete),
+  //   prepend the current merge operand to *operands.
+  //   store MergeInProgress in s, and return false.
+  // Else, return false.
+  bool Get(const LookupKey& key, std::string* value, Status* s,
+           MergeContext& merge_context, const Options& options);
+
+  // Update the value and return status ok,
+  //   if key exists in current memtable
+  //     if new sizeof(new_value) <= sizeof(old_value) &&
+  //       old_value for that key is a put i.e. kTypeValue
+  //     else return false, and status - NotUpdatable()
+  //   else return false, and status - NotFound()
+  bool Update(SequenceNumber seq, ValueType type,
+              const Slice& key,
+              const Slice& value);
+
+  // Returns the number of successive merge entries starting from the newest
+  // entry for the key up to the last non-merge entry or last entry for the
+  // key in the memtable.
+  size_t CountSuccessiveMergeEntries(const LookupKey& key);
+
+  // Returns the edits area that is needed for flushing the memtable
+  VersionEdit* GetEdits() { return &edit_; }
+
+  // Returns the sequence number of the first element that was inserted
+  // into the memtable
+  SequenceNumber GetFirstSequenceNumber() { return first_seqno_; }
+
+  // Returns the next active logfile number when this memtable is about to
+  // be flushed to storage
+  uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
+
+  // Sets the next active logfile number when this memtable is about to
+  // be flushed to storage
+  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
+
+  // Returns the logfile number that can be safely deleted when this
+  // memstore is flushed to storage
+  uint64_t GetLogNumber() { return mem_logfile_number_; }
+
+  // Sets the logfile number that can be safely deleted when this
+  // memstore is flushed to storage
+  void SetLogNumber(uint64_t num) { mem_logfile_number_ = num; }
+
+  // Notify the underlying storage that no more items will be added
+  void MarkImmutable() { table_->MarkReadOnly(); }
+
+ private:
+  friend class MemTableIterator;
+  friend class MemTableBackwardIterator;
+  friend class MemTableList;
+
+  KeyComparator comparator_;
+  int refs_;
+  ArenaImpl arena_impl_;
+  shared_ptr<MemTableRep> table_;
+
+  // These are used to manage memtable flushes to storage
+  bool flush_in_progress_; // started the flush
+  bool flush_completed_;   // finished the flush
+  uint64_t file_number_;    // filled up after flush is complete
+
+  // The updates to be applied to the transaction log when this
+  // memtable is flushed to storage.
+  VersionEdit edit_;
+
+  // The sequence number of the kv that was inserted first
+  SequenceNumber first_seqno_;
+
+  // The log files earlier than this number can be deleted.
+  uint64_t mem_next_logfile_number_;
+
+  // The log file that backs this memtable (to be deleted when
+  // memtable flush is done)
+  uint64_t mem_logfile_number_;
+
+  // rw locks for inplace updates; empty unless inplace_update_support is on
+  std::vector<port::RWMutex> locks_;
+
+  // No copying allowed
+  MemTable(const MemTable&);
+  void operator=(const MemTable&);
+
+  // Get the lock associated for the key
+  port::RWMutex* GetLock(const Slice& key);
+};
+
+}  // namespace rocksdb
diff --git a/db/memtablelist.cc b/db/memtablelist.cc
new file mode 100644 (file)
index 0000000..71e4e5a
--- /dev/null
@@ -0,0 +1,222 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "db/memtablelist.h"
+
+#include <string>
+#include "rocksdb/db.h"
+#include "db/memtable.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+class InternalKeyComparator;
+class Mutex;
+class MemTableListIterator;
+class VersionSet;
+
+using std::list;
+
+// Increase reference count on all underling memtables
+void MemTableList::RefAll() {
+  for (auto &memtable : memlist_) {
+    memtable->Ref();
+  }
+}
+
+// Drop reference count on all underling memtables. If the
+// refcount of an underlying memtable drops to zero, then
+// return it in to_delete vector.
+void MemTableList::UnrefAll(std::vector<MemTable*>* to_delete) {
+  for (auto &memtable : memlist_) {
+    MemTable* m = memtable->Unref();
+    if (m != nullptr) {
+      to_delete->push_back(m);
+    }
+  }
+}
+
// Returns the total number of memtables in the list.
// Invariant: the number of memtables that have not yet started flushing
// can never exceed the total list size.
int MemTableList::size() {
  assert(num_flush_not_started_ <= size_);
  return size_;
}
+
+// Returns true if there is at least one memtable on which flush has
+// not yet started.
+bool MemTableList::IsFlushPending(int min_write_buffer_number_to_merge) {
+  if ((flush_requested_ && num_flush_not_started_ >= 1) ||
+      (num_flush_not_started_ >= min_write_buffer_number_to_merge)) {
+    assert(imm_flush_needed.NoBarrier_Load() != nullptr);
+    return true;
+  }
+  return false;
+}
+
+// Returns the memtables that need to be flushed.
+void MemTableList::PickMemtablesToFlush(std::vector<MemTable*>* ret) {
+  for (auto it = memlist_.rbegin(); it != memlist_.rend(); it++) {
+    MemTable* m = *it;
+    if (!m->flush_in_progress_) {
+      assert(!m->flush_completed_);
+      num_flush_not_started_--;
+      if (num_flush_not_started_ == 0) {
+        imm_flush_needed.Release_Store(nullptr);
+      }
+      m->flush_in_progress_ = true; // flushing will start very soon
+      ret->push_back(m);
+    }
+  }
+  flush_requested_ = false; // start-flush request is complete
+}
+
// Record a successful flush in the manifest file.
//
// Called with the db mutex held. If flushStatus is not ok, every
// memtable in 'mems' is reset so a later attempt can retry the flush.
// Otherwise the memtables are marked completed and, provided no other
// thread is already committing, committed to the manifest in FIFO
// (creation) order via vset->LogAndApply(). Memtables whose refcount
// drops to zero during commit are returned in *to_delete.
// NOTE(review): LogAndApply can release and reacquire *mu, so list
// state may change across that call — ordering below depends on it.
Status MemTableList::InstallMemtableFlushResults(
                      const std::vector<MemTable*> &mems,
                      VersionSet* vset, Status flushStatus,
                      port::Mutex* mu, Logger* info_log,
                      uint64_t file_number,
                      std::set<uint64_t>& pending_outputs,
                      std::vector<MemTable*>* to_delete) {
  mu->AssertHeld();

  // If the flush was not successful, then just reset state.
  // Maybe a succeeding attempt to flush will be successful.
  if (!flushStatus.ok()) {
    for (MemTable* m : mems) {
      assert(m->flush_in_progress_);
      assert(m->file_number_ == 0);

      m->flush_in_progress_ = false;
      m->flush_completed_ = false;
      m->edit_.Clear();
      num_flush_not_started_++;
      imm_flush_needed.Release_Store((void *)1);
      pending_outputs.erase(file_number);
    }
    return flushStatus;
  }

  // flush was successful
  for (size_t i = 0; i < mems.size(); ++i) {
    // All the edits are associated with the first memtable of this batch.
    assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);

    mems[i]->flush_completed_ = true;
    mems[i]->file_number_ = file_number;
  }

  // if some other thread is already committing, then return
  Status s;
  if (commit_in_progress_) {
    return s;
  }

  // Only a single thread can be executing this piece of code
  commit_in_progress_ = true;

  // scan all memtables from the earliest, and commit those
  // (in that order) that have finished flushing. Memtables
  // are always committed in the order that they were created.
  while (!memlist_.empty() && s.ok()) {
    MemTable* m = memlist_.back(); // get the last element
    if (!m->flush_completed_) {
      break;
    }

    Log(info_log,
        "Level-0 commit table #%lu started",
        (unsigned long)m->file_number_);

    // this can release and reacquire the mutex.
    s = vset->LogAndApply(&m->edit_, mu);

    // All the later memtables that have the same filenum
    // are part of the same batch. They can be committed now.
    uint64_t mem_id = 1;  // how many memtables have been flushed.
    do {
      if (s.ok()) { // commit new state
        Log(info_log,
            "Level-0 commit table #%lu: memtable #%lu done",
            (unsigned long)m->file_number_,
            (unsigned long)mem_id);
        memlist_.remove(m);
        assert(m->file_number_ > 0);

        // pending_outputs can be cleared only after the newly created file
        // has been written to a committed version so that other concurrently
        // executing compaction threads do not mistakenly assume that this
        // file is not live.
        pending_outputs.erase(m->file_number_);
        if (m->Unref() != nullptr) {
          to_delete->push_back(m);
        }
        size_--;
      } else {
        // commit failed. setup state so that we can flush again.
        Log(info_log,
            "Level-0 commit table #%lu: memtable #%lu failed",
            (unsigned long)m->file_number_,
            (unsigned long)mem_id);
        m->flush_completed_ = false;
        m->flush_in_progress_ = false;
        m->edit_.Clear();
        num_flush_not_started_++;
        pending_outputs.erase(m->file_number_);
        m->file_number_ = 0;
        imm_flush_needed.Release_Store((void *)1);
        s = Status::IOError("Unable to commit flushed memtable");
      }
      ++mem_id;
    } while (!memlist_.empty() && (m = memlist_.back()) &&
             m->file_number_ == file_number);
  }
  commit_in_progress_ = false;
  return s;
}
+
+// New memtables are inserted at the front of the list.
+void MemTableList::Add(MemTable* m) {
+  assert(size_ >= num_flush_not_started_);
+  size_++;
+  memlist_.push_front(m);
+  m->MarkImmutable();
+  num_flush_not_started_++;
+  if (num_flush_not_started_ == 1) {
+    imm_flush_needed.Release_Store((void *)1);
+  }
+}
+
+// Returns an estimate of the number of bytes of data in use.
+size_t MemTableList::ApproximateMemoryUsage() {
+  size_t size = 0;
+  for (auto &memtable : memlist_) {
+    size += memtable->ApproximateMemoryUsage();
+  }
+  return size;
+}
+
+// Search all the memtables starting from the most recent one.
+// Return the most recent value found, if any.
+// Operands stores the list of merge operations to apply, so far.
+bool MemTableList::Get(const LookupKey& key, std::string* value, Status* s,
+                       MergeContext& merge_context, const Options& options) {
+  for (auto &memtable : memlist_) {
+    if (memtable->Get(key, value, s, merge_context, options)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void MemTableList::GetMemTables(std::vector<MemTable*>* output) {
+  for (auto &memtable : memlist_) {
+    output->push_back(memtable);
+  }
+}
+
+}  // namespace rocksdb
diff --git a/db/memtablelist.h b/db/memtablelist.h
new file mode 100644 (file)
index 0000000..ed353c8
--- /dev/null
@@ -0,0 +1,108 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#pragma once
+#include <string>
+#include <list>
+#include <deque>
+#include "rocksdb/db.h"
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "memtable.h"
+
+namespace rocksdb {
+
+class InternalKeyComparator;
+class Mutex;
+class MemTableListIterator;
+
//
// This class stores references to all the immutable memtables.
// The memtables are flushed to L0 as soon as possible and in
// any order. If there are more than one immutable memtable, their
// flushes can occur concurrently.  However, they are 'committed'
// to the manifest in FIFO order to maintain correctness and
// recoverability from a crash.
//
class MemTableList {
 public:
  // A list of memtables.
  MemTableList() : size_(0), num_flush_not_started_(0),
    commit_in_progress_(false),
    flush_requested_(false) {
    imm_flush_needed.Release_Store(nullptr);
  }
  ~MemTableList() {};

  // so that background threads can detect a non-nullptr pointer to
  // determine whether there is anything more to start flushing.
  port::AtomicPointer imm_flush_needed;

  // Increase reference count on all underlying memtables
  void RefAll();

  // Drop reference count on all underlying memtables. If the refcount
  // on an underlying memtable drops to zero, then return it in
  // to_delete vector.
  void UnrefAll(std::vector<MemTable*>* to_delete);

  // Returns the total number of memtables in the list
  int size();

  // Returns true if there is at least one memtable on which flush has
  // not yet started.
  bool IsFlushPending(int min_write_buffer_number_to_merge);

  // Returns the earliest memtables that needs to be flushed. The returned
  // memtables are guaranteed to be in the ascending order of created time.
  void PickMemtablesToFlush(std::vector<MemTable*>* mems);

  // Commit a successful flush in the manifest file
  Status InstallMemtableFlushResults(const std::vector<MemTable*> &m,
                      VersionSet* vset, Status flushStatus,
                      port::Mutex* mu, Logger* info_log,
                      uint64_t file_number,
                      std::set<uint64_t>& pending_outputs,
                      std::vector<MemTable*>* to_delete);

  // New memtables are inserted at the front of the list.
  // Takes ownership of the reference held on *m by the caller of Add().
  void Add(MemTable* m);

  // Returns an estimate of the number of bytes of data in use.
  size_t ApproximateMemoryUsage();

  // Search all the memtables starting from the most recent one.
  // Return the most recent value found, if any.
  bool Get(const LookupKey& key, std::string* value, Status* s,
           MergeContext& merge_context, const Options& options);

  // Returns the list of underlying memtables.
  void GetMemTables(std::vector<MemTable*>* list);

  // Request a flush of all existing memtables to storage
  void FlushRequested() { flush_requested_ = true; }

  // NOTE(review): copying is currently not disallowed; if value
  // semantics are unintended, re-enable these declarations:
  // MemTableList(const MemTableList&);
  // void operator=(const MemTableList&);

 private:
  // Memtables, newest at the front, oldest at the back.
  std::list<MemTable*> memlist_;

  // total number of memtables in memlist_
  int size_;

  // the number of elements that still need flushing
  int num_flush_not_started_;

  // committing in progress
  bool commit_in_progress_;

  // Requested a flush of all memtables to storage
  bool flush_requested_;

};
+
+}  // namespace rocksdb
diff --git a/db/merge_context.h b/db/merge_context.h
new file mode 100644 (file)
index 0000000..91d9f8a
--- /dev/null
@@ -0,0 +1,69 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include <string>
+#include <deque>
+
+namespace rocksdb {
+
// Shared empty fallback returned by GetOperands() when no operand has
// been pushed ('const' gives this internal linkage, so each translation
// unit including this header gets its own copy).
const std::deque<std::string> empty_operand_list;

// The merge context for merging a user key.
// When doing a Get(), DB will create such a class and pass it when
// issuing Get() operation to memtables and version_set. The operands
// will be fetched from the context when issuing partial or full merge.
class MergeContext {
public:
  // Clear all the operands
  void Clear() {
    if (operand_list) {
      operand_list->clear();
    }
  }
  // Replace the two operands at the front of the list with their merge
  // result, which the caller passes in via merge_result.
  void PushPartialMergeResult(std::string& merge_result) {
    assert (operand_list);
    operand_list->pop_front();
    swap(operand_list->front(), merge_result);
  }
  // Push a merge operand (copied out of the slice)
  void PushOperand(const Slice& operand_slice) {
    Initialize();
    operand_list->push_front(operand_slice.ToString());
  }
  // return total number of operands in the list
  size_t GetNumOperands() const {
    if (!operand_list) {
      return 0;
    }
    return operand_list->size();
  }
  // Get the operand at the index.
  Slice GetOperand(int index) const {
    assert (operand_list);
    return (*operand_list)[index];
  }
  // Return all the operands.
  const std::deque<std::string>& GetOperands() const {
    if (!operand_list) {
      return empty_operand_list;
    }
    return *operand_list;
  }
private:
  // Lazily allocate the operand list; most Get() calls never see a merge.
  void Initialize() {
    if (!operand_list) {
      operand_list.reset(new std::deque<std::string>());
    }
  }
  std::unique_ptr<std::deque<std::string>> operand_list;
};
+
+} // namespace rocksdb
+
diff --git a/db/merge_helper.cc b/db/merge_helper.cc
new file mode 100644 (file)
index 0000000..a7e2df0
--- /dev/null
@@ -0,0 +1,198 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "merge_helper.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "util/statistics_imp.h"
+#include <string>
+#include <stdio.h>
+
+namespace rocksdb {
+
// PRE:  iter points to the first merge type entry
// POST: iter points to the first entry beyond the merge process (or the end)
//       keys_, operands_ are updated to reflect the merge result.
//       keys_ stores the list of keys encountered while merging.
//       operands_ stores the list of merge operands encountered while merging.
//       keys_[i] corresponds to operands_[i] for each i.
//
// stop_before: merging never consumes entries at or below this sequence
//              number (0 means no restriction), so snapshots stay intact.
// at_bottom:   true when iter covers the bottommost level, i.e. the entire
//              history of this user key is potentially visible.
void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
                             bool at_bottom, Statistics* stats) {
  // Get a copy of the internal key, before it's invalidated by iter->Next()
  // Also maintain the list of merge operands seen.
  keys_.clear();
  operands_.clear();
  keys_.push_front(iter->key().ToString());
  operands_.push_front(iter->value().ToString());

  success_ = false;   // Will become true if we hit Put/Delete or bottom

  // We need to parse the internal key again as the parsed key is
  // backed by the internal key!
  // Assume no internal key corruption as it has been successfully parsed
  // by the caller.
  // Invariant: keys_.back() will not change. Hence, orig_ikey is always valid.
  ParsedInternalKey orig_ikey;
  ParseInternalKey(keys_.back(), &orig_ikey);

  bool hit_the_next_user_key = false;
  ParsedInternalKey ikey;
  std::string merge_result;  // Temporary value for merge results
  for (iter->Next(); iter->Valid(); iter->Next()) {
    assert(operands_.size() >= 1);        // Should be invariants!
    assert(keys_.size() == operands_.size());

    if (!ParseInternalKey(iter->key(), &ikey)) {
      // stop at corrupted key
      if (assert_valid_internal_key_) {
        assert(!"corrupted internal key is not expected");
      }
      break;
    }

    if (user_comparator_->Compare(ikey.user_key, orig_ikey.user_key) != 0) {
      // hit a different user key, stop right here
      hit_the_next_user_key = true;
      break;
    }

    if (stop_before && ikey.sequence <= stop_before) {
      // hit an entry that's visible by the previous snapshot, can't touch that
      break;
    }

    // At this point we are guaranteed that we need to process this key.

    if (kTypeDeletion == ikey.type) {
      // hit a delete
      //   => merge nullptr with operands_
      //   => store result in operands_.back() (and update keys_.back())
      //   => change the entry type to kTypeValue for keys_.back()
      // We are done! Return a success if the merge passes.
      success_ = user_merge_operator_->FullMerge(ikey.user_key, nullptr,
                                                 operands_, &merge_result,
                                                 logger_);

      // We store the result in keys_.back() and operands_.back()
      // if nothing went wrong (i.e.: no operand corruption on disk)
      if (success_) {
        std::string& key = keys_.back();  // The original key encountered
        orig_ikey.type = kTypeValue;
        UpdateInternalKey(&key[0], key.size(),
                          orig_ikey.sequence, orig_ikey.type);
        swap(operands_.back(), merge_result);
      } else {
        RecordTick(stats, NUMBER_MERGE_FAILURES);
      }

      // move iter to the next entry (before doing anything else)
      iter->Next();
      return;
    }

    if (kTypeValue == ikey.type) {
      // hit a put
      //   => merge the put value with operands_
      //   => store result in operands_.back() (and update keys_.back())
      //   => change the entry type to kTypeValue for keys_.back()
      // We are done! Success!
      const Slice value = iter->value();
      success_ = user_merge_operator_->FullMerge(ikey.user_key, &value,
                                                 operands_, &merge_result,
                                                 logger_);

      // We store the result in keys_.back() and operands_.back()
      // if nothing went wrong (i.e.: no operand corruption on disk)
      if (success_) {
        std::string& key = keys_.back();  // The original key encountered
        orig_ikey.type = kTypeValue;
        UpdateInternalKey(&key[0], key.size(),
                          orig_ikey.sequence, orig_ikey.type);
        swap(operands_.back(), merge_result);
      } else {
        RecordTick(stats, NUMBER_MERGE_FAILURES);
      }

      // move iter to the next entry
      iter->Next();
      return;
    }

    if (kTypeMerge == ikey.type) {
      // hit a merge
      //   => merge the operand into the front of the operands_ list
      //   => use the user's associative merge function to determine how.
      //   => then continue because we haven't yet seen a Put/Delete.
      assert(!operands_.empty()); // Should have at least one element in it

      keys_.push_front(iter->key().ToString());
      operands_.push_front(iter->value().ToString());
      while (operands_.size() >= 2) {
        // Returns false when the merge_operator can no longer process it
        if (user_merge_operator_->PartialMerge(ikey.user_key,
                                               Slice(operands_[0]),
                                               Slice(operands_[1]),
                                               &merge_result,
                                               logger_)) {
          // Merging of operands (associative merge) was successful.
          // Replace these frontmost two operands with the merge result
          keys_.pop_front();
          operands_.pop_front();
          swap(operands_.front(), merge_result);
        } else {
          // Merging of operands (associative merge) returned false.
          // The user merge_operator does not know how to merge these operands.
          // So we just stack them up until we find a Put/Delete or end of key.
          break;
        }
      }
      continue;
    }
  }

  // We are sure we have seen this key's entire history if we are at the
  // last level and exhausted all internal keys of this user key.
  // NOTE: !iter->Valid() does not necessarily mean we hit the
  // beginning of a user key, as versions of a user key might be
  // split into multiple files (even files on the same level)
  // and some files might not be included in the compaction/merge.
  //
  // There are also cases where we have seen the root of history of this
  // key without being sure of it. Then, we simply miss the opportunity
  // to combine the keys. Since VersionSet::SetupOtherInputs() always makes
  // sure that all merge-operands on the same level get compacted together,
  // this will simply lead to these merge operands moving to the next level.
  //
  // So, we only perform the following logic (to merge all operands together
  // without a Put/Delete) if we are certain that we have seen the end of key.
  bool surely_seen_the_beginning = hit_the_next_user_key && at_bottom;
  if (surely_seen_the_beginning) {
    // do a final merge with nullptr as the existing value and say
    // bye to the merge type (it's now converted to a Put)
    assert(kTypeMerge == orig_ikey.type);
    assert(operands_.size() >= 1);
    assert(operands_.size() == keys_.size());
    success_ = user_merge_operator_->FullMerge(orig_ikey.user_key, nullptr,
                                               operands_, &merge_result,
                                               logger_);

    if (success_) {
      std::string& key = keys_.back();  // The original key encountered
      orig_ikey.type = kTypeValue;
      UpdateInternalKey(&key[0], key.size(),
                        orig_ikey.sequence, orig_ikey.type);

      // The final value() is always stored in operands_.back()
      swap(operands_.back(),merge_result);
    } else {
      RecordTick(stats, NUMBER_MERGE_FAILURES);
      // Do nothing if not success_. Leave keys() and operands() as they are.
    }
  }
}
+
+} // namespace rocksdb
diff --git a/db/merge_helper.h b/db/merge_helper.h
new file mode 100644 (file)
index 0000000..6fe9bfb
--- /dev/null
@@ -0,0 +1,102 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef MERGE_HELPER_H
+#define MERGE_HELPER_H
+
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include <string>
+#include <deque>
+
+namespace rocksdb {
+
+class Comparator;
+class Iterator;
+class Logger;
+class MergeOperator;
+class Statistics;
+
// Drives the merge process during compaction and iteration: collects a
// run of merge operands for one user key and combines them with the
// user-supplied MergeOperator.
class MergeHelper {
 public:
  MergeHelper(const Comparator* user_comparator,
              const MergeOperator* user_merge_operator,
              Logger* logger,
              bool assert_valid_internal_key)
      : user_comparator_(user_comparator),
        user_merge_operator_(user_merge_operator),
        logger_(logger),
        assert_valid_internal_key_(assert_valid_internal_key),
        keys_(),
        operands_(),
        success_(false) {}

  // Merge entries until we hit
  //     - a corrupted key
  //     - a Put/Delete,
  //     - a different user key,
  //     - a specific sequence number (snapshot boundary),
  //  or - the end of iteration
  // iter: (IN)  points to the first merge type entry
  //       (OUT) points to the first entry not included in the merge process
  // stop_before: (IN) a sequence number that merge should not cross.
  //                   0 means no restriction
  // at_bottom:   (IN) true if the iterator covers the bottom level, which means
  //                   we could reach the start of the history of this user key.
  void MergeUntil(Iterator* iter, SequenceNumber stop_before = 0,
                  bool at_bottom = false, Statistics* stats = nullptr);

  // Query the merge result
  // These are valid until the next MergeUntil call
  // If the merging was successful:
  //   - IsSuccess() will be true
  //   - key() will have the latest sequence number of the merges.
  //           The type will be Put or Merge. See IMPORTANT 1 note, below.
  //   - value() will be the result of merging all the operands together
  //   - The user should ignore keys() and values().
  //
  //   IMPORTANT 1: the key type could change after the MergeUntil call.
  //        Put/Delete + Merge + ... + Merge => Put
  //        Merge + ... + Merge => Merge
  //
  // If the merge operator is not associative, and if a Put/Delete is not found
  // then the merging will be unsuccessful. In this case:
  //   - IsSuccess() will be false
  //   - keys() contains the list of internal keys seen in order of iteration.
  //   - values() contains the list of values (merges) seen in the same order.
  //              values() is parallel to keys() so that the first entry in
  //              keys() is the key associated with the first entry in values()
  //              and so on. These lists will be the same length.
  //              All of these pairs will be merges over the same user key.
  //              See IMPORTANT 2 note below.
  //   - The user should ignore key() and value().
  //
  //   IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
  //                So keys().back() was the first key seen by iterator.
  // TODO: Re-style this comment to be like the first one
  bool IsSuccess() { return success_; }
  Slice key() { assert(success_); return Slice(keys_.back()); }
  Slice value() { assert(success_); return Slice(operands_.back()); }
  const std::deque<std::string>& keys() { assert(!success_); return keys_; }
  const std::deque<std::string>& values() {
    assert(!success_); return operands_;
  }

 private:
  const Comparator* user_comparator_;
  const MergeOperator* user_merge_operator_;
  Logger* logger_;
  bool assert_valid_internal_key_; // enforce no internal key corruption?

  // the scratch area that holds the result of MergeUntil
  // valid up to the next MergeUntil call
  std::deque<std::string> keys_;    // Keeps track of the sequence of keys seen
  std::deque<std::string> operands_;  // Parallel with keys_; stores the values
  bool success_;
};
+
+} // namespace rocksdb
+
+#endif
diff --git a/db/merge_operator.cc b/db/merge_operator.cc
new file mode 100644 (file)
index 0000000..7d1ee4e
--- /dev/null
@@ -0,0 +1,53 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+/**
+ * Back-end implementation details specific to the Merge Operator.
+ */
+
+#include "rocksdb/merge_operator.h"
+
+namespace rocksdb {
+
// Given a "real" merge from the library, call the user's
// associative merge function one-by-one on each of the operands.
// NOTE: It is assumed that the client's merge-operator will handle any errors.
bool AssociativeMergeOperator::FullMerge(
    const Slice& key,
    const Slice* existing_value,
    const std::deque<std::string>& operand_list,
    std::string* new_value,
    Logger* logger) const {

  // Simply loop through the operands, folding each one into the running
  // result. On the first iteration existing_value may be nullptr (no base
  // value); the user's Merge() must handle that.
  Slice temp_existing;
  std::string temp_value;
  for (const auto& operand : operand_list) {
    Slice value(operand);
    if (!Merge(key, existing_value, value, &temp_value, logger)) {
      return false;
    }
    // Feed this round's result back in as the next round's existing
    // value. temp_existing must view *new_value (not temp_value),
    // because temp_value is overwritten by the next Merge() call.
    swap(temp_value, *new_value);
    temp_existing = Slice(*new_value);
    existing_value = &temp_existing;
  }

  // The result will be in *new_value. All merges succeeded.
  return true;
}
+
+// Call the user defined simple merge on the operands;
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::PartialMerge(
+    const Slice& key,
+    const Slice& left_operand,
+    const Slice& right_operand,
+    std::string* new_value,
+    Logger* logger) const {
+
+  return Merge(key, &left_operand, right_operand, new_value, logger);
+}
+
+} // namespace rocksdb
diff --git a/db/merge_test.cc b/db/merge_test.cc
new file mode 100644 (file)
index 0000000..887d8ad
--- /dev/null
@@ -0,0 +1,389 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <assert.h>
+#include <memory>
+#include <iostream>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/write_batch_internal.h"
+#include "utilities/merge_operators.h"
+#include "util/testharness.h"
+#include "utilities/utility_db.h"
+
+using namespace std;
+using namespace rocksdb;
+
+namespace {
+  int numMergeOperatorCalls;
+
+  void resetNumMergeOperatorCalls() {
+    numMergeOperatorCalls = 0;
+  }
+}
+
+class CountMergeOperator : public AssociativeMergeOperator {
+ public:
+  CountMergeOperator() {
+    mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
+  }
+
+  virtual bool Merge(const Slice& key,
+                     const Slice* existing_value,
+                     const Slice& value,
+                     std::string* new_value,
+                     Logger* logger) const override {
+    ++numMergeOperatorCalls;
+    return mergeOperator_->PartialMerge(
+        key,
+        *existing_value,
+        value,
+        new_value,
+        logger);
+  }
+
+  virtual const char* Name() const override {
+    return "UInt64AddOperator";
+  }
+
+ private:
+  std::shared_ptr<MergeOperator> mergeOperator_;
+};
+
+std::shared_ptr<DB> OpenDb(
+    const string& dbname,
+    const bool ttl = false,
+    const unsigned max_successive_merges = 0) {
+  DB* db;
+  StackableDB* sdb;
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = std::make_shared<CountMergeOperator>();
+  options.max_successive_merges = max_successive_merges;
+  Status s;
+  DestroyDB(dbname, Options());
+  if (ttl) {
+    cout << "Opening database with TTL\n";
+    s = UtilityDB::OpenTtlDB(options, dbname, &sdb);
+    db = sdb;
+  } else {
+    s = DB::Open(options, dbname, &db);
+  }
+  if (!s.ok()) {
+    cerr << s.ToString() << endl;
+    assert(false);
+  }
+  return std::shared_ptr<DB>(db);
+}
+
+// Imagine we are maintaining a set of uint64 counters.
+// Each counter has a distinct name. And we would like
+// to support four high level operations:
+// set, add, get and remove
+// This is a quick implementation without a Merge operation.
+class Counters {
+
+ protected:
+  std::shared_ptr<DB> db_;
+
+  WriteOptions put_option_;
+  ReadOptions get_option_;
+  WriteOptions delete_option_;
+
+  uint64_t default_;
+
+ public:
+  explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+      : db_(db),
+        put_option_(),
+        get_option_(),
+        delete_option_(),
+        default_(defaultCount) {
+    assert(db_);
+  }
+
+  virtual ~Counters() {}
+
+  // public interface of Counters.
+  // All four functions return false
+  // if the underlying level db operation failed.
+
+  // mapped to a rocksdb Put
+  bool set(const string& key, uint64_t value) {
+    // just treat the internal rep of int64 as the string
+    Slice slice((char *)&value, sizeof(value));
+    auto s = db_->Put(put_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      cerr << s.ToString() << endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Delete
+  bool remove(const string& key) {
+    auto s = db_->Delete(delete_option_, key);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Get
+  bool get(const string& key, uint64_t *value) {
+    string str;
+    auto s = db_->Get(get_option_, key, &str);
+
+    if (s.IsNotFound()) {
+      // return default value if not found;
+      *value = default_;
+      return true;
+    } else if (s.ok()) {
+      // deserialization
+      if (str.size() != sizeof(uint64_t)) {
+        cerr << "value corruption\n";
+        return false;
+      }
+      *value = DecodeFixed64(&str[0]);
+      return true;
+    } else {
+      cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // 'add' is implemented as get -> modify -> set
+  // An alternative is a single merge operation, see MergeBasedCounters
+  virtual bool add(const string& key, uint64_t value) {
+    uint64_t base = default_;
+    return get(key, &base) && set(key, base + value);
+  }
+
+
+  // convenience functions for testing
+  void assert_set(const string& key, uint64_t value) {
+    assert(set(key, value));
+  }
+
+  void assert_remove(const string& key) {
+    assert(remove(key));
+  }
+
+  uint64_t assert_get(const string& key) {
+    uint64_t value = default_;
+    assert(get(key, &value));
+    return value;
+  }
+
+  void assert_add(const string& key, uint64_t value) {
+    assert(add(key, value));
+  }
+};
+
+// Implement 'add' directly with the new Merge operation
+class MergeBasedCounters : public Counters {
+ private:
+  WriteOptions merge_option_; // for merge
+
+ public:
+  explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+      : Counters(db, defaultCount),
+        merge_option_() {
+  }
+
+  // mapped to a rocksdb Merge operation
+  virtual bool add(const string& key, uint64_t value) override {
+    char encoded[sizeof(uint64_t)];
+    EncodeFixed64(encoded, value);
+    Slice slice(encoded, sizeof(uint64_t));
+    auto s = db_->Merge(merge_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      cerr << s.ToString() << endl;
+      return false;
+    }
+  }
+};
+
+void dumpDb(DB* db) {
+  auto it = unique_ptr<Iterator>(db->NewIterator(ReadOptions()));
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    uint64_t value = DecodeFixed64(it->value().data());
+    cout << it->key().ToString() << ": "  << value << endl;
+  }
+  assert(it->status().ok());  // Check for any errors found during the scan
+}
+
+void testCounters(Counters& counters, DB* db, bool test_compaction) {
+
+  FlushOptions o;
+  o.wait = true;
+
+  counters.assert_set("a", 1);
+
+  if (test_compaction) db->Flush(o);
+
+  assert(counters.assert_get("a") == 1);
+
+  counters.assert_remove("b");
+
+  // default value is 0 if non-existent
+  assert(counters.assert_get("b") == 0);
+
+  counters.assert_add("a", 2);
+
+  if (test_compaction) db->Flush(o);
+
+  // 1+2 = 3
+  assert(counters.assert_get("a")== 3);
+
+  dumpDb(db);
+
+  std::cout << "1\n";
+
+  // 1+...+49 = ?
+  uint64_t sum = 0;
+  for (int i = 1; i < 50; i++) {
+    counters.assert_add("b", i);
+    sum += i;
+  }
+  assert(counters.assert_get("b") == sum);
+
+  std::cout << "2\n";
+  dumpDb(db);
+
+  std::cout << "3\n";
+
+  if (test_compaction) {
+    db->Flush(o);
+
+    cout << "Compaction started ...\n";
+    db->CompactRange(nullptr, nullptr);
+    cout << "Compaction ended\n";
+
+    dumpDb(db);
+
+    assert(counters.assert_get("a")== 3);
+    assert(counters.assert_get("b") == sum);
+  }
+}
+
+void testSuccessiveMerge(
+    Counters& counters, int max_num_merges, int num_merges) {
+
+  counters.assert_remove("z");
+  uint64_t sum = 0;
+
+  for (int i = 1; i <= num_merges; ++i) {
+    resetNumMergeOperatorCalls();
+    counters.assert_add("z", i);
+    sum += i;
+
+    if (i % (max_num_merges + 1) == 0) {
+      assert(numMergeOperatorCalls == max_num_merges + 1);
+    } else {
+      assert(numMergeOperatorCalls == 0);
+    }
+
+    resetNumMergeOperatorCalls();
+    assert(counters.assert_get("z") == sum);
+    assert(numMergeOperatorCalls == i % (max_num_merges + 1));
+  }
+}
+
+void testSingleBatchSuccessiveMerge(
+    DB* db,
+    int max_num_merges,
+    int num_merges) {
+  assert(num_merges > max_num_merges);
+
+  Slice key("BatchSuccessiveMerge");
+  uint64_t merge_value = 1;
+  Slice merge_value_slice((char *)&merge_value, sizeof(merge_value));
+
+  // Create the batch
+  WriteBatch batch;
+  for (int i = 0; i < num_merges; ++i) {
+    batch.Merge(key, merge_value_slice);
+  }
+
+  // Apply to memtable and count the number of merges
+  resetNumMergeOperatorCalls();
+  {
+    Status s = db->Write(WriteOptions(), &batch);
+    assert(s.ok());
+  }
+  assert(numMergeOperatorCalls ==
+      num_merges - (num_merges % (max_num_merges + 1)));
+
+  // Get the value
+  resetNumMergeOperatorCalls();
+  string get_value_str;
+  {
+    Status s = db->Get(ReadOptions(), key, &get_value_str);
+    assert(s.ok());
+  }
+  assert(get_value_str.size() == sizeof(uint64_t));
+  uint64_t get_value = DecodeFixed64(&get_value_str[0]);
+  ASSERT_EQ(get_value, num_merges * merge_value);
+  ASSERT_EQ(numMergeOperatorCalls, (num_merges % (max_num_merges + 1)));
+}
+
+void runTest(int argc, const string& dbname, const bool use_ttl = false) {
+  auto db = OpenDb(dbname, use_ttl);
+
+  {
+    cout << "Test read-modify-write counters... \n";
+    Counters counters(db, 0);
+    testCounters(counters, db.get(), true);
+  }
+
+  bool compact = false;
+  if (argc > 1) {
+    compact = true;
+    cout << "Turn on Compaction\n";
+  }
+
+  {
+    cout << "Test merge-based counters... \n";
+    MergeBasedCounters counters(db, 0);
+    testCounters(counters, db.get(), compact);
+  }
+
+  DestroyDB(dbname, Options());
+  db.reset();
+
+  {
+    cout << "Test merge in memtable... \n";
+    unsigned maxMerge = 5;
+    auto db = OpenDb(dbname, use_ttl, maxMerge);
+    MergeBasedCounters counters(db, 0);
+    testCounters(counters, db.get(), compact);
+    testSuccessiveMerge(counters, maxMerge, maxMerge * 2);
+    testSingleBatchSuccessiveMerge(db.get(), 5, 7);
+    DestroyDB(dbname, Options());
+  }
+
+}
+
+int main(int argc, char *argv[]) {
+  //TODO: Make this test like a general rocksdb unit-test
+  runTest(argc, test::TmpDir() + "/merge_testdb");
+  runTest(argc, test::TmpDir() + "/merge_testdbttl", true); // Run test on TTL database
+  return 0;
+}
diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc
new file mode 100644 (file)
index 0000000..0934de0
--- /dev/null
@@ -0,0 +1,328 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <algorithm>
+#include <iostream>
+#include <vector>
+#include "/usr/include/valgrind/callgrind.h"
+
+#include "rocksdb/db.h"
+#include "rocksdb/perf_context.h"
+#include "util/histogram.h"
+#include "util/stop_watch.h"
+#include "util/testharness.h"
+
+
+bool FLAGS_random_key = false;
+bool FLAGS_use_set_based_memetable = false;
+int FLAGS_total_keys = 100;
+int FLAGS_write_buffer_size = 1000000000;
+int FLAGS_max_write_buffer_number = 8;
+int FLAGS_min_write_buffer_number_to_merge = 7;
+
+// Path to the database on file system
+const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test";
+
+namespace rocksdb {
+
+std::shared_ptr<DB> OpenDb() {
+    DB* db;
+    Options options;
+    options.create_if_missing = true;
+    options.write_buffer_size = FLAGS_write_buffer_size;
+    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+    options.min_write_buffer_number_to_merge =
+      FLAGS_min_write_buffer_number_to_merge;
+
+    if (FLAGS_use_set_based_memetable) {
+      auto prefix_extractor = rocksdb::NewFixedPrefixTransform(0);
+      options.memtable_factory.reset(
+          NewHashSkipListRepFactory(prefix_extractor));
+    }
+
+    Status s = DB::Open(options, kDbName,  &db);
+    ASSERT_OK(s);
+    return std::shared_ptr<DB>(db);
+}
+
+class PerfContextTest { };
+
+TEST(PerfContextTest, SeekIntoDeletion) {
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    db->Put(write_options, key, value);
+  }
+
+  for (int i = 0; i < FLAGS_total_keys -1 ; ++i) {
+    std::string key = "k" + std::to_string(i);
+    db->Delete(write_options, key);
+  }
+
+  HistogramImpl hist_get;
+  HistogramImpl hist_get_time;
+  for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value;
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    auto status = db->Get(read_options, key, &value);
+    auto elapsed_nanos = timer.ElapsedNanos();
+    ASSERT_TRUE(status.IsNotFound());
+    hist_get.Add(perf_context.user_key_comparison_count);
+    hist_get_time.Add(elapsed_nanos);
+  }
+
+  std::cout << "Get uesr key comparison: \n" << hist_get.ToString()
+            << "Get time: \n" << hist_get_time.ToString();
+
+  HistogramImpl hist_seek_to_first;
+  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+  perf_context.Reset();
+  StopWatchNano timer(Env::Default(), true);
+  iter->SeekToFirst();
+  hist_seek_to_first.Add(perf_context.user_key_comparison_count);
+  auto elapsed_nanos = timer.ElapsedNanos();
+
+  std::cout << "SeekToFirst uesr key comparison: \n" << hist_seek_to_first.ToString()
+            << "ikey skipped: " << perf_context.internal_key_skipped_count << "\n"
+            << "idelete skipped: " << perf_context.internal_delete_skipped_count << "\n"
+            << "elapsed: " << elapsed_nanos << "\n";
+
+  HistogramImpl hist_seek;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    std::string key = "k" + std::to_string(i);
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    iter->Seek(key);
+    auto elapsed_nanos = timer.ElapsedNanos();
+    hist_seek.Add(perf_context.user_key_comparison_count);
+    std::cout << "seek cmp: " << perf_context.user_key_comparison_count
+              << " ikey skipped " << perf_context.internal_key_skipped_count
+              << " idelete skipped " << perf_context.internal_delete_skipped_count
+              << " elapsed: " << elapsed_nanos << "ns\n";
+
+    perf_context.Reset();
+    ASSERT_TRUE(iter->Valid());
+    StopWatchNano timer2(Env::Default(), true);
+    iter->Next();
+    auto elapsed_nanos2 = timer2.ElapsedNanos();
+    std::cout << "next cmp: " << perf_context.user_key_comparison_count
+              << "elapsed: " << elapsed_nanos2 << "ns\n";
+  }
+
+  std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString();
+}
+
+TEST(PerfContextTest, StopWatchNanoOverhead) {
+  // profile the timer cost by itself!
+  const int kTotalIterations = 1000000;
+  std::vector<uint64_t> timings(kTotalIterations);
+
+  StopWatchNano timer(Env::Default(), true);
+  for (auto& timing : timings) {
+    timing = timer.ElapsedNanos(true /* reset */);
+  }
+
+  HistogramImpl histogram;
+  for (const auto timing : timings) {
+    histogram.Add(timing);
+  }
+
+  std::cout << histogram.ToString();
+}
+
+TEST(PerfContextTest, StopWatchOverhead) {
+  // profile the timer cost by itself!
+  const int kTotalIterations = 1000000;
+  std::vector<uint64_t> timings(kTotalIterations);
+
+  StopWatch timer(Env::Default());
+  for (auto& timing : timings) {
+    timing = timer.ElapsedMicros();
+  }
+
+  HistogramImpl histogram;
+  uint64_t prev_timing = 0;
+  for (const auto timing : timings) {
+    histogram.Add(timing - prev_timing);
+    prev_timing = timing;
+  }
+
+  std::cout << histogram.ToString();
+}
+
+void ProfileKeyComparison() {
+  DestroyDB(kDbName, Options());    // Start this test with a fresh DB
+
+  auto db = OpenDb();
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  HistogramImpl hist_put;
+  HistogramImpl hist_get;
+
+  std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+
+  std::vector<int> keys;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    keys.push_back(i);
+  }
+
+  if (FLAGS_random_key) {
+    std::random_shuffle(keys.begin(), keys.end());
+  }
+
+  for (const int i : keys) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    perf_context.Reset();
+    db->Put(write_options, key, value);
+    hist_put.Add(perf_context.user_key_comparison_count);
+
+    perf_context.Reset();
+    db->Get(read_options, key, &value);
+    hist_get.Add(perf_context.user_key_comparison_count);
+  }
+
+  std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
+            << "Get uesr key comparison: \n" << hist_get.ToString();
+
+}
+
+TEST(PerfContextTest, KeyComparisonCount) {
+  SetPerfLevel(kEnableCount);
+  ProfileKeyComparison();
+
+  SetPerfLevel(kDisable);
+  ProfileKeyComparison();
+
+  SetPerfLevel(kEnableTime);
+  ProfileKeyComparison();
+}
+
+// make perf_context_test
+// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison
+// For one memtable:
+// ./perf_context_test --write_buffer_size=500000 --total_keys=10000
+// For two memtables:
+// ./perf_context_test --write_buffer_size=250000 --total_keys=10000
+// Specify --random_key=1 to shuffle the key before insertion
+// Results show that, for sequential insertion, worst-case Seek Key comparison
+// is close to the total number of keys (linear), when there is only one
+// memtable. When there are two memtables, even the avg Seek Key comparison
+// starts to become linear to the input size.
+
+TEST(PerfContextTest, SeekKeyComparison) {
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+
+  std::vector<int> keys;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    keys.push_back(i);
+  }
+
+  if (FLAGS_random_key) {
+    std::random_shuffle(keys.begin(), keys.end());
+  }
+
+  HistogramImpl hist_put_time;
+  HistogramImpl hist_wal_time;
+  HistogramImpl hist_time_diff;
+
+  SetPerfLevel(kEnableTime);
+  StopWatchNano timer(Env::Default());
+  for (const int i : keys) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    perf_context.Reset();
+    timer.Start();
+    db->Put(write_options, key, value);
+    auto put_time = timer.ElapsedNanos();
+    hist_put_time.Add(put_time);
+    hist_wal_time.Add(perf_context.wal_write_time);
+    hist_time_diff.Add(put_time - perf_context.wal_write_time);
+  }
+
+  std::cout << "Put time:\n" << hist_put_time.ToString()
+            << "WAL time:\n" << hist_wal_time.ToString()
+            << "time diff:\n" << hist_time_diff.ToString();
+
+  HistogramImpl hist_seek;
+  HistogramImpl hist_next;
+
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    perf_context.Reset();
+    iter->Seek(key);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->value().ToString(), value);
+    hist_seek.Add(perf_context.user_key_comparison_count);
+  }
+
+  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+  for (iter->SeekToFirst(); iter->Valid();) {
+    perf_context.Reset();
+    iter->Next();
+    hist_next.Add(perf_context.user_key_comparison_count);
+  }
+
+  std::cout << "Seek:\n" << hist_seek.ToString()
+            << "Next:\n" << hist_next.ToString();
+}
+
+}
+
+int main(int argc, char** argv) {
+
+  for (int i = 1; i < argc; i++) {
+    int n;
+    char junk;
+
+    if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
+      FLAGS_write_buffer_size = n;
+    }
+
+    if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) {
+      FLAGS_total_keys = n;
+    }
+
+    if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 &&
+        (n == 0 || n == 1)) {
+      FLAGS_random_key = n;
+    }
+
+    if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 &&
+        (n == 0 || n == 1)) {
+      FLAGS_use_set_based_memetable = n;
+    }
+
+  }
+
+  std::cout << kDbName << "\n";
+
+  rocksdb::test::RunAllTests();
+  return 0;
+}
diff --git a/db/prefix_filter_iterator.h b/db/prefix_filter_iterator.h
new file mode 100644 (file)
index 0000000..f448837
--- /dev/null
@@ -0,0 +1,73 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Wrap an underlying iterator, but exclude any results not starting
+// with a given prefix.  Seeking to keys not beginning with the prefix
+// is invalid, and SeekToLast is not implemented (that would be
+// non-trivial), but otherwise this iterator will behave just like the
+// underlying iterator would if there happened to be no non-matching
+// keys in the dataset.
+
+#pragma once
+#include "rocksdb/iterator.h"
+
+namespace rocksdb {
+
+class PrefixFilterIterator : public Iterator {
+ private:
+  Iterator* iter_;
+  const Slice &prefix_;
+  const SliceTransform *prefix_extractor_;
+  Status status_;
+
+ public:
+  PrefixFilterIterator(Iterator* iter,
+                       const Slice &prefix,
+                       const SliceTransform* prefix_extractor)
+                             : iter_(iter), prefix_(prefix),
+                               prefix_extractor_(prefix_extractor),
+                               status_(Status::OK()) {
+    if (prefix_extractor == nullptr) {
+      status_ = Status::InvalidArgument("A prefix filter may not be used "
+                                        "unless a function is also defined "
+                                        "for extracting prefixes");
+    } else if (!prefix_extractor_->InRange(prefix)) {
+      status_ = Status::InvalidArgument("Must provide a slice for prefix which"
+                                        "is a prefix for some key");
+    }
+  }
+  ~PrefixFilterIterator() {
+    delete iter_;
+  }
+  Slice key() const { return iter_->key(); }
+  Slice value() const { return iter_->value(); }
+  Status status() const {
+    if (!status_.ok()) {
+      return status_;
+    }
+    return iter_->status();
+  }
+  void Next() { iter_->Next(); }
+  void Prev() { iter_->Prev(); }
+  void Seek(const Slice& k) {
+    if (prefix_extractor_->Transform(k) == prefix_) {
+      iter_->Seek(k);
+    } else {
+      status_ = Status::InvalidArgument("Seek must begin with target prefix");
+    }
+  }
+  void SeekToFirst() {
+    Seek(prefix_);
+  }
+  void SeekToLast() {
+    status_ = Status::NotSupported("SeekToLast is incompatible with prefixes");
+  }
+  bool Valid() const {
+    return (status_.ok() && iter_->Valid() &&
+            prefix_extractor_->Transform(iter_->key()) == prefix_);
+  }
+};
+
+}  // namespace rocksdb
diff --git a/db/prefix_test.cc b/db/prefix_test.cc
new file mode 100644 (file)
index 0000000..7e5e9cc
--- /dev/null
@@ -0,0 +1,329 @@
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/perf_context.h"
+#include "util/histogram.h"
+#include "util/stop_watch.h"
+#include "util/testharness.h"
+
+DEFINE_bool(use_prefix_hash_memtable, true, "");
+DEFINE_bool(trigger_deadlock, false,
+            "issue delete in range scan to trigger PrefixHashMap deadlock");
+DEFINE_uint64(bucket_count, 100000, "number of buckets");
+DEFINE_uint64(num_locks, 10001, "number of locks");
+DEFINE_bool(random_prefix, false, "randomize prefix");
+DEFINE_uint64(total_prefixes, 1000, "total number of prefixes");
+DEFINE_uint64(items_per_prefix, 10, "total number of values per prefix");
+DEFINE_int64(write_buffer_size, 1000000000, "");
+DEFINE_int64(max_write_buffer_number, 8, "");
+DEFINE_int64(min_write_buffer_number_to_merge, 7, "");
+
+// Path to the database on file system
+const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test";
+
+namespace rocksdb {
+
+struct TestKey {
+  uint64_t prefix;
+  uint64_t sorted;
+
+  TestKey(uint64_t prefix, uint64_t sorted) : prefix(prefix), sorted(sorted) {}
+};
+
+// return a slice backed by test_key
+inline Slice TestKeyToSlice(const TestKey& test_key) {
+  return Slice((const char*)&test_key, sizeof(test_key));
+}
+
+inline const TestKey* SliceToTestKey(const Slice& slice) {
+  return (const TestKey*)slice.data();
+}
+
+class TestKeyComparator : public Comparator {
+ public:
+
+  // Compare needs to be aware of the possibility that a and/or b
+  // may be prefix-only keys
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    const TestKey* key_a = SliceToTestKey(a);
+    const TestKey* key_b = SliceToTestKey(b);
+    if (key_a->prefix != key_b->prefix) {
+      if (key_a->prefix < key_b->prefix) return -1;
+      if (key_a->prefix > key_b->prefix) return 1;
+    } else {
+      ASSERT_TRUE(key_a->prefix == key_b->prefix);
+      // note, both a and b could be prefix only
+      if (a.size() != b.size()) {
+        // one of them is prefix
+        ASSERT_TRUE(
+          (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) ||
+          (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey)));
+        if (a.size() < b.size()) return -1;
+        if (a.size() > b.size()) return 1;
+      } else {
+        // both a and b are prefix
+        if (a.size() == sizeof(uint64_t)) {
+          return 0;
+        }
+
+        // both a and b are whole key
+        ASSERT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey));
+        if (key_a->sorted < key_b->sorted) return -1;
+        if (key_a->sorted > key_b->sorted) return 1;
+        if (key_a->sorted == key_b->sorted) return 0;
+      }
+    }
+    return 0;
+  }
+
+  virtual const char* Name() const override {
+    return "TestKeyComparator";
+  }
+
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+  }
+
+  virtual void FindShortSuccessor(std::string* key) const {}
+
+};
+
+class PrefixTest {
+ public:
+  std::shared_ptr<DB> OpenDb() {
+    DB* db;
+
+    options.create_if_missing = true;
+    options.write_buffer_size = FLAGS_write_buffer_size;
+    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+    options.min_write_buffer_number_to_merge =
+      FLAGS_min_write_buffer_number_to_merge;
+
+    options.comparator = new TestKeyComparator();
+    if (FLAGS_use_prefix_hash_memtable) {
+      auto prefix_extractor = NewFixedPrefixTransform(8);
+      options.prefix_extractor = prefix_extractor;
+      options.memtable_factory.reset(NewHashSkipListRepFactory(
+          prefix_extractor, FLAGS_bucket_count));
+    }
+
+    Status s = DB::Open(options, kDbName,  &db);
+    ASSERT_OK(s);
+    return std::shared_ptr<DB>(db);
+  }
+  ~PrefixTest() {
+    delete options.comparator;
+  }
+ protected:
+  Options options;
+};
+
+TEST(PrefixTest, DynamicPrefixIterator) {
+
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  std::vector<uint64_t> prefixes;
+  for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
+    prefixes.push_back(i);
+  }
+
+  if (FLAGS_random_prefix) {
+    std::random_shuffle(prefixes.begin(), prefixes.end());
+  }
+
+  // insert x random prefix, each with y continuous element.
+  for (auto prefix : prefixes) {
+     for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
+      TestKey test_key(prefix, sorted);
+
+      Slice key = TestKeyToSlice(test_key);
+      std::string value = "v" + std::to_string(sorted);
+
+      ASSERT_OK(db->Put(write_options, key, value));
+    }
+  }
+
+  // test seek existing keys
+  HistogramImpl hist_seek_time;
+  HistogramImpl hist_seek_comparison;
+
+  if (FLAGS_use_prefix_hash_memtable) {
+    read_options.prefix_seek = true;
+  }
+  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+  for (auto prefix : prefixes) {
+    TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
+    Slice key = TestKeyToSlice(test_key);
+    std::string value = "v" + std::to_string(0);
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    uint64_t total_keys = 0;
+    for (iter->Seek(key); iter->Valid(); iter->Next()) {
+      if (FLAGS_trigger_deadlock) {
+        std::cout << "Behold the deadlock!\n";
+        db->Delete(write_options, iter->key());
+      }
+      auto test_key = SliceToTestKey(iter->key());
+      if (test_key->prefix != prefix) break;
+      total_keys++;
+    }
+    hist_seek_time.Add(timer.ElapsedNanos());
+    hist_seek_comparison.Add(perf_context.user_key_comparison_count);
+    ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2);
+  }
+
+  std::cout << "Seek key comparison: \n"
+            << hist_seek_comparison.ToString()
+            << "Seek time: \n"
+            << hist_seek_time.ToString();
+
+  // test non-existing keys
+  HistogramImpl hist_no_seek_time;
+  HistogramImpl hist_no_seek_comparison;
+
+  for (auto prefix = FLAGS_total_prefixes;
+       prefix < FLAGS_total_prefixes + 100;
+       prefix++) {
+    TestKey test_key(prefix, 0);
+    Slice key = TestKeyToSlice(test_key);
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    iter->Seek(key);
+    hist_no_seek_time.Add(timer.ElapsedNanos());
+    hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  std::cout << "non-existing Seek key comparison: \n"
+            << hist_no_seek_comparison.ToString()
+            << "non-existing Seek time: \n"
+            << hist_no_seek_time.ToString();
+}
+
+TEST(PrefixTest, PrefixHash) {
+
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  std::vector<uint64_t> prefixes;
+  for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
+    prefixes.push_back(i);
+  }
+
+  if (FLAGS_random_prefix) {
+    std::random_shuffle(prefixes.begin(), prefixes.end());
+  }
+
+  // insert x random prefix, each with y continuous element.
+  HistogramImpl hist_put_time;
+  HistogramImpl hist_put_comparison;
+
+  for (auto prefix : prefixes) {
+     for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
+      TestKey test_key(prefix, sorted);
+
+      Slice key = TestKeyToSlice(test_key);
+      std::string value = "v" + std::to_string(sorted);
+
+      perf_context.Reset();
+      StopWatchNano timer(Env::Default(), true);
+      ASSERT_OK(db->Put(write_options, key, value));
+      hist_put_time.Add(timer.ElapsedNanos());
+      hist_put_comparison.Add(perf_context.user_key_comparison_count);
+    }
+  }
+
+  std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
+            << "Put time: \n" << hist_put_time.ToString();
+
+
+  // test seek existing keys
+  HistogramImpl hist_seek_time;
+  HistogramImpl hist_seek_comparison;
+
+  for (auto prefix : prefixes) {
+    TestKey test_key(prefix, 0);
+    Slice key = TestKeyToSlice(test_key);
+    std::string value = "v" + std::to_string(0);
+
+    Slice key_prefix;
+    if (FLAGS_use_prefix_hash_memtable) {
+      key_prefix = options.prefix_extractor->Transform(key);
+      read_options.prefix = &key_prefix;
+    }
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    uint64_t total_keys = 0;
+    for (iter->Seek(key); iter->Valid(); iter->Next()) {
+      if (FLAGS_trigger_deadlock) {
+        std::cout << "Behold the deadlock!\n";
+        db->Delete(write_options, iter->key());
+      }
+      auto test_key = SliceToTestKey(iter->key());
+      if (test_key->prefix != prefix) break;
+      total_keys++;
+    }
+    hist_seek_time.Add(timer.ElapsedNanos());
+    hist_seek_comparison.Add(perf_context.user_key_comparison_count);
+    ASSERT_EQ(total_keys, FLAGS_items_per_prefix);
+  }
+
+  std::cout << "Seek key comparison: \n"
+            << hist_seek_comparison.ToString()
+            << "Seek time: \n"
+            << hist_seek_time.ToString();
+
+  // test non-existing keys
+  HistogramImpl hist_no_seek_time;
+  HistogramImpl hist_no_seek_comparison;
+
+  for (auto prefix = FLAGS_total_prefixes;
+       prefix < FLAGS_total_prefixes + 100;
+       prefix++) {
+    TestKey test_key(prefix, 0);
+    Slice key = TestKeyToSlice(test_key);
+
+    if (FLAGS_use_prefix_hash_memtable) {
+      Slice key_prefix = options.prefix_extractor->Transform(key);
+      read_options.prefix = &key_prefix;
+    }
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    iter->Seek(key);
+    hist_no_seek_time.Add(timer.ElapsedNanos());
+    hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  std::cout << "non-existing Seek key comparison: \n"
+            << hist_no_seek_comparison.ToString()
+            << "non-existing Seek time: \n"
+            << hist_no_seek_time.ToString();
+}
+
+}
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  std::cout << kDbName << "\n";
+
+  rocksdb::test::RunAllTests();
+  return 0;
+}
diff --git a/db/repair.cc b/db/repair.cc
new file mode 100644 (file)
index 0000000..2952423
--- /dev/null
@@ -0,0 +1,390 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// We recover the contents of the descriptor from the other files we find.
+// (1) Any log files are first converted to tables
+// (2) We scan every table to compute
+//     (a) smallest/largest for the table
+//     (b) largest sequence number in the table
+// (3) We generate descriptor contents:
+//      - log number is set to zero
+//      - next-file-number is set to 1 + largest file number we found
+//      - last-sequence-number is set to largest sequence# found across
+//        all tables (see 2c)
+//      - compaction pointers are cleared
+//      - every table file is added at level 0
+//
+// Possible optimization 1:
+//   (a) Compute total size and use to pick appropriate max-level M
+//   (b) Sort tables by largest sequence# in the table
+//   (c) For each table: if it overlaps earlier table, place in level-0,
+//       else place in level-M.
+// Possible optimization 2:
+//   Store per-table metadata (smallest, largest, largest-seq#, ...)
+//   in the table's meta section to speed up ScanTable.
+
+#include "db/builder.h"
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+
+namespace {
+
+// Repairer rebuilds a usable MANIFEST for the database at dbname_ from
+// whatever log and table files it finds on disk, following the algorithm
+// described in the file header comment.  Used once, via RepairDB().
+class Repairer {
+ public:
+  Repairer(const std::string& dbname, const Options& options)
+      : dbname_(dbname),
+        env_(options.env),
+        icmp_(options.comparator),
+        ipolicy_(options.filter_policy),
+        options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
+        next_file_number_(1) {
+    // TableCache can be small since we expect each table to be opened once.
+    table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10);
+    edit_ = new VersionEdit();
+  }
+
+  ~Repairer() {
+    delete table_cache_;
+    delete edit_;
+  }
+
+  // Runs the full repair pipeline: find files, convert every log file into
+  // a level-0 table, scan all tables for key-range/sequence metadata, then
+  // write a fresh descriptor (MANIFEST).  Returns the first fatal error;
+  // logs a summary of recovered files/bytes on success.
+  Status Run() {
+    Status status = FindFiles();
+    if (status.ok()) {
+      ConvertLogFilesToTables();
+      ExtractMetaData();
+      status = WriteDescriptor();
+    }
+    if (status.ok()) {
+      unsigned long long bytes = 0;
+      for (size_t i = 0; i < tables_.size(); i++) {
+        bytes += tables_[i].meta.file_size;
+      }
+      Log(options_.info_log,
+          "**** Repaired rocksdb %s; "
+          "recovered %d files; %llu bytes. "
+          "Some data may have been lost. "
+          "****",
+          dbname_.c_str(),
+          static_cast<int>(tables_.size()),
+          bytes);
+    }
+    return status;
+  }
+
+ private:
+  // Per-table metadata gathered by ScanTable(): file info plus the
+  // smallest/largest sequence numbers observed in the table.
+  struct TableInfo {
+    FileMetaData meta;
+    SequenceNumber min_sequence;
+    SequenceNumber max_sequence;
+  };
+
+  std::string const dbname_;
+  Env* const env_;
+  InternalKeyComparator const icmp_;
+  InternalFilterPolicy const ipolicy_;
+  Options const options_;
+  TableCache* table_cache_;
+  VersionEdit* edit_;
+
+  std::vector<std::string> manifests_;     // old MANIFEST file names found
+  std::vector<uint64_t> table_numbers_;    // table file numbers to scan
+  std::vector<uint64_t> logs_;             // log file numbers to convert
+  std::vector<TableInfo> tables_;          // successfully scanned tables
+  uint64_t next_file_number_;              // 1 + largest file number seen
+  const EnvOptions storage_options_;
+
+  // Lists the DB directory and buckets every recognizable file into
+  // manifests_/logs_/table_numbers_, tracking the largest file number seen.
+  // Fails if the directory cannot be listed or contains no files at all.
+  Status FindFiles() {
+    std::vector<std::string> filenames;
+    Status status = env_->GetChildren(dbname_, &filenames);
+    if (!status.ok()) {
+      return status;
+    }
+    if (filenames.empty()) {
+      return Status::IOError(dbname_, "repair found no files");
+    }
+
+    uint64_t number;
+    FileType type;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type)) {
+        if (type == kDescriptorFile) {
+          manifests_.push_back(filenames[i]);
+        } else {
+          if (number + 1 > next_file_number_) {
+            next_file_number_ = number + 1;
+          }
+          if (type == kLogFile) {
+            logs_.push_back(number);
+          } else if (type == kTableFile) {
+            table_numbers_.push_back(number);
+          } else {
+            // Ignore other files
+          }
+        }
+      }
+    }
+    return status;
+  }
+
+  // Converts each log file into a table file.  Conversion errors are logged
+  // and ignored; the log file is archived (moved to <dir>/lost) either way.
+  void ConvertLogFilesToTables() {
+    for (size_t i = 0; i < logs_.size(); i++) {
+      std::string logname = LogFileName(dbname_, logs_[i]);
+      Status status = ConvertLogToTable(logs_[i]);
+      if (!status.ok()) {
+        Log(options_.info_log, "Log #%llu: ignoring conversion error: %s",
+            (unsigned long long) logs_[i],
+            status.ToString().c_str());
+      }
+      ArchiveFile(logname);
+    }
+  }
+
+  // Replays one log file into a fresh memtable, then dumps that memtable
+  // into a new table file whose number is appended to table_numbers_.
+  // Per-record corruption is reported and skipped, not treated as fatal.
+  Status ConvertLogToTable(uint64_t log) {
+    struct LogReporter : public log::Reader::Reporter {
+      Env* env;
+      std::shared_ptr<Logger> info_log;
+      uint64_t lognum;
+      virtual void Corruption(size_t bytes, const Status& s) {
+        // We print error messages for corruption, but continue repairing.
+        Log(info_log, "Log #%llu: dropping %d bytes; %s",
+            (unsigned long long) lognum,
+            static_cast<int>(bytes),
+            s.ToString().c_str());
+      }
+    };
+
+    // Open the log file
+    std::string logname = LogFileName(dbname_, log);
+    unique_ptr<SequentialFile> lfile;
+    Status status = env_->NewSequentialFile(logname, &lfile, storage_options_);
+    if (!status.ok()) {
+      return status;
+    }
+
+    // Create the log reader.
+    LogReporter reporter;
+    reporter.env = env_;
+    reporter.info_log = options_.info_log;
+    reporter.lognum = log;
+    // We intentially make log::Reader do checksumming so that
+    // corruptions cause entire commits to be skipped instead of
+    // propagating bad information (like overly large sequence
+    // numbers).
+    // NOTE(review): the comment above says checksumming is enabled, but the
+    // call below passes false/*do not checksum*/ -- the comment and the
+    // flag disagree; confirm which behavior is actually intended.
+    log::Reader reader(std::move(lfile), &reporter, false/*do not checksum*/,
+                       0/*initial_offset*/);
+
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+    MemTable* mem = new MemTable(icmp_, options_);
+    mem->Ref();
+    int counter = 0;
+    while (reader.ReadRecord(&record, &scratch)) {
+      // 12 = WriteBatch header (8-byte sequence + 4-byte count); anything
+      // shorter cannot be a valid batch.
+      if (record.size() < 12) {
+        reporter.Corruption(
+            record.size(), Status::Corruption("log record too small"));
+        continue;
+      }
+      WriteBatchInternal::SetContents(&batch, record);
+      status = WriteBatchInternal::InsertInto(&batch, mem, &options_);
+      if (status.ok()) {
+        counter += WriteBatchInternal::Count(&batch);
+      } else {
+        Log(options_.info_log, "Log #%llu: ignoring %s",
+            (unsigned long long) log,
+            status.ToString().c_str());
+        status = Status::OK();  // Keep going with rest of file
+      }
+    }
+
+    // Do not record a version edit for this conversion to a Table
+    // since ExtractMetaData() will also generate edits.
+    FileMetaData meta;
+    meta.number = next_file_number_++;
+    Iterator* iter = mem->NewIterator();
+    status = BuildTable(dbname_, env_, options_, storage_options_,
+                        table_cache_, iter, &meta,
+                        icmp_.user_comparator(), 0, 0,
+                        kNoCompression);
+    delete iter;
+    // NOTE(review): assumes MemTable::Unref() returns the memtable pointer
+    // once the refcount drops to zero so it can be deleted here -- confirm
+    // against MemTable's interface.
+    delete mem->Unref();
+    mem = nullptr;
+    if (status.ok()) {
+      if (meta.file_size > 0) {
+        table_numbers_.push_back(meta.number);
+      }
+    }
+    Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
+        (unsigned long long) log,
+        counter,
+        (unsigned long long) meta.number,
+        status.ToString().c_str());
+    return status;
+  }
+
+  // Scans every candidate table; unreadable tables are archived and
+  // dropped, readable ones are recorded in tables_ for WriteDescriptor().
+  void ExtractMetaData() {
+    std::vector<TableInfo> kept;
+    for (size_t i = 0; i < table_numbers_.size(); i++) {
+      TableInfo t;
+      t.meta.number = table_numbers_[i];
+      Status status = ScanTable(&t);
+      if (!status.ok()) {
+        std::string fname = TableFileName(dbname_, table_numbers_[i]);
+        Log(options_.info_log, "Table #%llu: ignoring %s",
+            (unsigned long long) table_numbers_[i],
+            status.ToString().c_str());
+        ArchiveFile(fname);
+      } else {
+        tables_.push_back(t);
+      }
+    }
+  }
+
+  // Iterates over one table to determine its file size, smallest/largest
+  // keys, and min/max sequence numbers.  Unparsable keys are logged and
+  // skipped; an iterator error makes the whole table count as failed.
+  Status ScanTable(TableInfo* t) {
+    std::string fname = TableFileName(dbname_, t->meta.number);
+    int counter = 0;
+    Status status = env_->GetFileSize(fname, &t->meta.file_size);
+    if (status.ok()) {
+      Iterator* iter = table_cache_->NewIterator(
+          ReadOptions(), storage_options_, t->meta.number, t->meta.file_size);
+      bool empty = true;
+      ParsedInternalKey parsed;
+      // NOTE(review): min_sequence starts at 0, so the
+      // `parsed.sequence < t->min_sequence` update below can never fire and
+      // min_sequence is always reported as 0.  It should presumably start
+      // at the maximum sequence value (or be seeded from the first key).
+      t->min_sequence = 0;
+      t->max_sequence = 0;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        if (!ParseInternalKey(key, &parsed)) {
+          Log(options_.info_log, "Table #%llu: unparsable key %s",
+              (unsigned long long) t->meta.number,
+              EscapeString(key).c_str());
+          continue;
+        }
+
+        counter++;
+        if (empty) {
+          empty = false;
+          t->meta.smallest.DecodeFrom(key);
+        }
+        // Keys arrive in sorted order, so the last parsed key is largest.
+        t->meta.largest.DecodeFrom(key);
+        if (parsed.sequence < t->min_sequence) {
+          t->min_sequence = parsed.sequence;
+        }
+        if (parsed.sequence > t->max_sequence) {
+          t->max_sequence = parsed.sequence;
+        }
+      }
+      if (!iter->status().ok()) {
+        status = iter->status();
+      }
+      delete iter;
+    }
+    Log(options_.info_log, "Table #%llu: %d entries %s",
+        (unsigned long long) t->meta.number,
+        counter,
+        status.ToString().c_str());
+    return status;
+  }
+
+  // Writes a brand-new MANIFEST (descriptor file #1) that places every
+  // recovered table at level 0, archives all old manifests, and points
+  // CURRENT at the new descriptor.  On any failure the temp file is removed.
+  Status WriteDescriptor() {
+    std::string tmp = TempFileName(dbname_, 1);
+    unique_ptr<WritableFile> file;
+    Status status = env_->NewWritableFile(tmp, &file, storage_options_);
+    if (!status.ok()) {
+      return status;
+    }
+
+    // Last-sequence = largest sequence number seen across all tables.
+    SequenceNumber max_sequence = 0;
+    for (size_t i = 0; i < tables_.size(); i++) {
+      if (max_sequence < tables_[i].max_sequence) {
+        max_sequence = tables_[i].max_sequence;
+      }
+    }
+
+    edit_->SetComparatorName(icmp_.user_comparator()->Name());
+    edit_->SetLogNumber(0);
+    edit_->SetNextFile(next_file_number_);
+    edit_->SetLastSequence(max_sequence);
+
+    for (size_t i = 0; i < tables_.size(); i++) {
+      // TODO(opt): separate out into multiple levels
+      const TableInfo& t = tables_[i];
+      edit_->AddFile(0, t.meta.number, t.meta.file_size,
+                    t.meta.smallest, t.meta.largest,
+                    t.min_sequence, t.max_sequence);
+    }
+
+    //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
+    {
+      // Scope the log::Writer so the file is finalized before rename.
+      log::Writer log(std::move(file));
+      std::string record;
+      edit_->EncodeTo(&record);
+      status = log.AddRecord(record);
+    }
+
+    if (!status.ok()) {
+      env_->DeleteFile(tmp);
+    } else {
+      // Discard older manifests
+      for (size_t i = 0; i < manifests_.size(); i++) {
+        ArchiveFile(dbname_ + "/" + manifests_[i]);
+      }
+
+      // Install new manifest
+      status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
+      if (status.ok()) {
+        status = SetCurrentFile(env_, dbname_, 1);
+      } else {
+        env_->DeleteFile(tmp);
+      }
+    }
+    return status;
+  }
+
+  // Moves fname into a "lost" subdirectory next to it instead of deleting,
+  // so no bytes are destroyed during repair.  Errors are logged, not fatal.
+  void ArchiveFile(const std::string& fname) {
+    // Move into another directory.  E.g., for
+    //    dir/foo
+    // rename to
+    //    dir/lost/foo
+    const char* slash = strrchr(fname.c_str(), '/');
+    std::string new_dir;
+    if (slash != nullptr) {
+      new_dir.assign(fname.data(), slash - fname.data());
+    }
+    new_dir.append("/lost");
+    env_->CreateDir(new_dir);  // Ignore error
+    std::string new_file = new_dir;
+    new_file.append("/");
+    new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
+    Status s = env_->RenameFile(fname, new_file);
+    Log(options_.info_log, "Archiving %s: %s\n",
+        fname.c_str(), s.ToString().c_str());
+  }
+};
+}  // namespace
+
+// Public entry point: best-effort repair of the DB at dbname.  Some data
+// may be lost, and every recovered table is placed at level 0.
+Status RepairDB(const std::string& dbname, const Options& options) {
+  Repairer repairer(dbname, options);
+  return repairer.Run();
+}
+
+}  // namespace rocksdb
diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc
new file mode 100644 (file)
index 0000000..555d318
--- /dev/null
@@ -0,0 +1,793 @@
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <algorithm>
+#include <set>
+
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "db/db_statistics.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "utilities/merge_operators.h"
+
+using std::unique_ptr;
+
+namespace rocksdb {
+
+// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
+// as production quality.
+// SimpleTable requires the input key size to be fixed 16 bytes, value cannot
+// be longer than 150000 bytes and stored data on disk in this format:
+// +--------------------------------------------+  <= key1 offset
+// | key1            | value_size (4 bytes) |   |
+// +----------------------------------------+   |
+// | value1                                     |
+// |                                            |
+// +----------------------------------------+---+  <= key2 offset
+// | key2            | value_size (4 bytes) |   |
+// +----------------------------------------+   |
+// | value2                                     |
+// |                                            |
+// |        ......                              |
+// +-----------------+--------------------------+   <= index_block_offset
+// | key1            | key1 offset (8 bytes)    |
+// +-----------------+--------------------------+
+// | key2            | key2 offset (8 bytes)    |
+// +-----------------+--------------------------+
+// | key3            | key3 offset (8 bytes)    |
+// +-----------------+--------------------------+
+// |        ......                              |
+// +-----------------+------------+-------------+
+// | index_block_offset (8 bytes) |
+// +------------------------------+
+
+// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
+// as production quality.
+class SimpleTableReader: public TableReader {
+public:
+  // Attempt to open the table that is stored in bytes [0..file_size)
+  // of "file", and read the metadata entries necessary to allow
+  // retrieving data from the table.
+  //
+  // If successful, returns ok and sets "*table" to the newly opened
+  // table.  The client should delete "*table" when no longer needed.
+  // If there was an error while initializing the table, sets "*table"
+  // to nullptr and returns a non-ok status.  Does not take ownership of
+  // "*source", but the client must ensure that "source" remains live
+  // for the duration of the returned table's lifetime.
+  //
+  // *file must remain live while this Table is in use.
+  static Status Open(const Options& options, const EnvOptions& soptions,
+                     unique_ptr<RandomAccessFile> && file, uint64_t file_size,
+                     unique_ptr<TableReader>* table_reader);
+
+  // Always returns true: SimpleTable has no prefix filter.
+  bool PrefixMayMatch(const Slice& internal_prefix) override;
+
+  Iterator* NewIterator(const ReadOptions&) override;
+
+  Status Get(
+      const ReadOptions&, const Slice& key, void* arg,
+      bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool),
+      void (*mark_key_may_exist)(void*) = nullptr) override;
+
+  // Stub: always returns 0 (no real offset estimation).
+  uint64_t ApproximateOffsetOf(const Slice& key) override;
+
+  // Stub: SimpleTable has no block cache, so this always returns false.
+  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override;
+
+  void SetupForCompaction() override;
+
+  TableProperties& GetTableProperties() override;
+
+  ~SimpleTableReader();
+
+private:
+  struct Rep;
+  Rep* rep_;  // owned; deleted in the destructor
+
+  explicit SimpleTableReader(Rep* rep) {
+    rep_ = rep;
+  }
+  friend class TableCache;
+  friend class SimpleTableIterator;
+
+  // Binary-searches the index for "target"; see definition for semantics.
+  Status GetOffset(const Slice& target, uint64_t* offset);
+
+  // No copying allowed
+  explicit SimpleTableReader(const TableReader&) = delete;
+  void operator=(const TableReader&) = delete;
+};
+
+// Iterator to iterate SimpleTable
+// Iterator to iterate SimpleTable
+class SimpleTableIterator: public Iterator {
+public:
+  explicit SimpleTableIterator(SimpleTableReader* table);
+  ~SimpleTableIterator();
+
+  bool Valid() const;
+
+  void SeekToFirst();
+
+  // Not supported; asserts.  SimpleTable only iterates forward.
+  void SeekToLast();
+
+  void Seek(const Slice& target);
+
+  void Next();
+
+  // Not supported; asserts.  SimpleTable only iterates forward.
+  void Prev();
+
+  Slice key() const;
+
+  Slice value() const;
+
+  Status status() const;
+
+private:
+  SimpleTableReader* table_;    // not owned
+  uint64_t offset_;             // file offset of the current entry
+  uint64_t next_offset_;        // file offset of the entry after current
+  Slice key_;
+  Slice value_;
+  char tmp_str_[4];             // scratch for the 4-byte value-size field
+  char* key_str_;               // owned buffer backing key_
+  char* value_str_;             // owned buffer backing value_; only valid
+                                // once value_str_len_ >= 0
+  int value_str_len_;           // capacity of value_str_, -1 = unallocated
+  Status status_;
+  // No copying allowed
+  SimpleTableIterator(const SimpleTableIterator&) = delete;
+  void operator=(const Iterator&) = delete;
+};
+
+// Internal state of a SimpleTableReader (pimpl).
+struct SimpleTableReader::Rep {
+  ~Rep() {
+  }
+  Rep(const EnvOptions& storage_options, uint64_t index_start_offset,
+      int num_entries) :
+      soptions(storage_options), index_start_offset(index_start_offset),
+      num_entries(num_entries) {
+  }
+
+  Options options;
+  const EnvOptions& soptions;
+  Status status;
+  unique_ptr<RandomAccessFile> file;
+  uint64_t index_start_offset;  // where the index block begins in the file
+  int num_entries;              // number of index entries (= keys in table)
+  TableProperties table_properties;
+
+  // Fixed on-disk layout constants (see the format diagram above).
+  const static int user_key_size = 16;
+  const static int offset_length = 8;
+  const static int key_footer_len = 8;  // internal-key footer (seq + type)
+
+  static int GetInternalKeyLength() {
+    return user_key_size + key_footer_len;
+  }
+};
+
+SimpleTableReader::~SimpleTableReader() {
+  delete rep_;
+}
+
+// Reads the trailing 8-byte footer (the index block offset) and derives the
+// entry count from the index block size, then constructs the reader.
+Status SimpleTableReader::Open(const Options& options,
+                               const EnvOptions& soptions,
+                               unique_ptr<RandomAccessFile> && file,
+                               uint64_t size,
+                               unique_ptr<TableReader>* table_reader) {
+  char footer_space[Rep::offset_length];
+  Slice footer_input;
+  Status s = file->Read(size - Rep::offset_length, Rep::offset_length,
+                        &footer_input, footer_space);
+  if (s.ok()) {
+    // NOTE(review): decodes from footer_space rather than
+    // footer_input.data(); this assumes Read() always fills the scratch
+    // buffer.  A RandomAccessFile that returns a slice into its own storage
+    // (e.g. mmap) would leave footer_space uninitialized -- confirm.
+    uint64_t index_start_offset = DecodeFixed64(footer_space);
+
+    int num_entries = (size - Rep::offset_length - index_start_offset)
+        / (Rep::GetInternalKeyLength() + Rep::offset_length);
+    SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions,
+                                                             index_start_offset,
+                                                             num_entries);
+
+    rep->file = std::move(file);
+    rep->options = options;
+    table_reader->reset(new SimpleTableReader(rep));
+  }
+  return s;
+}
+
+// No-op: SimpleTable needs no special setup for compaction reads.
+void SimpleTableReader::SetupForCompaction() {
+}
+
+// Returns the (never-populated) properties object stored in Rep.
+TableProperties& SimpleTableReader::GetTableProperties() {
+  return rep_->table_properties;
+}
+
+// No prefix filtering: every prefix may match.
+bool SimpleTableReader::PrefixMayMatch(const Slice& internal_prefix) {
+  return true;
+}
+
+// Caller owns the returned iterator.
+Iterator* SimpleTableReader::NewIterator(const ReadOptions& options) {
+  return new SimpleTableIterator(this);
+}
+
+// Binary-searches the index block for the first key >= target and stores
+// that entry's data offset in *offset.  If every key is < target, *offset
+// is set to index_start_offset (one past the last record), which the
+// iterator treats as "invalid".
+Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
+  uint32_t left = 0;
+  // NOTE(review): if num_entries == 0 this underflows to UINT32_MAX and the
+  // loop below reads far past the index -- presumably tables are never
+  // empty in these tests; confirm.
+  uint32_t right = rep_->num_entries - 1;
+  char key_chars[Rep::GetInternalKeyLength()];
+  Slice tmp_slice;
+
+  uint32_t target_offset = 0;
+  while (left <= right) {
+    // Bias the midpoint upward so that `left = mid` below always makes
+    // progress when left + 1 == right.
+    uint32_t mid = (left + right + 1) / 2;
+
+    uint64_t offset_to_read = rep_->index_start_offset
+        + (Rep::GetInternalKeyLength() + Rep::offset_length) * mid;
+    Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(),
+                                &tmp_slice, key_chars);
+    if (!s.ok()) {
+      return s;
+    }
+
+    int compare_result = rep_->options.comparator->Compare(tmp_slice, target);
+
+    if (compare_result < 0) {
+      if (left == right) {
+        // Entire range is < target; answer is one past the range.
+        target_offset = right + 1;
+        break;
+      }
+      left = mid;
+    } else {
+      if (left == right) {
+        target_offset = left;
+        break;
+      }
+      right = mid - 1;
+    }
+  }
+
+  if (target_offset >= (uint32_t) rep_->num_entries) {
+    // No key >= target: point past the data section (iterator invalid).
+    *offset = rep_->index_start_offset;
+    return Status::OK();
+  }
+
+  char value_offset_chars[Rep::offset_length];
+
+  // Read the 8-byte data offset that follows the key in the index entry.
+  int64_t offset_for_value_offset = rep_->index_start_offset
+      + (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset
+      + Rep::GetInternalKeyLength();
+  Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length,
+                              &tmp_slice, value_offset_chars);
+  if (s.ok()) {
+    *offset = DecodeFixed64(value_offset_chars);
+  }
+  return s;
+}
+
+// Point-lookup implemented as a seek-and-scan: starts at the first key
+// >= k and feeds entries to the saver callback until it returns false
+// (or the table ends).  mark_key_may_exist is unused by this format.
+Status SimpleTableReader::Get(
+    const ReadOptions& options, const Slice& k, void* arg,
+    bool (*saver)(void*, const Slice&, const Slice&, bool),
+    void (*mark_key_may_exist)(void*)) {
+  Status s;
+  SimpleTableIterator* iter = new SimpleTableIterator(this);
+  for (iter->Seek(k); iter->Valid(); iter->Next()) {
+    if (!(*saver)(arg, iter->key(), iter->value(), true)) {
+      break;
+    }
+  }
+  s = iter->status();
+  delete iter;
+  return s;
+}
+
+// Stub: SimpleTable has no cache, so nothing is ever "in cache".
+bool SimpleTableReader::TEST_KeyInCache(const ReadOptions& options,
+                                        const Slice& key) {
+  return false;
+}
+
+// Stub: offset estimation is not implemented for this test format.
+uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) {
+  return 0;
+}
+
+// Allocates the fixed-size key buffer and positions at the first entry.
+SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) :
+    table_(table) {
+  key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()];
+  // value_str_ stays unallocated (and uninitialized) until the first value
+  // is read; value_str_len_ == -1 marks that state.
+  value_str_len_ = -1;
+  SeekToFirst();
+}
+
+SimpleTableIterator::~SimpleTableIterator() {
+ delete[] key_str_;
+ // Only delete value_str_ if it was ever allocated (see ctor).
+ if (value_str_len_ >= 0) {
+   delete[] value_str_;
+ }
+}
+
+// Valid while the current entry lies inside the data section (the index
+// block starts where the data ends).
+bool SimpleTableIterator::Valid() const {
+  return offset_ < table_->rep_->index_start_offset;
+}
+
+// Position at offset 0 (the first record); Next() does the actual load.
+void SimpleTableIterator::SeekToFirst() {
+  next_offset_ = 0;
+  Next();
+}
+
+// Backward positioning is not supported by this format.
+void SimpleTableIterator::SeekToLast() {
+  assert(false);
+}
+
+// Positions at the first entry with key >= target via the index; a lookup
+// failure is latched into status_ and Next() still advances.
+void SimpleTableIterator::Seek(const Slice& target) {
+  Status s = table_->GetOffset(target, &next_offset_);
+  if (!s.ok()) {
+    status_ = s;
+  }
+  Next();
+}
+
+// Loads the record at next_offset_: fixed-size internal key, 4-byte value
+// size, then the value bytes (layout per the format diagram above).
+// NOTE(review): the Read() statuses below are ignored rather than latched
+// into status_ -- a read error silently yields garbage; confirm intended.
+void SimpleTableIterator::Next() {
+  offset_ = next_offset_;
+  if (offset_ >= table_->rep_->index_start_offset) {
+    // Reached the index block: iterator becomes invalid.
+    return;
+  }
+  Slice result;
+  int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength();
+
+  Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result,
+                                      key_str_);
+  next_offset_ += internal_key_size;
+  key_ = result;
+
+  Slice value_size_slice;
+  s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_);
+  next_offset_ += 4;
+  uint32_t value_size = DecodeFixed32(tmp_str_);
+
+  Slice value_slice;
+  // Grow the value buffer only when the new value is larger than any seen
+  // so far (value_str_len_ == -1 means not yet allocated).
+  if ((int) value_size > value_str_len_) {
+    if (value_str_len_ >= 0) {
+      delete[] value_str_;
+    }
+    value_str_ = new char[value_size];
+    value_str_len_ = value_size;
+  }
+  s = table_->rep_->file->Read(next_offset_, value_size, &value_slice,
+                               value_str_);
+  next_offset_ += value_size;
+  value_ = value_slice;
+}
+
+// Backward iteration is not supported by this format.
+void SimpleTableIterator::Prev() {
+  assert(false);
+}
+
+Slice SimpleTableIterator::key() const {
+  // NOTE(review): leftover debug logging -- this writes "key!!!!" to the
+  // info log on every key() call; presumably should be removed.
+  Log(table_->rep_->options.info_log, "key!!!!");
+  return key_;
+}
+
+Slice SimpleTableIterator::value() const {
+  return value_;
+}
+
+// Returns the first error latched by Seek(); reads do not update it.
+Status SimpleTableIterator::status() const {
+  return status_;
+}
+
+// Builder for the SimpleTable on-disk format (see diagram above): appends
+// key/value records as they arrive and accumulates the index in memory,
+// writing it out (plus the footer) in Finish().
+class SimpleTableBuilder: public TableBuilder {
+public:
+  // Create a builder that will store the contents of the table it is
+  // building in *file.  Does not close the file.  It is up to the
+  // caller to close the file after calling Finish(). The output file
+  // will be part of level specified by 'level'.  A value of -1 means
+  // that the caller does not know which level the output file will reside.
+  SimpleTableBuilder(const Options& options, WritableFile* file,
+                     CompressionType compression_type);
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~SimpleTableBuilder();
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override;
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override;
+
+  // Finish building the table.  Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish() override;
+
+  // Indicate that the contents of this builder should be abandoned.  Stops
+  // using the file passed to the constructor after this function returns.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Abandon() override;
+
+  // Number of calls to Add() so far.
+  uint64_t NumEntries() const override;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  uint64_t FileSize() const override;
+
+private:
+  struct Rep;
+  Rep* rep_;  // owned; deleted in the destructor
+
+  // No copying allowed
+  SimpleTableBuilder(const SimpleTableBuilder&) = delete;
+  void operator=(const SimpleTableBuilder&) = delete;
+};
+
+// Internal state of a SimpleTableBuilder (pimpl).
+struct SimpleTableBuilder::Rep {
+  Options options;
+  WritableFile* file;   // not owned; caller closes it after Finish()
+  uint64_t offset = 0;  // bytes written so far (== next record's offset)
+  Status status;
+
+  uint64_t num_entries = 0;
+
+  bool closed = false;  // Either Finish() or Abandon() has been called.
+
+  // Fixed on-disk layout constants; must match SimpleTableReader::Rep.
+  const static int user_key_size = 16;
+  const static int offset_length = 8;
+  const static int key_footer_len = 8;
+
+  static int GetInternalKeyLength() {
+    return user_key_size + key_footer_len;
+  }
+
+  // In-memory index (key + fixed64 offset per entry), flushed by Finish().
+  std::string index;
+
+  Rep(const Options& opt, WritableFile* f) :
+      options(opt), file(f) {
+  }
+  ~Rep() {
+  }
+};
+
+// Note: compression_type is accepted for interface compatibility but this
+// format never compresses, so it is ignored.
+SimpleTableBuilder::SimpleTableBuilder(const Options& options,
+                                       WritableFile* file,
+                                       CompressionType compression_type) :
+    rep_(new SimpleTableBuilder::Rep(options, file)) {
+}
+
+SimpleTableBuilder::~SimpleTableBuilder() {
+  delete (rep_);
+}
+
+// Appends one record (key, fixed32 value-size, value) to the file and
+// records (key, fixed64 record-offset) in the in-memory index.
+// NOTE(review): the Append() statuses are ignored; a write failure is
+// silently dropped -- acceptable for this unit-test-only format.
+void SimpleTableBuilder::Add(const Slice& key, const Slice& value) {
+  // The format only supports fixed-length internal keys.
+  assert((int ) key.size() == Rep::GetInternalKeyLength());
+
+  // Update index
+  rep_->index.append(key.data(), key.size());
+  PutFixed64(&(rep_->index), rep_->offset);
+
+  // Write key-value pair
+  rep_->file->Append(key);
+  rep_->offset += Rep::GetInternalKeyLength();
+
+  std::string size;
+  int value_size = value.size();
+  PutFixed32(&size, value_size);
+  Slice sizeSlice(size);
+  rep_->file->Append(sizeSlice);
+  rep_->file->Append(value);
+  rep_->offset += value_size + 4;  // 4 = fixed32 value-size field
+
+  rep_->num_entries++;
+}
+
+// NOTE(review): always reports OK rather than returning rep_->status (which
+// is never set anyway, since Append() results are ignored in Add()).
+Status SimpleTableBuilder::status() const {
+  return Status::OK();
+}
+
+// Writes the accumulated index block followed by the fixed64 footer that
+// records where the index begins, completing the on-disk layout.
+Status SimpleTableBuilder::Finish() {
+  Rep* r = rep_;
+  assert(!r->closed);
+  r->closed = true;
+
+  uint64_t index_offset = rep_->offset;
+  Slice index_slice(rep_->index);
+  rep_->file->Append(index_slice);
+  rep_->offset += index_slice.size();
+
+  std::string index_offset_str;
+  PutFixed64(&index_offset_str, index_offset);
+  Slice foot_slice(index_offset_str);
+  rep_->file->Append(foot_slice);
+  rep_->offset += foot_slice.size();
+
+  return Status::OK();
+}
+
+// Marks the builder closed; already-written bytes are left in the file.
+void SimpleTableBuilder::Abandon() {
+  rep_->closed = true;
+}
+
+uint64_t SimpleTableBuilder::NumEntries() const {
+  return rep_->num_entries;
+}
+
+uint64_t SimpleTableBuilder::FileSize() const {
+  return rep_->offset;
+}
+
+// TableFactory that plugs the SimpleTable reader/builder pair into the DB,
+// so the test below can exercise the table-factory extension point.
+class SimpleTableFactory: public TableFactory {
+public:
+  ~SimpleTableFactory() {
+  }
+  SimpleTableFactory() {
+  }
+  const char* Name() const override {
+    return "SimpleTable";
+  }
+  Status GetTableReader(const Options& options, const EnvOptions& soptions,
+                        unique_ptr<RandomAccessFile> && file,
+                        uint64_t file_size,
+                        unique_ptr<TableReader>* table_reader) const;
+
+  TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
+                                CompressionType compression_type) const;
+};
+
+// Delegates directly to SimpleTableReader::Open.
+Status SimpleTableFactory::GetTableReader(
+    const Options& options, const EnvOptions& soptions,
+    unique_ptr<RandomAccessFile> && file, uint64_t file_size,
+    unique_ptr<TableReader>* table_reader) const {
+
+  return SimpleTableReader::Open(options, soptions, std::move(file), file_size,
+                                 table_reader);
+}
+
+// Caller owns the returned builder.
+TableBuilder* SimpleTableFactory::GetTableBuilder(
+    const Options& options, WritableFile* file,
+    CompressionType compression_type) const {
+  return new SimpleTableBuilder(options, file, compression_type);
+}
+
+// Test fixture: opens a scratch DB under TmpDir() configured (via
+// CurrentOptions) to use SimpleTableFactory, and provides the usual
+// Put/Get/Delete/reopen helpers used by the TEST cases below.
+class SimpleTableDBTest {
+protected:
+public:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+
+  Options last_options_;  // options used by the most recent (re)open
+
+  SimpleTableDBTest() :
+      env_(Env::Default()) {
+    dbname_ = test::TmpDir() + "/simple_table_db_test";
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+    Reopen();
+  }
+
+  ~SimpleTableDBTest() {
+    delete db_;
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+  }
+
+  // Return the current option configuration.
+  Options CurrentOptions() {
+    Options options;
+    options.table_factory.reset(new SimpleTableFactory());
+    return options;
+  }
+
+  // Downcast to the implementation type; relies on db_ being a DBImpl.
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  void Reopen(Options* options = nullptr) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Close() {
+    delete db_;
+    db_ = nullptr;
+  }
+
+  void DestroyAndReopen(Options* options = nullptr) {
+    //Destroy using last options
+    Destroy(&last_options_);
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Destroy(Options* options) {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, *options));
+  }
+
+  // Opens into *db without touching db_/last_options_ (for ad-hoc opens).
+  Status PureReopen(Options* options, DB** db) {
+    return DB::Open(*options, dbname_, db);
+  }
+
+  // Closes db_ and reopens it; defaults to CurrentOptions() with
+  // create_if_missing when no options are supplied.
+  Status TryReopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opts;
+    if (options != nullptr) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    last_options_ = opts;
+
+    return DB::Open(opts, dbname_, &db_);
+  }
+
+  Status Put(const Slice& k, const Slice& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  Status Delete(const std::string& k) {
+    return db_->Delete(WriteOptions(), k);
+  }
+
+  // Returns the value for k, "NOT_FOUND", or the error status string.
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+
+  // Queries the "rocksdb.num-files-at-level<N>" DB property.
+  int NumTableFilesAtLevel(int level) {
+    std::string property;
+    ASSERT_TRUE(
+        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
+                         &property));
+    return atoi(property.c_str());
+  }
+
+  // Return spread of files per level
+  std::string FilesPerLevel() {
+    std::string result;
+    int last_non_zero_offset = 0;
+    for (int level = 0; level < db_->NumberLevels(); level++) {
+      int f = NumTableFilesAtLevel(level);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    // Trim trailing levels that contain no files.
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+
+  // Renders an iterator position as "key->value" or "(invalid)".
+  std::string IterStatus(Iterator* iter) {
+    std::string result;
+    if (iter->Valid()) {
+      result = iter->key().ToString() + "->" + iter->value().ToString();
+    } else {
+      result = "(invalid)";
+    }
+    return result;
+  }
+};
+
// A freshly opened DB is non-null and contains no keys.
TEST(SimpleTableDBTest, Empty) {
  ASSERT_TRUE(db_ != nullptr);
  ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
}
+
// Writes and overwrites are visible immediately (served from the memtable).
TEST(SimpleTableDBTest, ReadWrite) {
  ASSERT_OK(Put("0000000000000foo", "v1"));
  ASSERT_EQ("v1", Get("0000000000000foo"));
  ASSERT_OK(Put("0000000000000bar", "v2"));
  ASSERT_OK(Put("0000000000000foo", "v3"));
  ASSERT_EQ("v3", Get("0000000000000foo"));
  ASSERT_EQ("v2", Get("0000000000000bar"));
}
+
// Values must survive an explicit memtable flush into a SimpleTable file.
TEST(SimpleTableDBTest, Flush) {
  ASSERT_OK(Put("0000000000000foo", "v1"));
  ASSERT_OK(Put("0000000000000bar", "v2"));
  ASSERT_OK(Put("0000000000000foo", "v3"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v3", Get("0000000000000foo"));
  ASSERT_EQ("v2", Get("0000000000000bar"));
}
+
// Repeated flushes: entries in newer files must shadow older files,
// including deletion tombstones.
TEST(SimpleTableDBTest, Flush2) {
  ASSERT_OK(Put("0000000000000bar", "b"));
  ASSERT_OK(Put("0000000000000foo", "v1"));
  dbfull()->TEST_FlushMemTable();

  ASSERT_OK(Put("0000000000000foo", "v2"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v2", Get("0000000000000foo"));

  ASSERT_OK(Put("0000000000000eee", "v3"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v3", Get("0000000000000eee"));

  ASSERT_OK(Delete("0000000000000bar"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));

  ASSERT_OK(Put("0000000000000eee", "v5"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v5", Get("0000000000000eee"));
}
+
// Build a fixed-width test key: "key_______" followed by the index
// zero-padded to six digits.
static std::string Key(int i) {
  char scratch[100];
  snprintf(scratch, sizeof(scratch), "key_______%06d", i);
  return std::string(scratch);
}
+
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
// Filling level-0 up to level0_file_num_compaction_trigger files must kick
// off a compaction that moves everything into a single level-1 file.
TEST(SimpleTableDBTest, CompactionTrigger) {
  Options options = CurrentOptions();
  options.write_buffer_size = 100 << 10; // 100KB
  options.num_levels = 3;
  options.max_mem_compaction_level = 0;
  options.level0_file_num_compaction_trigger = 3;
  Reopen(&options);

  Random rnd(301);

  // Produce trigger-1 level-0 files; no compaction should start yet.
  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
      num++) {
    std::vector<std::string> values;
    // Write 120KB (12 values, each 10K)
    for (int i = 0; i < 12; i++) {
      values.push_back(RandomString(&rnd, 10000));
      ASSERT_OK(Put(Key(i), values[i]));
    }
    dbfull()->TEST_WaitForFlushMemTable();
    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
  }

  // generate one more file in level-0, and should trigger level-0 compaction
  std::vector<std::string> values;
  for (int i = 0; i < 12; i++) {
    values.push_back(RandomString(&rnd, 10000));
    ASSERT_OK(Put(Key(i), values[i]));
  }
  dbfull()->TEST_WaitForCompact();

  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
}
+
+}  // namespace rocksdb
+
// Test entry point: run every TEST registered in this binary.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
diff --git a/db/skiplist.h b/db/skiplist.h
new file mode 100644 (file)
index 0000000..06a35d9
--- /dev/null
@@ -0,0 +1,416 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread safety
+// -------------
+//
+// Writes require external synchronization, most likely a mutex.
+// Reads require a guarantee that the SkipList will not be destroyed
+// while the read is in progress.  Apart from that, reads progress
+// without any internal locking or synchronization.
+//
+// Invariants:
+//
+// (1) Allocated nodes are never deleted until the SkipList is
+// destroyed.  This is trivially guaranteed by the code since we
+// never delete any skip list nodes.
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the SkipList.
+// Only Insert() modifies the list, and it is careful to initialize
+// a node and use release-stores to publish the nodes in one or
+// more lists.
+//
+// ... prev vs. next pointer ordering ...
+//
+
+#pragma once
+#include <assert.h>
+#include <stdlib.h>
+#include "port/port.h"
+#include "util/random.h"
+
+namespace rocksdb {
+
template<typename Key, class Comparator>
class SkipList {
 private:
  struct Node;

 public:
  // Create a new SkipList object that will use "cmp" for comparing keys,
  // and will allocate memory using "*arena".  Objects allocated in the arena
  // must remain allocated for the lifetime of the skiplist object.
  explicit SkipList(Comparator cmp, Arena* arena);

  // Insert key into the list.
  // REQUIRES: nothing that compares equal to key is currently in the list.
  void Insert(const Key& key);

  // Returns true iff an entry that compares equal to key is in the list.
  bool Contains(const Key& key) const;

  // Iteration over the contents of a skip list
  class Iterator {
   public:
    // Initialize an iterator over the specified list.
    // The returned iterator is not valid.
    explicit Iterator(const SkipList* list);

    // Re-target this iterator at a different skiplist.  Lets callers
    // reuse an iterator object instead of destroying one and
    // allocating another.
    void SetList(const SkipList* list);

    // Returns true iff the iterator is positioned at a valid node.
    bool Valid() const;

    // Returns the key at the current position.
    // REQUIRES: Valid()
    const Key& key() const;

    // Advances to the next position.
    // REQUIRES: Valid()
    void Next();

    // Advances to the previous position.
    // REQUIRES: Valid()
    void Prev();

    // Advance to the first entry with a key >= target
    void Seek(const Key& target);

    // Position at the first entry in list.
    // Final state of iterator is Valid() iff list is not empty.
    void SeekToFirst();

    // Position at the last entry in list.
    // Final state of iterator is Valid() iff list is not empty.
    void SeekToLast();

   private:
    const SkipList* list_;
    Node* node_;
    // Intentionally copyable
  };

 private:
  // Fixed upper bound on a node tower's height.
  enum { kMaxHeight = 12 };

  // Immutable after construction
  Comparator const compare_;
  Arena* const arena_;    // Arena used for allocations of nodes

  Node* const head_;      // dummy head node; its key is never compared

  // Modified only by Insert().  Read racily by readers, but stale
  // values are ok.
  port::AtomicPointer max_height_;   // Height of the entire list

  // Used for optimizing sequential insert patterns: per-level splice
  // points left behind by the previous Insert().
  Node* prev_[kMaxHeight];
  int   prev_height_;

  inline int GetMaxHeight() const {
    return static_cast<int>(
        reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()));
  }

  // Read/written only by Insert().
  Random rnd_;

  Node* NewNode(const Key& key, int height);
  int RandomHeight();
  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }

  // Return true if key is greater than the data stored in "n"
  bool KeyIsAfterNode(const Key& key, Node* n) const;

  // Return the earliest node that comes at or after key.
  // Return nullptr if there is no such node.
  //
  // If prev is non-nullptr, fills prev[level] with pointer to previous
  // node at "level" for every level in [0..max_height_-1].
  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;

  // Return the latest node with a key < key.
  // Return head_ if there is no such node.
  Node* FindLessThan(const Key& key) const;

  // Return the last node in the list.
  // Return head_ if list is empty.
  Node* FindLast() const;

  // No copying allowed
  SkipList(const SkipList&);
  void operator=(const SkipList&);
};
+
+// Implementation details follow
// A skiplist node: an immutable key plus a tower of forward links.
template<typename Key, class Comparator>
struct SkipList<Key,Comparator>::Node {
  explicit Node(const Key& k) : key(k) { }

  Key const key;

  // Accessors/mutators for links.  Wrapped in methods so we can
  // add the appropriate barriers as necessary.
  Node* Next(int n) {
    assert(n >= 0);
    // Use an 'acquire load' so that we observe a fully initialized
    // version of the returned Node.
    return reinterpret_cast<Node*>(next_[n].Acquire_Load());
  }
  void SetNext(int n, Node* x) {
    assert(n >= 0);
    // Use a 'release store' so that anybody who reads through this
    // pointer observes a fully initialized version of the inserted node.
    next_[n].Release_Store(x);
  }

  // No-barrier variants that can be safely used in a few locations.
  Node* NoBarrier_Next(int n) {
    assert(n >= 0);
    return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
  }
  void NoBarrier_SetNext(int n, Node* x) {
    assert(n >= 0);
    next_[n].NoBarrier_Store(x);
  }

 private:
  // Array of length equal to the node height.  next_[0] is lowest level link.
  // Declared with length 1; NewNode() over-allocates the trailing slots.
  port::AtomicPointer next_[1];
};
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::NewNode(const Key& key, int height) {
+  char* mem = arena_->AllocateAligned(
+      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
+  return new (mem) Node(key);
+}
+
// Construct an iterator attached to "list"; it starts out invalid
// (not positioned at any node).
template<typename Key, class Comparator>
inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) {
  SetList(list);
}
+
// Re-target this iterator at "list", resetting the position to invalid.
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::SetList(const SkipList* list) {
  list_ = list;
  node_ = nullptr;
}
+
// An iterator is valid iff it is positioned on a real node.
template<typename Key, class Comparator>
inline bool SkipList<Key,Comparator>::Iterator::Valid() const {
  return node_ != nullptr;
}
+
// Key at the current position.  REQUIRES: Valid().
template<typename Key, class Comparator>
inline const Key& SkipList<Key,Comparator>::Iterator::key() const {
  assert(Valid());
  return node_->key;
}
+
// Advance to the successor node via the level-0 link.  REQUIRES: Valid().
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Next() {
  assert(Valid());
  node_ = node_->Next(0);
}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Prev() {
+  // Instead of using explicit "prev" links, we just search for the
+  // last node that falls before key.
+  assert(Valid());
+  node_ = list_->FindLessThan(node_->key);
+  if (node_ == list_->head_) {
+    node_ = nullptr;
+  }
+}
+
// Position at the first node whose key is >= target; invalid if none.
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) {
  node_ = list_->FindGreaterOrEqual(target, nullptr);
}
+
// Position at the head's level-0 successor, i.e. the smallest key;
// invalid if the list is empty.
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() {
  node_ = list_->head_->Next(0);
}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
+  node_ = list_->FindLast();
+  if (node_ == list_->head_) {
+    node_ = nullptr;
+  }
+}
+
+template<typename Key, class Comparator>
+int SkipList<Key,Comparator>::RandomHeight() {
+  // Increase height with probability 1 in kBranching
+  static const unsigned int kBranching = 4;
+  int height = 1;
+  while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {
+    height++;
+  }
+  assert(height > 0);
+  assert(height <= kMaxHeight);
+  return height;
+}
+
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
+  // nullptr n is considered infinite
+  return (n != nullptr) && (compare_(n->key, key) < 0);
+}
+
// Return the earliest node with key >= "key" (nullptr if none).  When
// "prev" is non-null, fill it with the per-level predecessors that
// Insert() uses as splice points.
// NOTE(review): the fast path below trusts prev[0..prev_height_), which
// only Insert() maintains under external synchronization — confirm no
// unsynchronized caller ever passes a prev array.
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
    const {
  // Use prev as an optimization hint and fallback to slow path
  if (prev && !KeyIsAfterNode(key, prev[0]->Next(0))) {
    // Hint still works at level 0: key is no later than the hint's successor.
    Node* x = prev[0];
    Node* next = x->Next(0);
    if ((x == head_) || KeyIsAfterNode(key, x)) {
      // Adjust all relevant insertion points to the previous entry
      for (int i = 1; i < prev_height_; i++) {
        prev[i] = x;
      }
      return next;
    }
  }
  // Normal lookup
  Node* x = head_;
  int level = GetMaxHeight() - 1;
  while (true) {
    Node* next = x->Next(level);
    // Make sure the lists are sorted.
    // If x points to head_ or next points nullptr, it is trivially satisfied.
    assert((x == head_) || (next == nullptr) || KeyIsAfterNode(next->key, x));
    if (KeyIsAfterNode(key, next)) {
      // Keep searching in this list
      x = next;
    } else {
      if (prev != nullptr) prev[level] = x;
      if (level == 0) {
        return next;
      } else {
        // Switch to next list
        level--;
      }
    }
  }
}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    assert(x == head_ || compare_(x->key, key) < 0);
+    Node* next = x->Next(level);
+    if (next == nullptr || compare_(next->key, key) >= 0) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast()
+    const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    if (next == nullptr) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
// Note: members initialize in declaration order, so arena_ is assigned
// before head_(NewNode(...)) runs; the head node's key (0) is a
// placeholder that is never compared.
template<typename Key, class Comparator>
SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
    : compare_(cmp),
      arena_(arena),
      head_(NewNode(0 /* any key will do */, kMaxHeight)),
      max_height_(reinterpret_cast<void*>(1)),
      prev_height_(1),
      rnd_(0xdeadbeef) {
  for (int i = 0; i < kMaxHeight; i++) {
    head_->SetNext(i, nullptr);
    prev_[i] = head_;  // empty list: every insertion hint points at head_
  }
}
+
// Insert "key".  REQUIRES: external synchronization with other writers,
// and no entry equal to "key" already in the list.
template<typename Key, class Comparator>
void SkipList<Key,Comparator>::Insert(const Key& key) {
  // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
  // here since Insert() is externally synchronized.
  Node* x = FindGreaterOrEqual(key, prev_);  // also fills prev_[] splice points

  // Our data structure does not allow duplicate insertion
  assert(x == nullptr || !Equal(key, x->key));

  int height = RandomHeight();
  if (height > GetMaxHeight()) {
    for (int i = GetMaxHeight(); i < height; i++) {
      prev_[i] = head_;
    }
    //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);

    // It is ok to mutate max_height_ without any synchronization
    // with concurrent readers.  A concurrent reader that observes
    // the new value of max_height_ will see either the old value of
    // new level pointers from head_ (nullptr), or a new value set in
    // the loop below.  In the former case the reader will
    // immediately drop to the next level since nullptr sorts after all
    // keys.  In the latter case the reader will use the new node.
    max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
  }

  x = NewNode(key, height);
  for (int i = 0; i < height; i++) {
    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" in prev[i].
    x->NoBarrier_SetNext(i, prev_[i]->NoBarrier_Next(i));
    prev_[i]->SetNext(i, x);
  }
  // Remember the new node as the hint for a subsequent sequential insert.
  prev_[0] = x;
  prev_height_ = height;
}
+
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::Contains(const Key& key) const {
+  Node* x = FindGreaterOrEqual(key, nullptr);
+  if (x != nullptr && Equal(key, x->key)) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}  // namespace rocksdb
diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc
new file mode 100644 (file)
index 0000000..dcbaf0a
--- /dev/null
@@ -0,0 +1,383 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/skiplist.h"
+#include <set>
+#include "rocksdb/env.h"
+#include "util/arena_impl.h"
+#include "util/hash.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+typedef uint64_t Key;
+
+struct TestComparator {
+  int operator()(const Key& a, const Key& b) const {
+    if (a < b) {
+      return -1;
+    } else if (a > b) {
+      return +1;
+    } else {
+      return 0;
+    }
+  }
+};
+
// Empty fixture type required by the TEST macro harness.
class SkipTest { };
+
// Operations on an empty skiplist: no hits, every seek leaves the
// iterator invalid.
TEST(SkipTest, Empty) {
  ArenaImpl arena_impl;
  TestComparator cmp;
  SkipList<Key, TestComparator> list(cmp, &arena_impl);
  ASSERT_TRUE(!list.Contains(10));

  SkipList<Key, TestComparator>::Iterator iter(&list);
  ASSERT_TRUE(!iter.Valid());
  iter.SeekToFirst();
  ASSERT_TRUE(!iter.Valid());
  iter.Seek(100);
  ASSERT_TRUE(!iter.Valid());
  iter.SeekToLast();
  ASSERT_TRUE(!iter.Valid());
}
+
// Insert random distinct keys, then verify Contains() plus forward and
// backward iteration against a std::set model of the same keys.
TEST(SkipTest, InsertAndLookup) {
  const int N = 2000;
  const int R = 5000;
  Random rnd(1000);
  std::set<Key> keys;
  ArenaImpl arena_impl;
  TestComparator cmp;
  SkipList<Key, TestComparator> list(cmp, &arena_impl);
  for (int i = 0; i < N; i++) {
    Key key = rnd.Next() % R;
    if (keys.insert(key).second) {
      list.Insert(key);
    }
  }

  for (int i = 0; i < R; i++) {
    if (list.Contains(i)) {
      ASSERT_EQ(keys.count(i), 1U);
    } else {
      ASSERT_EQ(keys.count(i), 0U);
    }
  }

  // Simple iterator tests
  {
    SkipList<Key, TestComparator>::Iterator iter(&list);
    ASSERT_TRUE(!iter.Valid());

    iter.Seek(0);
    ASSERT_TRUE(iter.Valid());
    ASSERT_EQ(*(keys.begin()), iter.key());

    iter.SeekToFirst();
    ASSERT_TRUE(iter.Valid());
    ASSERT_EQ(*(keys.begin()), iter.key());

    iter.SeekToLast();
    ASSERT_TRUE(iter.Valid());
    ASSERT_EQ(*(keys.rbegin()), iter.key());
  }

  // Forward iteration test
  for (int i = 0; i < R; i++) {
    SkipList<Key, TestComparator>::Iterator iter(&list);
    iter.Seek(i);

    // Compare against model iterator
    std::set<Key>::iterator model_iter = keys.lower_bound(i);
    for (int j = 0; j < 3; j++) {
      if (model_iter == keys.end()) {
        ASSERT_TRUE(!iter.Valid());
        break;
      } else {
        ASSERT_TRUE(iter.Valid());
        ASSERT_EQ(*model_iter, iter.key());
        ++model_iter;
        iter.Next();
      }
    }
  }

  // Backward iteration test
  {
    SkipList<Key, TestComparator>::Iterator iter(&list);
    iter.SeekToLast();

    // Compare against model iterator
    for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
         model_iter != keys.rend();
         ++model_iter) {
      ASSERT_TRUE(iter.Valid());
      ASSERT_EQ(*model_iter, iter.key());
      iter.Prev();
    }
    ASSERT_TRUE(!iter.Valid());
  }
}
+
+// We want to make sure that with a single writer and multiple
+// concurrent readers (with no synchronization other than when a
+// reader's iterator is created), the reader always observes all the
+// data that was present in the skip list when the iterator was
+// constructor.  Because insertions are happening concurrently, we may
+// also observe new values that were inserted since the iterator was
+// constructed, but we should never miss any values that were present
+// at iterator construction time.
+//
+// We generate multi-part keys:
+//     <key,gen,hash>
+// where:
+//     key is in range [0..K-1]
+//     gen is a generation number for key
+//     hash is hash(key,gen)
+//
+// The insertion code picks a random key, sets gen to be 1 + the last
+// generation number inserted for that key, and sets hash to Hash(key,gen).
+//
+// At the beginning of a read, we snapshot the last inserted
+// generation number for each key.  We then iterate, including random
+// calls to Next() and Seek().  For every key we encounter, we
+// check that it is either expected given the initial snapshot or has
+// been concurrently added since the iterator started.
class ConcurrentTest {
 private:
  static const uint32_t K = 4;

  // Field extractors for the packed <key,gen,hash> 64-bit value.
  static uint64_t key(Key key) { return (key >> 40); }
  static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
  static uint64_t hash(Key key) { return key & 0xff; }

  static uint64_t HashNumbers(uint64_t k, uint64_t g) {
    uint64_t data[2] = { k, g };
    return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
  }

  // Pack key and generation plus an 8-bit integrity hash into one Key.
  static Key MakeKey(uint64_t k, uint64_t g) {
    assert(sizeof(Key) == sizeof(uint64_t));
    assert(k <= K);  // We sometimes pass K to seek to the end of the skiplist
    assert(g <= 0xffffffffu);
    return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
  }

  // Re-check the embedded hash; detects torn or corrupt reads.
  static bool IsValidKey(Key k) {
    return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
  }

  static Key RandomTarget(Random* rnd) {
    switch (rnd->Next() % 10) {
      case 0:
        // Seek to beginning
        return MakeKey(0, 0);
      case 1:
        // Seek to end
        return MakeKey(K, 0);
      default:
        // Seek to middle
        return MakeKey(rnd->Next() % K, 0);
    }
  }

  // Per-key generation
  struct State {
    port::AtomicPointer generation[K];
    void Set(int k, intptr_t v) {
      generation[k].Release_Store(reinterpret_cast<void*>(v));
    }
    intptr_t Get(int k) {
      return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
    }

    State() {
      for (unsigned int k = 0; k < K; k++) {
        Set(k, 0);
      }
    }
  };

  // Current state of the test
  State current_;

  ArenaImpl arena_impl_;

  // SkipList is not protected by mu_.  We just use a single writer
  // thread to modify it.
  SkipList<Key, TestComparator> list_;

 public:
  ConcurrentTest() : list_(TestComparator(), &arena_impl_) { }

  // REQUIRES: External synchronization
  void WriteStep(Random* rnd) {
    const uint32_t k = rnd->Next() % K;
    const intptr_t g = current_.Get(k) + 1;
    const Key key = MakeKey(k, g);
    list_.Insert(key);
    current_.Set(k, g);
  }

  // One reader pass: snapshot the committed generations, then walk the
  // list checking that every key present at snapshot time is observed.
  void ReadStep(Random* rnd) {
    // Remember the initial committed state of the skiplist.
    State initial_state;
    for (unsigned int k = 0; k < K; k++) {
      initial_state.Set(k, current_.Get(k));
    }

    Key pos = RandomTarget(rnd);
    SkipList<Key, TestComparator>::Iterator iter(&list_);
    iter.Seek(pos);
    while (true) {
      Key current;
      if (!iter.Valid()) {
        current = MakeKey(K, 0);
      } else {
        current = iter.key();
        ASSERT_TRUE(IsValidKey(current)) << current;
      }
      ASSERT_LE(pos, current) << "should not go backwards";

      // Verify that everything in [pos,current) was not present in
      // initial_state.
      while (pos < current) {
        ASSERT_LT(key(pos), K) << pos;

        // Note that generation 0 is never inserted, so it is ok if
        // <*,0,*> is missing.
        ASSERT_TRUE((gen(pos) == 0U) ||
                    (gen(pos) > (uint64_t)initial_state.Get(key(pos)))
                    ) << "key: " << key(pos)
                      << "; gen: " << gen(pos)
                      << "; initgen: "
                      << initial_state.Get(key(pos));

        // Advance to next key in the valid key space
        if (key(pos) < key(current)) {
          pos = MakeKey(key(pos) + 1, 0);
        } else {
          pos = MakeKey(key(pos), gen(pos) + 1);
        }
      }

      if (!iter.Valid()) {
        break;
      }

      if (rnd->Next() % 2) {
        iter.Next();
        pos = MakeKey(key(pos), gen(pos) + 1);
      } else {
        Key new_target = RandomTarget(rnd);
        if (new_target > pos) {
          pos = new_target;
          iter.Seek(new_target);
        }
      }
    }
  }
};
// Out-of-class definition so the ODR-used static constant has storage
// (required before C++17 inline variables).
const uint32_t ConcurrentTest::K;
+
// Simple test that does single-threaded testing of the ConcurrentTest
// scaffolding (interleaved reads and writes, no actual concurrency).
TEST(SkipTest, ConcurrentWithoutThreads) {
  ConcurrentTest test;
  Random rnd(test::RandomSeed());
  for (int i = 0; i < 10000; i++) {
    test.ReadStep(&rnd);
    test.WriteStep(&rnd);
  }
}
+
// Shared state between the writer (main thread) and one reader thread:
// the test scaffold, a stop flag, and a tiny mutex/condvar state machine
// used to hand-shake reader lifecycle transitions.
class TestState {
 public:
  ConcurrentTest t_;
  int seed_;
  port::AtomicPointer quit_flag_;  // set non-null by the writer to stop reader

  enum ReaderState {
    STARTING,
    RUNNING,
    DONE
  };

  explicit TestState(int s)
      : seed_(s),
        quit_flag_(nullptr),
        state_(STARTING),
        state_cv_(&mu_) {}

  // Block until the reader reaches state "s".
  void Wait(ReaderState s) {
    mu_.Lock();
    while (state_ != s) {
      state_cv_.Wait();
    }
    mu_.Unlock();
  }

  // Publish a new reader state and wake the waiter.
  void Change(ReaderState s) {
    mu_.Lock();
    state_ = s;
    state_cv_.Signal();
    mu_.Unlock();
  }

 private:
  port::Mutex mu_;
  ReaderState state_;   // guarded by mu_
  port::CondVar state_cv_;
};
+
+static void ConcurrentReader(void* arg) {
+  TestState* state = reinterpret_cast<TestState*>(arg);
+  Random rnd(state->seed_);
+  int64_t reads = 0;
+  state->Change(TestState::RUNNING);
+  while (!state->quit_flag_.Acquire_Load()) {
+    state->t_.ReadStep(&rnd);
+    ++reads;
+  }
+  state->Change(TestState::DONE);
+}
+
+static void RunConcurrent(int run) {
+  const int seed = test::RandomSeed() + (run * 100);
+  Random rnd(seed);
+  const int N = 1000;
+  const int kSize = 1000;
+  for (int i = 0; i < N; i++) {
+    if ((i % 100) == 0) {
+      fprintf(stderr, "Run %d of %d\n", i, N);
+    }
+    TestState state(seed + 1);
+    Env::Default()->Schedule(ConcurrentReader, &state);
+    state.Wait(TestState::RUNNING);
+    for (int i = 0; i < kSize; i++) {
+      state.t_.WriteStep(&rnd);
+    }
+    state.quit_flag_.Release_Store(&state);  // Any non-nullptr arg will do
+    state.Wait(TestState::DONE);
+  }
+}
+
// Full writer/reader stress runs with five different seeds.
TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/snapshot.h b/db/snapshot.h
new file mode 100644 (file)
index 0000000..2c2e3ea
--- /dev/null
@@ -0,0 +1,86 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
#pragma once

#include <cassert>
#include <vector>

#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+class SnapshotList;
+
// Snapshots are kept in a doubly-linked list in the DB.
// Each SnapshotImpl corresponds to a particular sequence number.
class SnapshotImpl : public Snapshot {
 public:
  SequenceNumber number_;  // const after creation

 private:
  // Only SnapshotList may link/unlink snapshots.
  friend class SnapshotList;

  // SnapshotImpl is kept in a doubly-linked circular list
  SnapshotImpl* prev_;
  SnapshotImpl* next_;

  SnapshotList* list_;                 // just for sanity checks
};
+
+class SnapshotList {
+ public:
+  SnapshotList() {
+    list_.prev_ = &list_;
+    list_.next_ = &list_;
+    list_.number_ = 0xFFFFFFFFL;      // placeholder marker, for debugging
+  }
+
+  bool empty() const { return list_.next_ == &list_; }
+  SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
+  SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
+
+  const SnapshotImpl* New(SequenceNumber seq) {
+    SnapshotImpl* s = new SnapshotImpl;
+    s->number_ = seq;
+    s->list_ = this;
+    s->next_ = &list_;
+    s->prev_ = list_.prev_;
+    s->prev_->next_ = s;
+    s->next_->prev_ = s;
+    return s;
+  }
+
+  void Delete(const SnapshotImpl* s) {
+    assert(s->list_ == this);
+    s->prev_->next_ = s->next_;
+    s->next_->prev_ = s->prev_;
+    delete s;
+  }
+
+  // retrieve all snapshot numbers. They are sorted in ascending order.
+  void getAll(std::vector<SequenceNumber>& ret) {
+    if (empty()) return;
+    SnapshotImpl* s = &list_;
+    while (s->next_ != &list_) {
+      ret.push_back(s->next_->number_);
+      s = s ->next_;
+    }
+  }
+
+  // get the sequence number of the most recent snapshot
+  const SequenceNumber GetNewest() {
+    if (empty()) {
+      return 0;
+    }
+    return newest()->number_;
+  }
+
+ private:
+  // Dummy head of doubly-linked list of snapshots
+  SnapshotImpl list_;
+};
+
+}  // namespace rocksdb
diff --git a/db/table_cache.cc b/db/table_cache.cc
new file mode 100644 (file)
index 0000000..593352d
--- /dev/null
@@ -0,0 +1,174 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/filename.h"
+
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+static void DeleteEntry(const Slice& key, void* value) {
+  TableReader* table_reader = reinterpret_cast<TableReader*>(value);
+  delete table_reader;
+}
+
+static void UnrefEntry(void* arg1, void* arg2) {
+  Cache* cache = reinterpret_cast<Cache*>(arg1);
+  Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
+  cache->Release(h);
+}
+
+static Slice GetSliceForFileNumber(uint64_t* file_number) {
+  return Slice(reinterpret_cast<const char*>(file_number),
+               sizeof(*file_number));
+}
+
+TableCache::TableCache(const std::string& dbname,
+                       const Options* options,
+                       const EnvOptions& storage_options,
+                       int entries)
+    : env_(options->env),
+      dbname_(dbname),
+      options_(options),
+      storage_options_(storage_options),
+      cache_(
+        NewLRUCache(entries, options->table_cache_numshardbits,
+                    options->table_cache_remove_scan_count_limit)) {
+}
+
+TableCache::~TableCache() {
+}
+
+Status TableCache::FindTable(const EnvOptions& toptions,
+                             uint64_t file_number, uint64_t file_size,
+                             Cache::Handle** handle, bool* table_io,
+                             const bool no_io) {
+  Status s;
+  Slice key = GetSliceForFileNumber(&file_number);
+  *handle = cache_->Lookup(key);
+  if (*handle == nullptr) {
+    if (no_io) { // Don't do IO and return a not-found status
+      return Status::Incomplete("Table not found in table_cache, no_io is set");
+    }
+    if (table_io != nullptr) {
+      *table_io = true;    // we had to do IO from storage
+    }
+    std::string fname = TableFileName(dbname_, file_number);
+    unique_ptr<RandomAccessFile> file;
+    unique_ptr<TableReader> table_reader;
+    s = env_->NewRandomAccessFile(fname, &file, toptions);
+    RecordTick(options_->statistics.get(), NO_FILE_OPENS);
+    if (s.ok()) {
+      if (options_->advise_random_on_open) {
+        file->Hint(RandomAccessFile::RANDOM);
+      }
+      StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS);
+      s = options_->table_factory->GetTableReader(*options_, toptions,
+                                                  std::move(file), file_size,
+                                                  &table_reader);
+    }
+
+    if (!s.ok()) {
+      assert(table_reader == nullptr);
+      RecordTick(options_->statistics.get(), NO_FILE_ERRORS);
+      // We do not cache error results so that if the error is transient,
+      // or somebody repairs the file, we recover automatically.
+    } else {
+      assert(file.get() == nullptr);
+      *handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry);
+    }
+  }
+  return s;
+}
+
+Iterator* TableCache::NewIterator(const ReadOptions& options,
+                                  const EnvOptions& toptions,
+                                  uint64_t file_number,
+                                  uint64_t file_size,
+                                  TableReader** table_reader_ptr,
+                                  bool for_compaction) {
+  if (table_reader_ptr != nullptr) {
+    *table_reader_ptr = nullptr;
+  }
+
+  Cache::Handle* handle = nullptr;
+  Status s = FindTable(toptions, file_number, file_size, &handle,
+                       nullptr, options.read_tier == kBlockCacheTier);
+  if (!s.ok()) {
+    return NewErrorIterator(s);
+  }
+
+  TableReader* table_reader =
+    reinterpret_cast<TableReader*>(cache_->Value(handle));
+  Iterator* result = table_reader->NewIterator(options);
+  result->RegisterCleanup(&UnrefEntry, cache_.get(), handle);
+  if (table_reader_ptr != nullptr) {
+    *table_reader_ptr = table_reader;
+  }
+
+  if (for_compaction) {
+    table_reader->SetupForCompaction();
+  }
+
+  return result;
+}
+
+Status TableCache::Get(const ReadOptions& options,
+                       uint64_t file_number,
+                       uint64_t file_size,
+                       const Slice& k,
+                       void* arg,
+                       bool (*saver)(void*, const Slice&, const Slice&, bool),
+                       bool* table_io,
+                       void (*mark_key_may_exist)(void*)) {
+  Cache::Handle* handle = nullptr;
+  Status s = FindTable(storage_options_, file_number, file_size,
+                       &handle, table_io,
+                       options.read_tier == kBlockCacheTier);
+  if (s.ok()) {
+    TableReader* t =
+      reinterpret_cast<TableReader*>(cache_->Value(handle));
+    s = t->Get(options, k, arg, saver, mark_key_may_exist);
+    cache_->Release(handle);
+  } else if (options.read_tier && s.IsIncomplete()) {
+    // Couldn't find Table in cache but treat as kFound if no_io set
+    (*mark_key_may_exist)(arg);
+    return Status::OK();
+  }
+  return s;
+}
+
+bool TableCache::PrefixMayMatch(const ReadOptions& options,
+                                uint64_t file_number,
+                                uint64_t file_size,
+                                const Slice& internal_prefix,
+                                bool* table_io) {
+  Cache::Handle* handle = nullptr;
+  Status s = FindTable(storage_options_, file_number,
+                       file_size, &handle, table_io);
+  bool may_match = true;
+  if (s.ok()) {
+    TableReader* t =
+      reinterpret_cast<TableReader*>(cache_->Value(handle));
+    may_match = t->PrefixMayMatch(internal_prefix);
+    cache_->Release(handle);
+  }
+  return may_match;
+}
+
+void TableCache::Evict(uint64_t file_number) {
+  cache_->Erase(GetSliceForFileNumber(&file_number));
+}
+
+}  // namespace rocksdb
diff --git a/db/table_cache.h b/db/table_cache.h
new file mode 100644 (file)
index 0000000..4b225af
--- /dev/null
@@ -0,0 +1,78 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#pragma once
+#include <string>
+#include <stdint.h>
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "rocksdb/cache.h"
+#include "port/port.h"
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+
+class Env;
+
+class TableCache {
+ public:
+  TableCache(const std::string& dbname, const Options* options,
+             const EnvOptions& storage_options, int entries);
+  ~TableCache();
+
+  // Return an iterator for the specified file number (the corresponding
+  // file length must be exactly "file_size" bytes).  If "tableptr" is
+  // non-nullptr, also sets "*tableptr" to point to the Table object
+  // underlying the returned iterator, or nullptr if no Table object underlies
+  // the returned iterator.  The returned "*tableptr" object is owned by
+  // the cache and should not be deleted, and is valid for as long as the
+  // returned iterator is live.
+  Iterator* NewIterator(const ReadOptions& options,
+                        const EnvOptions& toptions,
+                        uint64_t file_number,
+                        uint64_t file_size,
+                        TableReader** table_reader_ptr = nullptr,
+                        bool for_compaction = false);
+
+  // If a seek to internal key "k" in specified file finds an entry,
+  // call (*handle_result)(arg, found_key, found_value) repeatedly until
+  // it returns false.
+  Status Get(const ReadOptions& options,
+             uint64_t file_number,
+             uint64_t file_size,
+             const Slice& k,
+             void* arg,
+             bool (*handle_result)(void*, const Slice&, const Slice&, bool),
+             bool* table_io,
+             void (*mark_key_may_exist)(void*) = nullptr);
+
+  // Determine whether the table may contain the specified prefix.  If
+  // the table index or blooms are not in memory, this may cause an I/O
+  bool PrefixMayMatch(const ReadOptions& options, uint64_t file_number,
+                      uint64_t file_size, const Slice& internal_prefix,
+                      bool* table_io);
+
+  // Evict any entry for the specified file number
+  void Evict(uint64_t file_number);
+
+ private:
+  Env* const env_;
+  const std::string dbname_;
+  const Options* options_;
+  const EnvOptions& storage_options_;
+  std::shared_ptr<Cache> cache_;
+
+  Status FindTable(const EnvOptions& toptions, uint64_t file_number,
+                   uint64_t file_size, Cache::Handle**, bool* table_io=nullptr,
+                   const bool no_io = false);
+};
+
+}  // namespace rocksdb
diff --git a/db/table_properties_collector.cc b/db/table_properties_collector.cc
new file mode 100644 (file)
index 0000000..3654663
--- /dev/null
@@ -0,0 +1,164 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/table_properties_collector.h"
+
+#include "db/dbformat.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+namespace {
+  void AppendProperty(
+      std::string& props,
+      const std::string& key,
+      const std::string& value,
+      const std::string& prop_delim,
+      const std::string& kv_delim) {
+    props.append(key);
+    props.append(kv_delim);
+    props.append(value);
+    props.append(prop_delim);
+  }
+
+  template <class TValue>
+  void AppendProperty(
+      std::string& props,
+      const std::string& key,
+      const TValue& value,
+      const std::string& prop_delim,
+      const std::string& kv_delim) {
+    AppendProperty(
+        props, key, std::to_string(value), prop_delim, kv_delim
+    );
+  }
+}
+
+std::string TableProperties::ToString(
+    const std::string& prop_delim,
+    const std::string& kv_delim) const {
+  std::string result;
+  result.reserve(1024);
+
+  // Basic Info
+  AppendProperty(
+      result, "# data blocks", num_data_blocks, prop_delim, kv_delim
+  );
+  AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim);
+
+  AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim);
+  AppendProperty(
+      result,
+      "raw average key size",
+      num_entries != 0 ?  1.0 * raw_key_size / num_entries : 0.0,
+      prop_delim,
+      kv_delim
+  );
+  AppendProperty(
+      result, "raw value size", raw_value_size, prop_delim, kv_delim
+  );
+  AppendProperty(
+      result,
+      "raw average value size",
+      num_entries != 0 ?  1.0 * raw_value_size / num_entries : 0.0,
+      prop_delim,
+      kv_delim
+  );
+
+  AppendProperty(result, "data block size", data_size, prop_delim, kv_delim);
+  AppendProperty(result, "index block size", index_size, prop_delim, kv_delim);
+  AppendProperty(
+      result, "filter block size", filter_size, prop_delim, kv_delim
+  );
+  AppendProperty(
+      result,
+      "(estimated) table size",
+      data_size + index_size + filter_size,
+      prop_delim,
+      kv_delim
+  );
+
+  AppendProperty(
+      result,
+      "filter policy name",
+      filter_policy_name.empty() ? std::string("N/A") : filter_policy_name,
+      prop_delim,
+      kv_delim
+  );
+
+  return result;
+}
+
+Status InternalKeyPropertiesCollector::Add(
+    const Slice& key, const Slice& value) {
+  ParsedInternalKey ikey;
+  if (!ParseInternalKey(key, &ikey)) {
+    return Status::InvalidArgument("Invalid internal key");
+  }
+
+  if (ikey.type == ValueType::kTypeDeletion) {
+    ++deleted_keys_;
+  }
+
+  return Status::OK();
+}
+
+Status InternalKeyPropertiesCollector::Finish(
+    TableProperties::UserCollectedProperties* properties) {
+  assert(properties);
+  assert(properties->find(
+        InternalKeyTablePropertiesNames::kDeletedKeys) == properties->end());
+  std::string val;
+
+  PutVarint64(&val, deleted_keys_);
+  properties->insert({ InternalKeyTablePropertiesNames::kDeletedKeys, val });
+
+  return Status::OK();
+}
+
+TableProperties::UserCollectedProperties
+InternalKeyPropertiesCollector::GetReadableProperties() const {
+  return {
+    { "kDeletedKeys", std::to_string(deleted_keys_) }
+  };
+}
+
+
+Status UserKeyTablePropertiesCollector::Add(
+    const Slice& key, const Slice& value) {
+  ParsedInternalKey ikey;
+  if (!ParseInternalKey(key, &ikey)) {
+    return Status::InvalidArgument("Invalid internal key");
+  }
+
+  return collector_->Add(ikey.user_key, value);
+}
+
+Status UserKeyTablePropertiesCollector::Finish(
+    TableProperties::UserCollectedProperties* properties) {
+  return collector_->Finish(properties);
+}
+
+TableProperties::UserCollectedProperties
+UserKeyTablePropertiesCollector::GetReadableProperties() const {
+  return collector_->GetReadableProperties();
+}
+
+
+const std::string InternalKeyTablePropertiesNames::kDeletedKeys
+  = "rocksdb.deleted.keys";
+
+uint64_t GetDeletedKeys(
+    const TableProperties::UserCollectedProperties& props) {
+  auto pos = props.find(InternalKeyTablePropertiesNames::kDeletedKeys);
+  if (pos == props.end()) {
+    return 0;
+  }
+  Slice raw = pos->second;
+  uint64_t val = 0;
+  return GetVarint64(&raw, &val) ? val : 0;
+}
+
+}  // namespace rocksdb
diff --git a/db/table_properties_collector.h b/db/table_properties_collector.h
new file mode 100644 (file)
index 0000000..533130d
--- /dev/null
@@ -0,0 +1,76 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file defines a collection of statistics collectors.
+#pragma once
+
+#include "rocksdb/table_properties.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+struct InternalKeyTablePropertiesNames {
+  static const std::string kDeletedKeys;
+};
+
+// Collecting the statistics for internal keys. Visible only to internal
+// rocksdb modules.
+class InternalKeyPropertiesCollector : public TablePropertiesCollector {
+ public:
+  virtual Status Add(const Slice& key, const Slice& value) override;
+
+  virtual Status Finish(
+      TableProperties::UserCollectedProperties* properties) override;
+
+  virtual const char* Name() const override {
+    return "InternalKeyPropertiesCollector";
+  }
+
+  TableProperties::UserCollectedProperties
+    GetReadableProperties() const override;
+
+ private:
+  uint64_t deleted_keys_ = 0;
+};
+
+// When rocksdb creates a new table, it will encode all "user keys" into
+// "internal keys", which contains meta information of a given entry.
+//
+// This class extracts user key from the encoded internal key when Add() is
+// invoked.
+class UserKeyTablePropertiesCollector : public TablePropertiesCollector {
+ public:
+  explicit UserKeyTablePropertiesCollector(
+      TablePropertiesCollector* collector) :
+      UserKeyTablePropertiesCollector(
+        std::shared_ptr<TablePropertiesCollector>(collector)
+    ) {
+  }
+
+  explicit UserKeyTablePropertiesCollector(
+      std::shared_ptr<TablePropertiesCollector> collector) :
+      collector_(collector) {
+  }
+
+  virtual ~UserKeyTablePropertiesCollector() { }
+
+  virtual Status Add(const Slice& key, const Slice& value) override;
+
+  virtual Status Finish(
+      TableProperties::UserCollectedProperties* properties) override;
+
+  virtual const char* Name() const override { return collector_->Name(); }
+
+  TableProperties::UserCollectedProperties
+    GetReadableProperties() const override;
+
+ protected:
+  std::shared_ptr<TablePropertiesCollector> collector_;
+};
+
+}  // namespace rocksdb
diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc
new file mode 100644 (file)
index 0000000..6f405b2
--- /dev/null
@@ -0,0 +1,266 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/table_properties_collector.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/table.h"
+#include "table/block_based_table_factory.h"
+#include "util/coding.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class TablePropertiesTest {
+ private:
+  unique_ptr<TableReader> table_reader_;
+};
+
+// TODO(kailiu) the following classes should be moved to some more general
+// places, so that other tests can also make use of them.
+// `FakeWritableFile` and `FakeRandomeAccessFile` bypass the real file system
+// and therefore enable us to quickly setup the tests.
+class FakeWritableFile : public WritableFile {
+ public:
+  ~FakeWritableFile() { }
+
+  const std::string& contents() const { return contents_; }
+
+  virtual Status Close() { return Status::OK(); }
+  virtual Status Flush() { return Status::OK(); }
+  virtual Status Sync() { return Status::OK(); }
+
+  virtual Status Append(const Slice& data) {
+    contents_.append(data.data(), data.size());
+    return Status::OK();
+  }
+
+ private:
+  std::string contents_;
+};
+
+
+class FakeRandomeAccessFile : public RandomAccessFile {
+ public:
+  explicit FakeRandomeAccessFile(const Slice& contents)
+      : contents_(contents.data(), contents.size()) {
+  }
+
+  virtual ~FakeRandomeAccessFile() { }
+
+  uint64_t Size() const { return contents_.size(); }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                       char* scratch) const {
+    if (offset > contents_.size()) {
+      return Status::InvalidArgument("invalid Read offset");
+    }
+    if (offset + n > contents_.size()) {
+      n = contents_.size() - offset;
+    }
+    memcpy(scratch, &contents_[offset], n);
+    *result = Slice(scratch, n);
+    return Status::OK();
+  }
+
+ private:
+  std::string contents_;
+};
+
+
+class DumbLogger : public Logger {
+ public:
+  virtual void Logv(const char* format, va_list ap) { }
+  virtual size_t GetLogFileSize() const { return 0; }
+};
+
+// Utility test functions
+void MakeBuilder(
+    const Options& options,
+    std::unique_ptr<FakeWritableFile>* writable,
+    std::unique_ptr<TableBuilder>* builder) {
+  writable->reset(new FakeWritableFile);
+  builder->reset(
+      options.table_factory->GetTableBuilder(options, writable->get(),
+                                             options.compression));
+}
+
+void OpenTable(
+    const Options& options,
+    const std::string& contents,
+    std::unique_ptr<TableReader>* table_reader) {
+
+  std::unique_ptr<RandomAccessFile> file(new FakeRandomeAccessFile(contents));
+  auto s = options.table_factory->GetTableReader(
+      options,
+      EnvOptions(),
+      std::move(file),
+      contents.size(),
+      table_reader
+  );
+  ASSERT_OK(s);
+}
+
+// Collects keys that start with "A" in a table.
+class RegularKeysStartWithA: public TablePropertiesCollector {
+ public:
+   const char* Name() const { return "RegularKeysStartWithA"; }
+
+   Status Finish(TableProperties::UserCollectedProperties* properties) {
+     std::string encoded;
+     PutVarint32(&encoded, count_);
+     *properties = TableProperties::UserCollectedProperties {
+       { "TablePropertiesTest", "Rocksdb" },
+       { "Count", encoded }
+     };
+     return Status::OK();
+   }
+
+   Status Add(const Slice& user_key, const Slice& value) {
+     // simply assume all user keys are not empty.
+     if (user_key.data()[0] == 'A') {
+       ++count_;
+     }
+     return Status::OK();
+   }
+
+  virtual TableProperties::UserCollectedProperties
+    GetReadableProperties() const {
+      return {};
+  }
+
+
+ private:
+  uint32_t count_ = 0;
+};
+
+TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) {
+  Options options;
+
+  // make sure the entries will be inserted with order.
+  std::map<std::string, std::string> kvs = {
+    {"About",     "val5"},  // starts with 'A'
+    {"Abstract",  "val2"},  // starts with 'A'
+    {"Around",    "val7"},  // starts with 'A'
+    {"Beyond",    "val3"},
+    {"Builder",   "val1"},
+    {"Cancel",    "val4"},
+    {"Find",      "val6"},
+  };
+
+  // Test properties collectors with internal keys or regular keys
+  for (bool encode_as_internal : { true, false }) {
+    // -- Step 1: build table
+    auto collector = new RegularKeysStartWithA();
+    if (encode_as_internal) {
+      options.table_properties_collectors = {
+        std::make_shared<UserKeyTablePropertiesCollector>(collector)
+      };
+    } else {
+      options.table_properties_collectors.resize(1);
+      options.table_properties_collectors[0].reset(collector);
+    }
+    std::unique_ptr<TableBuilder> builder;
+    std::unique_ptr<FakeWritableFile> writable;
+    MakeBuilder(options, &writable, &builder);
+
+    for (const auto& kv : kvs) {
+      if (encode_as_internal) {
+        InternalKey ikey(kv.first, 0, ValueType::kTypeValue);
+        builder->Add(ikey.Encode(), kv.second);
+      } else {
+        builder->Add(kv.first, kv.second);
+      }
+    }
+    ASSERT_OK(builder->Finish());
+
+    // -- Step 2: Open table
+    std::unique_ptr<TableReader> table_reader;
+    OpenTable(options, writable->contents(), &table_reader);
+    const auto& properties =
+      table_reader->GetTableProperties().user_collected_properties;
+
+    ASSERT_EQ("Rocksdb", properties.at("TablePropertiesTest"));
+
+    uint32_t starts_with_A = 0;
+    Slice key(properties.at("Count"));
+    ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+    ASSERT_EQ(3u, starts_with_A);
+  }
+}
+
+TEST(TablePropertiesTest, InternalKeyPropertiesCollector) {
+  InternalKey keys[] = {
+    InternalKey("A", 0, ValueType::kTypeValue),
+    InternalKey("B", 0, ValueType::kTypeValue),
+    InternalKey("C", 0, ValueType::kTypeValue),
+    InternalKey("W", 0, ValueType::kTypeDeletion),
+    InternalKey("X", 0, ValueType::kTypeDeletion),
+    InternalKey("Y", 0, ValueType::kTypeDeletion),
+    InternalKey("Z", 0, ValueType::kTypeDeletion),
+  };
+
+  for (bool sanitized : { false, true }) {
+    std::unique_ptr<TableBuilder> builder;
+    std::unique_ptr<FakeWritableFile> writable;
+    Options options;
+    if (sanitized) {
+      options.table_properties_collectors = {
+        std::make_shared<RegularKeysStartWithA>()
+      };
+      // with sanitization, even regular properties collector will be able to
+      // handle internal keys.
+      auto comparator = options.comparator;
+      // HACK: Set options.info_log to avoid writing log in
+      // SanitizeOptions().
+      options.info_log = std::make_shared<DumbLogger>();
+      options = SanitizeOptions(
+          "db",  // just a place holder
+          nullptr,  // with skip internal key comparator
+          nullptr,  // don't care filter policy
+          options
+      );
+      options.comparator = comparator;
+    } else {
+      options.table_properties_collectors = {
+        std::make_shared<InternalKeyPropertiesCollector>()
+      };
+    }
+
+    MakeBuilder(options, &writable, &builder);
+    for (const auto& k : keys) {
+      builder->Add(k.Encode(), "val");
+    }
+
+    ASSERT_OK(builder->Finish());
+
+    std::unique_ptr<TableReader> table_reader;
+    OpenTable(options, writable->contents(), &table_reader);
+    const auto& properties =
+      table_reader->GetTableProperties().user_collected_properties;
+
+    uint64_t deleted = GetDeletedKeys(properties);
+    ASSERT_EQ(4u, deleted);
+
+    if (sanitized) {
+      uint32_t starts_with_A = 0;
+      Slice key(properties.at("Count"));
+      ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+      ASSERT_EQ(1u, starts_with_A);
+    }
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc
new file mode 100644 (file)
index 0000000..092d88c
--- /dev/null
@@ -0,0 +1,264 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "db/transaction_log_impl.h"
+#include "db/write_batch_internal.h"
+
+namespace rocksdb {
+
+TransactionLogIteratorImpl::TransactionLogIteratorImpl(
+                           const std::string& dir,
+                           const Options* options,
+                           const EnvOptions& soptions,
+                           const SequenceNumber seq,
+                           std::unique_ptr<VectorLogPtr> files,
+                           DBImpl const * const dbimpl) :
+    dir_(dir),
+    options_(options),
+    soptions_(soptions),
+    startingSequenceNumber_(seq),
+    files_(std::move(files)),
+    started_(false),
+    isValid_(false),
+    currentFileIndex_(0),
+    currentBatchSeq_(0),
+    currentLastSeq_(0),
+    dbimpl_(dbimpl) {
+  assert(files_ != nullptr);
+  assert(dbimpl_ != nullptr);
+
+  reporter_.env = options_->env;
+  reporter_.info_log = options_->info_log.get();
+  SeekToStartSequence(); // Seek till starting sequence
+}
+
+Status TransactionLogIteratorImpl::OpenLogFile(
+    const LogFile* logFile,
+    unique_ptr<SequentialFile>* file) {
+  Env* env = options_->env;
+  if (logFile->Type() == kArchivedLogFile) {
+    std::string fname = ArchivedLogFileName(dir_, logFile->LogNumber());
+    return env->NewSequentialFile(fname, file, soptions_);
+  } else {
+    std::string fname = LogFileName(dir_, logFile->LogNumber());
+    Status status = env->NewSequentialFile(fname, file, soptions_);
+    if (!status.ok()) {
+      //  If cannot open file in DB directory.
+      //  Try the archive dir, as it could have moved in the meanwhile.
+      fname = ArchivedLogFileName(dir_, logFile->LogNumber());
+      status = env->NewSequentialFile(fname, file, soptions_);
+      if (!status.ok()) {
+        return Status::IOError("Requested file not present in the dir");
+      }
+    }
+    return status;
+  }
+}
+
+BatchResult TransactionLogIteratorImpl::GetBatch()  {
+  assert(isValid_);  //  cannot call in a non valid state.
+  BatchResult result;
+  result.sequence = currentBatchSeq_;
+  result.writeBatchPtr = std::move(currentBatch_);
+  return result;
+}
+
+Status TransactionLogIteratorImpl::status() {
+  return currentStatus_;
+}
+
+bool TransactionLogIteratorImpl::Valid() {
+  return started_ && isValid_;
+}
+
+bool TransactionLogIteratorImpl::RestrictedRead(
+    Slice* record,
+    std::string* scratch) {
+  // Don't read if no more complete entries to read from logs
+  if (currentLastSeq_ >= dbimpl_->GetLatestSequenceNumber()) {
+    return false;
+  }
+  return currentLogReader_->ReadRecord(record, scratch);
+}
+
+void TransactionLogIteratorImpl::SeekToStartSequence(
+    uint64_t startFileIndex,
+    bool strict) {
+  std::string scratch;
+  Slice record;
+  started_ = false;
+  isValid_ = false;
+  if (files_->size() <= startFileIndex) {
+    return;
+  }
+  Status s = OpenLogReader(files_->at(startFileIndex).get());
+  if (!s.ok()) {
+    currentStatus_ = s;
+    return;
+  }
+  while (RestrictedRead(&record, &scratch)) {
+    if (record.size() < 12) {
+      reporter_.Corruption(
+        record.size(), Status::Corruption("very small log record"));
+      continue;
+    }
+    UpdateCurrentWriteBatch(record);
+    if (currentLastSeq_ >= startingSequenceNumber_) {
+      if (strict && currentBatchSeq_ != startingSequenceNumber_) {
+        currentStatus_ = Status::Corruption("Gap in sequence number. Could not "
+                                            "seek to required sequence number");
+        reporter_.Info(currentStatus_.ToString().c_str());
+        return;
+      } else if (strict) {
+        reporter_.Info("Could seek required sequence number. Iterator will "
+                       "continue.");
+      }
+      isValid_ = true;
+      started_ = true; // set started_ as we could seek till starting sequence
+      return;
+    } else {
+      isValid_ = false;
+    }
+  }
+
+  // Could not find start sequence in first file. Normally this must be the
+  // only file. Otherwise log the error and let the iterator return next entry
+  // If strict is set, we want to seek exactly till the start sequence and it
+  // should have been present in the file we scanned above
+  if (strict) {
+    currentStatus_ = Status::Corruption("Gap in sequence number. Could not "
+                                        "seek to required sequence number");
+    reporter_.Info(currentStatus_.ToString().c_str());
+  } else if (files_->size() != 1) {
+    currentStatus_ = Status::Corruption("Start sequence was not found, "
+                                        "skipping to the next available");
+    reporter_.Info(currentStatus_.ToString().c_str());
+    // Let NextImpl find the next available entry. started_ remains false
+    // because we don't want to check for gaps while moving to start sequence
+    NextImpl(true);
+  }
+}
+
+void TransactionLogIteratorImpl::Next() {
+  return NextImpl(false);
+}
+
+void TransactionLogIteratorImpl::NextImpl(bool internal) {
+  std::string scratch;
+  Slice record;
+  isValid_ = false;
+  if (!internal && !started_) {
+    // Runs every time until we can seek to the start sequence
+    return SeekToStartSequence();
+  }
+  while(true) {
+    assert(currentLogReader_);
+    if (currentLogReader_->IsEOF()) {
+      currentLogReader_->UnmarkEOF();
+    }
+    while (RestrictedRead(&record, &scratch)) {
+      if (record.size() < 12) {
+        reporter_.Corruption(
+          record.size(), Status::Corruption("very small log record"));
+        continue;
+      } else {
+        // started_ should be true if called by application
+        assert(internal || started_);
+        // started_ should be false if called internally
+        assert(!internal || !started_);
+        UpdateCurrentWriteBatch(record);
+        if (internal && !started_) {
+          started_ = true;
+        }
+        return;
+      }
+    }
+
+    // Open the next file
+    if (currentFileIndex_ < files_->size() - 1) {
+      ++currentFileIndex_;
+      Status status =OpenLogReader(files_->at(currentFileIndex_).get());
+      if (!status.ok()) {
+        isValid_ = false;
+        currentStatus_ = status;
+        return;
+      }
+    } else {
+      isValid_ = false;
+      if (currentLastSeq_ == dbimpl_->GetLatestSequenceNumber()) {
+        currentStatus_ = Status::OK();
+      } else {
+        currentStatus_ = Status::IOError("NO MORE DATA LEFT");
+      }
+      return;
+    }
+  }
+}
+
+bool TransactionLogIteratorImpl::IsBatchExpected(
+    const WriteBatch* batch,
+    const SequenceNumber expectedSeq) {
+  assert(batch);
+  SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch);
+  if (batchSeq != expectedSeq) {
+    char buf[200];
+    snprintf(buf, sizeof(buf),
+             "Discontinuity in log records. Got seq=%lu, Expected seq=%lu, "
+             "Last flushed seq=%lu.Log iterator will reseek the correct "
+             "batch.",
+             (unsigned long)batchSeq,
+             (unsigned long)expectedSeq,
+             (unsigned long)dbimpl_->GetLatestSequenceNumber());
+    reporter_.Info(buf);
+    return false;
+  }
+  return true;
+}
+
+void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
+  std::unique_ptr<WriteBatch> batch(new WriteBatch());
+  WriteBatchInternal::SetContents(batch.get(), record);
+
+  SequenceNumber expectedSeq = currentLastSeq_ + 1;
+  // If the iterator has started, then confirm that we get continuous batches
+  if (started_ && !IsBatchExpected(batch.get(), expectedSeq)) {
+    // Seek to the batch having expected sequence number
+    if (expectedSeq < files_->at(currentFileIndex_)->StartSequence()) {
+      // Expected batch must lie in the previous log file
+      // Avoid underflow.
+      if (currentFileIndex_ != 0) {
+        currentFileIndex_--;
+      }
+    }
+    startingSequenceNumber_ = expectedSeq;
+    // currentStatus_ will be set to Ok if reseek succeeds
+    currentStatus_ = Status::NotFound("Gap in sequence numbers");
+    return SeekToStartSequence(currentFileIndex_, true);
+  }
+
+  currentBatchSeq_ = WriteBatchInternal::Sequence(batch.get());
+  currentLastSeq_ = currentBatchSeq_ +
+                    WriteBatchInternal::Count(batch.get()) - 1;
+  // currentBatchSeq_ can only change here
+  assert(currentLastSeq_ <= dbimpl_->GetLatestSequenceNumber());
+
+  currentBatch_ = move(batch);
+  isValid_ = true;
+  currentStatus_ = Status::OK();
+}
+
+Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) {
+  unique_ptr<SequentialFile> file;
+  Status status = OpenLogFile(logFile, &file);
+  if (!status.ok()) {
+    return status;
+  }
+  assert(file);
+  currentLogReader_.reset(
+    new log::Reader(std::move(file), &reporter_, true, 0)
+  );
+  return Status::OK();
+}
+}  //  namespace rocksdb
diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h
new file mode 100644 (file)
index 0000000..f3f4ce2
--- /dev/null
@@ -0,0 +1,118 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/transaction_log.h"
+#include "db/db_impl.h"
+#include "db/log_reader.h"
+#include "db/filename.h"
+
+namespace rocksdb {
+
+struct LogReporter : public log::Reader::Reporter {
+  Env* env;
+  Logger* info_log;
+  virtual void Corruption(size_t bytes, const Status& s) {
+    Log(info_log, "dropping %zu bytes; %s", bytes, s.ToString().c_str());
+  }
+  virtual void Info(const char* s) {
+    Log(info_log, "%s", s);
+  }
+};
+
+class LogFileImpl : public LogFile {
+ public:
+  LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
+              uint64_t sizeBytes) :
+    logNumber_(logNum),
+    type_(logType),
+    startSequence_(startSeq),
+    sizeFileBytes_(sizeBytes) {
+  }
+
+  std::string PathName() const {
+    if (type_ == kArchivedLogFile) {
+      return ArchivedLogFileName("", logNumber_);
+    }
+    return LogFileName("", logNumber_);
+  }
+
+  uint64_t LogNumber() const { return logNumber_; }
+
+  WalFileType Type() const { return type_; }
+
+  SequenceNumber StartSequence() const { return startSequence_; }
+
+  uint64_t SizeFileBytes() const { return sizeFileBytes_; }
+
+  bool operator < (const LogFile& that) const {
+    return LogNumber() < that.LogNumber();
+  }
+
+ private:
+  uint64_t logNumber_;
+  WalFileType type_;
+  SequenceNumber startSequence_;
+  uint64_t sizeFileBytes_;
+
+};
+
+class TransactionLogIteratorImpl : public TransactionLogIterator {
+ public:
+  TransactionLogIteratorImpl(const std::string& dir,
+                             const Options* options,
+                             const EnvOptions& soptions,
+                             const SequenceNumber seqNum,
+                             std::unique_ptr<VectorLogPtr> files,
+                             DBImpl const * const dbimpl);
+
+  virtual bool Valid();
+
+  virtual void Next();
+
+  virtual Status status();
+
+  virtual BatchResult GetBatch();
+
+ private:
+  const std::string& dir_;
+  const Options* options_;
+  const EnvOptions& soptions_;
+  SequenceNumber startingSequenceNumber_;
+  std::unique_ptr<VectorLogPtr> files_;
+  bool started_;
+  bool isValid_;  // not valid when it starts off.
+  Status currentStatus_;
+  size_t currentFileIndex_;
+  std::unique_ptr<WriteBatch> currentBatch_;
+  unique_ptr<log::Reader> currentLogReader_;
+  Status OpenLogFile(const LogFile* logFile, unique_ptr<SequentialFile>* file);
+  LogReporter reporter_;
+  SequenceNumber currentBatchSeq_; // sequence number at start of current batch
+  SequenceNumber currentLastSeq_; // last sequence in the current batch
+  DBImpl const * const dbimpl_; // The db on whose log files this iterates
+
+  // Reads from transaction log only if the writebatch record has been written
+  bool RestrictedRead(Slice* record, std::string* scratch);
+  // Seeks to startingSequenceNumber reading from startFileIndex in files_.
+  // If strict is set, then must get a batch starting with startingSequenceNumber
+  void SeekToStartSequence(uint64_t startFileIndex = 0, bool strict = false);
+  // Implementation of Next. SeekToStartSequence calls it internally with
+  // internal=true to let it find next entry even if it has to jump gaps because
+  // the iterator may start off from the first available entry but promises to
+  // be continuous after that
+  void NextImpl(bool internal = false);
+  // Check if batch is expected, else return false
+  bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expectedSeq);
+  // Update the current batch if it is continuous; otherwise reseek to the expected sequence
+  void UpdateCurrentWriteBatch(const Slice& record);
+  Status OpenLogReader(const LogFile* file);
+};
+}  //  namespace rocksdb
diff --git a/db/version_edit.cc b/db/version_edit.cc
new file mode 100644 (file)
index 0000000..42c07e7
--- /dev/null
@@ -0,0 +1,301 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/version_set.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+// Tag numbers for serialized VersionEdit.  These numbers are written to
+// disk and should not be changed.
+enum Tag {
+  kComparator           = 1,
+  kLogNumber            = 2,
+  kNextFileNumber       = 3,
+  kLastSequence         = 4,
+  kCompactPointer       = 5,
+  kDeletedFile          = 6,
+  kNewFile              = 7,
+  // 8 was used for large value refs
+  kPrevLogNumber        = 9,
+
+  // these are new formats divergent from open source leveldb
+  kNewFile2             = 100  // store smallest & largest seqno
+};
+
+void VersionEdit::Clear() {
+  comparator_.clear();
+  max_level_ = 0;
+  log_number_ = 0;
+  prev_log_number_ = 0;
+  last_sequence_ = 0;
+  next_file_number_ = 0;
+  has_comparator_ = false;
+  has_log_number_ = false;
+  has_prev_log_number_ = false;
+  has_next_file_number_ = false;
+  has_last_sequence_ = false;
+  deleted_files_.clear();
+  new_files_.clear();
+}
+
+void VersionEdit::EncodeTo(std::string* dst) const {
+  if (has_comparator_) {
+    PutVarint32(dst, kComparator);
+    PutLengthPrefixedSlice(dst, comparator_);
+  }
+  if (has_log_number_) {
+    PutVarint32(dst, kLogNumber);
+    PutVarint64(dst, log_number_);
+  }
+  if (has_prev_log_number_) {
+    PutVarint32(dst, kPrevLogNumber);
+    PutVarint64(dst, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    PutVarint32(dst, kNextFileNumber);
+    PutVarint64(dst, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    PutVarint32(dst, kLastSequence);
+    PutVarint64(dst, last_sequence_);
+  }
+
+  for (size_t i = 0; i < compact_pointers_.size(); i++) {
+    PutVarint32(dst, kCompactPointer);
+    PutVarint32(dst, compact_pointers_[i].first);  // level
+    PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode());
+  }
+
+  for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+       iter != deleted_files_.end();
+       ++iter) {
+    PutVarint32(dst, kDeletedFile);
+    PutVarint32(dst, iter->first);   // level
+    PutVarint64(dst, iter->second);  // file number
+  }
+
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    PutVarint32(dst, kNewFile2);
+    PutVarint32(dst, new_files_[i].first);  // level
+    PutVarint64(dst, f.number);
+    PutVarint64(dst, f.file_size);
+    PutLengthPrefixedSlice(dst, f.smallest.Encode());
+    PutLengthPrefixedSlice(dst, f.largest.Encode());
+    PutVarint64(dst, f.smallest_seqno);
+    PutVarint64(dst, f.largest_seqno);
+  }
+}
+
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+  Slice str;
+  if (GetLengthPrefixedSlice(input, &str)) {
+    dst->DecodeFrom(str);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) {
+  uint32_t v;
+  if (GetVarint32(input, &v)) {
+    *level = v;
+    if (max_level_ < *level) {
+      max_level_ = *level;
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+Status VersionEdit::DecodeFrom(const Slice& src) {
+  Clear();
+  Slice input = src;
+  const char* msg = nullptr;
+  uint32_t tag;
+
+  // Temporary storage for parsing
+  int level;
+  uint64_t number;
+  FileMetaData f;
+  Slice str;
+  InternalKey key;
+
+  while (msg == nullptr && GetVarint32(&input, &tag)) {
+    switch (tag) {
+      case kComparator:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          comparator_ = str.ToString();
+          has_comparator_ = true;
+        } else {
+          msg = "comparator name";
+        }
+        break;
+
+      case kLogNumber:
+        if (GetVarint64(&input, &log_number_)) {
+          has_log_number_ = true;
+        } else {
+          msg = "log number";
+        }
+        break;
+
+      case kPrevLogNumber:
+        if (GetVarint64(&input, &prev_log_number_)) {
+          has_prev_log_number_ = true;
+        } else {
+          msg = "previous log number";
+        }
+        break;
+
+      case kNextFileNumber:
+        if (GetVarint64(&input, &next_file_number_)) {
+          has_next_file_number_ = true;
+        } else {
+          msg = "next file number";
+        }
+        break;
+
+      case kLastSequence:
+        if (GetVarint64(&input, &last_sequence_)) {
+          has_last_sequence_ = true;
+        } else {
+          msg = "last sequence number";
+        }
+        break;
+
+      case kCompactPointer:
+        if (GetLevel(&input, &level, &msg) &&
+            GetInternalKey(&input, &key)) {
+          compact_pointers_.push_back(std::make_pair(level, key));
+        } else {
+          if (!msg) {
+            msg = "compaction pointer";
+          }
+        }
+        break;
+
+      case kDeletedFile:
+        if (GetLevel(&input, &level, &msg) &&
+            GetVarint64(&input, &number)) {
+          deleted_files_.insert(std::make_pair(level, number));
+        } else {
+          if (!msg) {
+            msg = "deleted file";
+          }
+        }
+        break;
+
+      case kNewFile:
+        if (GetLevel(&input, &level, &msg) &&
+            GetVarint64(&input, &f.number) &&
+            GetVarint64(&input, &f.file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest)) {
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          if (!msg) {
+            msg = "new-file entry";
+          }
+        }
+        break;
+
+      case kNewFile2:
+        if (GetLevel(&input, &level, &msg) &&
+            GetVarint64(&input, &f.number) &&
+            GetVarint64(&input, &f.file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest) &&
+            GetVarint64(&input, &f.smallest_seqno) &&
+            GetVarint64(&input, &f.largest_seqno) ) {
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          if (!msg) {
+            msg = "new-file2 entry";
+          }
+        }
+        break;
+
+      default:
+        msg = "unknown tag";
+        break;
+    }
+  }
+
+  if (msg == nullptr && !input.empty()) {
+    msg = "invalid tag";
+  }
+
+  Status result;
+  if (msg != nullptr) {
+    result = Status::Corruption("VersionEdit", msg);
+  }
+  return result;
+}
+
+std::string VersionEdit::DebugString(bool hex_key) const {
+  std::string r;
+  r.append("VersionEdit {");
+  if (has_comparator_) {
+    r.append("\n  Comparator: ");
+    r.append(comparator_);
+  }
+  if (has_log_number_) {
+    r.append("\n  LogNumber: ");
+    AppendNumberTo(&r, log_number_);
+  }
+  if (has_prev_log_number_) {
+    r.append("\n  PrevLogNumber: ");
+    AppendNumberTo(&r, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    r.append("\n  NextFile: ");
+    AppendNumberTo(&r, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    r.append("\n  LastSeq: ");
+    AppendNumberTo(&r, last_sequence_);
+  }
+  for (size_t i = 0; i < compact_pointers_.size(); i++) {
+    r.append("\n  CompactPointer: ");
+    AppendNumberTo(&r, compact_pointers_[i].first);
+    r.append(" ");
+    r.append(compact_pointers_[i].second.DebugString(hex_key));
+  }
+  for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+       iter != deleted_files_.end();
+       ++iter) {
+    r.append("\n  DeleteFile: ");
+    AppendNumberTo(&r, iter->first);
+    r.append(" ");
+    AppendNumberTo(&r, iter->second);
+  }
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    r.append("\n  AddFile: ");
+    AppendNumberTo(&r, new_files_[i].first);
+    r.append(" ");
+    AppendNumberTo(&r, f.number);
+    r.append(" ");
+    AppendNumberTo(&r, f.file_size);
+    r.append(" ");
+    r.append(f.smallest.DebugString(hex_key));
+    r.append(" .. ");
+    r.append(f.largest.DebugString(hex_key));
+  }
+  r.append("\n}\n");
+  return r;
+}
+
+}  // namespace rocksdb
diff --git a/db/version_edit.h b/db/version_edit.h
new file mode 100644 (file)
index 0000000..a0546c9
--- /dev/null
@@ -0,0 +1,125 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <set>
+#include <utility>
+#include <vector>
+#include "db/dbformat.h"
+
+namespace rocksdb {
+
+class VersionSet;
+
+struct FileMetaData {
+  int refs;
+  int allowed_seeks;          // Seeks allowed until compaction
+  uint64_t number;
+  uint64_t file_size;         // File size in bytes
+  InternalKey smallest;       // Smallest internal key served by table
+  InternalKey largest;        // Largest internal key served by table
+  bool being_compacted;       // Is this file undergoing compaction?
+  SequenceNumber smallest_seqno;// The smallest seqno in this file
+  SequenceNumber largest_seqno; // The largest seqno in this file
+
+  FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0),
+                   being_compacted(false) { }
+};
+
+class VersionEdit {
+ public:
+  VersionEdit() { Clear(); }
+  ~VersionEdit() { }
+
+  void Clear();
+
+  void SetComparatorName(const Slice& name) {
+    has_comparator_ = true;
+    comparator_ = name.ToString();
+  }
+  void SetLogNumber(uint64_t num) {
+    has_log_number_ = true;
+    log_number_ = num;
+  }
+  void SetPrevLogNumber(uint64_t num) {
+    has_prev_log_number_ = true;
+    prev_log_number_ = num;
+  }
+  void SetNextFile(uint64_t num) {
+    has_next_file_number_ = true;
+    next_file_number_ = num;
+  }
+  void SetLastSequence(SequenceNumber seq) {
+    has_last_sequence_ = true;
+    last_sequence_ = seq;
+  }
+  void SetCompactPointer(int level, const InternalKey& key) {
+    compact_pointers_.push_back(std::make_pair(level, key));
+  }
+
+  // Add the specified file at the specified number.
+  // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
+  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+  void AddFile(int level, uint64_t file,
+               uint64_t file_size,
+               const InternalKey& smallest,
+               const InternalKey& largest,
+               const SequenceNumber& smallest_seqno,
+               const SequenceNumber& largest_seqno) {
+    FileMetaData f;
+    f.number = file;
+    f.file_size = file_size;
+    f.smallest = smallest;
+    f.largest = largest;
+    f.smallest_seqno = smallest_seqno;
+    f.largest_seqno = largest_seqno;
+    assert(smallest_seqno <= largest_seqno);
+    new_files_.push_back(std::make_pair(level, f));
+  }
+
+  // Delete the specified "file" from the specified "level".
+  void DeleteFile(int level, uint64_t file) {
+    deleted_files_.insert(std::make_pair(level, file));
+  }
+
+  // Number of edits
+  int NumEntries() {
+    return new_files_.size() + deleted_files_.size();
+  }
+
+  void EncodeTo(std::string* dst) const;
+  Status DecodeFrom(const Slice& src);
+
+  std::string DebugString(bool hex_key = false) const;
+
+ private:
+  friend class VersionSet;
+
+  typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
+
+  bool GetLevel(Slice* input, int* level, const char** msg);
+
+  int max_level_;
+  std::string comparator_;
+  uint64_t log_number_;
+  uint64_t prev_log_number_;
+  uint64_t next_file_number_;
+  SequenceNumber last_sequence_;
+  bool has_comparator_;
+  bool has_log_number_;
+  bool has_prev_log_number_;
+  bool has_next_file_number_;
+  bool has_last_sequence_;
+
+  std::vector<std::pair<int, InternalKey> > compact_pointers_;
+  DeletedFileSet deleted_files_;
+  std::vector<std::pair<int, FileMetaData> > new_files_;
+};
+
+}  // namespace rocksdb
diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc
new file mode 100644 (file)
index 0000000..63aa32e
--- /dev/null
@@ -0,0 +1,53 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+static void TestEncodeDecode(const VersionEdit& edit) {
+  std::string encoded, encoded2;
+  edit.EncodeTo(&encoded);
+  VersionEdit parsed;
+  Status s = parsed.DecodeFrom(encoded);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  parsed.EncodeTo(&encoded2);
+  ASSERT_EQ(encoded, encoded2);
+}
+
+class VersionEditTest { };
+
+TEST(VersionEditTest, EncodeDecode) {
+  static const uint64_t kBig = 1ull << 50;
+
+  VersionEdit edit;
+  for (int i = 0; i < 4; i++) {
+    TestEncodeDecode(edit);
+    edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
+                 InternalKey("foo", kBig + 500 + i, kTypeValue),
+                 InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
+                 kBig + 500 + i,
+                 kBig + 600 + i);
+    edit.DeleteFile(4, kBig + 700 + i);
+    edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
+  }
+
+  edit.SetComparatorName("foo");
+  edit.SetLogNumber(kBig + 100);
+  edit.SetNextFile(kBig + 200);
+  edit.SetLastSequence(kBig + 1000);
+  TestEncodeDecode(edit);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/version_set.cc b/db/version_set.cc
new file mode 100644 (file)
index 0000000..eb20650
--- /dev/null
@@ -0,0 +1,3148 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+
+#include <algorithm>
+#include <climits>
+#include <stdio.h>
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/table_cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/table.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+static uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  uint64_t sum = 0;
+  for (size_t i = 0; i < files.size() && files[i]; i++) {
+    sum += files[i]->file_size;
+  }
+  return sum;
+}
+
+Version::~Version() {
+  assert(refs_ == 0);
+
+  // Remove from linked list
+  prev_->next_ = next_;
+  next_->prev_ = prev_;
+
+  // Drop references to files
+  for (int level = 0; level < num_levels_; level++) {
+    for (size_t i = 0; i < files_[level].size(); i++) {
+      FileMetaData* f = files_[level][i];
+      assert(f->refs > 0);
+      f->refs--;
+      if (f->refs <= 0) {
+        vset_->obsolete_files_.push_back(f);
+      }
+    }
+  }
+  delete[] files_;
+}
+
+int FindFile(const InternalKeyComparator& icmp,
+             const std::vector<FileMetaData*>& files,
+             const Slice& key) {
+  uint32_t left = 0;
+  uint32_t right = files.size();
+  while (left < right) {
+    uint32_t mid = (left + right) / 2;
+    const FileMetaData* f = files[mid];
+    if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) {
+      // Key at "mid.largest" is < "target".  Therefore all
+      // files at or before "mid" are uninteresting.
+      left = mid + 1;
+    } else {
+      // Key at "mid.largest" is >= "target".  Therefore all files
+      // after "mid" are uninteresting.
+      right = mid;
+    }
+  }
+  return right;
+}
+
+static bool AfterFile(const Comparator* ucmp,
+                      const Slice* user_key, const FileMetaData* f) {
+  // nullptr user_key occurs before all keys and is therefore never after *f
+  return (user_key != nullptr &&
+          ucmp->Compare(*user_key, f->largest.user_key()) > 0);
+}
+
+static bool BeforeFile(const Comparator* ucmp,
+                       const Slice* user_key, const FileMetaData* f) {
+  // nullptr user_key occurs after all keys and is therefore never before *f
+  return (user_key != nullptr &&
+          ucmp->Compare(*user_key, f->smallest.user_key()) < 0);
+}
+
+bool SomeFileOverlapsRange(
+    const InternalKeyComparator& icmp,
+    bool disjoint_sorted_files,
+    const std::vector<FileMetaData*>& files,
+    const Slice* smallest_user_key,
+    const Slice* largest_user_key) {
+  const Comparator* ucmp = icmp.user_comparator();
+  if (!disjoint_sorted_files) {
+    // Need to check against all files
+    for (size_t i = 0; i < files.size(); i++) {
+      const FileMetaData* f = files[i];
+      if (AfterFile(ucmp, smallest_user_key, f) ||
+          BeforeFile(ucmp, largest_user_key, f)) {
+        // No overlap
+      } else {
+        return true;  // Overlap
+      }
+    }
+    return false;
+  }
+
+  // Binary search over file list
+  uint32_t index = 0;
+  if (smallest_user_key != nullptr) {
+    // Find the earliest possible internal key for smallest_user_key
+    InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek);
+    index = FindFile(icmp, files, small.Encode());
+  }
+
+  if (index >= files.size()) {
+    // beginning of range is after all files, so no overlap.
+    return false;
+  }
+
+  return !BeforeFile(ucmp, largest_user_key, files[index]);
+}
+
+// An internal iterator.  For a given version/level pair, yields
+// information about the files in the level.  For a given entry, key()
+// is the largest key that occurs in the file, and value() is an
+// 16-byte value containing the file number and file size, both
+// encoded using EncodeFixed64.
+class Version::LevelFileNumIterator : public Iterator {
+ public:
+  LevelFileNumIterator(const InternalKeyComparator& icmp,
+                       const std::vector<FileMetaData*>* flist)
+      : icmp_(icmp),
+        flist_(flist),
+        index_(flist->size()) {        // Marks as invalid
+  }
+  virtual bool Valid() const {
+    return index_ < flist_->size();
+  }
+  virtual void Seek(const Slice& target) {
+    index_ = FindFile(icmp_, *flist_, target);
+  }
+  virtual void SeekToFirst() { index_ = 0; }
+  virtual void SeekToLast() {
+    index_ = flist_->empty() ? 0 : flist_->size() - 1;
+  }
+  virtual void Next() {
+    assert(Valid());
+    index_++;
+  }
+  virtual void Prev() {
+    assert(Valid());
+    if (index_ == 0) {
+      index_ = flist_->size();  // Marks as invalid
+    } else {
+      index_--;
+    }
+  }
+  Slice key() const {
+    assert(Valid());
+    return (*flist_)[index_]->largest.Encode();
+  }
+  Slice value() const {
+    assert(Valid());
+    EncodeFixed64(value_buf_, (*flist_)[index_]->number);
+    EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size);
+    return Slice(value_buf_, sizeof(value_buf_));
+  }
+  virtual Status status() const { return Status::OK(); }
+ private:
+  const InternalKeyComparator icmp_;
+  const std::vector<FileMetaData*>* const flist_;
+  uint32_t index_;
+
+  // Backing store for value().  Holds the file number and size.
+  mutable char value_buf_[16];
+};
+
+static Iterator* GetFileIterator(void* arg,
+                                 const ReadOptions& options,
+                                 const EnvOptions& soptions,
+                                 const Slice& file_value,
+                                 bool for_compaction) {
+  TableCache* cache = reinterpret_cast<TableCache*>(arg);
+  if (file_value.size() != 16) {
+    return NewErrorIterator(
+        Status::Corruption("FileReader invoked with unexpected value"));
+  } else {
+    ReadOptions options_copy;
+    if (options.prefix) {
+      // suppress prefix filtering since we have already checked the
+      // filters once at this point
+      options_copy = options;
+      options_copy.prefix = nullptr;
+    }
+    return cache->NewIterator(options.prefix ? options_copy : options,
+                              soptions,
+                              DecodeFixed64(file_value.data()),
+                              DecodeFixed64(file_value.data() + 8),
+                              nullptr /* don't need reference to table*/,
+                              for_compaction);
+  }
+}
+
+bool Version::PrefixMayMatch(const ReadOptions& options,
+                             const EnvOptions& soptions,
+                             const Slice& internal_prefix,
+                             Iterator* level_iter) const {
+  bool may_match = true;
+  level_iter->Seek(internal_prefix);
+  if (!level_iter->Valid()) {
+    // we're past end of level
+    may_match = false;
+  } else if (ExtractUserKey(level_iter->key()).starts_with(
+                                             ExtractUserKey(internal_prefix))) {
+    // TODO(tylerharter): do we need this case?  Or are we guaranteed
+    // key() will always be the biggest value for this SST?
+    may_match = true;
+  } else {
+    may_match = vset_->table_cache_->PrefixMayMatch(
+                           options,
+                           DecodeFixed64(level_iter->value().data()),
+                           DecodeFixed64(level_iter->value().data() + 8),
+                           internal_prefix, nullptr);
+  }
+  return may_match;
+}
+
+Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
+                                            const EnvOptions& soptions,
+                                            int level) const {
+  Iterator* level_iter = new LevelFileNumIterator(vset_->icmp_, &files_[level]);
+  if (options.prefix) {
+    InternalKey internal_prefix(*options.prefix, 0, kTypeValue);
+    if (!PrefixMayMatch(options, soptions,
+                        internal_prefix.Encode(), level_iter)) {
+      delete level_iter;
+      // nothing in this level can match the prefix
+      return NewEmptyIterator();
+    }
+  }
+  return NewTwoLevelIterator(level_iter, &GetFileIterator,
+                             vset_->table_cache_, options, soptions);
+}
+
+void Version::AddIterators(const ReadOptions& options,
+                           const EnvOptions& soptions,
+                           std::vector<Iterator*>* iters) {
+  // Merge all level zero files together since they may overlap
+  for (const FileMetaData* file : files_[0]) {
+    iters->push_back(
+        vset_->table_cache_->NewIterator(
+            options, soptions, file->number, file->file_size));
+  }
+
+  // For levels > 0, we can use a concatenating iterator that sequentially
+  // walks through the non-overlapping files in the level, opening them
+  // lazily.
+  for (int level = 1; level < num_levels_; level++) {
+    if (!files_[level].empty()) {
+      iters->push_back(NewConcatenatingIterator(options, soptions, level));
+    }
+  }
+}
+
+// Callback from TableCache::Get()
+namespace {
+enum SaverState {
+  kNotFound,
+  kFound,
+  kDeleted,
+  kCorrupt,
+  kMerge // saver contains the current merge result (the operands)
+};
+struct Saver {
+  SaverState state;
+  const Comparator* ucmp;
+  Slice user_key;
+  bool* value_found; // Is value set correctly? Used by KeyMayExist
+  std::string* value;
+  const MergeOperator* merge_operator;
+  // the merge operations encountered;
+  MergeContext* merge_context;
+  Logger* logger;
+  bool didIO;    // did we do any disk io?
+  Statistics* statistics;
+};
+}
+
+// Called from TableCache::Get and Table::Get when the file/block in which the
+// key may exist is not present in the TableCache/BlockCache respectively. In
+// this case we can't guarantee that the key does not exist and are not
+// permitted to do IO to be certain. Set status=kFound and value_found=false to
+// let the caller know that the key may exist but is not there in memory.
+static void MarkKeyMayExist(void* arg) {
+  Saver* s = reinterpret_cast<Saver*>(arg);
+  s->state = kFound;
+  if (s->value_found != nullptr) {
+    *(s->value_found) = false;
+  }
+}
+
+static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){
+  Saver* s = reinterpret_cast<Saver*>(arg);
+  MergeContext* merge_contex = s->merge_context;
+  std::string merge_result;  // temporary area for merge results later
+
+  assert(s != nullptr && merge_contex != nullptr);
+
+  ParsedInternalKey parsed_key;
+  // TODO: didIO and Merge?
+  s->didIO = didIO;
+  if (!ParseInternalKey(ikey, &parsed_key)) {
+    // TODO: what about corrupt during Merge?
+    s->state = kCorrupt;
+  } else {
+    if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
+      // Key matches. Process it
+      switch (parsed_key.type) {
+        case kTypeValue:
+          if (kNotFound == s->state) {
+            s->state = kFound;
+            s->value->assign(v.data(), v.size());
+          } else if (kMerge == s->state) {
+            assert(s->merge_operator != nullptr);
+            s->state = kFound;
+            if (!s->merge_operator->FullMerge(s->user_key, &v,
+                                              merge_contex->GetOperands(),
+                                              s->value, s->logger)) {
+              RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
+              s->state = kCorrupt;
+            }
+          } else {
+            assert(false);
+          }
+          return false;
+
+        case kTypeDeletion:
+          if (kNotFound == s->state) {
+            s->state = kDeleted;
+          } else if (kMerge == s->state) {
+            s->state = kFound;
+          if (!s->merge_operator->FullMerge(s->user_key, nullptr,
+                                            merge_contex->GetOperands(),
+                                            s->value, s->logger)) {
+              RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
+              s->state = kCorrupt;
+            }
+          } else {
+            assert(false);
+          }
+          return false;
+
+        case kTypeMerge:
+          assert(s->state == kNotFound || s->state == kMerge);
+          s->state = kMerge;
+          merge_contex->PushOperand(v);
+          while (merge_contex->GetNumOperands() >= 2) {
+            // Attempt to merge operands together via user associative merge
+            if (s->merge_operator->PartialMerge(s->user_key,
+                                                merge_contex->GetOperand(0),
+                                                merge_contex->GetOperand(1),
+                                                &merge_result,
+                                                s->logger)) {
+              merge_contex->PushPartialMergeResult(merge_result);
+            } else {
+              // Associative merge returns false ==> stack the operands
+              break;
+            }
+          }
+          return true;
+
+        case kTypeLogData:
+          assert(false);
+          break;
+      }
+    }
+  }
+
+  // s->state could be Corrupt, merge or notfound
+
+  return false;
+}
+
+// Orders level-0 files so the one with the larger (more recently
+// allocated) file number sorts first.
+static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
+  return b->number < a->number;
+}
+// Orders level-0 files by descending smallest sequence number (used by
+// universal compaction). Debug builds verify that the largest sequence
+// numbers are ordered consistently with the smallest ones.
+static bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) {
+  const bool a_is_newer = (a->smallest_seqno > b->smallest_seqno);
+  if (a_is_newer) {
+    assert(a->largest_seqno > b->largest_seqno);
+  } else {
+    assert(a->largest_seqno <= b->largest_seqno);
+  }
+  return a_is_newer;
+}
+
+// Constructs an empty Version owned by *vset. Until AppendVersion()
+// splices it into the set's list, next_/prev_ point back at this object
+// (a single-element circular list). All per-level bookkeeping containers
+// are sized from the owning VersionSet's configured number of levels.
+Version::Version(VersionSet* vset, uint64_t version_number)
+    : vset_(vset),
+      next_(this),
+      prev_(this),
+      refs_(0),
+      num_levels_(vset->num_levels_),
+      files_(new std::vector<FileMetaData*>[num_levels_]),
+      files_by_size_(num_levels_),
+      next_file_to_compact_by_size_(num_levels_),
+      file_to_compact_(nullptr),
+      file_to_compact_level_(-1),
+      compaction_score_(num_levels_),
+      compaction_level_(num_levels_),
+      version_number_(version_number) {}
+
+// Looks up the key in "k" across this version's files, searching levels
+// from lowest (newest data) upward and, within a level, only files whose
+// user-key range contains the key. On return, *status is OK (found),
+// NotFound, Corruption, or an error propagated from the table cache;
+// *value holds the (possibly merged) result on success. *stats records
+// the first file charged with an extra seek, for the caller to feed to
+// UpdateStats(); merge_context accumulates merge operands across files.
+void Version::Get(const ReadOptions& options,
+                  const LookupKey& k,
+                  std::string* value,
+                  Status* status,
+                  MergeContext* merge_context,
+                  GetStats* stats,
+                  const Options& db_options,
+                  bool* value_found) {
+  Slice ikey = k.internal_key();
+  Slice user_key = k.user_key();
+  const Comparator* ucmp = vset_->icmp_.user_comparator();
+
+  auto merge_operator = db_options.merge_operator.get();
+  auto logger = db_options.info_log;
+
+  assert(status->ok() || status->IsMergeInProgress());
+  Saver saver;
+  // A MergeInProgress status means the memtable lookup already gathered
+  // merge operands, so continue in the kMerge state.
+  saver.state = status->ok()? kNotFound : kMerge;
+  saver.ucmp = ucmp;
+  saver.user_key = user_key;
+  saver.value_found = value_found;
+  saver.value = value;
+  saver.merge_operator = merge_operator;
+  saver.merge_context = merge_context;
+  saver.logger = logger.get();
+  saver.didIO = false;
+  saver.statistics = db_options.statistics.get();
+
+  stats->seek_file = nullptr;
+  stats->seek_file_level = -1;
+  FileMetaData* last_file_read = nullptr;
+  int last_file_read_level = -1;
+
+  // We can search level-by-level since entries never hop across
+  // levels.  Therefore we are guaranteed that if we find data
+  // in a smaller level, later levels are irrelevant (unless we
+  // are MergeInProgress).
+  for (int level = 0; level < num_levels_; level++) {
+    size_t num_files = files_[level].size();
+    if (num_files == 0) continue;
+
+    // Get the list of files to search in this level
+    FileMetaData* const* files = &files_[level][0];
+
+    // Some files may overlap each other. We find
+    // all files that overlap user_key and process them in order from
+    // newest to oldest. In the context of merge-operator,
+    // this can occur at any level. Otherwise, it only occurs
+    // at Level-0 (since Put/Deletes are always compacted into a single entry).
+    uint32_t start_index;
+    if (level == 0) {
+      // On Level-0, we read through all files to check for overlap.
+      start_index = 0;
+    } else {
+      // On Level-n (n>=1), files are sorted.
+      // Binary search to find earliest index whose largest key >= ikey.
+      // We will also stop when the file no longer overlaps ikey
+      start_index = FindFile(vset_->icmp_, files_[level], ikey);
+    }
+
+    // Traverse each relevant file to find the desired key
+#ifndef NDEBUG
+    FileMetaData* prev_file = nullptr;
+#endif
+    for (uint32_t i = start_index; i < num_files; ++i) {
+      FileMetaData* f = files[i];
+      if (ucmp->Compare(user_key, f->smallest.user_key()) < 0 ||
+          ucmp->Compare(user_key, f->largest.user_key()) > 0) {
+        // Only process overlapping files.
+        if (level > 0) {
+          // If on Level-n (n>=1) then the files are sorted.
+          // So we can stop looking when we are past the ikey.
+          break;
+        }
+        // TODO: do we want to check file ranges for level0 files at all?
+        // For new SST format where Get() is fast, we might want to consider
+        // to avoid those two comparisons, if it can filter out too few files.
+        continue;
+      }
+#ifndef NDEBUG
+      // Sanity check to make sure that the files are correctly sorted
+      if (prev_file) {
+        if (level != 0) {
+          int comp_sign = vset_->icmp_.Compare(prev_file->largest, f->smallest);
+          assert(comp_sign < 0);
+        } else {
+          // level == 0, the current file cannot be newer than the previous one.
+          if (vset_->options_->compaction_style == kCompactionStyleUniversal) {
+            assert(!NewestFirstBySeqNo(f, prev_file));
+          } else {
+            assert(!NewestFirst(f, prev_file));
+          }
+        }
+      }
+      prev_file = f;
+#endif
+      bool tableIO = false;
+      *status = vset_->table_cache_->Get(options, f->number, f->file_size,
+                                         ikey, &saver, SaveValue, &tableIO,
+                                         MarkKeyMayExist);
+      // TODO: examine the behavior for corrupted key
+      if (!status->ok()) {
+        return;
+      }
+
+      if (last_file_read != nullptr && stats->seek_file == nullptr) {
+        // We have had more than one seek for this read.  Charge the 1st file.
+        stats->seek_file = last_file_read;
+        stats->seek_file_level = last_file_read_level;
+      }
+
+      // If we did any IO as part of the read, then we remember it because
+      // it is a possible candidate for seek-based compaction. saver.didIO
+      // is true if the block had to be read in from storage and was not
+      // pre-existing in the block cache. Also, if this file was not pre-
+      // existing in the table cache and had to be freshly opened that needed
+      // the index blocks to be read-in, then tableIO is true. One thing
+      // to note is that the index blocks are not part of the block cache.
+      if (saver.didIO || tableIO) {
+        last_file_read = f;
+        last_file_read_level = level;
+      }
+
+      switch (saver.state) {
+        case kNotFound:
+          break;      // Keep searching in other files
+        case kFound:
+          return;
+        case kDeleted:
+          *status = Status::NotFound();  // Use empty error message for speed
+          return;
+        case kCorrupt:
+          *status = Status::Corruption("corrupted key for ", user_key);
+          return;
+        case kMerge:
+          break;
+      }
+    }
+  }
+
+
+  if (kMerge == saver.state) {
+    // merge_operands are in saver and we hit the beginning of the key history
+    // do a final merge of nullptr and operands;
+    if (merge_operator->FullMerge(user_key, nullptr,
+                                  saver.merge_context->GetOperands(),
+                                  value, logger.get())) {
+      *status = Status::OK();
+    } else {
+      RecordTick(db_options.statistics.get(), NUMBER_MERGE_FAILURES);
+      *status = Status::Corruption("could not perform end-of-key merge for ",
+                                   user_key);
+    }
+  } else {
+    *status = Status::NotFound(); // Use an empty error message for speed
+  }
+}
+
+// Charges one seek against the file recorded in "stats". If that file
+// has exhausted its seek allowance and no seek-compaction candidate is
+// registered yet, mark it as the candidate and return true.
+bool Version::UpdateStats(const GetStats& stats) {
+  FileMetaData* seek_file = stats.seek_file;
+  if (seek_file == nullptr) {
+    return false;
+  }
+  seek_file->allowed_seeks--;
+  if (seek_file->allowed_seeks > 0 || file_to_compact_ != nullptr) {
+    return false;
+  }
+  file_to_compact_ = seek_file;
+  file_to_compact_level_ = stats.seek_file_level;
+  return true;
+}
+
+// Takes a reference on this version; callers must balance with Unref().
+void Version::Ref() {
+  refs_ += 1;
+}
+
+// Drops one reference and deletes the version when the count reaches
+// zero. The VersionSet's dummy list head must never be unref'd.
+void Version::Unref() {
+  assert(this != &vset_->dummy_versions_);
+  assert(refs_ >= 1);
+  if (--refs_ == 0) {
+    delete this;
+  }
+}
+
+bool Version::OverlapInLevel(int level,
+                             const Slice* smallest_user_key,
+                             const Slice* largest_user_key) {
+  return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level],
+                               smallest_user_key, largest_user_key);
+}
+
+// Picks the level where a freshly flushed memtable output file should be
+// placed. The file stays at level 0 if it overlaps level 0; otherwise it
+// may be pushed down, up to options->max_mem_compaction_level, stopping
+// as soon as it would overlap the next level or would overlap too many
+// bytes in the level after that.
+int Version::PickLevelForMemTableOutput(
+    const Slice& smallest_user_key,
+    const Slice& largest_user_key) {
+  int level = 0;
+  if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
+    // Push to next level if there is no overlap in next level,
+    // and the #bytes overlapping in the level after that are limited.
+    InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
+    std::vector<FileMetaData*> overlaps;
+    int max_mem_compact_level = vset_->options_->max_mem_compaction_level;
+    while (max_mem_compact_level > 0 && level < max_mem_compact_level) {
+      if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
+        break;
+      }
+      if (level + 2 >= num_levels_) {
+        // No grandparent level exists; take one more step and stop.
+        level++;
+        break;
+      }
+      GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
+      const uint64_t sum = TotalFileSize(overlaps);
+      if (sum > vset_->MaxGrandParentOverlapBytes(level)) {
+        // Pushing further would drag too much data into a later compaction.
+        break;
+      }
+      level++;
+    }
+  }
+
+  return level;
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end].
+// A nullptr "begin"/"end" means unbounded on that side. If hint_index is
+// specified (!= -1), it points to a file already known to be in the
+// overlapping range. If file_index is non-null, *file_index is set to the
+// index of some file in the overlapping range (-1 if none was recorded).
+void Version::GetOverlappingInputs(
+    int level,
+    const InternalKey* begin,
+    const InternalKey* end,
+    std::vector<FileMetaData*>* inputs,
+    int hint_index,
+    int* file_index) {
+  inputs->clear();
+  Slice user_begin, user_end;
+  if (begin != nullptr) {
+    user_begin = begin->user_key();
+  }
+  if (end != nullptr) {
+    user_end = end->user_key();
+  }
+  if (file_index) {
+    *file_index = -1;
+  }
+  const Comparator* user_cmp = vset_->icmp_.user_comparator();
+  if (begin != nullptr && end != nullptr && level > 0) {
+    // Levels > 0 are sorted and disjoint: binary search instead of the
+    // linear scan below.
+    GetOverlappingInputsBinarySearch(level, user_begin, user_end, inputs,
+      hint_index, file_index);
+    return;
+  }
+  for (size_t i = 0; i < files_[level].size(); ) {
+    FileMetaData* f = files_[level][i++];
+    const Slice file_start = f->smallest.user_key();
+    const Slice file_limit = f->largest.user_key();
+    if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) {
+      // "f" is completely before specified range; skip it
+    } else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) {
+      // "f" is completely after specified range; skip it
+    } else {
+      inputs->push_back(f);
+      if (level == 0) {
+        // Level-0 files may overlap each other.  So check if the newly
+        // added file has expanded the range.  If so, restart search.
+        if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) {
+          user_begin = file_start;
+          inputs->clear();
+          i = 0;
+        } else if (end != nullptr
+            && user_cmp->Compare(file_limit, user_end) > 0) {
+          user_end = file_limit;
+          inputs->clear();
+          i = 0;
+        }
+      } else if (file_index) {
+        // Remember the index of the most recent overlapping file found.
+        *file_index = i-1;
+      }
+    }
+  }
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+// Employ binary search to find at least one file that overlaps the
+// specified range. From that file, iterate backwards and
+// forwards to find all overlapping files.
+void Version::GetOverlappingInputsBinarySearch(
+    int level,
+    const Slice& user_begin,
+    const Slice& user_end,
+    std::vector<FileMetaData*>* inputs,
+    int hint_index,
+    int* file_index) {
+  assert(level > 0);
+  int min = 0;
+  int mid = 0;
+  int max = files_[level].size() -1;
+  bool foundOverlap = false;
+  const Comparator* user_cmp = vset_->icmp_.user_comparator();
+
+  // if the caller already knows the index of a file that has overlap,
+  // then we can skip the binary search.
+  if (hint_index != -1) {
+    mid = hint_index;
+    foundOverlap = true;
+  }
+
+  while (!foundOverlap && min <= max) {
+    mid = (min + max)/2;
+    FileMetaData* f = files_[level][mid];
+    const Slice file_start = f->smallest.user_key();
+    const Slice file_limit = f->largest.user_key();
+    if (user_cmp->Compare(file_limit, user_begin) < 0) {
+      min = mid + 1;
+    } else if (user_cmp->Compare(user_end, file_start) < 0) {
+      max = mid - 1;
+    } else {
+      foundOverlap = true;
+      break;
+    }
+  }
+
+  // If there were no overlapping files, return immediately.
+  if (!foundOverlap) {
+    return;
+  }
+  // returns the index where an overlap is found
+  if (file_index) {
+    *file_index = mid;
+  }
+  ExtendOverlappingInputs(level, user_begin, user_end, inputs, mid);
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+// The midIndex specifies the index of at least one file that
+// overlaps the specified range. From that file, iterate backward
+// and forward to find all overlapping files. Files are appended to
+// *inputs in index (key) order.
+void Version::ExtendOverlappingInputs(
+    int level,
+    const Slice& user_begin,
+    const Slice& user_end,
+    std::vector<FileMetaData*>* inputs,
+    unsigned int midIndex) {
+
+  const Comparator* user_cmp = vset_->icmp_.user_comparator();
+#ifndef NDEBUG
+  {
+    // assert that the file at midIndex overlaps with the range
+    assert(midIndex < files_[level].size());
+    FileMetaData* f = files_[level][midIndex];
+    const Slice fstart = f->smallest.user_key();
+    const Slice flimit = f->largest.user_key();
+    if (user_cmp->Compare(fstart, user_begin) >= 0) {
+      assert(user_cmp->Compare(fstart, user_end) <= 0);
+    } else {
+      assert(user_cmp->Compare(flimit, user_begin) >= 0);
+    }
+  }
+#endif
+  // startIndex/endIndex delimit the overlapping run; initialized so an
+  // unsuccessful scan leaves an empty range (start > end).
+  int startIndex = midIndex + 1;
+  int endIndex = midIndex;
+  int count __attribute__((unused)) = 0;  // debug-only file counter
+
+  // check backwards from 'mid' to lower indices
+  for (int i = midIndex; i >= 0 ; i--) {
+    FileMetaData* f = files_[level][i];
+    const Slice file_limit = f->largest.user_key();
+    if (user_cmp->Compare(file_limit, user_begin) >= 0) {
+      startIndex = i;
+      assert((count++, true));  // counts only in debug builds
+    } else {
+      break;
+    }
+  }
+  // check forward from 'mid+1' to higher indices
+  for (unsigned int i = midIndex+1; i < files_[level].size(); i++) {
+    FileMetaData* f = files_[level][i];
+    const Slice file_start = f->smallest.user_key();
+    if (user_cmp->Compare(file_start, user_end) <= 0) {
+      assert((count++, true));
+      endIndex = i;
+    } else {
+      break;
+    }
+  }
+  assert(count == endIndex - startIndex + 1);
+
+  // insert overlapping files into vector
+  for (int i = startIndex; i <= endIndex; i++) {
+    FileMetaData* f = files_[level][i];
+    inputs->push_back(f);
+  }
+}
+
+// Returns true iff the first or last file in inputs contains
+// an overlapping user key to the file "just outside" of it (i.e.
+// just after the last file, or just before the first file)
+// REQUIRES: "*inputs" is a sorted list of non-overlapping files
+bool Version::HasOverlappingUserKey(
+    const std::vector<FileMetaData*>* inputs,
+    int level) {
+
+  // If inputs empty, there is no overlap.
+  // If level == 0, it is assumed that all needed files were already included.
+  if (inputs->empty() || level == 0){
+    return false;
+  }
+
+  const Comparator* user_cmp = vset_->icmp_.user_comparator();
+  const std::vector<FileMetaData*>& files = files_[level];
+  const size_t kNumFiles = files.size();
+
+  // Check the last file in inputs against the file after it
+  size_t last_file = FindFile(vset_->icmp_, files,
+                              inputs->back()->largest.Encode());
+  assert(0 <= last_file && last_file < kNumFiles);  // File should exist!
+  if (last_file < kNumFiles-1) {                    // If not the last file
+    const Slice last_key_in_input = files[last_file]->largest.user_key();
+    const Slice first_key_after = files[last_file+1]->smallest.user_key();
+    if (user_cmp->Compare(last_key_in_input, first_key_after) == 0) {
+      // The last user key in input overlaps with the next file's first key
+      return true;
+    }
+  }
+
+  // Check the first file in inputs against the file just before it
+  size_t first_file = FindFile(vset_->icmp_, files,
+                               inputs->front()->smallest.Encode());
+  assert(0 <= first_file && first_file <= last_file);   // File should exist!
+  if (first_file > 0) {                                 // If not first file
+    const Slice& first_key_in_input = files[first_file]->smallest.user_key();
+    const Slice& last_key_before = files[first_file-1]->largest.user_key();
+    if (user_cmp->Compare(first_key_in_input, last_key_before) == 0) {
+      // The first user key in input overlaps with the previous file's last key
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Returns the combined size, in bytes, of all files at the given level.
+int64_t Version::NumLevelBytes(int level) const {
+  assert(level >= 0 && level < NumberLevels());
+  return TotalFileSize(files_[level]);
+}
+
+const char* Version::LevelSummary(LevelSummaryStorage* scratch) const {
+  int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files[");
+  for (int i = 0; i < NumberLevels(); i++) {
+    int sz = sizeof(scratch->buffer) - len;
+    int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size()));
+    if (ret < 0 || ret >= sz) break;
+    len += ret;
+  }
+  snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]");
+  return scratch->buffer;
+}
+
+// Formats a per-file summary of "level" into "scratch" in the form
+// "files_size[#<number>(seq=<smallest_seqno>,sz=<file_size>,<being_compacted>) ... ]"
+// and returns a pointer to the scratch buffer. Output is truncated if it
+// does not fit.
+const char* Version::LevelFileSummary(FileSummaryStorage* scratch,
+                                      int level) const {
+  int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size[");
+  for (const auto& f : files_[level]) {
+    int sz = sizeof(scratch->buffer) - len;
+    // Casts to unsigned long keep the %lu conversions portable across
+    // platforms with different uint64_t definitions.
+    int ret = snprintf(scratch->buffer + len, sz,
+                       "#%lu(seq=%lu,sz=%lu,%lu) ",
+                       (unsigned long)f->number,
+                       (unsigned long)f->smallest_seqno,
+                       (unsigned long)f->file_size,
+                       (unsigned long)f->being_compacted);
+    if (ret < 0 || ret >= sz)
+      break;
+    len += ret;
+  }
+  snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]");
+  return scratch->buffer;
+}
+
+int64_t Version::MaxNextLevelOverlappingBytes() {
+  uint64_t result = 0;
+  std::vector<FileMetaData*> overlaps;
+  for (int level = 1; level < NumberLevels() - 1; level++) {
+    for (const auto& f : files_[level]) {
+      GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps);
+      const uint64_t sum = TotalFileSize(overlaps);
+      if (sum > result) {
+        result = sum;
+      }
+    }
+  }
+  return result;
+}
+
+// Inserts the file number of every file in this version into "live".
+void Version::AddLiveFiles(std::set<uint64_t>* live) {
+  for (int level = 0; level < NumberLevels(); level++) {
+    for (const auto& f : files_[level]) {
+      live->insert(f->number);
+    }
+  }
+}
+
+// Returns a human-readable dump of the version: for each level, a header
+// line with the level and version number followed by one
+// "<number>:<size>[<smallest> .. <largest>]" line per file. "hex"
+// controls how the internal keys are rendered.
+std::string Version::DebugString(bool hex) const {
+  std::string r;
+  for (int level = 0; level < num_levels_; level++) {
+    // E.g.,
+    //   --- level 1 ---
+    //   17:123['a' .. 'd']
+    //   20:43['e' .. 'g']
+    r.append("--- level ");
+    AppendNumberTo(&r, level);
+    r.append(" --- version# ");
+    AppendNumberTo(&r, version_number_);
+    r.append(" ---\n");
+    const std::vector<FileMetaData*>& files = files_[level];
+    for (size_t i = 0; i < files.size(); i++) {
+      r.push_back(' ');
+      AppendNumberTo(&r, files[i]->number);
+      r.push_back(':');
+      AppendNumberTo(&r, files[i]->file_size);
+      r.append("[");
+      r.append(files[i]->smallest.DebugString(hex));
+      r.append(" .. ");
+      r.append(files[i]->largest.DebugString(hex));
+      r.append("]\n");
+    }
+  }
+  return r;
+}
+
+// This is used to batch writes to the manifest file: each waiting writer
+// queues one of these in VersionSet::manifest_writers_ (see LogAndApply).
+struct VersionSet::ManifestWriter {
+  Status status;      // outcome of the batched write (valid once done)
+  bool done;          // set when this writer's edit has been processed
+  port::CondVar cv;   // waited on until this writer reaches the queue front
+  VersionEdit* edit;  // the edit this writer wants applied
+
+  explicit ManifestWriter(port::Mutex* mu, VersionEdit* e) :
+             done(false), cv(mu), edit(e) {}
+};
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
+class VersionSet::Builder {
+ private:
+  // Helper to sort by v->files_[file_number].smallest
+  struct BySmallestKey {
+    const InternalKeyComparator* internal_comparator;
+
+    bool operator()(FileMetaData* f1, FileMetaData* f2) const {
+      int r = internal_comparator->Compare(f1->smallest, f2->smallest);
+      if (r != 0) {
+        return (r < 0);
+      } else {
+        // Break ties by file number
+        return (f1->number < f2->number);
+      }
+    }
+  };
+
+  typedef std::set<FileMetaData*, BySmallestKey> FileSet;
+  struct LevelState {
+    std::set<uint64_t> deleted_files;
+    FileSet* added_files;
+  };
+
+  VersionSet* vset_;
+  Version* base_;
+  LevelState* levels_;
+
+ public:
+  // Initialize a builder with the files from *base and other info from *vset
+  Builder(VersionSet* vset, Version* base) : vset_(vset), base_(base) {
+    base_->Ref();
+    levels_ = new LevelState[base->NumberLevels()];
+    BySmallestKey cmp;
+    cmp.internal_comparator = &vset_->icmp_;
+    for (int level = 0; level < base->NumberLevels(); level++) {
+      levels_[level].added_files = new FileSet(cmp);
+    }
+  }
+
+  ~Builder() {
+    for (int level = 0; level < base_->NumberLevels(); level++) {
+      const FileSet* added = levels_[level].added_files;
+      std::vector<FileMetaData*> to_unref;
+      to_unref.reserve(added->size());
+      for (FileSet::const_iterator it = added->begin();
+          it != added->end(); ++it) {
+        to_unref.push_back(*it);
+      }
+      delete added;
+      for (uint32_t i = 0; i < to_unref.size(); i++) {
+        FileMetaData* f = to_unref[i];
+        f->refs--;
+        if (f->refs <= 0) {
+          delete f;
+        }
+      }
+    }
+    delete[] levels_;
+    base_->Unref();
+  }
+
+  void CheckConsistency(Version* v) {
+#ifndef NDEBUG
+    for (int level = 0; level < v->NumberLevels(); level++) {
+      // Make sure there is no overlap in levels > 0
+      if (level > 0) {
+        for (uint32_t i = 1; i < v->files_[level].size(); i++) {
+          const InternalKey& prev_end = v->files_[level][i-1]->largest;
+          const InternalKey& this_begin = v->files_[level][i]->smallest;
+          if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
+            fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
+                    prev_end.DebugString().c_str(),
+                    this_begin.DebugString().c_str());
+            abort();
+          }
+        }
+      }
+    }
+#endif
+  }
+
+  void CheckConsistencyForDeletes(VersionEdit* edit, unsigned int number,
+                                  int level) {
+#ifndef NDEBUG
+      // a file to be deleted better exist in the previous version
+      bool found = false;
+      for (int l = 0; !found && l < base_->NumberLevels(); l++) {
+        const std::vector<FileMetaData*>& base_files = base_->files_[l];
+        for (unsigned int i = 0; i < base_files.size(); i++) {
+          FileMetaData* f = base_files[i];
+          if (f->number == number) {
+            found =  true;
+            break;
+          }
+        }
+      }
+      // if the file did not exist in the previous version, then it
+      // is possibly moved from lower level to higher level in current
+      // version
+      for (int l = level+1; !found && l < base_->NumberLevels(); l++) {
+        const FileSet* added = levels_[l].added_files;
+        for (FileSet::const_iterator added_iter = added->begin();
+             added_iter != added->end(); ++added_iter) {
+          FileMetaData* f = *added_iter;
+          if (f->number == number) {
+            found = true;
+            break;
+          }
+        }
+      }
+
+      // maybe this file was added in a previous edit that was Applied
+      if (!found) {
+        const FileSet* added = levels_[level].added_files;
+        for (FileSet::const_iterator added_iter = added->begin();
+             added_iter != added->end(); ++added_iter) {
+          FileMetaData* f = *added_iter;
+          if (f->number == number) {
+            found = true;
+            break;
+          }
+        }
+      }
+      assert(found);
+#endif
+  }
+
+  // Apply all of the edits in *edit to the current state.
+  void Apply(VersionEdit* edit) {
+    CheckConsistency(base_);
+
+    // Update compaction pointers
+    for (size_t i = 0; i < edit->compact_pointers_.size(); i++) {
+      const int level = edit->compact_pointers_[i].first;
+      vset_->compact_pointer_[level] =
+          edit->compact_pointers_[i].second.Encode().ToString();
+    }
+
+    // Delete files
+    const VersionEdit::DeletedFileSet& del = edit->deleted_files_;
+    for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin();
+         iter != del.end();
+         ++iter) {
+      const int level = iter->first;
+      const uint64_t number = iter->second;
+      levels_[level].deleted_files.insert(number);
+      CheckConsistencyForDeletes(edit, number, level);
+    }
+
+    // Add new files
+    for (size_t i = 0; i < edit->new_files_.size(); i++) {
+      const int level = edit->new_files_[i].first;
+      FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
+      f->refs = 1;
+
+      // We arrange to automatically compact this file after
+      // a certain number of seeks.  Let's assume:
+      //   (1) One seek costs 10ms
+      //   (2) Writing or reading 1MB costs 10ms (100MB/s)
+      //   (3) A compaction of 1MB does 25MB of IO:
+      //         1MB read from this level
+      //         10-12MB read from next level (boundaries may be misaligned)
+      //         10-12MB written to next level
+      // This implies that 25 seeks cost the same as the compaction
+      // of 1MB of data.  I.e., one seek costs approximately the
+      // same as the compaction of 40KB of data.  We are a little
+      // conservative and allow approximately one seek for every 16KB
+      // of data before triggering a compaction.
+      f->allowed_seeks = (f->file_size / 16384);
+      if (f->allowed_seeks < 100) f->allowed_seeks = 100;
+
+      levels_[level].deleted_files.erase(f->number);
+      levels_[level].added_files->insert(f);
+    }
+  }
+
+  // Save the current state in *v.
+  void SaveTo(Version* v) {
+    CheckConsistency(base_);
+    CheckConsistency(v);
+    BySmallestKey cmp;
+    cmp.internal_comparator = &vset_->icmp_;
+    for (int level = 0; level < base_->NumberLevels(); level++) {
+      // Merge the set of added files with the set of pre-existing files.
+      // Drop any deleted files.  Store the result in *v.
+      const std::vector<FileMetaData*>& base_files = base_->files_[level];
+      std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin();
+      std::vector<FileMetaData*>::const_iterator base_end = base_files.end();
+      const FileSet* added = levels_[level].added_files;
+      v->files_[level].reserve(base_files.size() + added->size());
+      for (FileSet::const_iterator added_iter = added->begin();
+           added_iter != added->end();
+           ++added_iter) {
+        // Add all smaller files listed in base_
+        for (std::vector<FileMetaData*>::const_iterator bpos
+                 = std::upper_bound(base_iter, base_end, *added_iter, cmp);
+             base_iter != bpos;
+             ++base_iter) {
+          MaybeAddFile(v, level, *base_iter);
+        }
+
+        MaybeAddFile(v, level, *added_iter);
+      }
+
+      // Add remaining base files
+      for (; base_iter != base_end; ++base_iter) {
+        MaybeAddFile(v, level, *base_iter);
+      }
+    }
+
+    CheckConsistency(v);
+  }
+
+  void MaybeAddFile(Version* v, int level, FileMetaData* f) {
+    if (levels_[level].deleted_files.count(f->number) > 0) {
+      // File is deleted: do nothing
+    } else {
+      std::vector<FileMetaData*>* files = &v->files_[level];
+      if (level > 0 && !files->empty()) {
+        // Must not overlap
+        assert(vset_->icmp_.Compare((*files)[files->size()-1]->largest,
+                                    f->smallest) < 0);
+      }
+      f->refs++;
+      files->push_back(f);
+    }
+  }
+};
+
+// Constructs a VersionSet. Init() precomputes the per-level size-limit
+// arrays and AppendVersion() installs an empty initial Version as
+// current. manifest_file_number_ is filled in later by Recover().
+VersionSet::VersionSet(const std::string& dbname, const Options* options,
+                       const EnvOptions& storage_options,
+                       TableCache* table_cache,
+                       const InternalKeyComparator* cmp)
+    : env_(options->env),
+      dbname_(dbname),
+      options_(options),
+      table_cache_(table_cache),
+      icmp_(*cmp),
+      next_file_number_(2),
+      manifest_file_number_(0),  // Filled by Recover()
+      last_sequence_(0),
+      log_number_(0),
+      prev_log_number_(0),
+      num_levels_(options_->num_levels),
+      dummy_versions_(this),
+      current_(nullptr),
+      need_slowdown_for_num_level0_files_(false),
+      compactions_in_progress_(options_->num_levels),
+      current_version_number_(0),
+      manifest_file_size_(0),
+      storage_options_(storage_options),
+      storage_options_compactions_(storage_options_) {
+  compact_pointer_ = new std::string[options_->num_levels];
+  Init(options_->num_levels);
+  AppendVersion(new Version(this, current_version_number_++));
+}
+
+// Releases the reference on the current version, frees any obsolete file
+// metadata, and tears down the arrays allocated in the constructor and
+// Init(). By this point the version list must contain only the dummy
+// head.
+VersionSet::~VersionSet() {
+  current_->Unref();
+  assert(dummy_versions_.next_ == &dummy_versions_);  // List must be empty
+  for (FileMetaData* obsolete : obsolete_files_) {
+    delete obsolete;
+  }
+  obsolete_files_.clear();
+  delete[] compact_pointer_;
+  delete[] max_file_size_;
+  delete[] level_max_bytes_;
+}
+
+// Precomputes, per level, the target max file size and the max total
+// bytes before compaction. Note the branch structure: levels 0 and 1
+// both take the base option values directly (only i > 1 scales the
+// previous level's values by the multipliers), and level 0 under
+// universal compaction gets an unbounded file size.
+void VersionSet::Init(int num_levels) {
+  max_file_size_ = new uint64_t[num_levels];
+  level_max_bytes_ = new uint64_t[num_levels];
+  int target_file_size_multiplier = options_->target_file_size_multiplier;
+  int max_bytes_multiplier = options_->max_bytes_for_level_multiplier;
+  for (int i = 0; i < num_levels; i++) {
+    if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) {
+      max_file_size_[i] = ULLONG_MAX;
+      level_max_bytes_[i] = options_->max_bytes_for_level_base;
+    } else if (i > 1) {
+      max_file_size_[i] = max_file_size_[i-1] * target_file_size_multiplier;
+      level_max_bytes_[i] = level_max_bytes_[i-1] * max_bytes_multiplier *
+        options_->max_bytes_for_level_multiplier_additional[i-1];
+    } else {
+      // Levels 0 and 1 (non-universal) use the base values.
+      max_file_size_[i] = options_->target_file_size_base;
+      level_max_bytes_[i] = options_->max_bytes_for_level_base;
+    }
+  }
+}
+
+// Installs "v" as the new current version: releases the previous
+// current's reference, takes one on "v", recomputes the level-0
+// write-slowdown flag, and splices "v" into the circular version list
+// just before the dummy head.
+void VersionSet::AppendVersion(Version* v) {
+  // Make "v" current
+  assert(v->refs_ == 0);
+  assert(v != current_);
+  if (current_ != nullptr) {
+    assert(current_->refs_ > 0);
+    current_->Unref();
+  }
+  current_ = v;
+  // Writes are slowed down once level-0 accumulates enough files.
+  need_slowdown_for_num_level0_files_ =
+      (options_->level0_slowdown_writes_trigger >= 0 && current_ != nullptr &&
+       v->NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger);
+  v->Ref();
+
+  // Append to linked list
+  v->prev_ = dummy_versions_.prev_;
+  v->next_ = &dummy_versions_;
+  v->prev_->next_ = v;
+  v->next_->prev_ = v;
+}
+
+// Applies "edit" (batched with any queued concurrent edits) to the current
+// version, persists the batch to the MANIFEST log, and installs the
+// resulting version. The caller must hold "mu"; the mutex is released during
+// the expensive MANIFEST write and reacquired afterwards — the writer queue
+// (manifest_writers_) guarantees only one thread executes that section.
+Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu,
+                               bool new_descriptor_log) {
+  mu->AssertHeld();
+
+  // queue our request
+  ManifestWriter w(mu, edit);
+  manifest_writers_.push_back(&w);
+  while (!w.done && &w != manifest_writers_.front()) {
+    w.cv.Wait();
+  }
+  if (w.done) {
+    // Another thread batched and committed our edit on our behalf.
+    return w.status;
+  }
+
+  std::vector<VersionEdit*> batch_edits;
+  Version* v = new Version(this, current_version_number_++);
+  Builder builder(this, current_);
+
+  // process all requests in the queue
+  ManifestWriter* last_writer = &w;
+  assert(!manifest_writers_.empty());
+  assert(manifest_writers_.front() == &w);
+  std::deque<ManifestWriter*>::iterator iter = manifest_writers_.begin();
+  for (; iter != manifest_writers_.end(); ++iter) {
+    last_writer = *iter;
+    LogAndApplyHelper(&builder, v, last_writer->edit, mu);
+    batch_edits.push_back(last_writer->edit);
+  }
+  builder.SaveTo(v);
+
+  // Initialize new descriptor log file if necessary by creating
+  // a temporary file that contains a snapshot of the current version.
+  std::string new_manifest_file;
+  uint64_t new_manifest_file_size = 0;
+  Status s;
+  // we will need this if we are creating new manifest
+  uint64_t old_manifest_file_number = manifest_file_number_;
+
+  //  No need to perform this check if a new Manifest is being created anyways.
+  if (!descriptor_log_ ||
+      manifest_file_size_ > options_->max_manifest_file_size) {
+    new_descriptor_log = true;
+    manifest_file_number_ = NewFileNumber(); // Change manifest file no.
+  }
+
+  if (new_descriptor_log) {
+    new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
+    edit->SetNextFile(next_file_number_);
+  }
+
+  // Unlock during expensive MANIFEST log write. New writes cannot get here
+  // because &w is ensuring that all new writes get queued.
+  {
+    // calculate the amount of data being compacted at every level
+    std::vector<uint64_t> size_being_compacted(v->NumberLevels() - 1);
+    SizeBeingCompacted(size_being_compacted);
+
+    mu->Unlock();
+
+    // This is fine because everything inside of this block is serialized --
+    // only one thread can be here at the same time
+    if (!new_manifest_file.empty()) {
+      unique_ptr<WritableFile> descriptor_file;
+      s = env_->NewWritableFile(new_manifest_file, &descriptor_file,
+                                storage_options_);
+      if (s.ok()) {
+        descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
+        // Seed the fresh manifest with a full snapshot of current state.
+        s = WriteSnapshot(descriptor_log_.get());
+      }
+    }
+
+    // The calls to Finalize and UpdateFilesBySize are cpu-heavy
+    // and is best called outside the mutex.
+    Finalize(v, size_being_compacted);
+    UpdateFilesBySize(v);
+
+    // Write new record to MANIFEST log
+    if (s.ok()) {
+      std::string record;
+      for (unsigned int i = 0; i < batch_edits.size(); i++) {
+        batch_edits[i]->EncodeTo(&record);
+        s = descriptor_log_->AddRecord(record);
+        if (!s.ok()) {
+          break;
+        }
+      }
+      if (s.ok()) {
+        if (options_->use_fsync) {
+          StopWatch sw(env_, options_->statistics.get(),
+                       MANIFEST_FILE_SYNC_MICROS);
+          s = descriptor_log_->file()->Fsync();
+        } else {
+          StopWatch sw(env_, options_->statistics.get(),
+                       MANIFEST_FILE_SYNC_MICROS);
+          s = descriptor_log_->file()->Sync();
+        }
+      }
+      if (!s.ok()) {
+        Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str());
+        // The write may have actually landed despite the error; re-read the
+        // manifest to check, and if the record is there treat it as success
+        // so in-memory and logged state stay consistent.
+        if (ManifestContains(record)) {
+          Log(options_->info_log,
+              "MANIFEST contains log record despite error; advancing to new "
+              "version to prevent mismatch between in-memory and logged state"
+              " If paranoid is set, then the db is now in readonly mode.");
+          s = Status::OK();
+        }
+      }
+    }
+
+    // If we just created a new descriptor file, install it by writing a
+    // new CURRENT file that points to it.
+    if (s.ok() && !new_manifest_file.empty()) {
+      s = SetCurrentFile(env_, dbname_, manifest_file_number_);
+      if (s.ok() && old_manifest_file_number < manifest_file_number_) {
+        // delete old manifest file
+        Log(options_->info_log,
+            "Deleting manifest %lu current manifest %lu\n",
+            (unsigned long)old_manifest_file_number,
+            (unsigned long)manifest_file_number_);
+        // we don't care about an error here, PurgeObsoleteFiles will take care
+        // of it later
+        env_->DeleteFile(DescriptorFileName(dbname_, old_manifest_file_number));
+      }
+    }
+
+    // find offset in manifest file where this version is stored.
+    new_manifest_file_size = descriptor_log_->file()->GetFileSize();
+
+    LogFlush(options_->info_log);
+    mu->Lock();
+  }
+
+  // Install the new version
+  if (s.ok()) {
+    manifest_file_size_ = new_manifest_file_size;
+    AppendVersion(v);
+    log_number_ = edit->log_number_;
+    prev_log_number_ = edit->prev_log_number_;
+
+  } else {
+    Log(options_->info_log, "Error in committing version %lu",
+        (unsigned long)v->GetVersionNumber());
+    delete v;
+    if (!new_manifest_file.empty()) {
+      // Discard the half-written manifest; the previous one stays current.
+      descriptor_log_.reset();
+      env_->DeleteFile(new_manifest_file);
+    }
+  }
+
+  // wake up all the waiting writers
+  while (true) {
+    ManifestWriter* ready = manifest_writers_.front();
+    manifest_writers_.pop_front();
+    if (ready != &w) {
+      ready->status = s;
+      ready->done = true;
+      ready->cv.Signal();
+    }
+    if (ready == last_writer) break;
+  }
+  // Notify new head of write queue
+  if (!manifest_writers_.empty()) {
+    manifest_writers_.front()->cv.Signal();
+  }
+  return s;
+}
+
+// Fills in any fields "edit" is missing (log numbers, next file number,
+// last sequence) from the VersionSet's current state, then applies the edit
+// to "builder". Caller must hold "mu".
+void VersionSet::LogAndApplyHelper(Builder* builder, Version* v,
+                                   VersionEdit* edit, port::Mutex* mu) {
+  mu->AssertHeld();
+
+  if (edit->has_log_number_) {
+    // A caller-supplied log number must be monotonic and already allocated.
+    assert(edit->log_number_ >= log_number_);
+    assert(edit->log_number_ < next_file_number_);
+  } else {
+    edit->SetLogNumber(log_number_);
+  }
+
+  if (!edit->has_prev_log_number_) {
+    edit->SetPrevLogNumber(prev_log_number_);
+  }
+
+  edit->SetNextFile(next_file_number_);
+  edit->SetLastSequence(last_sequence_);
+
+  builder->Apply(edit);
+}
+
+// Rebuilds the VersionSet's state from disk: reads CURRENT to find the
+// active MANIFEST, replays every VersionEdit record in it through a Builder,
+// validates that the mandatory metadata fields were seen, and installs the
+// reconstructed version. Returns a non-OK status on any corruption or
+// incompatibility (too many levels, mismatched comparator).
+Status VersionSet::Recover() {
+  struct LogReporter : public log::Reader::Reporter {
+    Status* status;
+    // Record only the first corruption encountered.
+    virtual void Corruption(size_t bytes, const Status& s) {
+      if (this->status->ok()) *this->status = s;
+    }
+  };
+
+  // Read "CURRENT" file, which contains a pointer to the current manifest file
+  std::string current;
+  Status s = ReadFileToString(env_, CurrentFileName(dbname_), &current);
+  if (!s.ok()) {
+    return s;
+  }
+  if (current.empty() || current[current.size()-1] != '\n') {
+    return Status::Corruption("CURRENT file does not end with newline");
+  }
+  current.resize(current.size() - 1);
+
+  Log(options_->info_log, "Recovering from manifest file:%s\n",
+      current.c_str());
+
+  std::string dscname = dbname_ + "/" + current;
+  unique_ptr<SequentialFile> file;
+  s = env_->NewSequentialFile(dscname, &file, storage_options_);
+  if (!s.ok()) {
+    return s;
+  }
+  uint64_t manifest_file_size;
+  s = env_->GetFileSize(dscname, &manifest_file_size);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Track which mandatory metadata fields appear while replaying edits.
+  bool have_log_number = false;
+  bool have_prev_log_number = false;
+  bool have_next_file = false;
+  bool have_last_sequence = false;
+  uint64_t next_file = 0;
+  uint64_t last_sequence = 0;
+  uint64_t log_number = 0;
+  uint64_t prev_log_number = 0;
+  Builder builder(this, current_);
+
+  {
+    LogReporter reporter;
+    reporter.status = &s;
+    log::Reader reader(std::move(file), &reporter, true/*checksum*/,
+                       0/*initial_offset*/);
+    Slice record;
+    std::string scratch;
+    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
+      VersionEdit edit;
+      s = edit.DecodeFrom(record);
+      if (!s.ok()) {
+        break;
+      }
+
+      if (edit.max_level_ >= current_->NumberLevels()) {
+        s = Status::InvalidArgument(
+            "db has more levels than options.num_levels");
+        break;
+      }
+
+      if (edit.has_comparator_ &&
+          edit.comparator_ != icmp_.user_comparator()->Name()) {
+        s = Status::InvalidArgument(icmp_.user_comparator()->Name(),
+            "does not match existing comparator " +
+            edit.comparator_);
+        break;
+      }
+
+      builder.Apply(&edit);
+
+      // Later records override earlier ones for these scalar fields.
+      if (edit.has_log_number_) {
+        log_number = edit.log_number_;
+        have_log_number = true;
+      }
+
+      if (edit.has_prev_log_number_) {
+        prev_log_number = edit.prev_log_number_;
+        have_prev_log_number = true;
+      }
+
+      if (edit.has_next_file_number_) {
+        next_file = edit.next_file_number_;
+        have_next_file = true;
+      }
+
+      if (edit.has_last_sequence_) {
+        last_sequence = edit.last_sequence_;
+        have_last_sequence = true;
+      }
+    }
+  }
+  file.reset();
+
+  if (s.ok()) {
+    if (!have_next_file) {
+      s = Status::Corruption("no meta-nextfile entry in descriptor");
+    } else if (!have_log_number) {
+      s = Status::Corruption("no meta-lognumber entry in descriptor");
+    } else if (!have_last_sequence) {
+      s = Status::Corruption("no last-sequence-number entry in descriptor");
+    }
+
+    if (!have_prev_log_number) {
+      prev_log_number = 0;
+    }
+
+    MarkFileNumberUsed(prev_log_number);
+    MarkFileNumberUsed(log_number);
+  }
+
+  if (s.ok()) {
+    Version* v = new Version(this, current_version_number_++);
+    builder.SaveTo(v);
+
+    // Install recovered version
+    std::vector<uint64_t> size_being_compacted(v->NumberLevels() - 1);
+    SizeBeingCompacted(size_being_compacted);
+    Finalize(v, size_being_compacted);
+
+    manifest_file_size_ = manifest_file_size;
+    AppendVersion(v);
+    manifest_file_number_ = next_file;
+    next_file_number_ = next_file + 1;
+    last_sequence_ = last_sequence;
+    log_number_ = log_number;
+    prev_log_number_ = prev_log_number;
+
+    Log(options_->info_log, "Recovered from manifest file:%s succeeded,"
+        "manifest_file_number is %lu, next_file_number is %lu, "
+        "last_sequence is %lu, log_number is %lu,"
+        "prev_log_number is %lu\n",
+        current.c_str(),
+        (unsigned long)manifest_file_number_,
+        (unsigned long)next_file_number_,
+        (unsigned long)last_sequence_,
+        (unsigned long)log_number_,
+        (unsigned long)prev_log_number_);
+  }
+
+  return s;
+}
+
+// Debugging aid (used by the ldb/dump tools): replays an explicitly named
+// manifest file, optionally printing every edit ("verbose"; keys rendered in
+// hex when "hex" is set), then installs the resulting version and prints a
+// summary. Mirrors Recover() except that errors are also echoed to stdout
+// and the new version is numbered 0.
+Status VersionSet::DumpManifest(Options& options, std::string& dscname,
+                                bool verbose, bool hex) {
+  struct LogReporter : public log::Reader::Reporter {
+    Status* status;
+    // Record only the first corruption encountered.
+    virtual void Corruption(size_t bytes, const Status& s) {
+      if (this->status->ok()) *this->status = s;
+    }
+  };
+
+  // Open the specified manifest file.
+  unique_ptr<SequentialFile> file;
+  Status s = options.env->NewSequentialFile(dscname, &file, storage_options_);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Track which mandatory metadata fields appear while replaying edits.
+  bool have_log_number = false;
+  bool have_prev_log_number = false;
+  bool have_next_file = false;
+  bool have_last_sequence = false;
+  uint64_t next_file = 0;
+  uint64_t last_sequence = 0;
+  uint64_t log_number = 0;
+  uint64_t prev_log_number = 0;
+  int count = 0;
+  VersionSet::Builder builder(this, current_);
+
+  {
+    LogReporter reporter;
+    reporter.status = &s;
+    log::Reader reader(std::move(file), &reporter, true/*checksum*/,
+                       0/*initial_offset*/);
+    Slice record;
+    std::string scratch;
+    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
+      VersionEdit edit;
+      s = edit.DecodeFrom(record);
+      if (s.ok()) {
+        if (edit.has_comparator_ &&
+            edit.comparator_ != icmp_.user_comparator()->Name()) {
+          s = Status::InvalidArgument(icmp_.user_comparator()->Name(),
+                                      "does not match existing comparator " +
+                                      edit.comparator_);
+        }
+      }
+
+      // Write out each individual edit
+      if (verbose) {
+        printf("*************************Edit[%d] = %s\n",
+                count, edit.DebugString(hex).c_str());
+      }
+      count++;
+
+      if (s.ok()) {
+        builder.Apply(&edit);
+      }
+
+      // Later records override earlier ones for these scalar fields.
+      if (edit.has_log_number_) {
+        log_number = edit.log_number_;
+        have_log_number = true;
+      }
+
+      if (edit.has_prev_log_number_) {
+        prev_log_number = edit.prev_log_number_;
+        have_prev_log_number = true;
+      }
+
+      if (edit.has_next_file_number_) {
+        next_file = edit.next_file_number_;
+        have_next_file = true;
+      }
+
+      if (edit.has_last_sequence_) {
+        last_sequence = edit.last_sequence_;
+        have_last_sequence = true;
+      }
+    }
+  }
+  file.reset();
+
+  if (s.ok()) {
+    if (!have_next_file) {
+      s = Status::Corruption("no meta-nextfile entry in descriptor");
+      printf("no meta-nextfile entry in descriptor");
+    } else if (!have_log_number) {
+      s = Status::Corruption("no meta-lognumber entry in descriptor");
+      printf("no meta-lognumber entry in descriptor");
+    } else if (!have_last_sequence) {
+      printf("no last-sequence-number entry in descriptor");
+      s = Status::Corruption("no last-sequence-number entry in descriptor");
+    }
+
+    if (!have_prev_log_number) {
+      prev_log_number = 0;
+    }
+
+    MarkFileNumberUsed(prev_log_number);
+    MarkFileNumberUsed(log_number);
+  }
+
+  if (s.ok()) {
+    Version* v = new Version(this, 0);
+    builder.SaveTo(v);
+
+    // Install recovered version
+    std::vector<uint64_t> size_being_compacted(v->NumberLevels() - 1);
+    SizeBeingCompacted(size_being_compacted);
+    Finalize(v, size_being_compacted);
+
+    AppendVersion(v);
+    manifest_file_number_ = next_file;
+    next_file_number_ = next_file + 1;
+    last_sequence_ = last_sequence;
+    log_number_ = log_number;
+    prev_log_number_ = prev_log_number;
+
+    printf("manifest_file_number %lu next_file_number %lu last_sequence "
+           "%lu log_number %lu  prev_log_number %lu\n",
+           (unsigned long)manifest_file_number_,
+           (unsigned long)next_file_number_,
+           (unsigned long)last_sequence,
+           (unsigned long)log_number,
+           (unsigned long)prev_log_number);
+    printf("%s \n", v->DebugString(hex).c_str());
+  }
+
+  return s;
+}
+
+// Records that "number" has been handed out as a file number, bumping the
+// allocator past it so the same number is never reused.
+void VersionSet::MarkFileNumberUsed(uint64_t number) {
+  if (number >= next_file_number_) {
+    next_file_number_ = number + 1;
+  }
+}
+
+// Computes compaction scores for every level of "v" and sorts the levels by
+// score (highest first) into v->compaction_level_/compaction_score_. Level 0
+// is scored by file count; other levels by bytes relative to their target,
+// excluding bytes already being compacted. Also pre-sorts level-0 files for
+// Get(). Called outside the mutex because it is CPU-heavy.
+void VersionSet::Finalize(Version* v,
+                          std::vector<uint64_t>& size_being_compacted) {
+  // Pre-sort level0 for Get()
+  if (options_->compaction_style == kCompactionStyleUniversal) {
+    std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo);
+  } else {
+    std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst);
+  }
+
+  double max_score = 0;
+  int max_score_level = 0;
+
+  // Universal compaction only ever compacts level 0.
+  int num_levels_to_check =
+      (options_->compaction_style != kCompactionStyleUniversal) ?
+          v->NumberLevels() - 1 : 1;
+
+  for (int level = 0; level < num_levels_to_check; level++) {
+
+    double score;
+    if (level == 0) {
+      // We treat level-0 specially by bounding the number of files
+      // instead of number of bytes for two reasons:
+      //
+      // (1) With larger write-buffer sizes, it is nice not to do too
+      // many level-0 compactions.
+      //
+      // (2) The files in level-0 are merged on every read and
+      // therefore we wish to avoid too many files when the individual
+      // file size is small (perhaps because of a small write-buffer
+      // setting, or very high compression ratios, or lots of
+      // overwrites/deletions).
+      int numfiles = 0;
+      for (unsigned int i = 0; i < v->files_[level].size(); i++) {
+        if (!v->files_[level][i]->being_compacted) {
+          numfiles++;
+        }
+      }
+
+      // If we are slowing down writes, then we better compact that first
+      // (the sentinel scores 1000000/10000 dwarf any byte-ratio score).
+      if (numfiles >= options_->level0_stop_writes_trigger) {
+        score = 1000000;
+        // Log(options_->info_log, "XXX score l0 = 1000000000 max");
+      } else if (numfiles >= options_->level0_slowdown_writes_trigger) {
+        score = 10000;
+        // Log(options_->info_log, "XXX score l0 = 1000000 medium");
+      } else {
+        score = numfiles /
+          static_cast<double>(options_->level0_file_num_compaction_trigger);
+        if (score >= 1) {
+          // Log(options_->info_log, "XXX score l0 = %d least", (int)score);
+        }
+      }
+    } else {
+      // Compute the ratio of current size to size limit.
+      const uint64_t level_bytes = TotalFileSize(v->files_[level]) -
+                                   size_being_compacted[level];
+      score = static_cast<double>(level_bytes) / MaxBytesForLevel(level);
+      if (score > 1) {
+        // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score);
+      }
+      if (max_score < score) {
+        max_score = score;
+        max_score_level = level;
+      }
+    }
+    v->compaction_level_[level] = level;
+    v->compaction_score_[level] = score;
+  }
+
+  // update the max compaction score in levels 1 to n-1
+  v->max_compaction_score_ = max_score;
+  v->max_compaction_score_level_ = max_score_level;
+
+  // sort all the levels based on their score. Higher scores get listed
+  // first. Use bubble sort because the number of entries are small.
+  for (int i = 0; i < v->NumberLevels() - 2; i++) {
+    for (int j = i + 1; j < v->NumberLevels() - 1; j++) {
+      if (v->compaction_score_[i] < v->compaction_score_[j]) {
+        double score = v->compaction_score_[i];
+        int level = v->compaction_level_[i];
+        v->compaction_score_[i] = v->compaction_score_[j];
+        v->compaction_level_[i] = v->compaction_level_[j];
+        v->compaction_score_[j] = score;
+        v->compaction_level_[j] = level;
+      }
+    }
+  }
+}
+
+// A static compator used to sort files based on their size
+// In normal mode: descending size
+// Comparator ordering VersionSet::Fsize entries by on-disk file size,
+// largest first. Used for level-style compaction candidate ordering.
+static bool compareSizeDescending(const VersionSet::Fsize& first,
+  const VersionSet::Fsize& second) {
+  const auto lhs_size = first.file->file_size;
+  const auto rhs_size = second.file->file_size;
+  return lhs_size > rhs_size;
+}
+// A static compator used to sort files based on their seqno
+// In universal style : descending seqno
+// Comparator ordering VersionSet::Fsize entries by sequence number,
+// newest first. Used for universal-style compaction. Asserts that the
+// files' seqno ranges do not interleave.
+static bool compareSeqnoDescending(const VersionSet::Fsize& first,
+  const VersionSet::Fsize& second) {
+  const bool first_is_newer =
+      first.file->smallest_seqno > second.file->smallest_seqno;
+  if (first_is_newer) {
+    assert(first.file->largest_seqno > second.file->largest_seqno);
+  } else {
+    assert(first.file->largest_seqno <= second.file->largest_seqno);
+  }
+  return first_is_newer;
+}
+
+// sort all files in level1 to level(n-1) based on file size
+void VersionSet::UpdateFilesBySize(Version* v) {
+
+  // No need to sort the highest level because it is never compacted.
+  int max_level = (options_->compaction_style == kCompactionStyleUniversal)
+                      ? v->NumberLevels()
+                      : v->NumberLevels() - 1;
+
+  for (int level = 0; level < max_level; level++) {
+
+    const std::vector<FileMetaData*>& files = v->files_[level];
+    std::vector<int>& files_by_size = v->files_by_size_[level];
+    assert(files_by_size.size() == 0);
+
+    // populate a temp vector for sorting based on size
+    std::vector<Fsize> temp(files.size());
+    for (unsigned int i = 0; i < files.size(); i++) {
+      temp[i].index = i;
+      temp[i].file = files[i];
+    }
+
+    // sort the top number_of_files_to_sort_ based on file size
+    if (options_->compaction_style == kCompactionStyleUniversal) {
+      int num = temp.size();
+      std::partial_sort(temp.begin(),  temp.begin() + num,
+                        temp.end(), compareSeqnoDescending);
+    } else {
+      int num = Version::number_of_files_to_sort_;
+      if (num > (int)temp.size()) {
+        num = temp.size();
+      }
+      std::partial_sort(temp.begin(),  temp.begin() + num,
+                        temp.end(), compareSizeDescending);
+    }
+    assert(temp.size() == files.size());
+
+    // initialize files_by_size_
+    for (unsigned int i = 0; i < temp.size(); i++) {
+      files_by_size.push_back(temp[i].index);
+    }
+    v->next_file_to_compact_by_size_[level] = 0;
+    assert(v->files_[level].size() == v->files_by_size_[level].size());
+  }
+}
+
+// Serializes the complete current state (comparator name, per-level compact
+// pointers, and every live file in current_) into a single VersionEdit
+// record and appends it to "log". Used to seed a freshly created MANIFEST.
+Status VersionSet::WriteSnapshot(log::Writer* log) {
+  // TODO: Break up into multiple records to reduce memory usage on recovery?
+
+  // Save metadata
+  VersionEdit edit;
+  edit.SetComparatorName(icmp_.user_comparator()->Name());
+
+  // Save compaction pointers
+  for (int level = 0; level < NumberLevels(); level++) {
+    if (!compact_pointer_[level].empty()) {
+      InternalKey key;
+      key.DecodeFrom(compact_pointer_[level]);
+      edit.SetCompactPointer(level, key);
+    }
+  }
+
+  // Save files
+  for (int level = 0; level < current_->NumberLevels(); level++) {
+    const std::vector<FileMetaData*>& files = current_->files_[level];
+    for (size_t i = 0; i < files.size(); i++) {
+      const FileMetaData* f = files[i];
+      edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest,
+                   f->smallest_seqno, f->largest_seqno);
+    }
+  }
+
+  std::string record;
+  edit.EncodeTo(&record);
+  return log->AddRecord(record);
+}
+
+// Opens the mainfest file and reads all records
+// till it finds the record we are looking for.
+// Opens the mainfest file and reads all records
+// till it finds the record we are looking for.
+// Used after a MANIFEST write error to determine whether the record
+// actually made it to disk; returns false on any read failure.
+bool VersionSet::ManifestContains(const std::string& record) const {
+  std::string fname = DescriptorFileName(dbname_, manifest_file_number_);
+  Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str());
+  unique_ptr<SequentialFile> file;
+  Status s = env_->NewSequentialFile(fname, &file, storage_options_);
+  if (!s.ok()) {
+    Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str());
+    Log(options_->info_log,
+        "ManifestContains: is unable to reopen the manifest file  %s",
+        fname.c_str());
+    return false;
+  }
+  // No reporter: a corrupt tail simply ends the scan without a match.
+  log::Reader reader(std::move(file), nullptr, true/*checksum*/, 0);
+  Slice r;
+  std::string scratch;
+  bool result = false;
+  while (reader.ReadRecord(&r, &scratch)) {
+    if (r == Slice(record)) {
+      result = true;
+      break;
+    }
+  }
+  Log(options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0);
+  return result;
+}
+
+
+// Returns an approximate byte offset of "ikey" within version "v": the sum
+// of the sizes of all files wholly before the key, plus the estimated offset
+// of the key inside any file whose range straddles it.
+uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
+  uint64_t result = 0;
+  for (int level = 0; level < v->NumberLevels(); level++) {
+    const std::vector<FileMetaData*>& files = v->files_[level];
+    for (size_t i = 0; i < files.size(); i++) {
+      if (icmp_.Compare(files[i]->largest, ikey) <= 0) {
+        // Entire file is before "ikey", so just add the file size
+        result += files[i]->file_size;
+      } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) {
+        // Entire file is after "ikey", so ignore
+        if (level > 0) {
+          // Files other than level 0 are sorted by meta->smallest, so
+          // no further files in this level will contain data for
+          // "ikey".
+          break;
+        }
+      } else {
+        // "ikey" falls in the range for this table.  Add the
+        // approximate offset of "ikey" within the table.
+        TableReader* table_reader_ptr;
+        // The iterator is only created to pin the table in the cache while
+        // we query the reader; it is discarded immediately.
+        Iterator* iter = table_cache_->NewIterator(
+            ReadOptions(), storage_options_, files[i]->number,
+            files[i]->file_size, &table_reader_ptr);
+        if (table_reader_ptr != nullptr) {
+          result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode());
+        }
+        delete iter;
+      }
+    }
+  }
+  return result;
+}
+
+// Appends the file numbers of every table referenced by ANY live version
+// (not just current_) to "live_list", so garbage collection never deletes a
+// file still visible to an in-flight iterator or snapshot.
+void VersionSet::AddLiveFiles(std::vector<uint64_t>* live_list) {
+  // pre-calculate space requirement
+  int64_t total_files = 0;
+  for (Version* v = dummy_versions_.next_;
+       v != &dummy_versions_;
+       v = v->next_) {
+    for (int level = 0; level < v->NumberLevels(); level++) {
+      total_files += v->files_[level].size();
+    }
+  }
+
+  // just one time extension to the right size
+  live_list->reserve(live_list->size() + total_files);
+
+  for (Version* v = dummy_versions_.next_;
+       v != &dummy_versions_;
+       v = v->next_) {
+    for (int level = 0; level < v->NumberLevels(); level++) {
+      for (const auto& f : v->files_[level]) {
+        live_list->push_back(f->number);
+      }
+    }
+  }
+}
+
+// Stores the minimal range that covers all entries in inputs in
+// *smallest, *largest.
+// REQUIRES: inputs is not empty
+// Stores the minimal range that covers all entries in inputs in
+// *smallest, *largest.
+// REQUIRES: inputs is not empty
+void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs,
+                          InternalKey* smallest,
+                          InternalKey* largest) {
+  assert(!inputs.empty());
+  smallest->Clear();
+  largest->Clear();
+  for (size_t i = 0; i < inputs.size(); i++) {
+    FileMetaData* f = inputs[i];
+    if (i == 0) {
+      // Seed the range from the first file.
+      *smallest = f->smallest;
+      *largest = f->largest;
+    } else {
+      // Widen the range to include this file's bounds.
+      if (icmp_.Compare(f->smallest, *smallest) < 0) {
+        *smallest = f->smallest;
+      }
+      if (icmp_.Compare(f->largest, *largest) > 0) {
+        *largest = f->largest;
+      }
+    }
+  }
+}
+
+// Stores the minimal range that covers all entries in inputs1 and inputs2
+// in *smallest, *largest.
+// REQUIRES: inputs is not empty
+void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1,
+                           const std::vector<FileMetaData*>& inputs2,
+                           InternalKey* smallest,
+                           InternalKey* largest) {
+  std::vector<FileMetaData*> all = inputs1;
+  all.insert(all.end(), inputs2.begin(), inputs2.end());
+  GetRange(all, smallest, largest);
+}
+
+// Builds the merged read iterator over all of compaction "c"'s input files:
+// one table iterator per level-0 file (their ranges may overlap) plus one
+// concatenating iterator per non-zero input level. Caller owns the returned
+// iterator.
+Iterator* VersionSet::MakeInputIterator(Compaction* c) {
+  ReadOptions options;
+  options.verify_checksums = options_->paranoid_checks;
+  // Compaction reads each block once; don't pollute the block cache.
+  options.fill_cache = false;
+
+  // Level-0 files have to be merged together.  For other levels,
+  // we will make a concatenating iterator per level.
+  // TODO(opt): use concatenating iterator for level-0 if there is no overlap
+  const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2);
+  Iterator** list = new Iterator*[space];
+  int num = 0;
+  for (int which = 0; which < 2; which++) {
+    if (!c->inputs_[which].empty()) {
+      if (c->level() + which == 0) {
+        const std::vector<FileMetaData*>& files = c->inputs_[which];
+        for (size_t i = 0; i < files.size(); i++) {
+          list[num++] = table_cache_->NewIterator(
+              options, storage_options_compactions_,
+              files[i]->number, files[i]->file_size, nullptr,
+              true /* for compaction */);
+        }
+      } else {
+        // Create concatenating iterator for the files from this level
+        list[num++] = NewTwoLevelIterator(
+            new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]),
+            &GetFileIterator, table_cache_, options, storage_options_,
+            true /* for compaction */);
+      }
+    }
+  }
+  assert(num <= space);
+  // The merging iterator takes ownership of the child iterators; only the
+  // temporary pointer array is freed here.
+  Iterator* result = NewMergingIterator(&icmp_, list, num);
+  delete[] list;
+  return result;
+}
+
+// Target total byte size for "level", as precomputed by Init().
+// Note: the result for level zero is not really used since we set
+// the level-0 compaction threshold based on number of files.
+double VersionSet::MaxBytesForLevel(int level) {
+  assert(level >= 0 && level < NumberLevels());
+  return level_max_bytes_[level];
+}
+
+// Upper bound on the size of a single output file at "level", as
+// precomputed by Init().
+uint64_t VersionSet::MaxFileSizeForLevel(int level) {
+  assert(level >= 0 && level < NumberLevels());
+  return max_file_size_[level];
+}
+
+// Byte budget for expanding a compaction's input set at "level":
+// the level's per-file size limit scaled by expanded_compaction_factor.
+uint64_t VersionSet::ExpandedCompactionByteSizeLimit(int level) {
+  const uint64_t base = MaxFileSizeForLevel(level);
+  return base * options_->expanded_compaction_factor;
+}
+
+// Maximum bytes of grandparent (level+2) overlap allowed before a
+// compaction output file is cut: the level's per-file size limit scaled
+// by max_grandparent_overlap_factor.
+uint64_t VersionSet::MaxGrandParentOverlapBytes(int level) {
+  const uint64_t base = MaxFileSizeForLevel(level);
+  return base * options_->max_grandparent_overlap_factor;
+}
+
+// verify that the files listed in this compaction are present
+// in the current version
+// verify that the files listed in this compaction are present
+// in the current version
+// Debug-only sanity check (the body compiles away in NDEBUG builds, where
+// this always returns true).
+bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) {
+#ifndef NDEBUG
+  if (c->input_version_ != current_) {
+    Log(options_->info_log, "VerifyCompactionFileConsistency version mismatch");
+  }
+
+  // verify files in level
+  int level = c->level();
+  for (int i = 0; i < c->num_input_files(0); i++) {
+    uint64_t number = c->input(0,i)->number;
+
+    // look for this file in the current version
+    bool found = false;
+    for (unsigned int j = 0; j < current_->files_[level].size(); j++) {
+      FileMetaData* f = current_->files_[level][j];
+      if (f->number == number) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      return false; // input files non existant in current version
+    }
+  }
+  // verify level+1 files
+  level++;
+  for (int i = 0; i < c->num_input_files(1); i++) {
+    uint64_t number = c->input(1,i)->number;
+
+    // look for this file in the current version
+    bool found = false;
+    for (unsigned int j = 0; j < current_->files_[level].size(); j++) {
+      FileMetaData* f = current_->files_[level][j];
+      if (f->number == number) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      return false; // input files non existant in current version
+    }
+  }
+#endif
+  return true;     // everything good
+}
+
+// Clear all files to indicate that they are not being compacted
+// Delete this compaction from the list of running compactions.
+// Clears the being-compacted flag on all of "c"'s input files and removes
+// "c" from the set of in-flight compactions for its level. If the
+// compaction failed, rewinds the level's compaction cursor so the same
+// files are reconsidered next time.
+void VersionSet::ReleaseCompactionFiles(Compaction* c, Status status) {
+  c->MarkFilesBeingCompacted(false);
+  compactions_in_progress_[c->level()].erase(c);
+  if (status.ok()) {
+    return;
+  }
+  c->ResetNextCompactionIndex();
+}
+
+// The total size of files that are currently being compacted
+// at at every level upto the penultimate level.
+// The total size of files that are currently being compacted
+// at at every level upto the penultimate level.
+// Fills sizes[level] for levels 0..NumberLevels()-2; "sizes" must already
+// be at least that long. Only each compaction's level-"level" inputs
+// (input slot 0) are counted.
+void VersionSet::SizeBeingCompacted(std::vector<uint64_t>& sizes) {
+  for (int level = 0; level < NumberLevels() - 1; level++) {
+    uint64_t total = 0;
+    for (std::set<Compaction*>::iterator it =
+         compactions_in_progress_[level].begin();
+         it != compactions_in_progress_[level].end();
+         ++it) {
+      Compaction* c = (*it);
+      assert(c->level() == level);
+      for (int i = 0; i < c->num_input_files(0); i++) {
+        total += c->input(0,i)->file_size;
+      }
+    }
+    sizes[level] = total;
+  }
+}
+
+//
+// Look at overall size amplification. If size amplification
+// exceeeds the configured value, then do a compaction
+// of the candidate files all the way upto the earliest
+// base file (overrides configured values of file-size ratios,
+// min_merge_width and max_merge_width).
+//
+// Returns a compaction covering all files from the first file that is
+// not already being compacted down to (and including) the earliest
+// file, or nullptr when size amplification is within limits or a
+// candidate file is already being compacted.
+Compaction* VersionSet::PickCompactionUniversalSizeAmp(int level,
+                                                       double score) {
+  assert (level == 0);
+
+  // percentage flexibilty while reducing size amplification
+  uint64_t ratio = options_->compaction_options_universal.
+                     max_size_amplification_percent;
+
+  // The files are sorted from newest first to oldest last.
+  std::vector<int>& file_by_time = current_->files_by_size_[level];
+  assert(file_by_time.size() == current_->files_[level].size());
+
+  unsigned int candidate_count = 0;
+  uint64_t candidate_size = 0;
+  unsigned int start_index = 0;
+  FileMetaData* f = nullptr;
+
+  // Skip files that are already being compacted. The last (earliest)
+  // file is deliberately excluded: it is the size-amplification base.
+  for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) {
+    int index = file_by_time[loop];
+    f = current_->files_[level][index];
+    if (!f->being_compacted) {
+      start_index = loop;         // Consider this as the first candidate.
+      break;
+    }
+    // NOTE(review): the trailing literal is substituted for %s, so the
+    // logged line reads "... compacted  cannot be a candidate ..." --
+    // garbled wording but harmless.
+    Log(options_->info_log, "Universal: skipping file %lu[%d] compacted %s",
+        (unsigned long)f->number,
+        loop,
+        " cannot be a candidate to reduce size amp.\n");
+    f = nullptr;
+  }
+  if (f == nullptr) {
+    return nullptr;             // no candidate files
+  }
+
+  Log(options_->info_log, "Universal: First candidate file %lu[%d] %s",
+      (unsigned long)f->number,
+      start_index,
+      " to reduce size amp.\n");
+
+  // keep adding up all the remaining files, excluding the earliest one.
+  // Any file already being compacted aborts the attempt entirely.
+  for (unsigned int loop = start_index; loop < file_by_time.size() - 1;
+       loop++) {
+    int index = file_by_time[loop];
+    f = current_->files_[level][index];
+    if (f->being_compacted) {
+      Log(options_->info_log,
+          "Universal: Possible candidate file %lu[%d] %s.",
+          (unsigned long)f->number,
+          loop,
+          " is already being compacted. No size amp reduction possible.\n");
+      return nullptr;
+    }
+    candidate_size += f->file_size;
+    candidate_count++;
+  }
+  if (candidate_count == 0) {
+    return nullptr;
+  }
+
+  // size of earliest file
+  int index = file_by_time[file_by_time.size() - 1];
+  uint64_t earliest_file_size = current_->files_[level][index]->file_size;
+
+  // size amplification = percentage of additional size
+  // i.e. compact only when candidate_size / earliest_file_size exceeds
+  // ratio / 100.
+  if (candidate_size * 100 < ratio * earliest_file_size) {
+    Log(options_->info_log,
+        "Universal: size amp not needed. newer-files-total-size %lu "
+        "earliest-file-size %lu",
+        (unsigned long)candidate_size,
+        (unsigned long)earliest_file_size);
+    return nullptr;
+  } else {
+    Log(options_->info_log,
+        "Universal: size amp needed. newer-files-total-size %lu "
+        "earliest-file-size %lu",
+        (unsigned long)candidate_size,
+        (unsigned long)earliest_file_size);
+  }
+  // start_index is unsigned, so the >= 0 half of this assert is vacuous.
+  assert(start_index >= 0 && start_index < file_by_time.size() - 1);
+
+  // create a compaction request
+  // We always compact all the files, so always compress.
+  Compaction* c =
+      new Compaction(current_, level, level, MaxFileSizeForLevel(level),
+                     LLONG_MAX, false, true);
+  c->score_ = score;
+  // Pull in every file from start_index through the earliest file.
+  for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) {
+    int index = file_by_time[loop];
+    f = c->input_version_->files_[level][index];
+    c->inputs_[0].push_back(f);
+    Log(options_->info_log,
+        "Universal: size amp picking file %lu[%d] with size %lu",
+        (unsigned long)f->number,
+        index,
+        (unsigned long)f->file_size);
+  }
+  return c;
+}
+
+//
+// Consider compaction files based on their size differences with
+// the next file in time order.
+//
+// Scans newest-to-oldest for a run of at least min_merge_width
+// consecutive non-compacting files whose sizes satisfy the ratio
+// condition, and compacts up to max_number_of_files_to_compact of them.
+// Returns nullptr if no qualifying run is found.
+Compaction* VersionSet::PickCompactionUniversalReadAmp(
+    int level, double score, unsigned int ratio,
+    unsigned int max_number_of_files_to_compact) {
+
+  unsigned int min_merge_width =
+    options_->compaction_options_universal.min_merge_width;
+  unsigned int max_merge_width =
+    options_->compaction_options_universal.max_merge_width;
+
+  // The files are sorted from newest first to oldest last.
+  std::vector<int>& file_by_time = current_->files_by_size_[level];
+  FileMetaData* f = nullptr;
+  bool done = false;
+  int start_index = 0;
+  unsigned int candidate_count;
+  assert(file_by_time.size() == current_->files_[level].size());
+
+  unsigned int max_files_to_compact = std::min(max_merge_width,
+                                       max_number_of_files_to_compact);
+  min_merge_width = std::max(min_merge_width, 2U);
+
+  // Considers a candidate file only if it is smaller than the
+  // total size accumulated so far.
+  for (unsigned int loop = 0; loop < file_by_time.size(); loop++) {
+
+    candidate_count = 0;
+
+    // Skip files that are already being compacted. Note that this inner
+    // loop advances the *outer* loop counter, so skipped files are never
+    // revisited.
+    for (f = nullptr; loop < file_by_time.size(); loop++) {
+      int index = file_by_time[loop];
+      f = current_->files_[level][index];
+
+      if (!f->being_compacted) {
+        candidate_count = 1;
+        break;
+      }
+      Log(options_->info_log,
+          "Universal: file %lu[%d] being compacted, skipping",
+          (unsigned long)f->number, loop);
+      f = nullptr;
+    }
+
+    // This file is not being compacted. Consider it as the
+    // first candidate to be compacted.
+    uint64_t candidate_size =  f != nullptr? f->file_size : 0;
+    if (f != nullptr) {
+      Log(options_->info_log, "Universal: Possible candidate file %lu[%d].",
+          (unsigned long)f->number, loop);
+    }
+
+    // Check if the succeeding files need compaction.
+    for (unsigned int i = loop+1;
+         candidate_count < max_files_to_compact && i < file_by_time.size();
+         i++) {
+      int index = file_by_time[i];
+      FileMetaData* f = current_->files_[level][index];
+      if (f->being_compacted) {
+        break;
+      }
+      // pick files if the total candidate file size (increased by the
+      // specified ratio) is still larger than the next candidate file.
+      uint64_t sz = (candidate_size * (100L + ratio)) /100;
+      if (sz < f->file_size) {
+        break;
+      }
+      candidate_count++;
+      candidate_size += f->file_size;
+    }
+
+    // Found a series of consecutive files that need compaction.
+    if (candidate_count >= (unsigned int)min_merge_width) {
+      start_index = loop;
+      done = true;
+      break;
+    } else {
+      // Run too short: log the rejected files and continue scanning
+      // after them.
+      for (unsigned int i = loop;
+           i < loop + candidate_count && i < file_by_time.size(); i++) {
+       int index = file_by_time[i];
+       FileMetaData* f = current_->files_[level][index];
+       Log(options_->info_log,
+           "Universal: Skipping file %lu[%d] with size %lu %d\n",
+           (unsigned long)f->number,
+           i,
+           (unsigned long)f->file_size,
+           f->being_compacted);
+      }
+    }
+  }
+  // NOTE: if file_by_time is empty, candidate_count is uninitialized;
+  // this is safe only because !done short-circuits the || below.
+  if (!done || candidate_count <= 1) {
+    return nullptr;
+  }
+  unsigned int first_index_after = start_index + candidate_count;
+  // Compression is enabled if files compacted earlier already reached
+  // size ratio of compression.
+  bool enable_compression = true;
+  int ratio_to_compress =
+      options_->compaction_options_universal.compression_size_percent;
+  if (ratio_to_compress >= 0) {
+    uint64_t total_size = TotalFileSize(current_->files_[level]);
+    uint64_t older_file_size = 0;
+    // i is unsigned but cannot wrap: first_index_after >= 2 here because
+    // candidate_count > 1 was checked above.
+    for (unsigned int i = file_by_time.size() - 1; i >= first_index_after;
+        i--) {
+      older_file_size += current_->files_[level][file_by_time[i]]->file_size;
+      if (older_file_size * 100L >= total_size * (long) ratio_to_compress) {
+        enable_compression = false;
+        break;
+      }
+    }
+  }
+  Compaction* c =
+      new Compaction(current_, level, level, MaxFileSizeForLevel(level),
+                     LLONG_MAX, false, enable_compression);
+  c->score_ = score;
+
+  // Add the chosen run [start_index, first_index_after) to the compaction.
+  for (unsigned int i = start_index; i < first_index_after; i++) {
+    int index = file_by_time[i];
+    FileMetaData* f = c->input_version_->files_[level][index];
+    c->inputs_[0].push_back(f);
+    Log(options_->info_log, "Universal: Picking file %lu[%d] with size %lu\n",
+        (unsigned long)f->number,
+        i,
+        (unsigned long)f->file_size);
+  }
+  return c;
+}
+
+//
+// Universal style of compaction. Pick files that are contiguous in
+// time-range to compact.
+//
+// Tries, in order: size-amplification compaction, ratio-based read-amp
+// compaction, and forced read-amp compaction. Returns nullptr when level 0
+// has too few files or no strategy produces a candidate set.
+Compaction* VersionSet::PickCompactionUniversal(int level, double score) {
+  assert (level == 0);
+
+  if ((current_->files_[level].size() <
+      (unsigned int)options_->level0_file_num_compaction_trigger)) {
+    Log(options_->info_log, "Universal: nothing to do\n");
+    return nullptr;
+  }
+  Version::FileSummaryStorage tmp;
+  // NOTE(review): size() is a size_t passed to %lu without a cast, unlike
+  // the casts used elsewhere in this file -- fine on LP64, but confirm for
+  // other targets.
+  Log(options_->info_log, "Universal: candidate files(%lu): %s\n",
+      current_->files_[level].size(),
+      current_->LevelFileSummary(&tmp, 0));
+
+  // Check for size amplification first.
+  Compaction* c = PickCompactionUniversalSizeAmp(level, score);
+  if (c == nullptr) {
+
+    // Size amplification is within limits. Try reducing read
+    // amplification while maintaining file size ratios.
+    unsigned int ratio = options_->compaction_options_universal.size_ratio;
+    c = PickCompactionUniversalReadAmp(level, score, ratio, UINT_MAX);
+
+    // Size amplification and file size ratios are within configured limits.
+    // If max read amplification is exceeding configured limits, then force
+    // compaction without looking at filesize ratios and try to reduce
+    // the number of files to fewer than level0_file_num_compaction_trigger.
+    if (c == nullptr) {
+      unsigned int num_files = current_->files_[level].size() -
+                               options_->level0_file_num_compaction_trigger;
+      c = PickCompactionUniversalReadAmp(level, score, UINT_MAX, num_files);
+    }
+  }
+  if (c == nullptr) {
+    return nullptr;
+  }
+  assert(c->inputs_[0].size() > 1);
+
+  // validate that all the chosen files are non overlapping in time
+  FileMetaData* newerfile __attribute__((unused)) = nullptr;
+  for (unsigned int i = 0; i < c->inputs_[0].size(); i++) {
+    FileMetaData* f = c->inputs_[0][i];
+    assert (f->smallest_seqno <= f->largest_seqno);
+    assert(newerfile == nullptr ||
+           newerfile->smallest_seqno > f->largest_seqno);
+    newerfile = f;
+  }
+
+  // The files are sorted from newest first to oldest last.
+  std::vector<int>& file_by_time = c->input_version_->files_by_size_[level];
+
+  // Is the earliest file part of this compaction? If so, output keys at
+  // the bottommost level and tombstones can be dropped.
+  int last_index = file_by_time[file_by_time.size()-1];
+  FileMetaData* last_file = c->input_version_->files_[level][last_index];
+  if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) {
+    c->bottommost_level_ = true;
+  }
+
+  // update statistics
+  if (options_->statistics != nullptr) {
+    options_->statistics->measureTime(NUM_FILES_IN_SINGLE_COMPACTION,
+                                      c->inputs_[0].size());
+  }
+
+  // mark all the files that are being compacted
+  c->MarkFilesBeingCompacted(true);
+
+  // remember this currently undergoing compaction
+  compactions_in_progress_[level].insert(c);
+
+  // Record whether this compaction includes all sst files.
+  // For now, it is only relevant in universal compaction mode.
+  c->is_full_compaction_ =
+      (c->inputs_[0].size() == c->input_version_->files_[0].size());
+
+  return c;
+}
+
+Compaction* VersionSet::PickCompactionBySize(int level, double score) {
+  Compaction* c = nullptr;
+
+  // level 0 files are overlapping. So we cannot pick more
+  // than one concurrent compactions at this level. This
+  // could be made better by looking at key-ranges that are
+  // being compacted at level 0.
+  if (level == 0 && compactions_in_progress_[level].size() == 1) {
+    return nullptr;
+  }
+
+  assert(level >= 0);
+  assert(level + 1 < current_->NumberLevels());
+  c = new Compaction(current_, level, level + 1, MaxFileSizeForLevel(level + 1),
+                     MaxGrandParentOverlapBytes(level));
+  c->score_ = score;
+
+  // Pick the largest file in this level that is not already
+  // being compacted
+  std::vector<int>& file_size = c->input_version_->files_by_size_[level];
+
+  // record the first file that is not yet compacted
+  int nextIndex = -1;
+
+  for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level];
+       i < file_size.size(); i++) {
+    int index = file_size[i];
+    FileMetaData* f = c->input_version_->files_[level][index];
+
+    // check to verify files are arranged in descending size
+    assert((i == file_size.size() - 1) ||
+           (i >= Version::number_of_files_to_sort_ - 1) ||
+           (f->file_size >=
+            c->input_version_->files_[level][file_size[i + 1]]->file_size));
+
+    // do not pick a file to compact if it is being compacted
+    // from n-1 level.
+    if (f->being_compacted) {
+      continue;
+    }
+
+    // remember the startIndex for the next call to PickCompaction
+    if (nextIndex == -1) {
+      nextIndex = i;
+    }
+
+    //if (i > Version::number_of_files_to_sort_) {
+    //  Log(options_->info_log, "XXX Looking at index %d", i);
+    //}
+
+    // Do not pick this file if its parents at level+1 are being compacted.
+    // Maybe we can avoid redoing this work in SetupOtherInputs
+    int parent_index = -1;
+    if (ParentRangeInCompaction(&f->smallest, &f->largest, level,
+                                &parent_index)) {
+      continue;
+    }
+    c->inputs_[0].push_back(f);
+    c->base_index_ = index;
+    c->parent_index_ = parent_index;
+    break;
+  }
+
+  if (c->inputs_[0].empty()) {
+    delete c;
+    c = nullptr;
+  }
+
+  // store where to start the iteration in the next call to PickCompaction
+  c->input_version_->next_file_to_compact_by_size_[level] = nextIndex;
+
+  return c;
+}
+
+// Pick the best compaction to run next, preferring size-triggered
+// compactions over seek-triggered ones. Returns nullptr when no
+// compaction is needed. On success the chosen files are marked as being
+// compacted and the compaction is registered in compactions_in_progress_.
+Compaction* VersionSet::PickCompaction() {
+  Compaction* c = nullptr;
+  int level = -1;
+
+  // Compute the compactions needed. It is better to do it here
+  // and also in LogAndApply(), otherwise the values could be stale.
+  std::vector<uint64_t> size_being_compacted(NumberLevels()-1);
+  current_->vset_->SizeBeingCompacted(size_being_compacted);
+  Finalize(current_, size_being_compacted);
+
+  // In universal style of compaction, compact L0 files back into L0.
+  if (options_->compaction_style ==  kCompactionStyleUniversal) {
+    // NOTE(review): this inner `level` shadows the outer one; harmless
+    // here because the function returns immediately, but confirm intent.
+    int level = 0;
+    c = PickCompactionUniversal(level, current_->compaction_score_[level]);
+    return c;
+  }
+
+  // We prefer compactions triggered by too much data in a level over
+  // the compactions triggered by seeks.
+  //
+  // Find the compactions by size on all levels. Levels were sorted by
+  // Finalize() in decreasing order of compaction score.
+  for (int i = 0; i < NumberLevels()-1; i++) {
+    assert(i == 0 || current_->compaction_score_[i] <=
+                     current_->compaction_score_[i-1]);
+    level = current_->compaction_level_[i];
+    if ((current_->compaction_score_[i] >= 1)) {
+      c = PickCompactionBySize(level, current_->compaction_score_[i]);
+      // NOTE(review): ExpandWhileOverlapping takes c by value; if it
+      // deletes c on expansion failure, the check below reads a dangling
+      // pointer. Confirm against ExpandWhileOverlapping's contract.
+      ExpandWhileOverlapping(c);
+      if (c != nullptr) {
+        break;
+      }
+    }
+  }
+
+  // Find compactions needed by seeks
+  FileMetaData* f = current_->file_to_compact_;
+  if (c == nullptr && f != nullptr && !f->being_compacted) {
+
+    level = current_->file_to_compact_level_;
+    int parent_index = -1;
+
+    // Only allow one level 0 compaction at a time.
+    // Do not pick this file if its parents at level+1 are being compacted.
+    if (level != 0 || compactions_in_progress_[0].empty()) {
+      if(!ParentRangeInCompaction(&f->smallest, &f->largest, level,
+                                  &parent_index)) {
+        c = new Compaction(current_, level, level + 1,
+                           MaxFileSizeForLevel(level + 1),
+                           MaxGrandParentOverlapBytes(level), true);
+        c->inputs_[0].push_back(f);
+        c->parent_index_ = parent_index;
+        c->input_version_->file_to_compact_ = nullptr;
+        ExpandWhileOverlapping(c);
+      }
+    }
+  }
+
+  if (c == nullptr) {
+    return nullptr;
+  }
+
+  // Two level 0 compaction won't run at the same time, so don't need to worry
+  // about files on level 0 being compacted.
+  if (level == 0) {
+    assert(compactions_in_progress_[0].empty());
+    InternalKey smallest, largest;
+    GetRange(c->inputs_[0], &smallest, &largest);
+    // Note that the next call will discard the file we placed in
+    // c->inputs_[0] earlier and replace it with an overlapping set
+    // which will include the picked file.
+    c->inputs_[0].clear();
+    c->input_version_->GetOverlappingInputs(0, &smallest, &largest,
+                                            &c->inputs_[0]);
+
+    // If we include more L0 files in the same compaction run it can
+    // cause the 'smallest' and 'largest' key to get extended to a
+    // larger range. So, re-invoke GetRange to get the new key range
+    GetRange(c->inputs_[0], &smallest, &largest);
+    if (ParentRangeInCompaction(&smallest, &largest,
+                                level, &c->parent_index_)) {
+      delete c;
+      return nullptr;
+    }
+    assert(!c->inputs_[0].empty());
+  }
+
+  // Setup "level+1" files (inputs_[1])
+  SetupOtherInputs(c);
+
+  // mark all the files that are being compacted
+  c->MarkFilesBeingCompacted(true);
+
+  // Is this compaction creating a file at the bottommost level
+  c->SetupBottomMostLevel(false);
+
+  // remember this currently undergoing compaction
+  compactions_in_progress_[level].insert(c);
+
+  return c;
+}
+
+// Returns true if any one of the parent files are being compacted
+bool VersionSet::ParentRangeInCompaction(const InternalKey* smallest,
+                                         const InternalKey* largest, int level,
+                                         int* parent_index) {
+  std::vector<FileMetaData*> inputs;
+  assert(level + 1 < current_->NumberLevels());
+
+  current_->GetOverlappingInputs(level + 1, smallest, largest, &inputs,
+                                 *parent_index, parent_index);
+  return FilesInCompaction(inputs);
+}
+
+// Returns true when at least one of the given files is currently
+// part of a running compaction.
+bool VersionSet::FilesInCompaction(std::vector<FileMetaData*>& files) {
+  for (const FileMetaData* file : files) {
+    if (file->being_compacted) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Add more files to the inputs on "level" to make sure that
+// no newer version of a key is compacted to "level+1" while leaving an older
+// version in a "level". Otherwise, any Get() will search "level" first,
+// and will likely return an old/stale value for the key, since it always
+// searches in increasing order of level to find the value. This could
+// also scramble the order of merge operands. This function should be
+// called any time a new Compaction is created, and its inputs_[0] are
+// populated.
+//
+// NOTE(review): on failure this function deletes c but can only null its
+// own local copy of the pointer -- the pointer is passed by value, so the
+// caller's c is left dangling, not set to nullptr as the original comment
+// claimed. Callers must not dereference c after a failed expansion;
+// confirm all call sites (PickCompaction, CompactRange).
+void VersionSet::ExpandWhileOverlapping(Compaction* c) {
+  // If inputs are empty then there is nothing to expand.
+  if (!c || c->inputs_[0].empty()) {
+    return;
+  }
+
+  // GetOverlappingInputs will always do the right thing for level-0.
+  // So we don't need to do any expansion if level == 0.
+  if (c->level() == 0) {
+    return;
+  }
+
+  const int level = c->level();
+  InternalKey smallest, largest;
+
+  // Keep expanding c->inputs_[0] until we are sure that there is a
+  // "clean cut" boundary between the files in input and the surrounding files.
+  // This will ensure that no parts of a key are lost during compaction.
+  int hint_index = -1;
+  size_t old_size;
+  do {
+    old_size = c->inputs_[0].size();
+    GetRange(c->inputs_[0], &smallest, &largest);
+    c->inputs_[0].clear();
+    c->input_version_->GetOverlappingInputs(
+        level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index);
+  } while(c->inputs_[0].size() > old_size);
+
+  // Get the new range
+  GetRange(c->inputs_[0], &smallest, &largest);
+
+  // If, after the expansion, there are files that are already under
+  // compaction, then we must drop/cancel this compaction.
+  int parent_index = -1;
+  if (FilesInCompaction(c->inputs_[0]) ||
+      (c->level() != c->output_level() &&
+       ParentRangeInCompaction(&smallest, &largest, level, &parent_index))) {
+    c->inputs_[0].clear();
+    c->inputs_[1].clear();
+    // See the NOTE above: this nulls only the local pointer.
+    delete c;
+    c = nullptr;
+  }
+}
+
+// Populates the set of inputs from "level+1" that overlap with "level".
+// Will also attempt to expand "level" if that doesn't expand "level+1"
+// or cause "level" to include a file for compaction that has an overlapping
+// user-key with another file.
+// Finally records the grandparent (level+2) overlap set and advances the
+// compaction pointer for this level.
+void VersionSet::SetupOtherInputs(Compaction* c) {
+  // If inputs are empty, then there is nothing to expand.
+  // If both input and output levels are the same, no need to consider
+  // files at level "level+1"
+  if (c->inputs_[0].empty() || c->level() == c->output_level()) {
+    return;
+  }
+
+  const int level = c->level();
+  InternalKey smallest, largest;
+
+  // Get the range one last time.
+  GetRange(c->inputs_[0], &smallest, &largest);
+
+  // Populate the set of next-level files (inputs_[1]) to include in compaction
+  c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest,
+                                          &c->inputs_[1], c->parent_index_,
+                                          &c->parent_index_);
+
+  // Get entire range covered by compaction
+  InternalKey all_start, all_limit;
+  GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+
+  // See if we can further grow the number of inputs in "level" without
+  // changing the number of "level+1" files we pick up. We also choose NOT
+  // to expand if this would cause "level" to include some entries for some
+  // user key, while excluding other entries for the same user key. This
+  // can happen when one user key spans multiple files.
+  if (!c->inputs_[1].empty()) {
+    std::vector<FileMetaData*> expanded0;
+    c->input_version_->GetOverlappingInputs(
+        level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr);
+    const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]);
+    const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]);
+    const uint64_t expanded0_size = TotalFileSize(expanded0);
+    uint64_t limit = ExpandedCompactionByteSizeLimit(level);
+    // Expansion is accepted only if it grows inputs_[0], stays under the
+    // byte limit, touches no in-progress files, and keeps user keys whole.
+    if (expanded0.size() > c->inputs_[0].size() &&
+        inputs1_size + expanded0_size < limit &&
+        !FilesInCompaction(expanded0) &&
+        !c->input_version_->HasOverlappingUserKey(&expanded0, level)) {
+      InternalKey new_start, new_limit;
+      GetRange(expanded0, &new_start, &new_limit);
+      std::vector<FileMetaData*> expanded1;
+      c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit,
+                                              &expanded1, c->parent_index_,
+                                              &c->parent_index_);
+      // Accept only if the level+1 set is unchanged in size and still free.
+      if (expanded1.size() == c->inputs_[1].size() &&
+          !FilesInCompaction(expanded1)) {
+        Log(options_->info_log,
+            "Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)"
+            "\n",
+            (unsigned long)level,
+            (unsigned long)(c->inputs_[0].size()),
+            (unsigned long)(c->inputs_[1].size()),
+            (unsigned long)inputs0_size,
+            (unsigned long)inputs1_size,
+            (unsigned long)(expanded0.size()),
+            (unsigned long)(expanded1.size()),
+            (unsigned long)expanded0_size,
+            (unsigned long)inputs1_size);
+        smallest = new_start;
+        largest = new_limit;
+        c->inputs_[0] = expanded0;
+        c->inputs_[1] = expanded1;
+        GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+      }
+    }
+  }
+
+  // Compute the set of grandparent files that overlap this compaction
+  // (parent == level+1; grandparent == level+2)
+  if (level + 2 < NumberLevels()) {
+    c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
+                                            &c->grandparents_);
+  }
+
+  // Dead debug logging, intentionally disabled.
+  if (false) {
+    Log(options_->info_log, "Compacting %d '%s' .. '%s'",
+        level,
+        smallest.DebugString().c_str(),
+        largest.DebugString().c_str());
+  }
+
+  // Update the place where we will do the next compaction for this level.
+  // We update this immediately instead of waiting for the VersionEdit
+  // to be applied so that if the compaction fails, we will try a different
+  // key range next time.
+  compact_pointer_[level] = largest.Encode().ToString();
+  c->edit_->SetCompactPointer(level, largest);
+}
+
+Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel,
+                                      FileMetaData* meta) {
+  for (int level = 0; level < NumberLevels(); level++) {
+    const std::vector<FileMetaData*>& files = current_->files_[level];
+    for (size_t i = 0; i < files.size(); i++) {
+      if (files[i]->number == number) {
+        *meta = *files[i];
+        *filelevel = level;
+        return Status::OK();
+      }
+    }
+  }
+  return Status::NotFound("File not present in any level");
+}
+
+void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+  for (int level = 0; level < NumberLevels(); level++) {
+    const std::vector<FileMetaData*>& files = current_->files_[level];
+    for (size_t i = 0; i < files.size(); i++) {
+      LiveFileMetaData filemetadata;
+      filemetadata.name = TableFileName("", files[i]->number);
+      filemetadata.level = level;
+      filemetadata.size = files[i]->file_size;
+      filemetadata.smallestkey = files[i]->smallest.user_key().ToString();
+      filemetadata.largestkey = files[i]->largest.user_key().ToString();
+      filemetadata.smallest_seqno = files[i]->smallest_seqno;
+      filemetadata.largest_seqno = files[i]->largest_seqno;
+      metadata->push_back(filemetadata);
+    }
+  }
+}
+
+// Move all accumulated obsolete files into *files, emptying the
+// internal list.
+void VersionSet::GetObsoleteFiles(std::vector<FileMetaData*>* files) {
+  for (FileMetaData* file : obsolete_files_) {
+    files->push_back(file);
+  }
+  obsolete_files_.clear();
+}
+
+// Build a manual compaction of input_level files overlapping
+// [begin, end] into output_level. On return, *compaction_end is nullptr
+// when the whole requested range was covered, or points at the first key
+// left uncompacted. Returns nullptr when there is nothing to compact.
+// NOTE(review): compaction_end is dereferenced unconditionally below, so
+// it must be non-null -- confirm against callers.
+Compaction* VersionSet::CompactRange(int input_level,
+                                     int output_level,
+                                     const InternalKey* begin,
+                                     const InternalKey* end,
+                                     InternalKey** compaction_end) {
+  std::vector<FileMetaData*> inputs;
+  bool covering_the_whole_range = true;
+
+  // All files are 'overlapping' in universal style compaction.
+  // We have to compact the entire range in one shot.
+  if (options_->compaction_style == kCompactionStyleUniversal) {
+    begin = nullptr;
+    end = nullptr;
+  }
+  current_->GetOverlappingInputs(input_level, begin, end, &inputs);
+  if (inputs.empty()) {
+    return nullptr;
+  }
+
+  // Avoid compacting too much in one shot in case the range is large.
+  // But we cannot do this for level-0 since level-0 files can overlap
+  // and we must not pick one file and drop another older file if the
+  // two files overlap.
+  if (input_level > 0) {
+    const uint64_t limit =
+        MaxFileSizeForLevel(input_level) * options_->source_compaction_factor;
+    uint64_t total = 0;
+    for (size_t i = 0; i + 1 < inputs.size(); ++i) {
+      uint64_t s = inputs[i]->file_size;
+      total += s;
+      if (total >= limit) {
+        **compaction_end = inputs[i + 1]->smallest;
+        covering_the_whole_range = false;
+        inputs.resize(i + 1);
+        break;
+      }
+    }
+  }
+  Compaction* c = new Compaction(current_, input_level, output_level,
+                                 MaxFileSizeForLevel(output_level),
+                                 MaxGrandParentOverlapBytes(input_level));
+
+  c->inputs_[0] = inputs;
+  ExpandWhileOverlapping(c);
+  // NOTE(review): this check can never fire -- ExpandWhileOverlapping
+  // receives c by value and cannot null this local pointer. If expansion
+  // failed, c was deleted inside the callee and this use is dangling;
+  // confirm against ExpandWhileOverlapping.
+  if (c == nullptr) {
+    Log(options_->info_log, "Could not compact due to expansion failure.\n");
+    return nullptr;
+  }
+
+  SetupOtherInputs(c);
+
+  if (covering_the_whole_range) {
+    *compaction_end = nullptr;
+  }
+
+  // These files that are to be manually compacted do not trample
+  // upon other files because manual compactions are processed when
+  // the system has a max of 1 background compaction thread.
+  c->MarkFilesBeingCompacted(true);
+
+  // Is this compaction creating a file at the bottommost level
+  c->SetupBottomMostLevel(true);
+  return c;
+}
+
+Compaction::Compaction(Version* input_version, int level, int out_level,
+                       uint64_t target_file_size,
+                       uint64_t max_grandparent_overlap_bytes,
+                       bool seek_compaction, bool enable_compression)
+    : level_(level),
+      out_level_(out_level),
+      max_output_file_size_(target_file_size),
+      maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes),
+      input_version_(input_version),
+      number_levels_(input_version_->NumberLevels()),
+      seek_compaction_(seek_compaction),
+      enable_compression_(enable_compression),
+      grandparent_index_(0),
+      seen_key_(false),
+      overlapped_bytes_(0),
+      base_index_(-1),
+      parent_index_(-1),
+      score_(0),
+      bottommost_level_(false),
+      is_full_compaction_(false),
+      level_ptrs_(std::vector<size_t>(number_levels_)) {
+
+  input_version_->Ref();
+  edit_ = new VersionEdit();
+  for (int i = 0; i < number_levels_; i++) {
+    level_ptrs_[i] = 0;
+  }
+}
+
+// Release the reference on the input version (if still held) and free
+// the owned VersionEdit.
+Compaction::~Compaction() {
+  if (input_version_ != nullptr) {
+    input_version_->Unref();
+  }
+  delete edit_;
+}
+
+// A compaction is a trivial move when a single input file can simply be
+// relocated to the next level without merging.
+bool Compaction::IsTrivialMove() const {
+  // If level_ == out_level_, the purpose is to force the compaction
+  // filter to be applied to that level, so it can never be a trivial move.
+  if (level_ == out_level_) {
+    return false;
+  }
+  // Exactly one input file and nothing to merge from level+1.
+  if (num_input_files(0) != 1 || num_input_files(1) != 0) {
+    return false;
+  }
+  // Avoid a move if there is lots of overlapping grandparent data.
+  // Otherwise, the move could create a parent file that will require
+  // a very expensive merge later on.
+  return TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_;
+}
+
+// Record, in *edit, a deletion for every input file of this compaction
+// (level_ inputs and level_+1 inputs).
+void Compaction::AddInputDeletions(VersionEdit* edit) {
+  for (int which = 0; which < 2; which++) {
+    for (const FileMetaData* file : inputs_[which]) {
+      edit->DeleteFile(level_ + which, file->number);
+    }
+  }
+}
+
+// Returns true when no level below the compaction's output level can
+// contain user_key, i.e. the output level is the key's base level and
+// deletion tombstones for it may be dropped.
+// NOTE: level_ptrs_ memoizes the per-level scan position, so this only
+// works when keys are presented in increasing order across calls.
+bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
+  if (input_version_->vset_->options_->compaction_style ==
+      kCompactionStyleUniversal) {
+    // In universal compaction the answer was precomputed when the
+    // compaction was picked.
+    return bottommost_level_;
+  }
+  // Maybe use binary search to find right entry instead of linear search?
+  const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator();
+  // Start at level_ + 2: the first level below the output level.
+  for (int lvl = level_ + 2; lvl < number_levels_; lvl++) {
+    const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
+    for (; level_ptrs_[lvl] < files.size(); ) {
+      FileMetaData* f = files[level_ptrs_[lvl]];
+      if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
+        // We've advanced far enough
+        if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
+          // Key falls in this file's range, so definitely not base level
+          return false;
+        }
+        break;
+      }
+      level_ptrs_[lvl]++;
+    }
+  }
+  return true;
+}
+
+// Returns true when the current output file should be closed before
+// writing internal_key, because the output would otherwise overlap too
+// many bytes of grandparent (level+2) data and force an expensive merge
+// later. Must be called with keys in increasing order.
+bool Compaction::ShouldStopBefore(const Slice& internal_key) {
+  // Scan to find earliest grandparent file that contains key.
+  const InternalKeyComparator* icmp = &input_version_->vset_->icmp_;
+  while (grandparent_index_ < grandparents_.size() &&
+      icmp->Compare(internal_key,
+                    grandparents_[grandparent_index_]->largest.Encode()) > 0) {
+    // Only accumulate overlap once we have emitted at least one key,
+    // so the very first key never triggers a stop.
+    if (seen_key_) {
+      overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
+    }
+    // Grandparent files must be disjoint and sorted.
+    assert(grandparent_index_ + 1 >= grandparents_.size() ||
+           icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(),
+                         grandparents_[grandparent_index_+1]->smallest.Encode())
+                         < 0);
+    grandparent_index_++;
+  }
+  seen_key_ = true;
+
+  if (overlapped_bytes_ > maxGrandParentOverlapBytes_) {
+    // Too much overlap for current output; start new output
+    overlapped_bytes_ = 0;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Mark (or clear) each file that is being compacted
+void Compaction::MarkFilesBeingCompacted(bool value) {
+  for (int i = 0; i < 2; i++) {
+    std::vector<FileMetaData*> v = inputs_[i];
+    for (unsigned int j = 0; j < inputs_[i].size(); j++) {
+      assert(value ? !inputs_[i][j]->being_compacted :
+                      inputs_[i][j]->being_compacted);
+      inputs_[i][j]->being_compacted = value;
+    }
+  }
+}
+
+// Is this compaction producing files at the bottommost level?
+void Compaction::SetupBottomMostLevel(bool isManual) {
+  if (input_version_->vset_->options_->compaction_style  ==
+         kCompactionStyleUniversal) {
+    // If universal compaction style is used and manual
+    // compaction is occuring, then we are guaranteed that
+    // all files will be picked in a single compaction
+    // run. We can safely set bottommost_level_ = true.
+    // If it is not manual compaction, then bottommost_level_
+    // is already set when the Compaction was created.
+    if (isManual) {
+      bottommost_level_ = true;
+    }
+    return;
+  }
+  bottommost_level_ = true;
+  int num_levels = input_version_->vset_->NumberLevels();
+  for (int i = output_level() + 1; i < num_levels; i++) {
+    if (input_version_->NumLevelFiles(i) > 0) {
+      bottommost_level_ = false;
+      break;
+    }
+  }
+}
+
+// Drop the reference on the input version early (before destruction).
+// Safe to call more than once.
+void Compaction::ReleaseInputs() {
+  if (input_version_ == nullptr) {
+    return;
+  }
+  input_version_->Unref();
+  input_version_ = nullptr;
+}
+
+// Reset the input version's by-size compaction cursor for this
+// compaction's level, so the next pick starts from the largest file.
+void Compaction::ResetNextCompactionIndex() {
+  input_version_->ResetNextCompactionIndex(level_);
+}
+
+static void InputSummary(std::vector<FileMetaData*>& files, char* output,
+                         int len) {
+  int write = 0;
+  for (unsigned int i = 0; i < files.size(); i++) {
+    int sz = len - write;
+    int ret = snprintf(output + write, sz, "%lu(%lu) ",
+        (unsigned long)files.at(i)->number,
+        (unsigned long)files.at(i)->file_size);
+    if (ret < 0 || ret >= sz)
+      break;
+    write += ret;
+  }
+}
+
+void Compaction::Summary(char* output, int len) {
+  int write = snprintf(output, len,
+      "Base version %lu Base level %d, seek compaction:%d, inputs:",
+      (unsigned long)input_version_->GetVersionNumber(),
+      level_,
+      seek_compaction_);
+  if (write < 0 || write > len) {
+    return;
+  }
+
+  char level_low_summary[100];
+  InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary));
+  char level_up_summary[100];
+  if (inputs_[1].size()) {
+    InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary));
+  } else {
+    level_up_summary[0] = '\0';
+  }
+
+  snprintf(output + write, len - write, "[%s],[%s]",
+      level_low_summary, level_up_summary);
+}
+
+}  // namespace rocksdb
diff --git a/db/version_set.h b/db/version_set.h
new file mode 100644 (file)
index 0000000..51f6d9b
--- /dev/null
@@ -0,0 +1,663 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The representation of a DBImpl consists of a set of Versions.  The
+// newest version is called "current".  Older versions may be kept
+// around to provide a consistent view to live iterators.
+//
+// Each Version keeps track of a set of Table files per level.  The
+// entire set of versions is maintained in a VersionSet.
+//
+// Version,VersionSet are thread-compatible, but require external
+// synchronization on all accesses.
+
+#pragma once
+#include <map>
+#include <memory>
+#include <set>
+#include <vector>
+#include <deque>
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "db/table_cache.h"
+
+namespace rocksdb {
+
+namespace log { class Writer; }
+
+class Compaction;
+class Iterator;
+class MemTable;
+class TableCache;
+class Version;
+class VersionSet;
+class MergeContext;
+
// Return the smallest index i such that files[i]->largest >= key.
// Return files.size() if there is no such file.
// REQUIRES: "files" contains a sorted list of non-overlapping files.
extern int FindFile(const InternalKeyComparator& icmp,
                    const std::vector<FileMetaData*>& files,
                    const Slice& key);

// Returns true iff some file in "files" overlaps the user key range
// [*smallest,*largest].
// smallest==nullptr represents a key smaller than all keys in the DB.
// largest==nullptr represents a key larger than all keys in the DB.
// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
//           in sorted order.
extern bool SomeFileOverlapsRange(
    const InternalKeyComparator& icmp,
    bool disjoint_sorted_files,
    const std::vector<FileMetaData*>& files,
    const Slice* smallest_user_key,
    const Slice* largest_user_key);
+
class Version {
 public:
  // Append to *iters a sequence of iterators that will
  // yield the contents of this Version when merged together.
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  void AddIterators(const ReadOptions&, const EnvOptions& soptions,
                    std::vector<Iterator*>* iters);

  // Lookup the value for key.  If found, store it in *val and
  // return OK.  Else return a non-OK status.  Fills *stats.
  // Uses *merge_context to store merge_operator operations to apply later.
  // REQUIRES: lock is not held
  struct GetStats {
    FileMetaData* seek_file;
    int seek_file_level;
  };
  void Get(const ReadOptions&, const LookupKey& key, std::string* val,
           Status* status, MergeContext* merge_context,
           GetStats* stats, const Options& db_option, bool* value_found =
               nullptr);

  // Adds "stats" into the current state.  Returns true if a new
  // compaction may need to be triggered, false otherwise.
  // REQUIRES: lock is held
  bool UpdateStats(const GetStats& stats);

  // Reference count management (so Versions do not disappear out from
  // under live iterators)
  void Ref();
  void Unref();

  // Store in *inputs all files in "level" that overlap [begin,end].
  void GetOverlappingInputs(
      int level,
      const InternalKey* begin,         // nullptr means before all keys
      const InternalKey* end,           // nullptr means after all keys
      std::vector<FileMetaData*>* inputs,
      int hint_index = -1,              // index of overlap file
      int* file_index = nullptr);          // return index of overlap file

  // Variant of GetOverlappingInputs that locates one overlapping file by
  // binary search; only meaningful for levels with disjoint sorted files.
  void GetOverlappingInputsBinarySearch(
      int level,
      const Slice& begin,         // nullptr means before all keys
      const Slice& end,           // nullptr means after all keys
      std::vector<FileMetaData*>* inputs,
      int hint_index,             // index of overlap file
      int* file_index);           // return index of overlap file

  // Grow *inputs outward from "index" while neighboring files in "level"
  // still overlap [begin,end].
  void ExtendOverlappingInputs(
      int level,
      const Slice& begin,         // nullptr means before all keys
      const Slice& end,           // nullptr means after all keys
      std::vector<FileMetaData*>* inputs,
      unsigned int index);                 // start extending from this index

  // Returns true iff some file in the specified level overlaps
  // some part of [*smallest_user_key,*largest_user_key].
  // smallest_user_key==nullptr represents a key smaller than all keys in
  // the DB.
  // largest_user_key==nullptr represents a key larger than all keys in
  // the DB.
  bool OverlapInLevel(int level,
                      const Slice* smallest_user_key,
                      const Slice* largest_user_key);

  // Returns true iff the first or last file in inputs contains
  // an overlapping user key to the file "just outside" of it (i.e.
  // just after the last file, or just before the first file)
  // REQUIRES: "*inputs" is a sorted list of non-overlapping files
  bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
                             int level);


  // Return the level at which we should place a new memtable compaction
  // result that covers the range [smallest_user_key,largest_user_key].
  int PickLevelForMemTableOutput(const Slice& smallest_user_key,
                                 const Slice& largest_user_key);

  int NumberLevels() const { return num_levels_; }

  // REQUIRES: lock is held
  int NumLevelFiles(int level) const { return files_[level].size(); }

  // Return the combined file size of all files at the specified level.
  int64_t NumLevelBytes(int level) const;

  // Return a human-readable short (single-line) summary of the number
  // of files per level.  Uses *scratch as backing store.
  struct LevelSummaryStorage {
    char buffer[100];
  };
  struct FileSummaryStorage {
    char buffer[1000];
  };
  const char* LevelSummary(LevelSummaryStorage* scratch) const;
  // Return a human-readable short (single-line) summary of files
  // in a specified level.  Uses *scratch as backing store.
  const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;

  // Return the maximum overlapping data (in bytes) at next level for any
  // file at a level >= 1.
  int64_t MaxNextLevelOverlappingBytes();

  // Add all files listed in the current version to *live.
  void AddLiveFiles(std::set<uint64_t>* live);

  // Return a human readable string that describes this version's contents.
  std::string DebugString(bool hex = false) const;

  // Returns the version number of this version
  uint64_t GetVersionNumber() const { return version_number_; }

 private:
  friend class Compaction;
  friend class VersionSet;
  friend class DBImpl;

  class LevelFileNumIterator;
  Iterator* NewConcatenatingIterator(const ReadOptions&,
                                     const EnvOptions& soptions,
                                     int level) const;
  // Whether the given level iterator may contain entries matching
  // internal_prefix (used to cut short prefix reads).
  bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions,
                      const Slice& internal_prefix, Iterator* level_iter) const;

  VersionSet* vset_;            // VersionSet to which this Version belongs
  Version* next_;               // Next version in linked list
  Version* prev_;               // Previous version in linked list
  int refs_;                    // Number of live refs to this version
  int num_levels_;              // Number of levels

  // List of files per level, files in each level are arranged
  // in increasing order of keys
  std::vector<FileMetaData*>* files_;

  // A list for the same set of files that are stored in files_,
  // but files in each level are now sorted based on file
  // size. The file with the largest size is at the front.
  // This vector stores the index of the file from files_.
  std::vector< std::vector<int> > files_by_size_;

  // An index into files_by_size_ that specifies the first
  // file that is not yet compacted
  std::vector<int> next_file_to_compact_by_size_;

  // Only the first few entries of files_by_size_ are sorted.
  // There is no need to sort all the files because it is likely
  // that on a running system, we need to look at only the first
  // few largest files because a new version is created every few
  // seconds/minutes (because of concurrent compactions).
  static const int number_of_files_to_sort_ = 50;

  // Next file to compact based on seek stats.
  FileMetaData* file_to_compact_;
  int file_to_compact_level_;

  // Level that should be compacted next and its compaction score.
  // Score < 1 means compaction is not strictly needed.  These fields
  // are initialized by Finalize().
  // The most critical level to be compacted is listed first
  // These are used to pick the best compaction level
  std::vector<double> compaction_score_;
  std::vector<int> compaction_level_;
  double max_compaction_score_; // max score in l1 to ln-1
  int max_compaction_score_level_; // level on which max score occurs

  // A version number that uniquely represents this version. This is
  // used for debugging and logging purposes only.
  uint64_t version_number_;

  explicit Version(VersionSet* vset, uint64_t version_number = 0);

  ~Version();

  // re-initializes the index that is used to offset into files_by_size_
  // to find the next compaction candidate file.
  void ResetNextCompactionIndex(int level) {
    next_file_to_compact_by_size_[level] = 0;
  }

  // No copying allowed
  Version(const Version&);
  void operator=(const Version&);
};
+
class VersionSet {
 public:
  VersionSet(const std::string& dbname, const Options* options,
             const EnvOptions& storage_options, TableCache* table_cache,
             const InternalKeyComparator*);
  ~VersionSet();

  // Apply *edit to the current version to form a new descriptor that
  // is both saved to persistent state and installed as the new
  // current version.  Will release *mu while actually writing to the file.
  // REQUIRES: *mu is held on entry.
  // REQUIRES: no other thread concurrently calls LogAndApply()
  Status LogAndApply(VersionEdit* edit, port::Mutex* mu,
                     bool new_descriptor_log = false);

  // Recover the last saved descriptor from persistent storage.
  Status Recover();

  // Try to reduce the number of levels. This call is valid only when
  // at most one level from the new max level to the old
  // max level contains files.
  // For example, a db currently has 7 levels [0-6], and a call
  // to reduce to 5 [0-4] can only be executed when only one level
  // among [4-6] contains files.
  Status ReduceNumberOfLevels(int new_levels, port::Mutex* mu);

  // Return the current version.
  Version* current() const { return current_; }

  // A flag indicating whether writes need to slow down because there
  // are too many level-0 files.
  bool NeedSlowdownForNumLevel0Files() const {
    return need_slowdown_for_num_level0_files_;
  }

  // Return the current manifest file number
  uint64_t ManifestFileNumber() const { return manifest_file_number_; }

  // Allocate and return a new file number
  uint64_t NewFileNumber() { return next_file_number_++; }

  // Arrange to reuse "file_number" unless a newer file number has
  // already been allocated.
  // REQUIRES: "file_number" was returned by a call to NewFileNumber().
  void ReuseFileNumber(uint64_t file_number) {
    if (next_file_number_ == file_number + 1) {
      next_file_number_ = file_number;
    }
  }

  // Return the last sequence number.
  uint64_t LastSequence() const {
    return last_sequence_.load(std::memory_order_acquire);
  }

  // Set the last sequence number to s.
  void SetLastSequence(uint64_t s) {
    assert(s >= last_sequence_);
    last_sequence_.store(s, std::memory_order_release);
  }

  // Mark the specified file number as used.
  void MarkFileNumberUsed(uint64_t number);

  // Return the current log file number.
  uint64_t LogNumber() const { return log_number_; }

  // Return the log file number for the log file that is currently
  // being compacted, or zero if there is no such log file.
  uint64_t PrevLogNumber() const { return prev_log_number_; }

  int NumberLevels() const { return num_levels_; }

  // Pick level and inputs for a new compaction.
  // Returns nullptr if there is no compaction to be done.
  // Otherwise returns a pointer to a heap-allocated object that
  // describes the compaction.  Caller should delete the result.
  Compaction* PickCompaction();

  // Return a compaction object for compacting the range [begin,end] in
  // the specified level.  Returns nullptr if there is nothing in that
  // level that overlaps the specified range.  Caller should delete
  // the result.
  //
  // The returned Compaction might not include the whole requested range.
  // In that case, compaction_end will be set to the next key that needs
  // compacting. In case the compaction will compact the whole range,
  // compaction_end will be set to nullptr.
  // Client is responsible for compaction_end storage -- when called,
  // *compaction_end should point to valid InternalKey!
  Compaction* CompactRange(int input_level,
                           int output_level,
                           const InternalKey* begin,
                           const InternalKey* end,
                           InternalKey** compaction_end);

  // Create an iterator that reads over the compaction inputs for "*c".
  // The caller should delete the iterator when no longer needed.
  Iterator* MakeInputIterator(Compaction* c);

  // Returns true iff some level needs a compaction because it has
  // exceeded its target size.
  bool NeedsSizeCompaction() const {
    // In universal compaction case, this check doesn't really
    // check the compaction condition, but checks num of files threshold
    // only. We are not going to miss any compaction opportunity
    // but it's likely that more compactions are scheduled but
    // ending up with nothing to do. We can improve it later.
    // TODO: improve this function to be accurate for universal
    //       compactions.
    int num_levels_to_check =
        (options_->compaction_style != kCompactionStyleUniversal) ?
            NumberLevels() - 1 : 1;
    for (int i = 0; i < num_levels_to_check; i++) {
      if (current_->compaction_score_[i] >= 1) {
        return true;
      }
    }
    return false;
  }
  // Returns true iff some level needs a compaction.
  bool NeedsCompaction() const {
    return ((current_->file_to_compact_ != nullptr) ||
            NeedsSizeCompaction());
  }

  // Returns the maximum compaction score for levels 1 to max
  double MaxCompactionScore() const {
    return current_->max_compaction_score_;
  }

  // See field declaration
  int MaxCompactionScoreLevel() const {
    return current_->max_compaction_score_level_;
  }

  // Add all files listed in any live version to *live.
  void AddLiveFiles(std::vector<uint64_t>* live_list);

  // Return the approximate offset in the database of the data for
  // "key" as of version "v".
  uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);

  // printf contents (for debugging)
  Status DumpManifest(Options& options, std::string& manifestFileName,
                      bool verbose, bool hex = false);

  // Return the size of the current manifest file
  uint64_t ManifestFileSize() const { return manifest_file_size_; }

  // For the specified level, pick a compaction.
  // Returns nullptr if there is no compaction to be done.
  // If level is 0 and there is already a compaction on that level, this
  // function will return nullptr.
  Compaction* PickCompactionBySize(int level, double score);

  // Pick files to compact in Universal mode
  Compaction* PickCompactionUniversal(int level, double score);

  // Pick Universal compaction to limit read amplification
  Compaction* PickCompactionUniversalReadAmp(int level, double score,
                unsigned int ratio, unsigned int num_files);

  // Pick Universal compaction to limit space amplification.
  Compaction* PickCompactionUniversalSizeAmp(int level, double score);

  // Free up the files that participated in a compaction
  void ReleaseCompactionFiles(Compaction* c, Status status);

  // verify that the files that we started with for a compaction
  // still exist in the current version and in the same original level.
  // This ensures that a concurrent compaction did not erroneously
  // pick the same files to compact.
  bool VerifyCompactionFileConsistency(Compaction* c);

  // used to sort files by size
  typedef struct fsize {
    int index;
    FileMetaData* file;
  } Fsize;

  // Sort all files for this version based on their file size and
  // record results in files_by_size_. The largest files are listed first.
  void UpdateFilesBySize(Version *v);

  // Get the max file size in a given level.
  uint64_t MaxFileSizeForLevel(int level);

  // Maximum total bytes allowed at the given level.
  double MaxBytesForLevel(int level);

  // Look up the level and metadata for the file with the given number.
  Status GetMetadataForFile(
    uint64_t number, int *filelevel, FileMetaData *metadata);

  // Collect metadata describing every live SST file.
  void GetLiveFilesMetaData(
    std::vector<LiveFileMetaData> *metadata);

  // Hand over the accumulated list of obsolete files.
  void GetObsoleteFiles(std::vector<FileMetaData*>* files);

 private:
  class Builder;
  struct ManifestWriter;

  friend class Compaction;
  friend class Version;

  // Allocate the per-level arrays sized for num_levels.
  void Init(int num_levels);

  // Recompute compaction scores/levels for *v, given the bytes currently
  // being compacted per level.
  void Finalize(Version* v, std::vector<uint64_t>&);

  // Compute the smallest/largest internal key spanned by "inputs".
  void GetRange(const std::vector<FileMetaData*>& inputs,
                InternalKey* smallest,
                InternalKey* largest);

  // Same as GetRange, but over the union of two input sets.
  void GetRange2(const std::vector<FileMetaData*>& inputs1,
                 const std::vector<FileMetaData*>& inputs2,
                 InternalKey* smallest,
                 InternalKey* largest);

  void ExpandWhileOverlapping(Compaction* c);

  void SetupOtherInputs(Compaction* c);

  // Save current contents to *log
  Status WriteSnapshot(log::Writer* log);

  // Install *v as the new "current" version.
  void AppendVersion(Version* v);

  bool ManifestContains(const std::string& record) const;

  uint64_t ExpandedCompactionByteSizeLimit(int level);

  uint64_t MaxGrandParentOverlapBytes(int level);

  Env* const env_;
  const std::string dbname_;
  const Options* const options_;
  TableCache* const table_cache_;
  const InternalKeyComparator icmp_;
  uint64_t next_file_number_;
  uint64_t manifest_file_number_;
  std::atomic<uint64_t> last_sequence_;
  uint64_t log_number_;
  uint64_t prev_log_number_;  // 0 or backing store for memtable being compacted

  int num_levels_;

  // Opened lazily
  unique_ptr<log::Writer> descriptor_log_;
  Version dummy_versions_;  // Head of circular doubly-linked list of versions.
  Version* current_;        // == dummy_versions_.prev_

  // A flag indicating whether we should delay writes because
  // we have too many level 0 files
  bool need_slowdown_for_num_level0_files_;

  // Per-level key at which the next compaction at that level should start.
  // Either an empty string, or a valid InternalKey.
  std::string* compact_pointer_;

  // Per-level target file size.
  uint64_t* max_file_size_;

  // Per-level max bytes
  uint64_t* level_max_bytes_;

  // record all the ongoing compactions for all levels
  std::vector<std::set<Compaction*> > compactions_in_progress_;

  // generates an increasing version number for every new version
  uint64_t current_version_number_;

  // Queue of writers to the manifest file
  std::deque<ManifestWriter*> manifest_writers_;

  // Current size of manifest file
  uint64_t manifest_file_size_;

  std::vector<FileMetaData*> obsolete_files_;

  // storage options for all reads and writes except compactions
  const EnvOptions& storage_options_;

  // storage options used for compactions. This is a copy of
  // storage_options_ but with readaheads set to readahead_compactions_.
  const EnvOptions storage_options_compactions_;

  // No copying allowed
  VersionSet(const VersionSet&);
  void operator=(const VersionSet&);

  // Return the total amount of data that is undergoing
  // compactions per level
  void SizeBeingCompacted(std::vector<uint64_t>&);

  // Returns true if any one of the parent files are being compacted
  bool ParentRangeInCompaction(const InternalKey* smallest,
    const InternalKey* largest, int level, int* index);

  // Returns true if any one of the specified files are being compacted
  bool FilesInCompaction(std::vector<FileMetaData*>& files);

  void LogAndApplyHelper(Builder*b, Version* v,
                           VersionEdit* edit, port::Mutex* mu);
};
+
// A Compaction encapsulates information about a compaction.
class Compaction {
 public:
  ~Compaction();

  // Return the level that is being compacted.  Inputs from "level"
  // will be merged.
  int level() const { return level_; }

  // Outputs will go to this level
  int output_level() const { return out_level_; }

  // Return the object that holds the edits to the descriptor done
  // by this compaction.
  VersionEdit* edit() { return edit_; }

  // "which" must be either 0 or 1
  int num_input_files(int which) const { return inputs_[which].size(); }

  // Return the ith input file at "level()+which" ("which" must be 0 or 1).
  FileMetaData* input(int which, int i) const { return inputs_[which][i]; }

  // Maximum size of files to build during this compaction.
  uint64_t MaxOutputFileSize() const { return max_output_file_size_; }

  // Whether compression will be enabled for compaction outputs
  bool enable_compression() const { return enable_compression_; }

  // Is this a trivial compaction that can be implemented by just
  // moving a single input file to the next level (no merging or splitting)
  bool IsTrivialMove() const;

  // Add all inputs to this compaction as delete operations to *edit.
  void AddInputDeletions(VersionEdit* edit);

  // Returns true if the information we have available guarantees that
  // the compaction is producing data in "level+1" for which no data exists
  // in levels greater than "level+1".
  bool IsBaseLevelForKey(const Slice& user_key);

  // Returns true iff we should stop building the current output
  // before processing "internal_key".
  bool ShouldStopBefore(const Slice& internal_key);

  // Release the input version for the compaction, once the compaction
  // is successful.
  void ReleaseInputs();

  // Write a one-line human-readable summary of this compaction into
  // "output", a caller-provided buffer of "len" bytes.
  void Summary(char* output, int len);

  // Return the score that was used to pick this compaction run.
  double score() const { return score_; }

  // Is this compaction creating a file in the bottom most level?
  bool BottomMostLevel() { return bottommost_level_; }

  // Does this compaction include all sst files?
  bool IsFullCompaction() { return is_full_compaction_; }

 private:
  friend class Version;
  friend class VersionSet;

  Compaction(Version* input_version, int level, int out_level,
             uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes,
             bool seek_compaction = false, bool enable_compression = true);

  int level_;
  int out_level_; // levels to which output files are stored
  uint64_t max_output_file_size_;
  uint64_t maxGrandParentOverlapBytes_;
  Version* input_version_;
  VersionEdit* edit_;
  int number_levels_;

  bool seek_compaction_;
  bool enable_compression_;

  // Each compaction reads inputs from "level_" and "level_+1"
  std::vector<FileMetaData*> inputs_[2];      // The two sets of inputs

  // State used to check for number of overlapping grandparent files
  // (parent == level_ + 1, grandparent == level_ + 2)
  std::vector<FileMetaData*> grandparents_;
  size_t grandparent_index_;  // Index in grandparent_starts_
  bool seen_key_;             // Some output key has been seen
  uint64_t overlapped_bytes_;  // Bytes of overlap between current output
                              // and grandparent files
  int base_index_;   // index of the file in files_[level_]
  int parent_index_; // index of some file with same range in files_[level_+1]
  double score_;     // score that was used to pick this compaction.

  // Is this compaction creating a file in the bottom most level?
  bool bottommost_level_;
  // Does this compaction include all sst files?
  bool is_full_compaction_;

  // level_ptrs_ holds indices into input_version_->levels_: our state
  // is that we are positioned at one of the file ranges for each
  // higher level than the ones involved in this compaction (i.e. for
  // all L >= level_ + 2).
  std::vector<size_t> level_ptrs_;

  // mark (or clear) all files that are being compacted
  void MarkFilesBeingCompacted(bool);

  // Initialize whether compaction producing files at the bottommost level
  void SetupBottomMostLevel(bool isManual);

  // In case of compaction error, reset the nextIndex that is used
  // to pick up the next file to be compacted from files_by_size_
  void ResetNextCompactionIndex();
};
+
+}  // namespace rocksdb
diff --git a/db/version_set_reduce_num_levels.cc b/db/version_set_reduce_num_levels.cc
new file mode 100644 (file)
index 0000000..2ca6898
--- /dev/null
@@ -0,0 +1,81 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "db/version_set.h"
+
+#include <algorithm>
+#include <stdio.h>
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) {
+
+  if(new_levels <= 1) {
+    return Status::InvalidArgument(
+        "Number of levels needs to be bigger than 1");
+  }
+
+  Version* current_version = current_;
+  int current_levels = current_version->NumberLevels();
+
+  if (current_levels <= new_levels) {
+    return Status::OK();
+  }
+
+  // Make sure there are file only on one level from
+  // (new_levels-1) to (current_levels-1)
+  int first_nonempty_level = -1;
+  int first_nonempty_level_filenum = 0;
+  for (int i = new_levels - 1; i < current_levels; i++) {
+    int file_num = current_version->NumLevelFiles(i);
+    if (file_num != 0) {
+      if (first_nonempty_level < 0) {
+        first_nonempty_level = i;
+        first_nonempty_level_filenum = file_num;
+      } else {
+        char msg[255];
+        sprintf(msg, "Found at least two levels containing files: "
+            "[%d:%d],[%d:%d].\n",
+            first_nonempty_level, first_nonempty_level_filenum, i, file_num);
+        return Status::InvalidArgument(msg);
+      }
+    }
+  }
+
+  Status st;
+  std::vector<FileMetaData*>*  old_files_list = current_version->files_;
+  std::vector<FileMetaData*>* new_files_list =
+      new std::vector<FileMetaData*>[new_levels];
+  for (int i = 0; i < new_levels - 1; i++) {
+    new_files_list[i] = old_files_list[i];
+  }
+
+  if (first_nonempty_level > 0) {
+    new_files_list[new_levels - 1] = old_files_list[first_nonempty_level];
+  }
+
+  delete[] current_version->files_;
+  current_version->files_ = new_files_list;
+  current_version->num_levels_ = new_levels;
+
+  delete[] compact_pointer_;
+  delete[] max_file_size_;
+  delete[] level_max_bytes_;
+  num_levels_ = new_levels;
+  compact_pointer_ = new std::string[new_levels];
+  Init(new_levels);
+  VersionEdit ve;
+  st = LogAndApply(&ve, mu, true);
+  return st;
+}
+
+}
diff --git a/db/version_set_test.cc b/db/version_set_test.cc
new file mode 100644 (file)
index 0000000..1af95dd
--- /dev/null
@@ -0,0 +1,184 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
// Test fixture: owns a list of FileMetaData and provides helpers that
// exercise FindFile() and SomeFileOverlapsRange().
class FindFileTest {
 public:
  std::vector<FileMetaData*> files_;
  // When false, files_ may contain overlapping key ranges, which forces
  // SomeFileOverlapsRange onto its linear-scan path.
  bool disjoint_sorted_files_;

  FindFileTest() : disjoint_sorted_files_(true) { }

  ~FindFileTest() {
    for (unsigned int i = 0; i < files_.size(); i++) {
      delete files_[i];
    }
  }

  // Append a file covering [smallest, largest]; file numbers are assigned
  // sequentially starting at 1.
  void Add(const char* smallest, const char* largest,
           SequenceNumber smallest_seq = 100,
           SequenceNumber largest_seq = 100) {
    FileMetaData* f = new FileMetaData;
    f->number = files_.size() + 1;
    f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
    f->largest = InternalKey(largest, largest_seq, kTypeValue);
    files_.push_back(f);
  }

  // Index of the first file whose largest key is >= key
  // (files_.size() if there is none).
  int Find(const char* key) {
    InternalKey target(key, 100, kTypeValue);
    InternalKeyComparator cmp(BytewiseComparator());
    return FindFile(cmp, files_, target.Encode());
  }

  // Does any file overlap the user-key range [smallest, largest]?
  // nullptr means unbounded on that side.
  bool Overlaps(const char* smallest, const char* largest) {
    InternalKeyComparator cmp(BytewiseComparator());
    Slice s(smallest != nullptr ? smallest : "");
    Slice l(largest != nullptr ? largest : "");
    return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_,
                                 (smallest != nullptr ? &s : nullptr),
                                 (largest != nullptr ? &l : nullptr));
  }
};
+
// With no files: Find always returns 0 and no range overlaps anything.
TEST(FindFileTest, Empty) {
  ASSERT_EQ(0, Find("foo"));
  ASSERT_TRUE(! Overlaps("a", "z"));
  ASSERT_TRUE(! Overlaps(nullptr, "z"));
  ASSERT_TRUE(! Overlaps("a", nullptr));
  ASSERT_TRUE(! Overlaps(nullptr, nullptr));
}
+
// A single file [p,q]: Find returns 0 for keys <= q and 1 beyond it;
// Overlaps treats both endpoints as inclusive, with nullptr unbounded.
TEST(FindFileTest, Single) {
  Add("p", "q");
  ASSERT_EQ(0, Find("a"));
  ASSERT_EQ(0, Find("p"));
  ASSERT_EQ(0, Find("p1"));
  ASSERT_EQ(0, Find("q"));
  ASSERT_EQ(1, Find("q1"));
  ASSERT_EQ(1, Find("z"));

  ASSERT_TRUE(! Overlaps("a", "b"));
  ASSERT_TRUE(! Overlaps("z1", "z2"));
  ASSERT_TRUE(Overlaps("a", "p"));
  ASSERT_TRUE(Overlaps("a", "q"));
  ASSERT_TRUE(Overlaps("a", "z"));
  ASSERT_TRUE(Overlaps("p", "p1"));
  ASSERT_TRUE(Overlaps("p", "q"));
  ASSERT_TRUE(Overlaps("p", "z"));
  ASSERT_TRUE(Overlaps("p1", "p2"));
  ASSERT_TRUE(Overlaps("p1", "z"));
  ASSERT_TRUE(Overlaps("q", "q"));
  ASSERT_TRUE(Overlaps("q", "q1"));

  ASSERT_TRUE(! Overlaps(nullptr, "j"));
  ASSERT_TRUE(! Overlaps("r", nullptr));
  ASSERT_TRUE(Overlaps(nullptr, "p"));
  ASSERT_TRUE(Overlaps(nullptr, "p1"));
  ASSERT_TRUE(Overlaps("q", nullptr));
  ASSERT_TRUE(Overlaps(nullptr, nullptr));
}
+
+
// Several disjoint files: Find returns the index of the first file whose
// largest key is >= the probe; Overlaps spans across multiple files.
TEST(FindFileTest, Multiple) {
  Add("150", "200");
  Add("200", "250");
  Add("300", "350");
  Add("400", "450");
  ASSERT_EQ(0, Find("100"));
  ASSERT_EQ(0, Find("150"));
  ASSERT_EQ(0, Find("151"));
  ASSERT_EQ(0, Find("199"));
  ASSERT_EQ(0, Find("200"));
  ASSERT_EQ(1, Find("201"));
  ASSERT_EQ(1, Find("249"));
  ASSERT_EQ(1, Find("250"));
  ASSERT_EQ(2, Find("251"));
  ASSERT_EQ(2, Find("299"));
  ASSERT_EQ(2, Find("300"));
  ASSERT_EQ(2, Find("349"));
  ASSERT_EQ(2, Find("350"));
  ASSERT_EQ(3, Find("351"));
  ASSERT_EQ(3, Find("400"));
  ASSERT_EQ(3, Find("450"));
  ASSERT_EQ(4, Find("451"));

  ASSERT_TRUE(! Overlaps("100", "149"));
  ASSERT_TRUE(! Overlaps("251", "299"));
  ASSERT_TRUE(! Overlaps("451", "500"));
  ASSERT_TRUE(! Overlaps("351", "399"));

  ASSERT_TRUE(Overlaps("100", "150"));
  ASSERT_TRUE(Overlaps("100", "200"));
  ASSERT_TRUE(Overlaps("100", "300"));
  ASSERT_TRUE(Overlaps("100", "400"));
  ASSERT_TRUE(Overlaps("100", "500"));
  ASSERT_TRUE(Overlaps("375", "400"));
  ASSERT_TRUE(Overlaps("450", "450"));
  ASSERT_TRUE(Overlaps("450", "500"));
}
+
// Same file layout as Multiple, but probing with nullptr (unbounded)
// endpoints on either or both sides.
TEST(FindFileTest, MultipleNullBoundaries) {
  Add("150", "200");
  Add("200", "250");
  Add("300", "350");
  Add("400", "450");
  ASSERT_TRUE(! Overlaps(nullptr, "149"));
  ASSERT_TRUE(! Overlaps("451", nullptr));
  ASSERT_TRUE(Overlaps(nullptr, nullptr));
  ASSERT_TRUE(Overlaps(nullptr, "150"));
  ASSERT_TRUE(Overlaps(nullptr, "199"));
  ASSERT_TRUE(Overlaps(nullptr, "200"));
  ASSERT_TRUE(Overlaps(nullptr, "201"));
  ASSERT_TRUE(Overlaps(nullptr, "400"));
  ASSERT_TRUE(Overlaps(nullptr, "800"));
  ASSERT_TRUE(Overlaps("100", nullptr));
  ASSERT_TRUE(Overlaps("200", nullptr));
  ASSERT_TRUE(Overlaps("449", nullptr));
  ASSERT_TRUE(Overlaps("450", nullptr));
}
+
// A single-key file whose endpoints carry different sequence numbers:
// overlap must be decided by user keys, not by internal-key ordering.
TEST(FindFileTest, OverlapSequenceChecks) {
  Add("200", "200", 5000, 3000);
  ASSERT_TRUE(! Overlaps("199", "199"));
  ASSERT_TRUE(! Overlaps("201", "300"));
  ASSERT_TRUE(Overlaps("200", "200"));
  ASSERT_TRUE(Overlaps("190", "200"));
  ASSERT_TRUE(Overlaps("200", "210"));
}
+
// Files with overlapping ranges (disjoint_sorted_files_ = false), which
// exercises the non-binary-search path of SomeFileOverlapsRange.
TEST(FindFileTest, OverlappingFiles) {
  Add("150", "600");
  Add("400", "500");
  disjoint_sorted_files_ = false;
  ASSERT_TRUE(! Overlaps("100", "149"));
  ASSERT_TRUE(! Overlaps("601", "700"));
  ASSERT_TRUE(Overlaps("100", "150"));
  ASSERT_TRUE(Overlaps("100", "200"));
  ASSERT_TRUE(Overlaps("100", "300"));
  ASSERT_TRUE(Overlaps("100", "400"));
  ASSERT_TRUE(Overlaps("100", "500"));
  ASSERT_TRUE(Overlaps("375", "400"));
  ASSERT_TRUE(Overlaps("450", "450"));
  ASSERT_TRUE(Overlaps("450", "500"));
  ASSERT_TRUE(Overlaps("450", "700"));
  ASSERT_TRUE(Overlaps("600", "700"));
}
+
+}  // namespace rocksdb
+
// Run every TEST(...) registered above via the project test harness.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
diff --git a/db/write_batch.cc b/db/write_batch.cc
new file mode 100644 (file)
index 0000000..7a6106a
--- /dev/null
@@ -0,0 +1,303 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+//    sequence: fixed64
+//    count: fixed32
+//    data: record[count]
+// record :=
+//    kTypeValue varstring varstring
+//    kTypeMerge varstring varstring
+//    kTypeDeletion varstring
+// varstring :=
+//    len: varint32
+//    data: uint8[len]
+
+#include "rocksdb/write_batch.h"
+#include "rocksdb/options.h"
+#include "rocksdb/merge_operator.h"
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/memtable.h"
+#include "db/snapshot.h"
+#include "db/write_batch_internal.h"
+#include "util/coding.h"
+#include "util/statistics_imp.h"
+#include <stdexcept>
+
+namespace rocksdb {
+
+// WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
+static const size_t kHeader = 12;
+
+// Reserve at least enough room for the 12-byte header, then initialize it.
+WriteBatch::WriteBatch(size_t reserved_bytes) {
+  rep_.reserve((reserved_bytes > kHeader) ? reserved_bytes : kHeader);
+  Clear();
+}
+
+WriteBatch::~WriteBatch() { }
+
+WriteBatch::Handler::~Handler() { }
+
+// Default Merge: handlers that never expect merge records need not override
+// this, but iterating a batch that contains one will throw.
+void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) {
+  throw std::runtime_error("Handler::Merge not implemented!");
+}
+
+void WriteBatch::Handler::LogData(const Slice& blob) {
+  // If the user has not specified something to do with blobs, then we ignore
+  // them.
+}
+
+// Default: never stop early while iterating a batch.
+bool WriteBatch::Handler::Continue() {
+  return true;
+}
+
+// Reset to the empty state: a zeroed 12-byte header and no records.
+void WriteBatch::Clear() {
+  rep_.clear();
+  rep_.resize(kHeader);
+}
+
+// Number of Put/Delete/Merge records in the batch.  LogData blobs are not
+// counted (PutLogData does not touch the count).
+int WriteBatch::Count() const {
+  return WriteBatchInternal::Count(this);
+}
+
+// Decode rep_ record-by-record and dispatch each to *handler, stopping
+// early if handler->Continue() returns false.  Returns Corruption for a
+// malformed encoding, an unknown tag, or a record count that disagrees
+// with the header.
+Status WriteBatch::Iterate(Handler* handler) const {
+  Slice input(rep_);
+  if (input.size() < kHeader) {
+    return Status::Corruption("malformed WriteBatch (too small)");
+  }
+
+  input.remove_prefix(kHeader);
+  Slice key, value, blob;
+  int found = 0;  // counted records seen (LogData blobs excluded)
+  while (!input.empty() && handler->Continue()) {
+    char tag = input[0];
+    input.remove_prefix(1);
+    switch (tag) {
+      case kTypeValue:
+        if (GetLengthPrefixedSlice(&input, &key) &&
+            GetLengthPrefixedSlice(&input, &value)) {
+          handler->Put(key, value);
+          found++;
+        } else {
+          return Status::Corruption("bad WriteBatch Put");
+        }
+        break;
+      case kTypeDeletion:
+        if (GetLengthPrefixedSlice(&input, &key)) {
+          handler->Delete(key);
+          found++;
+        } else {
+          return Status::Corruption("bad WriteBatch Delete");
+        }
+        break;
+      case kTypeMerge:
+        if (GetLengthPrefixedSlice(&input, &key) &&
+            GetLengthPrefixedSlice(&input, &value)) {
+          handler->Merge(key, value);
+          found++;
+        } else {
+          return Status::Corruption("bad WriteBatch Merge");
+        }
+        break;
+      case kTypeLogData:
+        // Blobs are delivered to the handler but do not increment found.
+        if (GetLengthPrefixedSlice(&input, &blob)) {
+          handler->LogData(blob);
+        } else {
+          return Status::Corruption("bad WriteBatch Blob");
+        }
+        break;
+      default:
+        return Status::Corruption("unknown WriteBatch tag");
+    }
+  }
+  // NOTE(review): a handler that stops early via Continue() also trips this
+  // check; such callers must ignore the returned status.
+  if (found != WriteBatchInternal::Count(this)) {
+    return Status::Corruption("WriteBatch has wrong count");
+  } else {
+    return Status::OK();
+  }
+}
+
+// The record count is stored as a fixed32 at byte offset 8 of the header.
+int WriteBatchInternal::Count(const WriteBatch* b) {
+  return DecodeFixed32(b->rep_.data() + 8);
+}
+
+void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
+  EncodeFixed32(&b->rep_[8], n);
+}
+
+// The starting sequence number is stored as a fixed64 at byte offset 0.
+SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
+  return SequenceNumber(DecodeFixed64(b->rep_.data()));
+}
+
+void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
+  EncodeFixed64(&b->rep_[0], seq);
+}
+
+// Append a Put record: bump the count, then tag + key + value.
+void WriteBatch::Put(const Slice& key, const Slice& value) {
+  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+  rep_.push_back(static_cast<char>(kTypeValue));
+  PutLengthPrefixedSlice(&rep_, key);
+  PutLengthPrefixedSlice(&rep_, value);
+}
+
+// Same as Put(Slice, Slice), but gathers key/value from multiple parts so
+// the caller need not concatenate them into contiguous buffers first.
+void WriteBatch::Put(const SliceParts& key, const SliceParts& value) {
+  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+  rep_.push_back(static_cast<char>(kTypeValue));
+  PutLengthPrefixedSliceParts(&rep_, key);
+  PutLengthPrefixedSliceParts(&rep_, value);
+}
+
+// Append a Delete record: bump the count, then tag + key.
+void WriteBatch::Delete(const Slice& key) {
+  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+  rep_.push_back(static_cast<char>(kTypeDeletion));
+  PutLengthPrefixedSlice(&rep_, key);
+}
+
+// Append a Merge record: bump the count, then tag + key + operand.
+void WriteBatch::Merge(const Slice& key, const Slice& value) {
+  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+  rep_.push_back(static_cast<char>(kTypeMerge));
+  PutLengthPrefixedSlice(&rep_, key);
+  PutLengthPrefixedSlice(&rep_, value);
+}
+
+// Append an opaque blob.  Blobs are replayed to Handler::LogData but do not
+// change the record count (no SetCount call here).
+void WriteBatch::PutLogData(const Slice& blob) {
+  rep_.push_back(static_cast<char>(kTypeLogData));
+  PutLengthPrefixedSlice(&rep_, blob);
+}
+
+namespace {
+// WriteBatch::Handler that applies each record to a MemTable, assigning
+// consecutive sequence numbers starting from the batch's base sequence.
+class MemTableInserter : public WriteBatch::Handler {
+ public:
+  SequenceNumber sequence_;  // sequence number for the next applied record
+  MemTable* mem_;
+  const Options* options_;
+  DBImpl* db_;
+  const bool filter_deletes_;  // drop deletes for keys that cannot exist
+
+  // NOTE(review): db is reinterpret_cast to DBImpl*; callers are assumed to
+  // always pass a DBImpl (or nullptr) -- confirm; static_cast would be safer.
+  MemTableInserter(SequenceNumber sequence, MemTable* mem, const Options* opts,
+                   DB* db, const bool filter_deletes)
+    : sequence_(sequence),
+      mem_(mem),
+      options_(opts),
+      db_(reinterpret_cast<DBImpl*>(db)),
+      filter_deletes_(filter_deletes) {
+    assert(mem_);
+    if (filter_deletes_) {
+      assert(options_);
+      assert(db_);
+    }
+  }
+
+  virtual void Put(const Slice& key, const Slice& value) {
+    // Try an in-place update of an existing entry when enabled; otherwise
+    // (or if the update fails) add a new entry.
+    if (options_->inplace_update_support
+        && mem_->Update(sequence_, kTypeValue, key, value)) {
+      RecordTick(options_->statistics.get(), NUMBER_KEYS_UPDATED);
+    } else {
+      mem_->Add(sequence_, kTypeValue, key, value);
+    }
+    sequence_++;
+  }
+  virtual void Merge(const Slice& key, const Slice& value) {
+    bool perform_merge = false;
+
+    // If the key already has max_successive_merges merge operands at the
+    // head of the memtable, collapse them via FullMerge now instead of
+    // queueing yet another operand.
+    if (options_->max_successive_merges > 0 && db_ != nullptr) {
+      LookupKey lkey(key, sequence_);
+
+      // Count the number of successive merges at the head
+      // of the key in the memtable
+      size_t num_merges = mem_->CountSuccessiveMergeEntries(lkey);
+
+      if (num_merges >= options_->max_successive_merges) {
+        perform_merge = true;
+      }
+    }
+
+    if (perform_merge) {
+      // 1) Get the existing value
+      std::string get_value;
+
+      // Pass in the sequence number so that we also include previous merge
+      // operations in the same batch.
+      SnapshotImpl read_from_snapshot;
+      read_from_snapshot.number_ = sequence_;
+      ReadOptions read_options;
+      read_options.snapshot = &read_from_snapshot;
+
+      // NOTE(review): the Get status is ignored -- confirm a not-found key
+      // (empty get_value) is handled correctly by FullMerge.
+      db_->Get(read_options, key, &get_value);
+      Slice get_value_slice = Slice(get_value);
+
+      // 2) Apply this merge
+      auto merge_operator = options_->merge_operator.get();
+      assert(merge_operator);
+
+      std::deque<std::string> operands;
+      operands.push_front(value.ToString());
+      std::string new_value;
+      if (!merge_operator->FullMerge(key,
+                                     &get_value_slice,
+                                     operands,
+                                     &new_value,
+                                     options_->info_log.get())) {
+          // Failed to merge!
+          RecordTick(options_->statistics.get(), NUMBER_MERGE_FAILURES);
+
+          // Store the delta in memtable
+          perform_merge = false;
+      } else {
+        // 3) Add value to memtable
+        mem_->Add(sequence_, kTypeValue, key, new_value);
+      }
+    }
+
+    if (!perform_merge) {
+      // Add merge operator to memtable
+      mem_->Add(sequence_, kTypeMerge, key, value);
+    }
+
+    sequence_++;
+  }
+  virtual void Delete(const Slice& key) {
+    // Optionally drop the delete when the key definitely does not exist,
+    // checked against a snapshot at this record's sequence number.
+    // NOTE(review): a filtered delete returns before sequence_++, so it
+    // does not consume a sequence number -- confirm this is intended.
+    if (filter_deletes_) {
+      SnapshotImpl read_from_snapshot;
+      read_from_snapshot.number_ = sequence_;
+      ReadOptions ropts;
+      ropts.snapshot = &read_from_snapshot;
+      std::string value;
+      if (!db_->KeyMayExist(ropts, key, &value)) {
+        RecordTick(options_->statistics.get(), NUMBER_FILTERED_DELETES);
+        return;
+      }
+    }
+    mem_->Add(sequence_, kTypeDeletion, key, Slice());
+    sequence_++;
+  }
+};
+}  // namespace
+
+// Apply every record in *b to *mem via MemTableInserter, starting at the
+// batch's stored base sequence number.
+Status WriteBatchInternal::InsertInto(const WriteBatch* b, MemTable* mem,
+                                      const Options* opts, DB* db,
+                                      const bool filter_deletes) {
+  MemTableInserter inserter(WriteBatchInternal::Sequence(b), mem, opts, db,
+                            filter_deletes);
+  return b->Iterate(&inserter);
+}
+
+// Replace b's serialized representation wholesale; contents must carry at
+// least a full header.
+void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+  assert(contents.size() >= kHeader);
+  b->rep_.assign(contents.data(), contents.size());
+}
+
+// Concatenate src's records (everything after src's header) onto dst and
+// add src's record count to dst's.  dst's header sequence is untouched.
+void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) {
+  SetCount(dst, Count(dst) + Count(src));
+  assert(src->rep_.size() >= kHeader);
+  dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader);
+}
+
+}  // namespace rocksdb
diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h
new file mode 100644 (file)
index 0000000..b899173
--- /dev/null
@@ -0,0 +1,57 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+class MemTable;
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+class WriteBatchInternal {
+ public:
+  // Return the number of entries in the batch.
+  static int Count(const WriteBatch* batch);
+
+  // Set the count for the number of entries in the batch.
+  static void SetCount(WriteBatch* batch, int n);
+
+  // Return the sequence number for the start of this batch.
+  static SequenceNumber Sequence(const WriteBatch* batch);
+
+  // Store the specified number as the sequence number for the start of
+  // this batch.
+  static void SetSequence(WriteBatch* batch, SequenceNumber seq);
+
+  // Return the full serialized representation (header + records).
+  static Slice Contents(const WriteBatch* batch) {
+    return Slice(batch->rep_);
+  }
+
+  // Size in bytes of the serialized representation, including the header.
+  static size_t ByteSize(const WriteBatch* batch) {
+    return batch->rep_.size();
+  }
+
+  // Replace the batch's contents with the given serialized representation.
+  static void SetContents(WriteBatch* batch, const Slice& contents);
+
+  // Inserts batch entries into memtable
+  // Drops deletes in batch if filter_del is set to true and
+  // db->KeyMayExist returns false
+  static Status InsertInto(const WriteBatch* batch, MemTable* memtable,
+                           const Options* opts, DB* db = nullptr,
+                           const bool filter_del = false);
+
+  // Append src's records onto dst, updating dst's count accordingly.
+  static void Append(WriteBatch* dst, const WriteBatch* src);
+};
+
+}  // namespace rocksdb
diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc
new file mode 100644 (file)
index 0000000..931d8f3
--- /dev/null
@@ -0,0 +1,263 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/db.h"
+
+#include <memory>
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+// Apply all records in *b to a fresh skip-list memtable and render each
+// resulting entry as "Op(key[, value])@seq" in memtable iteration order.
+// Appends the status string on failure, or "CountMismatch()" when the
+// entries seen disagree with the batch's header count.
+static std::string PrintContents(WriteBatch* b) {
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto factory = std::make_shared<SkipListFactory>();
+  Options options;
+  options.memtable_factory = factory;
+  MemTable* mem = new MemTable(cmp, options);
+  mem->Ref();
+  std::string state;
+  Status s = WriteBatchInternal::InsertInto(b, mem, &options);
+  int count = 0;
+  Iterator* iter = mem->NewIterator();
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey ikey;
+    memset((void *)&ikey, 0, sizeof(ikey));
+    ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
+    switch (ikey.type) {
+      case kTypeValue:
+        state.append("Put(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case kTypeMerge:
+        state.append("Merge(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case kTypeDeletion:
+        state.append("Delete(");
+        state.append(ikey.user_key.ToString());
+        state.append(")");
+        count++;
+        break;
+      case kTypeLogData:
+        // LogData blobs are never inserted into the memtable.
+        assert(false);
+        break;
+    }
+    state.append("@");
+    state.append(NumberToString(ikey.sequence));
+  }
+  delete iter;
+  if (!s.ok()) {
+    state.append(s.ToString());
+  } else if (count != WriteBatchInternal::Count(b)) {
+    state.append("CountMismatch()");
+  }
+  // Unref() hands back a pointer suitable for delete once the last
+  // reference is dropped.
+  delete mem->Unref();
+  return state;
+}
+
+class WriteBatchTest { };
+
+// A new batch has no records and renders as the empty string.
+TEST(WriteBatchTest, Empty) {
+  WriteBatch batch;
+  ASSERT_EQ("", PrintContents(&batch));
+  ASSERT_EQ(0, WriteBatchInternal::Count(&batch));
+  ASSERT_EQ(0, batch.Count());
+}
+
+// Records receive consecutive sequence numbers starting at the batch's
+// base sequence (100 here); output order comes from the memtable.
+TEST(WriteBatchTest, Multiple) {
+  WriteBatch batch;
+  batch.Put(Slice("foo"), Slice("bar"));
+  batch.Delete(Slice("box"));
+  batch.Put(Slice("baz"), Slice("boo"));
+  WriteBatchInternal::SetSequence(&batch, 100);
+  ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch));
+  ASSERT_EQ(3, WriteBatchInternal::Count(&batch));
+  ASSERT_EQ("Put(baz, boo)@102"
+            "Delete(box)@101"
+            "Put(foo, bar)@100",
+            PrintContents(&batch));
+  ASSERT_EQ(3, batch.Count());
+}
+
+// Truncating the serialized batch makes the last record unparseable;
+// replay reports the records decoded before the corruption point.
+TEST(WriteBatchTest, Corruption) {
+  WriteBatch batch;
+  batch.Put(Slice("foo"), Slice("bar"));
+  batch.Delete(Slice("box"));
+  WriteBatchInternal::SetSequence(&batch, 200);
+  Slice contents = WriteBatchInternal::Contents(&batch);
+  WriteBatchInternal::SetContents(&batch,
+                                  Slice(contents.data(),contents.size()-1));
+  ASSERT_EQ("Put(foo, bar)@200"
+            "Corruption: bad WriteBatch Delete",
+            PrintContents(&batch));
+}
+
+// Append concatenates records and sums counts; the destination batch's
+// sequence number (200) governs replay of the combined batch.
+TEST(WriteBatchTest, Append) {
+  WriteBatch b1, b2;
+  WriteBatchInternal::SetSequence(&b1, 200);
+  WriteBatchInternal::SetSequence(&b2, 300);
+  WriteBatchInternal::Append(&b1, &b2);
+  ASSERT_EQ("",
+            PrintContents(&b1));
+  ASSERT_EQ(0, b1.Count());
+  b2.Put("a", "va");
+  WriteBatchInternal::Append(&b1, &b2);
+  ASSERT_EQ("Put(a, va)@200",
+            PrintContents(&b1));
+  ASSERT_EQ(1, b1.Count());
+  b2.Clear();
+  b2.Put("b", "vb");
+  WriteBatchInternal::Append(&b1, &b2);
+  ASSERT_EQ("Put(a, va)@200"
+            "Put(b, vb)@201",
+            PrintContents(&b1));
+  ASSERT_EQ(2, b1.Count());
+  b2.Delete("foo");
+  WriteBatchInternal::Append(&b1, &b2);
+  ASSERT_EQ("Put(a, va)@200"
+            "Put(b, vb)@202"
+            "Put(b, vb)@201"
+            "Delete(foo)@203",
+            PrintContents(&b1));
+  ASSERT_EQ(4, b1.Count());
+}
+
+namespace {
+  // Handler that records every callback as text so tests can assert on the
+  // exact sequence of operations Iterate reports.
+  struct TestHandler : public WriteBatch::Handler {
+    std::string seen;
+    virtual void Put(const Slice& key, const Slice& value) {
+      seen += "Put(" + key.ToString() + ", " + value.ToString() + ")";
+    }
+    virtual void Merge(const Slice& key, const Slice& value) {
+      seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")";
+    }
+    virtual void LogData(const Slice& blob) {
+      seen += "LogData(" + blob.ToString() + ")";
+    }
+    virtual void Delete(const Slice& key) {
+      seen += "Delete(" + key.ToString() + ")";
+    }
+  };
+}
+
+// LogData blobs are excluded from Count() and never reach the memtable,
+// but Iterate still reports them to the handler in insertion order.
+TEST(WriteBatchTest, Blob) {
+  WriteBatch batch;
+  batch.Put(Slice("k1"), Slice("v1"));
+  batch.Put(Slice("k2"), Slice("v2"));
+  batch.Put(Slice("k3"), Slice("v3"));
+  batch.PutLogData(Slice("blob1"));
+  batch.Delete(Slice("k2"));
+  batch.PutLogData(Slice("blob2"));
+  batch.Merge(Slice("foo"), Slice("bar"));
+  ASSERT_EQ(5, batch.Count());
+  ASSERT_EQ("Merge(foo, bar)@4"
+            "Put(k1, v1)@0"
+            "Delete(k2)@3"
+            "Put(k2, v2)@1"
+            "Put(k3, v3)@2",
+            PrintContents(&batch));
+
+  TestHandler handler;
+  batch.Iterate(&handler);
+  ASSERT_EQ(
+            "Put(k1, v1)"
+            "Put(k2, v2)"
+            "Put(k3, v3)"
+            "LogData(blob1)"
+            "Delete(k2)"
+            "LogData(blob2)"
+            "Merge(foo, bar)",
+            handler.seen);
+}
+
+// Handler::Continue() is consulted before each record: this handler stops
+// after three callbacks (Put, LogData, Delete), so the trailing blob and
+// merge are never delivered.
+TEST(WriteBatchTest, Continue) {
+  WriteBatch batch;
+
+  struct Handler : public TestHandler {
+    int num_seen = 0;
+    virtual void Put(const Slice& key, const Slice& value) {
+      ++num_seen;
+      TestHandler::Put(key, value);
+    }
+    virtual void Merge(const Slice& key, const Slice& value) {
+      ++num_seen;
+      TestHandler::Merge(key, value);
+    }
+    virtual void LogData(const Slice& blob) {
+      ++num_seen;
+      TestHandler::LogData(blob);
+    }
+    virtual void Delete(const Slice& key) {
+      ++num_seen;
+      TestHandler::Delete(key);
+    }
+    virtual bool Continue() override {
+      return num_seen < 3;
+    }
+  } handler;
+
+  batch.Put(Slice("k1"), Slice("v1"));
+  batch.PutLogData(Slice("blob1"));
+  batch.Delete(Slice("k1"));
+  batch.PutLogData(Slice("blob2"));
+  batch.Merge(Slice("foo"), Slice("bar"));
+  batch.Iterate(&handler);
+  ASSERT_EQ(
+            "Put(k1, v1)"
+            "LogData(blob1)"
+            "Delete(k1)",
+            handler.seen);
+}
+
+// SliceParts Put variants concatenate their pieces: the stored key/value is
+// the parts joined in order ("header"+"payload" -> "headerpayload").
+TEST(WriteBatchTest, PutGatherSlices) {
+  WriteBatch batch;
+  batch.Put(Slice("foo"), Slice("bar"));
+
+  {
+    // Try a write where the key is one slice but the value is two
+    Slice key_slice("baz");
+    Slice value_slices[2] = { Slice("header"), Slice("payload") };
+    batch.Put(SliceParts(&key_slice, 1),
+              SliceParts(value_slices, 2));
+  }
+
+  {
+    // One where the key is composite but the value is a single slice
+    Slice key_slices[3] = { Slice("key"), Slice("part2"), Slice("part3") };
+    Slice value_slice("value");
+    batch.Put(SliceParts(key_slices, 3),
+              SliceParts(&value_slice, 1));
+  }
+
+  WriteBatchInternal::SetSequence(&batch, 100);
+  ASSERT_EQ("Put(baz, headerpayload)@101"
+            "Put(foo, bar)@100"
+            "Put(keypart2part3, value)@102",
+            PrintContents(&batch));
+  ASSERT_EQ(3, batch.Count());
+}
+
+}  // namespace rocksdb
+
+// Test entry point: run every TEST registered in this file.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/doc/doc.css b/doc/doc.css
new file mode 100644 (file)
index 0000000..700c564
--- /dev/null
@@ -0,0 +1,89 @@
+/* Stylesheet for the RocksDB documentation pages (doc/index.html). */
+body {
+  margin-left: 0.5in;
+  margin-right: 0.5in;
+  background: white;
+  color: black;
+}
+
+/* Headings are pulled slightly into the left margin. */
+h1 {
+  margin-left: -0.2in;
+  font-size: 14pt;
+}
+h2 {
+  margin-left: -0in;
+  font-size: 12pt;
+}
+h3 {
+  margin-left: -0in;
+}
+h4 {
+  margin-left: -0in;
+}
+hr {
+  margin-left: -0in;
+}
+
+/* Definition lists: definition term bold */
+dt {
+  font-weight: bold;
+}
+
+address {
+  text-align: center;
+}
+code,samp,var {
+  color: blue;
+}
+kbd {
+  color: #600000;
+}
+div.note p {
+  float: right;
+  width: 3in;
+  margin-right: 0%;
+  padding: 1px;
+  border: 2px solid #6060a0;
+  background-color: #fffff0;
+}
+
+ul {
+  margin-top: -0em;
+  margin-bottom: -0em;
+}
+
+ol {
+  margin-top: -0em;
+  margin-bottom: -0em;
+}
+
+UL.nobullets {
+  list-style-type: none;
+  list-style-image: none;
+  margin-left: -1em;
+}
+
+p {
+  margin: 1em 0 1em 0;
+  padding: 0 0 0 0;
+}
+
+pre {
+  line-height: 1.3em;
+  padding: 0.4em 0 0.8em 0;
+  margin:  0 0 0 0;
+  border:  0 0 0 0;
+  color: blue;
+}
+
+/* Centered benchmark/data tables. */
+.datatable {
+  margin-left: auto;
+  margin-right: auto;
+  margin-top: 2em;
+  margin-bottom: 2em;
+  border: 1px solid;
+}
+
+.datatable td,th {
+  padding: 0 0.5em 0 0.5em;
+  text-align: right;
+}
diff --git a/doc/index.html b/doc/index.html
new file mode 100644 (file)
index 0000000..84c4d13
--- /dev/null
@@ -0,0 +1,831 @@
+<!DOCTYPE html>
+<html>
+<head>
+<link rel="stylesheet" type="text/css" href="doc.css" />
+<title>RocksDB</title>
+</head>
+
+<body>
+<h1>RocksDB</h1>
+<address>The Facebook Database Engineering Team</address>
+<address>Based on earlier work on leveldb by Sanjay Ghemawat
+               (sanjay@google.com) and Jeff Dean (jeff@google.com)</address>
+<p>
+The <code>rocksdb</code> library provides a persistent key value store.  Keys and
+values are arbitrary byte arrays.  The keys are ordered within the key
+value store according to a user-specified comparator function.
+
+<p>
+<h1>Opening A Database</h1>
+<p>
+A <code>rocksdb</code> database has a name which corresponds to a file system
+directory.  All of the contents of database are stored in this
+directory.  The following example shows how to open a database,
+creating it if necessary:
+<p>
+<pre>
+  #include &lt;cassert&gt;
+  #include "rocksdb/db.h"
+
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.create_if_missing = true;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &amp;db);
+  assert(status.ok());
+  ...
+</pre>
+If you want to raise an error if the database already exists, add
+the following line before the <code>rocksdb::DB::Open</code> call:
+<pre>
+  options.error_if_exists = true;
+</pre>
+<h1>Status</h1>
+<p>
+You may have noticed the <code>rocksdb::Status</code> type above.  Values of this
+type are returned by most functions in <code>rocksdb</code> that may encounter an
+error.  You can check if such a result is ok, and also print an
+associated error message:
+<p>
+<pre>
+   rocksdb::Status s = ...;
+   if (!s.ok()) cerr &lt;&lt; s.ToString() &lt;&lt; endl;
+</pre>
+<h1>Closing A Database</h1>
+<p>
+When you are done with a database, just delete the database object.
+Example:
+<p>
+<pre>
+  ... open the db as described above ...
+  ... do something with db ...
+  delete db;
+</pre>
+<h1>Reads And Writes</h1>
+<p>
+The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to
+modify/query the database.  For example, the following code
+moves the value stored under key1 to key2.
+<pre>
+  std::string value;
+  rocksdb::Status s = db-&gt;Get(rocksdb::ReadOptions(), key1, &amp;value);
+  if (s.ok()) s = db-&gt;Put(rocksdb::WriteOptions(), key2, value);
+  if (s.ok()) s = db-&gt;Delete(rocksdb::WriteOptions(), key1);
+</pre>
+
+<h1>Atomic Updates</h1>
+<p>
+Note that if the process dies after the Put of key2 but before the
+delete of key1, the same value may be left stored under multiple keys.
+Such problems can be avoided by using the <code>WriteBatch</code> class to
+atomically apply a set of updates:
+<p>
+<pre>
+  #include "rocksdb/write_batch.h"
+  ...
+  std::string value;
+  rocksdb::Status s = db-&gt;Get(rocksdb::ReadOptions(), key1, &amp;value);
+  if (s.ok()) {
+    rocksdb::WriteBatch batch;
+    batch.Delete(key1);
+    batch.Put(key2, value);
+    s = db-&gt;Write(rocksdb::WriteOptions(), &amp;batch);
+  }
+</pre>
+The <code>WriteBatch</code> holds a sequence of edits to be made to the database,
+and these edits within the batch are applied in order.  Note that we
+called <code>Delete</code> before <code>Put</code> so that if <code>key1</code> is identical to <code>key2</code>,
+we do not end up erroneously dropping the value entirely.
+<p>
+Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to
+speed up bulk updates by placing lots of individual mutations into the
+same batch.
+
+<h1>Synchronous Writes</h1>
+By default, each write to <code>rocksdb</code> is asynchronous: it
+returns after pushing the write from the process into the operating
+system.  The transfer from operating system memory to the underlying
+persistent storage happens asynchronously.  The <code>sync</code> flag
+can be turned on for a particular write to make the write operation
+not return until the data being written has been pushed all the way to
+persistent storage.  (On Posix systems, this is implemented by calling
+either <code>fsync(...)</code> or <code>fdatasync(...)</code> or
+<code>msync(..., MS_SYNC)</code> before the write operation returns.)
+<pre>
+  rocksdb::WriteOptions write_options;
+  write_options.sync = true;
+  db-&gt;Put(write_options, ...);
+</pre>
+Asynchronous writes are often more than a thousand times as fast as
+synchronous writes.  The downside of asynchronous writes is that a
+crash of the machine may cause the last few updates to be lost.  Note
+that a crash of just the writing process (i.e., not a reboot) will not
+cause any loss since even when <code>sync</code> is false, an update
+is pushed from the process memory into the operating system before it
+is considered done.
+
+<p>
+Asynchronous writes can often be used safely.  For example, when
+loading a large amount of data into the database you can handle lost
+updates by restarting the bulk load after a crash.  A hybrid scheme is
+also possible where every Nth write is synchronous, and in the event
+of a crash, the bulk load is restarted just after the last synchronous
+write finished by the previous run.  (The synchronous write can update
+a marker that describes where to restart on a crash.)
+
+<p>
+<code>WriteBatch</code> provides an alternative to asynchronous writes.
+Multiple updates may be placed in the same <code>WriteBatch</code> and
+applied together using a synchronous write (i.e.,
+<code>write_options.sync</code> is set to true).  The extra cost of
+the synchronous write will be amortized across all of the writes in
+the batch.
+
+<p>
+We also provide a way to completely disable Write Ahead Log for a
+particular write. If you set write_option.disableWAL to true, the
+write will not go to the log at all and may be lost in an event of
+process crash.
+
+<p>
+When opening a DB, you can disable syncing of data files by setting
+Options::disableDataSync to true. This can be useful when doing
+bulk-loading or big idempotent operations. Once the operation is
+finished, you can manually call sync() to flush all dirty buffers
+to stable storage.
+
+<p>
+RocksDB by default uses faster fdatasync() to sync files. If you want
+to use fsync(), you can set Options::use_fsync to true. You should set
+this to true on filesystems like ext3 that can lose files after a
+reboot.
+
+<p>
+<h1>Concurrency</h1>
+<p>
+A database may only be opened by one process at a time.
+The <code>rocksdb</code> implementation acquires a lock from the
+operating system to prevent misuse.  Within a single process, the
+same <code>rocksdb::DB</code> object may be safely shared by multiple
+concurrent threads.  I.e., different threads may write into or fetch
+iterators or call <code>Get</code> on the same database without any
+external synchronization (the rocksdb implementation will
+automatically do the required synchronization).  However other objects
+(like Iterator and WriteBatch) may require external synchronization.
+If two threads share such an object, they must protect access to it
+using their own locking protocol.  More details are available in
+the public header files.
+
+<p>
+<h1>Merge operators</h1>
+<p>
+Merge operators provide efficient support for read-modify-write operation.
+More on the interface and implementation can be found on:
+<p>
+<a href="https://github.com/facebook/rocksdb/wiki/Merge-Operator">
+    Merge Operator</a>
+<p>
+<a href="https://github.com/facebook/rocksdb/wiki/Merge-Operator-Implementation">
+    Merge Operator Implementation</a>
+
+<p>
+<h1>Iteration</h1>
+<p>
+The following example demonstrates how to print all key,value pairs
+in a database.
+<p>
+<pre>
+  rocksdb::Iterator* it = db-&gt;NewIterator(rocksdb::ReadOptions());
+  for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
+    cout &lt;&lt; it-&gt;key().ToString() &lt;&lt; ": "  &lt;&lt; it-&gt;value().ToString() &lt;&lt; endl;
+  }
+  assert(it-&gt;status().ok());  // Check for any errors found during the scan
+  delete it;
+</pre>
+The following variation shows how to process just the keys in the
+range <code>[start,limit)</code>:
+<p>
+<pre>
+  for (it-&gt;Seek(start);
+       it-&gt;Valid() &amp;&amp; it-&gt;key().ToString() &lt; limit;
+       it-&gt;Next()) {
+    ...
+  }
+</pre>
+You can also process entries in reverse order.  (Caveat: reverse
+iteration may be somewhat slower than forward iteration.)
+<p>
+<pre>
+  for (it-&gt;SeekToLast(); it-&gt;Valid(); it-&gt;Prev()) {
+    ...
+  }
+</pre>
+<h1>Snapshots</h1>
+<p>
+Snapshots provide consistent read-only views over the entire state of
+the key-value store.  <code>ReadOptions::snapshot</code> may be non-NULL to indicate
+that a read should operate on a particular version of the DB state.
+If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an
+implicit snapshot of the current state.
+<p>
+Snapshots are created by the DB::GetSnapshot() method:
+<p>
+<pre>
+  rocksdb::ReadOptions options;
+  options.snapshot = db-&gt;GetSnapshot();
+  ... apply some updates to db ...
+  rocksdb::Iterator* iter = db-&gt;NewIterator(options);
+  ... read using iter to view the state when the snapshot was created ...
+  delete iter;
+  db-&gt;ReleaseSnapshot(options.snapshot);
+</pre>
+Note that when a snapshot is no longer needed, it should be released
+using the DB::ReleaseSnapshot interface.  This allows the
+implementation to get rid of state that was being maintained just to
+support reading as of that snapshot.
+<h1>Slice</h1>
+<p>
+The return value of the <code>it->key()</code> and <code>it->value()</code> calls above
+are instances of the <code>rocksdb::Slice</code> type.  <code>Slice</code> is a simple
+structure that contains a length and a pointer to an external byte
+array.  Returning a <code>Slice</code> is a cheaper alternative to returning a
+<code>std::string</code> since we do not need to copy potentially large keys and
+values.  In addition, <code>rocksdb</code> methods do not return null-terminated
+C-style strings since <code>rocksdb</code> keys and values are allowed to
+contain '\0' bytes.
+<p>
+C++ strings and null-terminated C-style strings can be easily converted
+to a Slice:
+<p>
+<pre>
+   rocksdb::Slice s1 = "hello";
+
+   std::string str("world");
+   rocksdb::Slice s2 = str;
+</pre>
+A Slice can be easily converted back to a C++ string:
+<pre>
+   std::string str = s1.ToString();
+   assert(str == std::string("hello"));
+</pre>
+Be careful when using Slices since it is up to the caller to ensure that
+the external byte array into which the Slice points remains live while
+the Slice is in use.  For example, the following is buggy:
+<p>
+<pre>
+   rocksdb::Slice slice;
+   if (...) {
+     std::string str = ...;
+     slice = str;
+   }
+   Use(slice);
+</pre>
+When the <code>if</code> statement goes out of scope, <code>str</code> will be destroyed and the
+backing storage for <code>slice</code> will disappear.
+<p>
+<h1>Comparators</h1>
+<p>
+The preceding examples used the default ordering function for key,
+which orders bytes lexicographically.  You can however supply a custom
+comparator when opening a database.  For example, suppose each
+database key consists of two numbers and we should sort by the first
+number, breaking ties by the second number.  First, define a proper
+subclass of <code>rocksdb::Comparator</code> that expresses these rules:
+<p>
+<pre>
+  class TwoPartComparator : public rocksdb::Comparator {
+   public:
+    // Three-way comparison function:
+    //   if a &lt; b: negative result
+    //   if a &gt; b: positive result
+    //   else: zero result
+    int Compare(const rocksdb::Slice&amp; a, const rocksdb::Slice&amp; b) const {
+      int a1, a2, b1, b2;
+      ParseKey(a, &amp;a1, &amp;a2);
+      ParseKey(b, &amp;b1, &amp;b2);
+      if (a1 &lt; b1) return -1;
+      if (a1 &gt; b1) return +1;
+      if (a2 &lt; b2) return -1;
+      if (a2 &gt; b2) return +1;
+      return 0;
+    }
+
+    // Ignore the following methods for now:
+    const char* Name() const { return "TwoPartComparator"; }
+    void FindShortestSeparator(std::string*, const rocksdb::Slice&amp;) const { }
+    void FindShortSuccessor(std::string*) const { }
+  };
+</pre>
+Now create a database using this custom comparator:
+<p>
+<pre>
+  TwoPartComparator cmp;
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.create_if_missing = true;
+  options.comparator = &amp;cmp;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &amp;db);
+  ...
+</pre>
+<h2>Backwards compatibility</h2>
+<p>
+The result of the comparator's <code>Name</code> method is attached to the
+database when it is created, and is checked on every subsequent
+database open.  If the name changes, the <code>rocksdb::DB::Open</code> call will
+fail.  Therefore, change the name if and only if the new key format
+and comparison function are incompatible with existing databases, and
+it is ok to discard the contents of all existing databases.
+<p>
+You can however still gradually evolve your key format over time with
+a little bit of pre-planning.  For example, you could store a version
+number at the end of each key (one byte should suffice for most uses).
+When you wish to switch to a new key format (e.g., adding an optional
+third part to the keys processed by <code>TwoPartComparator</code>),
+(a) keep the same comparator name (b) increment the version number
+for new keys (c) change the comparator function so it uses the
+version numbers found in the keys to decide how to interpret them.
+
+
+<p>
+<h1>MemTable and Table factories</h1>
+<p>
+By default, we keep the data in memory in skiplist memtable and the data
+on disk in a table format described here:
+<a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Table-Format">
+    RocksDB Table Format</a>.
+<p>
+Since one of the goals of RocksDB is to have
+different parts of the system easily pluggable, we support different
+implementations of both memtable and table format. You can supply
+your own memtable factory by setting <code>Options::memtable_factory</code>
+and your own table factory by setting <code>Options::table_factory</code>.
+For available memtable factories, please refer to
+<code>rocksdb/memtablerep.h</code> and for table factories to
+<code>rocksdb/table.h</code>. These features are both in active development
+and please be wary of any API changes that might break your application
+going forward.
+<p>
+You can also read more about memtables here:
+<a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide#memtables">
+Memtables wiki
+</a>
+
+<p>
+<h1>Performance</h1>
+<p>
+Performance can be tuned by changing the default values of the
+types defined in <code>include/rocksdb/options.h</code>.
+
+<p>
+<h2>Block size</h2>
+<p>
+<code>rocksdb</code> groups adjacent keys together into the same block and such a
+block is the unit of transfer to and from persistent storage.  The
+default block size is approximately 4096 uncompressed bytes.
+Applications that mostly do bulk scans over the contents of the
+database may wish to increase this size.  Applications that do a lot
+of point reads of small values may wish to switch to a smaller block
+size if performance measurements indicate an improvement.  There isn't
+much benefit in using blocks smaller than one kilobyte, or larger than
+a few megabytes.  Also note that compression will be more effective
+with larger block sizes. To change block size parameter, use
+<code>Options::block_size</code>.
+<p>
+<h2>Write buffer</h2>
+<p>
+<code>Options::write_buffer_size</code> specifies the amount of data
+to build up in memory before converting to a sorted on-disk file.
+Larger values increase performance, especially during bulk loads.
+Up to max_write_buffer_number write buffers may be held in memory
+at the same time,
+so you may wish to adjust this parameter to control memory usage.
+Also, a larger write buffer will result in a longer recovery time
+the next time the database is opened.
+Related option is
+<code>Options::max_write_buffer_number</code>, which is maximum number
+of write buffers that are built up in memory. The default is 2, so that
+when 1 write buffer is being flushed to storage, new writes can continue
+to the other write buffer.
+<code>Options::min_write_buffer_number_to_merge</code> is the minimum number
+of write buffers that will be merged together before writing to storage.
+If set to 1, then all write buffers are flushed to L0 as individual files and
+this increases read amplification because a get request has to check in all
+of these files. Also, an in-memory merge may result in writing less
+data to storage if there are duplicate records in each of these
+individual write buffers.  Default: 1
+<p>
+<h2>Compression</h2>
+<p>
+Each block is individually compressed before being written to
+persistent storage.  Compression is on by default since the default
+compression method is very fast, and is automatically disabled for
+uncompressible data.  In rare cases, applications may want to disable
+compression entirely, but should only do so if benchmarks show a
+performance improvement:
+<p>
+<pre>
+  rocksdb::Options options;
+  options.compression = rocksdb::kNoCompression;
+  ... rocksdb::DB::Open(options, name, ...) ....
+</pre>
+<h2>Cache</h2>
+<p>
+The contents of the database are stored in a set of files in the
+filesystem and each file stores a sequence of compressed blocks.  If
+<code>options.block_cache</code> is non-NULL, it is used to cache frequently
+used uncompressed block contents. If <code>options.block_cache_compressed</code>
+is non-NULL, it is used to cache frequently used compressed blocks. Compressed
+cache is an alternative to OS cache, which also caches compressed blocks. If
+compressed cache is used, the OS cache will be disabled automatically by setting
+<code>options.allow_os_buffer</code> to false.
+<p>
+<pre>
+  #include "rocksdb/cache.h"
+
+  rocksdb::Options options;
+  options.block_cache = rocksdb::NewLRUCache(100 * 1048576);  // 100MB uncompressed cache
+  options.block_cache_compressed = rocksdb::NewLRUCache(100 * 1048576);  // 100MB compressed cache
+  rocksdb::DB* db;
+  rocksdb::DB::Open(options, name, &amp;db);
+  ... use the db ...
+  delete db;
+  delete options.block_cache;
+  delete options.block_cache_compressed;
+</pre>
+<p>
+When performing a bulk read, the application may wish to disable
+caching so that the data processed by the bulk read does not end up
+displacing most of the cached contents.  A per-iterator option can be
+used to achieve this:
+<p>
+<pre>
+  rocksdb::ReadOptions options;
+  options.fill_cache = false;
+  rocksdb::Iterator* it = db-&gt;NewIterator(options);
+  for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
+    ...
+  }
+</pre>
+<p>
+You can also disable block cache by setting <code>options.no_block_cache</code>
+to true.
+<h2>Key Layout</h2>
+<p>
+Note that the unit of disk transfer and caching is a block.  Adjacent
+keys (according to the database sort order) will usually be placed in
+the same block.  Therefore the application can improve its performance
+by placing keys that are accessed together near each other and placing
+infrequently used keys in a separate region of the key space.
+<p>
+For example, suppose we are implementing a simple file system on top
+of <code>rocksdb</code>.  The types of entries we might wish to store are:
+<p>
+<pre>
+   filename -&gt; permission-bits, length, list of file_block_ids
+   file_block_id -&gt; data
+</pre>
+We might want to prefix <code>filename</code> keys with one letter (say '/') and the
+<code>file_block_id</code> keys with a different letter (say '0') so that scans
+over just the metadata do not force us to fetch and cache bulky file
+contents.
+<p>
+<h2>Filters</h2>
+<p>
+Because of the way <code>rocksdb</code> data is organized on disk,
+a single <code>Get()</code> call may involve multiple reads from disk.
+The optional <code>FilterPolicy</code> mechanism can be used to reduce
+the number of disk reads substantially.
+<pre>
+   rocksdb::Options options;
+   options.filter_policy = NewBloomFilter(10);
+   rocksdb::DB* db;
+   rocksdb::DB::Open(options, "/tmp/testdb", &amp;db);
+   ... use the database ...
+   delete db;
+   delete options.filter_policy;
+</pre>
+The preceding code associates a
+<a href="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</a>
+based filtering policy with the database.  Bloom filter based
+filtering relies on keeping some number of bits of data in memory per
+key (in this case 10 bits per key since that is the argument we passed
+to NewBloomFilter).  This filter will reduce the number of unnecessary
+disk reads needed for <code>Get()</code> calls by a factor of
+approximately 100.  Increasing the bits per key will lead to a
+larger reduction at the cost of more memory usage.  We recommend that
+applications whose working set does not fit in memory and that do a
+lot of random reads set a filter policy.
+<p>
+If you are using a custom comparator, you should ensure that the filter
+policy you are using is compatible with your comparator.  For example,
+consider a comparator that ignores trailing spaces when comparing keys.
+<code>NewBloomFilter</code> must not be used with such a comparator.
+Instead, the application should provide a custom filter policy that
+also ignores trailing spaces.  For example:
+<pre>
+  class CustomFilterPolicy : public rocksdb::FilterPolicy {
+   private:
+    FilterPolicy* builtin_policy_;
+   public:
+    CustomFilterPolicy() : builtin_policy_(NewBloomFilter(10)) { }
+    ~CustomFilterPolicy() { delete builtin_policy_; }
+
+    const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
+
+    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      std::vector&lt;Slice&gt; trimmed(n);
+      for (int i = 0; i &lt; n; i++) {
+        trimmed[i] = RemoveTrailingSpaces(keys[i]);
+      }
+      return builtin_policy_-&gt;CreateFilter(&amp;trimmed[0], n, dst);
+    }
+
+    bool KeyMayMatch(const Slice&amp; key, const Slice&amp; filter) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      return builtin_policy_-&gt;KeyMayMatch(RemoveTrailingSpaces(key), filter);
+    }
+  };
+</pre>
+<p>
+Advanced applications may provide a filter policy that does not use
+a bloom filter but uses some other mechanism for summarizing a set
+of keys.  See <code>rocksdb/filter_policy.h</code> for detail.
+<p>
+<h1>Checksums</h1>
+<p>
+<code>rocksdb</code> associates checksums with all data it stores in the file system.
+There are two separate controls provided over how aggressively these
+checksums are verified:
+<p>
+<ul>
+<li> <code>ReadOptions::verify_checksums</code> may be set to true to force
+  checksum verification of all data that is read from the file system on
+  behalf of a particular read.  By default, no such verification is
+  done.
+<p>
+<li> <code>Options::paranoid_checks</code> may be set to true before opening a
+  database to make the database implementation raise an error as soon as
+  it detects an internal corruption.  Depending on which portion of the
+  database has been corrupted, the error may be raised when the database
+  is opened, or later by another database operation.  By default,
+  paranoid checking is off so that the database can be used even if
+  parts of its persistent storage have been corrupted.
+<p>
+  If a database is corrupted (perhaps it cannot be opened when
+  paranoid checking is turned on), the <code>rocksdb::RepairDB</code> function
+  may be used to recover as much of the data as possible.
+<p>
+</ul>
+
+<p>
+<h1>Compaction</h1>
+<p>
+You can read more on Compactions here:
+<a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide#multi-threaded-compactions">
+    Multi-threaded compactions
+</a>
+<p>
+Here we give overview of the options that impact behavior of Compactions:
+<ul>
+<p>
+<li><code>Options::compaction_style</code> - RocksDB currently supports two
+compaction algorithms - Universal  style and Level style. This option switches
+between the two.  Can be kCompactionStyleUniversal or kCompactionStyleLevel.
+If this is kCompactionStyleUniversal, then you can configure universal style
+parameters with <code>Options::compaction_options_universal</code>.
+<p>
+<li><code>Options::disable_auto_compactions</code> - Disable automatic compactions.
+Manual compactions can still be issued on this database.
+<p>
+<li><code>Options::compaction_filter</code> - Allows an application to modify/delete
+a key-value during background compaction. The client must provide
+compaction_filter_factory if it requires a new compaction filter to be used
+for different compaction processes. Client should specify only one of filter
+or factory.
+<p>
+<li><code>Options::compaction_filter_factory</code> - a factory that provides
+compaction filter objects which allow an application to modify/delete a
+key-value during background compaction.
+</ul>
+<p>
+Other options impacting performance of compactions and when they get triggered
+are: 
+<ul>
+<p>
+<li> <code>Options::access_hint_on_compaction_start</code> - Specify the file access 
+pattern once a compaction is started. It will be applied to all input files of a compaction. Default: NORMAL
+<p>
+<li> <code>Options::level0_file_num_compaction_trigger</code> -  Number of files to trigger level-0 compaction. 
+A negative value means that level-0 compaction will not be triggered by number of files at all.
+<p>
+<li> <code>Options::max_mem_compaction_level</code> -  Maximum level to which a new compacted memtable is pushed if it
+does not create overlap.  We try to push to level 2 to avoid the relatively expensive level 0=>1 compactions and to avoid some
+expensive manifest file operations.  We do not push all the way to the largest level since that can generate a lot of wasted disk
+space if the same key space is being repeatedly overwritten.
+<p>
+<li> <code>Options::target_file_size_base</code> and <code>Options::target_file_size_multiplier</code> - 
+Target file size for compaction.  target_file_size_base is per-file size for level-1.
+Target file size for level L can be calculated by target_file_size_base * (target_file_size_multiplier ^ (L-1))
+For example, if target_file_size_base is 2MB and target_file_size_multiplier is 10, then each file on level-1 will
+be 2MB, and each file on level 2 will be 20MB, and each file on level-3 will be 200MB. Default target_file_size_base is 2MB
+and default target_file_size_multiplier is 1.
+<p>
+<li> <code>Options::expanded_compaction_factor</code> -  Maximum number of bytes in all compacted files.  We avoid expanding
+the lower level file set of a compaction if it would make the total compaction cover more than
+(expanded_compaction_factor * targetFileSizeLevel()) many bytes.
+<p>
+<li> <code>Options::source_compaction_factor</code> -    Maximum number of bytes in all source files to be compacted in a
+single compaction run. We avoid picking too many files in the source level so that we do not exceed the total source bytes
+for compaction to exceed (source_compaction_factor * targetFileSizeLevel()) many bytes.
+Default:1, i.e. pick maxfilesize amount of data as the source of a compaction.
+<p>
+<li> <code>Options::max_grandparent_overlap_factor</code> -   Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
+stop building a single file in a level->level+1 compaction.
+<p>
+<li> <code>Options::disable_seek_compaction</code> -  Disable compaction triggered by seek.
+With bloomfilter and fast storage, a miss on one level is very cheap if the file handle is cached in table cache
+(which is true if max_open_files is large).
+<p>
+<li> <code>Options::max_background_compactions</code> - Maximum number of concurrent background jobs, submitted to
+the default LOW priority thread pool
+</ul>
+
+<p>
+You can learn more about all of those options in <code>rocksdb/options.h</code>
+
+<h2> Universal style compaction specific settings</h2>
+<p>
+If you're using Universal style compaction, there is an object <code>CompactionOptionsUniversal</code>
+that hold all the different options for that compaction. The exact definition is in
+<code>rocksdb/universal_compaction.h</code> and you can set it in <code>Options::compaction_options_universal</code>.
+Here we give short overview of options in <code>CompactionOptionsUniversal</code>:
+<ul>
+<p>
+<li> <code>CompactionOptionsUniversal::size_ratio</code> - Percentage flexibility while comparing file size. If the candidate file(s)
+   size is 1% smaller than the next file's size, then include next file into
+   this candidate set.  Default: 1
+<p>
+<li> <code>CompactionOptionsUniversal::min_merge_width</code> - The minimum number of files in a single compaction run. Default: 2
+<p>
+<li> <code>CompactionOptionsUniversal::max_merge_width</code> - The maximum number of files in a single compaction run. Default: UINT_MAX
+<p>
+<li> <code>CompactionOptionsUniversal::max_size_amplification_percent</code> - The size amplification is defined as the amount (in percentage) of
+additional storage needed to store a single byte of data in the database.  For example, a size amplification of 2% means that a database that
+contains 100 bytes of user-data may occupy upto 102 bytes of physical storage. By this definition, a fully compacted database has
+a size amplification of 0%. Rocksdb uses the following heuristic to calculate size amplification: it assumes that all files excluding
+the earliest file contribute to the size amplification.  Default: 200, which means that a 100 byte database could require upto
+300 bytes of storage.
+<p>
+<li> <code>CompactionOptionsUniversal::compression_size_percent</code> - If this option is set to be -1 (the default value), all the output files
+will follow compression type specified.  If this option is not negative, we will try to make sure compressed
+size is just above this value. In normal cases, at least this percentage
+of data will be compressed.
+When we are compacting to a new file, here is the criteria whether
+it needs to be compressed: assuming here are the list of files sorted
+by generation time: [ A1...An B1...Bm C1...Ct ],
+where A1 is the newest and Ct is the oldest, and we are going to compact
+B1...Bm, we calculate the total size of all the files as total_size, as
+well as  the total size of C1...Ct as total_C, the compaction output file
+will be compressed iff total_C / total_size < this percentage
+<p>
+<li> <code>CompactionOptionsUniversal::stop_style</code> - The algorithm used to stop picking files into a single compaction run.
+Can be kCompactionStopStyleSimilarSize (pick files of similar size) or kCompactionStopStyleTotalSize (total size of picked files > next file).
+Default: kCompactionStopStyleTotalSize
+</ul>
+
+<h1>Thread pools</h1>
+<p>
+A thread pool is associated with Env environment object. The client has to create a thread pool by setting the number of background
+threads using method <code>Env::SetBackgroundThreads()</code> defined in <code>rocksdb/env.h</code>.
+We use the thread pool for compactions and memtable flushes.
+Since memtable flushes are in critical code path (stalling memtable flush can stall writes, increasing p99), we suggest 
+having two thread pools - with priorities HIGH and LOW. Memtable flushes can be set up to be scheduled on HIGH thread pool.
+There are two options available for configuration of background compactions and flushes:
+<ul>
+<p>
+<li> <code>Options::max_background_compactions</code> - Maximum number of concurrent background jobs,
+submitted to the default LOW priority thread pool
+<p>
+<li> <code>Options::max_background_flushes</code> - Maximum number of concurrent background memtable flush jobs, submitted to
+the HIGH priority thread pool.  By default, all background jobs (major compaction and memtable flush) go
+to the LOW priority pool. If this option is set to a positive number, memtable flush jobs will be submitted to the HIGH priority pool.
+It is important when the same Env is shared by multiple db instances.  Without a separate pool, long running major compaction jobs could
+potentially block memtable flush jobs of other db instances, leading to unnecessary Put stalls.
+</ul>
+<p>
+<pre>
+  #include "rocksdb/env.h"
+  #include "rocksdb/db.h"
+
+  auto env = rocksdb::Env::Default();
+  env->SetBackgroundThreads(2, rocksdb::Env::LOW);
+  env->SetBackgroundThreads(1, rocksdb::Env::HIGH);
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.env = env;
+  options.max_background_compactions = 2;
+  options.max_background_flushes = 1;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &amp;db);
+  assert(status.ok());
+  ...
+</pre>
+<h1>Approximate Sizes</h1>
+<p>
+The <code>GetApproximateSizes</code> method can be used to get the approximate
+number of bytes of file system space used by one or more key ranges.
+<p>
+<pre>
+   rocksdb::Range ranges[2];
+   ranges[0] = rocksdb::Range("a", "c");
+   ranges[1] = rocksdb::Range("x", "z");
+   uint64_t sizes[2];
+   rocksdb::Status s = db-&gt;GetApproximateSizes(ranges, 2, sizes);
+</pre>
+The preceding call will set <code>sizes[0]</code> to the approximate number of
+bytes of file system space used by the key range <code>[a..c)</code> and
+<code>sizes[1]</code> to the approximate number of bytes used by the key range
+<code>[x..z)</code>.
+<p>
+<h1>Environment</h1>
+<p>
+All file operations (and other operating system calls) issued by the
+<code>rocksdb</code> implementation are routed through a <code>rocksdb::Env</code> object.
+Sophisticated clients may wish to provide their own <code>Env</code>
+implementation to get better control.  For example, an application may
+introduce artificial delays in the file IO paths to limit the impact
+of <code>rocksdb</code> on other activities in the system.
+<p>
+<pre>
+  class SlowEnv : public rocksdb::Env {
+    .. implementation of the Env interface ...
+  };
+
+  SlowEnv env;
+  rocksdb::Options options;
+  options.env = &amp;env;
+  Status s = rocksdb::DB::Open(options, ...);
+</pre>
+<h1>Porting</h1>
+<p>
+<code>rocksdb</code> may be ported to a new platform by providing platform
+specific implementations of the types/methods/functions exported by
+<code>rocksdb/port/port.h</code>.  See <code>rocksdb/port/port_example.h</code> for more
+details.
+<p>
+In addition, the new platform may need a new default <code>rocksdb::Env</code>
+implementation.  See <code>rocksdb/util/env_posix.h</code> for an example.
+
+<h1>Statistics</h1>
+<p>
+To be able to efficiently tune your application, it is always helpful if you
+have access to usage statistics. You can collect those statistics by setting
+<code>Options::table_stats_collectors</code> or
+<code>Options::statistics</code>. For more information, refer to
+<code>rocksdb/table_stats.h</code> and <code>rocksdb/statistics.h</code>.
+These should not add significant overhead to your application and we
+recommend exporting them to other monitoring tools.
+
+<h1>Purging WAL files</h1>
+<p>
+By default, old write-ahead logs are deleted automatically when they fall out
+of scope and application doesn't need them anymore. There are options that
+enable the user to archive the logs and then delete them lazily, either in
+TTL fashion or based on size limit.
+
+The options are <code>Options::WAL_ttl_seconds</code> and
+<code>Options::WAL_size_limit_MB</code>. Here is how they can be used:
+<ul>
+<li>
+<p>
+If both set to 0, logs will be deleted asap and will never get into the archive.
+<li>
+<p>
+If <code>WAL_ttl_seconds</code> is 0 and WAL_size_limit_MB is not 0, WAL
+files will be checked every 10 min and if total size is greater than
+<code>WAL_size_limit_MB</code>, they will be deleted starting with the
+earliest until size_limit is met. All empty files will be deleted.
+<li>
+<p>
+If <code>WAL_ttl_seconds</code> is not 0 and WAL_size_limit_MB is 0, then
+WAL files will be checked every <code>WAL_ttl_seconds / 2</code> and those
+that are older than WAL_ttl_seconds will be deleted.
+<li>
+<p>
+If both are not 0, WAL files will be checked every 10 min and both
+checks will be performed with ttl being first.
+</ul>
+
+<h1>Other Information</h1>
+<p>
+Details about the <code>rocksdb</code> implementation may be found in
+the following documents:
+<ul>
+<li> <a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide">
+  RocksDB Architecture Guide</a>
+<li> <a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Table-Format">
+    Format of an immutable Table file</a>
+<li> <a href="log_format.txt">Format of a log file</a>
+</ul>
+
+</body>
+</html>
diff --git a/doc/log_format.txt b/doc/log_format.txt
new file mode 100644 (file)
index 0000000..3a0414b
--- /dev/null
@@ -0,0 +1,75 @@
+The log file contents are a sequence of 32KB blocks.  The only
+exception is that the tail of the file may contain a partial block.
+
+Each block consists of a sequence of records:
+   block := record* trailer?
+   record :=
+       checksum: uint32        // crc32c of type and data[]
+       length: uint16
+       type: uint8             // One of FULL, FIRST, MIDDLE, LAST
+       data: uint8[length]
+
+A record never starts within the last six bytes of a block (since it
+won't fit).  Any leftover bytes here form the trailer, which must
+consist entirely of zero bytes and must be skipped by readers.  
+
+Aside: if exactly seven bytes are left in the current block, and a new
+non-zero length record is added, the writer must emit a FIRST record
+(which contains zero bytes of user data) to fill up the trailing seven
+bytes of the block and then emit all of the user data in subsequent
+blocks.
+
+More types may be added in the future.  Some Readers may skip record
+types they do not understand, others may report that some data was
+skipped.
+
+FULL == 1
+FIRST == 2
+MIDDLE == 3
+LAST == 4
+
+The FULL record contains the contents of an entire user record.
+
+FIRST, MIDDLE, LAST are types used for user records that have been
+split into multiple fragments (typically because of block boundaries).
+FIRST is the type of the first fragment of a user record, LAST is the
+type of the last fragment of a user record, and MIDDLE is the type of all
+interior fragments of a user record.
+
+Example: consider a sequence of user records:
+   A: length 1000
+   B: length 97270
+   C: length 8000
+A will be stored as a FULL record in the first block.
+
+B will be split into three fragments: first fragment occupies the rest
+of the first block, second fragment occupies the entirety of the
+second block, and the third fragment occupies a prefix of the third
+block.  This will leave six bytes free in the third block, which will
+be left empty as the trailer.
+
+C will be stored as a FULL record in the fourth block.
+
+===================
+
+Some benefits over the recordio format:
+
+(1) We do not need any heuristics for resyncing - just go to next
+block boundary and scan.  If there is a corruption, skip to the next
+block.  As a side-benefit, we do not get confused when part of the
+contents of one log file are embedded as a record inside another log
+file.
+
+(2) Splitting at approximate boundaries (e.g., for mapreduce) is
+simple: find the next block boundary and skip records until we
+hit a FULL or FIRST record.
+
+(3) We do not need extra buffering for large records.
+
+Some downsides compared to recordio format:
+
+(1) No packing of tiny records.  This could be fixed by adding a new
+record type, so it is a shortcoming of the current implementation,
+not necessarily the format.
+
+(2) No compression.  Again, this could be fixed by adding new record types.
diff --git a/doc/rockslogo.jpg b/doc/rockslogo.jpg
new file mode 100644 (file)
index 0000000..363905a
Binary files /dev/null and b/doc/rockslogo.jpg differ
diff --git a/doc/rockslogo.png b/doc/rockslogo.png
new file mode 100644 (file)
index 0000000..1961360
Binary files /dev/null and b/doc/rockslogo.png differ
diff --git a/hdfs/README b/hdfs/README
new file mode 100644 (file)
index 0000000..9b7d0a6
--- /dev/null
@@ -0,0 +1,26 @@
+This directory contains the hdfs extensions needed to make rocksdb store
+files in HDFS.
+
+The hdfs.h file is copied from the Apache Hadoop 1.0 source code. 
+It defines the libhdfs library
+(http://hadoop.apache.org/common/docs/r0.20.2/libhdfs.html) to access 
+data in HDFS.  The libhdfs.a is copied from the Apache Hadoop 1.0 build. 
+It implements the API defined in hdfs.h. If your hadoop cluster is running
+a different hadoop release, then install these two files manually from your
+hadoop distribution and then recompile rocksdb.
+
+The env_hdfs.h file defines the rocksdb objects that are needed to talk to an
+underlying filesystem. 
+
+If you want to compile rocksdb with hdfs support, please set the following
+environment variables appropriately:
+   USE_HDFS=1
+   JAVA_HOME=/usr/local/jdk-6u22-64
+   LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/jdk-6u22-64/jre/lib/amd64/server:/usr/local/jdk-6u22-64/jre/lib/amd64/:./snappy/libs
+   make clean all db_bench
+
+To run dbbench,
+  set CLASSPATH to include your hadoop distribution
+  db_bench --hdfs="hdfs://hbaseudbperf001.snc1.facebook.com:9000"
+
+
diff --git a/hdfs/env_hdfs.h b/hdfs/env_hdfs.h
new file mode 100644 (file)
index 0000000..cb8ca62
--- /dev/null
@@ -0,0 +1,302 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#pragma once
+#include <algorithm>
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include <iostream>
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+
+#ifdef USE_HDFS
+#include "hdfs/hdfs.h"
+
+namespace rocksdb {
+
+static const std::string kProto = "hdfs://";
+static const std::string pathsep = "/";
+
+// Thrown during execution when there is an issue with the supplied
+// arguments.
+class HdfsUsageException : public std::exception { };
+
+// A simple exception that indicates something went wrong that is not
+// recoverable.  The intention is for the message to be printed (with
+// nothing else) and then for the process to terminate.
+class HdfsFatalException : public std::exception {
+public:
+  explicit HdfsFatalException(const std::string& s) : what_(s) { }
+  virtual ~HdfsFatalException() throw() { }
+  // Returns the stored message; the pointer is only valid for the
+  // lifetime of this exception object (it aliases what_'s buffer).
+  virtual const char* what() const throw() {
+    return what_.c_str();
+  }
+private:
+  const std::string what_;
+};
+
+//
+// The HDFS environment for rocksdb. This class overrides all the
+// file/dir access methods and delegates the thread-mgmt methods to the
+// default posix environment.
+//
+class HdfsEnv : public Env {
+
+ public:
+  HdfsEnv(const std::string& fsname) : fsname_(fsname) {
+    posixEnv = Env::Default();
+    fileSys_ = connectToPath(fsname_);
+  }
+
+  virtual ~HdfsEnv() {
+    fprintf(stderr, "Destroying HdfsEnv::Default()\n");
+    hdfsDisconnect(fileSys_);
+  }
+
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   SequentialFile** result);
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     RandomAccessFile** result);
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 WritableFile** result);
+
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options);
+
+  virtual bool FileExists(const std::string& fname);
+
+  virtual Status GetChildren(const std::string& path,
+                             std::vector<std::string>* result);
+
+  virtual Status DeleteFile(const std::string& fname);
+
+  virtual Status CreateDir(const std::string& name);
+
+  virtual Status CreateDirIfMissing(const std::string& name);
+
+  virtual Status DeleteDir(const std::string& name);
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* size);
+
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime);
+
+  virtual Status RenameFile(const std::string& src, const std::string& target);
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock);
+
+  virtual Status UnlockFile(FileLock* lock);
+
+  virtual Status NewLogger(const std::string& fname, Logger** result);
+
+  virtual void Schedule(void (*function)(void* arg), void* arg,
+                        Priority pri = LOW) {
+    posixEnv->Schedule(function, arg, pri);
+  }
+
+  virtual void StartThread(void (*function)(void* arg), void* arg) {
+    posixEnv->StartThread(function, arg);
+  }
+
+  virtual Status GetTestDirectory(std::string* path) {
+    return posixEnv->GetTestDirectory(path);
+  }
+
+  virtual uint64_t NowMicros() {
+    return posixEnv->NowMicros();
+  }
+
+  virtual void SleepForMicroseconds(int micros) {
+    posixEnv->SleepForMicroseconds(micros);
+  }
+
+  virtual Status GetHostName(char* name, uint64_t len) {
+    return posixEnv->GetHostName(name, len);
+  }
+
+  virtual Status GetCurrentTime(int64_t* unix_time) {
+    return posixEnv->GetCurrentTime(unix_time);
+  }
+
+  virtual Status GetAbsolutePath(const std::string& db_path,
+      std::string* output_path) {
+    return posixEnv->GetAbsolutePath(db_path, output_path);
+  }
+
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) {
+    posixEnv->SetBackgroundThreads(number, pri);
+  }
+
+  virtual std::string TimeToString(uint64_t number) {
+    return posixEnv->TimeToString(number);
+  }
+
+  static uint64_t gettid() {
+    assert(sizeof(pthread_t) <= sizeof(uint64_t));
+    return (uint64_t)pthread_self();
+  }
+
+ private:
+  std::string fsname_;  // string of the form "hdfs://hostname:port/"
+  hdfsFS fileSys_;      //  a single FileSystem object for all files
+  Env*  posixEnv;       // This object is derived from Env, but not from
+                        // posixEnv. We have posixnv as an encapsulated
+                        // object here so that we can use posix timers,
+                        // posix threads, etc.
+
+  /**
+   * If the URI is specified of the form hdfs://server:port/path,
+   * then connect to the specified cluster
+   * else connect to default.
+   */
+  hdfsFS connectToPath(const std::string& uri) {
+    if (uri.empty()) {
+      return NULL;
+    }
+    if (uri.find(kProto) != 0) {
+      // uri doesn't start with hdfs:// -> use default:0, which is special
+      // to libhdfs.
+      return hdfsConnectNewInstance("default", 0);
+    }
+    const std::string hostport = uri.substr(kProto.length());
+
+    std::vector <std::string> parts;
+    split(hostport, ':', parts);
+    if (parts.size() != 2) {
+      throw HdfsFatalException("Bad uri for hdfs " + uri);
+    }
+    // parts[0] = hosts, parts[1] = port/xxx/yyy
+    std::string host(parts[0]);
+    std::string remaining(parts[1]);
+
+    int rem = remaining.find(pathsep);
+    std::string portStr = (rem == 0 ? remaining :
+                           remaining.substr(0, rem));
+
+    tPort port;
+    port = atoi(portStr.c_str());
+    if (port == 0) {
+      throw HdfsFatalException("Bad host-port for hdfs " + uri);
+    }
+    hdfsFS fs = hdfsConnectNewInstance(host.c_str(), port);
+    return fs;
+  }
+
+  void split(const std::string &s, char delim,
+             std::vector<std::string> &elems) {
+    elems.clear();
+    size_t prev = 0;
+    size_t pos = s.find(delim);
+    while (pos != std::string::npos) {
+      elems.push_back(s.substr(prev, pos));
+      prev = pos + 1;
+      pos = s.find(delim, prev);
+    }
+    elems.push_back(s.substr(prev, s.size()));
+  }
+};
+
+}  // namespace rocksdb
+
+#else // USE_HDFS
+
+
+namespace rocksdb {
+
+static const Status notsup;
+
+class HdfsEnv : public Env {
+
+ public:
+  HdfsEnv(const std::string& fsname) {
+    fprintf(stderr, "You have not build rocksdb with HDFS support\n");
+    fprintf(stderr, "Please see hdfs/README for details\n");
+    throw new std::exception();
+  }
+
+  virtual ~HdfsEnv() {
+  }
+
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options);
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options) {
+    return notsup;
+  }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) {
+    return notsup;
+  }
+
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options) {
+    return notsup;
+  }
+
+  virtual bool FileExists(const std::string& fname){return false;}
+
+  virtual Status GetChildren(const std::string& path,
+                             std::vector<std::string>* result){return notsup;}
+
+  virtual Status DeleteFile(const std::string& fname){return notsup;}
+
+  virtual Status CreateDir(const std::string& name){return notsup;}
+
+  virtual Status CreateDirIfMissing(const std::string& name){return notsup;}
+
+  virtual Status DeleteDir(const std::string& name){return notsup;}
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* size){return notsup;}
+
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* time) {
+    return notsup;
+  }
+
+  virtual Status RenameFile(const std::string& src, const std::string& target){return notsup;}
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock){return notsup;}
+
+  virtual Status UnlockFile(FileLock* lock){return notsup;}
+
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result){return notsup;}
+
+  virtual void Schedule(void (*function)(void* arg), void* arg,
+                        Priority pri = LOW) {}
+
+  virtual void StartThread(void (*function)(void* arg), void* arg) {}
+
+  virtual Status GetTestDirectory(std::string* path) {return notsup;}
+
+  virtual uint64_t NowMicros() {return 0;}
+
+  virtual void SleepForMicroseconds(int micros) {}
+
+  virtual Status GetHostName(char* name, uint64_t len) {return notsup;}
+
+  virtual Status GetCurrentTime(int64_t* unix_time) {return notsup;}
+
+  virtual Status GetAbsolutePath(const std::string& db_path,
+      std::string* outputpath) {return notsup;}
+
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) {}
+
+  virtual std::string TimeToString(uint64_t number) { return "";}
+};
+}
+
+#endif // USE_HDFS
diff --git a/hdfs/hdfs.h b/hdfs/hdfs.h
new file mode 100644 (file)
index 0000000..8e8dfec
--- /dev/null
@@ -0,0 +1,477 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef LIBHDFS_HDFS_H
+#define LIBHDFS_HDFS_H
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+#include <errno.h>
+
+#include <jni.h>
+
+#ifndef O_RDONLY
+#define O_RDONLY 1
+#endif
+
+#ifndef O_WRONLY 
+#define O_WRONLY 2
+#endif
+
+#ifndef EINTERNAL
+#define EINTERNAL 255 
+#endif
+
+
+/** All APIs set errno to meaningful values */
+#ifdef __cplusplus
+extern  "C" {
+#endif
+
+    /**
+     * Some utility decls used in libhdfs.
+     */
+
+    typedef int32_t   tSize; /// size of data for read/write io ops 
+    typedef time_t    tTime; /// time type in seconds
+    typedef int64_t   tOffset;/// offset within the file
+    typedef uint16_t  tPort; /// port
+    typedef enum tObjectKind {
+        kObjectKindFile = 'F',
+        kObjectKindDirectory = 'D',
+    } tObjectKind;
+
+
+    /**
+     * The C reflection of org.apache.org.hadoop.FileSystem .
+     */
+    typedef void* hdfsFS;
+
+    
+    /**
+     * The C equivalent of org.apache.org.hadoop.FSData(Input|Output)Stream .
+     */
+    enum hdfsStreamType
+    {
+        UNINITIALIZED = 0,
+        INPUT = 1,
+        OUTPUT = 2,
+    };
+
+    
+    /**
+     * The 'file-handle' to a file in hdfs.
+     */
+    struct hdfsFile_internal {
+        void* file;
+        enum hdfsStreamType type;
+    };
+    typedef struct hdfsFile_internal* hdfsFile;
+      
+
+    /** 
+     * hdfsConnectAsUser - Connect to a hdfs file system as a specific user
+     * Connect to the hdfs.
+     * @param host A string containing either a host name, or an ip address
+     * of the namenode of a hdfs cluster. 'host' should be passed as NULL if
+     * you want to connect to local filesystem. 'host' should be passed as
+     * 'default' (and port as 0) to used the 'configured' filesystem
+     * (core-site/core-default.xml).
+     * @param port The port on which the server is listening.
+     * @param user the user name (this is hadoop domain user). Or NULL is equivelant to hhdfsConnect(host, port)
+     * @param groups the groups (these are hadoop domain groups)
+     * @return Returns a handle to the filesystem or NULL on error.
+     */
+     hdfsFS hdfsConnectAsUser(const char* host, tPort port, const char *user , const char *groups[], int groups_size );
+
+
+    /** 
+     * hdfsConnect - Connect to a hdfs file system.
+     * Connect to the hdfs.
+     * @param host A string containing either a host name, or an ip address
+     * of the namenode of a hdfs cluster. 'host' should be passed as NULL if
+     * you want to connect to local filesystem. 'host' should be passed as
+     * 'default' (and port as 0) to used the 'configured' filesystem
+     * (core-site/core-default.xml).
+     * @param port The port on which the server is listening.
+     * @return Returns a handle to the filesystem or NULL on error.
+     */
+     hdfsFS hdfsConnect(const char* host, tPort port);
+
+
+    /**
+     * This are the same as hdfsConnectAsUser except that every invocation returns a new FileSystem handle.
+     * Applications should call a hdfsDisconnect for every call to hdfsConnectAsUserNewInstance.
+     */
+     hdfsFS hdfsConnectAsUserNewInstance(const char* host, tPort port, const char *user , const char *groups[], int groups_size );
+     hdfsFS hdfsConnectNewInstance(const char* host, tPort port);
+     hdfsFS hdfsConnectPath(const char* uri);
+
+    /** 
+     * hdfsDisconnect - Disconnect from the hdfs file system.
+     * Disconnect from hdfs.
+     * @param fs The configured filesystem handle.
+     * @return Returns 0 on success, -1 on error.  
+     */
+    int hdfsDisconnect(hdfsFS fs);
+        
+
+    /** 
+     * hdfsOpenFile - Open a hdfs file in given mode.
+     * @param fs The configured filesystem handle.
+     * @param path The full path to the file.
+     * @param flags - an | of bits/fcntl.h file flags - supported flags are O_RDONLY, O_WRONLY (meaning create or overwrite i.e., implies O_TRUNCAT), 
+     * O_WRONLY|O_APPEND. Other flags are generally ignored other than (O_RDWR || (O_EXCL & O_CREAT)) which return NULL and set errno equal ENOTSUP.
+     * @param bufferSize Size of buffer for read/write - pass 0 if you want
+     * to use the default configured values.
+     * @param replication Block replication - pass 0 if you want to use
+     * the default configured values.
+     * @param blocksize Size of block - pass 0 if you want to use the
+     * default configured values.
+     * @return Returns the handle to the open file or NULL on error.
+     */
+    hdfsFile hdfsOpenFile(hdfsFS fs, const char* path, int flags,
+                          int bufferSize, short replication, tSize blocksize);
+
+
+    /** 
+     * hdfsCloseFile - Close an open file. 
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @return Returns 0 on success, -1 on error.  
+     */
+    int hdfsCloseFile(hdfsFS fs, hdfsFile file);
+
+
+    /** 
+     * hdfsExists - Checks if a given path exsits on the filesystem 
+     * @param fs The configured filesystem handle.
+     * @param path The path to look for
+     * @return Returns 0 on exists, 1 on non-exists, -1/-2 on error.  
+     */
+    int hdfsExists(hdfsFS fs, const char *path);
+
+
+    /** 
+     * hdfsSeek - Seek to given offset in file. 
+     * This works only for files opened in read-only mode. 
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @param desiredPos Offset into the file to seek into.
+     * @return Returns 0 on success, -1 on error.  
+     */
+    int hdfsSeek(hdfsFS fs, hdfsFile file, tOffset desiredPos); 
+
+
+    /** 
+     * hdfsTell - Get the current offset in the file, in bytes.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @return Current offset, -1 on error.
+     */
+    tOffset hdfsTell(hdfsFS fs, hdfsFile file);
+
+
+    /** 
+     * hdfsRead - Read data from an open file.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @param buffer The buffer to copy read bytes into.
+     * @param length The length of the buffer.
+     * @return Returns the number of bytes actually read, possibly less
+     * than than length;-1 on error.
+     */
+    tSize hdfsRead(hdfsFS fs, hdfsFile file, void* buffer, tSize length);
+
+
+    /** 
+     * hdfsPread - Positional read of data from an open file.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @param position Position from which to read
+     * @param buffer The buffer to copy read bytes into.
+     * @param length The length of the buffer.
+     * @return Returns the number of bytes actually read, possibly less than
+     * than length;-1 on error.
+     */
+    tSize hdfsPread(hdfsFS fs, hdfsFile file, tOffset position,
+                    void* buffer, tSize length);
+
+
+    /** 
+     * hdfsWrite - Write data into an open file.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @param buffer The data.
+     * @param length The no. of bytes to write. 
+     * @return Returns the number of bytes written, -1 on error.
+     */
+    tSize hdfsWrite(hdfsFS fs, hdfsFile file, const void* buffer,
+                    tSize length);
+
+
+    /** 
+     * hdfsWrite - Flush the data. 
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsFlush(hdfsFS fs, hdfsFile file);
+
+    /**
+     * hdfsSync - Sync the data to persistent store.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @return Returns 0 on success, -1 on error.
+     */
+    int hdfsSync(hdfsFS fs, hdfsFile file);
+
+    /**
+     * hdfsGetNumReplicasInPipeline - get number of remaining replicas in 
+     * pipeline
+     * @param fs The configured filesystem handle
+     * @param file the file handle
+     * @return returns the # of datanodes in the write pipeline; -1 on error
+     */
+   int hdfsGetNumCurrentReplicas(hdfsFS, hdfsFile file);
+
+    /**
+     * hdfsAvailable - Number of bytes that can be read from this
+     * input stream without blocking.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @return Returns available bytes; -1 on error. 
+     */
+    int hdfsAvailable(hdfsFS fs, hdfsFile file);
+
+
+    /**
+     * hdfsCopy - Copy file from one filesystem to another.
+     * @param srcFS The handle to source filesystem.
+     * @param src The path of source file. 
+     * @param dstFS The handle to destination filesystem.
+     * @param dst The path of destination file. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsCopy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst);
+
+
+    /**
+     * hdfsMove - Move file from one filesystem to another.
+     * @param srcFS The handle to source filesystem.
+     * @param src The path of source file. 
+     * @param dstFS The handle to destination filesystem.
+     * @param dst The path of destination file. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsMove(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst);
+
+
+    /**
+     * hdfsDelete - Delete file. 
+     * @param fs The configured filesystem handle.
+     * @param path The path of the file. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsDelete(hdfsFS fs, const char* path);
+
+
+    /**
+     * hdfsRename - Rename file. 
+     * @param fs The configured filesystem handle.
+     * @param oldPath The path of the source file. 
+     * @param newPath The path of the destination file. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsRename(hdfsFS fs, const char* oldPath, const char* newPath);
+
+
+    /** 
+     * hdfsGetWorkingDirectory - Get the current working directory for
+     * the given filesystem.
+     * @param fs The configured filesystem handle.
+     * @param buffer The user-buffer to copy path of cwd into. 
+     * @param bufferSize The length of user-buffer.
+     * @return Returns buffer, NULL on error.
+     */
+    char* hdfsGetWorkingDirectory(hdfsFS fs, char *buffer, size_t bufferSize);
+
+
+    /** 
+     * hdfsSetWorkingDirectory - Set the working directory. All relative
+     * paths will be resolved relative to it.
+     * @param fs The configured filesystem handle.
+     * @param path The path of the new 'cwd'. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsSetWorkingDirectory(hdfsFS fs, const char* path);
+
+
+    /** 
+     * hdfsCreateDirectory - Make the given file and all non-existent
+     * parents into directories.
+     * @param fs The configured filesystem handle.
+     * @param path The path of the directory. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsCreateDirectory(hdfsFS fs, const char* path);
+
+
+    /** 
+     * hdfsSetReplication - Set the replication of the specified
+     * file to the supplied value
+     * @param fs The configured filesystem handle.
+     * @param path The path of the file. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsSetReplication(hdfsFS fs, const char* path, int16_t replication);
+
+
+    /** 
+     * hdfsFileInfo - Information about a file/directory.
+     */
+    typedef struct  {
+        tObjectKind mKind;   /* file or directory */
+        char *mName;         /* the name of the file */
+        tTime mLastMod;      /* the last modification time for the file in seconds */
+        tOffset mSize;       /* the size of the file in bytes */
+        short mReplication;    /* the count of replicas */
+        tOffset mBlockSize;  /* the block size for the file */
+        char *mOwner;        /* the owner of the file */
+        char *mGroup;        /* the group associated with the file */
+        short mPermissions;  /* the permissions associated with the file */
+        tTime mLastAccess;    /* the last access time for the file in seconds */
+    } hdfsFileInfo;
+
+
+    /** 
+     * hdfsListDirectory - Get list of files/directories for a given
+     * directory-path. hdfsFreeFileInfo should be called to deallocate memory if
+     * the function returns non-NULL value.
+     * @param fs The configured filesystem handle.
+     * @param path The path of the directory. 
+     * @param numEntries Set to the number of files/directories in path.
+     * @return Returns a dynamically-allocated array of hdfsFileInfo
+     * objects; NULL if empty or on error.
+     * on error, numEntries will be -1.
+     */
+    hdfsFileInfo *hdfsListDirectory(hdfsFS fs, const char* path,
+                                    int *numEntries);
+
+
+    /** 
+     * hdfsGetPathInfo - Get information about a path as a (dynamically
+     * allocated) single hdfsFileInfo struct. hdfsFreeFileInfo should be
+     * called when the pointer is no longer needed.
+     * @param fs The configured filesystem handle.
+     * @param path The path of the file. 
+     * @return Returns a dynamically-allocated hdfsFileInfo object;
+     * NULL on error.
+     */
+    hdfsFileInfo *hdfsGetPathInfo(hdfsFS fs, const char* path);
+
+
+    /** 
+     * hdfsFreeFileInfo - Free up the hdfsFileInfo array (including fields) 
+     * @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo
+     * objects.
+     * @param numEntries The size of the array.
+     */
+    void hdfsFreeFileInfo(hdfsFileInfo *hdfsFileInfo, int numEntries);
+
+
+    /** 
+     * hdfsGetHosts - Get hostnames where a particular block (determined by
+     * pos & blocksize) of a file is stored. The last element in the array
+     * is NULL. Due to replication, a single block could be present on
+     * multiple hosts.
+     * @param fs The configured filesystem handle.
+     * @param path The path of the file. 
+     * @param start The start of the block.
+     * @param length The length of the block.
+     * @return Returns a dynamically-allocated 2-d array of blocks-hosts;
+     * NULL on error.
+     */
+    char*** hdfsGetHosts(hdfsFS fs, const char* path, 
+            tOffset start, tOffset length);
+
+
+    /** 
+     * hdfsFreeHosts - Free up the structure returned by hdfsGetHosts
+     * @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo
+     * objects.
+     * @param numEntries The size of the array.
+     */
+    void hdfsFreeHosts(char ***blockHosts);
+
+
+    /** 
+     * hdfsGetDefaultBlockSize - Get the optimum blocksize.
+     * @param fs The configured filesystem handle.
+     * @return Returns the blocksize; -1 on error. 
+     */
+    tOffset hdfsGetDefaultBlockSize(hdfsFS fs);
+
+
+    /** 
+     * hdfsGetCapacity - Return the raw capacity of the filesystem.  
+     * @param fs The configured filesystem handle.
+     * @return Returns the raw-capacity; -1 on error. 
+     */
+    tOffset hdfsGetCapacity(hdfsFS fs);
+
+
+    /** 
+     * hdfsGetUsed - Return the total raw size of all files in the filesystem.
+     * @param fs The configured filesystem handle.
+     * @return Returns the total-size; -1 on error. 
+     */
+    tOffset hdfsGetUsed(hdfsFS fs);
+
+    /** 
+     * hdfsChown 
+     * @param fs The configured filesystem handle.
+     * @param path the path to the file or directory
+     * @param owner this is a string in Hadoop land. Set to null or "" if only setting group
+     * @param group  this is a string in Hadoop land. Set to null or "" if only setting user
+     * @return 0 on success else -1
+     */
+    int hdfsChown(hdfsFS fs, const char* path, const char *owner, const char *group);
+
+    /** 
+     * hdfsChmod
+     * @param fs The configured filesystem handle.
+     * @param path the path to the file or directory
+     * @param mode the bitmask to set it to
+     * @return 0 on success else -1
+     */
+      int hdfsChmod(hdfsFS fs, const char* path, short mode);
+
+    /** 
+     * hdfsUtime
+     * @param fs The configured filesystem handle.
+     * @param path the path to the file or directory
+     * @param mtime new modification time or 0 for only set access time in seconds
+     * @param atime new access time or 0 for only set modification time in seconds
+     * @return 0 on success else -1
+     */
+    int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime);
+    
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*LIBHDFS_HDFS_H*/
+
+/**
+ * vim: ts=4: sw=4: et
+ */
diff --git a/hdfs/libhdfs.a b/hdfs/libhdfs.a
new file mode 100644 (file)
index 0000000..4d1f19f
Binary files /dev/null and b/hdfs/libhdfs.a differ
diff --git a/helpers/memenv/memenv.cc b/helpers/memenv/memenv.cc
new file mode 100644 (file)
index 0000000..15f1383
--- /dev/null
@@ -0,0 +1,386 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "helpers/memenv/memenv.h"
+
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+#include <map>
+#include <string.h>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+namespace {
+
+// An in-memory file: a reference-counted sequence of fixed-size heap
+// blocks.  Only the reference count is mutex-protected; the data itself
+// is only mutable while the file is being written (see member comments).
+class FileState {
+ public:
+  // FileStates are reference counted. The initial reference count is zero
+  // and the caller must call Ref() at least once.
+  FileState() : refs_(0), size_(0) {}
+
+  // Increase the reference count.
+  void Ref() {
+    MutexLock lock(&refs_mutex_);
+    ++refs_;
+  }
+
+  // Decrease the reference count. Delete if this is the last reference.
+  void Unref() {
+    bool do_delete = false;
+
+    {
+      MutexLock lock(&refs_mutex_);
+      --refs_;
+      assert(refs_ >= 0);
+      if (refs_ <= 0) {
+        do_delete = true;
+      }
+    }
+
+    // Delete outside the scoped lock: the destructor must not run while
+    // refs_mutex_ (a member of this object) is still held.
+    if (do_delete) {
+      delete this;
+    }
+  }
+
+  uint64_t Size() const { return size_; }
+
+  // Reads up to 'n' bytes starting at 'offset'.  Reads past end-of-file
+  // are truncated; an offset beyond the file size is an error.  '*result'
+  // points directly at block storage when the read fits in one block,
+  // otherwise at 'scratch' after assembling the bytes there.
+  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const {
+    if (offset > size_) {
+      return Status::IOError("Offset greater than file size.");
+    }
+    const uint64_t available = size_ - offset;
+    if (n > available) {
+      n = available;
+    }
+    if (n == 0) {
+      *result = Slice();
+      return Status::OK();
+    }
+
+    size_t block = offset / kBlockSize;
+    size_t block_offset = offset % kBlockSize;
+
+    if (n <= kBlockSize - block_offset) {
+      // The requested bytes are all in the first block.
+      *result = Slice(blocks_[block] + block_offset, n);
+      return Status::OK();
+    }
+
+    // The read spans multiple blocks: copy piecewise into 'scratch'.
+    size_t bytes_to_copy = n;
+    char* dst = scratch;
+
+    while (bytes_to_copy > 0) {
+      size_t avail = kBlockSize - block_offset;
+      if (avail > bytes_to_copy) {
+        avail = bytes_to_copy;
+      }
+      memcpy(dst, blocks_[block] + block_offset, avail);
+
+      bytes_to_copy -= avail;
+      dst += avail;
+      block++;
+      block_offset = 0;  // subsequent blocks are read from their start
+    }
+
+    *result = Slice(scratch, n);
+    return Status::OK();
+  }
+
+  // Appends 'data' to the file, filling the tail of the last block before
+  // allocating new kBlockSize blocks as needed.
+  Status Append(const Slice& data) {
+    const char* src = data.data();
+    size_t src_len = data.size();
+
+    while (src_len > 0) {
+      size_t avail;
+      size_t offset = size_ % kBlockSize;
+
+      if (offset != 0) {
+        // There is some room in the last block.
+        avail = kBlockSize - offset;
+      } else {
+        // No room in the last block; push new one.
+        blocks_.push_back(new char[kBlockSize]);
+        avail = kBlockSize;
+      }
+
+      if (avail > src_len) {
+        avail = src_len;
+      }
+      memcpy(blocks_.back() + offset, src, avail);
+      src_len -= avail;
+      src += avail;
+      size_ += avail;
+    }
+
+    return Status::OK();
+  }
+
+ private:
+  // Private since only Unref() should be used to delete it.
+  ~FileState() {
+    for (std::vector<char*>::iterator i = blocks_.begin(); i != blocks_.end();
+         ++i) {
+      delete [] *i;
+    }
+  }
+
+  // No copying allowed.
+  FileState(const FileState&);
+  void operator=(const FileState&);
+
+  port::Mutex refs_mutex_;
+  int refs_;  // Protected by refs_mutex_;
+
+  // The following fields are not protected by any mutex. They are only mutable
+  // while the file is being written, and concurrent access is not allowed
+  // to writable files.
+  std::vector<char*> blocks_;
+  uint64_t size_;
+
+  // Size of each heap-allocated data block.
+  enum { kBlockSize = 8 * 1024 };
+};
+
+// Sequential reader over a FileState; holds a reference on the file for
+// its entire lifetime.
+class SequentialFileImpl : public SequentialFile {
+ public:
+  explicit SequentialFileImpl(FileState* file) : file_(file), pos_(0) {
+    file_->Ref();
+  }
+
+  ~SequentialFileImpl() {
+    file_->Unref();
+  }
+
+  // Reads up to 'n' bytes from the current position and advances the
+  // position by the number of bytes actually read.
+  virtual Status Read(size_t n, Slice* result, char* scratch) {
+    Status s = file_->Read(pos_, n, result, scratch);
+    if (s.ok()) {
+      pos_ += result->size();
+    }
+    return s;
+  }
+
+  // Advances the position by up to 'n' bytes, clamping at end-of-file.
+  virtual Status Skip(uint64_t n) {
+    if (pos_ > file_->Size()) {
+      return Status::IOError("pos_ > file_->Size()");
+    }
+    const size_t available = file_->Size() - pos_;
+    if (n > available) {
+      n = available;
+    }
+    pos_ += n;
+    return Status::OK();
+  }
+
+ private:
+  FileState* file_;
+  size_t pos_;  // NOTE(review): size_t here vs. uint64_t file sizes could
+                // truncate on 32-bit builds for files > 4GB -- confirm.
+};
+
+// Random-access reader over a FileState; holds a reference on the file
+// for its entire lifetime.
+class RandomAccessFileImpl : public RandomAccessFile {
+ public:
+  explicit RandomAccessFileImpl(FileState* file) : file_(file) {
+    file_->Ref();
+  }
+
+  ~RandomAccessFileImpl() {
+    file_->Unref();
+  }
+
+  // Stateless positional read; delegates directly to FileState::Read.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    return file_->Read(offset, n, result, scratch);
+  }
+
+ private:
+  FileState* file_;
+};
+
+// Writer over a FileState; holds a reference on the file for its entire
+// lifetime.  Close/Flush/Sync are no-ops since data lives only in memory.
+class WritableFileImpl : public WritableFile {
+ public:
+  // NOTE(review): single-argument constructor is not marked 'explicit',
+  // unlike SequentialFileImpl/RandomAccessFileImpl -- confirm whether the
+  // implicit conversion is intended.
+  WritableFileImpl(FileState* file) : file_(file) {
+    file_->Ref();
+  }
+
+  ~WritableFileImpl() {
+    file_->Unref();
+  }
+
+  virtual Status Append(const Slice& data) {
+    return file_->Append(data);
+  }
+
+  virtual Status Close() { return Status::OK(); }
+  virtual Status Flush() { return Status::OK(); }
+  virtual Status Sync() { return Status::OK(); }
+
+ private:
+  FileState* file_;
+};
+
+// Env that keeps the whole "file system" in memory as a name -> FileState
+// map, and delegates every non-file operation to the wrapped base Env.
+// All map accesses are serialized by mutex_.
+class InMemoryEnv : public EnvWrapper {
+ public:
+  explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
+
+  virtual ~InMemoryEnv() {
+    // Drop the map's reference on every file; each FileState deletes itself
+    // once its reference count reaches zero.
+    for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
+      i->second->Unref();
+    }
+  }
+
+  // Partial implementation of the Env interface.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& soptions) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      *result = NULL;
+      return Status::IOError(fname, "File not found");
+    }
+
+    result->reset(new SequentialFileImpl(file_map_[fname]));
+    return Status::OK();
+  }
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& soptions) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      *result = NULL;
+      return Status::IOError(fname, "File not found");
+    }
+
+    result->reset(new RandomAccessFileImpl(file_map_[fname]));
+    return Status::OK();
+  }
+
+  // Opening a writable file truncates: an existing file under fname is
+  // removed first, then replaced with a fresh empty FileState.
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& soptions) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) != file_map_.end()) {
+      DeleteFileInternal(fname);
+    }
+
+    // Ref() here is the map's reference; the WritableFileImpl takes its own.
+    FileState* file = new FileState();
+    file->Ref();
+    file_map_[fname] = file;
+
+    result->reset(new WritableFileImpl(file));
+    return Status::OK();
+  }
+
+  virtual bool FileExists(const std::string& fname) {
+    MutexLock lock(&mutex_);
+    return file_map_.find(fname) != file_map_.end();
+  }
+
+  // Lists the immediate children of dir: entries whose name starts with
+  // "<dir>/". The "<dir>/" prefix is stripped from the returned names.
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) {
+    MutexLock lock(&mutex_);
+    result->clear();
+
+    for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
+      const std::string& filename = i->first;
+
+      if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' &&
+          Slice(filename).starts_with(Slice(dir))) {
+        result->push_back(filename.substr(dir.size() + 1));
+      }
+    }
+
+    return Status::OK();
+  }
+
+  // Removes fname from the map and drops the map's reference. Takes no lock
+  // itself: the public methods in this class call it with mutex_ already
+  // held, and any outside caller must do the same.
+  void DeleteFileInternal(const std::string& fname) {
+    if (file_map_.find(fname) == file_map_.end()) {
+      return;
+    }
+
+    file_map_[fname]->Unref();
+    file_map_.erase(fname);
+  }
+
+  virtual Status DeleteFile(const std::string& fname) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      return Status::IOError(fname, "File not found");
+    }
+
+    DeleteFileInternal(fname);
+    return Status::OK();
+  }
+
+  // Directory operations are no-ops that always succeed; directories exist
+  // only implicitly as name prefixes (see GetChildren).
+  virtual Status CreateDir(const std::string& dirname) {
+    return Status::OK();
+  }
+
+  virtual Status CreateDirIfMissing(const std::string& dirname) {
+    return Status::OK();
+  }
+
+  virtual Status DeleteDir(const std::string& dirname) {
+    return Status::OK();
+  }
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      return Status::IOError(fname, "File not found");
+    }
+
+    *file_size = file_map_[fname]->Size();
+    return Status::OK();
+  }
+
+  // No modification times are tracked for in-memory files.
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* time) {
+    return Status::NotSupported("getFileMTime", "Not supported in MemEnv");
+  }
+
+  // Renames src to target; silently overwrites target if it already exists.
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(src) == file_map_.end()) {
+      return Status::IOError(src, "File not found");
+    }
+
+    DeleteFileInternal(target);
+    file_map_[target] = file_map_[src];
+    file_map_.erase(src);
+    return Status::OK();
+  }
+
+  // Locking is a no-op: always succeeds and hands back a fresh FileLock
+  // that the caller releases (and frees) via UnlockFile().
+  virtual Status LockFile(const std::string& fname, FileLock** lock) {
+    *lock = new FileLock;
+    return Status::OK();
+  }
+
+  virtual Status UnlockFile(FileLock* lock) {
+    delete lock;
+    return Status::OK();
+  }
+
+  virtual Status GetTestDirectory(std::string* path) {
+    *path = "/test";
+    return Status::OK();
+  }
+
+ private:
+  // Map from filenames to FileState objects, representing a simple file system.
+  typedef std::map<std::string, FileState*> FileSystem;
+  port::Mutex mutex_;
+  FileSystem file_map_;  // Protected by mutex_.
+};
+
+}  // namespace
+
+// Factory declared in memenv.h: returns an Env that stores its data in
+// memory and delegates everything else to base_env. Caller owns the result;
+// base_env must outlive it.
+Env* NewMemEnv(Env* base_env) {
+  return new InMemoryEnv(base_env);
+}
+
+}  // namespace rocksdb
diff --git a/helpers/memenv/memenv.h b/helpers/memenv/memenv.h
new file mode 100644 (file)
index 0000000..2126441
--- /dev/null
@@ -0,0 +1,19 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_
+#define STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_
+namespace rocksdb {
+
+class Env;
+
+// Returns a new environment that stores its data in memory and delegates
+// all non-file-storage tasks to base_env. The caller must delete the result
+// when it is no longer needed.
+// *base_env must remain live while the result is in use.
+Env* NewMemEnv(Env* base_env);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_
diff --git a/helpers/memenv/memenv_test.cc b/helpers/memenv/memenv_test.cc
new file mode 100644 (file)
index 0000000..19fc8ff
--- /dev/null
@@ -0,0 +1,233 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "helpers/memenv/memenv.h"
+
+#include "db/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "util/testharness.h"
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+// Test fixture: builds a fresh in-memory Env (wrapping Env::Default()) for
+// each test and owns it for the test's lifetime.
+class MemEnvTest {
+ public:
+  Env* env_;               // owned; deleted in the destructor
+  const EnvOptions soptions_;
+
+  MemEnvTest()
+      : env_(NewMemEnv(Env::Default())) {
+  }
+  ~MemEnvTest() {
+    delete env_;
+  }
+};
+
+// Exercises the basic file-system surface: create/list/size, rename,
+// open-of-missing-file failures, and delete.
+TEST(MemEnvTest, Basics) {
+  uint64_t file_size;
+  unique_ptr<WritableFile> writable_file;
+  std::vector<std::string> children;
+
+  ASSERT_OK(env_->CreateDir("/dir"));
+
+  // Check that the directory is empty.
+  ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
+  ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(0U, children.size());
+
+  // Create a file.
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  writable_file.reset();
+
+  // Check that the file exists.
+  ASSERT_TRUE(env_->FileExists("/dir/f"));
+  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(0U, file_size);
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(1U, children.size());
+  ASSERT_EQ("f", children[0]);
+
+  // Write to the file.
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("abc"));
+  writable_file.reset();
+
+  // Check for expected size.
+  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(3U, file_size);
+
+  // Check that renaming works.
+  ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
+  ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
+  ASSERT_TRUE(!env_->FileExists("/dir/f"));
+  ASSERT_TRUE(env_->FileExists("/dir/g"));
+  ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
+  ASSERT_EQ(3U, file_size);
+
+  // Check that opening non-existent file fails.
+  unique_ptr<SequentialFile> seq_file;
+  unique_ptr<RandomAccessFile> rand_file;
+  ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file,
+                                       soptions_).ok());
+  ASSERT_TRUE(!seq_file);
+  ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file,
+                                         soptions_).ok());
+  ASSERT_TRUE(!rand_file);
+
+  // Check that deleting works.
+  ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
+  ASSERT_OK(env_->DeleteFile("/dir/g"));
+  ASSERT_TRUE(!env_->FileExists("/dir/g"));
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(0U, children.size());
+  ASSERT_OK(env_->DeleteDir("/dir"));
+}
+
+// Verifies sequential reads (including Skip past EOF and reads at EOF) and
+// random-access reads, plus the error on a read starting beyond the file.
+TEST(MemEnvTest, ReadWrite) {
+  unique_ptr<WritableFile> writable_file;
+  unique_ptr<SequentialFile> seq_file;
+  unique_ptr<RandomAccessFile> rand_file;
+  Slice result;
+  char scratch[100];
+
+  ASSERT_OK(env_->CreateDir("/dir"));
+
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("hello "));
+  ASSERT_OK(writable_file->Append("world"));
+  writable_file.reset();
+
+  // Read sequentially.
+  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello".
+  ASSERT_EQ(0, result.compare("hello"));
+  ASSERT_OK(seq_file->Skip(1));
+  ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world".
+  ASSERT_EQ(0, result.compare("world"));
+  ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF.
+  ASSERT_EQ(0U, result.size());
+  ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file.
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));
+  ASSERT_EQ(0U, result.size());
+
+  // Random reads.
+  ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_));
+  ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world".
+  ASSERT_EQ(0, result.compare("world"));
+  ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello".
+  ASSERT_EQ(0, result.compare("hello"));
+  ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d".
+  ASSERT_EQ(0, result.compare("d"));
+
+  // Too high offset.
+  ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok());
+}
+
+// Lock/unlock are no-ops in MemEnv; just confirm they report success.
+TEST(MemEnvTest, Locks) {
+  FileLock* lock;
+
+  // These are no-ops, but we test they return success.
+  ASSERT_OK(env_->LockFile("some file", &lock));
+  ASSERT_OK(env_->UnlockFile(lock));
+}
+
+// Covers GetTestDirectory and the no-op WritableFile Sync/Flush/Close.
+TEST(MemEnvTest, Misc) {
+  std::string test_dir;
+  ASSERT_OK(env_->GetTestDirectory(&test_dir));
+  ASSERT_TRUE(!test_dir.empty());
+
+  unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_));
+
+  // These are no-ops, but we test they return success.
+  ASSERT_OK(writable_file->Sync());
+  ASSERT_OK(writable_file->Flush());
+  ASSERT_OK(writable_file->Close());
+  writable_file.reset();
+}
+
+// Writes 300 KB (larger than FileState's internal block size) and reads it
+// back in pieces, checking the round trip is byte-identical.
+TEST(MemEnvTest, LargeWrite) {
+  const size_t kWriteSize = 300 * 1024;
+  char* scratch = new char[kWriteSize * 2];
+
+  std::string write_data;
+  for (size_t i = 0; i < kWriteSize; ++i) {
+    write_data.append(1, static_cast<char>(i));
+  }
+
+  unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("foo"));
+  ASSERT_OK(writable_file->Append(write_data));
+  writable_file.reset();
+
+  unique_ptr<SequentialFile> seq_file;
+  Slice result;
+  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
+  ASSERT_EQ(0, result.compare("foo"));
+
+  // Sequential reads may return less than requested; loop until done.
+  size_t read = 0;
+  std::string read_data;
+  while (read < kWriteSize) {
+    ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch));
+    read_data.append(result.data(), result.size());
+    read += result.size();
+  }
+  ASSERT_TRUE(write_data == read_data);
+  delete [] scratch;
+}
+
+// End-to-end: opens a real DB on the in-memory Env, checks Put/Get,
+// iteration order, and that data survives a memtable flush.
+TEST(MemEnvTest, DBTest) {
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_;
+  DB* db;
+
+  const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+  const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
+  ASSERT_OK(DB::Open(options, "/dir/db", &db));
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+  }
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  Iterator* iterator = db->NewIterator(ReadOptions());
+  iterator->SeekToFirst();
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_TRUE(keys[i] == iterator->key());
+    ASSERT_TRUE(vals[i] == iterator->value());
+    iterator->Next();
+  }
+  ASSERT_TRUE(!iterator->Valid());
+  delete iterator;
+
+  // Force a flush to an SST file in the in-memory Env, then re-check reads.
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  delete db;
+}
+
+}  // namespace rocksdb
+
+// Test entry point: runs every TEST registered above.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/include/rocksdb/arena.h b/include/rocksdb/arena.h
new file mode 100644 (file)
index 0000000..642b614
--- /dev/null
@@ -0,0 +1,45 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Arena class defines memory allocation methods. It's used by memtable and
+// skiplist.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_ARENA_H_
+#define STORAGE_ROCKSDB_INCLUDE_ARENA_H_
+
+#include <limits>
+#include <memory>
+
+namespace rocksdb {
+
+// Abstract allocation interface implemented elsewhere; only the pure-virtual
+// contract lives here.
+// NOTE(review): the `const` on the by-value `const size_t` returns has no
+// effect; and the `};` after the inline constructor/destructor bodies are
+// stray empty statements — harmless, but worth cleaning up.
+class Arena {
+ public:
+  Arena() {};
+  virtual ~Arena() {};
+
+  // Return a pointer to a newly allocated memory block of "bytes" bytes.
+  virtual char* Allocate(size_t bytes) = 0;
+
+  // Allocate memory with the normal alignment guarantees provided by malloc.
+  virtual char* AllocateAligned(size_t bytes) = 0;
+
+  // Returns an estimate of the total memory used by arena.
+  virtual const size_t ApproximateMemoryUsage() = 0;
+
+  // Returns the total number of bytes in all blocks allocated so far.
+  virtual const size_t MemoryAllocatedBytes() = 0;
+
+ private:
+  // No copying allowed
+  Arena(const Arena&);
+  void operator=(const Arena&);
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_ARENA_H_
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
new file mode 100644 (file)
index 0000000..bd22e19
--- /dev/null
@@ -0,0 +1,344 @@
+/*  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+  This source code is licensed under the BSD-style license found in the
+  LICENSE file in the root directory of this source tree. An additional grant
+  of patent rights can be found in the PATENTS file in the same directory.
+ Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+  Use of this source code is governed by a BSD-style license that can be
+  found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+  C bindings for leveldb.  May be useful as a stable ABI that can be
+  used by programs that keep leveldb in a shared library, or for
+  a JNI api.
+
+  Does not support:
+  . getters for the option types
+  . custom comparators that implement key shortening
+  . capturing post-write-snapshot
+  . custom iter, db, env, cache implementations using just the C bindings
+
+  Some conventions:
+
+  (1) We expose just opaque struct pointers and functions to clients.
+  This allows us to change internal representations without having to
+  recompile clients.
+
+  (2) For simplicity, there is no equivalent to the Slice type.  Instead,
+  the caller has to pass the pointer and length as separate
+  arguments.
+
+  (3) Errors are represented by a null-terminated c string.  NULL
+  means no error.  All operations that can raise an error are passed
+  a "char** errptr" as the last argument.  One of the following must
+  be true on entry:
+     *errptr == NULL
+     *errptr points to a malloc()ed null-terminated error message
+  On success, a leveldb routine leaves *errptr unchanged.
+  On failure, leveldb frees the old value of *errptr and
+  set *errptr to a malloc()ed error message.
+
+  (4) Bools have the type unsigned char (0 == false; rest == true)
+
+  (5) All of the pointer arguments must be non-NULL.
+*/
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_
+#define STORAGE_ROCKSDB_INCLUDE_C_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Exported types */
+
+typedef struct rocksdb_t               rocksdb_t;
+typedef struct rocksdb_cache_t         rocksdb_cache_t;
+typedef struct rocksdb_comparator_t    rocksdb_comparator_t;
+typedef struct rocksdb_env_t           rocksdb_env_t;
+typedef struct rocksdb_filelock_t      rocksdb_filelock_t;
+typedef struct rocksdb_filterpolicy_t  rocksdb_filterpolicy_t;
+typedef struct rocksdb_iterator_t      rocksdb_iterator_t;
+typedef struct rocksdb_logger_t        rocksdb_logger_t;
+typedef struct rocksdb_options_t       rocksdb_options_t;
+typedef struct rocksdb_randomfile_t    rocksdb_randomfile_t;
+typedef struct rocksdb_readoptions_t   rocksdb_readoptions_t;
+typedef struct rocksdb_seqfile_t       rocksdb_seqfile_t;
+typedef struct rocksdb_snapshot_t      rocksdb_snapshot_t;
+typedef struct rocksdb_writablefile_t  rocksdb_writablefile_t;
+typedef struct rocksdb_writebatch_t    rocksdb_writebatch_t;
+typedef struct rocksdb_writeoptions_t  rocksdb_writeoptions_t;
+typedef struct rocksdb_universal_compaction_options_t rocksdb_universal_compaction_options_t;
+
+/* DB operations */
+
+extern rocksdb_t* rocksdb_open(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr);
+
+extern void rocksdb_close(rocksdb_t* db);
+
+extern void rocksdb_put(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr);
+
+extern void rocksdb_delete(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    char** errptr);
+
+extern void rocksdb_write(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_t* batch,
+    char** errptr);
+
+/* Returns NULL if not found.  A malloc()ed array otherwise.
+   Stores the length of the array in *vallen. */
+extern char* rocksdb_get(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr);
+
+extern rocksdb_iterator_t* rocksdb_create_iterator(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options);
+
+extern const rocksdb_snapshot_t* rocksdb_create_snapshot(
+    rocksdb_t* db);
+
+extern void rocksdb_release_snapshot(
+    rocksdb_t* db,
+    const rocksdb_snapshot_t* snapshot);
+
+/* Returns NULL if property name is unknown.
+   Else returns a pointer to a malloc()-ed null-terminated value. */
+extern char* rocksdb_property_value(
+    rocksdb_t* db,
+    const char* propname);
+
+extern void rocksdb_approximate_sizes(
+    rocksdb_t* db,
+    int num_ranges,
+    const char* const* range_start_key, const size_t* range_start_key_len,
+    const char* const* range_limit_key, const size_t* range_limit_key_len,
+    uint64_t* sizes);
+
+extern void rocksdb_compact_range(
+    rocksdb_t* db,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len);
+
+/* Management operations */
+
+extern void rocksdb_destroy_db(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr);
+
+extern void rocksdb_repair_db(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr);
+
+/* Iterator */
+
+extern void rocksdb_iter_destroy(rocksdb_iterator_t*);
+extern unsigned char rocksdb_iter_valid(const rocksdb_iterator_t*);
+extern void rocksdb_iter_seek_to_first(rocksdb_iterator_t*);
+extern void rocksdb_iter_seek_to_last(rocksdb_iterator_t*);
+extern void rocksdb_iter_seek(rocksdb_iterator_t*, const char* k, size_t klen);
+extern void rocksdb_iter_next(rocksdb_iterator_t*);
+extern void rocksdb_iter_prev(rocksdb_iterator_t*);
+extern const char* rocksdb_iter_key(const rocksdb_iterator_t*, size_t* klen);
+extern const char* rocksdb_iter_value(const rocksdb_iterator_t*, size_t* vlen);
+extern void rocksdb_iter_get_error(const rocksdb_iterator_t*, char** errptr);
+
+/* Write batch */
+
+extern rocksdb_writebatch_t* rocksdb_writebatch_create();
+extern void rocksdb_writebatch_destroy(rocksdb_writebatch_t*);
+extern void rocksdb_writebatch_clear(rocksdb_writebatch_t*);
+extern void rocksdb_writebatch_put(
+    rocksdb_writebatch_t*,
+    const char* key, size_t klen,
+    const char* val, size_t vlen);
+extern void rocksdb_writebatch_delete(
+    rocksdb_writebatch_t*,
+    const char* key, size_t klen);
+extern void rocksdb_writebatch_iterate(
+    rocksdb_writebatch_t*,
+    void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen));
+
+/* Options */
+
+extern rocksdb_options_t* rocksdb_options_create();
+extern void rocksdb_options_destroy(rocksdb_options_t*);
+extern void rocksdb_options_set_comparator(
+    rocksdb_options_t*,
+    rocksdb_comparator_t*);
+extern void rocksdb_options_set_compression_per_level(
+  rocksdb_options_t* opt,
+  int* level_values,
+  size_t num_levels);
+extern void rocksdb_options_set_filter_policy(
+    rocksdb_options_t*,
+    rocksdb_filterpolicy_t*);
+extern void rocksdb_options_set_create_if_missing(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_error_if_exists(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_paranoid_checks(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*);
+extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*);
+extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int);
+extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*);
+extern void rocksdb_options_set_block_size(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_block_restart_interval(rocksdb_options_t*, int);
+extern void rocksdb_options_set_compression_options(
+    rocksdb_options_t*, int, int, int);
+extern void rocksdb_options_set_num_levels(rocksdb_options_t*, int);
+extern void rocksdb_options_set_level0_file_num_compaction_trigger(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_level0_slowdown_writes_trigger(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_level0_stop_writes_trigger(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_target_file_size_base(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_target_file_size_multiplier(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int);
+extern void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_background_compactions(rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_background_flushes(rocksdb_options_t*, int);
+extern void rocksdb_options_set_use_fsync(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int);
+extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int);
+extern void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t*, int);
+extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int);
+extern void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t*);
+extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*);
+
+
+/* Compression algorithms accepted by rocksdb_options_set_compression().
+   NOTE(review): zlib and bz2 are assigned the same value (1) as snappy,
+   which makes the three indistinguishable to the C++ side; presumably these
+   should be distinct values (2 and 3) matching the C++ CompressionType
+   enum — confirm before relying on zlib/bz2 through the C API. */
+enum {
+  rocksdb_no_compression = 0,
+  rocksdb_snappy_compression = 1,
+  rocksdb_zlib_compression = 1,
+  rocksdb_bz2_compression = 1
+};
+extern void rocksdb_options_set_compression(rocksdb_options_t*, int);
+
+/* Compaction styles accepted by rocksdb_options_set_compaction_style(). */
+enum {
+  rocksdb_level_compaction = 0,
+  rocksdb_universal_compaction = 1
+};
+extern void rocksdb_options_set_compaction_style(rocksdb_options_t*, int);
+extern void rocksdb_options_set_universal_compaction_options(rocksdb_options_t*, rocksdb_universal_compaction_options_t*);
+/* Comparator */
+
+extern rocksdb_comparator_t* rocksdb_comparator_create(
+    void* state,
+    void (*destructor)(void*),
+    int (*compare)(
+        void*,
+        const char* a, size_t alen,
+        const char* b, size_t blen),
+    const char* (*name)(void*));
+extern void rocksdb_comparator_destroy(rocksdb_comparator_t*);
+
+/* Filter policy */
+
+extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*create_filter)(
+        void*,
+        const char* const* key_array, const size_t* key_length_array,
+        int num_keys,
+        size_t* filter_length),
+    unsigned char (*key_may_match)(
+        void*,
+        const char* key, size_t length,
+        const char* filter, size_t filter_length),
+    const char* (*name)(void*));
+extern void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t*);
+
+extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(
+    int bits_per_key);
+
+/* Read options */
+
+extern rocksdb_readoptions_t* rocksdb_readoptions_create();
+extern void rocksdb_readoptions_destroy(rocksdb_readoptions_t*);
+extern void rocksdb_readoptions_set_verify_checksums(
+    rocksdb_readoptions_t*,
+    unsigned char);
+extern void rocksdb_readoptions_set_fill_cache(
+    rocksdb_readoptions_t*, unsigned char);
+extern void rocksdb_readoptions_set_snapshot(
+    rocksdb_readoptions_t*,
+    const rocksdb_snapshot_t*);
+
+/* Write options */
+
+extern rocksdb_writeoptions_t* rocksdb_writeoptions_create();
+extern void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t*);
+extern void rocksdb_writeoptions_set_sync(
+    rocksdb_writeoptions_t*, unsigned char);
+extern void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable);
+
+/* Cache */
+
+extern rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity);
+extern void rocksdb_cache_destroy(rocksdb_cache_t* cache);
+
+/* Env */
+
+extern rocksdb_env_t* rocksdb_create_default_env();
+extern void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n);
+extern void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n);
+extern void rocksdb_env_destroy(rocksdb_env_t*);
+
+/* Universal Compaction options */
+
+/* Stop styles for universal compaction, used with
+   rocksdb_universal_compaction_options_set_stop_style(). */
+enum {
+  rocksdb_similar_size_compaction_stop_style = 0,
+  rocksdb_total_size_compaction_stop_style = 1
+};
+
+extern rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() ;
+extern void rocksdb_universal_compaction_options_set_size_ratio(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_min_merge_width(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_max_merge_width(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_compression_size_percent(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_stop_style(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_destroy(
+  rocksdb_universal_compaction_options_t*);
+
+#ifdef __cplusplus
+}  /* end extern "C" */
+#endif
+
+#endif  /* STORAGE_ROCKSDB_INCLUDE_C_H_ */
diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h
new file mode 100644 (file)
index 0000000..3e0e5c1
--- /dev/null
@@ -0,0 +1,122 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Cache is an interface that maps keys to values.  It has internal
+// synchronization and may be safely accessed concurrently from
+// multiple threads.  It may automatically evict entries to make room
+// for new entries.  Values have a specified charge against the cache
+// capacity.  For example, a cache where the values are variable
+// length strings, may use the length of the string as the charge for
+// the string.
+//
+// A builtin cache implementation with a least-recently-used eviction
+// policy is provided.  Clients may use their own implementations if
+// they want something more sophisticated (like scan-resistance, a
+// custom eviction policy, variable cache sizing, etc.)
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_CACHE_H_
+#define STORAGE_ROCKSDB_INCLUDE_CACHE_H_
+
+#include <memory>
+#include <stdint.h>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+using std::shared_ptr;
+
+class Cache;
+
+// Create a new cache with a fixed size capacity. The cache is sharded
+// to 2^numShardBits shards, by hash of the key. The total capacity
+// is divided and evenly assigned to each shard. Inside each shard,
+// the eviction is done in two passes: first try to free spaces by
+// evicting entries that are among the most least used removeScanCountLimit
+// entries and do not have reference other than by the cache itself, in
+// the least-used order. If not enough space is freed, further free the
+// entries in least used order.
+//
+// The functions without parameter numShardBits and/or removeScanCountLimit
+// use default values. removeScanCountLimit's default value is 0, which
+// means a strict LRU order inside each shard.
+extern shared_ptr<Cache> NewLRUCache(size_t capacity);
+extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits);
+extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
+                                     int removeScanCountLimit);
+
+// Abstract cache interface; concrete sharded-LRU implementations are
+// created through the NewLRUCache() factories declared above.
+class Cache {
+ public:
+  Cache() { }
+
+  // Destroys all existing entries by calling the "deleter"
+  // function that was passed to the constructor.
+  virtual ~Cache();
+
+  // Opaque handle to an entry stored in the cache.
+  struct Handle { };
+
+  // Insert a mapping from key->value into the cache and assign it
+  // the specified charge against the total cache capacity.
+  //
+  // Returns a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  //
+  // When the inserted entry is no longer needed, the key and
+  // value will be passed to "deleter".
+  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value)) = 0;
+
+  // If the cache has no mapping for "key", returns nullptr.
+  //
+  // Else return a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  virtual Handle* Lookup(const Slice& key) = 0;
+
+  // Release a mapping returned by a previous Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void Release(Handle* handle) = 0;
+
+  // Return the value encapsulated in a handle returned by a
+  // successful Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void* Value(Handle* handle) = 0;
+
+  // If the cache contains entry for key, erase it.  Note that the
+  // underlying entry will be kept around until all existing handles
+  // to it have been released.
+  virtual void Erase(const Slice& key) = 0;
+
+  // Return a new numeric id.  May be used by multiple clients who are
+  // sharing the same cache to partition the key space.  Typically the
+  // client will allocate a new id at startup and prepend the id to
+  // its cache keys.
+  virtual uint64_t NewId() = 0;
+
+  // returns the maximum configured capacity of the cache
+  virtual size_t GetCapacity() = 0;
+
+ private:
+  // NOTE(review): these private declarations (and Rep/rep_) are unused by
+  // the abstract interface above and look like leftovers from a concrete
+  // LRU implementation copied from LevelDB — confirm whether they can be
+  // removed from this header.
+  void LRU_Remove(Handle* e);
+  void LRU_Append(Handle* e);
+  void Unref(Handle* e);
+
+  struct Rep;
+  Rep* rep_;
+
+  // No copying allowed
+  Cache(const Cache&);
+  void operator=(const Cache&);
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_CACHE_H_
diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h
new file mode 100644 (file)
index 0000000..f24132a
--- /dev/null
@@ -0,0 +1,93 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
+#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+// CompactionFilter allows an application to modify/delete a key-value at
+// the time of compaction.
+
+class CompactionFilter {
+ public:
+
+  // Context information of a compaction run
+  struct Context {
+    // Does this compaction run include all data files
+    bool is_full_compaction;
+  };
+
+  virtual ~CompactionFilter() {}
+
+  // The compaction process invokes this
+  // method for kv that is being compacted. A return value
+  // of false indicates that the kv should be preserved in the
+  // output of this compaction run and a return value of true
+  // indicates that this key-value should be removed from the
+  // output of the compaction.  The application can inspect
+  // the existing value of the key and make decision based on it.
+  //
+  // When the value is to be preserved, the application has the option
+  // to modify the existing_value and pass it back through new_value.
+  // value_changed needs to be set to true in this case.
+  //
+  // If multithreaded compaction is being used *and* a single CompactionFilter
+  // instance was supplied via Options::compaction_filter, this method may be
+  // called from different threads concurrently.  The application must ensure
+  // that the call is thread-safe.
+  //
+  // If the CompactionFilter was created by a factory, then it will only ever
+  // be used by a single thread that is doing the compaction run, and this
+  // call does not need to be thread-safe.  However, multiple filters may be
+  // in existence and operating concurrently.
+  virtual bool Filter(int level,
+                      const Slice& key,
+                      const Slice& existing_value,
+                      std::string* new_value,
+                      bool* value_changed) const = 0;
+
+  // Returns a name that identifies this compaction filter.
+  // The name will be printed to LOG file on start up for diagnosis.
+  virtual const char* Name() const = 0;
+};
+
+// Each compaction will create a new CompactionFilter allowing the
+// application to know about different compactions
+class CompactionFilterFactory {
+ public:
+  virtual ~CompactionFilterFactory() { };
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+    const CompactionFilter::Context& context) = 0;
+
+  // Returns a name that identifies this compaction filter factory.
+  virtual const char* Name() const = 0;
+};
+
+// Default implementation of CompactionFilterFactory which does not
+// return any filter
+class DefaultCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+  virtual std::unique_ptr<CompactionFilter>
+  CreateCompactionFilter(const CompactionFilter::Context& context) override {
+    return std::unique_ptr<CompactionFilter>(nullptr);
+  }
+
+  virtual const char* Name() const override {
+    return "DefaultCompactionFilterFactory";
+  }
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h
new file mode 100644 (file)
index 0000000..f3a8499
--- /dev/null
@@ -0,0 +1,67 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+// A Comparator object provides a total order across slices that are
+// used as keys in an sstable or a database.  A Comparator implementation
+// must be thread-safe since rocksdb may invoke its methods concurrently
+// from multiple threads.
+class Comparator {
+ public:
+  virtual ~Comparator();
+
+  // Three-way comparison.  Returns value:
+  //   < 0 iff "a" < "b",
+  //   == 0 iff "a" == "b",
+  //   > 0 iff "a" > "b"
+  virtual int Compare(const Slice& a, const Slice& b) const = 0;
+
+  // The name of the comparator.  Used to check for comparator
+  // mismatches (i.e., a DB created with one comparator is
+  // accessed using a different comparator).
+  //
+  // The client of this package should switch to a new name whenever
+  // the comparator implementation changes in a way that will cause
+  // the relative ordering of any two keys to change.
+  //
+  // Names starting with "rocksdb." are reserved and should not be used
+  // by any clients of this package.
+  virtual const char* Name() const = 0;
+
+  // Advanced functions: these are used to reduce the space requirements
+  // for internal data structures like index blocks.
+
+  // If *start < limit, changes *start to a short string in [start,limit).
+  // Simple comparator implementations may return with *start unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const = 0;
+
+  // Changes *key to a short string >= *key.
+  // Simple comparator implementations may return with *key unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortSuccessor(std::string* key) const = 0;
+};
+
+// Return a builtin comparator that uses lexicographic byte-wise
+// ordering.  The result remains the property of this module and
+// must not be deleted.
+extern const Comparator* BytewiseComparator();
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
new file mode 100644 (file)
index 0000000..4bf0957
--- /dev/null
@@ -0,0 +1,331 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_
+#define STORAGE_ROCKSDB_INCLUDE_DB_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <memory>
+#include <vector>
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/transaction_log.h"
+
+namespace rocksdb {
+
+using std::unique_ptr;
+
+// Update Makefile if you change these
+static const int kMajorVersion = 2;
+static const int kMinorVersion = 0;
+
+struct Options;
+struct ReadOptions;
+struct WriteOptions;
+struct FlushOptions;
+class WriteBatch;
+
+// Metadata associated with each SST file.
+struct LiveFileMetaData {
+  std::string name;        // Name of the file
+  int level;               // Level at which this file resides.
+  size_t size;             // File size in bytes.
+  std::string smallestkey; // Smallest user defined key in the file.
+  std::string largestkey;  // Largest user defined key in the file.
+  SequenceNumber smallest_seqno; // smallest seqno in file
+  SequenceNumber largest_seqno;  // largest seqno in file
+};
+
+// Abstract handle to particular state of a DB.
+// A Snapshot is an immutable object and can therefore be safely
+// accessed from multiple threads without any external synchronization.
+class Snapshot {
+ protected:
+  virtual ~Snapshot();
+};
+
+// A range of keys
+struct Range {
+  Slice start;          // Included in the range
+  Slice limit;          // Not included in the range
+
+  Range() { }
+  Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
+};
+
+// A DB is a persistent ordered map from keys to values.
+// A DB is safe for concurrent access from multiple threads without
+// any external synchronization.
+class DB {
+ public:
+  // Open the database with the specified "name".
+  // Stores a pointer to a heap-allocated database in *dbptr and returns
+  // OK on success.
+  // Stores nullptr in *dbptr and returns a non-OK status on error.
+  // Caller should delete *dbptr when it is no longer needed.
+  static Status Open(const Options& options,
+                     const std::string& name,
+                     DB** dbptr);
+
+  // Open the database for read only. All DB interfaces
+  // that modify data, like put/delete, will return error.
+  // If the db is opened in read only mode, then no compactions
+  // will happen.
+  static Status OpenForReadOnly(const Options& options,
+      const std::string& name, DB** dbptr,
+      bool error_if_log_file_exist = false);
+
+  DB() { }
+  virtual ~DB();
+
+  // Set the database entry for "key" to "value".
+  // Returns OK on success, and a non-OK status on error.
+  // Note: consider setting options.sync = true.
+  virtual Status Put(const WriteOptions& options,
+                     const Slice& key,
+                     const Slice& value) = 0;
+
+  // Remove the database entry (if any) for "key".  Returns OK on
+  // success, and a non-OK status on error.  It is not an error if "key"
+  // did not exist in the database.
+  // Note: consider setting options.sync = true.
+  virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
+
+  // Merge the database entry for "key" with "value".  Returns OK on success,
+  // and a non-OK status on error. The semantics of this operation is
+  // determined by the user provided merge_operator when opening DB.
+  // Note: consider setting options.sync = true.
+  virtual Status Merge(const WriteOptions& options,
+                       const Slice& key,
+                       const Slice& value) = 0;
+
+  // Apply the specified updates to the database.
+  // Returns OK on success, non-OK on failure.
+  // Note: consider setting options.sync = true.
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
+
+  // If the database contains an entry for "key" store the
+  // corresponding value in *value and return OK.
+  //
+  // If there is no entry for "key" leave *value unchanged and return
+  // a status for which Status::IsNotFound() returns true.
+  //
+  // May return some other Status on an error.
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key,
+                     std::string* value) = 0;
+
+  // If keys[i] does not exist in the database, then the i'th returned
+  // status will be one for which Status::IsNotFound() is true, and
+  // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
+  // the i'th returned status will have Status::ok() true, and (*values)[i]
+  // will store the value associated with keys[i].
+  //
+  // (*values) will always be resized to be the same size as (keys).
+  // Similarly, the number of returned statuses will be the number of keys.
+  // Note: keys will not be "de-duplicated". Duplicate keys will return
+  // duplicate values in order.
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values) = 0;
+
+  // If the key definitely does not exist in the database, then this method
+  // returns false, else true. If the caller wants to obtain value when the key
+  // is found in memory, a bool for 'value_found' must be passed. 'value_found'
+  // will be true on return if value has been set properly.
+  // This check is potentially lighter-weight than invoking DB::Get(). One way
+  // to make this lighter weight is to avoid doing any IOs.
+  // Default implementation here returns true and sets 'value_found' to false
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           const Slice& key,
+                           std::string* value,
+                           bool* value_found = nullptr) {
+    if (value_found != nullptr) {
+      *value_found = false;
+    }
+    return true;
+  }
+
+  // Return a heap-allocated iterator over the contents of the database.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  //
+  // Caller should delete the iterator when it is no longer needed.
+  // The returned iterator should be deleted before this db is deleted.
+  virtual Iterator* NewIterator(const ReadOptions& options) = 0;
+
+  // Return a handle to the current DB state.  Iterators created with
+  // this handle will all observe a stable snapshot of the current DB
+  // state.  The caller must call ReleaseSnapshot(result) when the
+  // snapshot is no longer needed.
+  virtual const Snapshot* GetSnapshot() = 0;
+
+  // Release a previously acquired snapshot.  The caller must not
+  // use "snapshot" after this call.
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
+
+  // DB implementations can export properties about their state
+  // via this method.  If "property" is a valid property understood by this
+  // DB implementation, fills "*value" with its current value and returns
+  // true.  Otherwise returns false.
+  //
+  //
+  // Valid property names include:
+  //
+  //  "rocksdb.num-files-at-level<N>" - return the number of files at level <N>,
+  //     where <N> is an ASCII representation of a level number (e.g. "0").
+  //  "rocksdb.stats" - returns a multi-line string that describes statistics
+  //     about the internal operation of the DB.
+  //  "rocksdb.sstables" - returns a multi-line string that describes all
+  //     of the sstables that make up the db contents.
+  virtual bool GetProperty(const Slice& property, std::string* value) = 0;
+
+  // For each i in [0,n-1], store in "sizes[i]", the approximate
+  // file system space used by keys in "[range[i].start .. range[i].limit)".
+  //
+  // Note that the returned sizes measure file system space usage, so
+  // if the user data compresses by a factor of ten, the returned
+  // sizes will be one-tenth the size of the corresponding user data size.
+  //
+  // The results may not include the sizes of recently written data.
+  virtual void GetApproximateSizes(const Range* range, int n,
+                                   uint64_t* sizes) = 0;
+
+  // Compact the underlying storage for the key range [*begin,*end].
+  // The actual compaction interval might be superset of [*begin, *end].
+  // In particular, deleted and overwritten versions are discarded,
+  // and the data is rearranged to reduce the cost of operations
+  // needed to access the data.  This operation should typically only
+  // be invoked by users who understand the underlying implementation.
+  //
+  // begin==nullptr is treated as a key before all keys in the database.
+  // end==nullptr is treated as a key after all keys in the database.
+  // Therefore the following call will compact the entire database:
+  //    db->CompactRange(nullptr, nullptr);
+  // Note that after the entire database is compacted, all data are pushed
+  // down to the last level containing any data. If the total data size
+  // after compaction is reduced, that level might not be appropriate for
+  // hosting all the files. In this case, client could set reduce_level
+  // to true, to move the files back to the minimum level capable of holding
+  // the data set or a given level (specified by non-negative target_level).
+  virtual void CompactRange(const Slice* begin, const Slice* end,
+                            bool reduce_level = false,
+                            int target_level = -1) = 0;
+
+  // Number of levels used for this DB.
+  virtual int NumberLevels() = 0;
+
+  // Maximum level to which a new compacted memtable is pushed if it
+  // does not create overlap.
+  virtual int MaxMemCompactionLevel() = 0;
+
+  // Number of files in level-0 that would stop writes.
+  virtual int Level0StopWriteTrigger() = 0;
+
+  // Get DB name -- the exact same name that was provided as an argument to
+  // DB::Open()
+  virtual const std::string& GetName() const = 0;
+
+  // Get Env object from the DB
+  virtual Env* GetEnv() const = 0;
+
+  // Get DB Options that we use
+  virtual const Options& GetOptions() const = 0;
+
+  // Flush all mem-table data.
+  virtual Status Flush(const FlushOptions& options) = 0;
+
+  // Prevent file deletions. Compactions will continue to occur,
+  // but no obsolete files will be deleted. Calling this multiple
+  // times have the same effect as calling it once.
+  virtual Status DisableFileDeletions() = 0;
+
+  // Allow compactions to delete obsolete files.
+  // If force == true, the call to EnableFileDeletions() will guarantee that
+  // file deletions are enabled after the call, even if DisableFileDeletions()
+  // was called multiple times before.
+  // If force == false, EnableFileDeletions will only enable file deletion
+  // after it's been called at least as many times as DisableFileDeletions(),
+  // enabling the two methods to be called by two threads concurrently without
+  // synchronization -- i.e., file deletions will be enabled only after both
+  // threads call EnableFileDeletions()
+  virtual Status EnableFileDeletions(bool force = true) = 0;
+
+  // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup
+
+  // THIS METHOD IS DEPRECATED. Use the GetTableMetaData to get more
+  // detailed information on the live files.
+  // Retrieve the list of all files in the database. The files are
+  // relative to the dbname and are not absolute paths. The valid size of the
+  // manifest file is returned in manifest_file_size. The manifest file is an
+  // ever growing file, but only the portion specified by manifest_file_size is
+  // valid for this snapshot.
+  // Setting flush_memtable to true does Flush before recording the live files.
+  // Setting flush_memtable to false is useful when we don't want to wait for
+  // flush which may have to wait for compaction to complete taking an
+  // indeterminate time. But this will have to use GetSortedWalFiles after
+  // GetLiveFiles to compensate for memtables missed in this snapshot due to the
+  // absence of Flush, by WAL files to recover the database consistently later
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) = 0;
+
+  // Retrieve the sorted list of all wal files with earliest file first
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
+
+  // The sequence number of the most recent transaction.
+  virtual SequenceNumber GetLatestSequenceNumber() const = 0;
+
+  // Sets iter to an iterator that is positioned at a write-batch containing
+  // seq_number. If the sequence number is non existent, it returns an iterator
+  // at the first available seq_no after the requested seq_no
+  // Returns Status::OK if iterator is valid
+  // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
+  // use this api, else the WAL files will get
+  // cleared aggressively and the iterator might keep getting invalid before
+  // an update is read.
+  virtual Status GetUpdatesSince(SequenceNumber seq_number,
+                                 unique_ptr<TransactionLogIterator>* iter) = 0;
+
+  // Delete the file name from the db directory and update the internal state to
+  // reflect that. Supports deletion of sst and log files only. 'name' must be
+  // path relative to the db directory. eg. 000001.sst, /archive/000003.log
+  virtual Status DeleteFile(std::string name) = 0;
+
+  // Returns a list of all table files with their level, start key
+  // and end key
+  virtual void GetLiveFilesMetaData(
+    std::vector<LiveFileMetaData> *metadata) {
+  }
+
+  // Sets the globally unique ID created at database creation time by invoking
+  // Env::GenerateUniqueId(), in identity. Returns Status::OK if identity could
+  // be set properly
+  virtual Status GetDbIdentity(std::string& identity) = 0;
+
+ private:
+  // No copying allowed
+  DB(const DB&);
+  void operator=(const DB&);
+};
+
+// Destroy the contents of the specified database.
+// Be very careful using this method.
+Status DestroyDB(const std::string& name, const Options& options);
+
+// If a DB cannot be opened, you may attempt to call this method to
+// resurrect as much of the contents of the database as possible.
+// Some data may be lost, so be careful when calling this function
+// on a database that contains important information.
+Status RepairDB(const std::string& dbname, const Options& options);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_DB_H_
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
new file mode 100644 (file)
index 0000000..73acbfa
--- /dev/null
@@ -0,0 +1,649 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the rocksdb implementation to access
+// operating system functionality like the filesystem etc.  Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine gain control; e.g., to rate limit file system operations.
+//
+// All Env implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_
+#define STORAGE_ROCKSDB_INCLUDE_ENV_H_
+
+#include <cstdarg>
+#include <string>
+#include <memory>
+#include <vector>
+#include <stdint.h>
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class FileLock;
+class Logger;
+class RandomAccessFile;
+class SequentialFile;
+class Slice;
+class WritableFile;
+class RandomRWFile;
+struct Options;
+
+using std::unique_ptr;
+using std::shared_ptr;
+
+
+// Options while opening a file to read/write
+struct EnvOptions {
+
+  // construct with default Options
+  EnvOptions();
+
+  // construct from Options
+  explicit EnvOptions(const Options& options);
+
+  // If true, then allow caching of data in environment buffers
+  bool use_os_buffer = true;
+
+   // If true, then use mmap to read data
+  bool use_mmap_reads = false;
+
+   // If true, then use mmap to write data
+  bool use_mmap_writes = true;
+
+  // If true, set the FD_CLOEXEC on open fd.
+  bool set_fd_cloexec= true;
+
+  // Allows OS to incrementally sync files to disk while they are being
+  // written, in the background. Issue one request for every bytes_per_sync
+  // written. 0 turns it off.
+  // Default: 0
+  uint64_t bytes_per_sync = 0;
+};
+
+class Env {
+ public:
+  Env() { }
+  virtual ~Env();
+
+  // Return a default environment suitable for the current operating
+  // system.  Sophisticated users may wish to provide their own Env
+  // implementation instead of relying on this default environment.
+  //
+  // The result of Default() belongs to rocksdb and must never be deleted.
+  static Env* Default();
+
+  // Create a brand new sequentially-readable file with the specified name.
+  // On success, stores a pointer to the new file in *result and returns OK.
+  // On failure stores nullptr in *result and returns non-OK.  If the file does
+  // not exist, returns a non-OK status.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options)
+                                   = 0;
+
+  // Create a brand new random access read-only file with the
+  // specified name.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.  If the file does not exist, returns a non-OK
+  // status.
+  //
+  // The returned file may be concurrently accessed by multiple threads.
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options)
+                                     = 0;
+
+  // Create an object that writes to a new file with the specified
+  // name.  Deletes any existing file with the same name and creates a
+  // new file.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) = 0;
+
+  // Create an object that both reads and writes to a file on
+  // specified offsets (random access). If file already exists,
+  // does not overwrite it. On success, stores a pointer to the
+  // new file in *result and returns OK. On failure stores nullptr
+  // in *result and returns non-OK.
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options) = 0;
+
+  // Returns true iff the named file exists.
+  virtual bool FileExists(const std::string& fname) = 0;
+
+  // Store in *result the names of the children of the specified directory.
+  // The names are relative to "dir".
+  // Original contents of *results are dropped.
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) = 0;
+
+  // Delete the named file.
+  virtual Status DeleteFile(const std::string& fname) = 0;
+
+  // Create the specified directory. Returns error if directory exists.
+  virtual Status CreateDir(const std::string& dirname) = 0;
+
+  // Creates directory if missing. Return Ok if it exists, or successful in
+  // Creating.
+  virtual Status CreateDirIfMissing(const std::string& dirname) = 0;
+
+  // Delete the specified directory.
+  virtual Status DeleteDir(const std::string& dirname) = 0;
+
+  // Store the size of fname in *file_size.
+  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
+
+  // Store the last modification time of fname in *file_mtime.
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime) = 0;
+  // Rename file src to target.
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) = 0;
+
+  // Lock the specified file.  Used to prevent concurrent access to
+  // the same db by multiple processes.  On failure, stores nullptr in
+  // *lock and returns non-OK.
+  //
+  // On success, stores a pointer to the object that represents the
+  // acquired lock in *lock and returns OK.  The caller should call
+  // UnlockFile(*lock) to release the lock.  If the process exits,
+  // the lock will be automatically released.
+  //
+  // If somebody else already holds the lock, finishes immediately
+  // with a failure.  I.e., this call does not wait for existing locks
+  // to go away.
+  //
+  // May create the named file if it does not already exist.
+  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
+
+  // Release the lock acquired by a previous successful call to LockFile.
+  // REQUIRES: lock was returned by a successful LockFile() call
+  // REQUIRES: lock has not already been unlocked.
+  virtual Status UnlockFile(FileLock* lock) = 0;
+
+  enum Priority { LOW, HIGH, TOTAL };
+
+  // Arrange to run "(*function)(arg)" once in a background thread, in
+  // the thread pool specified by pri. By default, jobs go to the 'LOW'
+  // priority thread pool.
+
+  // "function" may run in an unspecified thread.  Multiple functions
+  // added to the same Env may run concurrently in different threads.
+  // I.e., the caller may not assume that background work items are
+  // serialized.
+  virtual void Schedule(
+      void (*function)(void* arg),
+      void* arg,
+      Priority pri = LOW) = 0;
+
+  // Start a new thread, invoking "function(arg)" within the new thread.
+  // When "function(arg)" returns, the thread will be destroyed.
+  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
+
+  // *path is set to a temporary directory that can be used for testing. It may
+  // or may not have just been created. The directory may or may not differ
+  // between runs of the same process, but subsequent calls will return the
+  // same directory.
+  virtual Status GetTestDirectory(std::string* path) = 0;
+
+  // Create and return a log file for storing informational messages.
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result) = 0;
+
+  // Returns the number of micro-seconds since some fixed point in time. Only
+  // useful for computing deltas of time.
+  virtual uint64_t NowMicros() = 0;
+
+  // Returns the number of nano-seconds since some fixed point in time. Only
+  // useful for computing deltas of time in one run.
+  // Default implementation simply relies on NowMicros
+  virtual uint64_t NowNanos() {
+    return NowMicros() * 1000;
+  }
+
+  // Sleep/delay the thread for the prescribed number of micro-seconds.
+  virtual void SleepForMicroseconds(int micros) = 0;
+
+  // Get the current host name.
+  virtual Status GetHostName(char* name, uint64_t len) = 0;
+
+  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+  virtual Status GetCurrentTime(int64_t* unix_time) = 0;
+
+  // Get full directory name for this db.
+  virtual Status GetAbsolutePath(const std::string& db_path,
+      std::string* output_path) = 0;
+
+  // The number of background worker threads of a specific thread pool
+  // for this environment. 'LOW' is the default pool.
+  // default number: 1
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;
+
+  // Converts seconds-since-Jan-01-1970 to a printable string
+  virtual std::string TimeToString(uint64_t time) = 0;
+
+  // Generates a unique id that can be used to identify a db
+  virtual std::string GenerateUniqueId();
+
+ private:
+  // No copying allowed
+  Env(const Env&);
+  void operator=(const Env&);
+};
+
+// A file abstraction for reading sequentially through a file
+class SequentialFile {
+ public:
+  SequentialFile() { }
+  virtual ~SequentialFile();
+
+  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
+  // written by this routine.  Sets "*result" to the data that was
+  // read (including if fewer than "n" bytes were successfully read).
+  // May set "*result" to point at data in "scratch[0..n-1]", so
+  // "scratch[0..n-1]" must be live when "*result" is used.
+  // If an error was encountered, returns a non-OK status.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
+
+  // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower than reading the same data, but may be faster.
+  //
+  // If end of file is reached, skipping will stop at the end of the
+  // file, and Skip will return OK.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Skip(uint64_t n) = 0;
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class RandomAccessFile {
+ public:
+  RandomAccessFile() { }
+  virtual ~RandomAccessFile();
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used.  If an error was encountered, returns a non-OK
+  // status.
+  //
+  // Safe for concurrent use by multiple threads.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+
+  // Tries to get an unique ID for this file that will be the same each time
+  // the file is opened (and will stay the same while the file is open).
+  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+  // ID can be created this function returns the length of the ID and places it
+  // in "id"; otherwise, this function returns 0, in which case "id"
+  // may not have been modified.
+  //
+  // This function guarantees, for IDs from a given environment, two unique ids
+  // cannot be made equal to each other by adding arbitrary bytes to one of
+  // them. That is, no unique ID is the prefix of another.
+  //
+  // This function guarantees that the returned ID will not be interpretable as
+  // a single varint.
+  //
+  // Note: these IDs are only valid for the duration of the process.
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return 0; // Default implementation to prevent issues with backwards
+              // compatibility.
+  };
+
+
+  enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
+
+  virtual void Hint(AccessPattern pattern) {}
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+};
+
+// A file abstraction for sequential writing.  The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class WritableFile {
+ public:
+  WritableFile() : last_preallocated_block_(0), preallocation_block_size_ (0) {
+  }
+  virtual ~WritableFile();
+
+  virtual Status Append(const Slice& data) = 0;
+  virtual Status Close() = 0;
+  virtual Status Flush() = 0;
+  virtual Status Sync() = 0; // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual Status Fsync() {
+    return Sync();
+  }
+
+  /*
+   * Get the size of valid data in the file.
+   */
+  virtual uint64_t GetFileSize() {
+    return 0;
+  }
+
+  /*
+   * Get and set the default pre-allocation block size for writes to
+   * this file.  If non-zero, then Allocate will be used to extend the
+   * underlying storage of a file (generally via fallocate) if the Env
+   * instance supports it.
+   */
+  void SetPreallocationBlockSize(size_t size) {
+    preallocation_block_size_ = size;
+  }
+
+  virtual void GetPreallocationStatus(size_t* block_size,
+                                      size_t* last_allocated_block) {
+    *last_allocated_block = last_preallocated_block_;
+    *block_size = preallocation_block_size_;
+  }
+
+  // For documentation, refer to RandomAccessFile::GetUniqueId()
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return 0; // Default implementation to prevent issues with backwards compatibility.
+  }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  // This call has no effect on dirty pages in the cache.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+
+ protected:
+  // PrepareWrite performs any necessary preparation for a write
+  // before the write actually occurs.  This allows for pre-allocation
+  // of space on devices where it can result in less file
+  // fragmentation and/or less waste from over-zealous filesystem
+  // pre-allocation.
+  void PrepareWrite(size_t offset, size_t len) {
+    if (preallocation_block_size_ == 0) {
+      return;
+    }
+    // If this write would cross one or more preallocation blocks,
+    // determine what the last preallocation block necessary to
+    // cover this write would be and Allocate to that point.
+    const auto block_size = preallocation_block_size_;
+    size_t new_last_preallocated_block =
+      (offset + len + block_size - 1) / block_size;
+    if (new_last_preallocated_block > last_preallocated_block_) {
+      size_t num_spanned_blocks =
+        new_last_preallocated_block - last_preallocated_block_;
+      Allocate(block_size * last_preallocated_block_,
+               block_size * num_spanned_blocks);
+      last_preallocated_block_ = new_last_preallocated_block;
+    }
+  }
+
+  /*
+   * Pre-allocate space for a file.
+   */
+  virtual Status Allocate(off_t offset, off_t len) {
+    return Status::OK();
+  }
+
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // Default implementation does nothing.
+  virtual Status RangeSync(off_t offset, off_t nbytes) {
+    return Status::OK();
+  }
+
+ private:
+  size_t last_preallocated_block_;
+  size_t preallocation_block_size_;
+  // No copying allowed
+  WritableFile(const WritableFile&);
+  void operator=(const WritableFile&);
+};
+
+// A file abstraction for random reading and writing.
+class RandomRWFile {
+ public:
+  RandomRWFile() {}
+  virtual ~RandomRWFile() {}
+
+  // Write data from Slice data to file starting from offset
+  // Returns IOError on failure, but does not guarantee
+  // atomicity of a write.  Returns OK status on success.
+  //
+  // Safe for concurrent use.
+  virtual Status Write(uint64_t offset, const Slice& data) = 0;
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used.  If an error was encountered, returns a non-OK
+  // status.
+  //
+  // Safe for concurrent use by multiple threads.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+  virtual Status Close() = 0; // closes the file
+  virtual Status Sync() = 0; // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual Status Fsync() {
+    return Sync();
+  }
+
+  /*
+   * Pre-allocate space for a file.
+   */
+  virtual Status Allocate(off_t offset, off_t len) {
+    return Status::OK();
+  }
+
+ private:
+  // No copying allowed
+  RandomRWFile(const RandomRWFile&);
+  void operator=(const RandomRWFile&);
+};
+
+// An interface for writing log messages.
+class Logger {
+ public:
+  enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 };
+  Logger() { }
+  virtual ~Logger();
+
+  // Write an entry to the log file with the specified format.
+  virtual void Logv(const char* format, va_list ap) = 0;
+  virtual size_t GetLogFileSize() const {
+    return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE;
+  }
+  // Flush to the OS buffers
+  virtual void Flush() {}
+
+ private:
+  // No copying allowed
+  Logger(const Logger&);
+  void operator=(const Logger&);
+};
+
+
+// Identifies a locked file.
+class FileLock {
+ public:
+  FileLock() { }
+  virtual ~FileLock();
+ private:
+  // No copying allowed
+  FileLock(const FileLock&);
+  void operator=(const FileLock&);
+};
+
+
+extern void LogFlush(const shared_ptr<Logger>& info_log);
+
+// Log the specified data to *info_log if info_log is non-nullptr.
+extern void Log(const shared_ptr<Logger>& info_log, const char* format, ...)
+#   if defined(__GNUC__) || defined(__clang__)
+    __attribute__((__format__ (__printf__, 2, 3)))
+#   endif
+    ;
+
+extern void LogFlush(Logger *info_log);
+
+extern void Log(Logger* info_log, const char* format, ...)
+#   if defined(__GNUC__) || defined(__clang__)
+    __attribute__((__format__ (__printf__, 2, 3)))
+#   endif
+    ;
+
+// A utility routine: write "data" to the named file.
+extern Status WriteStringToFile(Env* env, const Slice& data,
+                                const std::string& fname);
+
+// A utility routine: read contents of named file into *data
+extern Status ReadFileToString(Env* env, const std::string& fname,
+                               std::string* data);
+
+// An implementation of Env that forwards all calls to another Env.
+// May be useful to clients who wish to override just part of the
+// functionality of another Env.
+class EnvWrapper : public Env {
+ public:
+  // Initialize an EnvWrapper that delegates all calls to *t
+  explicit EnvWrapper(Env* t) : target_(t) { }
+  virtual ~EnvWrapper();
+
+  // Return the target to which this Env forwards all calls
+  Env* target() const { return target_; }
+
+  // The following text is boilerplate that forwards all methods to target()
+  Status NewSequentialFile(const std::string& f,
+                           unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) {
+    return target_->NewSequentialFile(f, r, options);
+  }
+  Status NewRandomAccessFile(const std::string& f,
+                             unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& options) {
+    return target_->NewRandomAccessFile(f, r, options);
+  }
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) {
+    return target_->NewWritableFile(f, r, options);
+  }
+  Status NewRandomRWFile(const std::string& f, unique_ptr<RandomRWFile>* r,
+                         const EnvOptions& options) {
+    return target_->NewRandomRWFile(f, r, options);
+  }
+  bool FileExists(const std::string& f) { return target_->FileExists(f); }
+  Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
+    return target_->GetChildren(dir, r);
+  }
+  Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
+  Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
+  Status CreateDirIfMissing(const std::string& d) {
+    return target_->CreateDirIfMissing(d);
+  }
+  Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
+  Status GetFileSize(const std::string& f, uint64_t* s) {
+    return target_->GetFileSize(f, s);
+  }
+
+  Status GetFileModificationTime(const std::string& fname,
+                                 uint64_t* file_mtime) {
+    return target_->GetFileModificationTime(fname, file_mtime);
+  }
+
+  Status RenameFile(const std::string& s, const std::string& t) {
+    return target_->RenameFile(s, t);
+  }
+  Status LockFile(const std::string& f, FileLock** l) {
+    return target_->LockFile(f, l);
+  }
+  Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
+  void Schedule(void (*f)(void*), void* a, Priority pri) {
+    return target_->Schedule(f, a, pri);
+  }
+  void StartThread(void (*f)(void*), void* a) {
+    return target_->StartThread(f, a);
+  }
+  virtual Status GetTestDirectory(std::string* path) {
+    return target_->GetTestDirectory(path);
+  }
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result) {
+    return target_->NewLogger(fname, result);
+  }
+  uint64_t NowMicros() {
+    return target_->NowMicros();
+  }
+  void SleepForMicroseconds(int micros) {
+    target_->SleepForMicroseconds(micros);
+  }
+  Status GetHostName(char* name, uint64_t len) {
+    return target_->GetHostName(name, len);
+  }
+  Status GetCurrentTime(int64_t* unix_time) {
+    return target_->GetCurrentTime(unix_time);
+  }
+  Status GetAbsolutePath(const std::string& db_path,
+      std::string* output_path) {
+    return target_->GetAbsolutePath(db_path, output_path);
+  }
+  void SetBackgroundThreads(int num, Priority pri) {
+    return target_->SetBackgroundThreads(num, pri);
+  }
+  std::string TimeToString(uint64_t time) {
+    return target_->TimeToString(time);
+  }
+
+ private:
+  Env* target_;
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_ENV_H_
diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h
new file mode 100644 (file)
index 0000000..fa44db4
--- /dev/null
@@ -0,0 +1,74 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A database can be configured with a custom FilterPolicy object.
+// This object is responsible for creating a small filter from a set
+// of keys.  These filters are stored in rocksdb and are consulted
+// automatically by rocksdb to decide whether or not to read some
+// information from disk. In many cases, a filter can cut down the
+// number of disk seeks from a handful to a single disk seek per
+// DB::Get() call.
+//
+// Most people will want to use the builtin bloom filter support (see
+// NewBloomFilterPolicy() below).
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
+#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+class FilterPolicy {
+ public:
+  virtual ~FilterPolicy();
+
+  // Return the name of this policy.  Note that if the filter encoding
+  // changes in an incompatible way, the name returned by this method
+  // must be changed.  Otherwise, old incompatible filters may be
+  // passed to methods of this type.
+  virtual const char* Name() const = 0;
+
+  // keys[0,n-1] contains a list of keys (potentially with duplicates)
+  // that are ordered according to the user supplied comparator.
+  // Append a filter that summarizes keys[0,n-1] to *dst.
+  //
+  // Warning: do not change the initial contents of *dst.  Instead,
+  // append the newly constructed filter to *dst.
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
+      const = 0;
+
+  // "filter" contains the data appended by a preceding call to
+  // CreateFilter() on this class.  This method must return true if
+  // the key was in the list of keys passed to CreateFilter().
+  // This method may return true or false if the key was not on the
+  // list, but it should aim to return false with a high probability.
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
+};
+
+// Return a new filter policy that uses a bloom filter with approximately
+// the specified number of bits per key.  A good value for bits_per_key
+// is 10, which yields a filter with ~ 1% false positive rate.
+//
+// Callers must delete the result after any database that is using the
+// result has been closed.
+//
+// Note: if you are using a custom comparator that ignores some parts
+// of the keys being compared, you must not use NewBloomFilterPolicy()
+// and must provide your own FilterPolicy that also ignores the
+// corresponding parts of the keys.  For example, if the comparator
+// ignores trailing spaces, it would be incorrect to use a
+// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
+// trailing spaces in keys.
+extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
+
+}
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
diff --git a/include/rocksdb/flush_block_policy.h b/include/rocksdb/flush_block_policy.h
new file mode 100644 (file)
index 0000000..1740d87
--- /dev/null
@@ -0,0 +1,64 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+class BlockBuilder;
+
+// FlushBlockPolicy provides a configurable way to determine when to flush a
+// block in the block based tables.
+class FlushBlockPolicy {
+ public:
+  // Keep track of the key/value sequences and return the boolean value to
+  // determine if table builder should flush current data block.
+  virtual bool Update(const Slice& key,
+                      const Slice& value) = 0;
+
+  virtual ~FlushBlockPolicy() { }
+};
+
+class FlushBlockPolicyFactory {
+ public:
+  // Return the name of the flush block policy.
+  virtual const char* Name() const = 0;
+
+  // Return a new block flush policy that flushes data blocks by data size.
+  // FlushBlockPolicy may need to access the metadata of the data block
+  // builder to determine when to flush the blocks.
+  //
+  // Callers must delete the result after any database that is using the
+  // result has been closed.
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBuilder& data_block_builder) const = 0;
+
+  virtual ~FlushBlockPolicyFactory() { }
+};
+
+class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  FlushBlockBySizePolicyFactory(const uint64_t block_size,
+                                const uint64_t block_size_deviation) :
+      block_size_(block_size),
+      block_size_deviation_(block_size_deviation) {
+  }
+
+  virtual const char* Name() const override {
+    return "FlushBlockBySizePolicyFactory";
+  }
+
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBuilder& data_block_builder) const override;
+
+ private:
+  const uint64_t block_size_;
+  const uint64_t block_size_deviation_;
+};
+
+}  // rocksdb
diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h
new file mode 100644 (file)
index 0000000..7538e9c
--- /dev/null
@@ -0,0 +1,106 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An iterator yields a sequence of key/value pairs from a source.
+// The following class defines the interface.  Multiple implementations
+// are provided by this library.  In particular, iterators are provided
+// to access the contents of a Table or a DB.
+//
+// Multiple threads can invoke const methods on an Iterator without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Iterator must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class Iterator {
+ public:
+  Iterator();
+  virtual ~Iterator();
+
+  // An iterator is either positioned at a key/value pair, or
+  // not valid.  This method returns true iff the iterator is valid.
+  virtual bool Valid() const = 0;
+
+  // Position at the first key in the source.  The iterator is Valid()
+  // after this call iff the source is not empty.
+  virtual void SeekToFirst() = 0;
+
+  // Position at the last key in the source.  The iterator is
+  // Valid() after this call iff the source is not empty.
+  virtual void SeekToLast() = 0;
+
+  // Position at the first key in the source that is at or past target
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or past target.
+  virtual void Seek(const Slice& target) = 0;
+
+  // Moves to the next entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Moves to the previous entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the first entry in source.
+  // REQUIRES: Valid()
+  virtual void Prev() = 0;
+
+  // Return the key for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice key() const = 0;
+
+  // Return the value for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice value() const = 0;
+
+  // If an error has occurred, return it.  Else return an ok status.
+  // If non-blocking IO is requested and this operation cannot be
+  // satisfied without doing some IO, then this returns Status::Incomplete().
+  virtual Status status() const = 0;
+
+  // Clients are allowed to register function/arg1/arg2 triples that
+  // will be invoked when this iterator is destroyed.
+  //
+  // Note that unlike all of the preceding methods, this method is
+  // not abstract and therefore clients should not override it.
+  typedef void (*CleanupFunction)(void* arg1, void* arg2);
+  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
+
+ private:
+  struct Cleanup {
+    CleanupFunction function;
+    void* arg1;
+    void* arg2;
+    Cleanup* next;
+  };
+  Cleanup cleanup_;
+
+  // No copying allowed
+  Iterator(const Iterator&);
+  void operator=(const Iterator&);
+};
+
+// Return an empty iterator (yields nothing).
+extern Iterator* NewEmptyIterator();
+
+// Return an empty iterator with the specified status.
+extern Iterator* NewErrorIterator(const Status& status);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
diff --git a/include/rocksdb/ldb_tool.h b/include/rocksdb/ldb_tool.h
new file mode 100644 (file)
index 0000000..a46b6a7
--- /dev/null
@@ -0,0 +1,18 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#ifndef STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
+#define STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+class LDBTool {
+ public:
+  void Run(int argc, char** argv, Options = Options());
+};
+
+} // namespace rocksdb
+
+#endif // STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h
new file mode 100644 (file)
index 0000000..fcb782d
--- /dev/null
@@ -0,0 +1,203 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file contains the interface that must be implemented by any collection
+// to be used as the backing store for a MemTable. Such a collection must
+// satisfy the following properties:
+//  (1) It does not store duplicate items.
+//  (2) It uses MemTableRep::KeyComparator to compare items for iteration and
+//     equality.
+//  (3) It can be accessed concurrently by multiple readers and can support
+//     writes during reads. However, it needn't support multiple concurrent writes.
+//  (4) Items are never deleted.
+// The liberal use of assertions is encouraged to enforce (1).
+//
+// The factory will be passed an Arena object when a new MemTableRep is
+// requested. The API for this object is in rocksdb/arena.h.
+//
+// Users can implement their own memtable representations. We include three
+// types built in:
+//  - SkipListRep: This is the default; it is backed by a skip list.
+//  - HashSkipListRep: The memtable rep that is best used for keys that are
+//  structured like "prefix:suffix" where iteration within a prefix is
+//  common and iteration across different prefixes is rare. It is backed by
+//  a hash map where each bucket is a skip list.
+//  - VectorRep: This is backed by an unordered std::vector. On iteration, the
+// vector is sorted. It is intelligent about sorting; once the MarkReadOnly()
+// has been called, the vector will only be sorted once. It is optimized for
+// random-write-heavy workloads.
+//
+// The last two implementations are designed for situations in which
+// iteration over the entire collection is rare since doing so requires all the
+// keys to be copied into a sorted data structure.
+
+#ifndef STORAGE_ROCKSDB_DB_MEMTABLEREP_H_
+#define STORAGE_ROCKSDB_DB_MEMTABLEREP_H_
+
+#include <memory>
+#include "rocksdb/arena.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+
+namespace rocksdb {
+
+class MemTableRep {
+ public:
+  // KeyComparator provides a means to compare keys, which are internal keys
+  // concatenated with values.
+  class KeyComparator {
+   public:
+    // Compare a and b. Return a negative value if a is less than b, 0 if they
+    // are equal, and a positive value if a is greater than b
+    virtual int operator()(const char* a, const char* b) const = 0;
+
+    virtual ~KeyComparator() { }
+  };
+
+  // Insert key into the collection. (The caller will pack key and value into a
+  // single buffer and pass that in as the parameter to Insert)
+  // REQUIRES: nothing that compares equal to key is currently in the
+  // collection.
+  virtual void Insert(const char* key) = 0;
+
+  // Returns true iff an entry that compares equal to key is in the collection.
+  virtual bool Contains(const char* key) const = 0;
+
+  // Notify this table rep that it will no longer be added to. By default, does
+  // nothing.
+  virtual void MarkReadOnly() { }
+
+  // Report an approximation of how much memory has been used other than memory
+  // that was allocated through the arena.
+  virtual size_t ApproximateMemoryUsage() = 0;
+
+  virtual ~MemTableRep() { }
+
+  // Iteration over the contents of a skip collection
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified collection.
+    // The returned iterator is not valid.
+    // explicit Iterator(const MemTableRep* collection);
+    virtual ~Iterator() { };
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const = 0;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const = 0;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() = 0;
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() = 0;
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const char* target) = 0;
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() = 0;
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() = 0;
+  };
+
+  // Return an iterator over the keys in this representation.
+  virtual std::shared_ptr<Iterator> GetIterator() = 0;
+
+  // Return an iterator over at least the keys with the specified user key. The
+  // iterator may also allow access to other keys, but doesn't have to. Default:
+  // GetIterator().
+  virtual std::shared_ptr<Iterator> GetIterator(const Slice& user_key) {
+    return GetIterator();
+  }
+
+  // Return an iterator over at least the keys with the specified prefix. The
+  // iterator may also allow access to other keys, but doesn't have to. Default:
+  // GetIterator().
+  virtual std::shared_ptr<Iterator> GetPrefixIterator(const Slice& prefix) {
+    return GetIterator();
+  }
+
+  // Return an iterator that has a special Seek semantics. The result of
+  // a Seek might only include keys with the same prefix as the target key.
+  virtual std::shared_ptr<Iterator> GetDynamicPrefixIterator() {
+    return GetIterator();
+  }
+
+ protected:
+  // When *key is an internal key concatenated with the value, returns the
+  // user key.
+  virtual Slice UserKey(const char* key) const;
+};
+
+// This is the base class for all factories that are used by RocksDB to create
+// new MemTableRep objects
+class MemTableRepFactory {
+ public:
+  virtual ~MemTableRepFactory() { };
+  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
+    MemTableRep::KeyComparator&, Arena*) = 0;
+  virtual const char* Name() const = 0;
+};
+
+// This creates MemTableReps that are backed by an std::vector. On iteration,
+// the vector is sorted. This is useful for workloads where iteration is very
+// rare and writes are generally not issued after reads begin.
+//
+// Parameters:
+//   count: Passed to the constructor of the underlying std::vector of each
+//     VectorRep. On initialization, the underlying array will be at least count
+//     bytes reserved for usage.
+class VectorRepFactory : public MemTableRepFactory {
+  const size_t count_;
+public:
+  explicit VectorRepFactory(size_t count = 0) : count_(count) { }
+  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
+    MemTableRep::KeyComparator&, Arena*) override;
+  virtual const char* Name() const override {
+    return "VectorRepFactory";
+  }
+};
+
+// This uses a skip list to store keys. It is the default.
+class SkipListFactory : public MemTableRepFactory {
+public:
+  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
+    MemTableRep::KeyComparator&, Arena*) override;
+  virtual const char* Name() const override {
+    return "SkipListFactory";
+  }
+};
+
+// HashSkipListRep is backed by hash map of buckets. Each bucket is a skip
+// list. All the keys with the same prefix will be in the same bucket.
+// The prefix is determined using user supplied SliceTransform. It has
+// to match prefix_extractor in options.prefix_extractor.
+//
+// Iteration over the entire collection is implemented by dumping all the keys
+// into a separate skip list. Thus, these data structures are best used when
+// iteration over the entire collection is rare.
+//
+// Parameters:
+//   transform: The prefix extractor that returns prefix when supplied a user
+//     key. Has to match options.prefix_extractor
+//   bucket_count: Number of buckets in a hash_map. Each bucket needs
+//     8 bytes. By default, we set buckets to one million, which
+//     will take 8MB of memory. If you know the number of keys you'll
+//     keep in hash map, set bucket count to be approximately twice
+//     the number of keys
+extern MemTableRepFactory* NewHashSkipListRepFactory(
+    const SliceTransform* transform, size_t bucket_count = 1000000);
+
+}
+
+#endif // STORAGE_ROCKSDB_DB_MEMTABLEREP_H_
diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h
new file mode 100644 (file)
index 0000000..bd4c36c
--- /dev/null
@@ -0,0 +1,149 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
+
+#include <memory>
+#include <string>
+#include <deque>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+class Slice;
+class Logger;
+
+// The Merge Operator
+//
+// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only
+// client knows. It could be numeric addition, list append, string
+// concatenation, edit data structure, ... , anything.
+// The library, on the other hand, is concerned with the exercise of this
+// interface, at the right time (during get, iteration, compaction...)
+//
+// To use merge, the client needs to provide an object implementing one of
+// the following interfaces:
+//  a) AssociativeMergeOperator - for most simple semantics (always take
+//    two values, and merge them into one value, which is then put back
+//    into rocksdb); numeric addition and string concatenation are examples;
+//
+//  b) MergeOperator - the generic class for all the more abstract / complex
+//    operations; one method (FullMerge) to merge a Put/Delete value with a
+//    merge operand; and another method (PartialMerge) that merges two
+//    operands together. This is especially useful if your key values have a
+//    complex structure but you would still like to support client-specific
+//    incremental updates.
+//
+// AssociativeMergeOperator is simpler to implement. MergeOperator is simply
+// more powerful.
+//
+// Refer to rocksdb-merge wiki for more details and example implementations.
+//
+class MergeOperator {
+ public:
+  virtual ~MergeOperator() {}
+
+  // Gives the client a way to express the read -> modify -> write semantics
+  // key:      (IN)    The key that's associated with this merge operation.
+  //                   Client could multiplex the merge operator based on it
+  //                   if the key space is partitioned and different subspaces
+  //                   refer to different types of data which have different
+  //                   merge operation semantics
+  // existing_value: (IN) null indicates that the key does not exist before
+  //                   this op
+  // operand_list:(IN) the sequence of merge operations to apply, front() first.
+  // new_value:(OUT)   Client is responsible for filling the merge result here
+  // logger:   (IN)    Client could use this to log errors during merge.
+  //
+  // Return true on success.
+  // All values passed in will be client-specific values. So if this method
+  // returns false, it is because client specified bad data or there was
+  // internal corruption. This will be treated as an error by the library.
+  //
+  // Also make use of the *logger for error messages.
+  virtual bool FullMerge(const Slice& key,
+                         const Slice* existing_value,
+                         const std::deque<std::string>& operand_list,
+                         std::string* new_value,
+                         Logger* logger) const = 0;
+
+  // This function performs merge(left_op, right_op)
+  // when both the operands are themselves merge operation types
+  // that you would have passed to a DB::Merge() call in the same order
+  // (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)).
+  //
+  // PartialMerge should combine them into a single merge operation that is
+  // saved into *new_value, and then it should return true.
+  // *new_value should be constructed such that a call to
+  // DB::Merge(key, *new_value) would yield the same result as a call
+  // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op).
+  //
+  // If it is impossible or infeasible to combine the two operations,
+  // leave new_value unchanged and return false. The library will
+  // internally keep track of the operations, and apply them in the
+  // correct order once a base-value (a Put/Delete/End-of-Database) is seen.
+  //
+  // TODO: Presently there is no way to differentiate between error/corruption
+  // and simply "return false". For now, the client should simply return
+  // false in any case it cannot perform partial-merge, regardless of reason.
+  // If there is corruption in the data, handle it in the FullMerge() function,
+  // and return false there.
+  virtual bool PartialMerge(const Slice& key,
+                            const Slice& left_operand,
+                            const Slice& right_operand,
+                            std::string* new_value,
+                            Logger* logger) const = 0;
+
+  // The name of the MergeOperator. Used to check for MergeOperator
+  // mismatches (i.e., a DB created with one MergeOperator is
+  // accessed using a different MergeOperator)
+  // TODO: the name is currently not stored persistently and thus
+  //       no checking is enforced. Client is responsible for providing
+  //       consistent MergeOperator between DB opens.
+  virtual const char* Name() const = 0;
+};
+
+// The simpler, associative merge operator.
+class AssociativeMergeOperator : public MergeOperator {
+ public:
+  virtual ~AssociativeMergeOperator() {}
+
+  // Gives the client a way to express the read -> modify -> write semantics
+  // key:           (IN) The key that's associated with this merge operation.
+  // existing_value:(IN) null indicates the key does not exist before this op
+  // value:         (IN) the value to update/merge the existing_value with
+  // new_value:    (OUT) Client is responsible for filling the merge result here
+  // logger:        (IN) Client could use this to log errors during merge.
+  //
+  // Return true on success.
+  // All values passed in will be client-specific values. So if this method
+  // returns false, it is because client specified bad data or there was
+  // internal corruption. The client should assume that this will be treated
+  // as an error by the library.
+  virtual bool Merge(const Slice& key,
+                     const Slice* existing_value,
+                     const Slice& value,
+                     std::string* new_value,
+                     Logger* logger) const = 0;
+
+
+ private:
+  // Default implementations of the MergeOperator functions, declared
+  // private so that subclasses and their clients interact only through
+  // Merge(). They are presumably implemented (in the .cc file) in terms
+  // of Merge() above -- which is what makes the operator "associative".
+  virtual bool FullMerge(const Slice& key,
+                         const Slice* existing_value,
+                         const std::deque<std::string>& operand_list,
+                         std::string* new_value,
+                         Logger* logger) const override;
+
+  virtual bool PartialMerge(const Slice& key,
+                            const Slice& left_operand,
+                            const Slice& right_operand,
+                            std::string* new_value,
+                            Logger* logger) const override;
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
new file mode 100644 (file)
index 0000000..b84bdcf
--- /dev/null
@@ -0,0 +1,768 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
+#define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
+
+#include <stddef.h>
+#include <string>
+#include <memory>
+#include <vector>
+#include <stdint.h>
+
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/universal_compaction.h"
+
+namespace rocksdb {
+
+class Cache;
+class CompactionFilter;
+class CompactionFilterFactory;
+class Comparator;
+class Env;
+class FilterPolicy;
+class Logger;
+class MergeOperator;
+class Snapshot;
+class TableFactory;
+
+using std::shared_ptr;
+
+// DB contents are stored in a set of blocks, each of which holds a
+// sequence of key,value pairs.  Each block may be compressed before
+// being stored in a file.  The following enum describes which
+// compression method (if any) is used to compress a block.
+enum CompressionType : char {
+  // NOTE: do not change the values of existing entries, as these are
+  // part of the persistent format on disk.
+  kNoCompression = 0x0,      // blocks are stored uncompressed
+  kSnappyCompression = 0x1,  // Snappy
+  kZlibCompression = 0x2,    // zlib
+  kBZip2Compression = 0x3    // bzip2
+};
+
+// Strategy used to pick files for compaction; selected via
+// Options::compaction_style below.
+enum CompactionStyle : char {
+  kCompactionStyleLevel = 0x0,     // level based compaction style
+  kCompactionStyleUniversal = 0x1  // Universal compaction style
+};
+
+// Compression options for different compression algorithms like Zlib
+struct CompressionOptions {
+  // Window size parameter passed through to the compressor. NOTE(review):
+  // for zlib a negative windowBits requests a raw (headerless) deflate
+  // stream, which looks like the intent of the -14 default -- confirm
+  // against the zlib call site in the compression utility code.
+  int window_bits;
+  // Compression level; -1 presumably selects the library's default level.
+  int level;
+  // Compression strategy; 0 presumably selects the library's default.
+  int strategy;
+  CompressionOptions() : window_bits(-14), level(-1), strategy(0) {}
+  CompressionOptions(int wbits, int lev, int strategy)
+      : window_bits(wbits), level(lev), strategy(strategy) {}
+};
+
+// Options to control the behavior of a database (passed to DB::Open)
+struct Options {
+  // -------------------
+  // Parameters that affect behavior
+
+  // Comparator used to define the order of keys in the table.
+  // Default: a comparator that uses lexicographic byte-wise ordering
+  //
+  // REQUIRES: The client must ensure that the comparator supplied
+  // here has the same name and orders keys *exactly* the same as the
+  // comparator provided to previous open calls on the same DB.
+  const Comparator* comparator;
+
+  // REQUIRES: The client must provide a merge operator if Merge operation
+  // needs to be accessed. Calling Merge on a DB without a merge operator
+  // would result in Status::NotSupported. The client must ensure that the
+  // merge operator supplied here has the same name and *exactly* the same
+  // semantics as the merge operator provided to previous open calls on
+  // the same DB. The only exception is reserved for upgrade, where a DB
+  // previously without a merge operator is introduced to Merge operation
+  // for the first time. It's necessary to specify a merge operator when
+  // opening the DB in this case.
+  // Default: nullptr
+  shared_ptr<MergeOperator> merge_operator;
+
+  // A single CompactionFilter instance to call into during compaction.
+  // Allows an application to modify/delete a key-value during background
+  // compaction.
+  //
+  // If the client requires a new compaction filter to be used for different
+  // compaction runs, it can specify compaction_filter_factory instead of this
+  // option.  The client should specify only one of the two.
+  // compaction_filter takes precedence over compaction_filter_factory if
+  // client specifies both.
+  //
+  // If multithreaded compaction is being used, the supplied CompactionFilter
+  // instance may be used from different threads concurrently and so should be
+  // thread-safe.
+  //
+  // Default: nullptr
+  const CompactionFilter* compaction_filter;
+
+  // This is a factory that provides compaction filter objects which allow
+  // an application to modify/delete a key-value during background compaction.
+  //
+  // A new filter will be created on each compaction run.  If multithreaded
+  // compaction is being used, each created CompactionFilter will only be used
+  // from a single thread and so does not need to be thread-safe.
+  //
+  // Default: a factory that doesn't provide any object
+  std::shared_ptr<CompactionFilterFactory> compaction_filter_factory;
+
+  // If true, the database will be created if it is missing.
+  // Default: false
+  bool create_if_missing;
+
+  // If true, an error is raised if the database already exists.
+  // Default: false
+  bool error_if_exists;
+
+  // If true, the implementation will do aggressive checking of the
+  // data it is processing and will stop early if it detects any
+  // errors.  This may have unforeseen ramifications: for example, a
+  // corruption of one DB entry may cause a large number of entries to
+  // become unreadable or for the entire DB to become unopenable.
+  // If any of the  writes to the database fails (Put, Delete, Merge, Write),
+  // the database will switch to read-only mode and fail all other
+  // Write operations.
+  // Default: false
+  bool paranoid_checks;
+
+  // Use the specified object to interact with the environment,
+  // e.g. to read/write files, schedule background work, etc.
+  // Default: Env::Default()
+  Env* env;
+
+  // Any internal progress/error information generated by the db will
+  // be written to info_log if it is non-nullptr, or to a file stored
+  // in the same directory as the DB contents if info_log is nullptr.
+  // Default: nullptr
+  shared_ptr<Logger> info_log;
+
+  // -------------------
+  // Parameters that affect performance
+
+  // Amount of data to build up in memory (backed by an unsorted log
+  // on disk) before converting to a sorted on-disk file.
+  //
+  // Larger values increase performance, especially during bulk loads.
+  // Up to max_write_buffer_number write buffers may be held in memory
+  // at the same time,
+  // so you may wish to adjust this parameter to control memory usage.
+  // Also, a larger write buffer will result in a longer recovery time
+  // the next time the database is opened.
+  //
+  // Default: 4MB
+  size_t write_buffer_size;
+
+  // The maximum number of write buffers that are built up in memory.
+  // The default is 2, so that when 1 write buffer is being flushed to
+  // storage, new writes can continue to the other write buffer.
+  // Default: 2
+  int max_write_buffer_number;
+
+  // The minimum number of write buffers that will be merged together
+  // before writing to storage.  If set to 1, then
+  // all write buffers are flushed to L0 as individual files and this increases
+  // read amplification because a get request has to check in all of these
+  // files. Also, an in-memory merge may result in writing less
+  // data to storage if there are duplicate records in each of these
+  // individual write buffers.  Default: 1
+  int min_write_buffer_number_to_merge;
+
+  // Number of open files that can be used by the DB.  You may need to
+  // increase this if your database has a large working set (budget
+  // one open file per 2MB of working set).
+  //
+  // Default: 1000
+  int max_open_files;
+
+  // Control over blocks (user data is stored in a set of blocks, and
+  // a block is the unit of reading from disk).
+
+  // If non-NULL use the specified cache for blocks.
+  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
+  // Default: nullptr
+  shared_ptr<Cache> block_cache;
+
+  // If non-NULL use the specified cache for compressed blocks.
+  // If NULL, rocksdb will not use a compressed block cache.
+  // Default: nullptr
+  shared_ptr<Cache> block_cache_compressed;
+
+  // Approximate size of user data packed per block.  Note that the
+  // block size specified here corresponds to uncompressed data.  The
+  // actual size of the unit read from disk may be smaller if
+  // compression is enabled.  This parameter can be changed dynamically.
+  //
+  // Default: 4K
+  size_t block_size;
+
+  // Number of keys between restart points for delta encoding of keys.
+  // This parameter can be changed dynamically.  Most clients should
+  // leave this parameter alone.
+  //
+  // Default: 16
+  int block_restart_interval;
+
+  // Compress blocks using the specified compression algorithm.  This
+  // parameter can be changed dynamically.
+  //
+  // Default: kSnappyCompression, which gives lightweight but fast
+  // compression.
+  //
+  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+  //    ~200-500MB/s compression
+  //    ~400-800MB/s decompression
+  // Note that these speeds are significantly faster than most
+  // persistent storage speeds, and therefore it is typically never
+  // worth switching to kNoCompression.  Even if the input data is
+  // incompressible, the kSnappyCompression implementation will
+  // efficiently detect that and will switch to uncompressed mode.
+  CompressionType compression;
+
+  // Different levels can have different compression policies. There
+  // are cases where most lower levels would like to use a quick compression
+  // algorithm while the higher levels (which have more data) use
+  // compression algorithms that have better compression but could
+  // be slower. This array, if non nullptr, should have an entry for
+  // each level of the database. This array, if non nullptr, overrides the
+  // value specified in the previous field 'compression'. The caller is
+  // responsible for allocating memory and initializing the values in it
+  // before invoking Open(). The caller is responsible for freeing this
+  // array and it could be freed anytime after the return from Open().
+  // This could have been a std::vector but that makes the equivalent
+  // java/C api hard to construct.
+  std::vector<CompressionType> compression_per_level;
+
+  // different options for compression algorithms
+  CompressionOptions compression_opts;
+
+  // If non-nullptr, use the specified filter policy to reduce disk reads.
+  // Many applications will benefit from passing the result of
+  // NewBloomFilterPolicy() here.
+  //
+  // Default: nullptr
+  const FilterPolicy* filter_policy;
+
+  // If non-nullptr, use the specified function to determine the
+  // prefixes for keys.  These prefixes will be placed in the filter.
+  // Depending on the workload, this can reduce the number of read-IOP
+  // cost for scans when a prefix is passed via ReadOptions to
+  // db.NewIterator().  For prefix filtering to work properly,
+  // "prefix_extractor" and "comparator" must be such that the following
+  // properties hold:
+  //
+  // 1) key.starts_with(prefix(key))
+  // 2) Compare(prefix(key), key) <= 0.
+  // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
+  // 4) prefix(prefix(key)) == prefix(key)
+  //
+  // Default: nullptr
+  const SliceTransform* prefix_extractor;
+
+  // If true, place whole keys in the filter (not just prefixes).
+  // This must generally be true for gets to be efficient.
+  //
+  // Default: true
+  bool whole_key_filtering;
+
+  // Number of levels for this database
+  int num_levels;
+
+  // Number of files to trigger level-0 compaction. A value <0 means that
+  // level-0 compaction will not be triggered by number of files at all.
+  int level0_file_num_compaction_trigger;
+
+  // Soft limit on number of level-0 files. We start slowing down writes at this
+  // point. A value <0 means that no writing slow down will be triggered by
+  // number of files in level-0.
+  int level0_slowdown_writes_trigger;
+
+  // Maximum number of level-0 files.  We stop writes at this point.
+  int level0_stop_writes_trigger;
+
+  // Maximum level to which a new compacted memtable is pushed if it
+  // does not create overlap.  We try to push to level 2 to avoid the
+  // relatively expensive level 0=>1 compactions and to avoid some
+  // expensive manifest file operations.  We do not push all the way to
+  // the largest level since that can generate a lot of wasted disk
+  // space if the same key space is being repeatedly overwritten.
+  int max_mem_compaction_level;
+
+  // Target file size for compaction.
+  // target_file_size_base is per-file size for level-1.
+  // Target file size for level L can be calculated by
+  // target_file_size_base * (target_file_size_multiplier ^ (L-1))
+  // For example, if target_file_size_base is 2MB and
+  // target_file_size_multiplier is 10, then each file on level-1 will
+  // be 2MB, and each file on level 2 will be 20MB,
+  // and each file on level-3 will be 200MB.
+
+  // by default target_file_size_base is 2MB.
+  int target_file_size_base;
+  // by default target_file_size_multiplier is 1, which means
+  // by default files in different levels will have similar size.
+  int target_file_size_multiplier;
+
+  // Control maximum total data size for a level.
+  // max_bytes_for_level_base is the max total for level-1.
+  // Maximum number of bytes for level L can be calculated as
+  // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
+  // For example, if max_bytes_for_level_base is 20MB, and if
+  // max_bytes_for_level_multiplier is 10, total data size for level-1
+  // will be 20MB, total file size for level-2 will be 200MB,
+  // and total file size for level-3 will be 2GB.
+
+  // by default 'max_bytes_for_level_base' is 10MB.
+  uint64_t max_bytes_for_level_base;
+  // by default 'max_bytes_for_level_multiplier' is 10.
+  int max_bytes_for_level_multiplier;
+
+  // Different max-size multipliers for different levels.
+  // These are multiplied by max_bytes_for_level_multiplier to arrive
+  // at the max-size of each level.
+  // Default: 1
+  std::vector<int> max_bytes_for_level_multiplier_additional;
+
+  // Maximum number of bytes in all compacted files.  We avoid expanding
+  // the lower level file set of a compaction if it would make the
+  // total compaction cover more than
+  // (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
+  int expanded_compaction_factor;
+
+  // Maximum number of bytes in all source files to be compacted in a
+  // single compaction run. We avoid picking too many files in the
+  // source level so that we do not exceed the total source bytes
+  // for compaction to exceed
+  // (source_compaction_factor * targetFileSizeLevel()) many bytes.
+  // Default:1, i.e. pick maxfilesize amount of data as the source of
+  // a compaction.
+  int source_compaction_factor;
+
+  // Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
+  // stop building a single file in a level->level+1 compaction.
+  int max_grandparent_overlap_factor;
+
+  // If non-null, then we should collect metrics about database operations
+  // Statistics objects should not be shared between DB instances as
+  // it does not use any locks to prevent concurrent updates.
+  shared_ptr<Statistics> statistics;
+
+  // If true, then the contents of data files are not synced
+  // to stable storage. Their contents remain in the OS buffers till the
+  // OS decides to flush them. This option is good for bulk-loading
+  // of data. Once the bulk-loading is complete, please issue a
+  // sync to the OS to flush all dirty buffers to stable storage.
+  // Default: false
+  bool disableDataSync;
+
+  // If true, then every store to stable storage will issue a fsync.
+  // If false, then every store to stable storage will issue a fdatasync.
+  // This parameter should be set to true while storing data to
+  // filesystem like ext3 that can lose files after a reboot.
+  // Default: false
+  bool use_fsync;
+
+  // This number controls how often a new scribe log about
+  // db deploy stats is written out.
+  // -1 indicates no logging at all.
+  // Default value is 1800 (half an hour).
+  int db_stats_log_interval;
+
+  // This specifies the info LOG dir.
+  // If it is empty, the log files will be in the same dir as data.
+  // If it is non empty, the log files will be in the specified dir,
+  // and the db data dir's absolute path will be used as the log file
+  // name's prefix.
+  std::string db_log_dir;
+
+  // This specifies the absolute dir path for write-ahead logs (WAL).
+  // If it is empty, the log files will be in the same dir as data,
+  //   dbname is used as the data dir by default
+  // If it is non empty, the log files will be kept in the specified dir.
+  // When destroying the db,
+  //   all log files in wal_dir and the dir itself is deleted
+  std::string wal_dir;
+
+  // Disable compaction triggered by seek.
+  // With bloomfilter and fast storage, a miss on one level
+  // is very cheap if the file handle is cached in table cache
+  // (which is true if max_open_files is large).
+  bool disable_seek_compaction;
+
+  // The periodicity when obsolete files get deleted. The default
+  // value is 6 hours. The files that get out of scope by compaction
+  // process will still get automatically delete on every compaction,
+  // regardless of this setting
+  uint64_t delete_obsolete_files_period_micros;
+
+  // Maximum number of concurrent background jobs, submitted to
+  // the default LOW priority thread pool
+  // Default: 1
+  int max_background_compactions;
+
+  // Maximum number of concurrent background memtable flush jobs, submitted to
+  // the HIGH priority thread pool.
+  // By default, all background jobs (major compaction and memtable flush) go
+  // to the LOW priority pool. If this option is set to a positive number,
+  // memtable flush jobs will be submitted to the HIGH priority pool.
+  // It is important when the same Env is shared by multiple db instances.
+  // Without a separate pool, long running major compaction jobs could
+  // potentially block memtable flush jobs of other db instances, leading to
+  // unnecessary Put stalls.
+  // Default: 0
+  int max_background_flushes;
+
+  // Specify the maximal size of the info log file. If the log file
+  // is larger than `max_log_file_size`, a new info log file will
+  // be created.
+  // If max_log_file_size == 0, all logs will be written to one
+  // log file.
+  size_t max_log_file_size;
+
+  // Time for the info log file to roll (in seconds).
+  // If specified with non-zero value, log file will be rolled
+  // if it has been active longer than `log_file_time_to_roll`.
+  // Default: 0 (disabled)
+  size_t log_file_time_to_roll;
+
+  // Maximal info log files to be kept.
+  // Default: 1000
+  size_t keep_log_file_num;
+
+  // Puts are delayed 0-1 ms when any level has a compaction score that exceeds
+  // soft_rate_limit. This is ignored when == 0.0.
+  // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
+  // hold, RocksDB will set soft_rate_limit = hard_rate_limit
+  // Default: 0 (disabled)
+  double soft_rate_limit;
+
+  // Puts are delayed 1ms at a time when any level has a compaction score that
+  // exceeds hard_rate_limit. This is ignored when <= 1.0.
+  // Default: 0 (disabled)
+  double hard_rate_limit;
+
+  // Max time a put will be stalled when hard_rate_limit is enforced. If 0, then
+  // there is no limit.
+  // Default: 1000
+  unsigned int rate_limit_delay_max_milliseconds;
+
+  // manifest file is rolled over on reaching this limit.
+  // The older manifest file will be deleted.
+  // The default value is MAX_INT so that roll-over does not take place.
+  uint64_t max_manifest_file_size;
+
+  // Disable block cache. If this is set to true,
+  // then no block cache should be used, and the block_cache should
+  // point to a nullptr object.
+  // Default: false
+  bool no_block_cache;
+
+  // Number of shards used for table cache.
+  int table_cache_numshardbits;
+
+  // During data eviction of table's LRU cache, it would be inefficient
+  // to strictly follow LRU because this piece of memory will not really
+  // be released unless its refcount falls to zero. Instead, make two
+  // passes: the first pass will release items with refcount = 1,
+  // and if not enough space releases after scanning the number of
+  // elements specified by this parameter, we will remove items in LRU
+  // order.
+  int table_cache_remove_scan_count_limit;
+
+  // Size of one block in arena memory allocation.
+  //
+  // If <= 0, a proper value is automatically calculated (usually about 1/10 of
+  // write_buffer_size).
+  //
+  // There are two additional restrictions on the specified size:
+  // (1) size should be in the range of [4096, 2 << 30] and
+  // (2) be the multiple of the CPU word (which helps with the memory
+  // alignment).
+  //
+  // We'll automatically check and adjust the size number to make sure it
+  // conforms to the restrictions.
+  //
+  // Default: 0
+  size_t arena_block_size;
+
+  // Create an Options object with default values for all fields.
+  Options();
+
+  void Dump(Logger* log) const;
+
+  // Set appropriate parameters for bulk loading.
+  // The reason that this is a function that returns "this" instead of a
+  // constructor is to enable chaining of multiple similar calls in the future.
+  //
+  // All data will be in level 0 without any automatic compaction.
+  // It's recommended to manually call CompactRange(NULL, NULL) before reading
+  // from the database, because otherwise the read can be very slow.
+  Options* PrepareForBulkLoad();
+
+  // Disable automatic compactions. Manual compactions can still
+  // be issued on this database.
+  bool disable_auto_compactions;
+
+  // The following two fields affect how archived logs will be deleted.
+  // 1. If both set to 0, logs will be deleted asap and will not get into
+  //    the archive.
+  // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+  //    WAL files will be checked every 10 min and if total size is greater
+  //    then WAL_size_limit_MB, they will be deleted starting with the
+  //    earliest until size_limit is met. All empty files will be deleted.
+  // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+  //    WAL files will be checked every WAL_ttl_seconds / 2 and those that
+  //    are older than WAL_ttl_seconds will be deleted.
+  // 4. If both are not 0, WAL files will be checked every 10 min and both
+  //    checks will be performed with ttl being first.
+  uint64_t WAL_ttl_seconds;
+  uint64_t WAL_size_limit_MB;
+
+  // Number of bytes to preallocate (via fallocate) the manifest
+  // files.  Default is 4mb, which is reasonable to reduce random IO
+  // as well as prevent overallocation for mounts that preallocate
+  // large amounts of data (such as xfs's allocsize option).
+  size_t manifest_preallocation_size;
+
+  // Purge duplicate/deleted keys when a memtable is flushed to storage.
+  // Default: true
+  bool purge_redundant_kvs_while_flush;
+
+  // Data being read from file storage may be buffered in the OS
+  // Default: true
+  bool allow_os_buffer;
+
+  // Allow the OS to mmap file for reading sst tables. Default: false
+  bool allow_mmap_reads;
+
+  // Allow the OS to mmap file for writing. Default: true
+  bool allow_mmap_writes;
+
+  // Disable child process inherit open files. Default: true
+  bool is_fd_close_on_exec;
+
+  // Skip log corruption error on recovery (If client is ok with
+  // losing most recent changes)
+  // Default: false
+  bool skip_log_error_on_recovery;
+
+  // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
+  // Default: 3600 (1 hour)
+  unsigned int stats_dump_period_sec;
+
+  // This is used to close a block before it reaches the configured
+  // 'block_size'. If the percentage of free space in the current block is less
+  // than this specified number and adding a new record to the block will
+  // exceed the configured block size, then this block will be closed and the
+  // new record will be written to the next block.
+  // Default is 10.
+  int block_size_deviation;
+
+  // If set true, will hint the underlying file system that the file
+  // access pattern is random, when a sst file is opened.
+  // Default: true
+  bool advise_random_on_open;
+
+  // Specify the file access pattern once a compaction is started.
+  // It will be applied to all input files of a compaction.
+  // Default: NORMAL
+  enum {
+    NONE,
+    NORMAL,
+    SEQUENTIAL,
+    WILLNEED
+  } access_hint_on_compaction_start;
+
+  // Use adaptive mutex, which spins in the user space before resorting
+  // to kernel. This could reduce context switch when the mutex is not
+  // heavily contended. However, if the mutex is hot, we could end up
+  // wasting spin time.
+  // Default: false
+  bool use_adaptive_mutex;
+
+  // Allows OS to incrementally sync files to disk while they are being
+  // written, asynchronously, in the background.
+  // Issue one request for every bytes_per_sync written. 0 turns it off.
+  // Default: 0
+  uint64_t bytes_per_sync;
+
+  // The compaction style. Default: kCompactionStyleLevel
+  CompactionStyle compaction_style;
+
+  // The options needed to support Universal Style compactions
+  CompactionOptionsUniversal compaction_options_universal;
+
+  // Use KeyMayExist API to filter deletes when this is true.
+  // If KeyMayExist returns false, i.e. the key definitely does not exist, then
+  // the delete is a noop. KeyMayExist only incurs in-memory look up.
+  // This optimization avoids writing the delete to storage when appropriate.
+  // Default: false
+  bool filter_deletes;
+
+  // An iterator->Next() sequentially skips over keys with the same
+  // user-key unless this option is set. This number specifies the number
+  // of keys (with the same userkey) that will be sequentially
+  // skipped before a reseek is issued.
+  // Default: 8
+  uint64_t max_sequential_skip_in_iterations;
+
+  // This is a factory that provides MemTableRep objects.
+  // Default: a factory that provides a skip-list-based implementation of
+  // MemTableRep.
+  std::shared_ptr<MemTableRepFactory> memtable_factory;
+
+  // This is a factory that provides TableFactory objects.
+  // Default: a factory that provides a default implementation of
+  // Table and TableBuilder.
+  std::shared_ptr<TableFactory> table_factory;
+
+  // This option allows user to collect their own interested statistics of
+  // the tables.
+  // Default: empty vector -- no user-defined statistics collection will be
+  // performed.
+  std::vector<std::shared_ptr<TablePropertiesCollector>>
+  table_properties_collectors;
+
+  // Allows thread-safe inplace updates. Updates are permitted iff
+  // * key exists in current memtable
+  // * sizeof(new_value) <= sizeof(old_value)
+  // * old_value for that key is a put i.e. kTypeValue
+  // Default: false.
+  bool inplace_update_support;
+
+  // Number of locks used for inplace update
+  // Default: 10000, if inplace_update_support = true, else 0.
+  size_t inplace_update_num_locks;
+
+  // Maximum number of successive merge operations on a key in the memtable.
+  //
+  // When a merge operation is added to the memtable and the maximum number of
+  // successive merges is reached, the value of the key will be calculated and
+  // inserted into the memtable instead of the merge operation. This will
+  // ensure that there are never more than max_successive_merges merge
+  // operations in the memtable.
+  //
+  // Default: 0 (disabled)
+  size_t max_successive_merges;
+};
+
+//
+// An application can issue a read request (via Get/Iterators) and specify
+// if that read should process data that ALREADY resides on a specified cache
+// level. For example, if an application specifies kBlockCacheTier then the
+// Get call will process data that is already processed in the memtable or
+// the block cache. It will not page in data from the OS cache or data that
+// resides in storage.
+enum ReadTier {
+  kReadAllTier = 0x0,    // data in memtable, block cache, OS cache or storage
+  kBlockCacheTier = 0x1  // data in memtable or block cache
+};
+
+// Options that control read operations
+struct ReadOptions {
+  // If true, all data read from underlying storage will be
+  // verified against corresponding checksums.
+  // Default: false
+  bool verify_checksums;
+
+  // Should the "data block"/"index block"/"filter block" read for this
+  // iteration be cached in memory?
+  // Callers may wish to set this field to false for bulk scans.
+  // Default: true
+  bool fill_cache;
+
+  // If this option is set and memtable implementation allows, Seek
+  // might only return keys with the same prefix as the seek-key
+  bool prefix_seek;
+
+  // If "snapshot" is non-nullptr, read as of the supplied snapshot
+  // (which must belong to the DB that is being read and which must
+  // not have been released).  If "snapshot" is nullptr, use an implicit
+  // snapshot of the state at the beginning of this read operation.
+  // Default: nullptr
+  const Snapshot* snapshot;
+
+  // If "prefix" is non-nullptr, and ReadOptions is being passed to
+  // db.NewIterator, only return results when the key begins with this
+  // prefix.  This field is ignored by other calls (e.g., Get).
+  // Options.prefix_extractor must also be set, and
+  // prefix_extractor.InRange(prefix) must be true.  The iterator
+  // returned by NewIterator when this option is set will behave just
+  // as if the underlying store did not contain any non-matching keys,
+  // with two exceptions.  Seek() only accepts keys starting with the
+  // prefix, and SeekToLast() is not supported.  prefix filter with this
+  // option will sometimes reduce the number of read IOPs.
+  // Default: nullptr
+  const Slice* prefix;
+
+  // Specify if this read request should process data that ALREADY
+  // resides on a particular cache. If the required data is not
+  // found at the specified cache, then Status::Incomplete is returned.
+  // Default: kReadAllTier
+  ReadTier read_tier;
+
+  ReadOptions()
+      : verify_checksums(false),
+        fill_cache(true),
+        prefix_seek(false),
+        snapshot(nullptr),
+        prefix(nullptr),
+        read_tier(kReadAllTier) {}
+  ReadOptions(bool cksum, bool cache)
+      : verify_checksums(cksum),
+        fill_cache(cache),
+        prefix_seek(false),
+        snapshot(nullptr),
+        prefix(nullptr),
+        read_tier(kReadAllTier) {}
+};
+
+// Options that control write operations
+struct WriteOptions {
+  // If true, the write will be flushed from the operating system
+  // buffer cache (by calling WritableFile::Sync()) before the write
+  // is considered complete.  If this flag is true, writes will be
+  // slower.
+  //
+  // If this flag is false, and the machine crashes, some recent
+  // writes may be lost.  Note that if it is just the process that
+  // crashes (i.e., the machine does not reboot), no writes will be
+  // lost even if sync==false.
+  //
+  // In other words, a DB write with sync==false has similar
+  // crash semantics as the "write()" system call.  A DB write
+  // with sync==true has similar crash semantics to a "write()"
+  // system call followed by "fdatasync()".
+  //
+  // Default: false
+  bool sync;
+
+  // If true, writes will not first go to the write ahead log,
+  // and the write may get lost after a crash.
+  bool disableWAL;
+
+  WriteOptions() : sync(false), disableWAL(false) {}
+};
+
+// Options that control flush operations
+struct FlushOptions {
+  // If true, the flush will wait until the flush is done.
+  // Default: true
+  bool wait;
+
+  FlushOptions() : wait(true) {}
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h
new file mode 100644 (file)
index 0000000..9e900e0
--- /dev/null
@@ -0,0 +1,48 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
+#define STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
+
+#include <stdint.h>
+
+namespace rocksdb {
+
+enum PerfLevel {
+  kDisable        = 0,  // disable perf stats
+  kEnableCount    = 1,  // enable only count stats
+  kEnableTime     = 2   // enable time stats too
+};
+
+// set the perf stats level
+void SetPerfLevel(PerfLevel level);
+
+// A thread local context for gathering performance counter efficiently
+// and transparently.
+
+struct PerfContext {
+
+  void Reset(); // reset all performance counters to zero
+
+  uint64_t user_key_comparison_count; // total number of user key comparisons
+  uint64_t block_cache_hit_count;     // total number of block cache hits
+  uint64_t block_read_count;          // total number of block reads (with IO)
+  uint64_t block_read_byte;           // total number of bytes from block reads
+  uint64_t block_read_time;           // total time spent on block reads
+  uint64_t block_checksum_time;       // total time spent on block checksum
+  uint64_t block_decompress_time;     // total time spent on block decompression
+  // total number of internal keys skipped over during iteration (overwritten or
+  // deleted, to be more specific, hidden by a put or delete of the same key)
+  uint64_t internal_key_skipped_count;
+  // total number of deletes skipped over during iteration
+  uint64_t internal_delete_skipped_count;
+  uint64_t wal_write_time;            // total time spent on writing to WAL
+};
+
+extern __thread PerfContext perf_context;
+
+}
+
+#endif
diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h
new file mode 100644 (file)
index 0000000..e6cca21
--- /dev/null
@@ -0,0 +1,136 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Slice is a simple structure containing a pointer into some external
+// storage and a size.  The user of a Slice must ensure that the slice
+// is not used after the corresponding external storage has been
+// deallocated.
+//
+// Multiple threads can invoke const methods on a Slice without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Slice must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_H_
+#define STORAGE_ROCKSDB_INCLUDE_SLICE_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+#include <string>
+
+namespace rocksdb {
+
+class Slice {
+ public:
+  // Create an empty slice.
+  Slice() : data_(""), size_(0) { }
+
+  // Create a slice that refers to d[0,n-1].
+  Slice(const char* d, size_t n) : data_(d), size_(n) { }
+
+  // Create a slice that refers to the contents of "s"
+  /* implicit */
+  Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
+
+  // Create a slice that refers to s[0,strlen(s)-1]
+  /* implicit */
+  Slice(const char* s) : data_(s), size_(strlen(s)) { }
+
+  // Return a pointer to the beginning of the referenced data
+  const char* data() const { return data_; }
+
+  // Return the length (in bytes) of the referenced data
+  size_t size() const { return size_; }
+
+  // Return true iff the length of the referenced data is zero
+  bool empty() const { return size_ == 0; }
+
+  // Return the ith byte in the referenced data.
+  // REQUIRES: n < size()
+  char operator[](size_t n) const {
+    assert(n < size());
+    return data_[n];
+  }
+
+  // Change this slice to refer to an empty array
+  void clear() { data_ = ""; size_ = 0; }
+
+  // Drop the first "n" bytes from this slice.
+  void remove_prefix(size_t n) {
+    assert(n <= size());
+    data_ += n;
+    size_ -= n;
+  }
+
+  // Return a string that contains the copy of the referenced data.
+  std::string ToString(bool hex = false) const {
+    if (hex) {
+      std::string result;
+      char buf[10];
+      for (size_t i = 0; i < size_; i++) {
+        snprintf(buf, 10, "%02X", (unsigned char)data_[i]);
+        result += buf;
+      }
+      return result;
+    } else {
+      return std::string(data_, size_);
+    }
+  }
+
+  // Three-way comparison.  Returns value:
+  //   <  0 iff "*this" <  "b",
+  //   == 0 iff "*this" == "b",
+  //   >  0 iff "*this" >  "b"
+  int compare(const Slice& b) const;
+
+  // Return true iff "x" is a prefix of "*this"
+  bool starts_with(const Slice& x) const {
+    return ((size_ >= x.size_) &&
+            (memcmp(data_, x.data_, x.size_) == 0));
+  }
+
+ // private: make these public for rocksdbjni access
+  const char* data_;
+  size_t size_;
+
+  // Intentionally copyable
+};
+
+// A set of Slices that are virtually concatenated together.  'parts' points
+// to an array of Slices.  The number of elements in the array is 'num_parts'.
+struct SliceParts {
+  SliceParts(const Slice* parts, int num_parts) :
+      parts(parts), num_parts(num_parts) { }
+
+  const Slice* parts;
+  int num_parts;
+};
+
+inline bool operator==(const Slice& x, const Slice& y) {
+  return ((x.size() == y.size()) &&
+          (memcmp(x.data(), y.data(), x.size()) == 0));
+}
+
+inline bool operator!=(const Slice& x, const Slice& y) {
+  return !(x == y);
+}
+
+inline int Slice::compare(const Slice& b) const {
+  const int min_len = (size_ < b.size_) ? size_ : b.size_;
+  int r = memcmp(data_, b.data_, min_len);
+  if (r == 0) {
+    if (size_ < b.size_) r = -1;
+    else if (size_ > b.size_) r = +1;
+  }
+  return r;
+}
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_SLICE_H_
diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h
new file mode 100644 (file)
index 0000000..a784550
--- /dev/null
@@ -0,0 +1,47 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Class for specifying user-defined functions which perform a
+// transformation on a slice.  It is not required that every slice
+// belong to the domain and/or range of a function.  Subclasses should
+// define InDomain and InRange to determine which slices are in either
+// of these sets respectively.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
+#define STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+class SliceTransform {
+ public:
+  virtual ~SliceTransform() {};
+
+  // Return the name of this transformation.
+  virtual const char* Name() const = 0;
+
+  // transform a src in domain to a dst in the range
+  virtual Slice Transform(const Slice& src) const = 0;
+
+  // determine whether this is a valid src upon the function applies
+  virtual bool InDomain(const Slice& src) const = 0;
+
+  // determine whether dst=Transform(src) for some src
+  virtual bool InRange(const Slice& dst) const = 0;
+};
+
+extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len);
+
+extern const SliceTransform* NewNoopTransform();
+
+}
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
new file mode 100644 (file)
index 0000000..011e510
--- /dev/null
@@ -0,0 +1,308 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
+#define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
+
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <memory>
+#include <vector>
+
+namespace rocksdb {
+
+/**
+ * Keep adding tickers here.
+ * Any ticker should have a value less than TICKER_ENUM_MAX.
+ * Add a new ticker by assigning it the current value of TICKER_ENUM_MAX.
+ * Add a string representation in TickersNameMap below.
+ * And increment TICKER_ENUM_MAX.
+ */
+enum Tickers {
+  // total block cache misses
+  // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
+  //                               BLOCK_CACHE_FILTER_MISS +
+  //                               BLOCK_CACHE_DATA_MISS;
+  BLOCK_CACHE_MISS,
+  // total block cache hit
+  // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
+  //                              BLOCK_CACHE_FILTER_HIT +
+  //                              BLOCK_CACHE_DATA_HIT;
+  BLOCK_CACHE_HIT,
+  // # of blocks added to block cache.
+  BLOCK_CACHE_ADD,
+  // # of times cache miss when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_MISS,
+  // # of times cache hit when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_HIT,
+  // # of times cache miss when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_MISS,
+  // # of times cache hit when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_HIT,
+  // # of times cache miss when accessing data block from block cache.
+  BLOCK_CACHE_DATA_MISS,
+  // # of times cache hit when accessing data block from block cache.
+  BLOCK_CACHE_DATA_HIT,
+  // # of times bloom filter has avoided file reads.
+  BLOOM_FILTER_USEFUL,
+
+  // # of memtable hits.
+  MEMTABLE_HIT,
+  // # of memtable misses.
+  MEMTABLE_MISS,
+
+  /**
+   * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
+   * There are 3 reasons currently.
+   */
+  COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value.
+  COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete.
+  COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key.
+
+  // Number of keys written to the database via the Put and Write calls
+  NUMBER_KEYS_WRITTEN,
+  // Number of Keys read,
+  NUMBER_KEYS_READ,
+  // Number keys updated, if inplace update is enabled
+  NUMBER_KEYS_UPDATED,
+  // Bytes written / read
+  BYTES_WRITTEN,
+  BYTES_READ,
+  NO_FILE_CLOSES,
+  NO_FILE_OPENS,
+  NO_FILE_ERRORS,
+  // Time system had to wait to do L0-L1 compactions
+  STALL_L0_SLOWDOWN_MICROS,
+  // Time system had to wait to move memtable to L1.
+  STALL_MEMTABLE_COMPACTION_MICROS,
+  // write throttle because of too many files in L0
+  STALL_L0_NUM_FILES_MICROS,
+  RATE_LIMIT_DELAY_MILLIS,
+
+  NO_ITERATORS, // number of iterators currently open
+
+  // Number of MultiGet calls, keys read, and bytes read
+  NUMBER_MULTIGET_CALLS,
+  NUMBER_MULTIGET_KEYS_READ,
+  NUMBER_MULTIGET_BYTES_READ,
+
+  // Number of deletes records that were not required to be
+  // written to storage because key does not exist
+  NUMBER_FILTERED_DELETES,
+  NUMBER_MERGE_FAILURES,
+  SEQUENCE_NUMBER,
+
+  // number of times bloom was checked before creating iterator on a
+  // file, and the number of times the check was useful in avoiding
+  // iterator creation (and thus likely IOPs).
+  BLOOM_FILTER_PREFIX_CHECKED,
+  BLOOM_FILTER_PREFIX_USEFUL,
+
+  // Number of times we had to reseek inside an iteration to skip
+  // over large number of keys with same userkey.
+  NUMBER_OF_RESEEKS_IN_ITERATION,
+
+  // Record the number of calls to GetUpdatesSince. Useful to keep track of
+  // transaction log iterator refreshes.
+  GET_UPDATES_SINCE_CALLS,
+
+  BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache
+  BLOCK_CACHE_COMPRESSED_HIT,  // hit in the compressed block cache
+
+  WAL_FILE_SYNCED,  // Number of times WAL sync is done
+  WAL_FILE_BYTES,   // Number of bytes written to WAL
+
+  // Writes can be processed by requesting thread or by the thread at the
+  // head of the writers queue.
+  WRITE_DONE_BY_SELF,
+  WRITE_DONE_BY_OTHER,
+
+  WRITE_WITH_WAL,      // Number of Write calls that request WAL
+
+  COMPACT_READ_BYTES,  // Bytes read during compaction
+  COMPACT_WRITE_BYTES, // Bytes written during compaction
+
+  TICKER_ENUM_MAX
+};
+
+// The order of items listed in  Tickers should be the same as
+// the order listed in TickersNameMap
+const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
+  { BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" },
+  { BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" },
+  { BLOCK_CACHE_ADD, "rocksdb.block.cache.add" },
+  { BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss" },
+  { BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit" },
+  { BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss" },
+  { BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit" },
+  { BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss" },
+  { BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit" },
+  { BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" },
+  { MEMTABLE_HIT, "rocksdb.memtable.hit" },
+  { MEMTABLE_MISS, "rocksdb.memtable.miss" },
+  { COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" },
+  { COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" },
+  { COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" },
+  { NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written" },
+  { NUMBER_KEYS_READ, "rocksdb.number.keys.read" },
+  { NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated" },
+  { BYTES_WRITTEN, "rocksdb.bytes.written" },
+  { BYTES_READ, "rocksdb.bytes.read" },
+  { NO_FILE_CLOSES, "rocksdb.no.file.closes" },
+  { NO_FILE_OPENS, "rocksdb.no.file.opens" },
+  { NO_FILE_ERRORS, "rocksdb.no.file.errors" },
+  { STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros" },
+  { STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros" },
+  { STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros" },
+  { RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis" },
+  { NO_ITERATORS, "rocksdb.num.iterators" },
+  { NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get" },
+  { NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read" },
+  { NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" },
+  { NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered" },
+  { NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" },
+  { SEQUENCE_NUMBER, "rocksdb.sequence.number" },
+  { BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" },
+  { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" },
+  { NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" },
+  { GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls" },
+  { BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss" },
+  { BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit" },
+  { WAL_FILE_SYNCED, "rocksdb.wal.synced" },
+  { WAL_FILE_BYTES, "rocksdb.wal.bytes" },
+  { WRITE_DONE_BY_SELF, "rocksdb.write.self" },
+  { WRITE_DONE_BY_OTHER, "rocksdb.write.other" },
+  { WRITE_WITH_WAL, "rocksdb.write.wal" },
+  { COMPACT_READ_BYTES, "rocksdb.compact.read.bytes" },
+  { COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes" },
+};
+
+/**
+ * Keep adding histograms here.
+ * Any histogram should have a value less than HISTOGRAM_ENUM_MAX.
+ * Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX
+ * Add a string representation in HistogramsNameMap below
+ * And increment HISTOGRAM_ENUM_MAX
+ */
+enum Histograms {
+  DB_GET,
+  DB_WRITE,
+  COMPACTION_TIME,
+  TABLE_SYNC_MICROS,
+  COMPACTION_OUTFILE_SYNC_MICROS,
+  WAL_FILE_SYNC_MICROS,
+  MANIFEST_FILE_SYNC_MICROS,
+  // TIME SPENT IN IO DURING TABLE OPEN
+  TABLE_OPEN_IO_MICROS,
+  DB_MULTIGET,
+  READ_BLOCK_COMPACTION_MICROS,
+  READ_BLOCK_GET_MICROS,
+  WRITE_RAW_BLOCK_MICROS,
+
+  STALL_L0_SLOWDOWN_COUNT,
+  STALL_MEMTABLE_COMPACTION_COUNT,
+  STALL_L0_NUM_FILES_COUNT,
+  HARD_RATE_LIMIT_DELAY_COUNT,
+  SOFT_RATE_LIMIT_DELAY_COUNT,
+  NUM_FILES_IN_SINGLE_COMPACTION,
+  HISTOGRAM_ENUM_MAX,
+};
+
+const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
+  { DB_GET, "rocksdb.db.get.micros" },
+  { DB_WRITE, "rocksdb.db.write.micros" },
+  { COMPACTION_TIME, "rocksdb.compaction.times.micros" },
+  { TABLE_SYNC_MICROS, "rocksdb.table.sync.micros" },
+  { COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros" },
+  { WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros" },
+  { MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros" },
+  { TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros" },
+  { DB_MULTIGET, "rocksdb.db.multiget.micros" },
+  { READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros" },
+  { READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros" },
+  { WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros" },
+  { STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"},
+  { STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"},
+  { STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"},
+  { HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"},
+  { SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"},
+  { NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction" },
+};
+
+struct HistogramData {
+  double median;
+  double percentile95;
+  double percentile99;
+  double average;
+  double standard_deviation;
+};
+
+
+class Histogram {
+ public:
+  // clears the histogram
+  virtual void Clear() = 0;
+  virtual ~Histogram();
+  // Add a value to be recorded in the histogram.
+  virtual void Add(uint64_t value) = 0;
+
+  virtual std::string ToString() const = 0;
+
+  // Get statistics
+  virtual double Median() const = 0;
+  virtual double Percentile(double p) const = 0;
+  virtual double Average() const = 0;
+  virtual double StandardDeviation() const = 0;
+  virtual void Data(HistogramData * const data) const = 0;
+
+};
+
+/**
+ * A dumb ticker which keeps incrementing through its life time.
+ * Thread safe. Locking managed by implementation of this interface.
+ */
+class Ticker {
+ public:
+  Ticker() : count_(0) { }
+
+  inline void setTickerCount(uint64_t count) {
+    count_ = count;
+  }
+
+  inline void recordTick(int count = 1) {
+    count_ += count;
+  }
+
+  inline uint64_t getCount() {
+    return count_;
+  }
+
+ private:
+  std::atomic_uint_fast64_t count_;
+};
+
+// Analyze the performance of a db
+class Statistics {
+ public:
+
+  virtual long getTickerCount(Tickers tickerType) = 0;
+  virtual void recordTick(Tickers tickerType, uint64_t count = 0) = 0;
+  virtual void setTickerCount(Tickers tickerType, uint64_t count) = 0;
+  virtual void measureTime(Histograms histogramType, uint64_t time) = 0;
+
+  virtual void histogramData(Histograms type, HistogramData * const data) = 0;
+  // String representation of the statistic object.
+  std::string ToString();
+};
+
+// Create a concrete DBStatistics object
+std::shared_ptr<Statistics> CreateDBStatistics();
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h
new file mode 100644 (file)
index 0000000..e2304fd
--- /dev/null
@@ -0,0 +1,137 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation.  It may indicate success,
+// or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on a Status without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Status must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_STATUS_H_
+#define STORAGE_ROCKSDB_INCLUDE_STATUS_H_
+
+#include <string>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+class Status {
+ public:
+  // Create a success status.
+  Status() : code_(kOk), state_(nullptr) { }
+  ~Status() { delete[] state_; }
+
+  // Copy the specified status.
+  Status(const Status& s);
+  void operator=(const Status& s);
+
+  // Return a success status.
+  static Status OK() { return Status(); }
+
+  // Return error status of an appropriate type.
+  static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kNotFound, msg, msg2);
+  }
+  // Fast path for not found without malloc;
+  static Status NotFound() {
+    return Status(kNotFound);
+  }
+  static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kCorruption, msg, msg2);
+  }
+  static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kNotSupported, msg, msg2);
+  }
+  static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kInvalidArgument, msg, msg2);
+  }
+  static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIOError, msg, msg2);
+  }
+  static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kMergeInProgress, msg, msg2);
+  }
+  static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIncomplete, msg, msg2);
+  }
+
+  // Returns true iff the status indicates success.
+  bool ok() const { return code() == kOk; }
+
+  // Returns true iff the status indicates a NotFound error.
+  bool IsNotFound() const { return code() == kNotFound; }
+
+  // Returns true iff the status indicates a Corruption error.
+  bool IsCorruption() const { return code() == kCorruption; }
+
+  // Returns true iff the status indicates a NotSupported error.
+  bool IsNotSupported() const { return code() == kNotSupported; }
+
+  // Returns true iff the status indicates an InvalidArgument error.
+  bool IsInvalidArgument() const { return code() == kInvalidArgument; }
+
+  // Returns true iff the status indicates an IOError.
+  bool IsIOError() const { return code() == kIOError; }
+
+  // Returns true iff the status indicates an MergeInProgress.
+  bool IsMergeInProgress() const { return code() == kMergeInProgress; }
+
+  // Returns true iff the status indicates Incomplete
+  bool IsIncomplete() const { return code() == kIncomplete; }
+
+  // Return a string representation of this status suitable for printing.
+  // Returns the string "OK" for success.
+  std::string ToString() const;
+
+ private:
+  enum Code {
+    kOk = 0,
+    kNotFound = 1,
+    kCorruption = 2,
+    kNotSupported = 3,
+    kInvalidArgument = 4,
+    kIOError = 5,
+    kMergeInProgress = 6,
+    kIncomplete = 7
+  };
+
+  // A nullptr state_ (which is always the case for OK) means the message
+  // is empty.
+  // of the following form:
+  //    state_[0..3] == length of message
+  //    state_[4..]  == message
+  Code code_;
+  const char* state_;
+
+  Code code() const {
+    return code_;
+  }
+  explicit Status(Code code) : code_(code), state_(nullptr) { }
+  Status(Code code, const Slice& msg, const Slice& msg2);
+  static const char* CopyState(const char* s);
+};
+
+inline Status::Status(const Status& s) {
+  code_ = s.code_;
+  state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
+}
+inline void Status::operator=(const Status& s) {
+  // The following condition catches both aliasing (when this == &s),
+  // and the common case where both s and *this are ok.
+  code_ = s.code_;
+  if (state_ != s.state_) {
+    delete[] state_;
+    state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
+  }
+}
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_STATUS_H_
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
new file mode 100644 (file)
index 0000000..2d2bfac
--- /dev/null
@@ -0,0 +1,180 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+struct Options;
+class RandomAccessFile;
+struct ReadOptions;
+class TableCache;
+class WritableFile;
+
+using std::unique_ptr;
+
+// TableBuilder provides the interface used to build a Table
+// (an immutable and sorted map from keys to values).
+//
+// Multiple threads can invoke const methods on a TableBuilder without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same TableBuilder must use
+// external synchronization.
+class TableBuilder {
+ public:
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  virtual ~TableBuilder() {}
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  virtual void Add(const Slice& key, const Slice& value) = 0;
+
+  // Return non-ok iff some error has been detected.
+  virtual Status status() const = 0;
+
+  // Finish building the table.
+  // REQUIRES: Finish(), Abandon() have not been called
+  virtual Status Finish() = 0;
+
+  // Indicate that the contents of this builder should be abandoned.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  virtual void Abandon() = 0;
+
+  // Number of calls to Add() so far.
+  virtual uint64_t NumEntries() const = 0;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  virtual uint64_t FileSize() const = 0;
+};
+
+// A Table is a sorted map from strings to strings.  Tables are
+// immutable and persistent.  A Table may be safely accessed from
+// multiple threads without external synchronization.
+class TableReader {
+ public:
+  virtual ~TableReader() {}
+
+  // Determine whether there is a chance that the current table file
+  // contains a key starting with internal_prefix. The specific
+  // table implementation can use bloom filter and/or other heuristic
+  // to filter out this table as a whole.
+  virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0;
+
+  // Returns a new iterator over the table contents.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  virtual Iterator* NewIterator(const ReadOptions&) = 0;
+
+  // Given a key, return an approximate byte offset in the file where
+  // the data for that key begins (or would begin if the key were
+  // present in the file).  The returned value is in terms of file
+  // bytes, and so includes effects like compression of the underlying data.
+  // E.g., the approximate offset of the last key in the table will
+  // be close to the file length.
+  virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0;
+
+  // Returns true if the block for the specified key is in cache.
+  // REQUIRES: key is in this table.
+  virtual bool TEST_KeyInCache(const ReadOptions& options,
+                               const Slice& key) = 0;
+
+  // Set up the table for Compaction. Might change some parameters with
+  // posix_fadvise
+  virtual void SetupForCompaction() = 0;
+
+  virtual TableProperties& GetTableProperties() = 0;
+
+  // Calls (*result_handler)(handle_context, ...) repeatedly, starting with
+  // the entry found after a call to Seek(key), until result_handler returns
+  // false, where k is the actual internal key for a row found and v as the
+  // value of the key. didIO is true if I/O is involved in the operation. May
+  // not make such a call if filter policy says that key is not present.
+  //
+  // mark_key_may_exist_handler needs to be called when it is configured to be
+  // memory only and the key is not found in the block cache, with
+  // the parameter to be handle_context.
+  //
+  // readOptions is the options for the read
+  // key is the key to search for
+  virtual Status Get(
+      const ReadOptions& readOptions,
+      const Slice& key,
+      void* handle_context,
+      bool (*result_handler)(void* handle_context, const Slice& k,
+                             const Slice& v, bool didIO),
+      void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0;
+};
+
+// A base class for table factories
+class TableFactory {
+ public:
+  virtual ~TableFactory() {}
+
+  // The type of the table.
+  //
+  // The client of this package should switch to a new name whenever
+  // the table format implementation changes.
+  //
+  // Names starting with "rocksdb." are reserved and should not be used
+  // by any clients of this package.
+  virtual const char* Name() const = 0;
+
+  // Returns a Table object table that can fetch data from file specified
+  // in parameter file. It's the caller's responsibility to make sure
+  // file is in the correct format.
+  //
+  // GetTableReader() is called in two places:
+  // (1) TableCache::FindTable() calls the function when table cache miss
+  //     and cache the table object returned.
+  // (2) SstFileReader (for SST Dump) opens the table and dumps the table
+  //     contents using the iterator of the table.
+  // options and soptions are options. options is the general options.
+  // Multiple configuration options can be accessed from there, including and not
+  // limited to block cache and key comparators.
+  // file is a file handler to handle the file for the table
+  // file_size is the physical file size of the file
+  // table_reader is the output table reader
+  virtual Status GetTableReader(
+      const Options& options, const EnvOptions& soptions,
+      unique_ptr<RandomAccessFile> && file, uint64_t file_size,
+      unique_ptr<TableReader>* table_reader) const = 0;
+
+  // Return a table builder to write to a file for this table type.
+  //
+  // It is called in several places:
+  // (1) When flushing memtable to a level-0 output file, it creates a table
+  //     builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
+  // (2) During compaction, it gets the builder for writing compaction output
+  //     files in DBImpl::OpenCompactionOutputFile().
+  // (3) When recovering from transaction logs, it creates a table builder to
+  //     write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
+  //     by calling BuildTable())
+  // (4) When running Repairer, it creates a table builder to convert logs to
+  //     SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
+  //
+  // options is the general options. Multiple configuration options can be accessed from
+  // there, including and not limited to compression options.
+  // file is a handle of a writable file. It is the caller's responsibility to
+  // keep the file open and close the file after closing the table builder.
+  // compression_type is the compression type to use in this table.
+  virtual TableBuilder* GetTableBuilder(
+      const Options& options, WritableFile* file,
+      CompressionType compression_type) const = 0;
+};
+}  // namespace rocksdb
diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h
new file mode 100644 (file)
index 0000000..8824ca1
--- /dev/null
@@ -0,0 +1,90 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+// TableProperties contains a bunch of read-only properties of its associated
+// table.
+struct TableProperties {
+ public:
+  // Other than basic table properties, each table may also have the user
+  // collected properties.
+  // The value of the user-collected properties are encoded as raw bytes --
+  // users have to interpret these values by themselves.
+  typedef
+    std::unordered_map<std::string, std::string>
+    UserCollectedProperties;
+
+  // the total size of all data blocks.
+  uint64_t data_size = 0;
+  // the size of index block.
+  uint64_t index_size = 0;
+  // the size of filter block.
+  uint64_t filter_size = 0;
+  // total raw key size
+  uint64_t raw_key_size = 0;
+  // total raw value size
+  uint64_t raw_value_size = 0;
+  // the number of blocks in this table
+  uint64_t num_data_blocks = 0;
+  // the number of entries in this table
+  uint64_t num_entries = 0;
+
+  // The name of the filter policy used in this table.
+  // If no filter policy is used, `filter_policy_name` will be an empty string.
+  std::string filter_policy_name;
+
+  // user collected properties
+  UserCollectedProperties user_collected_properties;
+
+  // convert this object to a human readable form
+  //   @prop_delim: delimiter for each property.
+  std::string ToString(
+      const std::string& prop_delim = "; ",
+      const std::string& kv_delim = "=") const;
+};
+
+// `TablePropertiesCollector` provides the mechanism for users to collect
+// their own interested properties. This class is essentially a collection
+//  of callback functions that will be invoked during table building.
+class TablePropertiesCollector {
+ public:
+  virtual ~TablePropertiesCollector() { }
+
+  // Add() will be called when a new key/value pair is inserted into the table.
+  // @params key    the original key that is inserted into the table.
+  // @params value  the original value that is inserted into the table.
+  virtual Status Add(const Slice& key, const Slice& value) = 0;
+
+  // Finish() will be called when a table has already been built and is ready
+  // for writing the properties block.
+  // @params properties  User will add their collected statistics to
+  // `properties`.
+  virtual Status Finish(
+      TableProperties::UserCollectedProperties* properties) = 0;
+
+  // The name of the properties collector can be used for debugging purpose.
+  virtual const char* Name() const = 0;
+
+  // Return the human-readable properties, where the key is property name and
+  // the value is the human-readable form of value.
+  virtual TableProperties::UserCollectedProperties
+    GetReadableProperties() const = 0;
+};
+
+// Extra properties
+// Below is a list of non-basic properties that are collected by database
+// itself. Especially some properties regarding to the internal keys (which
+// is unknown to `table`).
+extern uint64_t GetDeletedKeys(
+    const TableProperties::UserCollectedProperties& props);
+
+}  // namespace rocksdb
diff --git a/include/rocksdb/transaction_log.h b/include/rocksdb/transaction_log.h
new file mode 100644 (file)
index 0000000..41a3250
--- /dev/null
@@ -0,0 +1,91 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include <memory>
+#include <vector>
+
+namespace rocksdb {
+
+class LogFile;
+typedef std::vector<std::unique_ptr<LogFile>> VectorLogPtr;
+
+enum  WalFileType {
+  /* Indicates that WAL file is in archive directory. WAL files are moved from
+   * the main db directory to archive directory once they are not live and stay
+   * there until cleaned up. Files are cleaned depending on archive size
+   * (Options::WAL_size_limit_MB) and time since last cleaning
+   * (Options::WAL_ttl_seconds).
+   */
+  kArchivedLogFile = 0,
+
+  /* Indicates that WAL file is live and resides in the main db directory */
+  kAliveLogFile = 1
+} ;
+
+class LogFile {
+ public:
+  LogFile() {}
+  virtual ~LogFile() {}
+
+  // Returns log file's pathname relative to the main db dir
+  // Eg. For a live-log-file = /000003.log
+  //     For an archived-log-file = /archive/000003.log
+  virtual std::string PathName() const = 0;
+
+
+  // Primary identifier for log file.
+  // This is directly proportional to creation time of the log file
+  virtual uint64_t LogNumber() const = 0;
+
+  // Log file can be either alive or archived
+  virtual WalFileType Type() const = 0;
+
+  // Starting sequence number of writebatch written in this log file
+  virtual SequenceNumber StartSequence() const = 0;
+
+  // Size of log file on disk in Bytes
+  virtual uint64_t SizeFileBytes() const = 0;
+};
+
+struct BatchResult {
+  SequenceNumber sequence = 0;
+  std::unique_ptr<WriteBatch> writeBatchPtr;
+};
+
+// A TransactionLogIterator is used to iterate over the transactions in a db.
+// One run of the iterator is continuous, i.e. the iterator will stop at the
+// beginning of any gap in sequences
+class TransactionLogIterator {
+ public:
+  TransactionLogIterator() {}
+  virtual ~TransactionLogIterator() {}
+
+  // An iterator is either positioned at a WriteBatch or not valid.
+  // This method returns true if the iterator is valid.
+  // Can read data from a valid iterator.
+  virtual bool Valid() = 0;
+
+  // Moves the iterator to the next WriteBatch.
+  // REQUIRES: Valid() to be true.
+  virtual void Next() = 0;
+
+  // Returns ok if the iterator is valid.
+  // Returns the Error when something has gone wrong.
+  virtual Status status() = 0;
+
+  // If valid return's the current write_batch and the sequence number of the
+  // earliest transaction contained in the batch.
+  // ONLY use if Valid() is true and status() is OK.
+  virtual BatchResult GetBatch() = 0;
+};
+} //  namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h
new file mode 100644 (file)
index 0000000..f20bf82
--- /dev/null
@@ -0,0 +1,20 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_TYPES_H_
+#define STORAGE_ROCKSDB_INCLUDE_TYPES_H_
+
+#include <stdint.h>
+
+namespace rocksdb {
+
+// Define all public custom types here.
+
+// Represents a sequence number in a WAL file.
+typedef uint64_t SequenceNumber;
+
+}  //  namespace rocksdb
+
+#endif //  STORAGE_ROCKSDB_INCLUDE_TYPES_H_
diff --git a/include/rocksdb/universal_compaction.h b/include/rocksdb/universal_compaction.h
new file mode 100644 (file)
index 0000000..ec862b9
--- /dev/null
@@ -0,0 +1,89 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
+#define STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
+
+#include <stddef.h>
+#include <string>
+#include <memory>
+#include <vector>
+#include <stdint.h>
+#include <climits>
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+
+namespace rocksdb {
+
+//
+// Algorithm used to make a compaction request stop picking new files
+// into a single compaction run
+//
+enum CompactionStopStyle {
+  kCompactionStopStyleSimilarSize, // pick files of similar size
+  kCompactionStopStyleTotalSize    // total size of picked files > next file
+};
+
+class CompactionOptionsUniversal {
+ public:
+
+  // Percentage flexibility while comparing file size. If the candidate file(s)
+  // size is 1% smaller than the next file's size, then include next file into
+  // this candidate set.  Default: 1
+  unsigned int size_ratio;
+
+  // The minimum number of files in a single compaction run. Default: 2
+  unsigned int min_merge_width;
+
+  // The maximum number of files in a single compaction run. Default: UINT_MAX
+  unsigned int max_merge_width;
+
+  // The size amplification is defined as the amount (in percentage) of
+  // additional storage needed to store a single byte of data in the database.
+  // For example, a size amplification of 2% means that a database that
+  // contains 100 bytes of user-data may occupy up to 102 bytes of
+  // physical storage. By this definition, a fully compacted database has
+  // a size amplification of 0%. Rocksdb uses the following heuristic
+  // to calculate size amplification: it assumes that all files excluding
+  // the earliest file contribute to the size amplification.
+  // Default: 200, which means that a 100 byte database could require up to
+  // 300 bytes of storage.
+  unsigned int max_size_amplification_percent;
+
+  // If this option is set to be -1 (the default value), all the output files
+  // will follow compression type specified.
+  //
+  // If this option is not negative, we will try to make sure compressed
+  // size is just above this value. In normal cases, at least this percentage
+  // of data will be compressed.
+  // When we are compacting to a new file, here is the criteria whether
+  // it needs to be compressed: assuming here are the list of files sorted
+  // by generation time:
+  //    A1...An B1...Bm C1...Ct
+  // where A1 is the newest and Ct is the oldest, and we are going to compact
+  // B1...Bm, we calculate the total size of all the files as total_size, as
+  // well as  the total size of C1...Ct as total_C, the compaction output file
+  // will be compressed iff
+  //   total_C / total_size < this percentage
+  int compression_size_percent;
+
+  // The algorithm used to stop picking files into a single compaction run
+  // Default: kCompactionStopStyleTotalSize
+  CompactionStopStyle stop_style;
+
+  // Default set of parameters
+  CompactionOptionsUniversal() :
+    size_ratio(1),
+    min_merge_width(2),
+    max_merge_width(UINT_MAX),
+    max_size_amplification_percent(200),
+    compression_size_percent(-1),
+    stop_style(kCompactionStopStyleTotalSize) {
+  }
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h
new file mode 100644 (file)
index 0000000..2cfb731
--- /dev/null
@@ -0,0 +1,112 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch holds a collection of updates to apply atomically to a DB.
+//
+// The updates are applied in the order in which they are added
+// to the WriteBatch.  For example, the value of "key" will be "v3"
+// after the following batch is written:
+//
+//    batch.Put("key", "v1");
+//    batch.Delete("key");
+//    batch.Put("key", "v2");
+//    batch.Put("key", "v3");
+//
+// Multiple threads can invoke const methods on a WriteBatch without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same WriteBatch must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
+#define STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
+
+#include <string>
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class Slice;
+struct SliceParts;
+
+class WriteBatch {
+ public:
+  explicit WriteBatch(size_t reserved_bytes = 0);
+  ~WriteBatch();
+
+  // Store the mapping "key->value" in the database.
+  void Put(const Slice& key, const Slice& value);
+
+  // Variant of Put() that gathers output like writev(2).  The key and value
+  // that will be written to the database are concatenations of arrays of
+  // slices.
+  void Put(const SliceParts& key, const SliceParts& value);
+
+  // Merge "value" with the existing value of "key" in the database.
+  // "key->merge(existing, value)"
+  void Merge(const Slice& key, const Slice& value);
+
+  // If the database contains a mapping for "key", erase it.  Else do nothing.
+  void Delete(const Slice& key);
+
+  // Append a blob of arbitrary size to the records in this batch. The blob will
+  // be stored in the transaction log but not in any other file. In particular,
+  // it will not be persisted to the SST files. When iterating over this
+  // WriteBatch, WriteBatch::Handler::LogData will be called with the contents
+  // of the blob as it is encountered. Blobs, puts, deletes, and merges will be
+  // encountered in the same order in which they were inserted. The blob will
+  // NOT consume sequence number(s) and will NOT increase the count of the batch
+  //
+  // Example application: add timestamps to the transaction log for use in
+  // replication.
+  void PutLogData(const Slice& blob);
+
+  // Clear all updates buffered in this batch.
+  void Clear();
+
+  // Support for iterating over the contents of a batch.
+  class Handler {
+   public:
+    virtual ~Handler();
+    virtual void Put(const Slice& key, const Slice& value) = 0;
+    // Merge and LogData are not pure virtual. Otherwise, we would break
+    // existing clients of Handler on a source code level. The default
+    // implementation of Merge simply throws a runtime exception.
+    virtual void Merge(const Slice& key, const Slice& value);
+    // The default implementation of LogData does nothing.
+    virtual void LogData(const Slice& blob);
+    virtual void Delete(const Slice& key) = 0;
+    // Continue is called by WriteBatch::Iterate. If it returns false,
+    // iteration is halted. Otherwise, it continues iterating. The default
+    // implementation always returns true.
+    virtual bool Continue();
+  };
+  Status Iterate(Handler* handler) const;
+
+  // Retrieve the serialized version of this batch.
+  const std::string& Data() const { return rep_; }
+
+  // Retrieve data size of the batch.
+  size_t GetDataSize() const { return rep_.size(); }
+
+  // Returns the number of updates in the batch
+  int Count() const;
+
+  // Constructor with a serialized string object
+  explicit WriteBatch(std::string rep): rep_(rep) {}
+
+ private:
+  friend class WriteBatchInternal;
+
+  std::string rep_;  // See comment in write_batch.cc for the format of rep_
+
+  // Intentionally copyable
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
diff --git a/include/utilities/backupable_db.h b/include/utilities/backupable_db.h
new file mode 100644 (file)
index 0000000..fbe2ae8
--- /dev/null
@@ -0,0 +1,151 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "utilities/stackable_db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+
+#include <string>
+#include <map>
+#include <vector>
+
+namespace rocksdb {
+
+struct BackupableDBOptions {
+  // Where to keep the backup files. Has to be different than dbname_
+  // Best to set this to dbname_ + "/backups"
+  // Required
+  std::string backup_dir;
+
+  // Backup Env object. It will be used for backup file I/O. If it's
+  // nullptr, backups will be written out using DBs Env. If it's
+  // non-nullptr, backup's I/O will be performed using this object.
+  // If you want to have backups on HDFS, use HDFS Env here!
+  // Default: nullptr
+  Env* backup_env;
+
+  // If share_table_files == true, backup will assume that table files with
+  // same name have the same contents. This enables incremental backups and
+  // avoids unnecessary data copies.
+  // If share_table_files == false, each backup will be on its own and will
+  // not share any data with other backups.
+  // default: true
+  bool share_table_files;
+
+  // Backup info and error messages will be written to info_log
+  // if non-nullptr.
+  // Default: nullptr
+  Logger* info_log;
+
+  // If sync == true, we can guarantee you'll get consistent backup even
+  // on a machine crash/reboot. Backup process is slower with sync enabled.
+  // If sync == false, we don't guarantee anything on machine reboot. However,
+  // chances are some of the backups are consistent.
+  // Default: true
+  bool sync;
+
+  // If true, it will delete whatever backups there are already
+  // Default: false
+  bool destroy_old_data;
+
+  explicit BackupableDBOptions(const std::string& _backup_dir,
+                               Env* _backup_env = nullptr,
+                               bool _share_table_files = true,
+                               Logger* _info_log = nullptr,
+                               bool _sync = true,
+                               bool _destroy_old_data = false) :
+      backup_dir(_backup_dir),
+      backup_env(_backup_env),
+      info_log(_info_log),
+      sync(_sync),
+      destroy_old_data(_destroy_old_data) { }
+};
+
+class BackupEngine;
+
+typedef uint32_t BackupID;
+
+struct BackupInfo {
+  BackupID backup_id;
+  int64_t timestamp;
+  uint64_t size;
+
+  BackupInfo() {}
+  BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size)
+      : backup_id(_backup_id), timestamp(_timestamp), size(_size) {}
+};
+
+// Stack your DB with BackupableDB to be able to backup the DB
+class BackupableDB : public StackableDB {
+ public:
+  // BackupableDBOptions have to be the same as the ones used in a previous
+  // incarnation of the DB
+  //
+  // BackupableDB owns the pointer `DB* db` now. You should not delete it or
+  // use it after the invocation of BackupableDB
+  BackupableDB(DB* db, const BackupableDBOptions& options);
+  virtual ~BackupableDB();
+
+  // Captures the state of the database in the latest backup
+  // NOT a thread safe call
+  Status CreateNewBackup(bool flush_before_backup = false);
+  // Returns info about backups in backup_info
+  void GetBackupInfo(std::vector<BackupInfo>* backup_info);
+  // deletes old backups, keeping latest num_backups_to_keep alive
+  Status PurgeOldBackups(uint32_t num_backups_to_keep);
+  // deletes a specific backup
+  Status DeleteBackup(BackupID backup_id);
+  // Call this from another thread if you want to stop the backup
+  // that is currently happening. It will return immediately, will
+  // not wait for the backup to stop.
+  // The backup will stop ASAP and the call to CreateNewBackup will
+  // return Status::Incomplete(). It will not clean up after itself, but
+  // the state will remain consistent. The state will be cleaned up
+  // next time you create BackupableDB or RestoreBackupableDB.
+  void StopBackup();
+
+ private:
+  BackupEngine* backup_engine_;
+};
+
+// Use this class to access information about backups and restore from them
+class RestoreBackupableDB {
+  public:
+   RestoreBackupableDB(Env* db_env, const BackupableDBOptions& options);
+   ~RestoreBackupableDB();
+
+   // Returns info about backups in backup_info
+   void GetBackupInfo(std::vector<BackupInfo>* backup_info);
+
+   // restore from backup with backup_id
+   // IMPORTANT -- if options_.share_table_files == true and you restore DB
+   // from some backup that is not the latest, and you start creating new
+   // backups from the new DB, all the backups that were newer than the
+   // backup you restored from will be deleted
+   //
+   // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3.
+   // If you try creating a new backup now, old backups 4 and 5 will be deleted
+   // and new backup with ID 4 will be created.
+   Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir,
+                              const std::string& wal_dir);
+
+   // restore from the latest backup
+   Status RestoreDBFromLatestBackup(const std::string& db_dir,
+                                    const std::string& wal_dir);
+   // deletes old backups, keeping latest num_backups_to_keep alive
+   Status PurgeOldBackups(uint32_t num_backups_to_keep);
+   // deletes a specific backup
+   Status DeleteBackup(BackupID backup_id);
+
+ private:
+  BackupEngine* backup_engine_;
+};
+
+} // rocksdb namespace
diff --git a/include/utilities/stackable_db.h b/include/utilities/stackable_db.h
new file mode 100644 (file)
index 0000000..908fe10
--- /dev/null
@@ -0,0 +1,161 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+// This class contains APIs to stack rocksdb wrappers. E.g. stack TTL over a base DB.
+class StackableDB : public DB {
+ public:
+  // StackableDB is the owner of db now!
+  // (the wrapped DB is deleted in ~StackableDB below)
+  explicit StackableDB(DB* db) : db_(db) {}
+
+  ~StackableDB() {
+    delete db_;
+  }
+
+  // Escape hatch: returns the wrapped DB so callers can reach past the
+  // wrapper. The returned pointer is still owned by this StackableDB.
+  virtual DB* GetBaseDB() {
+    return db_;
+  }
+
+  // All DB methods below simply forward to the wrapped db_ unchanged;
+  // subclasses override the ones whose behavior they want to augment.
+  virtual Status Put(const WriteOptions& options,
+                     const Slice& key,
+                     const Slice& val) override {
+    return db_->Put(options, key, val);
+  }
+
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key,
+                     std::string* value) override {
+    return db_->Get(options, key, value);
+  }
+
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values)
+    override {
+      return db_->MultiGet(options, keys, values);
+  }
+
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           const Slice& key,
+                           std::string* value,
+                           bool* value_found = nullptr) override {
+    return db_->KeyMayExist(options, key, value, value_found);
+  }
+
+  virtual Status Delete(const WriteOptions& wopts, const Slice& key) override {
+    return db_->Delete(wopts, key);
+  }
+
+  virtual Status Merge(const WriteOptions& options,
+                       const Slice& key,
+                       const Slice& value) override {
+    return db_->Merge(options, key, value);
+  }
+
+
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates)
+    override {
+      return db_->Write(opts, updates);
+  }
+
+  virtual Iterator* NewIterator(const ReadOptions& opts) override {
+    return db_->NewIterator(opts);
+  }
+
+  virtual const Snapshot* GetSnapshot() override {
+    return db_->GetSnapshot();
+  }
+
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
+    return db_->ReleaseSnapshot(snapshot);
+  }
+
+  virtual bool GetProperty(const Slice& property, std::string* value)
+    override {
+      return db_->GetProperty(property, value);
+  }
+
+  virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes)
+    override {
+      return db_->GetApproximateSizes(r, n, sizes);
+  }
+
+  virtual void CompactRange(const Slice* begin, const Slice* end,
+                            bool reduce_level = false,
+                            int target_level = -1) override {
+    return db_->CompactRange(begin, end, reduce_level, target_level);
+  }
+
+  virtual int NumberLevels() override {
+    return db_->NumberLevels();
+  }
+
+  virtual int MaxMemCompactionLevel() override {
+    return db_->MaxMemCompactionLevel();
+  }
+
+  virtual int Level0StopWriteTrigger() override {
+    return db_->Level0StopWriteTrigger();
+  }
+
+  virtual const std::string& GetName() const override {
+    return db_->GetName();
+  }
+
+  virtual Env* GetEnv() const override {
+    return db_->GetEnv();
+  }
+
+  virtual const Options& GetOptions() const override {
+    return db_->GetOptions();
+  }
+
+  virtual Status Flush(const FlushOptions& fopts) override {
+    return db_->Flush(fopts);
+  }
+
+  virtual Status DisableFileDeletions() override {
+    return db_->DisableFileDeletions();
+  }
+
+  virtual Status EnableFileDeletions(bool force) override {
+    return db_->EnableFileDeletions(force);
+  }
+
+  virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
+                              bool flush_memtable = true) override {
+      return db_->GetLiveFiles(vec, mfs, flush_memtable);
+  }
+
+  virtual SequenceNumber GetLatestSequenceNumber() const override {
+    return db_->GetLatestSequenceNumber();
+  }
+
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
+    return db_->GetSortedWalFiles(files);
+  }
+
+  virtual Status DeleteFile(std::string name) override {
+    return db_->DeleteFile(name);
+  }
+
+  // NOTE(review): unlike every other forwarder here this one is not marked
+  // `override` — presumably an oversight; confirm it matches DB's signature.
+  virtual Status GetDbIdentity(std::string& identity) {
+    return db_->GetDbIdentity(identity);
+  }
+
+  virtual Status GetUpdatesSince(SequenceNumber seq_number,
+                                 unique_ptr<TransactionLogIterator>* iter)
+    override {
+      return db_->GetUpdatesSince(seq_number, iter);
+  }
+
+ protected:
+  // The wrapped database. Owned: deleted in the destructor.
+  DB* db_;
+};
+
+} //  namespace rocksdb
diff --git a/include/utilities/utility_db.h b/include/utilities/utility_db.h
new file mode 100644 (file)
index 0000000..1a7a269
--- /dev/null
@@ -0,0 +1,50 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "stackable_db.h"
+
+namespace rocksdb {
+
+// This class contains APIs to open rocksdb with specific support eg. TTL
+class UtilityDB {
+
+  public:
+    // Open the database with TTL support.
+    //
+    // USE-CASES:
+    // This API should be used to open the db when key-values inserted are
+    //  meant to be removed from the db in a non-strict 'ttl' amount of time
+    //  Therefore, this guarantees that key-values inserted will remain in the
+    //  db for >= ttl amount of time and the db will make efforts to remove the
+    //  key-values as soon as possible after ttl seconds of their insertion.
+    //
+    // BEHAVIOUR:
+    // TTL is accepted in seconds
+    // (int32_t)Timestamp(creation) is suffixed to values in Put internally
+    // Expired TTL values deleted in compaction only:(Timestamp+ttl<time_now)
+    // Get/Iterator may return expired entries(compaction not run on them yet)
+    // Different TTL may be used during different Opens
+    // Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2
+    //          Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5
+    // read_only=true opens in the usual read-only mode. Compactions will not be
+    //  triggered(neither manual nor automatic), so no expired entries removed
+    //
+    // CONSTRAINTS:
+    // Not specifying/passing or non-positive TTL behaves like TTL = infinity
+    //
+    // !!!WARNING!!!:
+    // Calling DB::Open directly to re-open a db created by this API will get
+    //  corrupt values(timestamp suffixed) and no ttl effect will be there
+    //  during the second Open, so use this API consistently to open the db
+    // Be careful when passing ttl with a small positive value because the
+    //  whole database may be deleted in a small amount of time
+    //
+    // On success *dbptr receives the opened StackableDB — presumably owned
+    // by the caller (delete when done); confirm against the implementation.
+    static Status OpenTtlDB(const Options& options,
+                            const std::string& name,
+                            StackableDB** dbptr,
+                            int32_t ttl = 0,
+                            bool read_only = false);
+};
+
+} //  namespace rocksdb
diff --git a/linters/src/.phutil_module_cache b/linters/src/.phutil_module_cache
new file mode 100644 (file)
index 0000000..5c93a84
--- /dev/null
@@ -0,0 +1 @@
+{"__symbol_cache_version__":8,"b937ad5f80a8bd1156038b730ff56ec5":{"have":{"class":{"FacebookFbcodeLintEngine":71}},"need":{"class":{"ArcanistLintEngine":104,"ArcanistGeneratedLinter":488,"ArcanistNoLintLinter":577,"ArcanistTextLinter":658,"ArcanistPEP8Linter":1227,"FbcodeCppLinter":1715,"PfffCppLinter":1759,"ArcanistSpellingLinter":1875,"ArcanistFilenameLinter":4207,"Filesystem":357,"ArcanistLintSeverity":778}},"xmap":{"FacebookFbcodeLintEngine":["ArcanistLintEngine"]}},"02e2a613e371424b2108d2d6cb849d39":{"have":{"class":{"PfffCppLinter":71}},"need":{"function":{"Futures":875},"class":{"ArcanistLinter":93,"ExecFuture":756,"ArcanistLintMessage":1270,"ArcanistLintSeverity":1607}},"xmap":{"PfffCppLinter":["ArcanistLinter"]}},"4443484928afb005f585843d07b04190":{"have":{"class":{"FbcodeCppLinter":13}},"need":{"function":{"Futures":1265},"class":{"ArcanistLinter":37,"ExecFuture":934,"ArcanistLintSeverity":1729}},"xmap":{"FbcodeCppLinter":["ArcanistLinter"]}}}
\ No newline at end of file
diff --git a/linters/src/__phutil_library_init__.php b/linters/src/__phutil_library_init__.php
new file mode 100644 (file)
index 0000000..4b8d3d1
--- /dev/null
@@ -0,0 +1,3 @@
+<?php
+
+phutil_register_library('linters', __FILE__);
diff --git a/linters/src/__phutil_library_map__.php b/linters/src/__phutil_library_map__.php
new file mode 100644 (file)
index 0000000..cb10bed
--- /dev/null
@@ -0,0 +1,26 @@
+<?php
+
+/**
+ * This file is automatically generated. Use 'arc liberate' to rebuild it.
+ * @generated
+ * @phutil-library-version 2
+ */
+
+phutil_register_library_map(array(
+  '__library_version__' => 2,
+  'class' =>
+  array(
+    'FacebookFbcodeLintEngine' => 'lint_engine/FacebookFbcodeLintEngine.php',
+    'FbcodeCppLinter' => 'cpp_linter/FbcodeCppLinter.php',
+    'PfffCppLinter' => 'cpp_linter/PfffCppLinter.php',
+  ),
+  'function' =>
+  array(
+  ),
+  'xmap' =>
+  array(
+    'FacebookFbcodeLintEngine' => 'ArcanistLintEngine',
+    'FbcodeCppLinter' => 'ArcanistLinter',
+    'PfffCppLinter' => 'ArcanistLinter',
+  ),
+));
diff --git a/linters/src/cpp_linter/FbcodeCppLinter.php b/linters/src/cpp_linter/FbcodeCppLinter.php
new file mode 100644 (file)
index 0000000..e62d3bb
--- /dev/null
@@ -0,0 +1,99 @@
+<?php
+
+class FbcodeCppLinter extends ArcanistLinter {
+  const CPPLINT      = "/home/engshare/tools/cpplint";
+  const LINT_ERROR   = 1;
+  const LINT_WARNING = 2;
+  const C_FLAG = "--c_mode=true";
+  // path => resolvex() result (stdout/stderr pair) captured in willLintPaths.
+  private $rawLintOutput = array();
+
+  // Locates a cpplint binary ($PATH first, then the hard-coded fbcode path),
+  // runs it over every path in parallel, and stashes the raw output for
+  // lintPath() to parse later. Silently does nothing if no binary is found.
+  public function willLintPaths(array $paths) {
+    $futures = array();
+    $ret_value = 0;
+    $last_line = system("which cpplint", $ret_value);
+    $CPP_LINT = false;
+    if ($ret_value == 0) {
+      $CPP_LINT = $last_line;
+    } else if (file_exists(self::CPPLINT)) {
+      $CPP_LINT = self::CPPLINT;
+    }
+
+    if ($CPP_LINT) {
+      foreach ($paths as $p) {
+        $lpath = $this->getEngine()->getFilePathOnDisk($p);
+        $lpath_file = file($lpath);
+        // Treat the file as C (adds C_FLAG) if it has a .c extension or its
+        // first line carries an emacs/vim C-mode marker.
+        if (preg_match('/\.(c)$/', $lpath) ||
+            preg_match('/-\*-.*Mode: C[; ].*-\*-/', $lpath_file[0]) ||
+            preg_match('/vim(:.*)*:\s*(set\s+)?filetype=c\s*:/', $lpath_file[0])
+            ) {
+          $futures[$p] = new ExecFuture("%s %s %s 2>&1",
+                             $CPP_LINT, self::C_FLAG,
+                             $this->getEngine()->getFilePathOnDisk($p));
+        } else {
+          // NOTE(review): this branch uses the hard-coded self::CPPLINT
+          // instead of the resolved $CPP_LINT above — if cpplint was found
+          // via $PATH only, this future points at a path that may not exist.
+          // Presumably unintended; confirm before relying on it.
+          $futures[$p] = new ExecFuture("%s %s 2>&1",
+            self::CPPLINT, $this->getEngine()->getFilePathOnDisk($p));
+        }
+      }
+
+      // Run at most 8 cpplint processes concurrently.
+      foreach (Futures($futures)->limit(8) as $p => $f) {
+        $this->rawLintOutput[$p] = $f->resolvex();
+      }
+    }
+    return;
+  }
+
+  public function getLinterName() {
+    return "FBCPP";
+  }
+
+  // Emits one lint message per parsed cpplint diagnostic for this path.
+  public function lintPath($path) {
+    $msgs = $this->getCppLintOutput($path);
+    foreach ($msgs as $m) {
+      $this->raiseLintAtLine($m['line'], 0, $m['severity'], $m['msg']);
+    }
+  }
+
+  public function getLintSeverityMap() {
+    return array(
+      self::LINT_WARNING => ArcanistLintSeverity::SEVERITY_WARNING,
+      self::LINT_ERROR   => ArcanistLintSeverity::SEVERITY_ERROR
+    );
+  }
+
+  public function getLintNameMap() {
+    return array(
+      self::LINT_WARNING => "CppLint Warning",
+      self::LINT_ERROR   => "CppLint Error"
+    );
+  }
+
+  // Parses cpplint's "file(line): message" output for $path into an array of
+  // ['line' =>, 'msg' =>, 'severity' =>] records. Lines that don't match the
+  // pattern are treated as continuations of the previous message.
+  private function getCppLintOutput($path) {
+    list($output) = $this->rawLintOutput[$path];
+
+    $msgs = array();
+    $current = null;
+    foreach (explode("\n", $output) as $line) {
+      if (preg_match('/[^:]*\((\d+)\):(.*)$/', $line, $matches)) {
+        if ($current) {
+          $msgs[] = $current;
+        }
+        $line = $matches[1];
+        $text = $matches[2];
+        // Messages containing "Warning" are warnings; everything else errors.
+        $sev  = preg_match('/.*Warning.*/', $text)
+                  ? self::LINT_WARNING
+                  : self::LINT_ERROR;
+        $current = array('line'     => $line,
+                         'msg'      => $text,
+                         'severity' => $sev);
+      } else if ($current) {
+        $current['msg'] .= ' ' . $line;
+      }
+    }
+    // Flush the last in-progress message.
+    if ($current) {
+      $msgs[] = $current;
+    }
+
+    return $msgs;
+  }
+}
+
diff --git a/linters/src/cpp_linter/PfffCppLinter.php b/linters/src/cpp_linter/PfffCppLinter.php
new file mode 100644 (file)
index 0000000..6736614
--- /dev/null
@@ -0,0 +1,68 @@
+<?php
+// Copyright 2004-present Facebook.  All rights reserved.
+
+class PfffCppLinter extends ArcanistLinter {
+  const PROGRAM      = "/home/engshare/tools/checkCpp";
+
+  public function getLinterName() {
+    return "checkCpp";
+  }
+  // Intentionally empty: this linter reports everything as generic warnings
+  // via addLintMessage below rather than through name/severity maps.
+  public function getLintNameMap() {
+    return array(
+    );
+  }
+
+  public function getLintSeverityMap() {
+    return array(
+    );
+  }
+
+  // Locates checkCpp ($PATH first, then the hard-coded fbcode path), runs it
+  // over all paths in parallel, and converts its JSON output directly into
+  // lint messages. Does nothing if the binary is not found.
+  public function willLintPaths(array $paths) {
+    $program = false;
+    $ret_value = 0;
+    $last_line = system("which checkCpp", $ret_value);
+    if ($ret_value == 0) {
+      $program = $last_line;
+    } else if (file_exists(self::PROGRAM)) {
+      $program = self::PROGRAM;
+    }
+    if ($program) {
+      $futures = array();
+      foreach ($paths as $p) {
+        $futures[$p] = new ExecFuture("%s --lint %s 2>&1",
+          $program, $this->getEngine()->getFilePathOnDisk($p));
+      }
+      // At most 8 concurrent checkCpp processes.
+      foreach (Futures($futures)->limit(8) as $p => $f) {
+
+        list($stdout, $stderr) = $f->resolvex();
+        $raw = json_decode($stdout, true);
+        if (!is_array($raw)) {
+          throw new Exception(
+            "checkCpp returned invalid JSON!".
+            "Stdout: {$stdout} Stderr: {$stderr}"
+          );
+        }
+        foreach($raw as $err) {
+          $this->addLintMessage(
+            ArcanistLintMessage::newFromDictionary(
+              array(
+                'path' => $err['file'],
+                'line' => $err['line'],
+                'char' => 0,
+                'name' => $err['name'],
+                'description' => $err['info'],
+                'code' => $this->getLinterName(),
+                'severity' => ArcanistLintSeverity::SEVERITY_WARNING,
+              )
+            )
+          );
+        }
+      }
+    }
+    return;
+  }
+
+  // No-op: all messages were already added in willLintPaths above.
+  public function lintPath($path) {
+    return;
+  }
+}
diff --git a/linters/src/lint_engine/FacebookFbcodeLintEngine.php b/linters/src/lint_engine/FacebookFbcodeLintEngine.php
new file mode 100644 (file)
index 0000000..c34530c
--- /dev/null
@@ -0,0 +1,147 @@
+<?php
+// Copyright 2004-present Facebook.  All rights reserved.
+
+class FacebookFbcodeLintEngine extends ArcanistLintEngine {
+
+  // Builds the set of linters to run for this change and assigns each
+  // changed path to the linters that apply (by extension / location).
+  public function buildLinters() {
+    $linters = array();
+    $paths = $this->getPaths();
+
+    // Remove all deleted files, which are not checked by the
+    // following linters.
+    foreach ($paths as $key => $path) {
+      if (!Filesystem::pathExists($this->getFilePathOnDisk($path))) {
+        unset($paths[$key]);
+      }
+    }
+
+    $generated_linter = new ArcanistGeneratedLinter();
+    $linters[] = $generated_linter;
+
+    $nolint_linter = new ArcanistNoLintLinter();
+    $linters[] = $nolint_linter;
+
+    // Generic text linter; long-line wraps are downgraded to advice.
+    $text_linter = new ArcanistTextLinter();
+    $text_linter->setCustomSeverityMap(array(
+      ArcanistTextLinter::LINT_LINE_WRAP
+        => ArcanistLintSeverity::SEVERITY_ADVICE,
+    ));
+    $linters[] = $text_linter;
+
+    // Java files get a wider 100-column limit.
+    $java_text_linter = new ArcanistTextLinter();
+    $java_text_linter->setMaxLineLength(100);
+    $java_text_linter->setCustomSeverityMap(array(
+      ArcanistTextLinter::LINT_LINE_WRAP
+        => ArcanistLintSeverity::SEVERITY_ADVICE,
+    ));
+    $linters[] = $java_text_linter;
+
+    $pep8_options = $this->getPEP8WithTextOptions().',E302';
+
+    $python_linter = new ArcanistPEP8Linter();
+    $python_linter->setConfig(array('options' => $pep8_options));
+    $linters[] = $python_linter;
+
+    // Variant for 2-space-indented python (additionally ignores E111).
+    $python_2space_linter = new ArcanistPEP8Linter();
+    $python_2space_linter->setConfig(array('options' => $pep8_options.',E111'));
+    $linters[] = $python_2space_linter;
+
+    // Currently we can't run cpplint in commit hook mode, because it
+    // depends on having access to the working directory.
+    // (In that mode $cpp_linter/$cpp_linter2 stay unset, which the isset()
+    // guard further down relies on.)
+    if (!$this->getCommitHookMode()) {
+      $cpp_linter = new FbcodeCppLinter();
+      $cpp_linter2 = new PfffCppLinter();
+      $linters[] = $cpp_linter;
+      $linters[] = $cpp_linter2;
+    }
+
+    $spelling_linter = new ArcanistSpellingLinter();
+    $linters[] = $spelling_linter;
+
+    // Assign each surviving path to the applicable linters.
+    foreach ($paths as $path) {
+      $is_text = false;
+
+      $text_extensions = (
+        '/\.('.
+        'cpp|cxx|c|cc|h|hpp|hxx|tcc|'.
+        'py|rb|hs|pl|pm|tw|'.
+        'php|phpt|css|js|'.
+        'java|'.
+        'thrift|'.
+        'lua|'.
+        'siv|'.
+        'txt'.
+        ')$/'
+      );
+      if (preg_match($text_extensions, $path)) {
+        $is_text = true;
+      }
+      if ($is_text) {
+        $nolint_linter->addPath($path);
+
+        $generated_linter->addPath($path);
+        $generated_linter->addData($path, $this->loadData($path));
+
+        if (preg_match('/\.java$/', $path)) {
+          $java_text_linter->addPath($path);
+          $java_text_linter->addData($path, $this->loadData($path));
+        } else {
+          $text_linter->addPath($path);
+          $text_linter->addData($path, $this->loadData($path));
+        }
+
+        $spelling_linter->addPath($path);
+        $spelling_linter->addData($path, $this->loadData($path));
+      }
+      if (isset($cpp_linter) && isset($cpp_linter2)  &&
+          preg_match('/\.(cpp|c|cc|cxx|h|hh|hpp|hxx|tcc)$/', $path)) {
+        $cpp_linter->addPath($path);
+        $cpp_linter->addData($path, $this->loadData($path));
+        $cpp_linter2->addPath($path);
+        $cpp_linter2->addData($path, $this->loadData($path));
+
+      }
+
+      // Match *.py and contbuild config files
+      if (preg_match('/(\.(py|tw|smcprops)|^contbuild\/configs\/[^\/]*)$/',
+                    $path)) {
+        // Default to 4-space PEP8; walk up the directory tree looking for a
+        // .python2space marker that switches the file to the 2-space linter.
+        $space_count = 4;
+        $real_path = $this->getFilePathOnDisk($path);
+        $dir = dirname($real_path);
+        do {
+          if (file_exists($dir.'/.python2space')) {
+            $space_count = 2;
+            break;
+          }
+          $dir = dirname($dir);
+        } while ($dir != '/' && $dir != '.');
+
+        if ($space_count == 4) {
+          $cur_path_linter = $python_linter;
+        } else {
+          $cur_path_linter = $python_2space_linter;
+        }
+        $cur_path_linter->addPath($path);
+        $cur_path_linter->addData($path, $this->loadData($path));
+
+        if (preg_match('/\.tw$/', $path)) {
+          $cur_path_linter->setCustomSeverityMap(array(
+            'E251' => ArcanistLintSeverity::SEVERITY_DISABLED,
+          ));
+        }
+      }
+
+
+
+    }
+
+    // Filename conventions apply to every path, including non-text files.
+    $name_linter = new ArcanistFilenameLinter();
+    $linters[] = $name_linter;
+    foreach ($paths as $path) {
+      $name_linter->addPath($path);
+    }
+
+    return $linters;
+  }
+
+}
diff --git a/port/README b/port/README
new file mode 100644 (file)
index 0000000..422563e
--- /dev/null
@@ -0,0 +1,10 @@
+This directory contains interfaces and implementations that isolate the
+rest of the package from platform details.
+
+Code in the rest of the package includes "port.h" from this directory.
+"port.h" in turn includes a platform specific "port_<platform>.h" file
+that provides the platform specific implementation.
+
+See port_posix.h for an example of what must be provided in a platform
+specific header file.
+
diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h
new file mode 100644 (file)
index 0000000..db3580b
--- /dev/null
@@ -0,0 +1,157 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// AtomicPointer provides storage for a lock-free pointer.
+// Platform-dependent implementation of AtomicPointer:
+// - If the platform provides a cheap barrier, we use it with raw pointers
+// - If cstdatomic is present (on newer versions of gcc, it is), we use
+//   a cstdatomic-based AtomicPointer.  However we prefer the memory
+//   barrier based version, because at least on a gcc 4.4 32-bit build
+//   on linux, we have encountered a buggy <cstdatomic>
+//   implementation.  Also, some <cstdatomic> implementations are much
+//   slower than a memory-barrier based implementation (~16ns for
+//   <cstdatomic> based acquire-load vs. ~1ns for a barrier based
+//   acquire-load).
+// This code is based on atomicops-internals-* in Google's perftools:
+// http://code.google.com/p/google-perftools/source/browse/#svn%2Ftrunk%2Fsrc%2Fbase
+
+#ifndef PORT_ATOMIC_POINTER_H_
+#define PORT_ATOMIC_POINTER_H_
+
+#include <stdint.h>
+#ifdef ROCKSDB_ATOMIC_PRESENT
+#include <atomic>
+#endif
+#ifdef OS_WIN
+#include <windows.h>
+#endif
+#ifdef OS_MACOSX
+#include <libkern/OSAtomic.h>
+#endif
+
+#if defined(_M_X64) || defined(__x86_64__)
+#define ARCH_CPU_X86_FAMILY 1
+#elif defined(_M_IX86) || defined(__i386__) || defined(__i386)
+#define ARCH_CPU_X86_FAMILY 1
+#elif defined(__ARMEL__)
+#define ARCH_CPU_ARM_FAMILY 1
+#endif
+
+namespace rocksdb {
+namespace port {
+
+// Define MemoryBarrier() if available
+// Windows on x86
+#if defined(OS_WIN) && defined(COMPILER_MSVC) && defined(ARCH_CPU_X86_FAMILY)
+// windows.h already provides a MemoryBarrier(void) macro
+// http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx
+#define ROCKSDB_HAVE_MEMORY_BARRIER
+
+// Gcc on x86
+#elif defined(ARCH_CPU_X86_FAMILY) && defined(__GNUC__)
+inline void MemoryBarrier() {
+  // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on
+  // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering.
+  // NOTE: this is a compiler-only fence (asm with a "memory" clobber, no
+  // fence instruction) — presumably it relies on x86's strong hardware
+  // ordering for the acquire/release uses below; confirm before porting.
+  __asm__ __volatile__("" : : : "memory");
+}
+#define ROCKSDB_HAVE_MEMORY_BARRIER
+
+// Sun Studio
+#elif defined(ARCH_CPU_X86_FAMILY) && defined(__SUNPRO_CC)
+inline void MemoryBarrier() {
+  // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on
+  // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering.
+  asm volatile("" : : : "memory");
+}
+#define ROCKSDB_HAVE_MEMORY_BARRIER
+
+// Mac OS
+#elif defined(OS_MACOSX)
+inline void MemoryBarrier() {
+  OSMemoryBarrier();
+}
+#define ROCKSDB_HAVE_MEMORY_BARRIER
+
+// ARM Linux
+#elif defined(ARCH_CPU_ARM_FAMILY) && defined(__linux__)
+typedef void (*LinuxKernelMemoryBarrierFunc)(void);
+// The Linux ARM kernel provides a highly optimized device-specific memory
+// barrier function at a fixed memory address that is mapped in every
+// user-level process.
+//
+// This beats using CPU-specific instructions which are, on single-core
+// devices, un-necessary and very costly (e.g. ARMv7-A "dmb" takes more
+// than 180ns on a Cortex-A8 like the one on a Nexus One). Benchmarking
+// shows that the extra function call cost is completely negligible on
+// multi-core devices.
+//
+// (0xffff0fa0 is presumably the kernel "kuser helper" memory-barrier entry
+// point — see Linux Documentation/arm/kernel_user_helpers.txt to confirm.)
+inline void MemoryBarrier() {
+  (*(LinuxKernelMemoryBarrierFunc)0xffff0fa0)();
+}
+#define ROCKSDB_HAVE_MEMORY_BARRIER
+
+#endif
+
+// AtomicPointer built using platform-specific MemoryBarrier()
+#if defined(ROCKSDB_HAVE_MEMORY_BARRIER)
+class AtomicPointer {
+ private:
+  void* rep_;
+ public:
+  // Default ctor leaves rep_ deliberately uninitialized (arbitrary value).
+  AtomicPointer() { }
+  explicit AtomicPointer(void* p) : rep_(p) {}
+  // Plain load/store with no ordering guarantees.
+  inline void* NoBarrier_Load() const { return rep_; }
+  inline void NoBarrier_Store(void* v) { rep_ = v; }
+  // Load, then fence: later accesses cannot be reordered before the load.
+  inline void* Acquire_Load() const {
+    void* result = rep_;
+    MemoryBarrier();
+    return result;
+  }
+  // Fence, then store: earlier accesses cannot be reordered after the store.
+  inline void Release_Store(void* v) {
+    MemoryBarrier();
+    rep_ = v;
+  }
+};
+
+// AtomicPointer based on <atomic>
+#elif defined(ROCKSDB_ATOMIC_PRESENT)
+class AtomicPointer {
+ private:
+  std::atomic<void*> rep_;
+ public:
+  AtomicPointer() { }
+  explicit AtomicPointer(void* v) : rep_(v) { }
+  inline void* Acquire_Load() const {
+    return rep_.load(std::memory_order_acquire);
+  }
+  inline void Release_Store(void* v) {
+    rep_.store(v, std::memory_order_release);
+  }
+  // Relaxed ordering: atomicity only, no synchronization.
+  inline void* NoBarrier_Load() const {
+    return rep_.load(std::memory_order_relaxed);
+  }
+  inline void NoBarrier_Store(void* v) {
+    rep_.store(v, std::memory_order_relaxed);
+  }
+};
+
+// We have neither MemoryBarrier(), nor <cstdatomic>
+#else
+#error Please implement AtomicPointer for this platform.
+
+#endif
+
+#undef ROCKSDB_HAVE_MEMORY_BARRIER
+#undef ARCH_CPU_X86_FAMILY
+#undef ARCH_CPU_ARM_FAMILY
+
+}  // namespace port
+}  // namespace rocksdb
+
+#endif  // PORT_ATOMIC_POINTER_H_
diff --git a/port/port.h b/port/port.h
new file mode 100644 (file)
index 0000000..2dc9a0f
--- /dev/null
@@ -0,0 +1,22 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_H_
+#define STORAGE_LEVELDB_PORT_PORT_H_
+
+#include <string.h>
+
+// Include the appropriate platform specific file below.  If you are
+// porting to a new platform, see "port_example.h" for documentation
+// of what the new port_<platform>.h file must provide.
+#if defined(ROCKSDB_PLATFORM_POSIX)
+#  include "port/port_posix.h"
+#endif
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_H_
diff --git a/port/port_example.h b/port/port_example.h
new file mode 100644 (file)
index 0000000..64a5791
--- /dev/null
@@ -0,0 +1,140 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// This file contains the specification, but not the implementations,
+// of the types/operations/etc. that should be defined by a platform
+// specific port_<platform>.h file.  Use this file as a reference for
+// how to port this package to a new platform.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
+#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
+
+namespace rocksdb {
+namespace port {
+
+// TODO(jorlow): Many of these belong more in the environment class rather than
+//               here. We should try moving them and see if it affects perf.
+
+// The following boolean constant must be true on a little-endian machine
+// and false otherwise.
+static const bool kLittleEndian = true /* or some other expression */;
+
+// ------------------ Threading -------------------
+
+// A Mutex represents an exclusive lock.
+// Specification only — this header documents what each port must provide
+// and is not meant to be compiled (see the file comment above).
+class Mutex {
+ public:
+  Mutex();
+  ~Mutex();
+
+  // Lock the mutex.  Waits until other lockers have exited.
+  // Will deadlock if the mutex is already locked by this thread.
+  void Lock();
+
+  // Unlock the mutex.
+  // REQUIRES: This mutex was locked by this thread.
+  void Unlock();
+
+  // Optionally crash if this thread does not hold this mutex.
+  // The implementation must be fast, especially if NDEBUG is
+  // defined.  The implementation is allowed to skip all checks.
+  void AssertHeld();
+};
+
+// Condition variable bound to a single Mutex (specification only).
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+
+  // Atomically release *mu and block on this condition variable until
+  // either a call to SignalAll(), or a call to Signal() that picks
+  // this thread to wakeup.
+  // REQUIRES: this thread holds *mu
+  void Wait();
+
+  // If there are some threads waiting, wake up at least one of them.
+  void Signal();
+
+  // Wake up all waiting threads.
+  // NOTE(review): "SignallAll" looks like a typo for "SignalAll" (the name
+  // Wait()'s comment uses) — confirm against the per-platform ports.
+  void SignallAll();
+};
+
+// Thread-safe initialization.
+// Used as follows:
+//      static port::OnceType init_control = LEVELDB_ONCE_INIT;
+//      static void Initializer() { ... do something ...; }
+//      ...
+//      port::InitOnce(&init_control, &Initializer);
+typedef intptr_t OnceType;
+#define LEVELDB_ONCE_INIT 0
+extern void InitOnce(port::OnceType*, void (*initializer)());
+
+// A type that holds a pointer that can be read or written atomically
+// (i.e., without word-tearing.)
+class AtomicPointer {
+ private:
+  intptr_t rep_;
+ public:
+  // Initialize to arbitrary value
+  AtomicPointer();
+
+  // Initialize to hold v
+  // NOTE(review): initializing an intptr_t member from a void* would not
+  // compile as written — presumably fine because this header is a
+  // specification, not compiled code (see the file comment above).
+  explicit AtomicPointer(void* v) : rep_(v) { }
+
+  // Read and return the stored pointer with the guarantee that no
+  // later memory access (read or write) by this thread can be
+  // reordered ahead of this read.
+  void* Acquire_Load() const;
+
+  // Set v as the stored pointer with the guarantee that no earlier
+  // memory access (read or write) by this thread can be reordered
+  // after this store.
+  void Release_Store(void* v);
+
+  // Read the stored pointer with no ordering guarantees.
+  void* NoBarrier_Load() const;
+
+  // Set va as the stored pointer with no ordering guarantees.
+  void NoBarrier_Store(void* v);
+};
+
+// ------------------ Compression -------------------
+
+// Store the snappy compression of "input[0,input_length-1]" in *output.
+// Returns false if snappy is not supported by this port.
+extern bool Snappy_Compress(const char* input, size_t input_length,
+                            std::string* output);
+
+// If input[0,input_length-1] looks like a valid snappy compressed
+// buffer, store the size of the uncompressed data in *result and
+// return true.  Else return false.
+extern bool Snappy_GetUncompressedLength(const char* input, size_t length,
+                                         size_t* result);
+
+// Attempt to snappy uncompress input[0,input_length-1] into *output.
+// Returns true if successful, false if the input is invalid lightweight
+// compressed data.
+//
+// REQUIRES: at least the first "n" bytes of output[] must be writable
+// where "n" is the result of a successful call to
+// Snappy_GetUncompressedLength.
+extern bool Snappy_Uncompress(const char* input_data, size_t input_length,
+                              char* output);
+
+// ------------------ Miscellaneous -------------------
+
+// If heap profiling is not supported, returns false.
+// Else repeatedly calls (*func)(arg, data, n) and then returns true.
+// The concatenation of all "data[0,n-1]" fragments is the heap profile.
+extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg);
+
+}  // namespace port
+}  // namespace rocksdb
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
diff --git a/port/port_posix.cc b/port/port_posix.cc
new file mode 100644 (file)
index 0000000..f7025f4
--- /dev/null
@@ -0,0 +1,86 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "port/port_posix.h"
+
+#include <cstdlib>
+#include <stdio.h>
+#include <string.h>
+#include "util/logging.h"
+
+namespace rocksdb {
+namespace port {
+
+// Abort the process with a diagnostic if a pthread call fails.
+// pthread functions report failure via their return value (an errno-style
+// code), not via the errno variable, so "result" goes to strerror() directly.
+static void PthreadCall(const char* label, int result) {
+  if (result != 0) {
+    fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
+    abort();
+  }
+}
+
+// On Linux, "adaptive" selects PTHREAD_MUTEX_ADAPTIVE_NP (spins briefly
+// before sleeping — NOTE(review): glibc extension, confirm availability on
+// target libc).  On other platforms the flag is ignored and a default
+// mutex is created.
+Mutex::Mutex(bool adaptive) {
+#ifdef OS_LINUX
+  if (!adaptive) {
+    PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL));
+  } else {
+    pthread_mutexattr_t mutex_attr;
+    PthreadCall("init mutex attr", pthread_mutexattr_init(&mutex_attr));
+    PthreadCall("set mutex attr",
+                pthread_mutexattr_settype(&mutex_attr,
+                                          PTHREAD_MUTEX_ADAPTIVE_NP));
+    PthreadCall("init mutex", pthread_mutex_init(&mu_, &mutex_attr));
+    // The attr object is only needed during init and can be destroyed here.
+    PthreadCall("destroy mutex attr",
+                pthread_mutexattr_destroy(&mutex_attr));
+  }
+#else // ignore adaptive for non-linux platform
+  PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL));
+#endif // OS_LINUX
+}
+
+Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }
+
+void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); }
+
+void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); }
+
+CondVar::CondVar(Mutex* mu)
+    : mu_(mu) {
+    PthreadCall("init cv", pthread_cond_init(&cv_, NULL));
+}
+
+CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); }
+
+// REQUIRES: the associated Mutex is held by the caller; it is released
+// while waiting and re-acquired before returning (pthread_cond_wait).
+void CondVar::Wait() {
+  PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
+}
+
+// Wake at most one waiter.
+void CondVar::Signal() {
+  PthreadCall("signal", pthread_cond_signal(&cv_));
+}
+
+// Wake all waiters.
+void CondVar::SignalAll() {
+  PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
+}
+
+RWMutex::RWMutex() { PthreadCall("init mutex", pthread_rwlock_init(&mu_, NULL)); }
+
+RWMutex::~RWMutex() { PthreadCall("destroy mutex", pthread_rwlock_destroy(&mu_)); }
+
+void RWMutex::ReadLock() { PthreadCall("read lock", pthread_rwlock_rdlock(&mu_)); }
+
+void RWMutex::WriteLock() { PthreadCall("write lock", pthread_rwlock_wrlock(&mu_)); }
+
+// Releases either a read or a write lock (pthread_rwlock_unlock handles both).
+void RWMutex::Unlock() { PthreadCall("unlock", pthread_rwlock_unlock(&mu_)); }
+
+// Run "initializer" exactly once across all threads for a given OnceType.
+void InitOnce(OnceType* once, void (*initializer)()) {
+  PthreadCall("once", pthread_once(once, initializer));
+}
+
+}  // namespace port
+}  // namespace rocksdb
diff --git a/port/port_posix.h b/port/port_posix.h
new file mode 100644 (file)
index 0000000..15ab0dc
--- /dev/null
@@ -0,0 +1,421 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_
+#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_
+
+#undef PLATFORM_IS_LITTLE_ENDIAN
+#if defined(OS_MACOSX)
+  #include <machine/endian.h>
+  #if defined(__DARWIN_LITTLE_ENDIAN) && defined(__DARWIN_BYTE_ORDER)
+    #define PLATFORM_IS_LITTLE_ENDIAN \
+        (__DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN)
+  #endif
+#elif defined(OS_SOLARIS)
+  #include <sys/isa_defs.h>
+  #ifdef _LITTLE_ENDIAN
+    #define PLATFORM_IS_LITTLE_ENDIAN true
+  #else
+    #define PLATFORM_IS_LITTLE_ENDIAN false
+  #endif
+#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) ||\
+      defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID)
+  #include <sys/types.h>
+  #include <sys/endian.h>
+#else
+  #include <endian.h>
+#endif
+#include <pthread.h>
+#ifdef SNAPPY
+#include <snappy.h>
+#endif
+
+#ifdef ZLIB
+#include <zlib.h>
+#endif
+
+#ifdef BZIP2
+#include <bzlib.h>
+#endif
+
+#include <stdint.h>
+#include <string>
+#include <string.h>
+#include "rocksdb/options.h"
+#include "port/atomic_pointer.h"
+
+#ifndef PLATFORM_IS_LITTLE_ENDIAN
+#define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN)
+#endif
+
+#if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) ||\
+    defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) ||\
+    defined(OS_ANDROID)
+// Use fread/fwrite/fflush on platforms without _unlocked variants
+#define fread_unlocked fread
+#define fwrite_unlocked fwrite
+#define fflush_unlocked fflush
+#endif
+
+#if defined(OS_MACOSX) || defined(OS_FREEBSD) ||\
+    defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD)
+// Use fsync() on platforms without fdatasync()
+#define fdatasync fsync
+#endif
+
+#if defined(OS_ANDROID) && __ANDROID_API__ < 9
+// fdatasync() was only introduced in API level 9 on Android. Use fsync()
+// when targeting older platforms.
+#define fdatasync fsync
+#endif
+
+namespace rocksdb {
+namespace port {
+
+static const bool kLittleEndian = PLATFORM_IS_LITTLE_ENDIAN;
+#undef PLATFORM_IS_LITTLE_ENDIAN
+
+class CondVar;
+
+// Thin wrapper over pthread_mutex_t.  "adaptive" is honored only on Linux
+// (see port_posix.cc); elsewhere it is ignored.
+class Mutex {
+ public:
+  /* implicit */ Mutex(bool adaptive = false);
+  ~Mutex();
+
+  void Lock();
+  void Unlock();
+  // No-op in this port: ownership is not tracked, so there is nothing to
+  // assert.  Kept so callers can document lock expectations.
+  void AssertHeld() { }
+
+ private:
+  friend class CondVar;
+  pthread_mutex_t mu_;
+
+  // No copying
+  Mutex(const Mutex&);
+  void operator=(const Mutex&);
+};
+
+// Thin wrapper over pthread_rwlock_t: multiple readers or one writer.
+class RWMutex {
+ public:
+  RWMutex();
+  ~RWMutex();
+
+  void ReadLock();
+  void WriteLock();
+  // Releases whichever of the two lock modes is currently held.
+  void Unlock();
+  // No-op, as with Mutex::AssertHeld() above.
+  void AssertHeld() { }
+
+ private:
+  pthread_rwlock_t mu_; // the underlying platform mutex
+
+  // No copying allowed
+  RWMutex(const RWMutex&);
+  void operator=(const RWMutex&);
+};
+
+// Condition variable bound to a single Mutex for its lifetime.
+// Wait() must be called with that Mutex held.
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+  void Wait();
+  void Signal();
+  void SignalAll();
+ private:
+  pthread_cond_t cv_;
+  Mutex* mu_;
+};
+
+typedef pthread_once_t OnceType;
+#define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT
+extern void InitOnce(OnceType* once, void (*initializer)());
+
+// Compress "input[0,length-1]" with snappy into *output.  Returns false when
+// snappy support is not compiled in.  The CompressionOptions are accepted for
+// signature uniformity with the other compressors but snappy does not use
+// them.
+inline bool Snappy_Compress(const CompressionOptions& opts, const char* input,
+                            size_t length, ::std::string* output) {
+#ifdef SNAPPY
+  output->resize(snappy::MaxCompressedLength(length));
+  size_t outlen;
+  snappy::RawCompress(input, length, &(*output)[0], &outlen);
+  output->resize(outlen);
+  return true;
+#else
+  // Previously "return false;" sat after #endif, leaving unreachable code
+  // whenever SNAPPY was defined; #else expresses the intent directly.
+  return false;
+#endif
+}
+
+// If snappy is compiled in, store the uncompressed length of
+// input[0,length-1] in *result and return true; returns false on malformed
+// input or when snappy support is unavailable.
+inline bool Snappy_GetUncompressedLength(const char* input, size_t length,
+                                         size_t* result) {
+#ifdef SNAPPY
+  return snappy::GetUncompressedLength(input, length, result);
+#else
+  return false;
+#endif
+}
+
+// Uncompress snappy data into "output", which must have room for at least
+// the number of bytes reported by Snappy_GetUncompressedLength().  Returns
+// false on corrupt input or when snappy support is unavailable.
+inline bool Snappy_Uncompress(const char* input, size_t length,
+                              char* output) {
+#ifdef SNAPPY
+  return snappy::RawUncompress(input, length, output);
+#else
+  return false;
+#endif
+}
+
+// Deflate "input[0,length-1]" into *output using the level, window_bits and
+// strategy from "opts".  Returns true on success; false if zlib support is
+// not compiled in or (de)init/compression fails.
+inline bool Zlib_Compress(const CompressionOptions& opts, const char* input,
+                          size_t length, ::std::string* output) {
+#ifdef ZLIB
+  // The memLevel parameter specifies how much memory should be allocated for
+  // the internal compression state.
+  // memLevel=1 uses minimum memory but is slow and reduces compression ratio.
+  // memLevel=9 uses maximum memory for optimal speed.
+  // The default value is 8. See zconf.h for more details.
+  static const int memLevel = 8;
+  z_stream _stream;
+  memset(&_stream, 0, sizeof(z_stream));
+  int st = deflateInit2(&_stream, opts.level, Z_DEFLATED, opts.window_bits,
+                        memLevel, opts.strategy);
+  if (st != Z_OK) {
+    return false;
+  }
+
+  // Resize output to be the plain data length.
+  // This may not be big enough if the compression actually expands data.
+  output->resize(length);
+
+  // Compress the input, and put compressed data in output.
+  _stream.next_in = (Bytef *)input;
+  _stream.avail_in = length;
+
+  // Initialize the output size.
+  _stream.avail_out = length;
+  _stream.next_out = (Bytef *)&(*output)[0];
+
+  int old_sz =0, new_sz =0, new_sz_delta =0;
+  bool done = false;
+  // Z_FINISH: compress everything in one pass; Z_OK here means the output
+  // buffer filled up before the stream could finish, so we grow and retry.
+  while (!done) {
+    int st = deflate(&_stream, Z_FINISH);
+    switch (st) {
+      case Z_STREAM_END:
+        done = true;
+        break;
+      case Z_OK:
+        // No output space. Increase the output space by 20%.
+        // (Should we fail the compression since it expands the size?)
+        old_sz = output->size();
+        new_sz_delta = (int)(output->size() * 0.2);
+        new_sz = output->size() + (new_sz_delta < 10 ? 10 : new_sz_delta);
+        output->resize(new_sz);
+        // Set more output.
+        _stream.next_out = (Bytef *)&(*output)[old_sz];
+        _stream.avail_out = new_sz - old_sz;
+        break;
+      case Z_BUF_ERROR:
+      default:
+        deflateEnd(&_stream);
+        return false;
+    }
+  }
+
+  // Trim to the bytes actually produced.
+  output->resize(output->size() - _stream.avail_out);
+  deflateEnd(&_stream);
+  return true;
+#endif
+  return false;
+}
+
+// Inflate "input_data[0,input_length-1]".  On success returns a buffer
+// allocated with new[] containing the plain bytes and sets *decompress_size;
+// the CALLER owns the buffer and must delete[] it.  Returns nullptr on
+// corrupt input or when zlib support is unavailable.
+inline char* Zlib_Uncompress(const char* input_data, size_t input_length,
+    int* decompress_size, int windowBits = -14) {
+#ifdef ZLIB
+  z_stream _stream;
+  memset(&_stream, 0, sizeof(z_stream));
+
+  // For raw inflate, the windowBits should be -8..-15.
+  // If windowBits is bigger than zero, it will use either zlib
+  // header or gzip header. Adding 32 to it will do automatic detection.
+  int st = inflateInit2(&_stream,
+      windowBits > 0 ? windowBits + 32 : windowBits);
+  if (st != Z_OK) {
+    return nullptr;
+  }
+
+  _stream.next_in = (Bytef *)input_data;
+  _stream.avail_in = input_length;
+
+  // Assume the decompressed data size will be 5x of compressed size.
+  int output_len = input_length * 5;
+  char* output = new char[output_len];
+  int old_sz = output_len;
+
+  _stream.next_out = (Bytef *)output;
+  _stream.avail_out = output_len;
+
+  char* tmp = nullptr;
+  int output_len_delta;
+  bool done = false;
+
+  //while(_stream.next_in != nullptr && _stream.avail_in != 0) {
+  while (!done) {
+    int st = inflate(&_stream, Z_SYNC_FLUSH);
+    switch (st) {
+      case Z_STREAM_END:
+        done = true;
+        break;
+      case Z_OK:
+        // No output space. Increase the output space by 20%.
+        old_sz = output_len;
+        output_len_delta = (int)(output_len * 0.2);
+        output_len += output_len_delta < 10 ? 10 : output_len_delta;
+        tmp = new char[output_len];
+        memcpy(tmp, output, old_sz);
+        delete[] output;
+        output = tmp;
+
+        // Set more output.
+        _stream.next_out = (Bytef *)(output + old_sz);
+        _stream.avail_out = output_len - old_sz;
+        break;
+      case Z_BUF_ERROR:
+      default:
+        delete[] output;
+        inflateEnd(&_stream);
+        return nullptr;
+    }
+  }
+
+  *decompress_size = output_len - _stream.avail_out;
+  inflateEnd(&_stream);
+  return output;
+#endif
+
+  return nullptr;
+}
+
+inline bool BZip2_Compress(const CompressionOptions& opts, const char* input,
+                           size_t length, ::std::string* output) {
+#ifdef BZIP2
+  bz_stream _stream;
+  memset(&_stream, 0, sizeof(bz_stream));
+
+  // Block size 1 is 100K.
+  // 0 is for silent.
+  // 30 is the default workFactor
+  int st = BZ2_bzCompressInit(&_stream, 1, 0, 30);
+  if (st != BZ_OK) {
+    return false;
+  }
+
+  // Resize output to be the plain data length.
+  // This may not be big enough if the compression actually expands data.
+  output->resize(length);
+
+  // Compress the input, and put compressed data in output.
+  _stream.next_in = (char *)input;
+  _stream.avail_in = length;
+
+  // Initialize the output size.
+  _stream.next_out = (char *)&(*output)[0];
+  _stream.avail_out = length;
+
+  int old_sz =0, new_sz =0;
+  while(_stream.next_in != nullptr && _stream.avail_in != 0) {
+    int st = BZ2_bzCompress(&_stream, BZ_FINISH);
+    switch (st) {
+      case BZ_STREAM_END:
+        break;
+      case BZ_FINISH_OK:
+        // No output space. Increase the output space by 20%.
+        // (Should we fail the compression since it expands the size?)
+        old_sz = output->size();
+        new_sz = (int)(output->size() * 1.2);
+        output->resize(new_sz);
+        // Set more output.
+        _stream.next_out = (char *)&(*output)[old_sz];
+        _stream.avail_out = new_sz - old_sz;
+        break;
+      case BZ_SEQUENCE_ERROR:
+      default:
+        BZ2_bzCompressEnd(&_stream);
+        return false;
+    }
+  }
+
+  output->resize(output->size() - _stream.avail_out);
+  BZ2_bzCompressEnd(&_stream);
+  return true;
+  return output;
+#endif
+  return false;
+}
+
+inline char*  BZip2_Uncompress(const char* input_data, size_t input_length,
+    int* decompress_size) {
+#ifdef BZIP2
+  bz_stream _stream;
+  memset(&_stream, 0, sizeof(bz_stream));
+
+  int st = BZ2_bzDecompressInit(&_stream, 0, 0);
+  if (st != BZ_OK) {
+    return nullptr;
+  }
+
+  _stream.next_in = (char *)input_data;
+  _stream.avail_in = input_length;
+
+  // Assume the decompressed data size will be 5x of compressed size.
+  int output_len = input_length * 5;
+  char* output = new char[output_len];
+  int old_sz = output_len;
+
+  _stream.next_out = (char *)output;
+  _stream.avail_out = output_len;
+
+  char* tmp = nullptr;
+
+  while(_stream.next_in != nullptr && _stream.avail_in != 0) {
+    int st = BZ2_bzDecompress(&_stream);
+    switch (st) {
+      case BZ_STREAM_END:
+        break;
+      case BZ_OK:
+        // No output space. Increase the output space by 20%.
+        old_sz = output_len;
+        output_len = (int)(output_len * 1.2);
+        tmp = new char[output_len];
+        memcpy(tmp, output, old_sz);
+        delete[] output;
+        output = tmp;
+
+        // Set more output.
+        _stream.next_out = (char *)(output + old_sz);
+        _stream.avail_out = output_len - old_sz;
+        break;
+      case Z_BUF_ERROR:
+      default:
+        delete[] output;
+        BZ2_bzDecompressEnd(&_stream);
+        return nullptr;
+    }
+  }
+
+  *decompress_size = output_len - _stream.avail_out;
+  BZ2_bzDecompressEnd(&_stream);
+  return output;
+#endif
+  return nullptr;
+}
+
+// Heap profiling is not supported in the posix port: always returns false
+// and never invokes "func".
+inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
+  return false;
+}
+
+} // namespace port
+} // namespace rocksdb
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_POSIX_H_
diff --git a/port/stack_trace.cc b/port/stack_trace.cc
new file mode 100644 (file)
index 0000000..aa01fd0
--- /dev/null
@@ -0,0 +1,102 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "util/stack_trace.h"
+
+#ifdef OS_LINUX
+
+#include <execinfo.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+namespace rocksdb {
+
+// Resolve /proc/<pid>/exe to get the path of the current executable, or
+// return nullptr if the link cannot be read.  The result points into a
+// function-local static buffer: later calls overwrite it, and it is not
+// thread-safe.
+static const char* GetExecutableName()
+{
+  static char name[1024];
+
+  char link[1024];
+  snprintf(link, sizeof(link), "/proc/%d/exe", getpid());
+  // Reserve one byte for the NUL terminator: readlink() does not
+  // NUL-terminate, and a full-length result would otherwise make
+  // "name[read] = 0" write one byte past the end of the buffer.
+  auto read = readlink(link, name, sizeof(name) - 1);
+  if (-1 == read) {
+    return nullptr;
+  } else {
+    name[read] = 0;
+    return name;
+  }
+}
+
+void PrintStack(int first_frames_to_skip) {
+  const int kMaxFrames = 100;
+  void *frames[kMaxFrames];
+
+  auto num_frames = backtrace(frames, kMaxFrames);
+  auto symbols = backtrace_symbols(frames, num_frames);
+
+  auto executable = GetExecutableName();
+
+  for (int i = first_frames_to_skip; i < num_frames; ++i) {
+    fprintf(stderr, "#%-2d  ", i - first_frames_to_skip);
+    if (symbols) {
+      fprintf(stderr, "%s ", symbols[i]);
+    }
+    if (executable) {
+      // out source to addr2line, for the address translation
+      const int kLineMax = 256;
+      char cmd[kLineMax];
+      sprintf(cmd, "addr2line %p -e %s -f -C 2>&1", frames[i], executable);
+      auto f = popen(cmd, "r");
+      if (f) {
+        char line[kLineMax];
+        while (fgets(line, sizeof(line), f)) {
+          line[strlen(line) - 1] = 0; // remove newline
+          fprintf(stderr, "%s\t", line);
+        }
+        pclose(f);
+      }
+    } else {
+      fprintf(stderr, " %p", frames[i]);
+    }
+    fprintf(stderr, "\n");
+  }
+}
+
+// Signal handler: print the stack, then re-raise the signal with the default
+// handler restored so the process still terminates (and can dump core).
+static void StackTraceHandler(int sig) {
+  // reset to default handler
+  signal(sig, SIG_DFL);
+  fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig));
+  // skip the top three signal handler related frames
+  PrintStack(3);
+  // re-signal to default handler (so we still get core dump if needed...)
+  raise(sig);
+}
+
+// Install StackTraceHandler for the common fatal signals.
+void InstallStackTraceHandler() {
+  // just use the plain old signal as it's simple and sufficient
+  // for this use case
+  signal(SIGILL, StackTraceHandler);
+  signal(SIGSEGV, StackTraceHandler);
+  signal(SIGBUS, StackTraceHandler);
+  signal(SIGABRT, StackTraceHandler);
+
+  printf("Installed stack trace handler for SIGILL SIGSEGV SIGBUS SIGABRT\n");
+
+}
+
+}   // namespace rocksdb
+
+#else // no-op for non-linux system for now
+
+namespace rocksdb {
+
+void InstallStackTraceHandler() {}
+void PrintStack(int first_frames_to_skip) {}
+
+}
+
+#endif // OS_LINUX
diff --git a/port/win/stdint.h b/port/win/stdint.h
new file mode 100644 (file)
index 0000000..39edd0d
--- /dev/null
@@ -0,0 +1,24 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// MSVC didn't ship with this file until the 2010 version.
+
+#ifndef STORAGE_LEVELDB_PORT_WIN_STDINT_H_
+#define STORAGE_LEVELDB_PORT_WIN_STDINT_H_
+
+#if !defined(_MSC_VER)
+#error This file should only be included when compiling with MSVC.
+#endif
+
+// Define C99 equivalent types.
+typedef signed char           int8_t;
+typedef signed short          int16_t;
+typedef signed int            int32_t;
+typedef signed long long      int64_t;
+typedef unsigned char         uint8_t;
+typedef unsigned short        uint16_t;
+typedef unsigned int          uint32_t;
+typedef unsigned long long    uint64_t;
+
+#endif  // STORAGE_LEVELDB_PORT_WIN_STDINT_H_
diff --git a/table/block.cc b/table/block.cc
new file mode 100644 (file)
index 0000000..3f969fe
--- /dev/null
@@ -0,0 +1,274 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Decodes the blocks generated by block_builder.cc.
+
+#include "table/block.h"
+
+#include <vector>
+#include <algorithm>
+#include "rocksdb/comparator.h"
+#include "table/format.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+// Number of restart points, stored as the trailing fixed32 of the block.
+// REQUIRES: size_ is at least two fixed32s (restart count + one offset).
+inline uint32_t Block::NumRestarts() const {
+  assert(size_ >= 2*sizeof(uint32_t));
+  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+}
+
+// Takes a (possibly heap-allocated) byte range; if the trailer is
+// inconsistent, size_ is forced to 0 as a sticky error marker that
+// NewIterator() later reports as corruption.
+Block::Block(const BlockContents& contents)
+    : data_(contents.data.data()),
+      size_(contents.data.size()),
+      owned_(contents.heap_allocated),
+      cachable_(contents.cachable),
+      compression_type_(contents.compression_type) {
+  if (size_ < sizeof(uint32_t)) {
+    size_ = 0;  // Error marker
+  } else {
+    restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t);
+    if (restart_offset_ > size_ - sizeof(uint32_t)) {
+      // The size is too small for NumRestarts() and therefore
+      // restart_offset_ wrapped around.
+      size_ = 0;
+    }
+  }
+}
+
+// data_ is released only when this Block owns it (heap_allocated contents).
+Block::~Block() {
+  if (owned_) {
+    delete[] data_;
+  }
+}
+
+// Helper routine: decode the next block entry starting at "p",
+// storing the number of shared key bytes, non_shared key bytes,
+// and the length of the value in "*shared", "*non_shared", and
+// "*value_length", respectively.  Will not dereference past "limit".
+//
+// If any errors are detected, returns nullptr.  Otherwise, returns a
+// pointer to the key delta (just past the three decoded values).
+static inline const char* DecodeEntry(const char* p, const char* limit,
+                                      uint32_t* shared,
+                                      uint32_t* non_shared,
+                                      uint32_t* value_length) {
+  if (limit - p < 3) return nullptr;
+  *shared = reinterpret_cast<const unsigned char*>(p)[0];
+  *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+  *value_length = reinterpret_cast<const unsigned char*>(p)[2];
+  if ((*shared | *non_shared | *value_length) < 128) {
+    // Fast path: all three values are encoded in one byte each
+    p += 3;
+  } else {
+    if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+    if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+    if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr;
+  }
+
+  // Reject entries whose key delta + value would run past the restart array.
+  if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
+    return nullptr;
+  }
+  return p;
+}
+
+// Iterator over one block.  Entry keys are prefix-compressed against the
+// previous entry; restart points (fixed32 offsets stored before the trailer)
+// mark entries whose keys are stored in full, enabling binary search.
+class Block::Iter : public Iterator {
+ private:
+  const Comparator* const comparator_;
+  const char* const data_;      // underlying block contents
+  uint32_t const restarts_;     // Offset of restart array (list of fixed32)
+  uint32_t const num_restarts_; // Number of uint32_t entries in restart array
+
+  // current_ is offset in data_ of current entry.  >= restarts_ if !Valid
+  uint32_t current_;
+  uint32_t restart_index_;  // Index of restart block in which current_ falls
+  std::string key_;
+  Slice value_;
+  Status status_;
+
+  inline int Compare(const Slice& a, const Slice& b) const {
+    return comparator_->Compare(a, b);
+  }
+
+  // Return the offset in data_ just past the end of the current entry.
+  inline uint32_t NextEntryOffset() const {
+    return (value_.data() + value_.size()) - data_;
+  }
+
+  // Offset in data_ of the index-th restart entry.
+  uint32_t GetRestartPoint(uint32_t index) {
+    assert(index < num_restarts_);
+    return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
+  }
+
+  void SeekToRestartPoint(uint32_t index) {
+    key_.clear();
+    restart_index_ = index;
+    // current_ will be fixed by ParseNextKey();
+
+    // ParseNextKey() starts at the end of value_, so set value_ accordingly
+    uint32_t offset = GetRestartPoint(index);
+    value_ = Slice(data_ + offset, 0);
+  }
+
+ public:
+  // Starts invalid (current_ == restarts_); position with a Seek* call.
+  Iter(const Comparator* comparator,
+       const char* data,
+       uint32_t restarts,
+       uint32_t num_restarts)
+      : comparator_(comparator),
+        data_(data),
+        restarts_(restarts),
+        num_restarts_(num_restarts),
+        current_(restarts_),
+        restart_index_(num_restarts_) {
+    assert(num_restarts_ > 0);
+  }
+
+  virtual bool Valid() const { return current_ < restarts_; }
+  virtual Status status() const { return status_; }
+  virtual Slice key() const {
+    assert(Valid());
+    return key_;
+  }
+  virtual Slice value() const {
+    assert(Valid());
+    return value_;
+  }
+
+  virtual void Next() {
+    assert(Valid());
+    ParseNextKey();
+  }
+
+  // Prefix compression only works forward, so stepping back means
+  // re-parsing forward from the nearest earlier restart point.
+  virtual void Prev() {
+    assert(Valid());
+
+    // Scan backwards to a restart point before current_
+    const uint32_t original = current_;
+    while (GetRestartPoint(restart_index_) >= original) {
+      if (restart_index_ == 0) {
+        // No more entries
+        current_ = restarts_;
+        restart_index_ = num_restarts_;
+        return;
+      }
+      restart_index_--;
+    }
+
+    SeekToRestartPoint(restart_index_);
+    do {
+      // Loop until end of current entry hits the start of original entry
+    } while (ParseNextKey() && NextEntryOffset() < original);
+  }
+
+  virtual void Seek(const Slice& target) {
+    // Binary search in restart array to find the first restart point
+    // with a key >= target
+    uint32_t left = 0;
+    uint32_t right = num_restarts_ - 1;
+    while (left < right) {
+      uint32_t mid = (left + right + 1) / 2;
+      uint32_t region_offset = GetRestartPoint(mid);
+      uint32_t shared, non_shared, value_length;
+      const char* key_ptr = DecodeEntry(data_ + region_offset,
+                                        data_ + restarts_,
+                                        &shared, &non_shared, &value_length);
+      // A restart entry never shares a prefix with its predecessor, so a
+      // non-zero "shared" here indicates corruption.
+      if (key_ptr == nullptr || (shared != 0)) {
+        CorruptionError();
+        return;
+      }
+      Slice mid_key(key_ptr, non_shared);
+      if (Compare(mid_key, target) < 0) {
+        // Key at "mid" is smaller than "target".  Therefore all
+        // blocks before "mid" are uninteresting.
+        left = mid;
+      } else {
+        // Key at "mid" is >= "target".  Therefore all blocks at or
+        // after "mid" are uninteresting.
+        right = mid - 1;
+      }
+    }
+
+    // Linear search (within restart block) for first key >= target
+    SeekToRestartPoint(left);
+    while (true) {
+      if (!ParseNextKey()) {
+        return;
+      }
+      if (Compare(key_, target) >= 0) {
+        return;
+      }
+    }
+  }
+
+  virtual void SeekToFirst() {
+    SeekToRestartPoint(0);
+    ParseNextKey();
+  }
+
+  virtual void SeekToLast() {
+    SeekToRestartPoint(num_restarts_ - 1);
+    while (ParseNextKey() && NextEntryOffset() < restarts_) {
+      // Keep skipping
+    }
+  }
+
+ private:
+  // Mark the iterator invalid and record a corruption status.
+  void CorruptionError() {
+    current_ = restarts_;
+    restart_index_ = num_restarts_;
+    status_ = Status::Corruption("bad entry in block");
+    key_.clear();
+    value_.clear();
+  }
+
+  // Decode the entry after the current one.  Returns false (leaving the
+  // iterator invalid) at end-of-block or on corruption.
+  bool ParseNextKey() {
+    current_ = NextEntryOffset();
+    const char* p = data_ + current_;
+    const char* limit = data_ + restarts_;  // Restarts come right after data
+    if (p >= limit) {
+      // No more entries to return.  Mark as invalid.
+      current_ = restarts_;
+      restart_index_ = num_restarts_;
+      return false;
+    }
+
+    // Decode next entry
+    uint32_t shared, non_shared, value_length;
+    p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
+    if (p == nullptr || key_.size() < shared) {
+      CorruptionError();
+      return false;
+    } else {
+      // Rebuild the full key: shared prefix of the previous key + delta.
+      key_.resize(shared);
+      key_.append(p, non_shared);
+      value_ = Slice(p + non_shared, value_length);
+      // Keep restart_index_ bracketing current_ for later Prev() calls.
+      while (restart_index_ + 1 < num_restarts_ &&
+             GetRestartPoint(restart_index_ + 1) < current_) {
+        ++restart_index_;
+      }
+      return true;
+    }
+  }
+};
+
+// Returns a heap-allocated iterator over the block's entries; the caller
+// owns it.  A block whose trailer failed validation (size_ forced to 0 in
+// the constructor) yields an error iterator.
+Iterator* Block::NewIterator(const Comparator* cmp) {
+  // A well-formed block ends with at least the restart count plus one
+  // restart offset, i.e. two fixed32 values.
+  if (size_ < 2*sizeof(uint32_t)) {
+    return NewErrorIterator(Status::Corruption("bad block contents"));
+  }
+  const uint32_t num_restarts = NumRestarts();
+  if (num_restarts == 0) {
+    return NewEmptyIterator();
+  } else {
+    return new Iter(cmp, data_, restart_offset_, num_restarts);
+  }
+}
+
+}  // namespace rocksdb
diff --git a/table/block.h b/table/block.h
new file mode 100644 (file)
index 0000000..7fac006
--- /dev/null
@@ -0,0 +1,51 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+struct BlockContents;
+class Comparator;
+
+// Read-only view over one block's bytes.  The bytes may be owned by the
+// Block (heap_allocated contents are delete[]d in the destructor) or merely
+// borrowed.
+class Block {
+ public:
+  // Initialize the block with the specified contents.
+  explicit Block(const BlockContents& contents);
+
+  ~Block();
+
+  size_t size() const { return size_; }
+  bool   isCachable() const { return cachable_; }
+  CompressionType compressionType() const { return compression_type_; }
+  // Returns a new iterator over the block's entries; the caller owns it.
+  Iterator* NewIterator(const Comparator* comparator);
+  const char* data() { return data_; }
+
+ private:
+  uint32_t NumRestarts() const;
+
+  const char* data_;
+  size_t size_;
+  uint32_t restart_offset_;     // Offset in data_ of restart array
+  bool owned_;                  // Block owns data_[]
+  bool cachable_;
+  CompressionType compression_type_;
+
+  // No copying allowed
+  Block(const Block&);
+  void operator=(const Block&);
+
+  class Iter;
+};
+
+}  // namespace rocksdb
diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc
new file mode 100644 (file)
index 0000000..a5e546b
--- /dev/null
@@ -0,0 +1,559 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based_table_builder.h"
+
+#include <assert.h>
+#include <inttypes.h>
+#include <map>
+#include <stdio.h>
+
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/table.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "table/block_based_table_reader.h"
+#include "table/block.h"
+#include "table/block_builder.h"
+#include "table/filter_block.h"
+#include "table/format.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+namespace {
+
+struct BytewiseLessThan {
+  bool operator()(const std::string& key1, const std::string& key2) const {
+    // smaller entries will be placed in front.
+    return comparator->Compare(key1, key2) <= 0;
+  }
+  const Comparator* comparator = BytewiseComparator();
+};
+
+// When writing to a block that requires entries to be sorted by
+// `BytewiseComparator`, we can buffer the content to `BytewiseSortedMap`
+// before writng to store.
+typedef std::map<std::string, std::string, BytewiseLessThan> BytewiseSortedMap;
+
+void AddProperties(BytewiseSortedMap& props, std::string name, uint64_t val) {
+  assert(props.find(name) == props.end());
+
+  std::string dst;
+  PutVarint64(&dst, val);
+
+  props.insert(
+      std::make_pair(name, dst)
+  );
+}
+
+static bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
+  // Check to see if compressed less than 12.5%
+  return compressed_size < raw_size - (raw_size / 8u);
+}
+
+// Were we encounter any error occurs during user-defined statistics collection,
+// we'll write the warning message to info log.
+void LogPropertiesCollectionError(
+    Logger* info_log, const std::string& method, const std::string& name) {
+  assert(method == "Add" || method == "Finish");
+
+  std::string msg =
+    "[Warning] encountered error when calling TablePropertiesCollector::" +
+    method + "() with collector name: " + name;
+  Log(info_log, "%s", msg.c_str());
+}
+
+}  // anonymous namespace
+
+struct BlockBasedTableBuilder::Rep {
+  Options options;
+  WritableFile* file;
+  uint64_t offset = 0;
+  Status status;
+  BlockBuilder data_block;
+  BlockBuilder index_block;
+  std::string last_key;
+  CompressionType compression_type;
+  TableProperties props;
+
+  bool closed = false;  // Either Finish() or Abandon() has been called.
+  FilterBlockBuilder* filter_block;
+  char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize];
+  size_t compressed_cache_key_prefix_size;
+
+  BlockHandle pending_handle;  // Handle to add to index block
+
+  std::string compressed_output;
+  std::unique_ptr<FlushBlockPolicy> flush_block_policy;
+
+  Rep(const Options& opt,
+      WritableFile* f,
+      FlushBlockPolicyFactory* flush_block_policy_factory,
+      CompressionType compression_type)
+      : options(opt),
+        file(f),
+        data_block(options),
+        // To avoid linear scan, we make the block_restart_interval to be `1`
+        // in index block builder
+        index_block(1 /* block_restart_interval */, options.comparator),
+        compression_type(compression_type),
+        filter_block(opt.filter_policy == nullptr ? nullptr
+                     : new FilterBlockBuilder(opt)),
+        flush_block_policy(
+            flush_block_policy_factory->NewFlushBlockPolicy(data_block)) {
+  }
+};
+
+BlockBasedTableBuilder::BlockBasedTableBuilder(
+    const Options& options,
+    WritableFile* file,
+    FlushBlockPolicyFactory* flush_block_policy_factory,
+    CompressionType compression_type)
+    : rep_(new Rep(options,
+                   file, flush_block_policy_factory, compression_type)) {
+  if (rep_->filter_block != nullptr) {
+    rep_->filter_block->StartBlock(0);
+  }
+  if (options.block_cache_compressed.get() != nullptr) {
+    BlockBasedTable::GenerateCachePrefix(
+        options.block_cache_compressed.get(), file,
+        &rep_->compressed_cache_key_prefix[0],
+        &rep_->compressed_cache_key_prefix_size);
+  }
+}
+
+BlockBasedTableBuilder::~BlockBasedTableBuilder() {
+  assert(rep_->closed);  // Catch errors where caller forgot to call Finish()
+  delete rep_->filter_block;
+  delete rep_;
+}
+
+void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
+  Rep* r = rep_;
+  assert(!r->closed);
+  if (!ok()) return;
+  if (r->props.num_entries > 0) {
+    assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0);
+  }
+
+  auto should_flush = r->flush_block_policy->Update(key, value);
+  if (should_flush) {
+    assert(!r->data_block.empty());
+    Flush();
+
+    // Add item to index block.
+    // We do not emit the index entry for a block until we have seen the
+    // first key for the next data block.  This allows us to use shorter
+    // keys in the index block.  For example, consider a block boundary
+    // between the keys "the quick brown fox" and "the who".  We can use
+    // "the r" as the key for the index block entry since it is >= all
+    // entries in the first block and < all entries in subsequent
+    // blocks.
+    if (ok()) {
+      r->options.comparator->FindShortestSeparator(&r->last_key, key);
+      std::string handle_encoding;
+      r->pending_handle.EncodeTo(&handle_encoding);
+      r->index_block.Add(r->last_key, Slice(handle_encoding));
+    }
+  }
+
+  if (r->filter_block != nullptr) {
+    r->filter_block->AddKey(key);
+  }
+
+  r->last_key.assign(key.data(), key.size());
+  r->data_block.Add(key, value);
+  r->props.num_entries++;
+  r->props.raw_key_size += key.size();
+  r->props.raw_value_size += value.size();
+
+  for (auto collector : r->options.table_properties_collectors) {
+    Status s = collector->Add(key, value);
+    if (!s.ok()) {
+      LogPropertiesCollectionError(
+          r->options.info_log.get(),
+          "Add", /* method */
+          collector->Name()
+      );
+    }
+  }
+}
+
+void BlockBasedTableBuilder::Flush() {
+  Rep* r = rep_;
+  assert(!r->closed);
+  if (!ok()) return;
+  if (r->data_block.empty()) return;
+  WriteBlock(&r->data_block, &r->pending_handle);
+  if (ok()) {
+    r->status = r->file->Flush();
+  }
+  if (r->filter_block != nullptr) {
+    r->filter_block->StartBlock(r->offset);
+  }
+  r->props.data_size = r->offset;
+  ++r->props.num_data_blocks;
+}
+
+void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block,
+                                        BlockHandle* handle) {
+  // File format contains a sequence of blocks where each block has:
+  //    block_data: uint8[n]
+  //    type: uint8
+  //    crc: uint32
+  assert(ok());
+  Rep* r = rep_;
+  Slice raw = block->Finish();
+
+  Slice block_contents;
+  std::string* compressed = &r->compressed_output;
+  CompressionType type = r->compression_type;
+  switch (type) {
+    case kNoCompression:
+      block_contents = raw;
+      break;
+
+    case kSnappyCompression: {
+      std::string* compressed = &r->compressed_output;
+      if (port::Snappy_Compress(r->options.compression_opts, raw.data(),
+                                raw.size(), compressed) &&
+          GoodCompressionRatio(compressed->size(), raw.size())) {
+        block_contents = *compressed;
+      } else {
+        // Snappy not supported, or not good compression ratio, so just
+        // store uncompressed form
+        block_contents = raw;
+        type = kNoCompression;
+      }
+      break;
+    }
+    case kZlibCompression:
+      if (port::Zlib_Compress(r->options.compression_opts, raw.data(),
+                              raw.size(), compressed) &&
+          GoodCompressionRatio(compressed->size(), raw.size())) {
+        block_contents = *compressed;
+      } else {
+        // Zlib not supported, or not good compression ratio, so just
+        // store uncompressed form
+        block_contents = raw;
+        type = kNoCompression;
+      }
+      break;
+    case kBZip2Compression:
+      if (port::BZip2_Compress(r->options.compression_opts, raw.data(),
+                               raw.size(), compressed) &&
+          GoodCompressionRatio(compressed->size(), raw.size())) {
+        block_contents = *compressed;
+      } else {
+        // BZip not supported, or not good compression ratio, so just
+        // store uncompressed form
+        block_contents = raw;
+        type = kNoCompression;
+      }
+      break;
+  }
+  WriteRawBlock(block_contents, type, handle);
+  r->compressed_output.clear();
+  block->Reset();
+}
+
+void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents,
+                                           CompressionType type,
+                                           BlockHandle* handle) {
+  Rep* r = rep_;
+  StopWatch sw(r->options.env, r->options.statistics.get(),
+               WRITE_RAW_BLOCK_MICROS);
+  handle->set_offset(r->offset);
+  handle->set_size(block_contents.size());
+  r->status = r->file->Append(block_contents);
+  if (r->status.ok()) {
+    char trailer[kBlockTrailerSize];
+    trailer[0] = type;
+    uint32_t crc = crc32c::Value(block_contents.data(), block_contents.size());
+    crc = crc32c::Extend(crc, trailer, 1);  // Extend crc to cover block type
+    EncodeFixed32(trailer+1, crc32c::Mask(crc));
+    r->status = r->file->Append(Slice(trailer, kBlockTrailerSize));
+    if (r->status.ok()) {
+      r->status = InsertBlockInCache(block_contents, type, handle);
+    }
+    if (r->status.ok()) {
+      r->offset += block_contents.size() + kBlockTrailerSize;
+    }
+  }
+}
+
+Status BlockBasedTableBuilder::status() const {
+  return rep_->status;
+}
+
+static void DeleteCachedBlock(const Slice& key, void* value) {
+  Block* block = reinterpret_cast<Block*>(value);
+  delete block;
+}
+
+//
+// Make a copy of the block contents and insert into compressed block cache
+//
+Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
+                                 const CompressionType type,
+                                 const BlockHandle* handle) {
+  Rep* r = rep_;
+  Cache* block_cache_compressed = r->options.block_cache_compressed.get();
+
+  if (type != kNoCompression && block_cache_compressed != nullptr) {
+
+    Cache::Handle* cache_handle = nullptr;
+    size_t size = block_contents.size();
+
+    char* ubuf = new char[size];             // make a new copy
+    memcpy(ubuf, block_contents.data(), size);
+
+    BlockContents results;
+    Slice sl(ubuf, size);
+    results.data = sl;
+    results.cachable = true; // XXX
+    results.heap_allocated = true;
+    results.compression_type = type;
+
+    Block* block = new Block(results);
+
+    // make cache key by appending the file offset to the cache prefix id
+    char* end = EncodeVarint64(
+                  r->compressed_cache_key_prefix +
+                  r->compressed_cache_key_prefix_size,
+                  handle->offset());
+    Slice key(r->compressed_cache_key_prefix, static_cast<size_t>
+              (end - r->compressed_cache_key_prefix));
+
+    // Insert into compressed block cache.
+    cache_handle = block_cache_compressed->Insert(key, block, block->size(),
+                                                  &DeleteCachedBlock);
+    block_cache_compressed->Release(cache_handle);
+
+    // Invalidate OS cache.
+    r->file->InvalidateCache(r->offset, size);
+  }
+  return Status::OK();
+}
+
+Status BlockBasedTableBuilder::Finish() {
+  Rep* r = rep_;
+  bool empty_data_block = r->data_block.empty();
+  Flush();
+  assert(!r->closed);
+  r->closed = true;
+
+  BlockHandle filter_block_handle,
+              metaindex_block_handle,
+              index_block_handle;
+
+  // Write filter block
+  if (ok() && r->filter_block != nullptr) {
+    auto filter_contents = r->filter_block->Finish();
+    r->props.filter_size = filter_contents.size();
+    WriteRawBlock(filter_contents, kNoCompression, &filter_block_handle);
+  }
+
+  // To make sure properties block is able to keep the accurate size of index
+  // block, we will finish writing all index entries here and flush them
+  // to storage after metaindex block is written.
+  if (ok() && !empty_data_block) {
+    r->options.comparator->FindShortSuccessor(&r->last_key);
+
+    std::string handle_encoding;
+    r->pending_handle.EncodeTo(&handle_encoding);
+    r->index_block.Add(r->last_key, handle_encoding);
+  }
+
+  // Write meta blocks and metaindex block with the following order.
+  //    1. [meta block: filter]
+  //    2. [meta block: properties]
+  //    3. [metaindex block]
+  if (ok()) {
+    // We use `BytewiseComparator` as the comparator for meta block.
+    BlockBuilder meta_index_block(
+        r->options.block_restart_interval,
+        BytewiseComparator()
+    );
+    // Key: meta block name
+    // Value: block handle to that meta block
+    BytewiseSortedMap meta_block_handles;
+
+    // Write filter block.
+    if (r->filter_block != nullptr) {
+      // Add mapping from "<filter_block_prefix>.Name" to location
+      // of filter data.
+      std::string key = BlockBasedTable::kFilterBlockPrefix;
+      key.append(r->options.filter_policy->Name());
+      std::string handle_encoding;
+      filter_block_handle.EncodeTo(&handle_encoding);
+      meta_block_handles.insert(
+          std::make_pair(key, handle_encoding)
+      );
+    }
+
+    // Write properties block.
+    {
+      BlockBuilder properties_block(
+          r->options.block_restart_interval,
+          BytewiseComparator()
+      );
+
+      BytewiseSortedMap properties;
+
+      // Add basic properties
+      AddProperties(
+          properties,
+          BlockBasedTablePropertiesNames::kRawKeySize,
+          r->props.raw_key_size
+      );
+      AddProperties(
+          properties,
+          BlockBasedTablePropertiesNames::kRawValueSize,
+          r->props.raw_value_size
+      );
+      AddProperties(
+          properties,
+          BlockBasedTablePropertiesNames::kDataSize,
+          r->props.data_size
+      );
+      r->props.index_size =
+        r->index_block.CurrentSizeEstimate() + kBlockTrailerSize;
+      AddProperties(
+          properties,
+          BlockBasedTablePropertiesNames::kIndexSize,
+          r->props.index_size
+      );
+      AddProperties(
+          properties,
+          BlockBasedTablePropertiesNames::kNumEntries,
+          r->props.num_entries
+      );
+      AddProperties(
+          properties,
+          BlockBasedTablePropertiesNames::kNumDataBlocks,
+          r->props.num_data_blocks);
+      if (r->filter_block != nullptr) {
+        properties.insert({
+              BlockBasedTablePropertiesNames::kFilterPolicy,
+              r->options.filter_policy->Name()
+        });
+      }
+      AddProperties(
+          properties,
+          BlockBasedTablePropertiesNames::kFilterSize,
+          r->props.filter_size
+      );
+
+      for (auto collector : r->options.table_properties_collectors) {
+        TableProperties::UserCollectedProperties user_collected_properties;
+        Status s =
+          collector->Finish(&user_collected_properties);
+
+        if (!s.ok()) {
+          LogPropertiesCollectionError(
+              r->options.info_log.get(),
+              "Finish", /* method */
+              collector->Name()
+          );
+        } else {
+          properties.insert(
+              user_collected_properties.begin(),
+              user_collected_properties.end()
+          );
+        }
+      }
+
+      for (const auto& stat : properties) {
+        properties_block.Add(stat.first, stat.second);
+      }
+
+      BlockHandle properties_block_handle;
+      WriteBlock(&properties_block, &properties_block_handle);
+
+      std::string handle_encoding;
+      properties_block_handle.EncodeTo(&handle_encoding);
+      meta_block_handles.insert(
+          { BlockBasedTable::kPropertiesBlock, handle_encoding }
+      );
+    }  // end of properties block writing
+
+    for (const auto& metablock : meta_block_handles) {
+      meta_index_block.Add(metablock.first, metablock.second);
+    }
+
+    WriteBlock(&meta_index_block, &metaindex_block_handle);
+  }  // meta blocks and metaindex block.
+
+  // Write index block
+  if (ok()) {
+    WriteBlock(&r->index_block, &index_block_handle);
+  }
+
+  // Write footer
+  if (ok()) {
+    Footer footer;
+    footer.set_metaindex_handle(metaindex_block_handle);
+    footer.set_index_handle(index_block_handle);
+    std::string footer_encoding;
+    footer.EncodeTo(&footer_encoding);
+    r->status = r->file->Append(footer_encoding);
+    if (r->status.ok()) {
+      r->offset += footer_encoding.size();
+    }
+  }
+
+  // Print out the table stats
+  if (ok()) {
+    // user collected properties
+    std::string user_collected;
+    user_collected.reserve(1024);
+    for (auto collector : r->options.table_properties_collectors) {
+      for (const auto& prop : collector->GetReadableProperties()) {
+        user_collected.append(prop.first);
+        user_collected.append("=");
+        user_collected.append(prop.second);
+        user_collected.append("; ");
+      }
+    }
+
+    Log(
+        r->options.info_log,
+        "Table was constructed:\n"
+        "  [basic properties]: %s\n"
+        "  [user collected properties]: %s",
+        r->props.ToString().c_str(),
+        user_collected.c_str()
+    );
+  }
+
+  return r->status;
+}
+
+void BlockBasedTableBuilder::Abandon() {
+  Rep* r = rep_;
+  assert(!r->closed);
+  r->closed = true;
+}
+
+uint64_t BlockBasedTableBuilder::NumEntries() const {
+  return rep_->props.num_entries;
+}
+
+uint64_t BlockBasedTableBuilder::FileSize() const {
+  return rep_->offset;
+}
+
+}  // namespace rocksdb
diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h
new file mode 100644 (file)
index 0000000..517f8e7
--- /dev/null
@@ -0,0 +1,85 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+
+class BlockBuilder;
+class BlockHandle;
+class WritableFile;
+
+
+class BlockBasedTableBuilder : public TableBuilder {
+ public:
+  // Create a builder that will store the contents of the table it is
+  // building in *file.  Does not close the file.  It is up to the
+  // caller to close the file after calling Finish().
+  BlockBasedTableBuilder(const Options& options,
+                         WritableFile* file,
+                         FlushBlockPolicyFactory* flush_block_policy_factory,
+                         CompressionType compression_type);
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~BlockBasedTableBuilder();
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override;
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override;
+
+  // Finish building the table.  Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish() override;
+
+  // Indicate that the contents of this builder should be abandoned.  Stops
+  // using the file passed to the constructor after this function returns.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Abandon() override;
+
+  // Number of calls to Add() so far.
+  uint64_t NumEntries() const override;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  uint64_t FileSize() const override;
+
+ private:
+  bool ok() const { return status().ok(); }
+  void WriteBlock(BlockBuilder* block, BlockHandle* handle);
+  void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle);
+  Status InsertBlockInCache(const Slice& block_contents,
+                         const CompressionType type, const BlockHandle* handle);
+  struct Rep;
+  Rep* rep_;
+
+  // Advanced operation: flush any buffered key/value pairs to file.
+  // Can be used to ensure that two adjacent entries never live in
+  // the same data block.  Most clients should not need to use this method.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Flush();
+
+  // No copying allowed
+  BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete;
+  void operator=(const BlockBasedTableBuilder&) = delete;
+};
+
+}  // namespace rocksdb
+
diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc
new file mode 100644 (file)
index 0000000..836f6ed
--- /dev/null
@@ -0,0 +1,66 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+
+#include "table/block_based_table_factory.h"
+
+#include <memory>
+#include <stdint.h>
+#include "table/block_based_table_builder.h"
+#include "table/block_based_table_reader.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+Status BlockBasedTableFactory::GetTableReader(
+    const Options& options, const EnvOptions& soptions,
+    unique_ptr<RandomAccessFile> && file, uint64_t file_size,
+    unique_ptr<TableReader>* table_reader) const {
+  return BlockBasedTable::Open(options, soptions, std::move(file), file_size,
+                               table_reader);
+}
+
+TableBuilder* BlockBasedTableFactory::GetTableBuilder(
+    const Options& options, WritableFile* file,
+    CompressionType compression_type) const {
+  auto flush_block_policy_factory = 
+    table_options_.flush_block_policy_factory.get();
+
+  // if flush block policy factory is not set, we'll create the default one
+  // from the options.
+  //
+  // NOTE: we cannot pre-cache the "default block policy factory" because
+  // `FlushBlockBySizePolicyFactory` takes `options.block_size` and
+  // `options.block_size_deviation` as parameters, which may be different
+  // every time.
+  if (flush_block_policy_factory == nullptr) {
+    flush_block_policy_factory =
+        new FlushBlockBySizePolicyFactory(options.block_size,
+                                          options.block_size_deviation);
+  }
+
+  auto table_builder =  new BlockBasedTableBuilder(
+      options,
+      file,
+      flush_block_policy_factory,
+      compression_type);
+
+  // Delete flush_block_policy_factory only when it's just created from the
+  // options.
+  // We can safely delete flush_block_policy_factory since it will only be used
+  // during the construction of `BlockBasedTableBuilder`.
+  if (flush_block_policy_factory != 
+      table_options_.flush_block_policy_factory.get()) {
+    delete flush_block_policy_factory;
+  }
+
+  return table_builder;
+}
+
+}  // namespace rocksdb
diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h
new file mode 100644 (file)
index 0000000..ee52581
--- /dev/null
@@ -0,0 +1,69 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+
+struct Options;
+struct EnvOptions;
+
+using std::unique_ptr;
+class Status;
+class RandomAccessFile;
+class WritableFile;
+class Table;
+class TableBuilder;
+class BlockBasedTable;
+class BlockBasedTableBuilder;
+
+class BlockBasedTableFactory: public TableFactory {
+public:
+  struct TableOptions {
+    // @flush_block_policy_factory creates the instances of flush block policy.
+    // which provides a configurable way to determine when to flush a block in
+    // the block based tables.  If not set, table builder will use the default
+    // block flush policy, which cut blocks by block size (please refer to
+    // `FlushBlockBySizePolicy`).
+    std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
+  };
+
+  BlockBasedTableFactory() : BlockBasedTableFactory(TableOptions()) { }
+  BlockBasedTableFactory(const TableOptions& table_options): 
+      table_options_(table_options) { 
+  }
+
+  ~BlockBasedTableFactory() {
+  }
+
+  const char* Name() const override {
+    return "BlockBasedTable";
+  }
+
+  Status GetTableReader(const Options& options, const EnvOptions& soptions,
+                        unique_ptr<RandomAccessFile> && file,
+                        uint64_t file_size,
+                        unique_ptr<TableReader>* table_reader) const override;
+
+  TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
+                                CompressionType compression_type) const
+                                    override;
+
+ private:
+  TableOptions table_options_;
+};
+
+
+}  // namespace rocksdb
diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc
new file mode 100644 (file)
index 0000000..dcb55fc
--- /dev/null
@@ -0,0 +1,1099 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based_table_reader.h"
+
+#include "db/dbformat.h"
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+
+#include "table/block.h"
+#include "table/filter_block.h"
+#include "table/format.h"
+#include "table/two_level_iterator.h"
+
+#include "util/coding.h"
+#include "util/perf_context_imp.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+// The longest the prefix of the cache key used to identify blocks can be.
+// We are using the fact that we know for Posix files the unique ID is three
+// varints.
+// NOTE(review): each varint64 occupies at most kMaxVarint64Length bytes;
+// the +1 appears to be one spare byte — confirm original intent.
+const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1;
+using std::unique_ptr;
+
+// Internal state of a BlockBasedTable.  Owned by the table object and
+// deleted in ~BlockBasedTable().
+struct BlockBasedTable::Rep {
+  Rep(const EnvOptions& storage_options) :
+    soptions(storage_options) {
+  }
+
+  Options options;
+  const EnvOptions& soptions;
+  Status status;
+  unique_ptr<RandomAccessFile> file;
+  // Prefixes for block-cache keys, filled in by SetupCacheKeyPrefix();
+  // a size of 0 means the corresponding cache is not configured.
+  char cache_key_prefix[kMaxCacheKeyPrefixSize];
+  size_t cache_key_prefix_size;
+  char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
+  size_t compressed_cache_key_prefix_size;
+
+  // Handle to metaindex_block: saved from footer
+  BlockHandle metaindex_handle;
+  // Handle to index: saved from footer
+  BlockHandle index_handle;
+  // index_block will be populated and used only when options.block_cache is
+  // NULL; otherwise we will get the index block via the block cache.
+  unique_ptr<Block> index_block;
+  // Pinned filter reader; populated only in the no-block-cache path of Open().
+  unique_ptr<FilterBlockReader> filter;
+
+  TableProperties table_properties;
+};
+
+BlockBasedTable::~BlockBasedTable() {
+  // Rep owns the file and any pinned index/filter blocks; deleting it
+  // releases them all.
+  delete rep_;
+}
+
+// CachableEntry represents the entries that *may* be fetched from block cache.
+//  field `value` is the item we want to get.
+//  field `cache_handle` is the cache handle to the block cache. If the value
+//    was not read from cache, `cache_handle` will be nullptr.
+template <class TValue>
+struct BlockBasedTable::CachableEntry {
+  CachableEntry(TValue* value, Cache::Handle* cache_handle)
+    : value(value)
+    , cache_handle(cache_handle) {
+  }
+  CachableEntry(): CachableEntry(nullptr, nullptr) { }
+  // Drops the cache pin (if any) and clears both fields.  A no-op when the
+  // entry did not come from the cache; in that case `value` is left as-is.
+  void Release(Cache* cache) {
+    if (cache_handle) {
+      cache->Release(cache_handle);
+      value = nullptr;
+      cache_handle = nullptr;
+    }
+  }
+
+  TValue* value = nullptr;
+  // if the entry is from the cache, cache_handle will be populated.
+  Cache::Handle* cache_handle = nullptr;
+};
+
+// Helper function to setup the cache key's prefix for the Table.
+void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
+  assert(kMaxCacheKeyPrefixSize >= 10);
+  rep->cache_key_prefix_size = 0;
+  rep->compressed_cache_key_prefix_size = 0;
+  if (rep->options.block_cache != nullptr) {
+    GenerateCachePrefix(rep->options.block_cache.get(), rep->file.get(),
+                        &rep->cache_key_prefix[0],
+                        &rep->cache_key_prefix_size);
+  }
+  if (rep->options.block_cache_compressed != nullptr) {
+    GenerateCachePrefix(rep->options.block_cache_compressed.get(),
+                        rep->file.get(), &rep->compressed_cache_key_prefix[0],
+                        &rep->compressed_cache_key_prefix_size);
+  }
+}
+
+// Derive a cache-key prefix for @file: prefer the file's own unique id,
+// falling back to a fresh id handed out by the cache.
+void BlockBasedTable::GenerateCachePrefix(Cache* cc,
+    RandomAccessFile* file, char* buffer, size_t* size) {
+  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);
+  if (*size != 0) {
+    return;  // the file supplied a usable id
+  }
+  // The file produced no id (or one that was too long); synthesize one.
+  char* const end = EncodeVarint64(buffer, cc->NewId());
+  *size = static_cast<size_t>(end - buffer);
+}
+
+// WritableFile counterpart of the overload above: same fallback strategy,
+// preferring the file's unique id over a cache-generated one.
+void BlockBasedTable::GenerateCachePrefix(Cache* cc,
+    WritableFile* file, char* buffer, size_t* size) {
+  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);
+  if (*size != 0) {
+    return;  // the file supplied a usable id
+  }
+  // The file produced no id (or one that was too long); synthesize one.
+  char* const end = EncodeVarint64(buffer, cc->NewId());
+  *size = static_cast<size_t>(end - buffer);
+}
+
+namespace {  // anonymous namespace, not visible externally
+
+// Read the block identified by "handle" from "file".
+// The only relevant option is options.verify_checksums for now.
+// Set *didIO to true if didIO is not null.
+// On failure return non-OK.
+// On success fill *result and return OK - caller owns *result
+Status ReadBlockFromFile(
+    RandomAccessFile* file,
+    const ReadOptions& options,
+    const BlockHandle& handle,
+    Block** result,
+    Env* env,
+    bool* didIO = nullptr,
+    bool do_uncompress = true) {
+  BlockContents contents;
+  Status s = ReadBlockContents(file, options, handle, &contents,
+                               env, do_uncompress);
+  if (s.ok()) {
+    *result = new Block(contents);
+  }
+
+  // NOTE(review): *didIO is set even when the read failed — confirm callers
+  // rely on "attempted IO" rather than "successful IO" semantics.
+  if (didIO) {
+    *didIO = true;
+  }
+  return s;
+}
+
+// Iterator cleanup callback: deletes a heap-allocated block not owned by any
+// cache.  The second argument is unused.
+void DeleteBlock(void* arg, void* ignored) {
+  delete reinterpret_cast<Block*>(arg);
+}
+
+// Cache deleter for uncompressed/compressed blocks inserted into a Cache.
+void DeleteCachedBlock(const Slice& key, void* value) {
+  Block* block = reinterpret_cast<Block*>(value);
+  delete block;
+}
+
+// Cache deleter for filter readers inserted into a Cache.
+void DeleteCachedFilter(const Slice& key, void* value) {
+  auto filter = reinterpret_cast<FilterBlockReader*>(value);
+  delete filter;
+}
+
+// Iterator cleanup callback: drops a cache pin on a block owned by the cache.
+void ReleaseBlock(void* arg, void* h) {
+  Cache* cache = reinterpret_cast<Cache*>(arg);
+  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
+  cache->Release(handle);
+}
+
+// Build a cache key in @cache_key by appending the block's offset (varint64)
+// to the per-table prefix.  Returns a Slice over the caller's buffer.
+Slice GetCacheKey(const char* cache_key_prefix,
+                  size_t cache_key_prefix_size,
+                  const BlockHandle& handle,
+                  char* cache_key) {
+  assert(cache_key != nullptr);
+  assert(cache_key_prefix_size != 0);
+  assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
+  memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
+  char* end = EncodeVarint64(cache_key + cache_key_prefix_size,
+                             handle.offset());
+  return Slice(cache_key, static_cast<size_t>(end - cache_key));
+}
+
+// Cache lookup that also records both the overall and the block-type-specific
+// hit/miss tickers, plus the perf-context hit counter.
+Cache::Handle* GetFromBlockCache(
+    Cache* block_cache,
+    const Slice& key,
+    Tickers block_cache_miss_ticker,
+    Tickers block_cache_hit_ticker,
+    Statistics* statistics) {
+  auto cache_handle = block_cache->Lookup(key);
+  if (cache_handle != nullptr) {
+    BumpPerfCount(&perf_context.block_cache_hit_count);
+    // overall cache hit
+    RecordTick(statistics, BLOCK_CACHE_HIT);
+    // block-type specific cache hit
+    RecordTick(statistics, block_cache_hit_ticker);
+  } else {
+    // overall cache miss
+    RecordTick(statistics, BLOCK_CACHE_MISS);
+    // block-type specific cache miss
+    RecordTick(statistics, block_cache_miss_ticker);
+  }
+
+  return cache_handle;
+}
+
+} // end of anonymous namespace
+
+Status BlockBasedTable::Open(const Options& options,
+                             const EnvOptions& soptions,
+                             unique_ptr<RandomAccessFile> && file,
+                             uint64_t size,
+                             unique_ptr<TableReader>* table_reader) {
+  table_reader->reset();
+  if (size < Footer::kEncodedLength) {
+    return Status::InvalidArgument("file is too short to be an sstable");
+  }
+
+  char footer_space[Footer::kEncodedLength];
+  Slice footer_input;
+  Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength,
+                        &footer_input, footer_space);
+  if (!s.ok()) return s;
+
+  // Check that we actually read the whole footer from the file. It may be
+  // that size isn't correct.
+  if (footer_input.size() != Footer::kEncodedLength) {
+    return Status::InvalidArgument("file is too short to be an sstable");
+  }
+
+  Footer footer;
+  s = footer.DecodeFrom(&footer_input);
+  if (!s.ok()) return s;
+
+  // We've successfully read the footer and the index block: we're
+  // ready to serve requests.
+  Rep* rep = new BlockBasedTable::Rep(soptions);
+  rep->options = options;
+  rep->file = std::move(file);
+  rep->metaindex_handle = footer.metaindex_handle();
+  rep->index_handle = footer.index_handle();
+  SetupCacheKeyPrefix(rep);
+  unique_ptr<BlockBasedTable> new_table(new BlockBasedTable(rep));
+
+  // Read meta index
+  std::unique_ptr<Block> meta;
+  std::unique_ptr<Iterator> meta_iter;
+  s = ReadMetaBlock(rep, &meta, &meta_iter);
+
+  // Read the properties
+  meta_iter->Seek(kPropertiesBlock);
+  if (meta_iter->Valid() && meta_iter->key() == Slice(kPropertiesBlock)) {
+    s = meta_iter->status();
+    if (s.ok()) {
+      s = ReadProperties(meta_iter->value(), rep, &rep->table_properties);
+    }
+
+    if (!s.ok()) {
+      auto err_msg =
+        "[Warning] Encountered error while reading data from properties "
+        "block " + s.ToString();
+      Log(rep->options.info_log, "%s", err_msg.c_str());
+    }
+  }
+
+  // Initialize index/filter blocks. If block cache is not specified,
+  // these blocks will be kept in member variables in Rep, which will
+  // reside in the memory as long as this table object is alive; otherwise
+  // they will be added to block cache.
+  if (!options.block_cache) {
+    Block* index_block = nullptr;
+    // TODO: we never really verify check sum for index block
+    s = ReadBlockFromFile(
+        rep->file.get(),
+        ReadOptions(),
+        footer.index_handle(),
+        &index_block,
+        options.env
+    );
+
+    if (s.ok()) {
+      assert(index_block->compressionType() == kNoCompression);
+      rep->index_block.reset(index_block);
+
+      // Set index block
+      if (rep->options.filter_policy) {
+        std::string key = kFilterBlockPrefix;
+        key.append(rep->options.filter_policy->Name());
+        meta_iter->Seek(key);
+
+        if (meta_iter->Valid() && meta_iter->key() == Slice(key)) {
+          rep->filter.reset(ReadFilter(meta_iter->value(), rep));
+        }
+      }
+    } else {
+      delete index_block;
+    }
+  } else {
+    // Call IndexBlockReader() to implicitly add index to the block_cache
+    unique_ptr<Iterator> iter(
+        new_table->IndexBlockReader(ReadOptions())
+    );
+    s = iter->status();
+
+    if (s.ok()) {
+      // Call GetFilter() to implicitly add filter to the block_cache
+      auto filter_entry = new_table->GetFilter();
+      filter_entry.Release(options.block_cache.get());
+    }
+  }
+
+  if (s.ok()) {
+    *table_reader = std::move(new_table);
+  }
+
+  return s;
+}
+
+// Apply the configured access hint to the underlying file before a
+// compaction reads this table, then mark the table as optimized.
+void BlockBasedTable::SetupForCompaction() {
+  const auto hint = rep_->options.access_hint_on_compaction_start;
+  if (hint == Options::NORMAL) {
+    rep_->file->Hint(RandomAccessFile::NORMAL);
+  } else if (hint == Options::SEQUENTIAL) {
+    rep_->file->Hint(RandomAccessFile::SEQUENTIAL);
+  } else if (hint == Options::WILLNEED) {
+    rep_->file->Hint(RandomAccessFile::WILLNEED);
+  } else if (hint != Options::NONE) {
+    assert(false);  // unknown access hint value
+  }
+  compaction_optimized_ = true;
+}
+
+// Returns a mutable reference to the properties decoded during Open();
+// the referenced object lives as long as this table.
+TableProperties& BlockBasedTable::GetTableProperties() {
+  return rep_->table_properties;
+}
+
+// Load the meta-block from the file. On success, return the loaded meta block
+// and its iterator.
+Status BlockBasedTable::ReadMetaBlock(
+    Rep* rep,
+    std::unique_ptr<Block>* meta_block,
+    std::unique_ptr<Iterator>* iter) {
+  // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
+  // it is an empty block.
+  //  TODO: we never really verify check sum for meta index block
+  Block* meta = nullptr;
+  Status s = ReadBlockFromFile(
+      rep->file.get(),
+      ReadOptions(),
+      rep->metaindex_handle,
+      &meta,
+      rep->options.env);
+
+    if (!s.ok()) {
+      auto err_msg =
+        "[Warning] Encountered error while reading data from properties"
+        "block " + s.ToString();
+      Log(rep->options.info_log, "%s", err_msg.c_str());
+    }
+  if (!s.ok()) {
+    delete meta;
+    return s;
+  }
+
+  meta_block->reset(meta);
+  // meta block uses bytewise comparator.
+  iter->reset(meta->NewIterator(BytewiseComparator()));
+  return Status::OK();
+}
+
+FilterBlockReader* BlockBasedTable::ReadFilter (
+    const Slice& filter_handle_value,
+    BlockBasedTable::Rep* rep,
+    size_t* filter_size) {
+  Slice v = filter_handle_value;
+  BlockHandle filter_handle;
+  if (!filter_handle.DecodeFrom(&v).ok()) {
+    return nullptr;
+  }
+
+  // TODO: We might want to unify with ReadBlockFromFile() if we start
+  // requiring checksum verification in Table::Open.
+  ReadOptions opt;
+  BlockContents block;
+  if (!ReadBlockContents(rep->file.get(), opt, filter_handle, &block,
+                        rep->options.env, false).ok()) {
+    return nullptr;
+  }
+
+  if (filter_size) {
+    *filter_size = block.data.size();
+  }
+
+  return new FilterBlockReader(
+       rep->options, block.data, block.heap_allocated);
+}
+
+// Decode the properties block referenced by @handle_value into
+// *table_properties.  Well-known uint64 properties are decoded into their
+// dedicated fields; unknown keys go into user_collected_properties; a
+// malformed value is logged and skipped rather than failing the whole read.
+Status BlockBasedTable::ReadProperties(
+    const Slice& handle_value, Rep* rep, TableProperties* table_properties) {
+  assert(table_properties);
+
+  Slice v = handle_value;
+  BlockHandle handle;
+  if (!handle.DecodeFrom(&v).ok()) {
+    return Status::InvalidArgument("Failed to decode properties block handle");
+  }
+
+  BlockContents block_contents;
+  Status s = ReadBlockContents(
+      rep->file.get(),
+      ReadOptions(),
+      handle,
+      &block_contents,
+      rep->options.env,
+      false
+  );
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  Block properties_block(block_contents);
+  std::unique_ptr<Iterator> iter(
+      properties_block.NewIterator(BytewiseComparator())
+  );
+
+  // All pre-defined properties of type uint64_t
+  std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
+    { BlockBasedTablePropertiesNames::kDataSize,
+      &table_properties->data_size },
+    { BlockBasedTablePropertiesNames::kIndexSize,
+      &table_properties->index_size },
+    { BlockBasedTablePropertiesNames::kFilterSize,
+      &table_properties->filter_size },
+    { BlockBasedTablePropertiesNames::kRawKeySize,
+      &table_properties->raw_key_size },
+    { BlockBasedTablePropertiesNames::kRawValueSize,
+      &table_properties->raw_value_size },
+    { BlockBasedTablePropertiesNames::kNumDataBlocks,
+      &table_properties->num_data_blocks },
+    { BlockBasedTablePropertiesNames::kNumEntries,
+      &table_properties->num_entries },
+  };
+
+  // last_key is only used by the sortedness assertion below.
+  std::string last_key;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    s = iter->status();
+    if (!s.ok()) {
+      break;
+    }
+
+    auto key = iter->key().ToString();
+    // properties block is strictly sorted with no duplicate key.
+    assert(
+        last_key.empty() ||
+        BytewiseComparator()->Compare(key, last_key) > 0
+    );
+    last_key = key;
+
+    auto raw_val = iter->value();
+    auto pos = predefined_uint64_properties.find(key);
+
+    if (pos != predefined_uint64_properties.end()) {
+      // handle predefined rocksdb properties
+      uint64_t val;
+      if (!GetVarint64(&raw_val, &val)) {
+        // skip malformed value
+        auto error_msg =
+          "[Warning] detect malformed value in properties meta-block:"
+          "\tkey: " + key + "\tval: " + raw_val.ToString();
+        Log(rep->options.info_log, "%s", error_msg.c_str());
+        continue;
+      }
+      *(pos->second) = val;
+    } else if (key == BlockBasedTablePropertiesNames::kFilterPolicy) {
+      table_properties->filter_policy_name = raw_val.ToString();
+    } else {
+      // handle user-collected
+      table_properties->user_collected_properties.insert(
+          std::make_pair(key, raw_val.ToString())
+      );
+    }
+  }
+
+  return s;
+}
+
+// Fetch the block identified by @handle into *entry, consulting the
+// (uncompressed) block cache first and falling back to file I/O unless
+// options.read_tier forbids it.  On a cache hit the entry carries a cache
+// pin the caller must Release(); on a file read without caching the caller
+// owns entry->value.
+Status BlockBasedTable::GetBlock(
+    const BlockBasedTable* table,
+    const BlockHandle& handle,
+    const ReadOptions& options,
+    const bool for_compaction,
+    const Tickers block_cache_miss_ticker,
+    const Tickers block_cache_hit_ticker,
+    bool* didIO,
+    CachableEntry<Block>* entry) {
+  // kBlockCacheTier means: serve from cache only, never touch the file.
+  bool no_io = options.read_tier == kBlockCacheTier;
+  Cache* block_cache = table->rep_->options.block_cache.get();
+  Statistics* statistics = table->rep_->options.statistics.get();
+  Status s;
+
+  if (block_cache != nullptr) {
+    char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+    auto key = GetCacheKey(
+        table->rep_->cache_key_prefix,
+        table->rep_->cache_key_prefix_size,
+        handle,
+        cache_key
+    );
+
+    entry->cache_handle = GetFromBlockCache(
+        block_cache,
+        key,
+        block_cache_miss_ticker,
+        block_cache_hit_ticker,
+        statistics
+    );
+
+    if (entry->cache_handle != nullptr) {
+      entry->value =
+        reinterpret_cast<Block*>(block_cache->Value(entry->cache_handle));
+    } else if (no_io) {
+      // Did not find in block_cache and can't do IO
+      return Status::Incomplete("no blocking io");
+    } else {
+      // Cache miss: read from the file and (optionally) populate the cache.
+      Histograms histogram = for_compaction ?
+        READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS;
+      {
+        // block for stop watch
+        StopWatch sw(table->rep_->options.env, statistics, histogram);
+        s = ReadBlockFromFile(
+              table->rep_->file.get(),
+              options,
+              handle,
+              &entry->value,
+              table->rep_->options.env,
+              didIO
+            );
+      }
+      if (s.ok()) {
+        if (options.fill_cache && entry->value->isCachable()) {
+          entry->cache_handle = block_cache->Insert(
+            key, entry->value, entry->value->size(), &DeleteCachedBlock);
+          RecordTick(statistics, BLOCK_CACHE_ADD);
+        }
+      }
+    }
+  } else if (no_io) {
+    // Could not read from block_cache and can't do IO
+    return Status::Incomplete("no blocking io");
+  } else {
+    // No cache configured: read straight from the file; caller owns the block.
+    s = ReadBlockFromFile(
+        table->rep_->file.get(),
+        options,
+        handle,
+        &entry->value,
+        table->rep_->options.env,
+        didIO
+      );
+  }
+
+  return s;
+}
+
+// Convert an index iterator value (i.e., an encoded BlockHandle)
+// into an iterator over the contents of the corresponding block.
+// Lookup order: uncompressed block cache, then compressed block cache
+// (decompressing and promoting on a hit), then the file.  Cache-owned blocks
+// are released via iterator cleanup; otherwise the iterator deletes them.
+Iterator* BlockBasedTable::BlockReader(void* arg,
+                                       const ReadOptions& options,
+                                       const Slice& index_value,
+                                       bool* didIO,
+                                       bool for_compaction) {
+  const bool no_io = (options.read_tier == kBlockCacheTier);
+  BlockBasedTable* table = reinterpret_cast<BlockBasedTable*>(arg);
+  Cache* block_cache = table->rep_->options.block_cache.get();
+  Cache* block_cache_compressed = table->rep_->options.
+                                    block_cache_compressed.get();
+  Statistics* statistics = table->rep_->options.statistics.get();
+  // block: the uncompressed block we will iterate over.
+  // cblock: a (possibly) compressed block owned locally until handed to a
+  // cache or deleted.
+  Block* block = nullptr;
+  Block* cblock = nullptr;
+  Cache::Handle* cache_handle = nullptr;
+  Cache::Handle* compressed_cache_handle = nullptr;
+
+  BlockHandle handle;
+  Slice input = index_value;
+  Status s = handle.DecodeFrom(&input);
+  // We intentionally allow extra stuff in index_value so that we
+  // can add more features in the future.
+
+  if (!s.ok()) {
+    return NewErrorIterator(s);
+  }
+
+  if (block_cache != nullptr || block_cache_compressed != nullptr) {
+    char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+    char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+    Slice key,  /* key to the block cache */
+          ckey /* key to the compressed block cache */ ;
+
+    // create key for block cache
+    if (block_cache != nullptr) {
+      key = GetCacheKey(
+          table->rep_->cache_key_prefix,
+          table->rep_->cache_key_prefix_size,
+          handle,
+          cache_key
+      );
+    }
+
+    if (block_cache_compressed != nullptr) {
+      ckey = GetCacheKey(
+          table->rep_->compressed_cache_key_prefix,
+          table->rep_->compressed_cache_key_prefix_size,
+          handle,
+          compressed_cache_key
+      );
+    }
+
+    // Lookup uncompressed cache first
+    if (block_cache != nullptr) {
+      assert(!key.empty());
+      cache_handle = block_cache->Lookup(key);
+      if (cache_handle != nullptr) {
+        block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
+        RecordTick(statistics, BLOCK_CACHE_HIT);
+        RecordTick(statistics, BLOCK_CACHE_DATA_HIT);
+      } else {
+        RecordTick(statistics, BLOCK_CACHE_MISS);
+        RecordTick(statistics, BLOCK_CACHE_DATA_MISS);
+      }
+    }
+
+    // If not found in uncompressed cache, lookup compressed cache
+    if (block == nullptr && block_cache_compressed != nullptr) {
+      assert(!ckey.empty());
+      compressed_cache_handle = block_cache_compressed->Lookup(ckey);
+
+      // if we found in the compressed cache, then uncompress and
+      // insert into uncompressed cache
+      if (compressed_cache_handle != nullptr) {
+        // found compressed block
+        cblock = reinterpret_cast<Block*>(block_cache_compressed->
+                        Value(compressed_cache_handle));
+        assert(cblock->compressionType() != kNoCompression);
+
+        // Retrieve the uncompressed contents into a new buffer
+        BlockContents contents;
+        s = UncompressBlockContents(cblock->data(), cblock->size(),
+                                    &contents);
+
+        // Insert uncompressed block into block cache
+        if (s.ok()) {
+          block = new Block(contents); // uncompressed block
+          assert(block->compressionType() == kNoCompression);
+          if (block_cache != nullptr && block->isCachable() &&
+              options.fill_cache) {
+            cache_handle = block_cache->Insert(key, block, block->size(),
+                                               &DeleteCachedBlock);
+            assert(reinterpret_cast<Block*>(block_cache->Value(cache_handle))
+                   == block);
+          }
+        }
+        // Release hold on compressed cache entry
+        block_cache_compressed->Release(compressed_cache_handle);
+        RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
+      }
+    }
+
+    if (block != nullptr) {
+      BumpPerfCount(&perf_context.block_cache_hit_count);
+    } else if (no_io) {
+      // Did not find in block_cache and can't do IO
+      return NewErrorIterator(Status::Incomplete("no blocking io"));
+    } else {
+      // Both caches missed: read from the file.  Keep the block compressed
+      // only if a compressed cache exists to store it in.
+      Histograms histogram = for_compaction ?
+        READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS;
+      {  // block for stop watch
+        StopWatch sw(table->rep_->options.env, statistics, histogram);
+        s = ReadBlockFromFile(
+              table->rep_->file.get(),
+              options,
+              handle,
+              &cblock,
+              table->rep_->options.env,
+              didIO,
+              block_cache_compressed == nullptr
+            );
+      }
+      if (s.ok()) {
+        assert(cblock->compressionType() == kNoCompression ||
+               block_cache_compressed != nullptr);
+
+        // Retrieve the uncompressed contents into a new buffer
+        BlockContents contents;
+        if (cblock->compressionType() != kNoCompression) {
+          s = UncompressBlockContents(cblock->data(), cblock->size(),
+                                      &contents);
+        }
+        if (s.ok()) {
+          if (cblock->compressionType() != kNoCompression) {
+            block = new Block(contents); // uncompressed block
+          } else {
+            block = cblock;
+            cblock = nullptr;
+          }
+          if (block->isCachable() && options.fill_cache) {
+            // Insert compressed block into compressed block cache.
+            // Release the hold on the compressed cache entry immediately.
+            if (block_cache_compressed != nullptr && cblock != nullptr) {
+              compressed_cache_handle = block_cache_compressed->Insert(
+                          ckey, cblock, cblock->size(), &DeleteCachedBlock);
+              block_cache_compressed->Release(compressed_cache_handle);
+              RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
+              cblock = nullptr;
+            }
+            // insert into uncompressed block cache
+            assert((block->compressionType() == kNoCompression));
+            if (block_cache != nullptr) {
+              cache_handle = block_cache->Insert(
+                key, block, block->size(), &DeleteCachedBlock);
+              RecordTick(statistics, BLOCK_CACHE_ADD);
+              assert(reinterpret_cast<Block*>(block_cache->Value(
+                     cache_handle))== block);
+            }
+          }
+        }
+      }
+      // cblock is still owned here only if it was never handed to a cache.
+      if (cblock != nullptr) {
+        delete cblock;
+      }
+    }
+  } else if (no_io) {
+    // Could not read from block_cache and can't do IO
+    return NewErrorIterator(Status::Incomplete("no blocking io"));
+  } else {
+    s = ReadBlockFromFile(
+        table->rep_->file.get(),
+        options,
+        handle,
+        &block,
+        table->rep_->options.env,
+        didIO
+      );
+  }
+
+  Iterator* iter;
+  if (block != nullptr) {
+    iter = block->NewIterator(table->rep_->options.comparator);
+    if (cache_handle != nullptr) {
+      // Cache-owned block: the iterator drops the pin when destroyed.
+      iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
+    } else {
+      // Locally-owned block: the iterator deletes it when destroyed.
+      iter->RegisterCleanup(&DeleteBlock, block, nullptr);
+    }
+  } else {
+    iter = NewErrorIterator(s);
+  }
+  return iter;
+}
+
+BlockBasedTable::CachableEntry<FilterBlockReader>
+BlockBasedTable::GetFilter(bool no_io) const {
+  if (!rep_->options.filter_policy || !rep_->options.block_cache) {
+    return {rep_->filter.get(), nullptr};
+  }
+
+  // Fetching from the cache
+  Cache* block_cache = rep_->options.block_cache.get();
+  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+  auto key = GetCacheKey(
+      rep_->cache_key_prefix,
+      rep_->cache_key_prefix_size,
+      rep_->metaindex_handle,
+      cache_key
+  );
+
+  Statistics* statistics = rep_->options.statistics.get();
+  auto cache_handle = GetFromBlockCache(
+    block_cache,
+    key,
+    BLOCK_CACHE_FILTER_MISS,
+    BLOCK_CACHE_FILTER_HIT,
+    statistics
+  );
+
+  FilterBlockReader* filter = nullptr;
+  if (cache_handle != nullptr) {
+     filter = reinterpret_cast<FilterBlockReader*>(
+         block_cache->Value(cache_handle));
+  } else if (no_io) {
+    // Do not invoke any io.
+    return CachableEntry<FilterBlockReader>();
+  } else {
+    size_t filter_size = 0;
+    std::unique_ptr<Block> meta;
+    std::unique_ptr<Iterator> iter;
+    auto s = ReadMetaBlock(rep_, &meta, &iter);
+
+    if (s.ok()) {
+      std::string filter_block_key = kFilterBlockPrefix;
+      filter_block_key.append(rep_->options.filter_policy->Name());
+      iter->Seek(filter_block_key);
+
+      if (iter->Valid() && iter->key() == Slice(filter_block_key)) {
+        filter = ReadFilter(iter->value(), rep_, &filter_size);
+        assert(filter);
+        assert(filter_size > 0);
+
+        cache_handle = block_cache->Insert(
+          key, filter, filter_size, &DeleteCachedFilter);
+        RecordTick(statistics, BLOCK_CACHE_ADD);
+      }
+    }
+  }
+
+  return { filter, cache_handle };
+}
+
+// Get the iterator from the index block.
+Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const {
+  if (rep_->index_block) {
+    assert (!rep_->options.block_cache);
+    return rep_->index_block->NewIterator(rep_->options.comparator);
+  }
+
+  // get index block from cache
+  assert (rep_->options.block_cache);
+  bool didIO = false;
+  CachableEntry<Block> entry;
+
+  auto s = GetBlock(
+      this,
+      rep_->index_handle,
+      options,
+      false,  /* for compaction */
+      BLOCK_CACHE_INDEX_MISS,
+      BLOCK_CACHE_INDEX_HIT,
+      &didIO,
+      &entry
+  );
+
+  Iterator* iter;
+  if (entry.value != nullptr) {
+    iter = entry.value->NewIterator(rep_->options.comparator);
+    if (entry.cache_handle) {
+      iter->RegisterCleanup(
+          &ReleaseBlock, rep_->options.block_cache.get(), entry.cache_handle
+      );
+    } else {
+      iter->RegisterCleanup(&DeleteBlock, entry.value, nullptr);
+    }
+  } else {
+    iter = NewErrorIterator(s);
+  }
+  return iter;
+}
+
+// Two-level-iterator callback variant: forwards to the main BlockReader()
+// with no didIO reporting.  @soptions is accepted for signature
+// compatibility only and is unused here.
+Iterator* BlockBasedTable::BlockReader(void* arg,
+                                       const ReadOptions& options,
+                                       const EnvOptions& soptions,
+                                       const Slice& index_value,
+                                       bool for_compaction) {
+  return BlockReader(arg, options, index_value, nullptr, for_compaction);
+}
+
+// This will be broken if the user specifies an unusual implementation
+// of Options.comparator, or if the user specifies an unusual
+// definition of prefixes in Options.filter_policy.  In particular, we
+// require the following three properties:
+//
+// 1) key.starts_with(prefix(key))
+// 2) Compare(prefix(key), key) <= 0.
+// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
+//
+// Otherwise, this method guarantees no I/O will be incurred.
+//
+// REQUIRES: this method shouldn't be called while the DB lock is held.
+bool BlockBasedTable::PrefixMayMatch(const Slice& internal_prefix) {
+  bool may_match = true;
+  Status s;
+
+  // Without a filter policy there is nothing to rule keys out with.
+  if (!rep_->options.filter_policy) {
+    return true;
+  }
+
+  // To prevent any io operation in this method, we set `read_tier` to make
+  // sure we always read index or filter only when they have already been
+  // loaded to memory.
+  ReadOptions no_io_read_options;
+  no_io_read_options.read_tier = kBlockCacheTier;
+  unique_ptr<Iterator> iiter(
+      IndexBlockReader(no_io_read_options)
+  );
+  iiter->Seek(internal_prefix);
+
+  if (!iiter->Valid()) {
+    // we're past end of file
+    // if it's incomplete, it means that we avoided I/O
+    // and we're not really sure that we're past the end
+    // of the file
+    may_match = iiter->status().IsIncomplete();
+  } else if (ExtractUserKey(iiter->key()).starts_with(
+              ExtractUserKey(internal_prefix))) {
+    // we need to check for this subtle case because our only
+    // guarantee is that "the key is a string >= last key in that data
+    // block" according to the doc/table_format.txt spec.
+    //
+    // Suppose iiter->key() starts with the desired prefix; it is not
+    // necessarily the case that the corresponding data block will
+    // contain the prefix, since iiter->key() need not be in the
+    // block.  However, the next data block may contain the prefix, so
+    // we return true to play it safe.
+    may_match = true;
+  } else {
+    // iiter->key() does NOT start with the desired prefix.  Because
+    // Seek() finds the first key that is >= the seek target, this
+    // means that iiter->key() > prefix.  Thus, any data blocks coming
+    // after the data block corresponding to iiter->key() cannot
+    // possibly contain the key.  Thus, the corresponding data block
+    // is the only one which could potentially contain the prefix.
+    Slice handle_value = iiter->value();
+    BlockHandle handle;
+    s = handle.DecodeFrom(&handle_value);
+    assert(s.ok());
+    // no_io: only consult a filter that is already resident in memory.
+    auto filter_entry = GetFilter(true /* no io */);
+    may_match =
+      filter_entry.value == nullptr ||
+      filter_entry.value->PrefixMayMatch(handle.offset(), internal_prefix);
+    filter_entry.Release(rep_->options.block_cache.get());
+  }
+
+  Statistics* statistics = rep_->options.statistics.get();
+  RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
+  if (!may_match) {
+    RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
+  }
+
+  return may_match;
+}
+
+// Returns a two-level iterator over the whole table (index block -> data
+// blocks).  When the caller supplies a prefix in `options`, the filter block
+// is consulted first via PrefixMayMatch() so we can skip this file entirely
+// without issuing any I/O.
+Iterator* BlockBasedTable::NewIterator(const ReadOptions& options) {
+  if (options.prefix) {
+    InternalKey internal_prefix(*options.prefix, 0, kTypeValue);
+    if (!PrefixMayMatch(internal_prefix.Encode())) {
+      // nothing in this file can match the prefix, so we should not
+      // bother doing I/O to this file when iterating.
+      return NewEmptyIterator();
+    }
+  }
+
+  return NewTwoLevelIterator(
+           IndexBlockReader(options),
+           &BlockBasedTable::BlockReader,
+           const_cast<BlockBasedTable*>(this),
+           options,
+           rep_->soptions
+         );
+}
+
+// Looks up `key`: seeks the index block, consults the filter (if any) for
+// each candidate data block, then scans entries of that block, invoking
+// (*result_handler)() on each until it returns false.
+// `mark_key_may_exist_handler` is invoked instead when the no-I/O read tier
+// prevents us from proving the key absent.
+Status BlockBasedTable::Get(
+    const ReadOptions& readOptions,
+    const Slice& key,
+    void* handle_context,
+    bool (*result_handler)(void* handle_context, const Slice& k,
+                           const Slice& v, bool didIO),
+    void (*mark_key_may_exist_handler)(void* handle_context)) {
+  Status s;
+  Iterator* iiter = IndexBlockReader(readOptions);
+  // Only restrict filter lookup to cache when the caller forbids I/O.
+  auto filter_entry = GetFilter(readOptions.read_tier == kBlockCacheTier);
+  FilterBlockReader* filter = filter_entry.value;
+  bool done = false;
+  for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
+    Slice handle_value = iiter->value();
+
+    BlockHandle handle;
+    bool may_not_exist_in_filter =
+      filter != nullptr &&
+      handle.DecodeFrom(&handle_value).ok() &&
+      !filter->KeyMayMatch(handle.offset(), key);
+
+    if (may_not_exist_in_filter) {
+      // Not found
+      // TODO: think about interaction with Merge. If a user key cannot
+      // cross one data block, we should be fine.
+      RecordTick(rep_->options.statistics.get(), BLOOM_FILTER_USEFUL);
+      break;
+    } else {
+      bool didIO = false;
+      unique_ptr<Iterator> block_iter(
+        BlockReader(this, readOptions, iiter->value(), &didIO));
+
+      // A non-zero read_tier here means kBlockCacheTier (no I/O allowed).
+      if (readOptions.read_tier && block_iter->status().IsIncomplete()) {
+        // couldn't get block from block_cache
+        // Update Saver.state to Found because we are only looking for whether
+        // we can guarantee the key is not there when "no_io" is set
+        (*mark_key_may_exist_handler)(handle_context);
+        break;
+      }
+
+      // Call the *saver function on each entry/block until it returns false
+      for (block_iter->Seek(key); block_iter->Valid(); block_iter->Next()) {
+        if (!(*result_handler)(handle_context, block_iter->key(),
+                               block_iter->value(), didIO)) {
+          done = true;
+          break;
+        }
+      }
+      s = block_iter->status();
+    }
+  }
+
+  // Release the cached filter handle before returning.
+  filter_entry.Release(rep_->options.block_cache.get());
+  if (s.ok()) {
+    s = iiter->status();
+  }
+  delete iiter;
+  return s;
+}
+
+// Get() handler used by TEST_KeyInCache below: records whether the lookup
+// performed real I/O into *arg and stops after the first entry.
+// NOTE(review): file-local helper; consider `static` / anonymous namespace
+// to avoid exporting the symbol -- confirm no other TU relies on it.
+bool SaveDidIO(void* arg, const Slice& key, const Slice& value, bool didIO) {
+  *reinterpret_cast<bool*>(arg) = didIO;
+  return false;
+}
+// Test hook: returns true iff looking up `key` required no disk I/O,
+// i.e. its data block was already resident in the block cache.
+// NOTE(review): relies on the defaulted (nullptr) mark_key_may_exist_handler
+// of Get(); callers must not pass kBlockCacheTier options here -- confirm.
+bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
+                                      const Slice& key) {
+  // We use Get() as it has logic that checks whether we read the
+  // block from the disk or not.
+  bool didIO = false;
+  Status s = Get(options, key, &didIO, SaveDidIO);
+  assert(s.ok());
+  return !didIO;
+}
+
+// Returns an approximate file offset for `key` by decoding the handle of the
+// data block the index points at; falls back to the metaindex offset (close
+// to the file size) when the key is past the end or the handle is corrupt.
+uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) {
+  Iterator* index_iter = IndexBlockReader(ReadOptions());
+
+  index_iter->Seek(key);
+  uint64_t result;
+  if (index_iter->Valid()) {
+    BlockHandle handle;
+    Slice input = index_iter->value();
+    Status s = handle.DecodeFrom(&input);
+    if (s.ok()) {
+      result = handle.offset();
+    } else {
+      // Strange: we can't decode the block handle in the index block.
+      // We'll just return the offset of the metaindex block, which is
+      // close to the whole file size for this case.
+      result = rep_->metaindex_handle.offset();
+    }
+  } else {
+    // key is past the last key in the file.  Approximate the offset
+    // by returning the offset of the metaindex block (which is
+    // right near the end of the file).
+    result = rep_->metaindex_handle.offset();
+  }
+  delete index_iter;
+  return result;
+}
+
+// Persistent names of the meta blocks and of the table properties stored in
+// the SST file's properties block.  These strings are written into files on
+// disk, so they must never change.
+const std::string BlockBasedTable::kFilterBlockPrefix =
+    "filter.";
+const std::string BlockBasedTable::kPropertiesBlock =
+    "rocksdb.properties";
+const std::string BlockBasedTablePropertiesNames::kDataSize  =
+    "rocksdb.data.size";
+const std::string BlockBasedTablePropertiesNames::kIndexSize =
+    "rocksdb.index.size";
+const std::string BlockBasedTablePropertiesNames::kFilterSize =
+    "rocksdb.filter.size";
+const std::string BlockBasedTablePropertiesNames::kRawKeySize =
+    "rocksdb.raw.key.size";
+const std::string BlockBasedTablePropertiesNames::kRawValueSize =
+    "rocksdb.raw.value.size";
+const std::string BlockBasedTablePropertiesNames::kNumDataBlocks =
+    "rocksdb.num.data.blocks";
+const std::string BlockBasedTablePropertiesNames::kNumEntries =
+    "rocksdb.num.entries";
+const std::string BlockBasedTablePropertiesNames::kFilterPolicy =
+    "rocksdb.filter.policy";
+
+}  // namespace rocksdb
diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h
new file mode 100644 (file)
index 0000000..66f63fc
--- /dev/null
@@ -0,0 +1,195 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/table.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+class Block;
+class BlockHandle;
+class Footer;
+struct Options;
+class RandomAccessFile;
+struct ReadOptions;
+class TableCache;
+class TableReader;
+class FilterBlockReader;
+
+using std::unique_ptr;
+
+// A Table is a sorted map from strings to strings.  Tables are
+// immutable and persistent.  A Table may be safely accessed from
+// multiple threads without external synchronization.
+class BlockBasedTable : public TableReader {
+ public:
+  static const std::string kFilterBlockPrefix;
+  static const std::string kPropertiesBlock;
+
+  // Attempt to open the table that is stored in bytes [0..file_size)
+  // of "file", and read the metadata entries necessary to allow
+  // retrieving data from the table.
+  //
+  // If successful, returns ok and sets "*table_reader" to the newly opened
+  // table.  The client should delete "*table_reader" when no longer needed.
+  // If there was an error while initializing the table, sets "*table_reader"
+  // to nullptr and returns a non-ok status.
+  //
+  // *file must remain live while this Table is in use.
+  static Status Open(const Options& options,
+                     const EnvOptions& soptions,
+                     unique_ptr<RandomAccessFile>&& file,
+                     uint64_t file_size,
+                     unique_ptr<TableReader>* table_reader);
+
+  // Returns true if any key in the file may start with `internal_prefix`;
+  // false means no entry with that prefix exists (filter-backed).
+  bool PrefixMayMatch(const Slice& internal_prefix) override;
+
+  // Returns a new iterator over the table contents.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  Iterator* NewIterator(const ReadOptions&) override;
+
+  Status Get(
+        const ReadOptions& readOptions,
+        const Slice& key,
+        void* handle_context,
+        bool (*result_handler)(void* handle_context, const Slice& k,
+                               const Slice& v, bool didIO),
+        void (*mark_key_may_exist_handler)(void* handle_context) = nullptr)
+    override;
+
+  // Given a key, return an approximate byte offset in the file where
+  // the data for that key begins (or would begin if the key were
+  // present in the file).  The returned value is in terms of file
+  // bytes, and so includes effects like compression of the underlying data.
+  // E.g., the approximate offset of the last key in the table will
+  // be close to the file length.
+  uint64_t ApproximateOffsetOf(const Slice& key) override;
+
+  // Returns true if the block for the specified key is in cache.
+  // REQUIRES: key is in this table.
+  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override;
+
+  // Set up the table for Compaction. Might change some parameters with
+  // posix_fadvise
+  void SetupForCompaction() override;
+
+  TableProperties& GetTableProperties() override;
+
+  ~BlockBasedTable();
+
+ private:
+  // Pairs a cached object with its cache handle; defined in the .cc file.
+  template <class TValue>
+  struct CachableEntry;
+
+  // Opaque implementation state (file, handles, options); owned by *this.
+  struct Rep;
+  Rep* rep_;
+  // True once SetupForCompaction() has tuned access-pattern parameters.
+  bool compaction_optimized_;
+
+  static Iterator* BlockReader(void*, const ReadOptions&,
+                               const EnvOptions& soptions, const Slice&,
+                               bool for_compaction);
+
+  static Iterator* BlockReader(void*, const ReadOptions&, const Slice&,
+                               bool* didIO, bool for_compaction = false);
+
+  // if `no_io == true`, we will not try to read filter from sst file
+  // if it is not cached yet.
+  CachableEntry<FilterBlockReader> GetFilter(bool no_io = false) const;
+
+  // Returns an iterator over the table's index block.
+  Iterator* IndexBlockReader(const ReadOptions& options) const;
+
+  // Read the block, either from sst file or from cache. This method will try
+  // to read from cache only when block_cache is set or ReadOption doesn't
+  // explicitly prohibit storage IO.
+  //
+  // If the block is read from cache, the statistics for cache miss/hit of the
+  // the given type of block will be updated. User can specify
+  // `block_cache_miss_ticker` and `block_cache_hit_ticker` for the statistics
+  // update.
+  //
+  // On success, the `result` parameter will be populated, which contains a
+  // pointer to the block and its cache handle, which will be nullptr if it's
+  // not read from the cache.
+  static Status GetBlock(const BlockBasedTable* table,
+                         const BlockHandle& handle,
+                         const ReadOptions& options,
+                         bool for_compaction,
+                         Tickers block_cache_miss_ticker,
+                         Tickers block_cache_hit_ticker,
+                         bool* didIO,
+                         CachableEntry<Block>* result);
+
+  // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
+  // after a call to Seek(key), until handle_result returns false.
+  // May not make such a call if filter policy says that key is not present.
+  friend class TableCache;
+  friend class BlockBasedTableBuilder;
+
+  void ReadMeta(const Footer& footer);
+  void ReadFilter(const Slice& filter_handle_value);
+  static Status ReadProperties(const Slice& handle_value, Rep* rep);
+
+  // Read the meta block from sst.
+  static Status ReadMetaBlock(
+      Rep* rep,
+      std::unique_ptr<Block>* meta_block,
+      std::unique_ptr<Iterator>* iter);
+
+  // Create the filter from the filter block.
+  static FilterBlockReader* ReadFilter(
+      const Slice& filter_handle_value,
+      Rep* rep,
+      size_t* filter_size = nullptr);
+
+  // Read the table properties from properties block.
+  static Status ReadProperties(
+      const Slice& handle_value, Rep* rep, TableProperties* properties);
+
+  static void SetupCacheKeyPrefix(Rep* rep);
+
+  explicit BlockBasedTable(Rep* rep) :
+      compaction_optimized_(false) {
+    rep_ = rep;
+  }
+  // Generate a cache key prefix from the file
+  static void GenerateCachePrefix(Cache* cc,
+    RandomAccessFile* file, char* buffer, size_t* size);
+  static void GenerateCachePrefix(Cache* cc,
+    WritableFile* file, char* buffer, size_t* size);
+
+  // The longest prefix of the cache key used to identify blocks.
+  // For Posix files the unique ID is three varints.
+  static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1;
+
+  // No copying allowed
+  // NOTE(review): these declarations take `const TableReader&`, not
+  // `const BlockBasedTable&`, so the implicit copy constructor and copy
+  // assignment of BlockBasedTable are NOT actually suppressed -- verify
+  // intent and consider deleting the proper signatures.
+  explicit BlockBasedTable(const TableReader&) = delete;
+  void operator=(const TableReader&) = delete;
+};
+
+// Keys under which table statistics are stored in the properties block.
+// The string values are defined in block_based_table_reader.cc and are part
+// of the on-disk format.
+struct BlockBasedTablePropertiesNames {
+  static const std::string kDataSize;
+  static const std::string kIndexSize;
+  static const std::string kFilterSize;
+  static const std::string kRawKeySize;
+  static const std::string kRawValueSize;
+  static const std::string kNumDataBlocks;
+  static const std::string kNumEntries;
+  static const std::string kFilterPolicy;
+};
+
+}  // namespace rocksdb
diff --git a/table/block_builder.cc b/table/block_builder.cc
new file mode 100644 (file)
index 0000000..9176018
--- /dev/null
@@ -0,0 +1,134 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// BlockBuilder generates blocks where keys are prefix-compressed:
+//
+// When we store a key, we drop the prefix shared with the previous
+// string.  This helps reduce the space requirement significantly.
+// Furthermore, once every K keys, we do not apply the prefix
+// compression and store the entire key.  We call this a "restart
+// point".  The tail end of the block stores the offsets of all of the
+// restart points, and can be used to do a binary search when looking
+// for a particular key.  Values are stored as-is (without compression)
+// immediately following the corresponding key.
+//
+// An entry for a particular key-value pair has the form:
+//     shared_bytes: varint32
+//     unshared_bytes: varint32
+//     value_length: varint32
+//     key_delta: char[unshared_bytes]
+//     value: char[value_length]
+// shared_bytes == 0 for restart points.
+//
+// The trailer of the block has the form:
+//     restarts: uint32[num_restarts]
+//     num_restarts: uint32
+// restarts[i] contains the offset within the block of the ith restart point.
+
+#include "table/block_builder.h"
+
+#include <algorithm>
+#include <assert.h>
+#include "rocksdb/comparator.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+// Builds an empty block that restarts prefix compression every
+// `block_restart_interval` keys, ordered by `comparator`.
+BlockBuilder::BlockBuilder(int block_restart_interval,
+                           const Comparator* comparator)
+    : block_restart_interval_(block_restart_interval),
+      comparator_(comparator),
+      restarts_(),
+      counter_(0),
+      finished_(false) {
+  assert(block_restart_interval_ >= 1);
+  restarts_.push_back(0);       // First restart point is at offset 0
+}
+
+// Convenience constructor: pulls the restart interval and comparator
+// out of `options` and delegates to the primary constructor.
+BlockBuilder::BlockBuilder(const Options& options)
+    : BlockBuilder(options.block_restart_interval, options.comparator) {
+}
+
+// Discards all buffered entries, returning the builder to its
+// freshly-constructed state so it can build another block.
+void BlockBuilder::Reset() {
+  buffer_.clear();
+  restarts_.clear();
+  restarts_.push_back(0);       // First restart point is at offset 0
+  counter_ = 0;
+  finished_ = false;
+  last_key_.clear();
+}
+
+// Size of the finished block if Finish() were called now: the entry data
+// already in buffer_ plus the restart-offset trailer.
+size_t BlockBuilder::CurrentSizeEstimate() const {
+  return (buffer_.size() +                        // Raw data buffer
+          restarts_.size() * sizeof(uint32_t) +   // Restart array
+          sizeof(uint32_t));                      // Restart array length
+}
+
+// Upper-bound estimate of the block size after adding (key, value).
+// Deliberately ignores prefix compression (assumes shared == 0) and uses
+// full fixed-width sizes for some varints, so it may overestimate slightly.
+size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value)
+  const {
+  size_t estimate = CurrentSizeEstimate();
+  estimate += key.size() + value.size();
+  if (counter_ >= block_restart_interval_) {
+    estimate += sizeof(uint32_t); // a new restart entry.
+  }
+
+  estimate += sizeof(int32_t); // varint for shared prefix length.
+  estimate += VarintLength(key.size()); // varint for key length.
+  estimate += VarintLength(value.size()); // varint for value length.
+
+  return estimate;
+}
+
+// Appends the restart-offset trailer (restarts[] then num_restarts, per the
+// block format described at the top of this file) and returns the complete
+// block.  The slice stays valid until Reset() or destruction.
+Slice BlockBuilder::Finish() {
+  // Append restart array
+  for (size_t i = 0; i < restarts_.size(); i++) {
+    PutFixed32(&buffer_, restarts_[i]);
+  }
+  PutFixed32(&buffer_, restarts_.size());
+  finished_ = true;
+  return Slice(buffer_);
+}
+
+// Appends one entry.  Keys must arrive in strictly increasing order (per
+// comparator_).  The key is stored as a delta against the previous key,
+// except every block_restart_interval_ entries where it is stored whole.
+void BlockBuilder::Add(const Slice& key, const Slice& value) {
+  Slice last_key_piece(last_key_);
+  assert(!finished_);
+  assert(counter_ <= block_restart_interval_);
+  assert(buffer_.empty() // No values yet?
+         || comparator_->Compare(key, last_key_piece) > 0);
+  size_t shared = 0;
+  if (counter_ < block_restart_interval_) {
+    // See how much sharing to do with previous string
+    const size_t min_length = std::min(last_key_piece.size(), key.size());
+    while ((shared < min_length) && (last_key_piece[shared] == key[shared])) {
+      shared++;
+    }
+  } else {
+    // Restart compression
+    restarts_.push_back(buffer_.size());
+    counter_ = 0;
+  }
+  const size_t non_shared = key.size() - shared;
+
+  // Add "<shared><non_shared><value_size>" to buffer_
+  PutVarint32(&buffer_, shared);
+  PutVarint32(&buffer_, non_shared);
+  PutVarint32(&buffer_, value.size());
+
+  // Add string delta to buffer_ followed by value
+  buffer_.append(key.data() + shared, non_shared);
+  buffer_.append(value.data(), value.size());
+
+  // Update state
+  last_key_.resize(shared);
+  last_key_.append(key.data() + shared, non_shared);
+  assert(Slice(last_key_) == key);
+  counter_++;
+}
+
+}  // namespace rocksdb
diff --git a/table/block_builder.h b/table/block_builder.h
new file mode 100644 (file)
index 0000000..31faf19
--- /dev/null
@@ -0,0 +1,65 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <vector>
+
+#include <stdint.h>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+struct Options;
+class Comparator;
+
+class BlockBuilder {
+ public:
+  BlockBuilder(int block_builder, const Comparator* comparator);
+  explicit BlockBuilder(const Options& options);
+
+  // Reset the contents as if the BlockBuilder was just constructed.
+  void Reset();
+
+  // REQUIRES: Finish() has not been callled since the last call to Reset().
+  // REQUIRES: key is larger than any previously added key
+  void Add(const Slice& key, const Slice& value);
+
+  // Finish building the block and return a slice that refers to the
+  // block contents.  The returned slice will remain valid for the
+  // lifetime of this builder or until Reset() is called.
+  Slice Finish();
+
+  // Returns an estimate of the current (uncompressed) size of the block
+  // we are building.
+  size_t CurrentSizeEstimate() const;
+
+  // Returns an estimated block size after appending key and value.
+  size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const;
+
+  // Return true iff no entries have been added since the last Reset()
+  bool empty() const {
+    return buffer_.empty();
+  }
+
+ private:
+  const int          block_restart_interval_;
+  const Comparator*  comparator_;
+
+  std::string           buffer_;    // Destination buffer
+  std::vector<uint32_t> restarts_;  // Restart points
+  int                   counter_;   // Number of entries emitted since restart
+  bool                  finished_;  // Has Finish() been called?
+  std::string           last_key_;
+
+  // No copying allowed
+  BlockBuilder(const BlockBuilder&);
+  void operator=(const BlockBuilder&);
+};
+
+}  // namespace rocksdb
diff --git a/table/block_test.cc b/table/block_test.cc
new file mode 100644 (file)
index 0000000..7f33e3a
--- /dev/null
@@ -0,0 +1,105 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <string>
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/table.h"
+#include "table/block.h"
+#include "table/block_builder.h"
+#include "table/format.h"
+#include "util/random.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+// Test helper: returns a pseudo-random string of `len` bytes drawn from rnd.
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+// Empty fixture class required by the TEST macro in util/testharness.
+class BlockTest {};
+
+// block test
+// Round-trip test: write 100k random KVs through BlockBuilder, then read the
+// serialized block back with Block and verify both sequential iteration and
+// random Seek()s return the original values.
+TEST(BlockTest, SimpleTest) {
+  Random rnd(301);
+  Options options = Options();
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  BlockBuilder builder(options);
+  int num_records = 100000;
+  char buf[10];
+  char* p = &buf[0];
+
+  // add a bunch of records to a block
+  for (int i = 0; i < num_records; i++) {
+    // generate random kvs
+    // "%6d" yields fixed-width, space-padded keys so they sort in insertion
+    // order (7 bytes incl. NUL -- fits in buf[10])
+    sprintf(p, "%6d", i);
+    std::string k(p);
+    std::string v = RandomString(&rnd, 100); // 100 byte values
+
+    // write kvs to the block
+    Slice key(k);
+    Slice value(v);
+    builder.Add(key, value);
+
+    // remember kvs in a lookaside array
+    keys.push_back(k);
+    values.push_back(v);
+  }
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  contents.cachable = false;
+  contents.heap_allocated = false;
+  Block reader(contents);
+
+  // read contents of block sequentially
+  int count = 0;
+  Iterator* iter = reader.NewIterator(options.comparator);
+  for (iter->SeekToFirst();iter->Valid(); count++, iter->Next()) {
+
+    // read kv from block
+    Slice k = iter->key();
+    Slice v = iter->value();
+
+    // compare with lookaside array
+    ASSERT_EQ(k.ToString().compare(keys[count]), 0);
+    ASSERT_EQ(v.ToString().compare(values[count]), 0);
+  }
+  delete iter;
+
+  // read block contents randomly
+  iter = reader.NewIterator(options.comparator);
+  for (int i = 0; i < num_records; i++) {
+
+    // find a random key in the lookaside array
+    int index = rnd.Uniform(num_records);
+    Slice k(keys[index]);
+
+    // search in block for this key
+    iter->Seek(k);
+    ASSERT_TRUE(iter->Valid());
+    Slice v = iter->value();
+    ASSERT_EQ(v.ToString().compare(values[index]), 0);
+  }
+  delete iter;
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/table/filter_block.cc b/table/filter_block.cc
new file mode 100644 (file)
index 0000000..82b6c6e
--- /dev/null
@@ -0,0 +1,187 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/filter_block.h"
+
+#include "db/dbformat.h"
+#include "rocksdb/filter_policy.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+// See doc/table_format.txt for an explanation of the filter block format.
+
+// Generate new filter every 2KB of data
+static const size_t kFilterBaseLg = 11;
+static const size_t kFilterBase = 1 << kFilterBaseLg;
+
+// Captures the filter policy, optional prefix extractor, whole-key-filtering
+// flag and comparator from `opt`; all must outlive this builder.
+FilterBlockBuilder::FilterBlockBuilder(const Options& opt)
+                 : policy_(opt.filter_policy),
+                   prefix_extractor_(opt.prefix_extractor),
+                   whole_key_filtering_(opt.whole_key_filtering),
+                   comparator_(opt.comparator){}
+
+// Called when a data block starting at `block_offset` begins.  Emits one
+// filter per kFilterBase (2KB) of file covered so far; block offsets must be
+// non-decreasing across calls (enforced by the assert).
+void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
+  uint64_t filter_index = (block_offset / kFilterBase);
+  assert(filter_index >= filter_offsets_.size());
+  while (filter_index > filter_offsets_.size()) {
+    GenerateFilter();
+  }
+}
+
+// Returns true iff key1 and key2 share the same extracted prefix.
+// Two out-of-domain keys are treated as "same"; a mixed pair is "different";
+// otherwise the transformed prefixes are compared.
+bool FilterBlockBuilder::SamePrefix(const Slice &key1,
+                                    const Slice &key2) const {
+  if (!prefix_extractor_->InDomain(key1) &&
+      !prefix_extractor_->InDomain(key2)) {
+    return true;
+  } else if (!prefix_extractor_->InDomain(key1) ||
+             !prefix_extractor_->InDomain(key2)) {
+    return false;
+  } else {
+    return (prefix_extractor_->Transform(key1) ==
+            prefix_extractor_->Transform(key2));
+  }
+}
+
+// Records `key` (an internal key) for the filter under construction: the
+// whole key when whole_key_filtering_ is on, and additionally the key's
+// prefix (re-encoded as an internal key) when a prefix extractor is set and
+// the prefix differs from the previously added entry's.
+void FilterBlockBuilder::AddKey(const Slice& key) {
+  // get slice for most recently added entry
+  Slice prev;
+  size_t added_to_start = 0;
+
+  // add key to filter if needed
+  if (whole_key_filtering_) {
+    start_.push_back(entries_.size());
+    ++added_to_start;
+    entries_.append(key.data(), key.size());
+  }
+
+  // `prev` is the entry added before this call (skipping the whole-key
+  // entry we may have just appended above).
+  if (start_.size() > added_to_start) {
+    size_t prev_start = start_[start_.size() - 1 - added_to_start];
+    const char* base = entries_.data() + prev_start;
+    size_t length = entries_.size() - prev_start;
+    prev = Slice(base, length);
+  }
+
+  // add prefix to filter if needed
+  if (prefix_extractor_ && prefix_extractor_->InDomain(ExtractUserKey(key))) {
+    // If prefix_extractor_, this filter_block layer assumes we only
+    // operate on internal keys.
+    Slice user_key = ExtractUserKey(key);
+    // this assumes prefix(prefix(key)) == prefix(key), as the last
+    // entry in entries_ may be either a key or prefix, and we use
+    // prefix(last entry) to get the prefix of the last key.
+    if (prev.size() == 0 ||
+        !SamePrefix(user_key, ExtractUserKey(prev))) {
+      Slice prefix = prefix_extractor_->Transform(user_key);
+      InternalKey internal_prefix_tmp(prefix, 0, kTypeValue);
+      Slice internal_prefix = internal_prefix_tmp.Encode();
+      assert(comparator_->Compare(internal_prefix, key) <= 0);
+      start_.push_back(entries_.size());
+      entries_.append(internal_prefix.data(), internal_prefix.size());
+    }
+  }
+}
+
+// Flushes any pending keys into a final filter, then appends the trailer:
+// the per-filter offset array, the offset of that array, and the kFilterBaseLg
+// encoding parameter (see doc/table_format.txt).
+Slice FilterBlockBuilder::Finish() {
+  if (!start_.empty()) {
+    GenerateFilter();
+  }
+
+  // Append array of per-filter offsets
+  const uint32_t array_offset = result_.size();
+  for (size_t i = 0; i < filter_offsets_.size(); i++) {
+    PutFixed32(&result_, filter_offsets_[i]);
+  }
+
+  PutFixed32(&result_, array_offset);
+  result_.push_back(kFilterBaseLg);  // Save encoding parameter in result
+  return Slice(result_);
+}
+
+// Converts the keys accumulated since the last call into one filter appended
+// to result_, recording its starting offset.  An empty key set still records
+// an offset (pointing at the current end) so the offset array stays aligned
+// with filter indices.
+void FilterBlockBuilder::GenerateFilter() {
+  const size_t num_entries = start_.size();
+  if (num_entries == 0) {
+    // Fast path if there are no keys for this filter
+    filter_offsets_.push_back(result_.size());
+    return;
+  }
+
+  // Make list of keys from flattened key structure
+  start_.push_back(entries_.size());  // Simplify length computation
+  tmp_entries_.resize(num_entries);
+  for (size_t i = 0; i < num_entries; i++) {
+    const char* base = entries_.data() + start_[i];
+    size_t length = start_[i+1] - start_[i];
+    tmp_entries_[i] = Slice(base, length);
+  }
+
+  // Generate filter for current set of keys and append to result_.
+  filter_offsets_.push_back(result_.size());
+  policy_->CreateFilter(&tmp_entries_[0], num_entries, &result_);
+
+  tmp_entries_.clear();
+  entries_.clear();
+  start_.clear();
+}
+
+// Parses a serialized filter block: [filters][offset array][array offset:
+// fixed32][base_lg: 1 byte].  On any malformed input (too short, or the
+// offset-array pointer out of range) the reader is left with num_ == 0,
+// which makes every MayMatch() query return true (safe over-approximation).
+FilterBlockReader::FilterBlockReader(
+    const Options& opt, const Slice& contents, bool delete_contents_after_use)
+    : policy_(opt.filter_policy),
+      prefix_extractor_(opt.prefix_extractor),
+      whole_key_filtering_(opt.whole_key_filtering),
+      data_(nullptr),
+      offset_(nullptr),
+      num_(0),
+      base_lg_(0) {
+  size_t n = contents.size();
+  if (n < 5) return;  // 1 byte for base_lg_ and 4 for start of offset array
+  base_lg_ = contents[n-1];
+  uint32_t last_word = DecodeFixed32(contents.data() + n - 5);
+  if (last_word > n - 5) return;
+  data_ = contents.data();
+  offset_ = data_ + last_word;
+  num_ = (n - 5 - last_word) / 4;
+  if (delete_contents_after_use) {
+    // Take ownership of the heap-allocated contents buffer.
+    filter_data.reset(contents.data());
+  }
+}
+
+// Whole-key filter probe for the data block starting at `block_offset`.
+// Trivially true when whole-key filtering was disabled at build time.
+bool FilterBlockReader::KeyMayMatch(uint64_t block_offset,
+                                    const Slice& key) {
+  if (!whole_key_filtering_) {
+    return true;
+  }
+  return MayMatch(block_offset, key);
+}
+
+// Prefix filter probe for the data block starting at `block_offset`.
+// Trivially true when no prefix extractor was configured at build time.
+bool FilterBlockReader::PrefixMayMatch(uint64_t block_offset,
+                                       const Slice& prefix) {
+  if (!prefix_extractor_) {
+    return true;
+  }
+  return MayMatch(block_offset, prefix);
+}
+
+// Locates the filter covering `block_offset` (index = offset >> base_lg_)
+// via the offset array and queries the policy.  Out-of-range indices and
+// corrupt offset pairs are treated as potential matches; an empty filter
+// slot definitively matches nothing.
+bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) {
+  uint64_t index = block_offset >> base_lg_;
+  if (index < num_) {
+    uint32_t start = DecodeFixed32(offset_ + index*4);
+    uint32_t limit = DecodeFixed32(offset_ + index*4 + 4);
+    if (start <= limit && limit <= (offset_ - data_)) {
+      Slice filter = Slice(data_ + start, limit - start);
+      return policy_->KeyMayMatch(entry, filter);
+    } else if (start == limit) {
+      // Empty filters do not match any entries
+      return false;
+    }
+  }
+  return true;  // Errors are treated as potential matches
+}
+
+}  // namespace rocksdb
diff --git a/table/filter_block.h b/table/filter_block.h
new file mode 100644 (file)
index 0000000..e47f946
--- /dev/null
@@ -0,0 +1,88 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A filter block is stored near the end of a Table file.  It contains
+// filters (e.g., bloom filters) for all data blocks in the table combined
+// into a single filter block.
+
+#pragma once
+
+#include <memory>
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+#include <vector>
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+class FilterPolicy;
+
+// A FilterBlockBuilder is used to construct all of the filters for a
+// particular Table.  It generates a single string which is stored as
+// a special block in the Table.
+//
+// The sequence of calls to FilterBlockBuilder must match the regexp:
+//      (StartBlock AddKey*)* Finish
+class FilterBlockBuilder {
+ public:
+  explicit FilterBlockBuilder(const Options& opt);
+
+  // Marks the start of a data block at `block_offset`; offsets must be
+  // non-decreasing across calls.
+  void StartBlock(uint64_t block_offset);
+  // Adds an internal key (and, if configured, its prefix) to the current
+  // filter.
+  void AddKey(const Slice& key);
+  // Finalizes and returns the complete filter block contents.
+  Slice Finish();
+
+ private:
+  bool SamePrefix(const Slice &key1, const Slice &key2) const;
+  void GenerateFilter();
+
+  const FilterPolicy* policy_;
+  const SliceTransform* prefix_extractor_;
+  bool whole_key_filtering_;
+  const Comparator* comparator_;
+
+  std::string entries_;         // Flattened entry contents
+  std::vector<size_t> start_;   // Starting index in entries_ of each entry
+  std::string result_;          // Filter data computed so far
+  std::vector<Slice> tmp_entries_; // policy_->CreateFilter() argument
+  std::vector<uint32_t> filter_offsets_;
+
+  // No copying allowed
+  FilterBlockBuilder(const FilterBlockBuilder&);
+  void operator=(const FilterBlockBuilder&);
+};
+
+class FilterBlockReader {
+ public:
+ // REQUIRES: "contents" and *policy must stay live while *this is live.
+  FilterBlockReader(
+    const Options& opt,
+    const Slice& contents,
+    bool delete_contents_after_use = false);
+  bool KeyMayMatch(uint64_t block_offset, const Slice& key);
+  bool PrefixMayMatch(uint64_t block_offset, const Slice& prefix);
+
+ private:
+  const FilterPolicy* policy_;
+  const SliceTransform* prefix_extractor_;
+  bool whole_key_filtering_;
+  const char* data_;    // Pointer to filter data (at block-start)
+  const char* offset_;  // Pointer to beginning of offset array (at block-end)
+  size_t num_;          // Number of entries in offset array
+  size_t base_lg_;      // Encoding parameter (see kFilterBaseLg in .cc file)
+  // Owns the contents buffer when delete_contents_after_use was true.
+  // NOTE(review): missing trailing underscore, unlike the other members;
+  // renaming would require touching the .cc as well.
+  std::unique_ptr<const char[]> filter_data;
+
+
+  bool MayMatch(uint64_t block_offset, const Slice& entry);
+};
+
+}  // namespace rocksdb
diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc
new file mode 100644 (file)
index 0000000..bc1a0d0
--- /dev/null
@@ -0,0 +1,139 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/filter_block.h"
+
+#include "rocksdb/filter_policy.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+// For testing: emit an array with one hash value per key
+class TestHashFilter : public FilterPolicy {
+ public:
+  virtual const char* Name() const {
+    return "TestHashFilter";
+  }
+
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+    for (int i = 0; i < n; i++) {
+      uint32_t h = Hash(keys[i].data(), keys[i].size(), 1);
+      PutFixed32(dst, h);
+    }
+  }
+
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+    uint32_t h = Hash(key.data(), key.size(), 1);
+    for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) {
+      if (h == DecodeFixed32(filter.data() + i)) {
+        return true;
+      }
+    }
+    return false;
+  }
+};
+
+// Test fixture: an Options instance wired to a TestHashFilter policy.
+class FilterBlockTest {
+ public:
+  TestHashFilter policy_;
+  Options options_;
+
+  FilterBlockTest() {
+    // options_ is already default-constructed as a member; no need to
+    // reassign a fresh Options() before setting the filter policy.
+    options_.filter_policy = &policy_;
+  }
+};
+
+TEST(FilterBlockTest, EmptyBuilder) {
+  FilterBlockBuilder builder(options_);
+  Slice block = builder.Finish();
+  // An empty filter block is just the footer: a 4-byte offset-array start
+  // of 0 plus the 1-byte base-lg parameter (0x0b == 11).
+  ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block));
+  FilterBlockReader reader(options_, block);
+  // With no filters recorded, lookups conservatively return true.
+  ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
+  ASSERT_TRUE(reader.KeyMayMatch(100000, "foo"));
+}
+
+TEST(FilterBlockTest, SingleChunk) {
+  FilterBlockBuilder builder(options_);
+  // Offsets 100..300 are close enough that they all land in one filter,
+  // so one filter covers every key added below.
+  builder.StartBlock(100);
+  builder.AddKey("foo");
+  builder.AddKey("bar");
+  builder.AddKey("box");
+  builder.StartBlock(200);
+  builder.AddKey("box");
+  builder.StartBlock(300);
+  builder.AddKey("hello");
+  Slice block = builder.Finish();
+  FilterBlockReader reader(options_, block);
+  ASSERT_TRUE(reader.KeyMayMatch(100, "foo"));
+  ASSERT_TRUE(reader.KeyMayMatch(100, "bar"));
+  ASSERT_TRUE(reader.KeyMayMatch(100, "box"));
+  ASSERT_TRUE(reader.KeyMayMatch(100, "hello"));
+  // Re-query a key already checked above.
+  ASSERT_TRUE(reader.KeyMayMatch(100, "foo"));
+  // Keys never added must miss under the exact-hash test policy.
+  ASSERT_TRUE(! reader.KeyMayMatch(100, "missing"));
+  ASSERT_TRUE(! reader.KeyMayMatch(100, "other"));
+}
+
+// Verifies that keys are matched only against the filter covering the
+// queried block offset once multiple filters have been generated.
+TEST(FilterBlockTest, MultiChunk) {
+  FilterBlockBuilder builder(options_);
+
+  // First filter
+  builder.StartBlock(0);
+  builder.AddKey("foo");
+  builder.StartBlock(2000);
+  builder.AddKey("bar");
+
+  // Second filter
+  builder.StartBlock(3100);
+  builder.AddKey("box");
+
+  // Third filter is empty
+
+  // Last filter
+  builder.StartBlock(9000);
+  builder.AddKey("box");
+  builder.AddKey("hello");
+
+  Slice block = builder.Finish();
+  FilterBlockReader reader(options_, block);
+
+  // Check first filter
+  ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
+  ASSERT_TRUE(reader.KeyMayMatch(2000, "bar"));
+  ASSERT_TRUE(! reader.KeyMayMatch(0, "box"));
+  ASSERT_TRUE(! reader.KeyMayMatch(0, "hello"));
+
+  // Check second filter
+  ASSERT_TRUE(reader.KeyMayMatch(3100, "box"));
+  ASSERT_TRUE(! reader.KeyMayMatch(3100, "foo"));
+  ASSERT_TRUE(! reader.KeyMayMatch(3100, "bar"));
+  ASSERT_TRUE(! reader.KeyMayMatch(3100, "hello"));
+
+  // Check third filter (empty) — nothing should ever match here.
+  ASSERT_TRUE(! reader.KeyMayMatch(4100, "foo"));
+  ASSERT_TRUE(! reader.KeyMayMatch(4100, "bar"));
+  ASSERT_TRUE(! reader.KeyMayMatch(4100, "box"));
+  ASSERT_TRUE(! reader.KeyMayMatch(4100, "hello"));
+
+  // Check last filter
+  ASSERT_TRUE(reader.KeyMayMatch(9000, "box"));
+  ASSERT_TRUE(reader.KeyMayMatch(9000, "hello"));
+  ASSERT_TRUE(! reader.KeyMayMatch(9000, "foo"));
+  ASSERT_TRUE(! reader.KeyMayMatch(9000, "bar"));
+}
+
+}  // namespace rocksdb
+
+// Entry point: delegate to the shared test harness runner.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/table/flush_block_policy.cc b/table/flush_block_policy.cc
new file mode 100644 (file)
index 0000000..a953a78
--- /dev/null
@@ -0,0 +1,70 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/slice.h"
+#include "table/block_builder.h"
+
+#include <cassert>
+
+namespace rocksdb {
+
+// Flush block by size: signals a flush once the data block under
+// construction is (or is about to become) larger than the target size.
+class FlushBlockBySizePolicy : public FlushBlockPolicy {
+ public:
+  // @params block_size:           Approximate size of user data packed per
+  //                               block.
+  // @params block_size_deviation: This is used to close a block before it
+  //                               reaches the configured block_size: when
+  //                               the estimated size after appending the
+  //                               next entry would exceed block_size and
+  //                               the current size is already within
+  //                               (100 - deviation)% of block_size, the
+  //                               block is flushed early (see
+  //                               BlockAlmostFull below).
+  FlushBlockBySizePolicy(const uint64_t block_size,
+                         const uint64_t block_size_deviation,
+                         const BlockBuilder& data_block_builder) :
+      block_size_(block_size),
+      block_size_deviation_(block_size_deviation),
+      data_block_builder_(data_block_builder) {
+  }
+
+  // Returns true if the current block should be flushed before appending
+  // the given key/value pair.
+  virtual bool Update(const Slice& key,
+                      const Slice& value) override {
+    // it makes no sense to flush when the data block is empty
+    if (data_block_builder_.empty()) {
+      return false;
+    }
+
+    auto curr_size = data_block_builder_.CurrentSizeEstimate();
+
+    // Do flush if one of the below two conditions is true:
+    // 1) if the current estimated size already exceeds the block size,
+    // 2) block_size_deviation is set and the estimated size after appending
+    // the kv will exceed the block size and the current size is under the
+    // the deviation.
+    return curr_size >= block_size_ || BlockAlmostFull(key, value);
+  }
+
+ private:
+  // Condition 2 above: appending (key, value) would overflow block_size_
+  // while the block is already nearly full (within the deviation band).
+  bool BlockAlmostFull(const Slice& key, const Slice& value) const {
+    const auto curr_size = data_block_builder_.CurrentSizeEstimate();
+    const auto estimated_size_after =
+      data_block_builder_.EstimateSizeAfterKV(key, value);
+
+    return
+      estimated_size_after > block_size_ &&
+      block_size_deviation_ > 0 &&
+      curr_size * 100 > block_size_ * (100 - block_size_deviation_);
+  }
+
+  const uint64_t block_size_;
+  const uint64_t block_size_deviation_;
+  const BlockBuilder& data_block_builder_;  // observed, not owned
+};
+
+// Factory hook: creates a size-based flush policy bound to the given data
+// block builder.  The caller owns the returned object.
+FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+    const BlockBuilder& data_block_builder) const {
+  return new FlushBlockBySizePolicy(block_size_,
+                                    block_size_deviation_,
+                                    data_block_builder);
+}
+
+}  // namespace rocksdb
diff --git a/table/format.cc b/table/format.cc
new file mode 100644 (file)
index 0000000..ff6d8fa
--- /dev/null
@@ -0,0 +1,203 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/format.h"
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "table/block.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/perf_context_imp.h"
+
+namespace rocksdb {
+
+// Appends the varint64-encoded offset and size to *dst.
+void BlockHandle::EncodeTo(std::string* dst) const {
+  // Sanity check that all fields have been set (~0 is the "unset"
+  // sentinel installed by the default constructor).
+  assert(offset_ != ~static_cast<uint64_t>(0));
+  assert(size_ != ~static_cast<uint64_t>(0));
+  PutVarint64(dst, offset_);
+  PutVarint64(dst, size_);
+}
+
+// Parses offset and size from *input, consuming the decoded bytes.
+// Returns Corruption if either varint64 is missing or malformed.
+Status BlockHandle::DecodeFrom(Slice* input) {
+  if (!GetVarint64(input, &offset_) || !GetVarint64(input, &size_)) {
+    return Status::Corruption("bad block handle");
+  }
+  return Status::OK();
+}
+
+// Serializes the footer: two padded block handles followed by the 8-byte
+// magic number, exactly kEncodedLength bytes total.
+void Footer::EncodeTo(std::string* dst) const {
+#ifndef NDEBUG
+  const size_t original_size = dst->size();
+#endif
+  metaindex_handle_.EncodeTo(dst);
+  index_handle_.EncodeTo(dst);
+  // Pad the varint handles out to their maximum combined length.
+  // NOTE(review): resize() uses an absolute size, so this only works when
+  // *dst was empty on entry (the assert below also only holds for
+  // original_size == 0) — confirm callers always pass a fresh string.
+  dst->resize(2 * BlockHandle::kMaxEncodedLength);  // Padding
+  PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber & 0xffffffffu));
+  PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber >> 32));
+  assert(dst->size() == original_size + kEncodedLength);
+}
+
+// Parses a footer from *input (which must hold at least kEncodedLength
+// bytes starting at the footer).  On success, *input is advanced past the
+// footer contents.
+Status Footer::DecodeFrom(Slice* input) {
+  assert(input != nullptr);
+  assert(input->size() >= kEncodedLength);
+
+  // The magic number occupies the last 8 bytes, stored as two
+  // little-endian 32-bit halves (low half first).
+  const char* magic_ptr = input->data() + kEncodedLength - 8;
+  const uint32_t magic_lo = DecodeFixed32(magic_ptr);
+  const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
+  const uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
+                          (static_cast<uint64_t>(magic_lo)));
+  if (magic != kTableMagicNumber) {
+    return Status::InvalidArgument("not an sstable (bad magic number)");
+  }
+
+  Status result = metaindex_handle_.DecodeFrom(input);
+  if (result.ok()) {
+    result = index_handle_.DecodeFrom(input);
+  }
+  if (result.ok()) {
+    // We skip over any leftover data (just padding for now) in "input"
+    const char* end = magic_ptr + 8;
+    *input = Slice(end, input->data() + input->size() - end);
+  }
+  return result;
+}
+
+// Reads the block identified by "handle" from "file" into *result,
+// verifying the trailer checksum when options.verify_checksums is set and
+// (optionally) uncompressing the payload.  On failure *result stays empty.
+Status ReadBlockContents(RandomAccessFile* file,
+                         const ReadOptions& options,
+                         const BlockHandle& handle,
+                         BlockContents* result,
+                         Env* env,
+                         bool do_uncompress) {
+  result->data = Slice();
+  result->cachable = false;
+  result->heap_allocated = false;
+
+  // Read the block contents as well as the type/crc footer.
+  // See table_builder.cc for the code that built this structure.
+  size_t n = static_cast<size_t>(handle.size());
+  char* buf = new char[n + kBlockTrailerSize];
+  Slice contents;
+
+  StopWatchNano timer(env);
+  StartPerfTimer(&timer);
+  Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
+  BumpPerfCount(&perf_context.block_read_count);
+  BumpPerfCount(&perf_context.block_read_byte, n + kBlockTrailerSize);
+  BumpPerfTime(&perf_context.block_read_time, &timer);
+
+  if (!s.ok()) {
+    delete[] buf;
+    return s;
+  }
+  if (contents.size() != n + kBlockTrailerSize) {
+    delete[] buf;
+    return Status::Corruption("truncated block read");
+  }
+
+  // Check the crc of the type and the block contents
+  const char* data = contents.data();    // Pointer to where Read put the data
+  if (options.verify_checksums) {
+    // The stored crc covers the n payload bytes plus the 1-byte type.
+    const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1));
+    const uint32_t actual = crc32c::Value(data, n + 1);
+    if (actual != crc) {
+      delete[] buf;
+      s = Status::Corruption("block checksum mismatch");
+      return s;
+    }
+    BumpPerfTime(&perf_context.block_checksum_time, &timer);
+  }
+
+  // If the caller has requested that the block not be uncompressed
+  if (!do_uncompress || data[n] == kNoCompression) {
+    if (data != buf) {
+      // File implementation gave us pointer to some other data.
+      // Use it directly under the assumption that it will be live
+      // while the file is open.
+      delete[] buf;
+      result->data = Slice(data, n);
+      result->heap_allocated = false;
+      result->cachable = false;  // Do not double-cache
+    } else {
+      result->data = Slice(buf, n);
+      result->heap_allocated = true;
+      result->cachable = true;
+    }
+    result->compression_type = (rocksdb::CompressionType)data[n];
+    s =  Status::OK();
+  } else {
+    s = UncompressBlockContents(data, n, result);
+    delete[] buf;
+  }
+  // NOTE(review): this counter is bumped even on the no-decompression
+  // path above — confirm that is intended.
+  BumpPerfTime(&perf_context.block_decompress_time, &timer);
+  return s;
+}
+
+//
+// The 'data' points to the raw block contents that was read in from file.
+// This method allocates a new heap buffer and the raw block
+// contents are uncompresed into this buffer. This
+// buffer is returned via 'result' and it is upto the caller to
+// free this buffer.
+Status UncompressBlockContents(const char* data, size_t n,
+                               BlockContents* result) {
+  char* ubuf = nullptr;
+  int decompress_size = 0;
+  assert(data[n] != kNoCompression);
+  switch (data[n]) {
+    case kSnappyCompression: {
+      size_t ulength = 0;
+      static char snappy_corrupt_msg[] =
+        "Snappy not supported or corrupted Snappy compressed block contents";
+      if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) {
+        return Status::Corruption(snappy_corrupt_msg);
+      }
+      ubuf = new char[ulength];
+      if (!port::Snappy_Uncompress(data, n, ubuf)) {
+        delete[] ubuf;
+        return Status::Corruption(snappy_corrupt_msg);
+      }
+      result->data = Slice(ubuf, ulength);
+      result->heap_allocated = true;
+      result->cachable = true;
+      break;
+    }
+    case kZlibCompression:
+      ubuf = port::Zlib_Uncompress(data, n, &decompress_size);
+      static char zlib_corrupt_msg[] =
+        "Zlib not supported or corrupted Zlib compressed block contents";
+      if (!ubuf) {
+        return Status::Corruption(zlib_corrupt_msg);
+      }
+      result->data = Slice(ubuf, decompress_size);
+      result->heap_allocated = true;
+      result->cachable = true;
+      break;
+    case kBZip2Compression:
+      ubuf = port::BZip2_Uncompress(data, n, &decompress_size);
+      static char bzip2_corrupt_msg[] =
+        "Bzip2 not supported or corrupted Bzip2 compressed block contents";
+      if (!ubuf) {
+        return Status::Corruption(bzip2_corrupt_msg);
+      }
+      result->data = Slice(ubuf, decompress_size);
+      result->heap_allocated = true;
+      result->cachable = true;
+      break;
+    default:
+      return Status::Corruption("bad block type");
+  }
+  result->compression_type = kNoCompression; // not compressed any more
+  return Status::OK();
+}
+
+}  // namespace rocksdb
diff --git a/table/format.h b/table/format.h
new file mode 100644 (file)
index 0000000..2f1c1e8
--- /dev/null
@@ -0,0 +1,122 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <string>
+#include <stdint.h>
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+
+class Block;
+class RandomAccessFile;
+struct ReadOptions;
+
+// BlockHandle is a pointer to the extent of a file that stores a data
+// block or a meta block.  Both fields default to ~0 sentinels meaning
+// "unset"; EncodeTo asserts they were set first.
+class BlockHandle {
+ public:
+  BlockHandle();
+
+  // The offset of the block in the file.
+  uint64_t offset() const { return offset_; }
+  void set_offset(uint64_t offset) { offset_ = offset; }
+
+  // The size of the stored block
+  uint64_t size() const { return size_; }
+  void set_size(uint64_t size) { size_ = size; }
+
+  // Serialize to / parse from a varint64 pair (see format.cc).
+  void EncodeTo(std::string* dst) const;
+  Status DecodeFrom(Slice* input);
+
+  // Maximum encoding length of a BlockHandle (two max-length varint64s)
+  enum { kMaxEncodedLength = 10 + 10 };
+
+ private:
+  uint64_t offset_;
+  uint64_t size_;
+};
+
+// Footer encapsulates the fixed information stored at the tail
+// end of every table file.
+class Footer {
+ public:
+  Footer() { }
+
+  // The block handle for the metaindex block of the table
+  const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
+  void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; }
+
+  // The block handle for the index block of the table
+  const BlockHandle& index_handle() const {
+    return index_handle_;
+  }
+  void set_index_handle(const BlockHandle& h) {
+    index_handle_ = h;
+  }
+
+  void EncodeTo(std::string* dst) const;
+  Status DecodeFrom(Slice* input);
+
+  // Encoded length of a Footer.  Note that the serialization of a
+  // Footer will always occupy exactly this many bytes.  It consists
+  // of two block handles and a magic number.
+  enum {
+    kEncodedLength = 2*BlockHandle::kMaxEncodedLength + 8
+  };
+
+ private:
+  BlockHandle metaindex_handle_;
+  BlockHandle index_handle_;
+};
+
+// kTableMagicNumber was picked by running
+//    echo http://code.google.com/p/leveldb/ | sha1sum
+// and taking the leading 64 bits.
+static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull;
+
+// 1-byte type + 32-bit crc
+static const size_t kBlockTrailerSize = 5;
+
+// Result of reading one block from a file (see ReadBlockContents).
+struct BlockContents {
+  Slice data;           // Actual contents of data
+  bool cachable;        // True iff data can be cached
+  bool heap_allocated;  // True iff caller should delete[] data.data()
+  CompressionType compression_type;  // Compression applied to `data`
+};
+
+// Read the block identified by "handle" from "file".  On failure
+// return non-OK.  On success fill *result and return OK.
+extern Status ReadBlockContents(RandomAccessFile* file,
+                                const ReadOptions& options,
+                                const BlockHandle& handle,
+                                BlockContents* result,
+                                Env* env,
+                                bool do_uncompress);
+
+// The 'data' points to the raw block contents read in from file.
+// This method allocates a new heap buffer and the raw block
+// contents are uncompressed into this buffer. This buffer is
+// returned via 'result' and it is up to the caller to
+// free this buffer.
+extern Status UncompressBlockContents(const char* data,
+                                      size_t n,
+                                      BlockContents* result);
+
+// Implementation details follow.  Clients should ignore.
+
+// Both fields start as ~0 sentinels so EncodeTo can assert that they
+// were explicitly set before serialization.
+inline BlockHandle::BlockHandle()
+    : offset_(~static_cast<uint64_t>(0)),
+      size_(~static_cast<uint64_t>(0)) {
+}
+
+}  // namespace rocksdb
diff --git a/table/iter_heap.h b/table/iter_heap.h
new file mode 100644 (file)
index 0000000..af8834e
--- /dev/null
@@ -0,0 +1,64 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#pragma once
+#include <queue>
+
+#include "rocksdb/comparator.h"
+#include "table/iterator_wrapper.h"
+
+namespace rocksdb {
+
+// Heap comparator that keeps the IteratorWrapper with the LARGEST key on
+// top of a std::priority_queue (used for reverse iteration).
+class MaxIteratorComparator {
+ public:
+  explicit MaxIteratorComparator(const Comparator* comparator) :
+    comparator_(comparator) {}
+
+  bool operator()(IteratorWrapper* a, IteratorWrapper* b) {
+    // Must be a strict "less than": the previous "<= 0" form returned
+    // true for equal keys in both argument orders, violating the strict
+    // weak ordering std::priority_queue requires.
+    return comparator_->Compare(a->key(), b->key()) < 0;
+  }
+ private:
+  const Comparator* comparator_;
+};
+
+// Heap comparator that keeps the IteratorWrapper with the SMALLEST key on
+// top of a std::priority_queue (used for forward iteration).
+class MinIteratorComparator {
+ public:
+  explicit MinIteratorComparator(const Comparator* comparator) :
+    comparator_(comparator) {}
+
+  bool operator()(IteratorWrapper* a, IteratorWrapper* b) {
+    // "Greater than" predicate turns the max-heap priority_queue into a
+    // min-heap.
+    return comparator_->Compare(a->key(), b->key()) > 0;
+  }
+ private:
+  const Comparator* comparator_;
+};
+
+typedef std::priority_queue<
+          IteratorWrapper*,
+          std::vector<IteratorWrapper*>,
+          MaxIteratorComparator> MaxIterHeap;
+
+typedef std::priority_queue<
+          IteratorWrapper*,
+          std::vector<IteratorWrapper*>,
+          MinIteratorComparator> MinIterHeap;
+
+// Returns a new MaxHeap of IteratorWrapper's using the provided Comparator.
+// "inline" is required: this is a function definition in a header, so a
+// plain external function would violate the one-definition rule once the
+// header is included from more than one translation unit.
+inline MaxIterHeap NewMaxIterHeap(const Comparator* comparator) {
+  return MaxIterHeap(MaxIteratorComparator(comparator));
+}
+
+// Returns a new MinHeap of IteratorWrapper's using the provided Comparator.
+// "inline" is required for the same ODR reason as NewMaxIterHeap: this is
+// a definition in a header included from multiple translation units.
+inline MinIterHeap NewMinIterHeap(const Comparator* comparator) {
+  return MinIterHeap(MinIteratorComparator(comparator));
+}
+
+}  // namespace rocksdb
diff --git a/table/iterator.cc b/table/iterator.cc
new file mode 100644 (file)
index 0000000..a3d4f63
--- /dev/null
@@ -0,0 +1,72 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/iterator.h"
+
+namespace rocksdb {
+
+// Initializes the inline head of the cleanup list to "empty"
+// (function == nullptr means no cleanups are registered yet).
+Iterator::Iterator() {
+  cleanup_.function = nullptr;
+  cleanup_.next = nullptr;
+}
+
+// Runs all registered cleanup functions.  The head node is embedded in
+// the iterator itself (not deleted); only the chained nodes are freed.
+Iterator::~Iterator() {
+  if (cleanup_.function != nullptr) {
+    (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2);
+    for (Cleanup* c = cleanup_.next; c != nullptr; ) {
+      (*c->function)(c->arg1, c->arg2);
+      Cleanup* next = c->next;
+      delete c;
+      c = next;
+    }
+  }
+}
+
+// Registers (func, arg1, arg2) to be invoked when the iterator is
+// destroyed.  The first registration fills the embedded head node;
+// subsequent ones are heap-allocated and inserted right after the head.
+void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) {
+  assert(func != nullptr);
+  Cleanup* c;
+  if (cleanup_.function == nullptr) {
+    c = &cleanup_;
+  } else {
+    c = new Cleanup;
+    c->next = cleanup_.next;
+    cleanup_.next = c;
+  }
+  c->function = func;
+  c->arg1 = arg1;
+  c->arg2 = arg2;
+}
+
+namespace {
+class EmptyIterator : public Iterator {
+ public:
+  explicit EmptyIterator(const Status& s) : status_(s) { }
+  virtual bool Valid() const { return false; }
+  virtual void Seek(const Slice& target) { }
+  virtual void SeekToFirst() { }
+  virtual void SeekToLast() { }
+  virtual void Next() { assert(false); }
+  virtual void Prev() { assert(false); }
+  Slice key() const { assert(false); return Slice(); }
+  Slice value() const { assert(false); return Slice(); }
+  virtual Status status() const { return status_; }
+ private:
+  Status status_;
+};
+}  // namespace
+
+// Returns an empty iterator with an OK status.  Caller owns the result.
+Iterator* NewEmptyIterator() {
+  return new EmptyIterator(Status::OK());
+}
+
+// Returns an empty iterator whose status() yields the given error.
+// Caller owns the result.
+Iterator* NewErrorIterator(const Status& status) {
+  return new EmptyIterator(status);
+}
+
+}  // namespace rocksdb
diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h
new file mode 100644 (file)
index 0000000..cb8520b
--- /dev/null
@@ -0,0 +1,64 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+namespace rocksdb {
+
+// An internal wrapper class with an interface similar to Iterator that
+// caches the valid() and key() results for an underlying iterator.
+// This can help avoid virtual function calls and also gives better
+// cache locality.
+//
+// NOTE(review): owns iter_ (deleted in the destructor) but relies on the
+// implicit copy operations; copying a wrapper after Set() would
+// double-delete — confirm instances are only default-constructed inside
+// containers and never copied once set.
+class IteratorWrapper {
+ public:
+  IteratorWrapper(): iter_(nullptr), valid_(false) { }
+  explicit IteratorWrapper(Iterator* iter): iter_(nullptr) {
+    Set(iter);
+  }
+  ~IteratorWrapper() { delete iter_; }
+  Iterator* iter() const { return iter_; }
+
+  // Takes ownership of "iter" and will delete it when destroyed, or
+  // when Set() is invoked again.
+  void Set(Iterator* iter) {
+    delete iter_;
+    iter_ = iter;
+    if (iter_ == nullptr) {
+      valid_ = false;
+    } else {
+      Update();
+    }
+  }
+
+
+  // Iterator interface methods
+  bool Valid() const        { return valid_; }
+  Slice key() const         { assert(Valid()); return key_; }
+  Slice value() const       { assert(Valid()); return iter_->value(); }
+  // Methods below require iter() != nullptr
+  Status status() const     { assert(iter_); return iter_->status(); }
+  void Next()               { assert(iter_); iter_->Next();        Update(); }
+  void Prev()               { assert(iter_); iter_->Prev();        Update(); }
+  void Seek(const Slice& k) { assert(iter_); iter_->Seek(k);       Update(); }
+  void SeekToFirst()        { assert(iter_); iter_->SeekToFirst(); Update(); }
+  void SeekToLast()         { assert(iter_); iter_->SeekToLast();  Update(); }
+
+ private:
+  // Refreshes the cached valid_/key_ after the underlying iterator moved.
+  void Update() {
+    valid_ = iter_->Valid();
+    if (valid_) {
+      key_ = iter_->key();
+    }
+  }
+
+  Iterator* iter_;   // owned
+  bool valid_;       // cached iter_->Valid()
+  Slice key_;        // cached iter_->key(), only meaningful when valid_
+};
+
+}  // namespace rocksdb
diff --git a/table/merger.cc b/table/merger.cc
new file mode 100644 (file)
index 0000000..f5ce744
--- /dev/null
@@ -0,0 +1,228 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/merger.h"
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "table/iter_heap.h"
+#include "table/iterator_wrapper.h"
+
+#include <vector>
+
+namespace rocksdb {
+
+namespace {
+
+// Iterator that yields the union of "n" child iterators in comparator
+// order.  current_ always points at the child whose entry is next in the
+// active direction; current_ is deliberately kept OUT of the heaps and is
+// only re-inserted after it has been advanced (see Next/Prev).
+class MergingIterator : public Iterator {
+ public:
+  // Takes ownership of children[0..n-1] (IteratorWrapper::Set deletes them
+  // when this iterator is destroyed).  Primes the min-heap with every
+  // initially-valid child; current_ stays nullptr until a Seek* call.
+  MergingIterator(const Comparator* comparator, Iterator** children, int n)
+      : comparator_(comparator),
+        children_(n),
+        current_(nullptr),
+        direction_(kForward),
+        maxHeap_(NewMaxIterHeap(comparator_)),
+        minHeap_ (NewMinIterHeap(comparator_)) {
+    for (int i = 0; i < n; i++) {
+      children_[i].Set(children[i]);
+    }
+    for (auto& child : children_) {
+      if (child.Valid()) {
+        minHeap_.push(&child);
+      }
+    }
+  }
+
+  virtual ~MergingIterator() { }
+
+  virtual bool Valid() const {
+    return (current_ != nullptr);
+  }
+
+  virtual void SeekToFirst() {
+    // Rebuild the min-heap from scratch with every child at its start.
+    ClearHeaps();
+    for (auto& child : children_) {
+      child.SeekToFirst();
+      if (child.Valid()) {
+        minHeap_.push(&child);
+      }
+    }
+    FindSmallest();
+    direction_ = kForward;
+  }
+
+  virtual void SeekToLast() {
+    // Rebuild the max-heap with every child at its end (reverse mode).
+    ClearHeaps();
+    for (auto& child : children_) {
+      child.SeekToLast();
+      if (child.Valid()) {
+        maxHeap_.push(&child);
+      }
+    }
+    FindLargest();
+    direction_ = kReverse;
+  }
+
+  virtual void Seek(const Slice& target) {
+    ClearHeaps();
+    for (auto& child : children_) {
+      child.Seek(target);
+      if (child.Valid()) {
+        minHeap_.push(&child);
+      }
+    }
+    FindSmallest();
+    direction_ = kForward;
+  }
+
+  virtual void Next() {
+    assert(Valid());
+
+    // Ensure that all children are positioned after key().
+    // If we are moving in the forward direction, it is already
+    // true for all of the non-current_ children since current_ is
+    // the smallest child and key() == current_->key().  Otherwise,
+    // we explicitly position the non-current_ children.
+    if (direction_ != kForward) {
+      ClearHeaps();
+      for (auto& child : children_) {
+        if (&child != current_) {
+          child.Seek(key());
+          if (child.Valid() &&
+              comparator_->Compare(key(), child.key()) == 0) {
+            // Skip an exact duplicate of the current key so the child
+            // lands strictly after it.
+            child.Next();
+          }
+          if (child.Valid()) {
+            minHeap_.push(&child);
+          }
+        }
+      }
+      direction_ = kForward;
+    }
+
+    // as the current points to the current record. move the iterator forward.
+    // and if it is valid add it to the heap.
+    current_->Next();
+    if (current_->Valid()){
+      minHeap_.push(current_);
+    }
+    FindSmallest();
+  }
+
+  virtual void Prev() {
+    assert(Valid());
+    // Ensure that all children are positioned before key().
+    // If we are moving in the reverse direction, it is already
+    // true for all of the non-current_ children since current_ is
+    // the largest child and key() == current_->key().  Otherwise,
+    // we explicitly position the non-current_ children.
+    if (direction_ != kReverse) {
+      ClearHeaps();
+      for (auto& child : children_) {
+        if (&child != current_) {
+          child.Seek(key());
+          if (child.Valid()) {
+            // Child is at first entry >= key().  Step back one to be < key()
+            child.Prev();
+          } else {
+            // Child has no entries >= key().  Position at last entry.
+            child.SeekToLast();
+          }
+          if (child.Valid()) {
+            maxHeap_.push(&child);
+          }
+        }
+      }
+      direction_ = kReverse;
+    }
+
+    current_->Prev();
+    if (current_->Valid()) {
+      maxHeap_.push(current_);
+    }
+    FindLargest();
+  }
+
+  virtual Slice key() const {
+    assert(Valid());
+    return current_->key();
+  }
+
+  virtual Slice value() const {
+    assert(Valid());
+    return current_->value();
+  }
+
+  // Returns the first non-OK child status, or OK if all children are OK.
+  virtual Status status() const {
+    Status status;
+    for (auto& child : children_) {
+      status = child.status();
+      if (!status.ok()) {
+        break;
+      }
+    }
+    return status;
+  }
+
+ private:
+  void FindSmallest();
+  void FindLargest();
+  void ClearHeaps();
+
+  const Comparator* comparator_;
+  std::vector<IteratorWrapper> children_;  // owns the child iterators
+  IteratorWrapper* current_;  // next entry's child; not in either heap
+  // Which direction is the iterator moving?
+  enum Direction {
+    kForward,
+    kReverse
+  };
+  Direction direction_;
+  MaxIterHeap maxHeap_;
+  MinIterHeap minHeap_;
+};
+
+// Pops the child with the smallest key off minHeap_ into current_
+// (current_ is intentionally kept out of the heap until advanced), or
+// sets current_ to nullptr when every child is exhausted.
+void MergingIterator::FindSmallest() {
+  if (minHeap_.empty()) {
+    current_ = nullptr;
+    return;
+  }
+  current_ = minHeap_.top();
+  assert(current_->Valid());
+  minHeap_.pop();
+}
+
+// Pops the child with the largest key off maxHeap_ into current_
+// (mirror of FindSmallest for reverse iteration), or sets current_ to
+// nullptr when every child is exhausted.
+void MergingIterator::FindLargest() {
+  if (maxHeap_.empty()) {
+    current_ = nullptr;
+    return;
+  }
+  current_ = maxHeap_.top();
+  assert(current_->Valid());
+  maxHeap_.pop();
+}
+
+// Empties both heaps by replacing them with freshly constructed ones
+// (std::priority_queue has no clear()).
+void MergingIterator::ClearHeaps() {
+  maxHeap_ = NewMaxIterHeap(comparator_);
+  minHeap_ = NewMinIterHeap(comparator_);
+}
+}  // namespace
+
+// Builds a merging iterator over list[0..n-1]; takes ownership of the
+// children.  The trivial n == 0 and n == 1 cases avoid the overhead of a
+// full MergingIterator.
+Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) {
+  assert(n >= 0);
+  switch (n) {
+    case 0:
+      return NewEmptyIterator();
+    case 1:
+      return list[0];
+    default:
+      return new MergingIterator(cmp, list, n);
+  }
+}
+
+}  // namespace rocksdb
diff --git a/table/merger.h b/table/merger.h
new file mode 100644 (file)
index 0000000..dbc1f69
--- /dev/null
@@ -0,0 +1,28 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+namespace rocksdb {
+
+class Comparator;
+class Iterator;
+
+// Return an iterator that provided the union of the data in
+// children[0,n-1].  Takes ownership of the child iterators and
+// will delete them when the result iterator is deleted.
+//
+// The result does no duplicate suppression.  I.e., if a particular
+// key is present in K child iterators, it will be yielded K times.
+//
+// REQUIRES: n >= 0
+// Implemented in table/merger.cc. Note: for n == 1 the sole child is
+// returned directly as the result iterator.
+extern Iterator* NewMergingIterator(
+    const Comparator* comparator, Iterator** children, int n);
+
+}  // namespace rocksdb
diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc
new file mode 100644 (file)
index 0000000..e7b6b0b
--- /dev/null
@@ -0,0 +1,244 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <gflags/gflags.h>
+
+#include "rocksdb/db.h"
+#include "rocksdb/table.h"
+#include "rocksdb/slice_transform.h"
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "port/atomic_pointer.h"
+#include "table/block_based_table_factory.h"
+#include "util/histogram.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+// Make a key that i determines the first 4 characters and j determines the
+// last 4 characters.
+// Build a 16-byte user key "iiii__key___jjjj". For direct table access
+// (through_db == false) the key is wrapped into an InternalKey, since
+// TableReader operates on internal keys rather than user keys.
+static std::string MakeKey(int i, int j, bool through_db) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "%04d__key___%04d", i, j);
+  if (through_db) {
+    return std::string(buf);
+  }
+  // If we directly query table, which operates on internal keys
+  // instead of user keys, we need to add 8 bytes of internal
+  // information (row type etc) to user key to make an internal
+  // key.
+  InternalKey key(std::string(buf), 0, ValueType::kTypeValue);
+  return key.Encode().ToString();
+}
+
+// No-op "saver" callback passed to TableReader::Get: discards the found
+// value; returning false tells Get not to keep searching for more entries.
+static bool DummySaveValue(void* arg, const Slice& ikey, const Slice& v,
+                           bool didIO) {
+  return false;
+}
+
+// A very simple benchmark that.
+// Create a table with roughly numKey1 * numKey2 keys,
+// where there are numKey1 prefixes of the key, each has numKey2 number of
+// distinguished key, differing in the suffix part.
+// If if_query_empty_keys = false, query the existing keys numKey1 * numKey2
+// times randomly.
+// If if_query_empty_keys = true, query numKey1 * numKey2 random empty keys.
+// Print out the total time.
+// If through_db=true, a full DB will be created and queries will be against
+// it. Otherwise, operations will be directly through table level.
+//
+// If for_terator=true, instead of just query one key each time, it queries
+// a range sharing the same prefix.
+// Benchmark driver. Builds a data set of roughly num_keys1 * num_keys2
+// entries either in a raw SST file (through_db == false) or in a full DB
+// (through_db == true), then runs num_iter passes of either point Gets
+// (for_iterator == false) or prefix range scans (for_iterator == true),
+// and prints a microsecond latency histogram to stderr.
+void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
+                          ReadOptions& read_options, int num_keys1,
+                          int num_keys2, int num_iter, int prefix_len,
+                          bool if_query_empty_keys, bool for_iterator,
+                          bool through_db) {
+  Slice prefix = Slice();
+
+  std::string file_name = test::TmpDir()
+      + "/rocksdb_table_reader_benchmark";
+  std::string dbname = test::TmpDir() + "/rocksdb_table_reader_bench_db";
+  ReadOptions ro;
+  WriteOptions wo;
+  unique_ptr<WritableFile> file;
+  Env* env = Env::Default();
+  TableBuilder* tb = nullptr;
+  DB* db = nullptr;
+  Status s;
+  if (!through_db) {
+    // Table-level mode: write the SST file directly with a TableBuilder.
+    // NOTE(review): tb is never deleted on this path — looks like a leak;
+    // confirm TableBuilder ownership semantics.
+    env->NewWritableFile(file_name, &file, env_options);
+    tb = opts.table_factory->GetTableBuilder(opts, file.get(),
+                                             CompressionType::kNoCompression);
+  } else {
+    s = DB::Open(opts, dbname, &db);
+    ASSERT_OK(s);
+    ASSERT_TRUE(db != nullptr);
+  }
+  // Populate slightly more than 1M keys
+  // Only even first-index values (i * 2) are written, so odd values used
+  // below for empty-key queries are guaranteed absent.
+  for (int i = 0; i < num_keys1; i++) {
+    for (int j = 0; j < num_keys2; j++) {
+      std::string key = MakeKey(i * 2, j, through_db);
+      if (!through_db) {
+        tb->Add(key, key);
+      } else {
+        db->Put(wo, key, key);
+      }
+    }
+  }
+  if (!through_db) {
+    tb->Finish();
+    file->Close();
+  } else {
+    db->Flush(FlushOptions());
+  }
+
+  // Reopen the freshly written file through the table factory's reader
+  // when benchmarking at the table level.
+  unique_ptr<TableReader> table_reader;
+  unique_ptr<RandomAccessFile> raf;
+  if (!through_db) {
+    Status s = env->NewRandomAccessFile(file_name, &raf, env_options);
+    uint64_t file_size;
+    env->GetFileSize(file_name, &file_size);
+    s = opts.table_factory->GetTableReader(opts, env_options, std::move(raf),
+                                           file_size, &table_reader);
+  }
+
+  Random rnd(301);
+  std::string result;
+  HistogramImpl hist;
+
+  void* arg = nullptr;
+  for (int it = 0; it < num_iter; it++) {
+    for (int i = 0; i < num_keys1; i++) {
+      for (int j = 0; j < num_keys2; j++) {
+        // Pick a random existing key (even r1); when querying empty keys,
+        // shift to an odd r1 / out-of-range r2 that was never inserted.
+        int r1 = rnd.Uniform(num_keys1) * 2;
+        int r2 = rnd.Uniform(num_keys2);
+        if (if_query_empty_keys) {
+          r1++;
+          r2 = num_keys2 * 2 - r2;
+        }
+
+        if (!for_iterator) {
+          // Query one existing key;
+          // The MemoryBarrier() calls bracket the timed operation so the
+          // timing reads are not reordered around it.
+          std::string key = MakeKey(r1, r2, through_db);
+          uint64_t start_micros = env->NowMicros();
+          port::MemoryBarrier();
+          if (!through_db) {
+            s = table_reader->Get(ro, key, arg, DummySaveValue, nullptr);
+          } else {
+            s = db->Get(ro, key, &result);
+          }
+          port::MemoryBarrier();
+          hist.Add(env->NowMicros() - start_micros);
+        } else {
+          // Iterator mode: scan a random-length range sharing a prefix,
+          // timing only the seek/next work (key verification excluded).
+          int r2_len;
+          if (if_query_empty_keys) {
+            r2_len = 0;
+          } else {
+            r2_len = rnd.Uniform(num_keys2) + 1;
+            if (r2_len + r2 > num_keys2) {
+              r2_len = num_keys2 - r2;
+            }
+          }
+          std::string start_key = MakeKey(r1, r2, through_db);
+          std::string end_key = MakeKey(r1, r2 + r2_len, through_db);
+          if (prefix_len < 16) {
+            prefix = Slice(start_key.data(), prefix_len);
+            read_options.prefix = &prefix;
+          }
+          uint64_t total_time = 0;
+          uint64_t start_micros = env->NowMicros();
+          port::MemoryBarrier();
+          Iterator* iter;
+          if (!through_db) {
+            iter = table_reader->NewIterator(read_options);
+          } else {
+            iter = db->NewIterator(read_options);
+          }
+          int count = 0;
+          for(iter->Seek(start_key); iter->Valid(); iter->Next()) {
+            if (if_query_empty_keys) {
+              break;
+            }
+            // verify key;
+            port::MemoryBarrier();
+            total_time += env->NowMicros() - start_micros;
+            assert(Slice(MakeKey(r1, r2 + count, through_db)) == iter->key());
+            start_micros = env->NowMicros();
+            if (++count >= r2_len) {
+              break;
+            }
+          }
+          if (count != r2_len) {
+            fprintf(
+                stderr, "Iterator cannot iterate expected number of entries. "
+                "Expected %d but got %d\n", r2_len, count);
+            assert(false);
+          }
+          delete iter;
+          port::MemoryBarrier();
+          total_time += env->NowMicros() - start_micros;
+          hist.Add(total_time);
+        }
+      }
+    }
+  }
+
+  // Report the latency histogram and clean up the file / DB we created.
+  fprintf(
+      stderr,
+      "==================================================="
+      "====================================================\n"
+      "InMemoryTableSimpleBenchmark: %20s   num_key1:  %5d   "
+      "num_key2: %5d  %10s\n"
+      "==================================================="
+      "===================================================="
+      "\nHistogram (unit: microseconds): \n%s",
+      opts.table_factory->Name(), num_keys1, num_keys2,
+      for_iterator? "iterator" : (if_query_empty_keys ? "empty" : "non_empty"),
+      hist.ToString().c_str());
+  if (!through_db) {
+    env->DeleteFile(file_name);
+  } else {
+    delete db;
+    db = nullptr;
+    DestroyDB(dbname, opts);
+  }
+}
+} // namespace rocksdb
+
+// Command-line flags controlling the benchmark workload.
+DEFINE_bool(query_empty, false, "query non-existing keys instead of existing "
+            "ones.");
+DEFINE_int32(num_keys1, 4096, "number of distinguish prefix of keys");
+DEFINE_int32(num_keys2, 512, "number of distinguish keys for each prefix");
+// Fixed copy-pasted help text: this flag is the pass count over the data
+// set (see TableReaderBenchmark's num_iter), not the empty-key switch.
+DEFINE_int32(iter, 3, "number of iterations to run the benchmark");
+DEFINE_int32(prefix_len, 16, "Prefix length used for iterators and indexes");
+DEFINE_bool(iterator, false, "For test iterator");
+DEFINE_bool(through_db, false, "If enable, a DB instance will be created and "
+            "the query will be against DB. Otherwise, will be directly against "
+            "a table reader.");
+
+// Entry point: parse gflags, assemble Options (block-based table factory,
+// optional fixed-prefix extractor) and run the benchmark once.
+int main(int argc, char** argv) {
+  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                          " [OPTIONS]...");
+  google::ParseCommandLineFlags(&argc, &argv, true);
+
+  rocksdb::TableFactory* tf = new rocksdb::BlockBasedTableFactory();
+  rocksdb::Options options;
+  if (FLAGS_prefix_len < 16) {
+    // Keys are 16 bytes; only a strictly shorter prefix is meaningful.
+    options.prefix_extractor = rocksdb::NewFixedPrefixTransform(
+        FLAGS_prefix_len);
+  }
+  options.SetUpDefaultFlushBlockPolicyFactory();
+  rocksdb::ReadOptions ro;
+  rocksdb::EnvOptions env_options;
+  options.create_if_missing = true;
+  options.table_factory =
+      std::shared_ptr<rocksdb::TableFactory>(tf);
+  TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1,
+                       FLAGS_num_keys2, FLAGS_iter, FLAGS_prefix_len,
+                       FLAGS_query_empty, FLAGS_iterator, FLAGS_through_db);
+  delete tf;
+  return 0;
+}
diff --git a/table/table_test.cc b/table/table_test.cc
new file mode 100644 (file)
index 0000000..d404e0b
--- /dev/null
@@ -0,0 +1,1305 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <map>
+#include <string>
+#include <memory>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/db_statistics.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/memtablerep.h"
+#include "table/block_based_table_builder.h"
+#include "table/block_based_table_factory.h"
+#include "table/block_based_table_reader.h"
+#include "table/block_builder.h"
+#include "table/block.h"
+#include "table/format.h"
+#include "util/random.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+namespace {
+// Return reverse of "key".
+// Used to test non-lexicographic comparators.
+// Returns the bytes of "key" in reverse order.
+static std::string Reverse(const Slice& key) {
+  std::string str(key.ToString());
+  std::string rev("");
+  for (std::string::reverse_iterator rit = str.rbegin();
+       rit != str.rend(); ++rit) {
+    rev.push_back(*rit);
+  }
+  return rev;
+}
+
+// Comparator that orders keys by the bytewise order of their reversals.
+// All operations reverse their inputs, delegate to BytewiseComparator(),
+// and reverse the results back.
+class ReverseKeyComparator : public Comparator {
+ public:
+  virtual const char* Name() const {
+    return "rocksdb.ReverseBytewiseComparator";
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    return BytewiseComparator()->Compare(Reverse(a), Reverse(b));
+  }
+
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+    std::string s = Reverse(*start);
+    std::string l = Reverse(limit);
+    BytewiseComparator()->FindShortestSeparator(&s, l);
+    *start = Reverse(s);
+  }
+
+  virtual void FindShortSuccessor(std::string* key) const {
+    std::string s = Reverse(*key);
+    BytewiseComparator()->FindShortSuccessor(&s);
+    *key = Reverse(s);
+  }
+};
+}  // namespace
+static ReverseKeyComparator reverse_key_comparator;
+
+// Mutate *key into the smallest key strictly greater than it under "cmp".
+// For the bytewise comparator this appends '\0'; for the reverse comparator
+// the same append is done in reversed space.
+static void Increment(const Comparator* cmp, std::string* key) {
+  if (cmp == BytewiseComparator()) {
+    key->push_back('\0');
+  } else {
+    assert(cmp == &reverse_key_comparator);
+    std::string rev = Reverse(*key);
+    rev.push_back('\0');
+    *key = Reverse(rev);
+  }
+}
+
+// An STL comparator that uses a Comparator
+namespace anon {
+// STL-compatible strict-weak-ordering functor backed by a rocksdb
+// Comparator; defaults to BytewiseComparator(). Used as the KVMap ordering.
+struct STLLessThan {
+  const Comparator* cmp;
+
+  STLLessThan() : cmp(BytewiseComparator()) { }
+  explicit STLLessThan(const Comparator* c) : cmp(c) { }
+  bool operator()(const std::string& a, const std::string& b) const {
+    return cmp->Compare(Slice(a), Slice(b)) < 0;
+  }
+};
+}  // namespace
+
+// WritableFile that accumulates everything appended into an in-memory
+// string; Close/Flush/Sync are no-ops. Lets tests build a table "file"
+// without touching the filesystem.
+class StringSink: public WritableFile {
+ public:
+  ~StringSink() { }
+
+  // Everything written so far.
+  const std::string& contents() const { return contents_; }
+
+  virtual Status Close() { return Status::OK(); }
+  virtual Status Flush() { return Status::OK(); }
+  virtual Status Sync() { return Status::OK(); }
+
+  virtual Status Append(const Slice& data) {
+    contents_.append(data.data(), data.size());
+    return Status::OK();
+  }
+
+ private:
+  std::string contents_;
+};
+
+
+// RandomAccessFile over an in-memory copy of "contents"; the counterpart
+// to StringSink for reading tables back. Also reports a synthetic unique
+// id so block-cache keying can be exercised.
+class StringSource: public RandomAccessFile {
+ public:
+  StringSource(const Slice& contents, uint64_t uniq_id)
+      : contents_(contents.data(), contents.size()), uniq_id_(uniq_id) {
+  }
+
+  virtual ~StringSource() { }
+
+  uint64_t Size() const { return contents_.size(); }
+
+  // Reads up to n bytes at offset, clamped to the end of the buffer;
+  // offsets past the end are an error.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                       char* scratch) const {
+    if (offset > contents_.size()) {
+      return Status::InvalidArgument("invalid Read offset");
+    }
+    if (offset + n > contents_.size()) {
+      n = contents_.size() - offset;
+    }
+    memcpy(scratch, &contents_[offset], n);
+    *result = Slice(scratch, n);
+    return Status::OK();
+  }
+
+  // Encodes uniq_id_ followed by 0 as two varints; returns 0 (no id) if
+  // the caller's buffer is smaller than 20 bytes.
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    if (max_size < 20) {
+      return 0;
+    }
+
+    char* rid = id;
+    rid = EncodeVarint64(rid, uniq_id_);
+    rid = EncodeVarint64(rid, 0);
+    return static_cast<size_t>(rid-id);
+  }
+
+ private:
+  std::string contents_;
+  uint64_t uniq_id_;
+};
+
+typedef std::map<std::string, std::string, anon::STLLessThan> KVMap;
+
+// Helper class for tests to unify the interface between
+// BlockBuilder/TableBuilder and Block/Table.
+// Helper class for tests to unify the interface between
+// BlockBuilder/TableBuilder and Block/Table. Subclasses implement
+// FinishImpl() to build their concrete structure from the buffered KVMap.
+class Constructor {
+ public:
+  explicit Constructor(const Comparator* cmp) : data_(anon::STLLessThan(cmp)) { }
+  virtual ~Constructor() { }
+
+  // Buffer one key/value pair; later Adds with the same key overwrite.
+  void Add(const std::string& key, const Slice& value) {
+    data_[key] = value.ToString();
+  }
+
+  // Finish constructing the data structure with all the keys that have
+  // been added so far.  Returns the keys in sorted order in "*keys"
+  // and stores the key/value pairs in "*kvmap"
+  void Finish(const Options& options,
+              std::vector<std::string>* keys,
+              KVMap* kvmap) {
+    *kvmap = data_;
+    keys->clear();
+    for (KVMap::const_iterator it = data_.begin();
+         it != data_.end();
+         ++it) {
+      keys->push_back(it->first);
+    }
+    // data_ is cleared so a Constructor can be reused across test args.
+    data_.clear();
+    Status s = FinishImpl(options, *kvmap);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+  }
+
+  // Construct the data structure from the data in "data"
+  virtual Status FinishImpl(const Options& options, const KVMap& data) = 0;
+
+  // Fresh iterator over the finished structure; caller deletes it.
+  virtual Iterator* NewIterator() const = 0;
+
+  virtual const KVMap& data() { return data_; }
+
+  virtual DB* db() const { return nullptr; }  // Overridden in DBConstructor
+
+ private:
+  KVMap data_;
+};
+
+// Constructor variant that materializes the key/value pairs into a single
+// Block via BlockBuilder and iterates over that Block.
+class BlockConstructor: public Constructor {
+ public:
+  explicit BlockConstructor(const Comparator* cmp)
+      : Constructor(cmp),
+        comparator_(cmp),
+        block_(nullptr) { }
+  ~BlockConstructor() {
+    delete block_;
+  }
+  virtual Status FinishImpl(const Options& options, const KVMap& data) {
+    delete block_;
+    block_ = nullptr;
+    BlockBuilder builder(options);
+
+    for (KVMap::const_iterator it = data.begin();
+         it != data.end();
+         ++it) {
+      builder.Add(it->first, it->second);
+    }
+    // Open the block
+    // data_ keeps the serialized bytes alive; the Block does not copy
+    // (heap_allocated is false).
+    data_ = builder.Finish().ToString();
+    BlockContents contents;
+    contents.data = data_;
+    contents.cachable = false;
+    contents.heap_allocated = false;
+    block_ = new Block(contents);
+    return Status::OK();
+  }
+  virtual Iterator* NewIterator() const {
+    return block_->NewIterator(comparator_);
+  }
+
+ private:
+  const Comparator* comparator_;
+  std::string data_;
+  Block* block_;
+
+  BlockConstructor();
+};
+
+// Constructor variant that builds a full block-based table in memory
+// (StringSink) and reopens it through the table factory's reader
+// (StringSource), so tests cover the real builder/reader round trip.
+class BlockBasedTableConstructor: public Constructor {
+ public:
+  explicit BlockBasedTableConstructor(
+      const Comparator* cmp)
+      : Constructor(cmp) {
+  }
+  ~BlockBasedTableConstructor() {
+    Reset();
+  }
+  virtual Status FinishImpl(const Options& options, const KVMap& data) {
+    Reset();
+    sink_.reset(new StringSink());
+    std::unique_ptr<FlushBlockBySizePolicyFactory> flush_policy_factory(
+        new FlushBlockBySizePolicyFactory(options.block_size,
+                                          options.block_size_deviation));
+
+    BlockBasedTableBuilder builder(
+        options,
+        sink_.get(),
+        flush_policy_factory.get(),
+        options.compression);
+
+    for (KVMap::const_iterator it = data.begin();
+         it != data.end();
+         ++it) {
+      builder.Add(it->first, it->second);
+      ASSERT_TRUE(builder.status().ok());
+    }
+    Status s = builder.Finish();
+    ASSERT_TRUE(s.ok()) << s.ToString();
+
+    ASSERT_EQ(sink_->contents().size(), builder.FileSize());
+
+    // Open the table
+    // Each finished table gets a fresh unique id so cache entries from
+    // earlier tables cannot collide with this one.
+    uniq_id_ = cur_uniq_id_++;
+    source_.reset(new StringSource(sink_->contents(), uniq_id_));
+    unique_ptr<TableFactory> table_factory;
+    return options.table_factory->GetTableReader(options, soptions,
+                                                 std::move(source_),
+                                                 sink_->contents().size(),
+                                                 &table_reader_);
+  }
+
+  virtual Iterator* NewIterator() const {
+    return table_reader_->NewIterator(ReadOptions());
+  }
+
+  uint64_t ApproximateOffsetOf(const Slice& key) const {
+    return table_reader_->ApproximateOffsetOf(key);
+  }
+
+  // Re-open the already-built table bytes with (possibly different)
+  // options, keeping the same unique id.
+  virtual Status Reopen(const Options& options) {
+    source_.reset(new StringSource(sink_->contents(), uniq_id_));
+    return options.table_factory->GetTableReader(options, soptions,
+                                                 std::move(source_),
+                                                 sink_->contents().size(),
+                                                 &table_reader_);
+  }
+
+  virtual TableReader* table_reader() {
+    return table_reader_.get();
+  }
+
+ private:
+  void Reset() {
+    uniq_id_ = 0;
+    table_reader_.reset();
+    sink_.reset();
+    source_.reset();
+  }
+
+  uint64_t uniq_id_;
+  unique_ptr<StringSink> sink_;
+  unique_ptr<StringSource> source_;
+  unique_ptr<TableReader> table_reader_;
+
+  BlockBasedTableConstructor();
+
+  // Monotonic counter backing uniq_id_ assignment across all instances.
+  static uint64_t cur_uniq_id_;
+  const EnvOptions soptions;
+};
+uint64_t BlockBasedTableConstructor::cur_uniq_id_ = 1;
+
+// A helper class that converts internal format keys into user keys
+// A helper class that converts internal format keys into user keys
+// by wrapping an internal-key iterator: Seek() encodes the target as an
+// internal key, and key() parses the internal key back to its user part.
+// Takes ownership of the wrapped iterator.
+class KeyConvertingIterator: public Iterator {
+ public:
+  explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { }
+  virtual ~KeyConvertingIterator() { delete iter_; }
+  virtual bool Valid() const { return iter_->Valid(); }
+  virtual void Seek(const Slice& target) {
+    // kMaxSequenceNumber makes the encoded key sort before all entries
+    // with the same user key, so Seek lands on the newest one.
+    ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue);
+    std::string encoded;
+    AppendInternalKey(&encoded, ikey);
+    iter_->Seek(encoded);
+  }
+  virtual void SeekToFirst() { iter_->SeekToFirst(); }
+  virtual void SeekToLast() { iter_->SeekToLast(); }
+  virtual void Next() { iter_->Next(); }
+  virtual void Prev() { iter_->Prev(); }
+
+  virtual Slice key() const {
+    assert(Valid());
+    ParsedInternalKey key;
+    if (!ParseInternalKey(iter_->key(), &key)) {
+      // Latch the corruption; status() reports it from then on.
+      status_ = Status::Corruption("malformed internal key");
+      return Slice("corrupted key");
+    }
+    return key.user_key;
+  }
+
+  virtual Slice value() const { return iter_->value(); }
+  virtual Status status() const {
+    return status_.ok() ? iter_->status() : status_;
+  }
+
+ private:
+  // mutable: key() is const but must be able to record a parse failure.
+  mutable Status status_;
+  Iterator* iter_;
+
+  // No copying allowed
+  KeyConvertingIterator(const KeyConvertingIterator&);
+  void operator=(const KeyConvertingIterator&);
+};
+
+// Constructor variant that inserts the pairs into a skip-list MemTable
+// (each with a distinct sequence number) and iterates it through a
+// KeyConvertingIterator so tests see plain user keys.
+class MemTableConstructor: public Constructor {
+ public:
+  explicit MemTableConstructor(const Comparator* cmp)
+      : Constructor(cmp),
+        internal_comparator_(cmp),
+        table_factory_(new SkipListFactory) {
+    Options options;
+    options.memtable_factory = table_factory_;
+    memtable_ = new MemTable(internal_comparator_, options);
+    memtable_->Ref();
+  }
+  ~MemTableConstructor() {
+    // Unref() hands back the pointer for deletion once the count drops.
+    delete memtable_->Unref();
+  }
+  virtual Status FinishImpl(const Options& options, const KVMap& data) {
+    // Discard the old memtable and rebuild from scratch for this data set.
+    delete memtable_->Unref();
+    Options memtable_options;
+    memtable_options.memtable_factory = table_factory_;
+    memtable_ = new MemTable(internal_comparator_, memtable_options);
+    memtable_->Ref();
+    int seq = 1;
+    for (KVMap::const_iterator it = data.begin();
+         it != data.end();
+         ++it) {
+      memtable_->Add(seq, kTypeValue, it->first, it->second);
+      seq++;
+    }
+    return Status::OK();
+  }
+  virtual Iterator* NewIterator() const {
+    return new KeyConvertingIterator(memtable_->NewIterator());
+  }
+
+ private:
+  InternalKeyComparator internal_comparator_;
+  MemTable* memtable_;
+  std::shared_ptr<SkipListFactory> table_factory_;
+};
+
+// Constructor variant backed by a real on-disk DB in the test tmp dir;
+// each FinishImpl() destroys and recreates the DB, then writes the pairs
+// via WriteBatch.
+class DBConstructor: public Constructor {
+ public:
+  explicit DBConstructor(const Comparator* cmp)
+      : Constructor(cmp),
+        comparator_(cmp) {
+    db_ = nullptr;
+    NewDB();
+  }
+  ~DBConstructor() {
+    delete db_;
+  }
+  virtual Status FinishImpl(const Options& options, const KVMap& data) {
+    delete db_;
+    db_ = nullptr;
+    NewDB();
+    for (KVMap::const_iterator it = data.begin();
+         it != data.end();
+         ++it) {
+      WriteBatch batch;
+      batch.Put(it->first, it->second);
+      ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok());
+    }
+    return Status::OK();
+  }
+  virtual Iterator* NewIterator() const {
+    return db_->NewIterator(ReadOptions());
+  }
+
+  virtual DB* db() const { return db_; }
+
+ private:
+  // Wipe any leftover DB at the fixed path and open a fresh one.
+  void NewDB() {
+    std::string name = test::TmpDir() + "/table_testdb";
+
+    Options options;
+    options.comparator = comparator_;
+    Status status = DestroyDB(name, options);
+    ASSERT_TRUE(status.ok()) << status.ToString();
+
+    options.create_if_missing = true;
+    options.error_if_exists = true;
+    options.write_buffer_size = 10000;  // Something small to force merging
+    status = DB::Open(options, name, &db_);
+    ASSERT_TRUE(status.ok()) << status.ToString();
+  }
+
+  const Comparator* comparator_;
+  DB* db_;
+};
+
+// Runtime probes for compression-library availability: each tries to
+// compress a small sample and reports whether the codec worked, so
+// GenerateArgList() only schedules supported compression types.
+static bool SnappyCompressionSupported() {
+  std::string out;
+  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+  return port::Snappy_Compress(Options().compression_opts,
+                               in.data(), in.size(),
+                               &out);
+}
+
+static bool ZlibCompressionSupported() {
+  std::string out;
+  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+  return port::Zlib_Compress(Options().compression_opts,
+                             in.data(), in.size(),
+                             &out);
+}
+
+// Only compiled in when the build defines BZIP2.
+#ifdef BZIP2
+static bool BZip2CompressionSupported() {
+  std::string out;
+  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+  return port::BZip2_Compress(Options().compression_opts,
+                              in.data(), in.size(),
+                              &out);
+}
+#endif
+
+// Which backing structure a Harness run exercises.
+enum TestType {
+  TABLE_TEST,
+  BLOCK_TEST,
+  MEMTABLE_TEST,
+  DB_TEST
+};
+
+// One parameter combination for a Harness run; the full cross-product of
+// these fields is produced by GenerateArgList().
+struct TestArgs {
+  TestType type;
+  bool reverse_compare;
+  int restart_interval;
+  CompressionType compression;
+};
+
+
+// Build the cross-product of test type x comparator direction x restart
+// interval x (supported) compression types as the Harness parameter list.
+static std::vector<TestArgs> GenerateArgList() {
+  std::vector<TestArgs> ret;
+  TestType test_type[4] = {TABLE_TEST, BLOCK_TEST, MEMTABLE_TEST, DB_TEST};
+  int test_type_len = 4;
+  bool reverse_compare[2] = {false, true};
+  int reverse_compare_len = 2;
+  int restart_interval[3] = {16, 1, 1024};
+  int restart_interval_len = 3;
+
+  // Only add compression if it is supported
+  std::vector<CompressionType> compression_types;
+  compression_types.push_back(kNoCompression);
+#ifdef SNAPPY
+  if (SnappyCompressionSupported())
+    compression_types.push_back(kSnappyCompression);
+#endif
+
+#ifdef ZLIB
+  if (ZlibCompressionSupported())
+    compression_types.push_back(kZlibCompression);
+#endif
+
+#ifdef BZIP2
+  if (BZip2CompressionSupported())
+    compression_types.push_back(kBZip2Compression);
+#endif
+
+  // Four nested loops enumerate every combination.
+  for(int i =0; i < test_type_len; i++)
+    for (int j =0; j < reverse_compare_len; j++)
+      for (int k =0; k < restart_interval_len; k++)
+  for (unsigned int n =0; n < compression_types.size(); n++) {
+    TestArgs one_arg;
+    one_arg.type = test_type[i];
+    one_arg.reverse_compare = reverse_compare[j];
+    one_arg.restart_interval = restart_interval[k];
+    one_arg.compression = compression_types[n];
+    ret.push_back(one_arg);
+  }
+
+  return ret;
+}
+
+class Harness {
+ public:
+  Harness() : constructor_(nullptr) { }
+
+  void Init(const TestArgs& args) {
+    delete constructor_;
+    constructor_ = nullptr;
+    options_ = Options();
+
+    options_.block_restart_interval = args.restart_interval;
+    options_.compression = args.compression;
+    // Use shorter block size for tests to exercise block boundary
+    // conditions more.
+    options_.block_size = 256;
+    if (args.reverse_compare) {
+      options_.comparator = &reverse_key_comparator;
+    }
+    switch (args.type) {
+      case TABLE_TEST:
+        constructor_ = new BlockBasedTableConstructor(options_.comparator);
+        break;
+      case BLOCK_TEST:
+        constructor_ = new BlockConstructor(options_.comparator);
+        break;
+      case MEMTABLE_TEST:
+        constructor_ = new MemTableConstructor(options_.comparator);
+        break;
+      case DB_TEST:
+        constructor_ = new DBConstructor(options_.comparator);
+        break;
+    }
+  }
+
+  ~Harness() {
+    delete constructor_;
+  }
+
+  void Add(const std::string& key, const std::string& value) {
+    constructor_->Add(key, value);
+  }
+
+  void Test(Random* rnd) {
+    std::vector<std::string> keys;
+    KVMap data;
+    constructor_->Finish(options_, &keys, &data);
+
+    TestForwardScan(keys, data);
+    TestBackwardScan(keys, data);
+    TestRandomAccess(rnd, keys, data);
+  }
+
+  void TestForwardScan(const std::vector<std::string>& keys,
+                       const KVMap& data) {
+    Iterator* iter = constructor_->NewIterator();
+    ASSERT_TRUE(!iter->Valid());
+    iter->SeekToFirst();
+    for (KVMap::const_iterator model_iter = data.begin();
+         model_iter != data.end();
+         ++model_iter) {
+      ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+      iter->Next();
+    }
+    ASSERT_TRUE(!iter->Valid());
+    delete iter;
+  }
+
+  void TestBackwardScan(const std::vector<std::string>& keys,
+                        const KVMap& data) {
+    Iterator* iter = constructor_->NewIterator();
+    ASSERT_TRUE(!iter->Valid());
+    iter->SeekToLast();
+    for (KVMap::const_reverse_iterator model_iter = data.rbegin();
+         model_iter != data.rend();
+         ++model_iter) {
+      ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+      iter->Prev();
+    }
+    ASSERT_TRUE(!iter->Valid());
+    delete iter;
+  }
+
+  void TestRandomAccess(Random* rnd,
+                        const std::vector<std::string>& keys,
+                        const KVMap& data) {
+    static const bool kVerbose = false;
+    Iterator* iter = constructor_->NewIterator();
+    ASSERT_TRUE(!iter->Valid());
+    KVMap::const_iterator model_iter = data.begin();
+    if (kVerbose) fprintf(stderr, "---\n");
+    for (int i = 0; i < 200; i++) {
+      const int toss = rnd->Uniform(5);
+      switch (toss) {
+        case 0: {
+          if (iter->Valid()) {
+            if (kVerbose) fprintf(stderr, "Next\n");
+            iter->Next();
+            ++model_iter;
+            ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+          }
+          break;
+        }
+
+        case 1: {
+          if (kVerbose) fprintf(stderr, "SeekToFirst\n");
+          iter->SeekToFirst();
+          model_iter = data.begin();
+          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+          break;
+        }
+
+        case 2: {
+          std::string key = PickRandomKey(rnd, keys);
+          model_iter = data.lower_bound(key);
+          if (kVerbose) fprintf(stderr, "Seek '%s'\n",
+                                EscapeString(key).c_str());
+          iter->Seek(Slice(key));
+          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+          break;
+        }
+
+        case 3: {
+          if (iter->Valid()) {
+            if (kVerbose) fprintf(stderr, "Prev\n");
+            iter->Prev();
+            if (model_iter == data.begin()) {
+              model_iter = data.end();   // Wrap around to invalid value
+            } else {
+              --model_iter;
+            }
+            ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+          }
+          break;
+        }
+
+        case 4: {
+          if (kVerbose) fprintf(stderr, "SeekToLast\n");
+          iter->SeekToLast();
+          if (keys.empty()) {
+            model_iter = data.end();
+          } else {
+            std::string last = data.rbegin()->first;
+            model_iter = data.lower_bound(last);
+          }
+          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+          break;
+        }
+      }
+    }
+    delete iter;
+  }
+
+  std::string ToString(const KVMap& data, const KVMap::const_iterator& it) {
+    if (it == data.end()) {
+      return "END";
+    } else {
+      return "'" + it->first + "->" + it->second + "'";
+    }
+  }
+
+  std::string ToString(const KVMap& data,
+                       const KVMap::const_reverse_iterator& it) {
+    if (it == data.rend()) {
+      return "END";
+    } else {
+      return "'" + it->first + "->" + it->second + "'";
+    }
+  }
+
+  std::string ToString(const Iterator* it) {
+    if (!it->Valid()) {
+      return "END";
+    } else {
+      return "'" + it->key().ToString() + "->" + it->value().ToString() + "'";
+    }
+  }
+
+  // Pick a seek target for randomized testing: an existing key, something
+  // slightly smaller than an existing key, or something slightly larger.
+  // Returns "foo" when there are no keys so callers always get a target.
+  std::string PickRandomKey(Random* rnd, const std::vector<std::string>& keys) {
+    if (keys.empty()) {
+      return "foo";
+    } else {
+      const int index = rnd->Uniform(keys.size());
+      std::string result = keys[index];
+      switch (rnd->Uniform(3)) {
+        case 0:
+          // Return an existing key
+          break;
+        case 1: {
+          // Attempt to return something smaller than an existing key
+          // (decrement the last byte unless it is already '\0').
+          if (result.size() > 0 && result[result.size()-1] > '\0') {
+            result[result.size()-1]--;
+          }
+          break;
+        }
+        case 2: {
+          // Return something larger than an existing key
+          Increment(options_.comparator, &result);
+          break;
+        }
+      }
+      return result;
+    }
+  }
+
+  // Returns nullptr if not running against a DB
+  // (i.e. when the current constructor is not the DB-backed one).
+  DB* db() const { return constructor_->db(); }
+
+ private:
+  Options options_ = Options();
+  Constructor* constructor_;
+};
+
+// Test the empty key
+TEST(Harness, SimpleEmptyKey) {
+  // Run the randomized harness over every generated configuration with a
+  // single empty key, verifying empty keys round-trip correctly.
+  std::vector<TestArgs> args = GenerateArgList();
+  for (unsigned int i = 0; i < args.size(); i++) {
+    Init(args[i]);
+    Random rnd(test::RandomSeed() + 1);
+    Add("", "v");
+    Test(&rnd);
+  }
+}
+
+TEST(Harness, SimpleSingle) {
+  // One ordinary key/value pair, across all harness configurations.
+  std::vector<TestArgs> args = GenerateArgList();
+  for (unsigned int i = 0; i < args.size(); i++) {
+    Init(args[i]);
+    Random rnd(test::RandomSeed() + 2);
+    Add("abc", "v");
+    Test(&rnd);
+  }
+}
+
+TEST(Harness, SimpleMulti) {
+  // A few keys including one that is a prefix of another ("abc"/"abcd"),
+  // across all harness configurations.
+  std::vector<TestArgs> args = GenerateArgList();
+  for (unsigned int i = 0; i < args.size(); i++) {
+    Init(args[i]);
+    Random rnd(test::RandomSeed() + 3);
+    Add("abc", "v");
+    Add("abcd", "v");
+    Add("ac", "v2");
+    Test(&rnd);
+  }
+}
+
+TEST(Harness, SimpleSpecialKey) {
+  // A key consisting of high (0xff) bytes, across all configurations.
+  std::vector<TestArgs> args = GenerateArgList();
+  for (unsigned int i = 0; i < args.size(); i++) {
+    Init(args[i]);
+    Random rnd(test::RandomSeed() + 4);
+    Add("\xff\xff", "v3");
+    Test(&rnd);
+  }
+}
+
+// Returns true iff low <= val <= high; on failure also prints the value and
+// bounds to stderr so ASSERT_TRUE(Between(...)) failures are diagnosable.
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+  bool result = (val >= low) && (val <= high);
+  if (!result) {
+    fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+            (unsigned long long)(val),
+            (unsigned long long)(low),
+            (unsigned long long)(high));
+  }
+  return result;
+}
+
+class TableTest { };
+
+// This test include all the basic checks except those for index size and block
+// size, which will be conducted in separated unit tests.
+TEST(TableTest, BasicTableProperties) {
+  BlockBasedTableConstructor c(BytewiseComparator());
+
+  // Every key is 2 bytes and every value 4 bytes, so the expected raw
+  // sizes below are simple multiples of the entry count.
+  c.Add("a1", "val1");
+  c.Add("b2", "val2");
+  c.Add("c3", "val3");
+  c.Add("d4", "val4");
+  c.Add("e5", "val5");
+  c.Add("f6", "val6");
+  c.Add("g7", "val7");
+  c.Add("h8", "val8");
+  c.Add("j9", "val9");
+
+  std::vector<std::string> keys;
+  KVMap kvmap;
+  Options options;
+  options.compression = kNoCompression;
+  options.block_restart_interval = 1;
+
+  c.Finish(options, &keys, &kvmap);
+
+  auto& props = c.table_reader()->GetTableProperties();
+  ASSERT_EQ(kvmap.size(), props.num_entries);
+
+  auto raw_key_size = kvmap.size() * 2ul;   // 2 bytes per key
+  auto raw_value_size = kvmap.size() * 4ul; // 4 bytes per value
+
+  ASSERT_EQ(raw_key_size, props.raw_key_size);
+  ASSERT_EQ(raw_value_size, props.raw_value_size);
+  ASSERT_EQ(1ul, props.num_data_blocks);
+  ASSERT_EQ("", props.filter_policy_name);  // no filter policy is used
+
+  // Verify data size: rebuild the single data block with the same options
+  // and compare its size (plus the per-block trailer) to the property.
+  BlockBuilder block_builder(options);
+  for (const auto& item : kvmap) {
+    block_builder.Add(item.first, item.second);
+  }
+  Slice content = block_builder.Finish();
+  ASSERT_EQ(
+      content.size() + kBlockTrailerSize,
+      props.data_size
+  );
+}
+
+TEST(TableTest, FilterPolicyNameProperties) {
+  // Verifies the filter policy's name is recorded in the table properties.
+  BlockBasedTableConstructor c(BytewiseComparator());
+  c.Add("a1", "val1");
+  std::vector<std::string> keys;
+  KVMap kvmap;
+  Options options;
+  std::unique_ptr<const FilterPolicy> filter_policy(
+    NewBloomFilterPolicy(10)
+  );
+  options.filter_policy = filter_policy.get();
+
+  c.Finish(options, &keys, &kvmap);
+  auto& props = c.table_reader()->GetTableProperties();
+  ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name);
+}
+
+// Convenience wrapper: return a random string of `len` bytes by value.
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+// It's very hard to figure out the index block size of a block accurately.
+// To make sure we get the index size, we just make sure as key number
+// grows, the filter block size also grows.
+TEST(TableTest, IndexSizeStat) {
+  uint64_t last_index_size = 0;
+
+  // we need to use random keys since the pure human readable texts
+  // may be well compressed, resulting in an insignificant change of index
+  // block size.
+  Random rnd(test::RandomSeed());
+  std::vector<std::string> keys;
+
+  for (int i = 0; i < 100; ++i) {
+    keys.push_back(RandomString(&rnd, 10000));
+  }
+
+  // Each time we load one more key to the table. the table index block
+  // size is expected to be larger than last time's.
+  for (size_t i = 1; i < keys.size(); ++i) {
+    BlockBasedTableConstructor c(BytewiseComparator());
+    for (size_t j = 0; j < i; ++j) {
+      c.Add(keys[j], "val");
+    }
+
+    std::vector<std::string> ks;
+    KVMap kvmap;
+    Options options;
+    options.compression = kNoCompression;
+    options.block_restart_interval = 1;
+
+    c.Finish(options, &ks, &kvmap);
+    auto index_size =
+      c.table_reader()->GetTableProperties().index_size;
+    // Index size must grow strictly with the number of keys.
+    ASSERT_GT(index_size, last_index_size);
+    last_index_size = index_size;
+  }
+}
+
+TEST(TableTest, NumBlockStat) {
+  Random rnd(test::RandomSeed());
+  BlockBasedTableConstructor c(BytewiseComparator());
+  Options options;
+  options.compression = kNoCompression;
+  options.block_restart_interval = 1;
+  options.block_size = 1000;
+
+  for (int i = 0; i < 10; ++i) {
+    // the key/val are slightly smaller than block size, so that each block
+    // holds roughly one key/value pair.
+    c.Add(RandomString(&rnd, 900), "val");
+  }
+
+  // With one entry per block, num_data_blocks should equal the entry count.
+  std::vector<std::string> ks;
+  KVMap kvmap;
+  c.Finish(options, &ks, &kvmap);
+  ASSERT_EQ(
+      kvmap.size(),
+      c.table_reader()->GetTableProperties().num_data_blocks
+  );
+}
+
+// Snapshot of the block-cache tickers from a Statistics object at
+// construction time, with a helper to assert the expected per-category
+// hit/miss counts.
+class BlockCacheProperties {
+ public:
+  explicit BlockCacheProperties(Statistics* statistics) {
+    block_cache_miss =
+      statistics->getTickerCount(BLOCK_CACHE_MISS);
+    block_cache_hit =
+      statistics->getTickerCount(BLOCK_CACHE_HIT);
+    index_block_cache_miss =
+      statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS);
+    index_block_cache_hit =
+      statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT);
+    data_block_cache_miss =
+      statistics->getTickerCount(BLOCK_CACHE_DATA_MISS);
+    data_block_cache_hit =
+      statistics->getTickerCount(BLOCK_CACHE_DATA_HIT);
+  }
+
+  // Check if the fetched props matches the expected ones.
+  // NOTE: the parameters deliberately shadow the members; `this->` is used
+  // to refer to the captured values.  Also asserts that the aggregate
+  // miss/hit tickers equal the sum of the index and data tickers.
+  void AssertEqual(
+      long index_block_cache_miss,
+      long index_block_cache_hit,
+      long data_block_cache_miss,
+      long data_block_cache_hit) const {
+    ASSERT_EQ(index_block_cache_miss, this->index_block_cache_miss);
+    ASSERT_EQ(index_block_cache_hit, this->index_block_cache_hit);
+    ASSERT_EQ(data_block_cache_miss, this->data_block_cache_miss);
+    ASSERT_EQ(data_block_cache_hit, this->data_block_cache_hit);
+    ASSERT_EQ(
+        index_block_cache_miss + data_block_cache_miss,
+        this->block_cache_miss
+    );
+    ASSERT_EQ(
+        index_block_cache_hit + data_block_cache_hit,
+        this->block_cache_hit
+    );
+  }
+
+ private:
+  long block_cache_miss = 0;
+  long block_cache_hit = 0;
+  long index_block_cache_miss = 0;
+  long index_block_cache_hit = 0;
+  long data_block_cache_miss = 0;
+  long data_block_cache_hit = 0;
+};
+
+TEST(TableTest, BlockCacheTest) {
+  // -- Table construction
+  Options options;
+  options.create_if_missing = true;
+  options.statistics = CreateDBStatistics();
+  options.block_cache = NewLRUCache(1024);
+  std::vector<std::string> keys;
+  KVMap kvmap;
+
+  BlockBasedTableConstructor c(BytewiseComparator());
+  c.Add("key", "value");
+  c.Finish(options, &keys, &kvmap);
+
+  // -- PART 1: Open with regular block cache.
+  // With the block cache enabled, every block read goes through the cache
+  // and is reflected in the statistics tickers.
+  unique_ptr<Iterator> iter;
+
+  // At first, no block will be accessed.
+  {
+    BlockCacheProperties props(options.statistics.get());
+    // index will be added to block cache.
+    props.AssertEqual(
+        1,  // index block miss
+        0,
+        0,
+        0
+    );
+  }
+
+  // Only index block will be accessed
+  {
+    iter.reset(c.NewIterator());
+    BlockCacheProperties props(options.statistics.get());
+    // NOTE: to help better highlight the "delta" of each ticker, I use
+    // <last_value> + <added_value> to indicate the increment of changed
+    // value; other numbers remain the same.
+    props.AssertEqual(
+        1,
+        0 + 1,  // index block hit
+        0,
+        0
+    );
+  }
+
+  // Only data block will be accessed
+  {
+    iter->SeekToFirst();
+    BlockCacheProperties props(options.statistics.get());
+    props.AssertEqual(
+        1,
+        1,
+        0 + 1,  // data block miss
+        0
+    );
+  }
+
+  // Data block will be in cache
+  {
+    iter.reset(c.NewIterator());
+    iter->SeekToFirst();
+    BlockCacheProperties props(options.statistics.get());
+    props.AssertEqual(
+        1,
+        1 + 1,  // index block hit
+        1,
+        0 + 1  // data block hit
+    );
+  }
+  // release the iterator so that the block cache can reset correctly.
+  iter.reset();
+
+  // -- PART 2: Open without block cache
+  options.block_cache.reset();
+  options.statistics = CreateDBStatistics();  // reset the stats
+  c.Reopen(options);
+
+  {
+    iter.reset(c.NewIterator());
+    iter->SeekToFirst();
+    ASSERT_EQ("key", iter->key().ToString());
+    BlockCacheProperties props(options.statistics.get());
+    // Nothing is affected at all
+    props.AssertEqual(0, 0, 0, 0);
+  }
+
+  // -- PART 3: Open with very small block cache
+  // In this test, no block will ever get hit since the block cache is
+  // too small to fit even one entry.
+  options.block_cache = NewLRUCache(1);
+  c.Reopen(options);
+  {
+    BlockCacheProperties props(options.statistics.get());
+    props.AssertEqual(
+        1,  // index block miss
+        0,
+        0,
+        0
+    );
+  }
+
+
+  {
+    // Both index and data block get accessed.
+    // It first caches the index block, then the data block. But since the
+    // cache size is only 1, the index block will be purged after the data
+    // block is inserted.
+    iter.reset(c.NewIterator());
+    BlockCacheProperties props(options.statistics.get());
+    props.AssertEqual(
+        1 + 1,  // index block miss
+        0,
+        0,  // data block miss
+        0
+    );
+  }
+
+  {
+    // SeekToFirst() accesses data block. With similar reason, we expect data
+    // block's cache miss.
+    iter->SeekToFirst();
+    BlockCacheProperties props(options.statistics.get());
+    props.AssertEqual(
+        2,
+        0,
+        0 + 1,  // data block miss
+        0
+    );
+  }
+}
+
+TEST(TableTest, ApproximateOffsetOfPlain) {
+  // With compression disabled, the approximate offset of each key should
+  // track the cumulative size of the values stored before it.
+  BlockBasedTableConstructor c(BytewiseComparator());
+  c.Add("k01", "hello");
+  c.Add("k02", "hello2");
+  c.Add("k03", std::string(10000, 'x'));
+  c.Add("k04", std::string(200000, 'x'));
+  c.Add("k05", std::string(300000, 'x'));
+  c.Add("k06", "hello3");
+  c.Add("k07", std::string(100000, 'x'));
+  std::vector<std::string> keys;
+  KVMap kvmap;
+  Options options;
+  options.block_size = 1024;
+  options.compression = kNoCompression;
+  c.Finish(options, &keys, &kvmap);
+
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"),      0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"),   10000,  11000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 210000, 211000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"),  210000, 211000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"),  510000, 511000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"),  510000, 511000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"),  610000, 612000));
+
+}
+
+// Builds a table with two ~10KB compressible values using the given
+// compression type and checks the approximate offsets reflect the
+// compressed (not raw) sizes.
+static void Do_Compression_Test(CompressionType comp) {
+  Random rnd(301);
+  BlockBasedTableConstructor c(BytewiseComparator());
+  std::string tmp;
+  c.Add("k01", "hello");
+  c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
+  c.Add("k03", "hello3");
+  c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
+  std::vector<std::string> keys;
+  KVMap kvmap;
+  Options options;
+  options.block_size = 1024;
+  options.compression = comp;
+  c.Finish(options, &keys, &kvmap);
+
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"),    2000,   3000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"),    2000,   3000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"),    4000,   6000));
+}
+
+TEST(TableTest, ApproximateOffsetOfCompressed) {
+  // Run the compression offset check for each compression library that
+  // this build actually supports; skip (with a notice) those that aren't.
+  CompressionType compression_state[2];
+  int valid = 0;
+  if (!SnappyCompressionSupported()) {
+    fprintf(stderr, "skipping snappy compression tests\n");
+  } else {
+    compression_state[valid] = kSnappyCompression;
+    valid++;
+  }
+
+  if (!ZlibCompressionSupported()) {
+    fprintf(stderr, "skipping zlib compression tests\n");
+  } else {
+    compression_state[valid] = kZlibCompression;
+    valid++;
+  }
+
+  for(int i =0; i < valid; i++)
+  {
+    Do_Compression_Test(compression_state[i]);
+  }
+
+}
+
+TEST(TableTest, BlockCacheLeak) {
+  // Check that when we reopen a table we don't lose access to blocks already
+  // in the cache. This test checks whether the Table actually makes use of the
+  // unique ID from the file.
+
+  Options opt;
+  opt.block_size = 1024;
+  opt.compression = kNoCompression;
+  opt.block_cache = NewLRUCache(16*1024*1024); // big enough so we don't ever
+                                               // lose cached values.
+
+  BlockBasedTableConstructor c(BytewiseComparator());
+  c.Add("k01", "hello");
+  c.Add("k02", "hello2");
+  c.Add("k03", std::string(10000, 'x'));
+  c.Add("k04", std::string(200000, 'x'));
+  c.Add("k05", std::string(300000, 'x'));
+  c.Add("k06", "hello3");
+  c.Add("k07", std::string(100000, 'x'));
+  std::vector<std::string> keys;
+  KVMap kvmap;
+  c.Finish(opt, &keys, &kvmap);
+
+  // Full scan to pull every block into the cache.
+  unique_ptr<Iterator> iter(c.NewIterator());
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    iter->key();
+    iter->value();
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+
+  // After reopening, all keys should still be served from the cache.
+  ASSERT_OK(c.Reopen(opt));
+  for (const std::string& key: keys) {
+    ASSERT_TRUE(c.table_reader()->TEST_KeyInCache(ReadOptions(), key));
+  }
+}
+
+TEST(Harness, Randomized) {
+  // Stress the harness with increasing numbers of random entries, for every
+  // generated configuration, comparing against the in-memory model.
+  std::vector<TestArgs> args = GenerateArgList();
+  for (unsigned int i = 0; i < args.size(); i++) {
+    Init(args[i]);
+    Random rnd(test::RandomSeed() + 5);
+    for (int num_entries = 0; num_entries < 2000;
+         num_entries += (num_entries < 50 ? 1 : 200)) {
+      if ((num_entries % 10) == 0) {
+        fprintf(stderr, "case %d of %d: num_entries = %d\n",
+                (i + 1), int(args.size()), num_entries);
+      }
+      for (int e = 0; e < num_entries; e++) {
+        std::string v;
+        Add(test::RandomKey(&rnd, rnd.Skewed(4)),
+            test::RandomString(&rnd, rnd.Skewed(5), &v).ToString());
+      }
+      Test(&rnd);
+    }
+  }
+}
+
+TEST(Harness, RandomizedLongDB) {
+  // DB-backed variant: insert 100k random entries so compaction is forced,
+  // then verify files exist on some level via the per-level property.
+  Random rnd(test::RandomSeed());
+  TestArgs args = { DB_TEST, false, 16, kNoCompression };
+  Init(args);
+  int num_entries = 100000;
+  for (int e = 0; e < num_entries; e++) {
+    std::string v;
+    Add(test::RandomKey(&rnd, rnd.Skewed(4)),
+        test::RandomString(&rnd, rnd.Skewed(5), &v).ToString());
+  }
+  Test(&rnd);
+
+  // We must have created enough data to force merging
+  int files = 0;
+  for (int level = 0; level < db()->NumberLevels(); level++) {
+    std::string value;
+    char name[100];
+    snprintf(name, sizeof(name), "rocksdb.num-files-at-level%d", level);
+    ASSERT_TRUE(db()->GetProperty(name, &value));
+    files += atoi(value.c_str());
+  }
+  ASSERT_GT(files, 0);
+}
+
+class MemTableTest { };
+
+TEST(MemTableTest, Simple) {
+  // Smoke test: insert a WriteBatch into a MemTable and iterate its
+  // contents to stderr.
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto table_factory = std::make_shared<SkipListFactory>();
+  Options options;
+  options.memtable_factory = table_factory;
+  MemTable* memtable = new MemTable(cmp, options);
+  memtable->Ref();
+  WriteBatch batch;
+  WriteBatchInternal::SetSequence(&batch, 100);
+  batch.Put(std::string("k1"), std::string("v1"));
+  batch.Put(std::string("k2"), std::string("v2"));
+  batch.Put(std::string("k3"), std::string("v3"));
+  batch.Put(std::string("largekey"), std::string("vlarge"));
+  ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable, &options).ok());
+
+  Iterator* iter = memtable->NewIterator();
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    fprintf(stderr, "key: '%s' -> '%s'\n",
+            iter->key().ToString().c_str(),
+            iter->value().ToString().c_str());
+    iter->Next();
+  }
+
+  delete iter;
+  // NOTE(review): this deletes whatever Unref() returns — presumably the
+  // memtable itself once the refcount drops to zero; confirm against the
+  // MemTable::Unref() contract.
+  delete memtable->Unref();
+}
+
+
+}  // namespace rocksdb
+
+// Test entry point: runs every TEST registered in this file.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc
new file mode 100644 (file)
index 0000000..ac2d8d3
--- /dev/null
@@ -0,0 +1,205 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/two_level_iterator.h"
+
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block.h"
+#include "table/format.h"
+#include "table/iterator_wrapper.h"
+
+namespace rocksdb {
+
+namespace {
+
+typedef Iterator* (*BlockFunction)(void*, const ReadOptions&,
+                                   const EnvOptions& soptions, const Slice&,
+                                   bool for_compaction);
+
+// Iterator over an index iterator whose entries point at data blocks;
+// yields the concatenation of all key/value pairs in those blocks.
+// data_iter_ is lazily (re)created from the current index entry via
+// block_function_.
+class TwoLevelIterator: public Iterator {
+ public:
+  TwoLevelIterator(
+    Iterator* index_iter,
+    BlockFunction block_function,
+    void* arg,
+    const ReadOptions& options,
+    const EnvOptions& soptions,
+    bool for_compaction);
+
+  virtual ~TwoLevelIterator();
+
+  virtual void Seek(const Slice& target);
+  virtual void SeekToFirst();
+  virtual void SeekToLast();
+  virtual void Next();
+  virtual void Prev();
+
+  // Valid iff the current data-block iterator is positioned on an entry.
+  virtual bool Valid() const {
+    return data_iter_.Valid();
+  }
+  virtual Slice key() const {
+    assert(Valid());
+    return data_iter_.key();
+  }
+  virtual Slice value() const {
+    assert(Valid());
+    return data_iter_.value();
+  }
+  // Reports the first error seen: index iterator, then data iterator,
+  // then any error saved by SaveError().
+  virtual Status status() const {
+    // It'd be nice if status() returned a const Status& instead of a Status
+    if (!index_iter_.status().ok()) {
+      return index_iter_.status();
+    } else if (data_iter_.iter() != nullptr && !data_iter_.status().ok()) {
+      return data_iter_.status();
+    } else {
+      return status_;
+    }
+  }
+
+ private:
+  // Remember the first non-OK status; later errors are dropped.
+  void SaveError(const Status& s) {
+    if (status_.ok() && !s.ok()) status_ = s;
+  }
+  void SkipEmptyDataBlocksForward();
+  void SkipEmptyDataBlocksBackward();
+  void SetDataIterator(Iterator* data_iter);
+  void InitDataBlock();
+
+  BlockFunction block_function_;
+  void* arg_;
+  const ReadOptions options_;
+  const EnvOptions& soptions_;
+  Status status_;
+  IteratorWrapper index_iter_;
+  IteratorWrapper data_iter_; // May be nullptr
+  // If data_iter_ is non-nullptr, then "data_block_handle_" holds the
+  // "index_value" passed to block_function_ to create the data_iter_.
+  std::string data_block_handle_;
+  bool for_compaction_;
+};
+
+// Takes ownership of index_iter (via IteratorWrapper); data_iter_ starts
+// out null until the first positioning call creates it.
+TwoLevelIterator::TwoLevelIterator(
+    Iterator* index_iter,
+    BlockFunction block_function,
+    void* arg,
+    const ReadOptions& options,
+    const EnvOptions& soptions,
+    bool for_compaction)
+    : block_function_(block_function),
+      arg_(arg),
+      options_(options),
+      soptions_(soptions),
+      index_iter_(index_iter),
+      data_iter_(nullptr),
+      for_compaction_(for_compaction) {
+}
+
+// Wrapped iterators are cleaned up by the IteratorWrapper members.
+TwoLevelIterator::~TwoLevelIterator() {
+}
+
+// Position at the first entry >= target, skipping over empty data blocks.
+void TwoLevelIterator::Seek(const Slice& target) {
+  index_iter_.Seek(target);
+  InitDataBlock();
+  if (data_iter_.iter() != nullptr) data_iter_.Seek(target);
+  SkipEmptyDataBlocksForward();
+}
+
+// Position at the first entry of the first non-empty data block.
+void TwoLevelIterator::SeekToFirst() {
+  index_iter_.SeekToFirst();
+  InitDataBlock();
+  if (data_iter_.iter() != nullptr) data_iter_.SeekToFirst();
+  SkipEmptyDataBlocksForward();
+}
+
+// Position at the last entry of the last non-empty data block.
+void TwoLevelIterator::SeekToLast() {
+  index_iter_.SeekToLast();
+  InitDataBlock();
+  if (data_iter_.iter() != nullptr) data_iter_.SeekToLast();
+  SkipEmptyDataBlocksBackward();
+}
+
+// Advance within the current block, rolling over to later blocks as needed.
+void TwoLevelIterator::Next() {
+  assert(Valid());
+  data_iter_.Next();
+  SkipEmptyDataBlocksForward();
+}
+
+// Step back within the current block, rolling over to earlier blocks.
+void TwoLevelIterator::Prev() {
+  assert(Valid());
+  data_iter_.Prev();
+  SkipEmptyDataBlocksBackward();
+}
+
+
+// While the data iterator is missing or exhausted (and not merely reporting
+// an Incomplete read, e.g. a cache-only miss), advance the index iterator
+// and open the next data block; invalidates the iterator when the index
+// runs out.
+void TwoLevelIterator::SkipEmptyDataBlocksForward() {
+  while (data_iter_.iter() == nullptr || (!data_iter_.Valid() &&
+        !data_iter_.status().IsIncomplete())) {
+    // Move to next block
+    if (!index_iter_.Valid()) {
+      SetDataIterator(nullptr);
+      return;
+    }
+    index_iter_.Next();
+    InitDataBlock();
+    if (data_iter_.iter() != nullptr) data_iter_.SeekToFirst();
+  }
+}
+
+// Mirror of SkipEmptyDataBlocksForward(), walking the index backwards and
+// positioning at the last entry of each opened block.
+void TwoLevelIterator::SkipEmptyDataBlocksBackward() {
+  while (data_iter_.iter() == nullptr || (!data_iter_.Valid() &&
+        !data_iter_.status().IsIncomplete())) {
+    // Move to the preceding block
+    if (!index_iter_.Valid()) {
+      SetDataIterator(nullptr);
+      return;
+    }
+    index_iter_.Prev();
+    InitDataBlock();
+    if (data_iter_.iter() != nullptr) data_iter_.SeekToLast();
+  }
+}
+
+// Replace the data iterator, preserving any error the old one reported.
+void TwoLevelIterator::SetDataIterator(Iterator* data_iter) {
+  if (data_iter_.iter() != nullptr) SaveError(data_iter_.status());
+  data_iter_.Set(data_iter);
+}
+
+// Create (or reuse) the data iterator for the block named by the current
+// index entry.  data_block_handle_ caches the handle so that re-seeking
+// into the same block does not re-invoke block_function_.
+void TwoLevelIterator::InitDataBlock() {
+  if (!index_iter_.Valid()) {
+    SetDataIterator(nullptr);
+  } else {
+    Slice handle = index_iter_.value();
+    if (data_iter_.iter() != nullptr
+        && handle.compare(data_block_handle_) == 0) {
+      // data_iter_ is already constructed with this iterator, so
+      // no need to change anything
+    } else {
+      Iterator* iter = (*block_function_)(arg_, options_, soptions_, handle,
+                                          for_compaction_);
+      data_block_handle_.assign(handle.data(), handle.size());
+      SetDataIterator(iter);
+    }
+  }
+}
+
+}  // namespace
+
+// Public factory (declared in two_level_iterator.h); the concrete class
+// lives in the anonymous namespace above.
+Iterator* NewTwoLevelIterator(
+    Iterator* index_iter,
+    BlockFunction block_function,
+    void* arg,
+    const ReadOptions& options,
+    const EnvOptions& soptions,
+    bool for_compaction) {
+  return new TwoLevelIterator(index_iter, block_function, arg,
+                              options, soptions, for_compaction);
+}
+
+}  // namespace rocksdb
diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h
new file mode 100644 (file)
index 0000000..85aed3f
--- /dev/null
@@ -0,0 +1,40 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/iterator.h"
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+
+struct ReadOptions;
+
+// Return a new two level iterator.  A two-level iterator contains an
+// index iterator whose values point to a sequence of blocks where
+// each block is itself a sequence of key,value pairs.  The returned
+// two-level iterator yields the concatenation of all key/value pairs
+// in the sequence of blocks.  Takes ownership of "index_iter" and
+// will delete it when no longer needed.
+//
+// Uses a supplied function to convert an index_iter value into
+// an iterator over the contents of the corresponding block.
+extern Iterator* NewTwoLevelIterator(
+    Iterator* index_iter,
+    Iterator* (*block_function)(
+        void* arg,
+        const ReadOptions& options,
+        const EnvOptions& soptions,
+        const Slice& index_value,
+        bool for_compaction),
+    void* arg,
+    const ReadOptions& options,
+    const EnvOptions& soptions,
+    bool for_compaction = false);
+
+}  // namespace rocksdb
diff --git a/tools/blob_store_bench.cc b/tools/blob_store_bench.cc
new file mode 100644 (file)
index 0000000..70ece2c
--- /dev/null
@@ -0,0 +1,269 @@
+#include <cstdio>
+#include <vector>
+#include <atomic>
+
+#include "rocksdb/env.h"
+#include "util/blob_store.h"
+#include "util/testutil.h"
+
+#define KB 1024LL
+#define MB 1024*1024LL
+// BlobStore does costly asserts to make sure it's running correctly, which
+// significantly impacts benchmark runtime.
+// NDEBUG compiles out those asserts; note that assert() expands at the point
+// where <cassert>/assert.h is included, so defining NDEBUG here — after the
+// includes above — only affects code processed after this point.
+#ifndef NDEBUG
+#define NDEBUG
+#endif
+
+using namespace rocksdb;
+using namespace std;
+
+// used by all threads
+uint64_t timeout_sec;
+Env *env;
+BlobStore* bs;
+
+// Convenience wrapper: return a random string of `len` bytes by value.
+static std::string RandomString(Random* rnd, uint64_t len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+// Per-thread (and aggregated) benchmark counters.  print() computes
+// throughput against the global timeout_sec, so it assumes the run lasted
+// that long.
+struct Result {
+  uint32_t writes;
+  uint32_t reads;
+  uint32_t deletes;
+  uint64_t data_written;
+  uint64_t data_read;
+
+  void print() {
+    printf("Total writes = %u\n", writes);
+    printf("Total reads = %u\n", reads);
+    printf("Total deletes = %u\n", deletes);
+    printf("Write throughput = %lf MB/s\n",
+           (double)data_written / (1024*1024.0) / timeout_sec);
+    printf("Read throughput = %lf MB/s\n",
+           (double)data_read / (1024*1024.0) / timeout_sec);
+    printf("Total throughput = %lf MB/s\n",
+           (double)(data_read + data_written) / (1024*1024.0) / timeout_sec);
+  }
+
+  // Zero-initialize all counters.
+  Result() {
+    writes = reads = deletes = data_read = data_written = 0;
+  }
+
+  Result (uint32_t writes, uint32_t reads, uint32_t deletes,
+          uint64_t data_written, uint64_t data_read) :
+    writes(writes), reads(reads), deletes(deletes),
+    data_written(data_written), data_read(data_read) {}
+
+};
+
+// Component-wise sum, used to aggregate per-thread results.
+Result operator + (const Result &a, const Result &b) {
+  return Result(a.writes + b.writes, a.reads + b.reads,
+                a.deletes + b.deletes, a.data_written + b.data_written,
+                a.data_read + b.data_read);
+}
+
+// Configuration + output slot for one benchmark thread.  Note the copy
+// constructor copies only the configuration: `result` is re-defaulted and
+// `stopped` reset to false (std::atomic is not copyable).
+struct WorkerThread {
+  uint64_t data_size_from, data_size_to;
+  double read_ratio;
+  uint64_t working_set_size; // start deleting once you reach this
+  Result result;
+  atomic<bool> stopped;
+
+  WorkerThread(uint64_t data_size_from, uint64_t data_size_to,
+                double read_ratio, uint64_t working_set_size) :
+    data_size_from(data_size_from), data_size_to(data_size_to),
+    read_ratio(read_ratio), working_set_size(working_set_size),
+    stopped(false) {}
+
+  WorkerThread(const WorkerThread& wt) :
+    data_size_from(wt.data_size_from), data_size_to(wt.data_size_to),
+    read_ratio(wt.read_ratio), working_set_size(wt.working_set_size),
+    stopped(false) {}
+};
+
+// Thread entry point: until timeout_sec elapses, randomly mix reads and
+// writes against the global BlobStore `bs` per t->read_ratio, deleting
+// random blobs whenever the live data exceeds t->working_set_size.
+// NOTE(review): uses plain rand() from multiple threads — rand() is not
+// guaranteed thread-safe; for a benchmark this only perturbs the random
+// sequence, but confirm it's acceptable.
+static void WorkerThreadBody(void* arg) {
+  WorkerThread* t = reinterpret_cast<WorkerThread*>(arg);
+  Random rnd(5);
+  string buf;
+  vector<pair<Blob, uint64_t>> blobs;
+  vector<string> random_strings;
+
+  // Pre-generate payloads so the hot loop doesn't pay for string creation.
+  for (int i = 0; i < 10; ++i) {
+    random_strings.push_back(RandomString(&rnd, t->data_size_to));
+  }
+
+  uint64_t total_size = 0;
+
+  uint64_t start_micros = env->NowMicros();
+  while (env->NowMicros() - start_micros < timeout_sec * 1000 * 1000) {
+    if (blobs.size() && rand() < RAND_MAX * t->read_ratio) {
+      // read
+      int bi = rand() % blobs.size();
+      Status s = bs->Get(blobs[bi].first, &buf);
+      assert(s.ok());
+      t->result.data_read += buf.size();
+      t->result.reads++;
+    } else {
+      // write a random-size slice of one of the pre-generated payloads
+      uint64_t size = rand() % (t->data_size_to - t->data_size_from) +
+        t->data_size_from;
+      total_size += size;
+      string put_str = random_strings[rand() % random_strings.size()];
+      blobs.push_back(make_pair(Blob(), size));
+      Status s = bs->Put(Slice(put_str.data(), size), &blobs.back().first);
+      assert(s.ok());
+      t->result.data_written += size;
+      t->result.writes++;
+    }
+
+    while (total_size >= t->working_set_size) {
+      // delete random
+      int bi = rand() % blobs.size();
+      total_size -= blobs[bi].second;
+      bs->Delete(blobs[bi].first);
+      blobs.erase(blobs.begin() + bi);
+      t->result.deletes++;
+    }
+  }
+  // Signal completion to StartBenchmark()'s polling loop.
+  t->stopped.store(true);
+}
+
+// Launch one thread per WorkerThread config, busy-wait for each to set its
+// `stopped` flag, aggregate the results, and free the configs and the
+// global BlobStore `bs` (so it must have been set by a Setup* call).
+Result StartBenchmark(vector<WorkerThread*>& config) {
+  for (auto w : config) {
+    env->StartThread(WorkerThreadBody, w);
+  }
+
+  Result result;
+
+  for (auto w : config) {
+    // Spin until the worker flags completion (acceptable for a benchmark
+    // driver; burns one core per waiting thread).
+    while (!w->stopped.load());
+    result = result + w->result;
+  }
+
+  for (auto w : config) {
+    delete w;
+  }
+
+  delete bs;
+
+  return result;
+}
+
+// Build the global BlobStore and 10 worker configs with a 50/50 read/write
+// mix.  NOTE(review): differs from the WriteHeavy/ReadHeavy variants only
+// in read_write_ratio — the three could share one parameterized helper.
+vector<WorkerThread*> SetupBenchmarkBalanced() {
+  string test_path;
+  env->GetTestDirectory(&test_path);
+  test_path.append("/blob_store");
+
+  // config start
+  uint32_t block_size = 16*KB;
+  uint32_t file_size = 1*MB;
+  double read_write_ratio = 0.5;
+  uint64_t data_read_from = 16*KB;
+  uint64_t data_read_to = 32*KB;
+  int number_of_threads = 10;
+  uint64_t working_set_size = 5*MB;
+  timeout_sec = 5;
+  // config end
+
+  bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env);
+
+  vector <WorkerThread*> config;
+
+  for (int i = 0; i < number_of_threads; ++i) {
+    config.push_back(new WorkerThread(data_read_from,
+                                      data_read_to,
+                                      read_write_ratio,
+                                      working_set_size));
+  };
+
+  return config;
+}
+
+// Same as SetupBenchmarkBalanced() but with a 10% read ratio (write-heavy).
+vector<WorkerThread*> SetupBenchmarkWriteHeavy() {
+  string test_path;
+  env->GetTestDirectory(&test_path);
+  test_path.append("/blob_store");
+
+  // config start
+  uint32_t block_size = 16*KB;
+  uint32_t file_size = 1*MB;
+  double read_write_ratio = 0.1;
+  uint64_t data_read_from = 16*KB;
+  uint64_t data_read_to = 32*KB;
+  int number_of_threads = 10;
+  uint64_t working_set_size = 5*MB;
+  timeout_sec = 5;
+  // config end
+
+  bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env);
+
+  vector <WorkerThread*> config;
+
+  for (int i = 0; i < number_of_threads; ++i) {
+    config.push_back(new WorkerThread(data_read_from,
+                                      data_read_to,
+                                      read_write_ratio,
+                                      working_set_size));
+  };
+
+  return config;
+}
+
+// Same as SetupBenchmarkBalanced() but with a 90% read ratio (read-heavy).
+vector<WorkerThread*> SetupBenchmarkReadHeavy() {
+  string test_path;
+  env->GetTestDirectory(&test_path);
+  test_path.append("/blob_store");
+
+  // config start
+  uint32_t block_size = 16*KB;
+  uint32_t file_size = 1*MB;
+  double read_write_ratio = 0.9;
+  uint64_t data_read_from = 16*KB;
+  uint64_t data_read_to = 32*KB;
+  int number_of_threads = 10;
+  uint64_t working_set_size = 5*MB;
+  timeout_sec = 5;
+  // config end
+
+  bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env);
+
+  vector <WorkerThread*> config;
+
+  for (int i = 0; i < number_of_threads; ++i) {
+    config.push_back(new WorkerThread(data_read_from,
+                                      data_read_to,
+                                      read_write_ratio,
+                                      working_set_size));
+  };
+
+  return config;
+}
+
+// Run the three benchmark mixes in sequence.  Each Setup* recreates the
+// global BlobStore, and StartBenchmark() deletes it when that run finishes.
+int main(int argc, const char** argv) {
+  srand(33);
+  env = Env::Default();
+
+  {
+    printf("--- Balanced read/write benchmark ---\n");
+    vector <WorkerThread*> config = SetupBenchmarkBalanced();
+    Result r = StartBenchmark(config);
+    r.print();
+  }
+  {
+    printf("--- Write heavy benchmark ---\n");
+    vector <WorkerThread*> config = SetupBenchmarkWriteHeavy();
+    Result r = StartBenchmark(config);
+    r.print();
+  }
+  {
+    printf("--- Read heavy benchmark ---\n");
+    vector <WorkerThread*> config = SetupBenchmarkReadHeavy();
+    Result r = StartBenchmark(config);
+    r.print();
+  }
+
+  return 0;
+}
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
new file mode 100644 (file)
index 0000000..6270d69
--- /dev/null
@@ -0,0 +1,136 @@
+#! /usr/bin/env python
+import os
+import re
+import sys
+import time
+import random
+import getopt
+import logging
+import tempfile
+import subprocess
+
+# This script runs and kills db_stress multiple times. It checks consistency
+# in case of unsafe crashes in Rocksdb.
+
+def main(argv):
+    try:
+        opts, args = getopt.getopt(argv, "hd:t:i:o:b:")
+    except getopt.GetoptError:
+        print("db_crashtest.py -d <duration_test> -t <#threads> "
+              "-i <interval for one run> -o <ops_per_thread> "
+              "-b <write_buffer_size>\n")
+        sys.exit(2)
+
+    # default values, will be overridden by cmdline args
+    interval = 120  # time for one db_stress instance to run
+    duration = 6000  # total time for this script to test db_stress
+    threads = 32
+    # since we will be killing anyway, use large value for ops_per_thread
+    ops_per_thread = 100000000
+    write_buf_size = 4 * 1024 * 1024
+
+    for opt, arg in opts:
+        if opt == '-h':
+            print("db_crashtest.py -d <duration_test>"
+                  " -t <#threads> -i <interval for one run>"
+                  " -o <ops_per_thread> -b <write_buffer_size>\n")
+            sys.exit()
+        elif opt == "-d":
+            duration = int(arg)
+        elif opt == "-t":
+            threads = int(arg)
+        elif opt == "-i":
+            interval = int(arg)
+        elif opt == "-o":
+            ops_per_thread = int(arg)
+        elif opt == "-b":
+            write_buf_size = int(arg)
+        else:
+            print("db_crashtest.py -d <duration_test>"
+                  " -t <#threads> -i <interval for one run>"
+                  " -o <ops_per_thread> -b <write_buffer_size>\n")
+            sys.exit(2)
+
+    exit_time = time.time() + duration
+
+    print("Running blackbox-crash-test with \ninterval_between_crash="
+          + str(interval) + "\ntotal-duration=" + str(duration)
+          + "\nthreads=" + str(threads) + "\nops_per_thread="
+          + str(ops_per_thread) + "\nwrite_buffer_size="
+          + str(write_buf_size) + "\n")
+
+    while time.time() < exit_time:
+        run_had_errors = False
+        killtime = time.time() + interval
+
+        cmd = re.sub('\s+', ' ', """
+            ./db_stress
+            --test_batches_snapshots=1
+            --ops_per_thread=%s
+            --threads=%s
+            --write_buffer_size=%s
+            --destroy_db_initially=0
+            --reopen=0
+            --readpercent=45
+            --prefixpercent=5
+            --writepercent=35
+            --delpercent=5
+            --iterpercent=10
+            --db=%s
+            --max_key=100000000
+            --disable_seek_compaction=%s
+            --mmap_read=%s
+            --block_size=16384
+            --cache_size=1048576
+            --open_files=500000
+            --verify_checksum=1
+            --sync=%s
+            --disable_wal=0
+            --disable_data_sync=%s
+            --target_file_size_base=2097152
+            --target_file_size_multiplier=2
+            --max_write_buffer_number=3
+            --max_background_compactions=20
+            --max_bytes_for_level_base=10485760
+            --filter_deletes=%s
+            """ % (ops_per_thread,
+                   threads,
+                   write_buf_size,
+                   tempfile.mkdtemp(),
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   random.randint(0, 1)))
+
+        child = subprocess.Popen([cmd],
+                                 stderr=subprocess.PIPE, shell=True)
+        print("Running db_stress with pid=%d: %s\n\n"
+              % (child.pid, cmd))
+
+        while time.time() < killtime:
+            time.sleep(10)
+
+        if child.poll() is not None:
+            print("WARNING: db_stress ended before kill: exitcode=%d\n"
+                  % child.returncode)
+        else:
+            child.kill()
+            print("KILLED %d\n" % child.pid)
+            time.sleep(1)  # time to stabilize after a kill
+
+        while True:
+            line = child.stderr.readline().strip()
+            if line != '':
+                run_had_errors = True
+                print('***' + line + '^')
+            else:
+                break
+
+        if run_had_errors:
+            sys.exit(2)
+
+        time.sleep(1)  # time to stabilize before the next run
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/tools/db_crashtest2.py b/tools/db_crashtest2.py
new file mode 100644 (file)
index 0000000..dbb7059
--- /dev/null
@@ -0,0 +1,163 @@
+#! /usr/bin/env python
+import os
+import re
+import sys
+import time
+import random
+import getopt
+import logging
+import tempfile
+import subprocess
+
+# This python script runs db_stress multiple times. Some runs with
+# kill_random_test that causes rocksdb to crash at various points in code.
+
+def main(argv):
+    try:
+        opts, args = getopt.getopt(argv, "hd:t:k:o:b:")
+    except getopt.GetoptError:
+        print str(getopt.GetoptError)
+        print "db_crashtest2.py -d <duration_test> -t <#threads> " \
+              "-k <kills with prob 1/k> -o <ops_per_thread> "\
+              "-b <write_buffer_size>\n"
+        sys.exit(2)
+
+    # default values, will be overridden by cmdline args
+    kill_random_test = 97  # kill with probability 1/97 by default
+    duration = 10000  # total time for this script to test db_stress
+    threads = 32
+    ops_per_thread = 200000
+    write_buf_size = 4 * 1024 * 1024
+
+    for opt, arg in opts:
+        if opt == '-h':
+            print "db_crashtest2.py -d <duration_test> -t <#threads> " \
+                  "-k <kills with prob 1/k> -o <ops_per_thread> " \
+                  "-b <write_buffer_size>\n"
+            sys.exit()
+        elif opt == "-d":
+            duration = int(arg)
+        elif opt == "-t":
+            threads = int(arg)
+        elif opt == "-k":
+            kill_random_test = int(arg)
+        elif opt == "-o":
+            ops_per_thread = int(arg)
+        elif opt == "-b":
+            write_buf_size = int(arg)
+        else:
+            print "unrecognized option " + str(opt) + "\n"
+            print "db_crashtest2.py -d <duration_test> -t <#threads> " \
+                  "-k <kills with prob 1/k> -o <ops_per_thread> " \
+                  "-b <write_buffer_size>\n"
+            sys.exit(2)
+
+    exit_time = time.time() + duration
+
+    print "Running whitebox-crash-test with \ntotal-duration=" + str(duration) \
+          + "\nthreads=" + str(threads) + "\nops_per_thread=" \
+          + str(ops_per_thread) + "\nwrite_buffer_size=" \
+          + str(write_buf_size) + "\n"
+
+    total_check_mode = 3
+    check_mode = 0
+
+    while time.time() < exit_time:
+        killoption = ""
+        if check_mode == 0:
+            # run with kill_random_test
+            killoption = " --kill_random_test=" + str(kill_random_test)
+            # use large ops per thread since we will kill it anyway
+            additional_opts = "--ops_per_thread=" + \
+                              str(100 * ops_per_thread) + killoption
+        elif check_mode == 1:
+            # normal run with universal compaction mode
+            additional_opts = "--ops_per_thread=" + str(ops_per_thread) + \
+                              " --compaction_style=1"
+        else:
+            # nomral run
+            additional_opts = "--ops_per_thread=" + str(ops_per_thread)
+
+        cmd = re.sub('\s+', ' ', """
+            ./db_stress
+            --test_batches_snapshots=%s
+            --threads=%s
+            --write_buffer_size=%s
+            --destroy_db_initially=0
+            --reopen=0
+            --readpercent=45
+            --prefixpercent=5
+            --writepercent=35
+            --delpercent=5
+            --iterpercent=10
+            --db=%s
+            --max_key=100000000
+            --disable_seek_compaction=%s
+            --mmap_read=%s
+            --block_size=16384
+            --cache_size=1048576
+            --open_files=500000
+            --verify_checksum=1
+            --sync=%s
+            --disable_wal=0
+            --disable_data_sync=%s
+            --target_file_size_base=2097152
+            --target_file_size_multiplier=2
+            --max_write_buffer_number=3
+            --max_background_compactions=20
+            --max_bytes_for_level_base=10485760
+            --filter_deletes=%s
+            %s
+            """ % (random.randint(0, 1),
+                   threads,
+                   write_buf_size,
+                   tempfile.mkdtemp(),
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   additional_opts))
+
+        print "Running:" + cmd + "\n"
+
+        popen = subprocess.Popen([cmd], stdout=subprocess.PIPE,
+                                 stderr=subprocess.STDOUT,
+                                 shell=True)
+        stdoutdata, stderrdata = popen.communicate()
+        retncode = popen.returncode
+        msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
+               check_mode, killoption, retncode))
+        print msg
+        print stdoutdata
+
+        expected = False
+        if (killoption == '') and (retncode == 0):
+            # we expect zero retncode if no kill option
+            expected = True
+        elif killoption != '' and retncode < 0:
+            # we expect negative retncode if kill option was given
+            expected = True
+
+        if not expected:
+            print "TEST FAILED. See kill option and exit code above!!!\n"
+            sys.exit(1)
+
+        stdoutdata = stdoutdata.lower()
+        errorcount = (stdoutdata.count('error') -
+                      stdoutdata.count('got errors 0 times'))
+        print "#times error occurred in output is " + str(errorcount) + "\n"
+
+        if (errorcount > 0):
+            print "TEST FAILED. Output has 'error'!!!\n"
+            sys.exit(2)
+        if (stdoutdata.find('fail') >= 0):
+            print "TEST FAILED. Output has 'fail'!!!\n"
+            sys.exit(2)
+
+        check_mode = (check_mode + 1) % total_check_mode
+
+        time.sleep(1)  # time to stabilize after a kill
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc
new file mode 100644 (file)
index 0000000..9dfe4b6
--- /dev/null
@@ -0,0 +1,134 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <cstdio>
+
+#include <gflags/gflags.h>
+
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/types.h"
+#include "port/atomic_pointer.h"
+#include "util/testutil.h"
+
+
+// Run a thread to perform Put's.
+// Another thread uses GetUpdatesSince API to keep getting the updates.
+// options :
+// --num_inserts = the num of inserts the first thread should perform.
+// --wal_ttl = the wal ttl for the run.
+
+using namespace rocksdb;
+
+// Arguments passed to DataPumpThreadBody via Env::StartThread.
+struct DataPumpThread {
+  size_t no_records;  // number of Put operations the writer should issue
+  DB* db; // Assumption DB is Open'ed already.
+};
+
+// Return a random string of `len` bytes generated from `rnd`.
+static std::string RandomString(Random* rnd, int len) {
+  std::string result;
+  test::RandomString(rnd, len, &result);
+  return result;
+}
+
+// Writer-thread entry point: issue `no_records` Puts of random
+// 500-byte keys and values against the DB carried in `arg`
+// (a DataPumpThread*).  Exits the process on the first failed write.
+static void DataPumpThreadBody(void* arg) {
+  DataPumpThread* pump = reinterpret_cast<DataPumpThread*>(arg);
+  Random rnd(301);
+  for (size_t done = 0; done < pump->no_records; ++done) {
+    Status s = pump->db->Put(WriteOptions(), Slice(RandomString(&rnd, 500)),
+                             Slice(RandomString(&rnd, 500)));
+    if (!s.ok()) {
+      fprintf(stderr, "Error in put\n");
+      exit(1);
+    }
+  }
+}
+
+// Shared state between main() and ReplicationThreadBody.
+struct ReplicationThread {
+  port::AtomicPointer stop;  // non-null: keep running; null: stop requested
+  DB* db;
+  volatile size_t no_read;  // records read so far; polled by main()
+};
+
+// Reader-thread entry point: tail the DB's transaction log with
+// GetUpdatesSince, counting records into t->no_read, until t->stop is
+// cleared (set to nullptr) by the main thread.  Exits the process if a
+// sequence number is skipped.
+static void ReplicationThreadBody(void* arg) {
+  ReplicationThread* t = reinterpret_cast<ReplicationThread*>(arg);
+  DB* db = t->db;
+  unique_ptr<TransactionLogIterator> iter;
+  SequenceNumber currentSeqNum = 1;
+  while (t->stop.Acquire_Load() != nullptr) {
+    iter.reset();
+    Status s;  // unused
+    // Retry until an iterator is available, bailing out if asked to stop.
+    while(!db->GetUpdatesSince(currentSeqNum, &iter).ok()) {
+      if (t->stop.Acquire_Load() == nullptr) {
+        return;
+      }
+    }
+    fprintf(stderr, "Refreshing iterator\n");
+    // NOTE(review): advances currentSeqNum by 1 per batch, which assumes
+    // every batch carries exactly one record -- confirm against writer.
+    for(;iter->Valid(); iter->Next(), t->no_read++, currentSeqNum++) {
+      BatchResult res = iter->GetBatch();
+      if (res.sequence != currentSeqNum) {
+        fprintf(stderr,
+                "Missed a seq no. b/w %ld and %ld\n",
+                (long)currentSeqNum,
+                (long)res.sequence);
+        exit(1);
+      }
+    }
+  }
+}
+
+// Command-line flags (gflags) for the replication stress run.
+DEFINE_uint64(num_inserts, 1000, "the num of inserts the first thread should"
+              " perform.");
+DEFINE_uint64(wal_ttl_seconds, 1000, "the wal ttl for the run (in seconds)");
+DEFINE_uint64(wal_size_limit_MB, 10, "the wal size limit for the run "
+              "(in MB)");
+
+// Entry point: start one writer thread (DataPumpThreadBody) and one
+// log-tailing reader thread (ReplicationThreadBody) against a freshly
+// created DB, then wait until the reader has seen at least
+// --num_inserts records.
+int main(int argc, const char** argv) {
+  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+    " --num_inserts=<num_inserts> --wal_ttl_seconds=<WAL_ttl_seconds>" +
+    " --wal_size_limit_MB=<WAL_size_limit_MB>");
+  google::ParseCommandLineFlags(&argc, const_cast<char***>(&argv), true);
+
+  Env* env = Env::Default();
+  std::string default_db_path;
+  env->GetTestDirectory(&default_db_path);
+  default_db_path += "db_repl_stress";
+  Options options;
+  options.create_if_missing = true;
+  options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
+  options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
+  DB* db;
+  // Start from a clean slate: drop any DB left by a previous run.
+  DestroyDB(default_db_path, options);
+
+  Status s = DB::Open(options, default_db_path, &db);
+
+  if (!s.ok()) {
+    fprintf(stderr, "Could not open DB due to %s\n", s.ToString().c_str());
+    exit(1);
+  }
+
+  DataPumpThread dataPump;
+  dataPump.no_records = FLAGS_num_inserts;
+  dataPump.db = db;
+  env->StartThread(DataPumpThreadBody, &dataPump);
+
+  ReplicationThread replThread;
+  replThread.db = db;
+  replThread.no_read = 0;
+  replThread.stop.Release_Store(env); // store something to make it non-null.
+
+  env->StartThread(ReplicationThreadBody, &replThread);
+  // Busy-wait until the reader catches up, then signal it to stop.
+  while(replThread.no_read < FLAGS_num_inserts);
+  replThread.stop.Release_Store(nullptr);
+  if (replThread.no_read < dataPump.no_records) {
+    // no. read should be => than inserted.
+    // NOTE(review): %ld with size_t/uint64_t arguments assumes an LP64
+    // platform -- consider explicit casts.
+    fprintf(stderr, "No. of Record's written and read not same\nRead : %ld"
+            " Written : %ld\n", replThread.no_read, dataPump.no_records);
+    exit(1);
+  }
+  fprintf(stderr, "Successful!\n");
+  exit(0);
+}
diff --git a/tools/db_stress.cc b/tools/db_stress.cc
new file mode 100644 (file)
index 0000000..966f007
--- /dev/null
@@ -0,0 +1,1539 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The test uses an array to compare against values written to the database.
+// Keys written to the array are in 1:1 correspondence to the actual values in
+// the database according to the formula in the function GenerateValue.
+
+// Space is reserved in the array from 0 to FLAGS_max_key and values are
+// randomly written/deleted/read from those positions. During verification we
+// compare all the positions in the array. To shorten/elongate the running
+// time, you could change the settings: FLAGS_max_key, FLAGS_ops_per_thread,
+// (sometimes also FLAGS_threads).
+//
+// NOTE that if FLAGS_test_batches_snapshots is set, the test will have
+// different behavior. See comment of the flag for details.
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <gflags/gflags.h>
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "db/db_statistics.h"
+#include "rocksdb/cache.h"
+#include "utilities/utility_db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/statistics.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/histogram.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/testutil.h"
+#include "util/logging.h"
+#include "utilities/ttl/db_ttl.h"
+#include "hdfs/env_hdfs.h"
+#include "utilities/merge_operators.h"
+
+static const long KB = 1024;  // bytes in one kilobyte
+
+
// gflags validator: reject flag values that do not fit in a uint32_t.
static bool ValidateUint32Range(const char* flagname, uint64_t value) {
  const uint64_t kMax32 = std::numeric_limits<uint32_t>::max();
  if (value <= kMax32) {
    return true;
  }
  fprintf(stderr,
          "Invalid value for --%s: %lu, overflow\n",
          flagname,
          (unsigned long)value);
  return false;
}
+// ---- gflags definitions: workload size, memtable/compaction and cache
+// tuning for db_stress.  The FLAGS_*_dummy statics exist only to invoke
+// RegisterFlagValidator during static initialization. ----
+DEFINE_uint64(seed, 2341234, "Seed for PRNG");
+static const bool FLAGS_seed_dummy =
+  google::RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range);
+
+DEFINE_int64(max_key, 1 * KB * KB * KB,
+             "Max number of key/values to place in database");
+
+DEFINE_bool(test_batches_snapshots, false,
+            "If set, the test uses MultiGet(), MultiPut() and MultiDelete()"
+            " which read/write/delete multiple keys in a batch. In this mode,"
+            " we do not verify db content by comparing the content with the "
+            "pre-allocated array. Instead, we do partial verification inside"
+            " MultiGet() by checking various values in a batch. Benefit of"
+            " this mode:\n"
+            "\t(a) No need to acquire mutexes during writes (less cache "
+            "flushes in multi-core leading to speed up)\n"
+            "\t(b) No long validation at the end (more speed up)\n"
+            "\t(c) Test snapshot and atomicity of batch writes");
+
+DEFINE_int32(threads, 32, "Number of concurrent threads to run.");
+
+DEFINE_int32(ttl, -1,
+             "Opens the db with this ttl value if this is not -1. "
+             "Carefully specify a large value such that verifications on "
+             "deleted values don't fail");
+
+DEFINE_int32(value_size_mult, 8,
+             "Size of value will be this number times rand_int(1,3) bytes");
+
+DEFINE_bool(verify_before_write, false, "Verify before write");
+
+DEFINE_bool(histogram, false, "Print histogram of operation timings");
+
+DEFINE_bool(destroy_db_initially, true,
+            "Destroys the database dir before start if this is true");
+
+DEFINE_bool (verbose, false, "Verbose");
+
+DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size,
+             "Number of bytes to buffer in memtable before compacting");
+
+DEFINE_int32(max_write_buffer_number,
+             rocksdb::Options().max_write_buffer_number,
+             "The number of in-memory memtables. "
+             "Each memtable is of size FLAGS_write_buffer_size.");
+
+DEFINE_int32(min_write_buffer_number_to_merge,
+             rocksdb::Options().min_write_buffer_number_to_merge,
+             "The minimum number of write buffers that will be merged together "
+             "before writing to storage. This is cheap because it is an "
+             "in-memory merge. If this feature is not enabled, then all these "
+             "write buffers are flushed to L0 as separate files and this "
+             "increases read amplification because a get request has to check "
+             "in all of these files. Also, an in-memory merge may result in "
+             "writing less data to storage if there are duplicate records in"
+             " each of these individual write buffers.");
+
+DEFINE_int32(open_files, rocksdb::Options().max_open_files,
+             "Maximum number of files to keep open at the same time "
+             "(use default if == 0)");
+
+DEFINE_int64(compressed_cache_size, -1,
+             "Number of bytes to use as a cache of compressed data."
+             " Negative means use default settings.");
+
+DEFINE_int32(compaction_style, rocksdb::Options().compaction_style, "");
+
+DEFINE_int32(level0_file_num_compaction_trigger,
+             rocksdb::Options().level0_file_num_compaction_trigger,
+             "Level0 compaction start trigger");
+
+DEFINE_int32(level0_slowdown_writes_trigger,
+             rocksdb::Options().level0_slowdown_writes_trigger,
+             "Number of files in level-0 that will slow down writes");
+
+DEFINE_int32(level0_stop_writes_trigger,
+             rocksdb::Options().level0_stop_writes_trigger,
+             "Number of files in level-0 that will trigger put stop.");
+
+DEFINE_int32(block_size, rocksdb::Options().block_size,
+             "Number of bytes in a block.");
+
+DEFINE_int32(max_background_compactions,
+             rocksdb::Options().max_background_compactions,
+             "The maximum number of concurrent background compactions "
+             "that can occur in parallel.");
+
+DEFINE_int32(universal_size_ratio, 0, "The ratio of file sizes that trigger"
+             " compaction in universal style");
+
+DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files to "
+             "compact in universal style compaction");
+
+DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
+             " in universal style compaction");
+
+DEFINE_int32(universal_max_size_amplification_percent, 0,
+             "The max size amplification for universal style compaction");
+
+DEFINE_int64(cache_size, 2 * KB * KB * KB,
+             "Number of bytes to use as a cache of uncompressed data.");
+
// gflags validator: a valid value is non-negative.
static bool ValidateInt32Positive(const char* flagname, int32_t value) {
  if (value >= 0) {
    return true;
  }
  fprintf(stderr, "Invalid value for --%s: %d, must be >=0\n",
          flagname, value);
  return false;
}
+// ---- gflags definitions: reopen/bloom/db-path, durability and
+// kill-testing flags. ----
+DEFINE_int32(reopen, 10, "Number of times database reopens");
+static const bool FLAGS_reopen_dummy =
+  google::RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive);
+
+DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. "
+             "Negative means use default settings.");
+
+DEFINE_string(db, "", "Use the db with the following name.");
+
+DEFINE_bool(verify_checksum, false,
+            "Verify checksum for every block read from storage");
+
+DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads,
+            "Allow reads to occur via mmap-ing files");
+
+// Database statistics
+static std::shared_ptr<rocksdb::Statistics> dbstats;
+DEFINE_bool(statistics, false, "Create database statistics");
+
+DEFINE_bool(sync, false, "Sync all writes to disk");
+
+DEFINE_bool(disable_data_sync, false,
+            "If true, do not wait until data is synced to disk.");
+
+DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
+
+DEFINE_int32(kill_random_test, 0,
+             "If non-zero, kill at various points in source code with "
+             "probability 1/this");
+static const bool FLAGS_kill_random_test_dummy =
+  google::RegisterFlagValidator(&FLAGS_kill_random_test,
+                                &ValidateInt32Positive);
+// Extern declaration only; defined elsewhere in rocksdb.
+extern int rocksdb_kill_odds;
+
+DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
+
+DEFINE_int32(target_file_size_base, 64 * KB,
+             "Target level-1 file size for compaction");
+
+// NOTE(review): "targe" below is a typo for "target" in the help text.
+DEFINE_int32(target_file_size_multiplier, 1,
+             "A multiplier to compute targe level-N file size (N >= 2)");
+
+DEFINE_uint64(max_bytes_for_level_base, 256 * KB, "Max bytes for level-1");
+
+DEFINE_int32(max_bytes_for_level_multiplier, 2,
+             "A multiplier to compute max bytes for level-N (N >= 2)");
+
// gflags validator: value must be a percentage in [0, 100].
static bool ValidateInt32Percent(const char* flagname, int32_t value) {
  const bool in_range = (0 <= value) && (value <= 100);
  if (!in_range) {
    fprintf(stderr, "Invalid value for --%s: %d, 0<= pct <=100 \n",
            flagname, value);
  }
  return in_range;
}
+// ---- gflags definitions: operation-mix percentages, each validated to
+// lie in [0, 100] by ValidateInt32Percent. ----
+DEFINE_int32(readpercent, 10,
+             "Ratio of reads to total workload (expressed as a percentage)");
+static const bool FLAGS_readpercent_dummy =
+  google::RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent);
+
+DEFINE_int32(prefixpercent, 20,
+             "Ratio of prefix iterators to total workload (expressed as a"
+             " percentage)");
+static const bool FLAGS_prefixpercent_dummy =
+  google::RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent);
+
+// NOTE(review): help text says "deletes" but this flag is the write ratio.
+DEFINE_int32(writepercent, 45,
+             " Ratio of deletes to total workload (expressed as a percentage)");
+static const bool FLAGS_writepercent_dummy =
+  google::RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent);
+
+DEFINE_int32(delpercent, 15,
+             "Ratio of deletes to total workload (expressed as a percentage)");
+static const bool FLAGS_delpercent_dummy =
+  google::RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent);
+
+DEFINE_int32(iterpercent, 10, "Ratio of iterations to total workload"
+             " (expressed as a percentage)");
+static const bool FLAGS_iterpercent_dummy =
+  google::RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent);
+
+DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run");
+static const bool FLAGS_num_iterations_dummy =
+  google::RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range);
+
+// NOTE(review): "compation" is a typo for "compaction" in the help text.
+DEFINE_bool(disable_seek_compaction, false,
+            "Option to disable compation triggered by read.");
+
+DEFINE_uint64(delete_obsolete_files_period_micros, 0,
+              "Option to delete obsolete files periodically"
+              "0 means that obsolete files are "
+              " deleted after every compaction run.");
+
+// Map a compression name (case-insensitive) to its CompressionType.
+// Unrecognized names fall back to snappy with a message on stdout.
+enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
+  assert(ctype);
+  if (!strcasecmp(ctype, "none")) {
+    return rocksdb::kNoCompression;
+  }
+  if (!strcasecmp(ctype, "snappy")) {
+    return rocksdb::kSnappyCompression;
+  }
+  if (!strcasecmp(ctype, "zlib")) {
+    return rocksdb::kZlibCompression;
+  }
+  if (!strcasecmp(ctype, "bzip2")) {
+    return rocksdb::kBZip2Compression;
+  }
+  fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
+  return rocksdb::kSnappyCompression;  // default value
+}
+// ---- gflags definitions: compression, environment and locking flags. ----
+DEFINE_string(compression_type, "snappy",
+              "Algorithm to use to compress the database");
+// Presumably set from --compression_type during startup -- confirm in main().
+static enum rocksdb::CompressionType FLAGS_compression_type_e =
+    rocksdb::kSnappyCompression;
+
+DEFINE_string(hdfs, "", "Name of hdfs environment")
+// posix or hdfs environment
+static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();
+
+DEFINE_uint64(ops_per_thread, 600000, "Number of operations per thread.");
+static const bool FLAGS_ops_per_thread_dummy =
+  google::RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range);
+
+DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock");
+static const bool FLAGS_log2_keys_per_lock_dummy =
+  google::RegisterFlagValidator(&FLAGS_log2_keys_per_lock,
+                                &ValidateUint32Range);
+
+DEFINE_int32(purge_redundant_percent, 50,
+             "Percentage of times we want to purge redundant keys in memory "
+             "before flushing");
+static const bool FLAGS_purge_redundant_percent_dummy =
+  google::RegisterFlagValidator(&FLAGS_purge_redundant_percent,
+                                &ValidateInt32Percent);
+
+DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop"
+            " the delete if key not present");
+
// Memtable representations selectable via --memtablerep.
enum RepFactory {
  kSkipList,
  kHashSkipList,
  kVectorRep
};
// Map a memtable-rep name (case-insensitive) to its RepFactory value.
// Unrecognized names fall back to kSkipList with a message on stdout.
enum RepFactory StringToRepFactory(const char* ctype) {
  assert(ctype);
  if (!strcasecmp(ctype, "skip_list")) {
    return kSkipList;
  }
  if (!strcasecmp(ctype, "prefix_hash")) {
    return kHashSkipList;
  }
  if (!strcasecmp(ctype, "vector")) {
    return kVectorRep;
  }
  fprintf(stdout, "Cannot parse memreptable %s\n", ctype);
  return kSkipList;
}
+// Selected memtable rep; presumably parsed from --memtablerep at
+// startup -- confirm in main().
+static enum RepFactory FLAGS_rep_factory;
+DEFINE_string(memtablerep, "skip_list", "");
+
// gflags validator: prefix size must lie in [0, 2000000000).
static bool ValidatePrefixSize(const char* flagname, int32_t value) {
  const bool ok = (value >= 0) && (value < 2000000000);
  if (!ok) {
    fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n",
            flagname, value);
  }
  return ok;
}
+// ---- gflags definitions: prefix-size (validated) and merge flags. ----
+DEFINE_int32(prefix_size, 0, "Control the prefix size for HashSkipListRep");
+static const bool FLAGS_prefix_size_dummy =
+  google::RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
+
+DEFINE_bool(use_merge, false, "On true, replaces all writes with a Merge "
+            "that behaves like a Put");
+
+
+namespace rocksdb {
+
+// Convert a long into a fixed-width big-endian string key, so that the
+// lexicographic order of keys matches the numeric order of the values.
+static std::string Key(long val) {
+  std::string little_endian_key;
+  PutFixed64(&little_endian_key, val);
+  assert(little_endian_key.size() == sizeof(val));
+  // Reversing the little-endian bytes yields the big-endian encoding.
+  std::string big_endian_key(little_endian_key.rbegin(),
+                             little_endian_key.rend());
+  return big_endian_key;
+}
+
+class StressTest;
+namespace {
+
+// Per-thread operation counters and timing for the stress test.  Call
+// Start() before use, FinishedSingleOp() after each op, Stop() at the
+// end; Merge() folds several threads' stats together before Report()
+// prints a summary to stdout.
+class Stats {
+ private:
+  double start_;  // wall-clock start, microseconds
+  double finish_;  // wall-clock end, microseconds
+  double seconds_;  // elapsed seconds (summed across Merge)
+  long done_;  // total ops finished
+  long gets_;
+  long prefixes_;
+  long writes_;
+  long deletes_;
+  long iterator_size_sums_;
+  long founds_;
+  long iterations_;
+  long errors_;
+  int next_report_;  // next done_ count at which to print progress
+  size_t bytes_;  // bytes recorded via AddBytesForWrites
+  double last_op_finish_;
+  HistogramImpl hist_;  // per-op latency histogram (used with --histogram)
+
+ public:
+  // Members are initialized by Start(); call Start() before use.
+  Stats() { }
+
+  // Reset every counter and begin timing.
+  void Start() {
+    next_report_ = 100;
+    hist_.Clear();
+    done_ = 0;
+    gets_ = 0;
+    prefixes_ = 0;
+    writes_ = 0;
+    deletes_ = 0;
+    iterator_size_sums_ = 0;
+    founds_ = 0;
+    iterations_ = 0;
+    errors_ = 0;
+    bytes_ = 0;
+    seconds_ = 0;
+    start_ = FLAGS_env->NowMicros();
+    last_op_finish_ = start_;
+    finish_ = start_;
+  }
+
+  // Fold another thread's stats into this one; the merged time span
+  // covers the earliest start and the latest finish.
+  void Merge(const Stats& other) {
+    hist_.Merge(other.hist_);
+    done_ += other.done_;
+    gets_ += other.gets_;
+    prefixes_ += other.prefixes_;
+    writes_ += other.writes_;
+    deletes_ += other.deletes_;
+    iterator_size_sums_ += other.iterator_size_sums_;
+    founds_ += other.founds_;
+    iterations_ += other.iterations_;
+    errors_ += other.errors_;
+    bytes_ += other.bytes_;
+    seconds_ += other.seconds_;
+    if (other.start_ < start_) start_ = other.start_;
+    if (other.finish_ > finish_) finish_ = other.finish_;
+  }
+
+  // Stop timing and record the elapsed seconds.
+  void Stop() {
+    finish_ = FLAGS_env->NowMicros();
+    seconds_ = (finish_ - start_) * 1e-6;
+  }
+
+  // Record one completed operation: update the latency histogram (when
+  // --histogram is set) and print periodic progress to stdout.
+  void FinishedSingleOp() {
+    if (FLAGS_histogram) {
+      double now = FLAGS_env->NowMicros();
+      double micros = now - last_op_finish_;
+      hist_.Add(micros);
+      if (micros > 20000) {
+        fprintf(stdout, "long op: %.1f micros%30s\r", micros, "");
+      }
+      last_op_finish_ = now;
+    }
+
+    done_++;
+    if (done_ >= next_report_) {
+      // The progress-report interval grows with the op count.
+      if      (next_report_ < 1000)   next_report_ += 100;
+      else if (next_report_ < 5000)   next_report_ += 500;
+      else if (next_report_ < 10000)  next_report_ += 1000;
+      else if (next_report_ < 50000)  next_report_ += 5000;
+      else if (next_report_ < 100000) next_report_ += 10000;
+      else if (next_report_ < 500000) next_report_ += 50000;
+      else                            next_report_ += 100000;
+      fprintf(stdout, "... finished %ld ops%30s\r", done_, "");
+    }
+  }
+
+  void AddBytesForWrites(int nwrites, size_t nbytes) {
+    writes_ += nwrites;
+    bytes_ += nbytes;
+  }
+
+  void AddGets(int ngets, int nfounds) {
+    founds_ += nfounds;
+    gets_ += ngets;
+  }
+
+  void AddPrefixes(int nprefixes, int count) {
+    prefixes_ += nprefixes;
+    iterator_size_sums_ += count;
+  }
+
+  void AddIterations(int n) {
+    iterations_ += n;
+  }
+
+  void AddDeletes(int n) {
+    deletes_ += n;
+  }
+
+  void AddErrors(int n) {
+    errors_ += n;
+  }
+
+  // Print a human-readable summary of all counters to stdout.
+  void Report(const char* name) {
+    std::string extra;
+    if (bytes_ < 1 || done_ < 1) {
+      fprintf(stderr, "No writes or ops?\n");
+      return;
+    }
+
+    double elapsed = (finish_ - start_) * 1e-6;
+    double bytes_mb = bytes_ / 1048576.0;
+    double rate = bytes_mb / elapsed;
+    double throughput = (double)done_/elapsed;
+
+    fprintf(stdout, "%-12s: ", name);
+    fprintf(stdout, "%.3f micros/op %ld ops/sec\n",
+            seconds_ * 1e6 / done_, (long)throughput);
+    fprintf(stdout, "%-12s: Wrote %.2f MB (%.2f MB/sec) (%ld%% of %ld ops)\n",
+            "", bytes_mb, rate, (100*writes_)/done_, done_);
+    fprintf(stdout, "%-12s: Wrote %ld times\n", "", writes_);
+    fprintf(stdout, "%-12s: Deleted %ld times\n", "", deletes_);
+    fprintf(stdout, "%-12s: %ld read and %ld found the key\n", "",
+            gets_, founds_);
+    fprintf(stdout, "%-12s: Prefix scanned %ld times\n", "", prefixes_);
+    fprintf(stdout, "%-12s: Iterator size sum is %ld\n", "",
+            iterator_size_sums_);
+    fprintf(stdout, "%-12s: Iterated %ld times\n", "", iterations_);
+    fprintf(stdout, "%-12s: Got errors %ld times\n", "", errors_);
+
+    if (FLAGS_histogram) {
+      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
+    }
+    fflush(stdout);
+  }
+};
+
+// State shared by all concurrent executions of the same benchmark.
+// Holds the expected-value table (values_) mirroring the DB contents, a
+// striped array of per-key mutexes, and the condition-variable handshake
+// used to move all threads through the init -> operate -> verify phases.
+class SharedState {
+ public:
+  // Marker stored in values_[key] meaning "no value is expected in the DB".
+  static const uint32_t SENTINEL = 0xffffffff;
+
+  explicit SharedState(StressTest* stress_test) :
+      cv_(&mu_),
+      seed_(FLAGS_seed),
+      max_key_(FLAGS_max_key),
+      log2_keys_per_lock_(FLAGS_log2_keys_per_lock),
+      num_threads_(FLAGS_threads),
+      num_initialized_(0),
+      num_populated_(0),
+      vote_reopen_(0),
+      num_done_(0),
+      start_(false),
+      start_verify_(false),
+      stress_test_(stress_test) {
+    // Batched-snapshot mode does its own verification during gets, so the
+    // expected-value table and the key locks are not needed.
+    if (FLAGS_test_batches_snapshots) {
+      key_locks_ = nullptr;
+      values_ = nullptr;
+      fprintf(stdout, "No lock creation because test_batches_snapshots set\n");
+      return;
+    }
+    values_ = new uint32_t[max_key_];
+    for (long i = 0; i < max_key_; i++) {
+      values_[i] = SENTINEL;
+    }
+
+    // Striped locking: one mutex guards 2^log2_keys_per_lock_ adjacent keys;
+    // round up so the trailing partial stripe also gets a lock.
+    long num_locks = (max_key_ >> log2_keys_per_lock_);
+    if (max_key_ & ((1 << log2_keys_per_lock_) - 1)) {
+      num_locks ++;
+    }
+    fprintf(stdout, "Creating %ld locks\n", num_locks);
+    key_locks_ = new port::Mutex[num_locks];
+  }
+
+  ~SharedState() {
+    delete[] values_;
+    delete[] key_locks_;
+  }
+
+  port::Mutex* GetMutex() {
+    return &mu_;
+  }
+
+  port::CondVar* GetCondVar() {
+    return &cv_;
+  }
+
+  StressTest* GetStressTest() const {
+    return stress_test_;
+  }
+
+  long GetMaxKey() const {
+    return max_key_;
+  }
+
+  uint32_t GetNumThreads() const {
+    return num_threads_;
+  }
+
+  void IncInitialized() {
+    num_initialized_++;
+  }
+
+  void IncOperated() {
+    num_populated_++;
+  }
+
+  void IncDone() {
+    num_done_++;
+  }
+
+  // Counts reopen votes modulo num_threads_: when the counter wraps back to
+  // zero, every thread has voted (see AllVotedReopen).
+  void IncVotedReopen() {
+    vote_reopen_ = (vote_reopen_ + 1) % num_threads_;
+  }
+
+  bool AllInitialized() const {
+    return num_initialized_ >= num_threads_;
+  }
+
+  bool AllOperated() const {
+    return num_populated_ >= num_threads_;
+  }
+
+  bool AllDone() const {
+    return num_done_ >= num_threads_;
+  }
+
+  bool AllVotedReopen() {
+    return (vote_reopen_ == 0);
+  }
+
+  void SetStart() {
+    start_ = true;
+  }
+
+  void SetStartVerify() {
+    start_verify_ = true;
+  }
+
+  bool Started() const {
+    return start_;
+  }
+
+  bool VerifyStarted() const {
+    return start_verify_;
+  }
+
+  port::Mutex* GetMutexForKey(long key) {
+    return &key_locks_[key >> log2_keys_per_lock_];
+  }
+
+  void Put(long key, uint32_t value_base) {
+    values_[key] = value_base;
+  }
+
+  uint32_t Get(long key) const {
+    return values_[key];
+  }
+
+  // NOTE(review): declared const yet mutates values_[key]; this compiles only
+  // because values_ is a pointer member. Consider dropping the const.
+  void Delete(long key) const {
+    values_[key] = SENTINEL;
+  }
+
+  uint32_t GetSeed() const {
+    return seed_;
+  }
+
+ private:
+  port::Mutex mu_;
+  port::CondVar cv_;
+  const uint32_t seed_;
+  const long max_key_;
+  const uint32_t log2_keys_per_lock_;
+  const int num_threads_;
+  long num_initialized_;
+  long num_populated_;
+  long vote_reopen_;
+  long num_done_;
+  bool start_;
+  bool start_verify_;
+  StressTest* stress_test_;
+
+  uint32_t *values_;
+  port::Mutex *key_locks_;
+
+};
+
+// Per-thread state for concurrent executions of the same benchmark.
+struct ThreadState {
+  uint32_t tid; // 0..n-1
+  Random rand;  // Has different seeds for different threads
+  SharedState* shared;
+  Stats stats;
+
+  // Seed offsets by thread index so each thread draws a distinct but
+  // reproducible random stream for the same --seed.
+  ThreadState(uint32_t index, SharedState *shared)
+      : tid(index),
+        rand(1000 + index + shared->GetSeed()),
+        shared(shared) {
+  }
+};
+
+}  // namespace
+
+class StressTest {
+ public:
+  // Sets up caches, bloom filter, and prefix extractor from flags; optionally
+  // wipes any pre-existing test database (plus leftover heap- dump files).
+  StressTest()
+      : cache_(NewLRUCache(FLAGS_cache_size)),
+        compressed_cache_(FLAGS_compressed_cache_size >= 0 ?
+                          NewLRUCache(FLAGS_compressed_cache_size) :
+                          nullptr),
+        filter_policy_(FLAGS_bloom_bits >= 0
+                       ? NewBloomFilterPolicy(FLAGS_bloom_bits)
+                       : nullptr),
+        // NOTE(review): both arms of this conditional pick a fixed-size
+        // prefix; batched mode uses the full sizeof(long) key, plain mode
+        // drops the last byte (matching the prefix-scan logic in OperateDb).
+        prefix_extractor_(NewFixedPrefixTransform(
+                          FLAGS_test_batches_snapshots ?
+                          sizeof(long) : sizeof(long)-1)),
+        db_(nullptr),
+        num_times_reopened_(0) {
+    if (FLAGS_destroy_db_initially) {
+      std::vector<std::string> files;
+      FLAGS_env->GetChildren(FLAGS_db, &files);
+      for (unsigned int i = 0; i < files.size(); i++) {
+        if (Slice(files[i]).starts_with("heap-")) {
+          FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
+        }
+      }
+      DestroyDB(FLAGS_db, Options());
+    }
+  }
+
+  // The caches are shared_ptr members and release themselves; only the raw
+  // owning pointers are deleted here.
+  ~StressTest() {
+    delete db_;
+    delete filter_policy_;
+    delete prefix_extractor_;
+  }
+
+  // Drives the whole stress run: opens the DB, spawns the worker threads,
+  // then coordinates the init -> operate -> verify -> done phases via the
+  // shared condition variable before reporting merged statistics.
+  void Run() {
+    PrintEnv();
+    Open();
+    SharedState shared(this);
+    uint32_t n = shared.GetNumThreads();
+
+    std::vector<ThreadState*> threads(n);
+    for (uint32_t i = 0; i < n; i++) {
+      threads[i] = new ThreadState(i, &shared);
+      FLAGS_env->StartThread(ThreadBody, threads[i]);
+    }
+    // Each thread goes through the following states:
+    // initializing -> wait for others to init -> read/populate/depopulate
+    // wait for others to operate -> verify -> done
+
+    {
+      MutexLock l(shared.GetMutex());
+      while (!shared.AllInitialized()) {
+        shared.GetCondVar()->Wait();
+      }
+
+      double now = FLAGS_env->NowMicros();
+      fprintf(stdout, "%s Starting database operations\n",
+              FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
+
+      shared.SetStart();
+      shared.GetCondVar()->SignalAll();
+      while (!shared.AllOperated()) {
+        shared.GetCondVar()->Wait();
+      }
+
+      now = FLAGS_env->NowMicros();
+      if (FLAGS_test_batches_snapshots) {
+        fprintf(stdout, "%s Limited verification already done during gets\n",
+                FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
+      } else {
+        fprintf(stdout, "%s Starting verification\n",
+                FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
+      }
+
+      shared.SetStartVerify();
+      shared.GetCondVar()->SignalAll();
+      while (!shared.AllDone()) {
+        shared.GetCondVar()->Wait();
+      }
+    }
+
+    // Fold all per-thread stats into thread 0 and report the aggregate.
+    for (unsigned int i = 1; i < n; i++) {
+      threads[0]->stats.Merge(threads[i]->stats);
+    }
+    threads[0]->stats.Report("Stress Test");
+
+    for (unsigned int i = 0; i < n; i++) {
+      delete threads[i];
+      threads[i] = nullptr;
+    }
+    double now = FLAGS_env->NowMicros();
+    if (!FLAGS_test_batches_snapshots) {
+      fprintf(stdout, "%s Verification successful\n",
+              FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
+    }
+    PrintStatistics();
+  }
+
+ private:
+
+  // Worker-thread entry point: announces readiness, waits for the start
+  // signal, runs the operation phase, then the verify phase, signalling the
+  // coordinator (Run) whenever this thread is the last to finish a phase.
+  static void ThreadBody(void* v) {
+    ThreadState* thread = reinterpret_cast<ThreadState*>(v);
+    SharedState* shared = thread->shared;
+
+    {
+      MutexLock l(shared->GetMutex());
+      shared->IncInitialized();
+      if (shared->AllInitialized()) {
+        shared->GetCondVar()->SignalAll();
+      }
+      while (!shared->Started()) {
+        shared->GetCondVar()->Wait();
+      }
+    }
+    thread->shared->GetStressTest()->OperateDb(thread);
+
+    {
+      MutexLock l(shared->GetMutex());
+      shared->IncOperated();
+      if (shared->AllOperated()) {
+        shared->GetCondVar()->SignalAll();
+      }
+      while (!shared->VerifyStarted()) {
+        shared->GetCondVar()->Wait();
+      }
+    }
+
+    // Batched-snapshot mode already verified during its gets.
+    if (!FLAGS_test_batches_snapshots) {
+      thread->shared->GetStressTest()->VerifyDb(thread);
+    }
+
+    {
+      MutexLock l(shared->GetMutex());
+      shared->IncDone();
+      if (shared->AllDone()) {
+        shared->GetCondVar()->SignalAll();
+      }
+    }
+
+  }
+
+  // Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ...
+  // ("9"+K, "9"+V) in DB atomically i.e in a single batch.
+  // Also refer MultiGet.
+  Status MultiPut(ThreadState* thread,
+                  const WriteOptions& writeoptions,
+                  const Slice& key, const Slice& value, size_t sz) {
+    // Digits listed in descending order so batch entries are written in a
+    // non-sorted sequence; the batch still applies atomically.
+    std::string keys[10] = {"9", "8", "7", "6", "5",
+                            "4", "3", "2", "1", "0"};
+    std::string values[10] = {"9", "8", "7", "6", "5",
+                              "4", "3", "2", "1", "0"};
+    Slice value_slices[10];
+    WriteBatch batch;
+    Status s;
+    for (int i = 0; i < 10; i++) {
+      keys[i] += key.ToString();
+      values[i] += value.ToString();
+      value_slices[i] = values[i];
+      if (FLAGS_use_merge) {
+        batch.Merge(keys[i], value_slices[i]);
+      } else {
+        batch.Put(keys[i], value_slices[i]);
+      }
+    }
+
+    s = db_->Write(writeoptions, &batch);
+    if (!s.ok()) {
+      fprintf(stderr, "multiput error: %s\n", s.ToString().c_str());
+      thread->stats.AddErrors(1);
+    } else {
+      // we did 10 writes each of size sz + 1
+      thread->stats.AddBytesForWrites(10, (sz + 1) * 10);
+    }
+
+    return s;
+  }
+
+  // Given a key K, this deletes ("0"+K), ("1"+K),... ("9"+K)
+  // in DB atomically i.e in a single batch. Also refer MultiGet.
+  Status MultiDelete(ThreadState* thread,
+                     const WriteOptions& writeoptions,
+                     const Slice& key) {
+    // Deliberately shuffled digit order; batch application is still atomic.
+    std::string keys[10] = {"9", "7", "5", "3", "1",
+                            "8", "6", "4", "2", "0"};
+
+    WriteBatch batch;
+    Status s;
+    for (int i = 0; i < 10; i++) {
+      keys[i] += key.ToString();
+      batch.Delete(keys[i]);
+    }
+
+    s = db_->Write(writeoptions, &batch);
+    if (!s.ok()) {
+      fprintf(stderr, "multidelete error: %s\n", s.ToString().c_str());
+      thread->stats.AddErrors(1);
+    } else {
+      thread->stats.AddDeletes(10);
+    }
+
+    return s;
+  }
+
+  // Given a key K, this gets values for "0"+K, "1"+K,..."9"+K
+  // in the same snapshot, and verifies that all the values are of the form
+  // "0"+V, "1"+V,..."9"+V.
+  // ASSUMES that MultiPut was used to put (K, V) into the DB.
+  Status MultiGet(ThreadState* thread,
+                  const ReadOptions& readoptions,
+                  const Slice& key, std::string* value) {
+    std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"};
+    Slice key_slices[10];
+    std::string values[10];
+    // Snapshot pins all 10 gets to one consistent view of the DB.
+    ReadOptions readoptionscopy = readoptions;
+    readoptionscopy.snapshot = db_->GetSnapshot();
+    Status s;
+    for (int i = 0; i < 10; i++) {
+      keys[i] += key.ToString();
+      key_slices[i] = keys[i];
+      s = db_->Get(readoptionscopy, key_slices[i], value);
+      if (!s.ok() && !s.IsNotFound()) {
+        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+        values[i] = "";
+        thread->stats.AddErrors(1);
+        // we continue after error rather than exiting so that we can
+        // find more errors if any
+      } else if (s.IsNotFound()) {
+        values[i] = "";
+        thread->stats.AddGets(1, 0);
+      } else {
+        values[i] = *value;
+
+        char expected_prefix = (keys[i])[0];
+        char actual_prefix = (values[i])[0];
+        if (actual_prefix != expected_prefix) {
+          fprintf(stderr, "error expected prefix = %c actual = %c\n",
+                  expected_prefix, actual_prefix);
+        }
+        (values[i])[0] = ' '; // blank out the differing character
+        thread->stats.AddGets(1, 1);
+      }
+    }
+    db_->ReleaseSnapshot(readoptionscopy.snapshot);
+
+    // Now that we retrieved all values, check that they all match
+    for (int i = 1; i < 10; i++) {
+      if (values[i] != values[0]) {
+        fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n",
+                key.ToString().c_str(), values[0].c_str(),
+                values[i].c_str());
+      // we continue after error rather than exiting so that we can
+      // find more errors if any
+      }
+    }
+
+    return s;
+  }
+
+  // Given a prefix P, this does prefix scans for "0"+P, "1"+P,..."9"+P
+  // in the same snapshot.  Each of these 10 scans returns a series of
+  // values; each series should be the same length, and it is verified
+  // for each index i that all the i'th values are of the form "0"+V,
+  // "1"+V,..."9"+V.
+  // ASSUMES that MultiPut was used to put (K, V)
+  Status MultiPrefixScan(ThreadState* thread,
+                         const ReadOptions& readoptions,
+                         const Slice& prefix) {
+    std::string prefixes[10] = {"0", "1", "2", "3", "4",
+                                "5", "6", "7", "8", "9"};
+    Slice prefix_slices[10];
+    ReadOptions readoptionscopy[10];
+    const Snapshot* snapshot = db_->GetSnapshot();
+    Iterator* iters[10];
+    // NOTE(review): s is initialized to OK and never updated from the
+    // iterators' status() below, so the AddErrors branch at the end is
+    // currently unreachable.
+    Status s = Status::OK();
+    for (int i = 0; i < 10; i++) {
+      prefixes[i] += prefix.ToString();
+      prefix_slices[i] = prefixes[i];
+      readoptionscopy[i] = readoptions;
+      readoptionscopy[i].prefix = &prefix_slices[i];
+      readoptionscopy[i].snapshot = snapshot;
+      iters[i] = db_->NewIterator(readoptionscopy[i]);
+      iters[i]->SeekToFirst();
+    }
+
+    int count = 0;
+    while (iters[0]->Valid()) {
+      count++;
+      std::string values[10];
+      // get list of all values for this iteration
+      for (int i = 0; i < 10; i++) {
+        // no iterator should finish before the first one
+        assert(iters[i]->Valid());
+        values[i] = iters[i]->value().ToString();
+
+        char expected_first = (prefixes[i])[0];
+        char actual_first = (values[i])[0];
+
+        if (actual_first != expected_first) {
+          fprintf(stderr, "error expected first = %c actual = %c\n",
+                  expected_first, actual_first);
+        }
+        (values[i])[0] = ' '; // blank out the differing character
+      }
+      // make sure all values are equivalent
+      for (int i = 0; i < 10; i++) {
+        if (values[i] != values[0]) {
+          fprintf(stderr, "error : inconsistent values for prefix %s: %s, %s\n",
+                  prefix.ToString().c_str(), values[0].c_str(),
+                  values[i].c_str());
+          // we continue after error rather than exiting so that we can
+          // find more errors if any
+        }
+        iters[i]->Next();
+      }
+    }
+
+    // cleanup iterators and snapshot
+    for (int i = 0; i < 10; i++) {
+      // if the first iterator finished, they should have all finished
+      assert(!iters[i]->Valid());
+      assert(iters[i]->status().ok());
+      delete iters[i];
+    }
+    db_->ReleaseSnapshot(snapshot);
+
+    if (s.ok()) {
+      thread->stats.AddPrefixes(1, count);
+    } else {
+      thread->stats.AddErrors(1);
+    }
+
+    return s;
+  }
+
+  // Given a key K, this creates an iterator which scans to K and then
+  // does a random sequence of Next/Prev operations.
+  Status MultiIterate(ThreadState* thread,
+                      const ReadOptions& readoptions,
+                      const Slice& key) {
+    // NOTE(review): s is never assigned from iter->status(), so the error
+    // branch below is currently unreachable.
+    Status s;
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ReadOptions readoptionscopy = readoptions;
+    readoptionscopy.snapshot = snapshot;
+    unique_ptr<Iterator> iter(db_->NewIterator(readoptionscopy));
+
+    iter->Seek(key);
+    for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); i++) {
+      if (thread->rand.OneIn(2)) {
+        iter->Next();
+      } else {
+        iter->Prev();
+      }
+    }
+
+    if (s.ok()) {
+      thread->stats.AddIterations(1);
+    } else {
+      thread->stats.AddErrors(1);
+    }
+
+    db_->ReleaseSnapshot(snapshot);
+
+    return s;
+  }
+
+  // Main per-thread operation loop. Each iteration picks a random key and an
+  // operation class by drawing prob_op in [0,100) against cumulative bands:
+  // [0, read) read, [read, prefixBound) prefix scan, [prefixBound, writeBound)
+  // write, [writeBound, delBound) delete, else iterate. Periodically all
+  // threads vote to reopen the database (crash-recovery simulation).
+  void OperateDb(ThreadState* thread) {
+    ReadOptions read_opts(FLAGS_verify_checksum, true);
+    WriteOptions write_opts;
+    char value[100];
+    long max_key = thread->shared->GetMaxKey();
+    std::string from_db;
+    if (FLAGS_sync) {
+      write_opts.sync = true;
+    }
+    write_opts.disableWAL = FLAGS_disable_wal;
+    const int prefixBound = (int)FLAGS_readpercent + (int)FLAGS_prefixpercent;
+    const int writeBound = prefixBound + (int)FLAGS_writepercent;
+    const int delBound = writeBound + (int)FLAGS_delpercent;
+
+    thread->stats.Start();
+    for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
+      // Every ops_per_thread/(reopen+1) ops, rendezvous: the last voter
+      // reopens the DB and wakes everyone else.
+      if(i != 0 && (i % (FLAGS_ops_per_thread / (FLAGS_reopen + 1))) == 0) {
+        {
+          thread->stats.FinishedSingleOp();
+          MutexLock l(thread->shared->GetMutex());
+          thread->shared->IncVotedReopen();
+          if (thread->shared->AllVotedReopen()) {
+            thread->shared->GetStressTest()->Reopen();
+            thread->shared->GetCondVar()->SignalAll();
+          }
+          else {
+            thread->shared->GetCondVar()->Wait();
+          }
+          // Commenting this out as we don't want to reset stats on each open.
+          // thread->stats.Start();
+        }
+      }
+
+      long rand_key = thread->rand.Next() % max_key;
+      std::string keystr = Key(rand_key);
+      Slice key = keystr;
+      int prob_op = thread->rand.Uniform(100);
+
+      if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) {
+        // OPERATION read
+        if (!FLAGS_test_batches_snapshots) {
+          Status s = db_->Get(read_opts, key, &from_db);
+          if (s.ok()) {
+            // found case
+            thread->stats.AddGets(1, 1);
+          } else if (s.IsNotFound()) {
+            // not found case
+            thread->stats.AddGets(1, 0);
+          } else {
+            // errors case
+            thread->stats.AddErrors(1);
+          }
+        } else {
+          MultiGet(thread, read_opts, key, &from_db);
+        }
+      } else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) {
+        // OPERATION prefix scan
+        // keys are longs (e.g., 8 bytes), so we let prefixes be
+        // everything except the last byte.  So there will be 2^8=256
+        // keys per prefix.
+        Slice prefix = Slice(key.data(), key.size() - 1);
+        if (!FLAGS_test_batches_snapshots) {
+          read_opts.prefix = &prefix;
+          Iterator* iter = db_->NewIterator(read_opts);
+          int count = 0;
+          for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+            assert(iter->key().starts_with(prefix));
+            count++;
+          }
+          assert(count <= 256);
+          if (iter->status().ok()) {
+            thread->stats.AddPrefixes(1, count);
+          } else {
+            thread->stats.AddErrors(1);
+          }
+          delete iter;
+        } else {
+          MultiPrefixScan(thread, read_opts, prefix);
+        }
+        read_opts.prefix = nullptr;
+      } else if (prefixBound <= prob_op && prob_op < writeBound) {
+        // OPERATION write
+        uint32_t value_base = thread->rand.Next();
+        size_t sz = GenerateValue(value_base, value, sizeof(value));
+        Slice v(value, sz);
+        if (!FLAGS_test_batches_snapshots) {
+          // Per-key lock keeps the shared expected-value table and the DB
+          // write atomic with respect to concurrent writers of this key.
+          MutexLock l(thread->shared->GetMutexForKey(rand_key));
+          if (FLAGS_verify_before_write) {
+            std::string keystr2 = Key(rand_key);
+            Slice k = keystr2;
+            Status s = db_->Get(read_opts, k, &from_db);
+            VerifyValue(rand_key,
+                        read_opts,
+                        *(thread->shared),
+                        from_db,
+                        s,
+                        true);
+          }
+          thread->shared->Put(rand_key, value_base);
+          if (FLAGS_use_merge) {
+            db_->Merge(write_opts, key, v);
+          } else {
+            db_->Put(write_opts, key, v);
+          }
+          thread->stats.AddBytesForWrites(1, sz);
+        } else {
+          MultiPut(thread, write_opts, key, v, sz);
+        }
+        PrintKeyValue(rand_key, value, sz);
+      } else if (writeBound <= prob_op && prob_op < delBound) {
+        // OPERATION delete
+        if (!FLAGS_test_batches_snapshots) {
+          MutexLock l(thread->shared->GetMutexForKey(rand_key));
+          thread->shared->Delete(rand_key);
+          db_->Delete(write_opts, key);
+          thread->stats.AddDeletes(1);
+        } else {
+          MultiDelete(thread, write_opts, key);
+        }
+      } else {
+        // OPERATION iterate
+        MultiIterate(thread, read_opts, key);
+      }
+      thread->stats.FinishedSingleOp();
+    }
+
+    thread->stats.Stop();
+  }
+
+  // Verifies this thread's slice of the key space against the shared
+  // expected-value table, randomly choosing between an iterator walk and
+  // point Gets. The last thread absorbs the remainder of the key range.
+  void VerifyDb(ThreadState* thread) const {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    const SharedState& shared = *(thread->shared);
+    // NOTE(review): function-local statics initialized from the first
+    // caller's shared state; initialization is racy pre-C++11 and these
+    // need not be static at all.
+    static const long max_key = shared.GetMaxKey();
+    static const long keys_per_thread = max_key / shared.GetNumThreads();
+    long start = keys_per_thread * thread->tid;
+    long end = start + keys_per_thread;
+    if (thread->tid == shared.GetNumThreads() - 1) {
+      end = max_key;
+    }
+    if (!thread->rand.OneIn(2)) {
+      // Use iterator to verify this range
+      unique_ptr<Iterator> iter(db_->NewIterator(options));
+      iter->Seek(Key(start));
+      for (long i = start; i < end; i++) {
+        std::string from_db;
+        std::string keystr = Key(i);
+        Slice k = keystr;
+        Status s = iter->status();
+        if (iter->Valid()) {
+          if (iter->key().compare(k) > 0) {
+            s = Status::NotFound(Slice());
+          } else if (iter->key().compare(k) == 0) {
+            from_db = iter->value().ToString();
+            iter->Next();
+          } else if (iter->key().compare(k) < 0) {
+            VerificationAbort("An out of range key was found", i);
+          }
+        } else {
+          // The iterator found no value for the key in question, so do not
+          // move to the next item in the iterator
+          s = Status::NotFound(Slice());
+        }
+        VerifyValue(i, options, shared, from_db, s, true);
+        if (from_db.length()) {
+          PrintKeyValue(i, from_db.data(), from_db.length());
+        }
+      }
+    }
+    else {
+      // Use Get to verify this range
+      for (long i = start; i < end; i++) {
+        std::string from_db;
+        std::string keystr = Key(i);
+        Slice k = keystr;
+        Status s = db_->Get(options, k, &from_db);
+        VerifyValue(i, options, shared, from_db, s, true);
+        if (from_db.length()) {
+          PrintKeyValue(i, from_db.data(), from_db.length());
+        }
+      }
+    }
+  }
+
+  // Reports a verification failure for the given key and terminates the
+  // whole process immediately.
+  void VerificationAbort(std::string msg, long key) const {
+    fprintf(stderr, "Verification failed for key %ld: %s\n",
+            key, msg.c_str());
+    exit(1);
+  }
+
+  // Checks the value read from the DB against the expected value derived
+  // from the shared table. When strict is false, a SENTINEL (no expectation)
+  // entry is skipped; when strict, any mismatch aborts the process.
+  void VerifyValue(long key,
+                   const ReadOptions &opts,
+                   const SharedState &shared,
+                   const std::string &value_from_db,
+                   Status s,
+                   bool strict=false) const {
+    // compare value_from_db with the value in the shared state
+    char value[100];
+    uint32_t value_base = shared.Get(key);
+    if (value_base == SharedState::SENTINEL && !strict) {
+      return;
+    }
+
+    if (s.ok()) {
+      if (value_base == SharedState::SENTINEL) {
+        VerificationAbort("Unexpected value found", key);
+      }
+      // Regenerate the expected value deterministically from its base.
+      size_t sz = GenerateValue(value_base, value, sizeof(value));
+      if (value_from_db.length() != sz) {
+        VerificationAbort("Length of value read is not equal", key);
+      }
+      if (memcmp(value_from_db.data(), value, sz) != 0) {
+        VerificationAbort("Contents of value read don't match", key);
+      }
+    } else {
+      if (value_base != SharedState::SENTINEL) {
+        VerificationAbort("Value not found", key);
+      }
+    }
+  }
+
+  // Dumps a key and its value bytes in hex when --verbose is set.
+  // NOTE(review): key is uint32_t but callers pass long keys; values above
+  // 2^32-1 would be truncated in the printout.
+  static void PrintKeyValue(uint32_t key, const char *value, size_t sz) {
+    if (!FLAGS_verbose) return;
+    fprintf(stdout, "%u ==> (%u) ", key, (unsigned int)sz);
+    for (size_t i=0; i<sz; i++) {
+      fprintf(stdout, "%X", value[i]);
+    }
+    fprintf(stdout, "\n");
+  }
+
+  // Deterministically fills v with a value derived from rand; length is
+  // 1..3 * FLAGS_value_size_mult, first 4 bytes hold rand itself so the
+  // value base can be recovered.
+  // NOTE(review): writes v[value_sz] = '\0' after the assert that only
+  // guarantees value_sz <= max_sz, so value_sz == max_sz would write one
+  // byte past the buffer. Callers currently pass 100-byte buffers with
+  // small value_size_mult, which masks this.
+  static size_t GenerateValue(uint32_t rand, char *v, size_t max_sz) {
+    size_t value_sz = ((rand % 3) + 1) * FLAGS_value_size_mult;
+    assert(value_sz <= max_sz && value_sz >= sizeof(uint32_t));
+    *((uint32_t*)v) = rand;
+    for (size_t i=sizeof(uint32_t); i < value_sz; i++) {
+      v[i] = (char)(rand ^ i);
+    }
+    v[value_sz] = '\0';
+    return value_sz; // the size of the value set.
+  }
+
+  // Prints the effective configuration (flags, compression, memtable rep)
+  // to stdout before the run starts.
+  void PrintEnv() const {
+    fprintf(stdout, "LevelDB version     : %d.%d\n",
+            kMajorVersion, kMinorVersion);
+    fprintf(stdout, "Number of threads   : %d\n", FLAGS_threads);
+    fprintf(stdout,
+            "Ops per thread      : %lu\n",
+            (unsigned long)FLAGS_ops_per_thread);
+    std::string ttl_state("unused");
+    if (FLAGS_ttl > 0) {
+      ttl_state = NumberToString(FLAGS_ttl);
+    }
+    fprintf(stdout, "Time to live(sec)   : %s\n", ttl_state.c_str());
+    fprintf(stdout, "Read percentage     : %d%%\n", FLAGS_readpercent);
+    fprintf(stdout, "Prefix percentage   : %d%%\n", FLAGS_prefixpercent);
+    fprintf(stdout, "Write percentage    : %d%%\n", FLAGS_writepercent);
+    fprintf(stdout, "Delete percentage   : %d%%\n", FLAGS_delpercent);
+    fprintf(stdout, "Iterate percentage  : %d%%\n", FLAGS_iterpercent);
+    fprintf(stdout, "Write-buffer-size   : %d\n", FLAGS_write_buffer_size);
+    fprintf(stdout,
+            "Iterations          : %lu\n",
+            (unsigned long)FLAGS_num_iterations);
+    fprintf(stdout,
+            "Max key             : %lu\n",
+            (unsigned long)FLAGS_max_key);
+    fprintf(stdout, "Ratio #ops/#keys    : %f\n",
+            (1.0 * FLAGS_ops_per_thread * FLAGS_threads)/FLAGS_max_key);
+    fprintf(stdout, "Num times DB reopens: %d\n", FLAGS_reopen);
+    fprintf(stdout, "Batches/snapshots   : %d\n",
+            FLAGS_test_batches_snapshots);
+    fprintf(stdout, "Purge redundant %%   : %d\n",
+            FLAGS_purge_redundant_percent);
+    fprintf(stdout, "Deletes use filter  : %d\n",
+            FLAGS_filter_deletes);
+    fprintf(stdout, "Num keys per lock   : %d\n",
+            1 << FLAGS_log2_keys_per_lock);
+
+    const char* compression = "";
+    switch (FLAGS_compression_type_e) {
+      case rocksdb::kNoCompression:
+        compression = "none";
+        break;
+      case rocksdb::kSnappyCompression:
+        compression = "snappy";
+        break;
+      case rocksdb::kZlibCompression:
+        compression = "zlib";
+        break;
+      case rocksdb::kBZip2Compression:
+        compression = "bzip2";
+        break;
+    }
+
+    fprintf(stdout, "Compression         : %s\n", compression);
+
+    const char* memtablerep = "";
+    switch (FLAGS_rep_factory) {
+      case kSkipList:
+        memtablerep = "skip_list";
+        break;
+      case kHashSkipList:
+        memtablerep = "prefix_hash";
+        break;
+      case kVectorRep:
+        memtablerep = "vector";
+        break;
+    }
+
+    fprintf(stdout, "Memtablerep         : %s\n", memtablerep);
+
+    fprintf(stdout, "------------------------------------------------\n");
+  }
+
+  // Builds Options from the command-line flags and opens the database
+  // (plain DB, or a TTL-wrapped DB when --ttl is set). Exits the process on
+  // open failure.
+  void Open() {
+    assert(db_ == nullptr);
+    Options options;
+    options.block_cache = cache_;
+    options.block_cache_compressed = compressed_cache_;
+    options.write_buffer_size = FLAGS_write_buffer_size;
+    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+    options.min_write_buffer_number_to_merge =
+      FLAGS_min_write_buffer_number_to_merge;
+    options.max_background_compactions = FLAGS_max_background_compactions;
+    options.compaction_style =
+      static_cast<rocksdb::CompactionStyle>(FLAGS_compaction_style);
+    options.block_size = FLAGS_block_size;
+    options.filter_policy = filter_policy_;
+    options.prefix_extractor = prefix_extractor_;
+    options.max_open_files = FLAGS_open_files;
+    options.statistics = dbstats;
+    options.env = FLAGS_env;
+    options.disableDataSync = FLAGS_disable_data_sync;
+    options.use_fsync = FLAGS_use_fsync;
+    options.allow_mmap_reads = FLAGS_mmap_read;
+    // Global fault-injection knob used by the kill-test harness.
+    rocksdb_kill_odds = FLAGS_kill_random_test;
+    options.target_file_size_base = FLAGS_target_file_size_base;
+    options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
+    options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
+    options.max_bytes_for_level_multiplier =
+        FLAGS_max_bytes_for_level_multiplier;
+    options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
+    options.level0_slowdown_writes_trigger =
+      FLAGS_level0_slowdown_writes_trigger;
+    options.level0_file_num_compaction_trigger =
+      FLAGS_level0_file_num_compaction_trigger;
+    options.compression = FLAGS_compression_type_e;
+    options.create_if_missing = true;
+    options.disable_seek_compaction = FLAGS_disable_seek_compaction;
+    options.delete_obsolete_files_period_micros =
+      FLAGS_delete_obsolete_files_period_micros;
+    // Deliberately tiny manifest, presumably to exercise manifest rollover
+    // during the stress run — TODO confirm.
+    options.max_manifest_file_size = 1024;
+    options.filter_deletes = FLAGS_filter_deletes;
+    if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) {
+      fprintf(stderr,
+            "prefix_size should be non-zero iff memtablerep == prefix_hash\n");
+      exit(1);
+    }
+    switch (FLAGS_rep_factory) {
+      case kHashSkipList:
+        options.memtable_factory.reset(NewHashSkipListRepFactory(
+            NewFixedPrefixTransform(FLAGS_prefix_size)));
+        break;
+      case kSkipList:
+        // no need to do anything
+        break;
+      case kVectorRep:
+        options.memtable_factory.reset(
+          new VectorRepFactory()
+        );
+        break;
+    }
+    static Random purge_percent(1000); // no benefit from non-determinism here
+    if (static_cast<int32_t>(purge_percent.Uniform(100)) <
+        FLAGS_purge_redundant_percent - 1) {
+      options.purge_redundant_kvs_while_flush = false;
+    }
+
+    if (FLAGS_use_merge) {
+      options.merge_operator = MergeOperators::CreatePutOperator();
+    }
+
+    // set universal style compaction configurations, if applicable
+    if (FLAGS_universal_size_ratio != 0) {
+      options.compaction_options_universal.size_ratio =
+        FLAGS_universal_size_ratio;
+    }
+    if (FLAGS_universal_min_merge_width != 0) {
+      options.compaction_options_universal.min_merge_width =
+        FLAGS_universal_min_merge_width;
+    }
+    if (FLAGS_universal_max_merge_width != 0) {
+      options.compaction_options_universal.max_merge_width =
+        FLAGS_universal_max_merge_width;
+    }
+    if (FLAGS_universal_max_size_amplification_percent != 0) {
+      options.compaction_options_universal.max_size_amplification_percent =
+        FLAGS_universal_max_size_amplification_percent;
+    }
+
+    fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
+
+    Status s;
+    if (FLAGS_ttl == -1) {
+      s = DB::Open(options, FLAGS_db, &db_);
+    } else {
+      s = UtilityDB::OpenTtlDB(options, FLAGS_db, &sdb_, FLAGS_ttl);
+      db_ = sdb_;
+    }
+    if (!s.ok()) {
+      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+  }
+
+  // Tears the DB down without a clean close (only the lock file is removed
+  // via the TEST_Destroy_* hooks) to simulate a crash, then reopens it to
+  // exercise recovery.
+  void Reopen() {
+    // do not close the db. Just delete the lock file. This
+    // simulates a crash-recovery kind of situation.
+    if (FLAGS_ttl != -1) {
+      ((DBWithTTL*) db_)->TEST_Destroy_DBWithTtl();
+    } else {
+      ((DBImpl*) db_)->TEST_Destroy_DBImpl();
+    }
+    db_ = nullptr;
+
+    num_times_reopened_++;
+    double now = FLAGS_env->NowMicros();
+    fprintf(stdout, "%s Reopening database for the %dth time\n",
+            FLAGS_env->TimeToString((uint64_t) now/1000000).c_str(),
+            num_times_reopened_);
+    Open();
+  }
+
+  // Dumps accumulated DB statistics, if --statistics enabled them.
+  void PrintStatistics() {
+    if (dbstats) {
+      fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
+    }
+  }
+
+ private:
+  shared_ptr<Cache> cache_;
+  shared_ptr<Cache> compressed_cache_;
+  const FilterPolicy* filter_policy_;
+  const SliceTransform* prefix_extractor_;
+  DB* db_;
+  // NOTE(review): sdb_ is only assigned in Open() when FLAGS_ttl != -1 and
+  // is otherwise left uninitialized (though also unused in that case).
+  StackableDB* sdb_;
+  int num_times_reopened_;
+};
+
+}  // namespace rocksdb
+
+
+
+// Parses flags, validates the operation-mix percentages and reopen settings,
+// picks a default DB path if none was given, then runs the stress test.
+int main(int argc, char** argv) {
+  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                          " [OPTIONS]...");
+  google::ParseCommandLineFlags(&argc, &argv, true);
+
+  if (FLAGS_statistics) {
+    dbstats = rocksdb::CreateDBStatistics();
+  }
+  FLAGS_compression_type_e =
+    StringToCompressionType(FLAGS_compression_type.c_str());
+  if (!FLAGS_hdfs.empty()) {
+    FLAGS_env  = new rocksdb::HdfsEnv(FLAGS_hdfs);
+  }
+  FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
+
+  // The number of background threads should be at least as much the
+  // max number of concurrent compactions.
+  FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions);
+
+  // The five operation percentages must partition 100% exactly.
+  if ((FLAGS_readpercent + FLAGS_prefixpercent +
+       FLAGS_writepercent + FLAGS_delpercent + FLAGS_iterpercent) != 100) {
+      fprintf(stderr,
+              "Error: Read+Prefix+Write+Delete+Iterate percents != 100!\n");
+      exit(1);
+  }
+  if (FLAGS_disable_wal == 1 && FLAGS_reopen > 0) {
+      fprintf(stderr, "Error: Db cannot reopen safely with disable_wal set!\n");
+      exit(1);
+  }
+  if ((unsigned)FLAGS_reopen >= FLAGS_ops_per_thread) {
+      fprintf(stderr,
+              "Error: #DB-reopens should be < ops_per_thread\n"
+              "Provided reopens = %d and ops_per_thread = %lu\n",
+              FLAGS_reopen,
+              (unsigned long)FLAGS_ops_per_thread);
+      exit(1);
+  }
+
+  // Choose a location for the test database if none given with --db=<path>
+  if (FLAGS_db.empty()) {
+      std::string default_db_path;
+      rocksdb::Env::Default()->GetTestDirectory(&default_db_path);
+      default_db_path += "/dbstress";
+      FLAGS_db = default_db_path;
+  }
+
+  rocksdb::StressTest stress;
+  stress.Run();
+  return 0;
+}
diff --git a/tools/ldb.cc b/tools/ldb.cc
new file mode 100644 (file)
index 0000000..4581b80
--- /dev/null
@@ -0,0 +1,13 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#include "rocksdb/ldb_tool.h"
+
+int main(int argc, char** argv) {
+  rocksdb::LDBTool tool;
+  tool.Run(argc, argv);
+  return 0;
+}
diff --git a/tools/ldb_test.py b/tools/ldb_test.py
new file mode 100644 (file)
index 0000000..fe9a6c6
--- /dev/null
@@ -0,0 +1,356 @@
+import os
+import os.path
+import shutil
+import subprocess
+import time
+import unittest
+import tempfile
+
+def my_check_output(*popenargs, **kwargs):
+    """
+    If we had python 2.7, we should simply use subprocess.check_output.
+    This is a stop-gap solution for python 2.6
+    """
+    if 'stdout' in kwargs:
+        raise ValueError('stdout argument not allowed, it will be overridden.')
+    process = subprocess.Popen(stderr=subprocess.PIPE, stdout=subprocess.PIPE,
+                               *popenargs, **kwargs)
+    output, unused_err = process.communicate()
+    retcode = process.poll()
+    if retcode:
+        cmd = kwargs.get("args")
+        if cmd is None:
+            cmd = popenargs[0]
+        raise Exception("Exit code is not 0.  It is %d.  Command: %s" %
+                (retcode, cmd))
+    return output
+
+def run_err_null(cmd):
+    return os.system(cmd + " 2>/dev/null ")
+
+class LDBTestCase(unittest.TestCase):
+    def setUp(self):
+        self.TMP_DIR  = tempfile.mkdtemp(prefix="ldb_test_")
+        self.DB_NAME = "testdb"
+
+    def tearDown(self):
+        assert(self.TMP_DIR.strip() != "/"
+                and self.TMP_DIR.strip() != "/tmp"
+                and self.TMP_DIR.strip() != "/tmp/") #Just some paranoia
+
+        shutil.rmtree(self.TMP_DIR)
+
+    def dbParam(self, dbName):
+        return "--db=%s" % os.path.join(self.TMP_DIR, dbName)
+
+    def assertRunOKFull(self, params, expectedOutput, unexpected=False):
+        """
+        All command-line params must be specified.
+        Allows full flexibility in testing; for example: missing db param.
+
+        """
+
+        output = my_check_output("./ldb %s |grep -v \"Created bg thread\"" %
+                            params, shell=True)
+        if not unexpected:
+            self.assertEqual(output.strip(), expectedOutput.strip())
+        else:
+            self.assertNotEqual(output.strip(), expectedOutput.strip())
+
+    def assertRunFAILFull(self, params):
+        """
+        All command-line params must be specified.
+        Allows full flexibility in testing; for example: missing db param.
+
+        """
+        try:
+
+            my_check_output("./ldb %s >/dev/null 2>&1 |grep -v \"Created bg \
+                thread\"" % params, shell=True)
+        except Exception, e:
+            return
+        self.fail(
+            "Exception should have been raised for command with params: %s" %
+            params)
+
+    def assertRunOK(self, params, expectedOutput, unexpected=False):
+        """
+        Uses the default test db.
+
+        """
+        self.assertRunOKFull("%s %s" % (self.dbParam(self.DB_NAME), params),
+                             expectedOutput, unexpected)
+
+    def assertRunFAIL(self, params):
+        """
+        Uses the default test db.
+        """
+        self.assertRunFAILFull("%s %s" % (self.dbParam(self.DB_NAME), params))
+
+    def testSimpleStringPutGet(self):
+        print "Running testSimpleStringPutGet..."
+        self.assertRunFAIL("put x1 y1")
+        self.assertRunOK("put --create_if_missing x1 y1", "OK")
+        self.assertRunOK("get x1", "y1")
+        self.assertRunFAIL("get x2")
+
+        self.assertRunOK("put x2 y2", "OK")
+        self.assertRunOK("get x1", "y1")
+        self.assertRunOK("get x2", "y2")
+        self.assertRunFAIL("get x3")
+
+        self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2")
+        self.assertRunOK("put x3 y3", "OK")
+
+        self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2\nx3 : y3")
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3")
+        self.assertRunOK("scan --from=x", "x1 : y1\nx2 : y2\nx3 : y3")
+
+        self.assertRunOK("scan --to=x2", "x1 : y1")
+        self.assertRunOK("scan --from=x1 --to=z --max_keys=1", "x1 : y1")
+        self.assertRunOK("scan --from=x1 --to=z --max_keys=2",
+                "x1 : y1\nx2 : y2")
+
+        self.assertRunOK("scan --from=x1 --to=z --max_keys=3",
+                "x1 : y1\nx2 : y2\nx3 : y3")
+        self.assertRunOK("scan --from=x1 --to=z --max_keys=4",
+                "x1 : y1\nx2 : y2\nx3 : y3")
+        self.assertRunOK("scan --from=x1 --to=x2", "x1 : y1")
+        self.assertRunOK("scan --from=x2 --to=x4", "x2 : y2\nx3 : y3")
+        self.assertRunFAIL("scan --from=x4 --to=z") # No results => FAIL
+        self.assertRunFAIL("scan --from=x1 --to=z --max_keys=foo")
+
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3")
+
+        self.assertRunOK("delete x1", "OK")
+        self.assertRunOK("scan", "x2 : y2\nx3 : y3")
+
+        self.assertRunOK("delete NonExistentKey", "OK")
+        # It is weird that GET and SCAN raise an exception for a
+        # non-existent key, while delete does not
+
+    def dumpDb(self, params, dumpFile):
+        return 0 == run_err_null("./ldb dump %s > %s" % (params, dumpFile))
+
+    def loadDb(self, params, dumpFile):
+        return 0 == run_err_null("cat %s | ./ldb load %s" % (dumpFile, params))
+
+    def testStringBatchPut(self):
+        print "Running testStringBatchPut..."
+        self.assertRunOK("batchput x1 y1 --create_if_missing", "OK")
+        self.assertRunOK("scan", "x1 : y1")
+        self.assertRunOK("batchput x2 y2 x3 y3 \"x4 abc\" \"y4 xyz\"", "OK")
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 abc : y4 xyz")
+        self.assertRunFAIL("batchput")
+        self.assertRunFAIL("batchput k1")
+        self.assertRunFAIL("batchput k1 v1 k2")
+
+    def testCountDelimDump(self):
+        print "Running testCountDelimDump..."
+        self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK")
+        self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK")
+        self.assertRunOK("dump --count_delim", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
+        self.assertRunOK("dump --count_delim=\".\"", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
+        self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK")
+        self.assertRunOK("dump --count_delim=\",\"", "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8")
+
+    def testCountDelimIDump(self):
+        print "Running testCountDelimIDump..."
+        self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK")
+        self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK")
+        self.assertRunOK("dump --count_delim", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
+        self.assertRunOK("dump --count_delim=\".\"", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
+        self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK")
+        self.assertRunOK("dump --count_delim=\",\"", "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8")
+
+    def testInvalidCmdLines(self):
+        print "Running testInvalidCmdLines..."
+        # db not specified
+        self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing")
+        # No param called he
+        self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing")
+        # max_keys is not applicable for put
+        self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing")
+        # hex has invalid boolean value
+
+    def testHexPutGet(self):
+        print "Running testHexPutGet..."
+        self.assertRunOK("put a1 b1 --create_if_missing", "OK")
+        self.assertRunOK("scan", "a1 : b1")
+        self.assertRunOK("scan --hex", "0x6131 : 0x6231")
+        self.assertRunFAIL("put --hex 6132 6232")
+        self.assertRunOK("put --hex 0x6132 0x6232", "OK")
+        self.assertRunOK("scan --hex", "0x6131 : 0x6231\n0x6132 : 0x6232")
+        self.assertRunOK("scan", "a1 : b1\na2 : b2")
+        self.assertRunOK("get a1", "b1")
+        self.assertRunOK("get --hex 0x6131", "0x6231")
+        self.assertRunOK("get a2", "b2")
+        self.assertRunOK("get --hex 0x6132", "0x6232")
+        self.assertRunOK("get --key_hex 0x6132", "b2")
+        self.assertRunOK("get --key_hex --value_hex 0x6132", "0x6232")
+        self.assertRunOK("get --value_hex a2", "0x6232")
+        self.assertRunOK("scan --key_hex --value_hex",
+                "0x6131 : 0x6231\n0x6132 : 0x6232")
+        self.assertRunOK("scan --hex --from=0x6131 --to=0x6133",
+                "0x6131 : 0x6231\n0x6132 : 0x6232")
+        self.assertRunOK("scan --hex --from=0x6131 --to=0x6132",
+                "0x6131 : 0x6231")
+        self.assertRunOK("scan --key_hex", "0x6131 : b1\n0x6132 : b2")
+        self.assertRunOK("scan --value_hex", "a1 : 0x6231\na2 : 0x6232")
+        self.assertRunOK("batchput --hex 0x6133 0x6233 0x6134 0x6234", "OK")
+        self.assertRunOK("scan", "a1 : b1\na2 : b2\na3 : b3\na4 : b4")
+        self.assertRunOK("delete --hex 0x6133", "OK")
+        self.assertRunOK("scan", "a1 : b1\na2 : b2\na4 : b4")
+
+    def testTtlPutGet(self):
+        print "Running testTtlPutGet..."
+        self.assertRunOK("put a1 b1 --ttl --create_if_missing", "OK")
+        self.assertRunOK("scan ", "a1 : b1", True)
+        self.assertRunOK("dump --ttl ", "a1 ==> b1", True)
+        self.assertRunOK("scan --hex --ttl", "0x6131 : 0x6231")
+        self.assertRunOK("get a1", "b1", True)
+        self.assertRunOK("get --ttl a1", "b1")
+        self.assertRunOK("put a3 b3 --create_if_missing", "OK")
+        # fails because the timestamp's length is greater than the value's
+        self.assertRunFAIL("get --ttl a3")
+
+    def testInvalidCmdLines(self):
+        print "Running testInvalidCmdLines..."
+        # db not specified
+        self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing")
+        # No param called he
+        self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing")
+        # max_keys is not applicable for put
+        self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing")
+        # hex has invalid boolean value
+        self.assertRunFAIL("put 0x6133 0x6233 --hex=Boo --create_if_missing")
+
+    def testDumpLoad(self):
+        print "Running testDumpLoad..."
+        self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4",
+                "OK")
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+        origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+
+        # Dump and load without any additional params specified
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump1")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump1")
+        self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # Dump and load in hex
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump2")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump2")
+        self.assertTrue(self.dumpDb("--db=%s --hex" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --hex --create_if_missing" % loadedDbPath, dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # Dump only a portion of the key range
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump3")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump3")
+        self.assertTrue(self.dumpDb(
+            "--db=%s --from=x1 --to=x3" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2")
+
+        # Dump up to max_keys rows
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump4")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump4")
+        self.assertTrue(self.dumpDb(
+            "--db=%s --max_keys=3" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3")
+
+        # Load into an existing db, create_if_missing is not specified
+        self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb("--db=%s" % loadedDbPath, dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # Dump and load with WAL disabled
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump5")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump5")
+        self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --disable_wal --create_if_missing" % loadedDbPath,
+            dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # Dump and load with lots of extra params specified
+        extraParams = " ".join(["--bloom_bits=14", "--compression_type=bzip2",
+                                "--block_size=1024", "--auto_compaction=true",
+                                "--write_buffer_size=4194304",
+                                "--file_size=2097152"])
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump6")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump6")
+        self.assertTrue(self.dumpDb(
+            "--db=%s %s" % (origDbPath, extraParams), dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s %s --create_if_missing" % (loadedDbPath, extraParams),
+            dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # Dump with count_only
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump7")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump7")
+        self.assertTrue(self.dumpDb(
+            "--db=%s --count_only" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
+        # DB should have at least one value for scan to work
+        self.assertRunOKFull("put --db=%s k1 v1" % loadedDbPath, "OK")
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath, "k1 : v1")
+
+        # Dump command fails because of typo in params
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump8")
+        self.assertFalse(self.dumpDb(
+            "--db=%s --create_if_missing" % origDbPath, dumpFilePath))
+
+    def testMiscAdminTask(self):
+        print "Running testMiscAdminTask..."
+        # These tests need to be improved; for example with asserts about
+        # whether compaction or level reduction actually took place.
+        self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4",
+                "OK")
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+        origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+
+        self.assertTrue(0 == run_err_null(
+            "./ldb compact --db=%s" % origDbPath))
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        self.assertTrue(0 == run_err_null(
+            "./ldb reduce_levels --db=%s --new_levels=2" % origDbPath))
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        self.assertTrue(0 == run_err_null(
+            "./ldb reduce_levels --db=%s --new_levels=3" % origDbPath))
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        self.assertTrue(0 == run_err_null(
+            "./ldb compact --db=%s --from=x1 --to=x3" % origDbPath))
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        self.assertTrue(0 == run_err_null(
+            "./ldb compact --db=%s --hex --from=0x6131 --to=0x6134"
+            % origDbPath))
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # TODO(dilip): Not sure what should be passed to WAL. Currently corrupted.
+        self.assertTrue(0 == run_err_null(
+            "./ldb dump_wal --db=%s --walfile=%s --header" % (
+                origDbPath, os.path.join(origDbPath, "LOG"))))
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc
new file mode 100644 (file)
index 0000000..b588b52
--- /dev/null
@@ -0,0 +1,197 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "rocksdb/db.h"
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testutil.h"
+#include "util/testharness.h"
+#include "util/ldb_cmd.h"
+
+namespace rocksdb {
+
+class ReduceLevelTest {
+public:
+  ReduceLevelTest() {
+    dbname_ = test::TmpDir() + "/db_reduce_levels_test";
+    DestroyDB(dbname_, Options());
+    db_ = nullptr;
+  }
+
+  Status OpenDB(bool create_if_missing, int levels,
+      int mem_table_compact_level);
+
+  Status Put(const std::string& k, const std::string& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  std::string Get(const std::string& k) {
+    ReadOptions options;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  Status CompactMemTable() {
+    if (db_ == nullptr) {
+      return Status::InvalidArgument("DB not opened.");
+    }
+    DBImpl* db_impl = reinterpret_cast<DBImpl*>(db_);
+    return db_impl->TEST_FlushMemTable();
+  }
+
+  void CloseDB() {
+    if (db_ != nullptr) {
+      delete db_;
+      db_ = nullptr;
+    }
+  }
+
+  bool ReduceLevels(int target_level);
+
+  int FilesOnLevel(int level) {
+    std::string property;
+    ASSERT_TRUE(
+        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
+                         &property));
+    return atoi(property.c_str());
+  }
+
+private:
+  std::string dbname_;
+  DB* db_;
+};
+
+Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels,
+    int mem_table_compact_level) {
+  rocksdb::Options opt;
+  opt.num_levels = num_levels;
+  opt.create_if_missing = create_if_missing;
+  opt.max_mem_compaction_level = mem_table_compact_level;
+  rocksdb::Status st = rocksdb::DB::Open(opt, dbname_, &db_);
+  if (!st.ok()) {
+    fprintf(stderr, "Can't open the db:%s\n", st.ToString().c_str());
+  }
+  return st;
+}
+
+bool ReduceLevelTest::ReduceLevels(int target_level) {
+  std::vector<std::string> args = rocksdb::ReduceDBLevelsCommand::PrepareArgs(
+      dbname_, target_level, false);
+  LDBCommand* level_reducer = LDBCommand::InitFromCmdLineArgs(args);
+  level_reducer->Run();
+  bool is_succeed = level_reducer->GetExecuteState().IsSucceed();
+  delete level_reducer;
+  return is_succeed;
+}
+
+TEST(ReduceLevelTest, Last_Level) {
+  // create files on all levels;
+  ASSERT_OK(OpenDB(true, 4, 3));
+  ASSERT_OK(Put("aaaa", "11111"));
+  ASSERT_OK(CompactMemTable());
+  ASSERT_EQ(FilesOnLevel(3), 1);
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(3));
+  ASSERT_OK(OpenDB(true, 3, 1));
+  ASSERT_EQ(FilesOnLevel(2), 1);
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(2));
+  ASSERT_OK(OpenDB(true, 2, 1));
+  ASSERT_EQ(FilesOnLevel(1), 1);
+  CloseDB();
+}
+
+TEST(ReduceLevelTest, Top_Level) {
+  // create files on all levels;
+  ASSERT_OK(OpenDB(true, 5, 0));
+  ASSERT_OK(Put("aaaa", "11111"));
+  ASSERT_OK(CompactMemTable());
+  ASSERT_EQ(FilesOnLevel(0), 1);
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(4));
+  ASSERT_OK(OpenDB(true, 4, 0));
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(3));
+  ASSERT_OK(OpenDB(true, 3, 0));
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(2));
+  ASSERT_OK(OpenDB(true, 2, 0));
+  CloseDB();
+}
+
+TEST(ReduceLevelTest, All_Levels) {
+  // create files on all levels;
+  ASSERT_OK(OpenDB(true, 5, 1));
+  ASSERT_OK(Put("a", "a11111"));
+  ASSERT_OK(CompactMemTable());
+  ASSERT_EQ(FilesOnLevel(1), 1);
+  CloseDB();
+
+  ASSERT_OK(OpenDB(true, 5, 2));
+  ASSERT_OK(Put("b", "b11111"));
+  ASSERT_OK(CompactMemTable());
+  ASSERT_EQ(FilesOnLevel(1), 1);
+  ASSERT_EQ(FilesOnLevel(2), 1);
+  CloseDB();
+
+  ASSERT_OK(OpenDB(true, 5, 3));
+  ASSERT_OK(Put("c", "c11111"));
+  ASSERT_OK(CompactMemTable());
+  ASSERT_EQ(FilesOnLevel(1), 1);
+  ASSERT_EQ(FilesOnLevel(2), 1);
+  ASSERT_EQ(FilesOnLevel(3), 1);
+  CloseDB();
+
+  ASSERT_OK(OpenDB(true, 5, 4));
+  ASSERT_OK(Put("d", "d11111"));
+  ASSERT_OK(CompactMemTable());
+  ASSERT_EQ(FilesOnLevel(1), 1);
+  ASSERT_EQ(FilesOnLevel(2), 1);
+  ASSERT_EQ(FilesOnLevel(3), 1);
+  ASSERT_EQ(FilesOnLevel(4), 1);
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(4));
+  ASSERT_OK(OpenDB(true, 4, 0));
+  ASSERT_EQ("a11111", Get("a"));
+  ASSERT_EQ("b11111", Get("b"));
+  ASSERT_EQ("c11111", Get("c"));
+  ASSERT_EQ("d11111", Get("d"));
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(3));
+  ASSERT_OK(OpenDB(true, 3, 0));
+  ASSERT_EQ("a11111", Get("a"));
+  ASSERT_EQ("b11111", Get("b"));
+  ASSERT_EQ("c11111", Get("c"));
+  ASSERT_EQ("d11111", Get("d"));
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(2));
+  ASSERT_OK(OpenDB(true, 2, 0));
+  ASSERT_EQ("a11111", Get("a"));
+  ASSERT_EQ("b11111", Get("b"));
+  ASSERT_EQ("c11111", Get("c"));
+  ASSERT_EQ("d11111", Get("d"));
+  CloseDB();
+}
+
+}
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/tools/shell/DBClientProxy.cpp b/tools/shell/DBClientProxy.cpp
new file mode 100644 (file)
index 0000000..93277ac
--- /dev/null
@@ -0,0 +1,271 @@
+
+#include <boost/shared_ptr.hpp>
+
+#include "DBClientProxy.h"
+
+
+#include "thrift/lib/cpp/protocol/TBinaryProtocol.h"
+#include "thrift/lib/cpp/transport/TSocket.h"
+#include "thrift/lib/cpp/transport/TTransportUtils.h"
+
+
+
+using namespace std;
+using namespace boost;
+using namespace Tleveldb;
+using namespace apache::thrift::protocol;
+using namespace apache::thrift::transport;
+
+namespace rocksdb {
+
+DBClientProxy::DBClientProxy(const string & host, int port) :
+  host_(host),
+  port_(port),
+  dbToHandle_(),
+  dbClient_() {
+}
+
+DBClientProxy::~DBClientProxy() {
+  cleanUp();
+}
+
+
+void DBClientProxy::connect(void) {
+  cleanUp();
+  printf("Connecting to %s:%d\n", host_.c_str(), port_);
+  try {
+    boost::shared_ptr<TSocket> socket(new TSocket(host_, port_));
+    boost::shared_ptr<TTransport> transport(new TBufferedTransport(socket));
+    boost::shared_ptr<TBinaryProtocol> protocol(new TBinaryProtocol(transport));
+    dbClient_.reset(new DBClient(protocol));
+
+    transport->open();
+  } catch (const std::exception & e) {
+    dbClient_.reset();
+    throw;
+  }
+}
+
+void DBClientProxy::cleanUp(void) {
+  if(dbClient_.get()) {
+    for(map<string, DBHandle>::iterator itor = dbToHandle_.begin();
+        itor != dbToHandle_.end();
+        ++itor) {
+      dbClient_->Close(itor->second, itor->first);
+    }
+    dbClient_.reset();
+  }
+  dbToHandle_.clear();
+}
+
+void DBClientProxy::open(const string & db) {
+  if(!dbClient_.get()) {
+    printf("please connect() first\n");
+    return;
+  }
+
+  //  printf("opening database : %s\n", db.c_str());
+  // we use default DBOptions here
+  DBOptions opt;
+  DBHandle handle;
+  try {
+    dbClient_->Open(handle, db, opt);
+  } catch (const LeveldbException & e) {
+    printf("%s\n", e.message.c_str());
+    if(kIOError == e.errorCode) {
+      printf("no such database : %s\n", db.c_str());
+      return;
+    }else {
+      printf("Unknown error : %d\n", e.errorCode);
+      return;
+    }
+  }
+
+  dbToHandle_[db] = handle;
+}
+
+
+bool DBClientProxy::create(const string & db) {
+  if(!dbClient_.get()) {
+    printf("please connect() first\n");
+    return false;
+  }
+
+  printf("creating database : %s\n", db.c_str());
+  DBOptions opt;
+  opt.create_if_missing = true;
+  opt.error_if_exists = true;
+  DBHandle handle;
+  try {
+    dbClient_->Open(handle, db, opt);
+  }catch (const LeveldbException & e) {
+    printf("%s\n", e.message.c_str());
+    printf("error code : %d\n", e.errorCode);
+    if(kNotFound == e.errorCode) {
+      printf("no such database : %s\n", db.c_str());
+      return false;;
+    } else {
+      printf("Unknown error : %d\n", e.errorCode);
+      return false;
+    }
+  }
+
+  dbToHandle_[db] = handle;
+  return true;
+}
+
+
+map<string, DBHandle>::iterator
+DBClientProxy::getHandle(const string & db) {
+  map<string, DBHandle>::iterator itor = dbToHandle_.find(db);
+  if(dbToHandle_.end() == itor) {
+    open(db);
+    itor = dbToHandle_.find(db);
+  }
+
+  return itor;
+}
+
+
+bool DBClientProxy::get(const string & db,
+                        const string & key,
+                        string & value) {
+  if(!dbClient_.get()) {
+    printf("please connect() first\n");
+    return false;
+  }
+
+  map<string, DBHandle>::iterator itor = getHandle(db);
+  if(dbToHandle_.end() == itor) {
+    return false;
+  }
+
+  ResultItem ret;
+  Slice k;
+  k.data = key;
+  k.size = key.size();
+  // we use default values of options here
+  ReadOptions opt;
+  dbClient_->Get(ret,
+                 itor->second,
+                 k,
+                 opt);
+  if(kOk == ret.status) {
+    value = ret.value.data;
+    return true;
+  } else if(kNotFound == ret.status) {
+    printf("no such key : %s\n", key.c_str());
+    return false;
+  } else {
+    printf("get data error : %d\n", ret.status);
+    return false;
+  }
+}
+
+
+
+bool DBClientProxy::put(const string & db,
+                        const string & key,
+                        const string & value) {
+  if(!dbClient_.get()) {
+    printf("please connect() first\n");
+    return false;
+  }
+
+  map<string, DBHandle>::iterator itor = getHandle(db);
+  if(dbToHandle_.end() == itor) {
+    return false;
+  }
+  kv temp;
+  temp.key.data = key;
+  temp.key.size = key.size();
+  temp.value.data = value;
+  temp.value.size = value.size();
+  WriteOptions opt;
+  opt.sync = true;
+  Code code;
+  code = dbClient_->Put(itor->second,
+                        temp,
+                        opt);
+
+
+  if(kOk == code) {
+    //    printf("set value finished\n");
+    return true;
+  } else {
+    printf("put data error : %d\n", code);
+    return false;
+  }
+}
+
+bool DBClientProxy::scan(const string & db,
+                         const string & start_key,
+                         const string & end_key,
+                         const string & limit,
+                         vector<pair<string, string> > & kvs) {
+  if(!dbClient_.get()) {
+    printf("please connect() first\n");
+    return false;
+  }
+
+  int limitInt = -1;
+  limitInt = atoi(limit.c_str());
+  if(limitInt <= 0) {
+    printf("Error while parse limit : %s\n", limit.c_str());
+    return false;
+  }
+
+  if(start_key > end_key) {
+    printf("empty range.\n");
+    return false;
+  }
+
+  map<string, DBHandle>::iterator itor = getHandle(db);
+  if(dbToHandle_.end() == itor) {
+    return false;
+  }
+
+  ResultIterator ret;
+  // we use the default values of options here
+  ReadOptions opt;
+  Slice k;
+  k.data = start_key;
+  k.size = start_key.size();
+  dbClient_->NewIterator(ret,
+                         itor->second,
+                         opt,
+                         seekToKey,
+                         k);
+  Iterator it;
+  if(kOk == ret.status) {
+    it = ret.iterator;
+  } else {
+    printf("get iterator error : %d\n", ret.status);
+    return false;
+  }
+
+  int idx = 0;
+  string ck = start_key;
+  while(idx < limitInt && ck < end_key) {
+    ResultPair retPair;
+    dbClient_->GetNext(retPair, itor->second, it);
+    if(kOk == retPair.status) {
+      ++idx;
+      ck = retPair.keyvalue.key.data;
+      if (ck < end_key) {
+        kvs.push_back(make_pair(retPair.keyvalue.key.data,
+                                retPair.keyvalue.value.data));
+      }
+    } else if(kEnd == retPair.status) {
+      printf("not enough values\n");
+      return true;
+    } else {
+      printf("GetNext() error : %d\n", retPair.status);
+      return false;
+    }
+  }
+  return true;
+}
+
+} // namespace
diff --git a/tools/shell/DBClientProxy.h b/tools/shell/DBClientProxy.h
new file mode 100644 (file)
index 0000000..fba228b
--- /dev/null
@@ -0,0 +1,64 @@
+
+#ifndef TOOLS_SHELL_DBCLIENTPROXY
+#define TOOLS_SHELL_DBCLIENTPROXY
+
+#include <vector>
+#include <map>
+#include <string>
+#include <boost/utility.hpp>
+#include <boost/shared_ptr.hpp>
+
+#include "DB.h"
+
+/*
+ * class DBClientProxy maintains:
+ * 1. a connection to rocksdb service
+ * 2. a map from db names to opened db handles
+ *
+ * it is the client code's responsibility to catch all possible exceptions.
+ */
+
+namespace rocksdb {
+
+class DBClientProxy : private boost::noncopyable {
+ public:
+  // connect to host_:port_
+  void connect(void);
+
+  // return true on success, false otherwise
+  bool get(const std::string & db,
+           const std::string & key,
+           std::string & value);
+
+  // return true on success, false otherwise
+  bool put(const std::string & db,
+           const std::string & key,
+           const std::string & value);
+
+  // return true on success, false otherwise
+  bool scan(const std::string & db,
+            const std::string & start_key,
+            const std::string & end_key,
+            const std::string & limit,
+            std::vector<std::pair<std::string, std::string> > & kvs);
+
+  // return true on success, false otherwise
+  bool create(const std::string & db);
+
+  DBClientProxy(const std::string & host, int port);
+  ~DBClientProxy();
+
+ private:
+  // some internal help functions
+  void cleanUp(void);
+  void open(const std::string & db);
+  std::map<std::string, Trocksdb::DBHandle>::iterator getHandle(const std::string & db);
+
+  const std::string host_;
+  const int port_;
+  std::map<std::string, Trocksdb::DBHandle> dbToHandle_;
+  boost::shared_ptr<Trocksdb::DBClient> dbClient_;
+};
+
+} // namespace
+#endif
diff --git a/tools/shell/LeveldbShell.cpp b/tools/shell/LeveldbShell.cpp
new file mode 100644 (file)
index 0000000..e6274d3
--- /dev/null
@@ -0,0 +1,8 @@
+
+
+#include "ShellContext.h"
+
+int main(int argc, char ** argv) {
+  ShellContext c(argc, argv);
+  c.run();
+}
diff --git a/tools/shell/ShellContext.cpp b/tools/shell/ShellContext.cpp
new file mode 100644 (file)
index 0000000..05a9bb8
--- /dev/null
@@ -0,0 +1,104 @@
+
+#include <iostream>
+#include <boost/shared_ptr.hpp>
+
+#include "ShellContext.h"
+#include "ShellState.h"
+
+
+
+#include "thrift/lib/cpp/protocol/TBinaryProtocol.h"
+#include "thrift/lib/cpp/transport/TSocket.h"
+#include "thrift/lib/cpp/transport/TTransportUtils.h"
+
+
+
+using namespace std;
+using namespace boost;
+using namespace Tleveldb;
+using namespace rocksdb;
+using namespace apache::thrift::protocol;
+using namespace apache::thrift::transport;
+
+// Switch the state machine to pState. States are singletons (see
+// ShellState.h); the context does not own the pointer.
+void ShellContext::changeState(ShellState * pState) {
+  pShellState_ = pState;
+}
+
+// Request loop termination; run() exits after the current state returns.
+void ShellContext::stop(void) {
+  exit_ = true;
+}
+
+// Parse "leveldb_shell host port" from argv and construct the client proxy.
+// Returns false (after printing usage/an error) on bad arguments.
+bool ShellContext::ParseInput(void) {
+  if(argc_ != 3) {
+    printf("leveldb_shell host port\n");
+    return false;
+  }
+
+  // atoi returns 0 on non-numeric input, which the <= 0 check rejects.
+  port_ = atoi(argv_[2]);
+  if(port_ <= 0) {
+    printf("Error while parse port : %s\n", argv_[2]);
+    return false;
+  }
+
+  clientProxy_.reset(new DBClientProxy(argv_[1], port_));
+  // NOTE(review): plain `new` throws std::bad_alloc rather than returning
+  // nullptr, so this false branch is effectively dead code.
+  if(!clientProxy_.get()) {
+    return false;
+  } else {
+    return true;
+  }
+}
+
+// Open the transport to the server. May throw TTransportException;
+// callers (ShellStateConnecting::run) handle it.
+void ShellContext::connect(void) {
+  clientProxy_->connect();
+}
+
+// Create database `db`; prints a confirmation only on success.
+void ShellContext::create(const string & db) {
+  if (clientProxy_->create(db)) {
+    printf("%s created\n", db.c_str());
+  }
+}
+
+// Fetch `key` from `db` and print the value; silent when the proxy
+// reports failure (e.g. missing key).
+void ShellContext::get(const string & db,
+                       const string & key) {
+  string v;
+  if (clientProxy_->get(db, key, v)) {
+    printf("%s\n", v.c_str());
+  }
+}
+
+// Store (key, value) into `db`; echoes the pair on success.
+void ShellContext::put(const string & db,
+                       const string & key,
+                       const string & value) {
+  if (clientProxy_->put(db, key, value)) {
+    printf("(%s, %s) has been set\n", key.c_str(), value.c_str());
+  }
+}
+
+// Range-scan [start_key, end_key) capped by `limit` and print each pair
+// with its index.
+void ShellContext::scan(const string & db,
+                        const string & start_key,
+                        const string & end_key,
+                        const string & limit) {
+  vector<pair<string, string> > kvs;
+  if (clientProxy_->scan(db, start_key, end_key, limit, kvs)) {
+    for(unsigned int i = 0; i < kvs.size(); ++i) {
+      // NOTE(review): i is unsigned int, so the format should be %u, not %d.
+      printf("%d (%s, %s)\n", i, kvs[i].first.c_str(), kvs[i].second.c_str());
+    }
+  }
+}
+
+// State-machine driver: repeatedly delegate to the current state until
+// some state calls stop().
+void ShellContext::run(void) {
+  while(!exit_) {
+    pShellState_->run(this);
+  }
+}
+
+// Begin in the Start state; argv is borrowed from main (not copied).
+ShellContext::ShellContext(int argc, char ** argv) :
+  pShellState_(ShellStateStart::getInstance()),
+  exit_(false),
+  argc_(argc),
+  argv_(argv),
+  port_(-1),
+  clientProxy_() {
+}
+
+
diff --git a/tools/shell/ShellContext.h b/tools/shell/ShellContext.h
new file mode 100644 (file)
index 0000000..5c2b944
--- /dev/null
@@ -0,0 +1,51 @@
+#ifndef TOOLS_SHELL_SHELLCONTEXT
+#define TOOLS_SHELL_SHELLCONTEXT
+
+#include <map>
+#include <string>
+#include <boost/utility.hpp>
+#include <boost/shared_ptr.hpp>
+
+#include "DB.h"
+#include "DBClientProxy.h"
+
+class ShellState;
+
+// Shared context of the shell's state machine: owns the client proxy and
+// the loop flag; states call back into it to act and to transition.
+class ShellContext : private boost::noncopyable {
+ public:
+  void changeState(ShellState * pState);
+
+  void stop(void);
+
+  bool ParseInput(void);
+
+  void connect(void);
+
+  void get(const std::string & db,
+           const std::string & key);
+
+  void put(const std::string & db,
+           const std::string & key,
+           const std::string & value);
+
+  void scan(const std::string & db,
+            const std::string & start_key,
+            const std::string & end_key,
+            const std::string & limit);
+
+  void create(const std::string & db);
+
+  void run(void);
+
+  ShellContext(int argc, char ** argv);
+
+ private:
+  ShellState * pShellState_;  // current state (singleton, not owned)
+  bool exit_;                 // set by stop(); ends run()'s loop
+  int argc_;                  // borrowed CLI args from main
+  char ** argv_;
+  int port_;                  // parsed in ParseInput(); -1 until then
+  boost::shared_ptr<rocksdb::DBClientProxy> clientProxy_;
+};
+
+#endif
diff --git a/tools/shell/ShellState.cpp b/tools/shell/ShellState.cpp
new file mode 100644 (file)
index 0000000..057a337
--- /dev/null
@@ -0,0 +1,139 @@
+#include <iostream>
+#include <string>
+#include <sstream>
+#include <vector>
+
+#include "ShellState.h"
+#include "ShellContext.h"
+#include "transport/TTransportException.h"
+
+using namespace std;
+
+using namespace apache::thrift::transport;
+
+// Interactive prompt string.
+const char * PMT = ">> ";
+
+
+// Start state: parse the CLI args; go to Connecting on success, Stop on
+// failure.
+void ShellStateStart::run(ShellContext * c) {
+  if(!c->ParseInput()) {
+    c->changeState(ShellStateStop::getInstance());
+  } else {
+    c->changeState(ShellStateConnecting::getInstance());
+  }
+}
+
+
+// Stop state: terminate the shell loop.
+void ShellStateStop::run(ShellContext * c) {
+  c->stop();
+}
+
+// Connecting state: attempt the thrift connection; on transport failure
+// report and stop, otherwise advance to Connected.
+void ShellStateConnecting::run(ShellContext * c) {
+  try {
+    c->connect();
+  } catch (const TTransportException & e) {
+    cout << e.what() << endl;
+    c->changeState(ShellStateStop::getInstance());
+    return;
+  }
+  
+  c->changeState(ShellStateConnected::getInstance());
+}
+
+// Print the fallback message for an unrecognized or malformed command.
+void ShellStateConnected::unknownCmd(void) {
+  cout << "Unknown command!" << endl;
+  cout << "Use help to list all available commands" << endl;
+}
+
+// Print the command reference shown by the "help" command.
+void ShellStateConnected::helpMsg(void) {
+  cout << "Currently supported commands:" << endl;
+  cout << "create db" << endl;
+  cout << "get db key" << endl;
+  cout << "scan db start_key end_key limit" << endl;
+  cout << "put db key value" << endl;
+  cout << "exit/quit" << endl;
+}
+
+// After a transport error, ask the user whether to reconnect; transitions
+// to Connecting on "y", Stop on "n", and re-prompts on anything else.
+// Note: if stdin closes (getline fails) no transition happens and the
+// state machine stays in Connected.
+void ShellStateConnected::handleConError(ShellContext * c) {
+  cout << "Connection down" << endl;
+  cout << "Reconnect ? (y/n) :" << endl;
+  string s;
+  while(getline(cin, s)) {
+    if("y" == s) {
+      c->changeState(ShellStateConnecting::getInstance());
+      break;
+    } else if("n" == s) {
+      c->changeState(ShellStateStop::getInstance());
+      break;
+    } else {
+      cout << "Reconnect ? (y/n) :" << endl;
+    }
+  }
+}
+
+// Connected state: read one line, tokenize on whitespace, and dispatch to
+// the matching ShellContext operation. Transport exceptions from any
+// command fall into handleConError(); wrong arity falls into unknownCmd().
+void ShellStateConnected::run(ShellContext * c) {
+  string line;
+  cout << PMT;
+  getline(cin, line);
+  istringstream is(line);
+  vector<string> params;
+  string param;
+  while(is >> param) {
+    params.push_back(param);
+  }
+
+  // empty input line
+  if(params.empty())
+    return;
+
+  if("quit" == params[0] || "exit" == params[0]) {
+    c->changeState(ShellStateStop::getInstance());
+  } else if("get" == params[0]) {
+    // get db key
+    if(params.size() == 3) {
+      try {
+        c->get(params[1], params[2]);
+      } catch (const TTransportException & e) {
+        cout << e.what() << endl;
+        handleConError(c);
+      }
+    } else {
+      unknownCmd();
+    }
+  } else if("create" == params[0]) {
+    // create db
+    if(params.size() == 2) {
+      try {
+        c->create(params[1]);
+      } catch (const TTransportException & e) {
+        cout << e.what() << endl;
+        handleConError(c);
+      }
+    } else {
+      unknownCmd();
+    }
+  }else if("put" == params[0]) {
+    // put db key value
+    if(params.size() == 4) {
+      try {
+        c->put(params[1], params[2], params[3]);
+      } catch (const TTransportException & e) {
+        cout << e.what() << endl;
+        handleConError(c);
+      }
+    } else {
+      unknownCmd();
+    }
+  } else if("scan" == params[0]) {
+    // scan db start_key end_key limit
+    if(params.size() == 5) {
+      try {
+        c->scan(params[1], params[2], params[3], params[4]);
+      } catch (const TTransportException & e) {
+        cout << e.what() << endl;
+        handleConError(c);
+      }
+    } else {
+      unknownCmd();
+    }
+  } else if("help" == params[0]) {
+    helpMsg();
+  } else {
+    unknownCmd();
+  }
+}
diff --git a/tools/shell/ShellState.h b/tools/shell/ShellState.h
new file mode 100644 (file)
index 0000000..4027af2
--- /dev/null
@@ -0,0 +1,87 @@
+
+#ifndef TOOLS_SHELL_SHELLSTATE
+#define TOOLS_SHELL_SHELLSTATE
+
+class ShellContext;
+
+/*
+ * Currently, there are four types of state in total
+ * 1. start state: the first state the program enters
+ * 2. connecting state: the program tries to connect to a rocksdb server; its
+ *    previous state could be the "start" or "connected" state
+ * 3. connected state: the program has already connected to a server, and is
+ *    processing user commands
+ * 4. stop state: the last state the program enters; performs final cleanup
+ */
+
+// Abstract state of the shell's state machine; run() performs the state's
+// work and (usually) transitions the context to the next state.
+class ShellState {
+ public:
+  virtual void run(ShellContext *) = 0;
+  virtual ~ShellState() {}
+};
+
+
+// Start state. Meyers singleton: the function-local static is the single
+// instance; private ctor/dtor prevent other instantiation.
+class ShellStateStart : public ShellState {
+ public:
+  static ShellStateStart * getInstance(void) {
+    static ShellStateStart instance;
+    return &instance;
+  }
+
+  virtual void run(ShellContext *);
+
+ private:
+  ShellStateStart() {}
+  virtual ~ShellStateStart() {}
+};
+
+// Stop state singleton; its run() ends the shell loop.
+class ShellStateStop : public ShellState {
+ public:
+  static ShellStateStop * getInstance(void) {
+    static ShellStateStop instance;
+    return &instance;
+  }
+
+  virtual void run(ShellContext *);
+
+ private:
+  ShellStateStop() {}
+  virtual ~ShellStateStop() {}
+
+};
+
+// Connecting state singleton; its run() attempts the server connection.
+class ShellStateConnecting : public ShellState {
+ public:
+  static ShellStateConnecting * getInstance(void) {
+    static ShellStateConnecting instance;
+    return &instance;
+  }
+
+  virtual void run(ShellContext *);
+
+ private:
+  ShellStateConnecting() {}
+  virtual ~ShellStateConnecting() {}
+
+};
+
+// Connected state singleton; its run() reads and dispatches one command.
+class ShellStateConnected : public ShellState {
+ public:
+  static ShellStateConnected * getInstance(void) {
+    static ShellStateConnected instance;
+    return &instance;
+  }
+
+  virtual void run(ShellContext *);
+
+ private:
+  ShellStateConnected() {}
+  virtual ~ShellStateConnected() {}
+
+  // helpers for run(): bad-command message, reconnect prompt, help text
+  void unknownCmd();
+  void handleConError(ShellContext *);
+  void helpMsg();
+};
+
+#endif
+
diff --git a/tools/shell/test/DBClientProxyTest.cpp b/tools/shell/test/DBClientProxyTest.cpp
new file mode 100644 (file)
index 0000000..3b64ffc
--- /dev/null
@@ -0,0 +1,182 @@
+/**
+ * Tests for DBClientProxy class for leveldb
+ * @author Bo Liu (newpoo.liu@gmail.com)
+ * Copyright 2012 Facebook
+ */
+
+#include <algorithm>
+#include <vector>
+#include <string>
+#include <protocol/TBinaryProtocol.h>
+#include <transport/TSocket.h>
+#include <transport/TBufferTransports.h>
+#include <util/testharness.h>
+#include <DB.h>
+#include <AssocService.h>
+#include <leveldb_types.h>
+
+#include "server_options.h"
+
+
+#include "../DBClientProxy.h"
+using namespace rocksdb;
+
+
+using namespace apache::thrift;
+using namespace apache::thrift::protocol;
+using namespace apache::thrift::transport;
+using boost::shared_ptr;
+using namespace Tleveldb;
+using namespace std;
+
+
+
+extern "C" void startServer(int argc, char**argv);
+extern "C" void stopServer(int port);
+extern  ServerOptions server_options;
+
+// Name of the single database exercised by this test.
+static const string db1("db1");
+
+
+// End-to-end exercise of DBClientProxy against the in-process server:
+// pre-connect failures, create, get/put, and scan with range/limit.
+// The printf escape sequences are ANSI color codes (green "passed", then
+// back to white).
+static void testDBClientProxy(DBClientProxy & dbcp) {
+  bool flag;
+  const int NOK = 100;     // number of keys
+  const int BUFSIZE = 16;  // per-key/value formatting buffer
+  int testcase = 0;
+
+  vector<string> keys, values;
+  vector<pair<string, string> > kvs, correctKvs;
+  string k, v;
+
+  // Build key0..key99 / value0..value99.
+  for(int i = 0; i < NOK; ++i) {
+    char bufKey[BUFSIZE];
+    char bufValue[BUFSIZE];
+    snprintf(bufKey, BUFSIZE, "key%d", i);
+    snprintf(bufValue, BUFSIZE, "value%d", i);
+    keys.push_back(bufKey);
+    values.push_back(bufValue);
+    correctKvs.push_back((make_pair(string(bufKey), string(bufValue))));
+  }
+
+  // Lexicographic order, matching scan output ("key10" sorts before "key2").
+  sort(correctKvs.begin(), correctKvs.end());
+
+
+  // can not do get(), put(), scan() or create() before connected.
+  flag = dbcp.get(db1, keys[0], v);
+  ASSERT_TRUE(false == flag);
+  printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase);
+  flag = dbcp.put(db1, keys[0], keys[1]);
+  ASSERT_TRUE(false == flag);
+  printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase);
+  flag = dbcp.scan(db1, "a", "w", "100", kvs);
+  ASSERT_TRUE(false == flag);
+  printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase);
+  flag = dbcp.create(db1);
+  ASSERT_TRUE(false == flag);
+  printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase);
+
+  dbcp.connect();
+
+  // create a database
+  flag = dbcp.create(db1);
+  ASSERT_TRUE(true == flag);
+  printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase);
+
+  // no such key
+  flag = dbcp.get(db1, keys[0], v);
+  ASSERT_TRUE(false == flag);
+  printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase);
+
+
+  // scan() success with empty returned key-value pairs
+  kvs.clear();
+  flag = dbcp.scan(db1, "a", "w", "100", kvs);
+  ASSERT_TRUE(true == flag);
+  ASSERT_TRUE(kvs.empty());
+  printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase);
+
+
+  // put()
+  for(int i = 0; i < NOK; ++i) {
+    flag = dbcp.put(db1, keys[i], values[i]);
+    ASSERT_TRUE(true == flag);
+  }
+  printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase);
+
+
+  // scan all of key-value pairs
+  kvs.clear();
+  flag = dbcp.scan(db1, "a", "w", "100", kvs);
+  ASSERT_TRUE(true == flag);
+  ASSERT_TRUE(kvs == correctKvs);
+  printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase);
+
+
+  // scan the first 20 key-value pairs
+  {
+    kvs.clear();
+    flag = dbcp.scan(db1, "a", "w", "20", kvs);
+    ASSERT_TRUE(true == flag);
+    vector<pair<string, string> > tkvs(correctKvs.begin(), correctKvs.begin() + 20);
+    ASSERT_TRUE(kvs == tkvs);
+    printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase);
+  }
+
+  // scan key[10] to key[50]
+  {
+    kvs.clear();
+    flag = dbcp.scan(db1, correctKvs[10].first, correctKvs[50].first, "100", kvs);
+    ASSERT_TRUE(true == flag);
+
+    // end_key is exclusive: exactly the 40 pairs [10, 50).
+    vector<pair<string, string> > tkvs(correctKvs.begin() + 10, correctKvs.begin() + 50);
+    ASSERT_TRUE(kvs == tkvs);
+    printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase);
+  }
+
+  // scan "key10" to "key40" by limit constraint
+  {
+    kvs.clear();
+    flag = dbcp.scan(db1, correctKvs[10].first.c_str(), "w", "30", kvs);
+    ASSERT_TRUE(true == flag);
+    vector<pair<string, string> > tkvs(correctKvs.begin() + 10, correctKvs.begin() + 40);
+    ASSERT_TRUE(kvs == tkvs);
+    printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase);
+  }
+
+
+  // get()
+  flag = dbcp.get(db1, "unknownKey", v);
+  ASSERT_TRUE(false == flag);
+  printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase);
+
+  flag = dbcp.get(db1, keys[0], v);
+  ASSERT_TRUE(true == flag);
+  ASSERT_TRUE(v == values[0]);
+  printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase);
+}
+
+
+
+// Delete the test data directory with "rm -rf".
+// NOTE(review): the 100-byte heap buffer is never delete[]d (leak) — a
+// stack array would do — and the system() return value is ignored.
+static void cleanupDir(std::string dir) {
+  // remove old data, if any
+  char* cleanup = new char[100];
+  snprintf(cleanup, 100, "rm -rf %s", dir.c_str());
+  system(cleanup);
+}
+
+// Test driver: start the in-process server, wait for it to pick a port,
+// wipe stale test data, then run the proxy test suite against it.
+int main(int argc, char **argv) {
+  // create a server
+  startServer(argc, argv);
+  printf("Server thread created.\n");
+
+  // give some time to the server to initialize itself
+  while (server_options.getPort() == 0) {
+    sleep(1);
+  }
+
+  cleanupDir(server_options.getDataDirectory(db1));
+
+  DBClientProxy dbcp("localhost", server_options.getPort());
+  testDBClientProxy(dbcp);
+}
+
diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc
new file mode 100644 (file)
index 0000000..9038895
--- /dev/null
@@ -0,0 +1,261 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/table.h"
+#include "table/block.h"
+#include "table/block_builder.h"
+#include "table/format.h"
+#include "util/ldb_cmd.h"
+#include "util/random.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+// Reads one .sst file sequentially for the sst_dump tool, optionally
+// verifying checksums and printing keys/values (in hex if requested).
+class SstFileReader {
+ public:
+  explicit SstFileReader(const std::string& file_name,
+                         bool verify_checksum,
+                         bool output_hex);
+
+  // Iterate the table from `from_key` (or the start) until `to_key`,
+  // `read_num` entries, or EOF; prints entries when print_kv is set.
+  Status ReadSequential(bool print_kv,
+                        uint64_t read_num,
+                        bool has_from,
+                        const std::string& from_key,
+                        bool has_to,
+                        const std::string& to_key);
+
+  // Number of entries consumed by all ReadSequential calls so far.
+  uint64_t GetReadNumber() { return read_num_; }
+
+private:
+  std::string file_name_;
+  uint64_t read_num_;       // cumulative entries read
+  bool verify_checksum_;
+  bool output_hex_;
+  EnvOptions soptions_;
+};
+
+// Remember the target file and output flags; announces the file on stdout.
+SstFileReader::SstFileReader(const std::string& file_path,
+                             bool verify_checksum,
+                             bool output_hex)
+ :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum),
+  output_hex_(output_hex) {
+  std::cout << "Process " << file_path << "\n";
+}
+
+// Open file_name_ through the table factory and walk its entries in order.
+// Stops at `read_num` entries, at `to_key` (exclusive), or at EOF.
+// Unparsable internal keys are reported to stderr and skipped.
+Status SstFileReader::ReadSequential(bool print_kv,
+                                     uint64_t read_num,
+                                     bool has_from,
+                                     const std::string& from_key,
+                                     bool has_to,
+                                     const std::string& to_key)
+{
+  unique_ptr<TableReader> table_reader;
+  // Table entries are internal keys; compare with the internal comparator.
+  InternalKeyComparator internal_comparator_(BytewiseComparator());
+  Options table_options;
+  table_options.comparator = &internal_comparator_;
+  unique_ptr<RandomAccessFile> file;
+  Status s = table_options.env->NewRandomAccessFile(file_name_, &file,
+                                                    soptions_);
+  if(!s.ok()) {
+   return s;
+  }
+  uint64_t file_size;
+  // NOTE(review): GetFileSize's status is ignored; on failure file_size is
+  // used uninitialized.
+  table_options.env->GetFileSize(file_name_, &file_size);
+  unique_ptr<TableFactory> table_factory;
+  s = table_options.table_factory->GetTableReader(table_options, soptions_,
+                                                  std::move(file), file_size,
+                                                  &table_reader);
+  if(!s.ok()) {
+   return s;
+  }
+
+  Iterator* iter = table_reader->NewIterator(ReadOptions(verify_checksum_,
+                                                         false));
+  uint64_t i = 0;
+  if (has_from) {
+    // Seek to the first internal key >= from_key at any sequence number.
+    InternalKey ikey(from_key, kMaxSequenceNumber, kValueTypeForSeek);
+    iter->Seek(ikey.Encode());
+  } else {
+    iter->SeekToFirst();
+  }
+  for (; iter->Valid(); iter->Next()) {
+    Slice key = iter->key();
+    Slice value = iter->value();
+    ++i;
+    if (read_num > 0 && i > read_num)
+      break;
+
+    ParsedInternalKey ikey;
+    if (!ParseInternalKey(key, &ikey)) {
+      std::cerr << "Internal Key ["
+                << key.ToString(true /* in hex*/)
+                << "] parse error!\n";
+      continue;
+    }
+
+    // If end marker was specified, we stop before it
+    if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) {
+      break;
+    }
+
+    if (print_kv) {
+      std::cout << ikey.DebugString(output_hex_)
+                << " => "
+                << value.ToString(output_hex_) << "\n";
+    }
+
+   }
+
+   read_num_ += i;
+
+   Status ret = iter->status();
+   delete iter;
+   return ret;
+}
+
+} // namespace rocksdb
+
+// Print sst_dump's command-line usage to stderr.
+static void print_help() {
+  fprintf(stderr,
+      "sst_dump [--command=check|scan] [--verify_checksum] "
+      "--file=data_dir_OR_sst_file"
+      " [--output_hex]"
+      " [--input_key_hex]"
+      " [--from=<user_key>]"
+      " [--to=<user_key>]"
+      " [--read_num=NUM]\n");
+}
+
+// Decode a "0x...." hex string into raw bytes; throws a const char* on a
+// missing "0x" prefix.
+// NOTE(review): str[0]/str[1] are read before any length check (UB for
+// inputs shorter than 2 chars), the sscanf result is unchecked (non-hex
+// digits leave c stale), and an odd-length tail is still consumed.
+string HexToString(const string& str) {
+  string parsed;
+  if (str[0] != '0' || str[1] != 'x') {
+    fprintf(stderr, "Invalid hex input %s.  Must start with 0x\n",
+            str.c_str());
+    throw "Invalid hex input";
+  }
+
+  for (unsigned int i = 2; i < str.length();) {
+    int c;
+    sscanf(str.c_str() + i, "%2X", &c);
+    parsed.push_back(c);  // implicit int -> char narrowing of the byte value
+    i += 2;
+  }
+  return parsed;
+}
+
+// sst_dump entry point: parse flags, resolve --file into a list of .sst
+// files (a directory is expanded via GetChildren), then scan/check each.
+int main(int argc, char** argv) {
+
+  const char* dir_or_file = nullptr;
+  // NOTE(review): -1 wraps to UINT64_MAX, so "read_num > 0" below is always
+  // true; the "no limit" case relies on the huge value, not the sentinel.
+  uint64_t read_num = -1;
+  std::string command;
+
+  char junk;
+  uint64_t n;
+  bool verify_checksum = false;
+  bool output_hex = false;
+  bool input_key_hex = false;
+  bool has_from = false;
+  bool has_to = false;
+  std::string from_key;
+  std::string to_key;
+  for (int i = 1; i < argc; i++)
+  {
+    if (strncmp(argv[i], "--file=", 7) == 0) {
+      dir_or_file = argv[i] + 7;
+    } else if (strcmp(argv[i], "--output_hex") == 0) {
+      output_hex = true;
+    } else if (strcmp(argv[i], "--input_key_hex") == 0) {
+      input_key_hex = true;
+    } else if (sscanf(argv[i],
+               "--read_num=%lu%c",
+               (unsigned long*)&n, &junk) == 1) {
+      // %lu assumes unsigned long is 64-bit (true on LP64 platforms).
+      read_num = n;
+    } else if (strcmp(argv[i], "--verify_checksum") == 0) {
+      verify_checksum = true;
+    } else if (strncmp(argv[i], "--command=", 10) == 0) {
+      command = argv[i] + 10;
+    } else if (strncmp(argv[i], "--from=", 7) == 0) {
+      from_key = argv[i] + 7;
+      has_from = true;
+    } else if (strncmp(argv[i], "--to=", 5) == 0) {
+      to_key = argv[i] + 5;
+      has_to = true;
+    }else {
+      print_help();
+      exit(1);
+    }
+  }
+
+
+  // With --input_key_hex, --from/--to are "0x..." encodings of raw keys.
+  if (input_key_hex) {
+    if (has_from) {
+      from_key = HexToString(from_key);
+    }
+    if (has_to) {
+      to_key = HexToString(to_key);
+    }
+  }
+
+  if(dir_or_file == nullptr) {
+    print_help();
+    exit(1);
+  }
+
+  std::vector<std::string> filenames;
+  rocksdb::Env* env = rocksdb::Env::Default();
+  // If GetChildren fails, assume --file named a single .sst file.
+  rocksdb::Status st = env->GetChildren(dir_or_file, &filenames);
+  bool dir = true;
+  if (!st.ok()) {
+    filenames.clear();
+    filenames.push_back(dir_or_file);
+    dir = false;
+  }
+
+  std::cout << "from [" << rocksdb::Slice(from_key).ToString(true)
+            << "] to [" << rocksdb::Slice(to_key).ToString(true) << "]\n";
+
+  uint64_t total_read = 0;
+  for (size_t i = 0; i < filenames.size(); i++) {
+    std::string filename = filenames.at(i);
+    if (filename.length() <= 4 ||
+        filename.rfind(".sst") != filename.length() - 4) {
+      //ignore non-.sst entries
+      continue;
+    }
+    if(dir) {
+      filename = std::string(dir_or_file) + "/" + filename;
+    }
+    rocksdb::SstFileReader reader(filename, verify_checksum,
+                                  output_hex);
+    rocksdb::Status st;
+    // scan all files in give file path.
+    if (command == "" || command == "scan" || command == "check") {
+      // "check" parses without printing; "scan" (or no command) prints.
+      st = reader.ReadSequential(command != "check",
+                                 read_num > 0 ? (read_num - total_read) :
+                                                read_num,
+                                 has_from, from_key, has_to, to_key);
+      if (!st.ok()) {
+        fprintf(stderr, "%s: %s\n", filename.c_str(),
+            st.ToString().c_str());
+      }
+      total_read += reader.GetReadNumber();
+      if (read_num > 0 && total_read > read_num) {
+        break;
+      }
+    }
+  }
+}
diff --git a/util/arena_impl.cc b/util/arena_impl.cc
new file mode 100644 (file)
index 0000000..5125e23
--- /dev/null
@@ -0,0 +1,93 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/arena_impl.h"
+#include <algorithm>
+
+namespace rocksdb {
+
+// Allowed block-size range for the arena; kAlignUnit is the pointer size,
+// the alignment granularity used throughout.
+const size_t ArenaImpl::kMinBlockSize = 4096;
+const size_t ArenaImpl::kMaxBlockSize = 2 << 30;
+static const int kAlignUnit = sizeof(void*);
+
+// Clamp block_size into [kMinBlockSize, kMaxBlockSize] and round it up to
+// a multiple of kAlignUnit.
+size_t OptimizeBlockSize(size_t block_size) {
+  // Make sure block_size is in optimal range
+  block_size = std::max(ArenaImpl::kMinBlockSize, block_size);
+  block_size = std::min(ArenaImpl::kMaxBlockSize, block_size);
+
+  // make sure block_size is the multiple of kAlignUnit
+  if (block_size % kAlignUnit != 0) {
+    block_size = (1 + block_size / kAlignUnit) * kAlignUnit;
+  }
+
+  return block_size;
+}
+
+// Fix the per-block size once, after clamping/aligning the request.
+ArenaImpl::ArenaImpl(size_t block_size)
+    : kBlockSize(OptimizeBlockSize(block_size)) {
+  assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize &&
+         kBlockSize % kAlignUnit == 0);
+}
+
+// Release every block; individual allocations are never freed separately.
+ArenaImpl::~ArenaImpl() {
+  for (const auto& block : blocks_) {
+    delete[] block;
+  }
+}
+
+// Slow path when the current block can't satisfy `bytes`: either give the
+// request its own block (large requests) or start a fresh standard block
+// and carve the allocation from the appropriate end.
+char* ArenaImpl::AllocateFallback(size_t bytes, bool aligned) {
+  if (bytes > kBlockSize / 4) {
+    // Object is more than a quarter of our block size.  Allocate it separately
+    // to avoid wasting too much space in leftover bytes.
+    return AllocateNewBlock(bytes);
+  }
+
+  // We waste the remaining space in the current block.
+  auto block_head = AllocateNewBlock(kBlockSize);
+  alloc_bytes_remaining_ = kBlockSize - bytes;
+
+  // Aligned requests grow from the front of the block, unaligned from the
+  // back, so the two kinds don't force extra alignment padding.
+  if (aligned) {
+    aligned_alloc_ptr_ = block_head + bytes;
+    unaligned_alloc_ptr_ = block_head + kBlockSize;
+    return block_head;
+  } else {
+    aligned_alloc_ptr_ = block_head;
+    unaligned_alloc_ptr_ = block_head + kBlockSize - bytes;
+    return unaligned_alloc_ptr_;
+  }
+}
+
+// Allocate `bytes` aligned to kAlignUnit (pointer size): pad the current
+// aligned cursor up to the next boundary, or fall back to a new block.
+char* ArenaImpl::AllocateAligned(size_t bytes) {
+  assert((kAlignUnit & (kAlignUnit - 1)) ==
+         0);  // Pointer size should be a power of 2
+  size_t current_mod =
+      reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
+  size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod);
+  size_t needed = bytes + slop;
+  char* result;
+  if (needed <= alloc_bytes_remaining_) {
+    result = aligned_alloc_ptr_ + slop;
+    aligned_alloc_ptr_ += needed;
+    alloc_bytes_remaining_ -= needed;
+  } else {
+    // AllocateFallback always returned aligned memory
+    result = AllocateFallback(bytes, true /* aligned */);
+  }
+  assert((reinterpret_cast<uintptr_t>(result) & (kAlignUnit - 1)) == 0);
+  return result;
+}
+
+// Allocate a raw block of exactly block_bytes, record it for the
+// destructor, and update the allocation statistics.
+char* ArenaImpl::AllocateNewBlock(size_t block_bytes) {
+  char* block = new char[block_bytes];
+  blocks_memory_ += block_bytes;
+  blocks_.push_back(block);
+  return block;
+}
+
+}  // namespace rocksdb
diff --git a/util/arena_impl.h b/util/arena_impl.h
new file mode 100644 (file)
index 0000000..538385c
--- /dev/null
@@ -0,0 +1,93 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// ArenaImpl is an implementation of Arena class. For a request of small size,
+// it allocates a block with pre-defined block size. For a request of big
+// size, it uses malloc to directly get the requested size.
+
+#pragma once
+#include <cstddef>
+#include <vector>
+#include <assert.h>
+#include <stdint.h>
+#include "rocksdb/arena.h"
+
+namespace rocksdb {
+
+// Block-based arena allocator implementing the Arena interface. Memory is
+// only reclaimed when the whole arena is destroyed.
+class ArenaImpl : public Arena {
+ public:
+  // No copying allowed
+  ArenaImpl(const ArenaImpl&) = delete;
+  void operator=(const ArenaImpl&) = delete;
+
+  static const size_t kMinBlockSize;
+  static const size_t kMaxBlockSize;
+
+  explicit ArenaImpl(size_t block_size = kMinBlockSize);
+  virtual ~ArenaImpl();
+
+  virtual char* Allocate(size_t bytes) override;
+
+  virtual char* AllocateAligned(size_t bytes) override;
+
+  // Returns an estimate of the total memory usage of data allocated
+  // by the arena (exclude the space allocated but not yet used for future
+  // allocations).
+  // NOTE(review): a `const` value return type has no effect, and this one
+  // lacks `override` unlike its siblings — presumably an oversight.
+  virtual const size_t ApproximateMemoryUsage() {
+    return blocks_memory_ + blocks_.capacity() * sizeof(char*) -
+           alloc_bytes_remaining_;
+  }
+
+  virtual const size_t MemoryAllocatedBytes() override {
+    return blocks_memory_;
+  }
+
+ private:
+  // Number of bytes allocated in one block
+  const size_t kBlockSize;
+  // Array of new[] allocated memory blocks
+  typedef std::vector<char*> Blocks;
+  Blocks blocks_;
+
+  // Stats for current active block.
+  // For each block, we allocate aligned memory chucks from one end and
+  // allocate unaligned memory chucks from the other end. Otherwise the
+  // memory waste for alignment will be higher if we allocate both types of
+  // memory from one direction.
+  char* unaligned_alloc_ptr_ = nullptr;
+  char* aligned_alloc_ptr_ = nullptr;
+  // How many bytes left in currently active block?
+  size_t alloc_bytes_remaining_ = 0;
+
+  char* AllocateFallback(size_t bytes, bool aligned);
+  char* AllocateNewBlock(size_t block_bytes);
+
+  // Bytes of memory in blocks allocated so far
+  size_t blocks_memory_ = 0;
+};
+
+// Fast path for unaligned allocation: carve `bytes` from the back of the
+// current block (unaligned cursor moves downward), else fall back.
+inline char* ArenaImpl::Allocate(size_t bytes) {
+  // The semantics of what to return are a bit messy if we allow
+  // 0-byte allocations, so we disallow them here (we don't need
+  // them for our internal use).
+  assert(bytes > 0);
+  if (bytes <= alloc_bytes_remaining_) {
+    unaligned_alloc_ptr_ -= bytes;
+    alloc_bytes_remaining_ -= bytes;
+    return unaligned_alloc_ptr_;
+  }
+  return AllocateFallback(bytes, false /* unaligned */);
+}
+
+// check and adjust the block_size so that the return value is
+//  1. in the range of [kMinBlockSize, kMaxBlockSize].
+//  2. the multiple of align unit.
+extern size_t OptimizeBlockSize(size_t block_size);
+
+}  // namespace rocksdb
diff --git a/util/arena_test.cc b/util/arena_test.cc
new file mode 100644 (file)
index 0000000..ca6dfc9
--- /dev/null
@@ -0,0 +1,137 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/arena_impl.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class ArenaImplTest { };
+
+// Construction/destruction with no allocations must be safe.
+TEST(ArenaImplTest, Empty) {
+  ArenaImpl arena0;
+}
+
+// Verifies MemoryAllocatedBytes(): large requests (> block/4) allocate
+// exactly their own size; small requests share one standard block.
+TEST(ArenaImplTest, MemoryAllocatedBytes) {
+  const int N = 17;
+  size_t req_sz;  //requested size
+  size_t bsz = 8192;  // block size
+  size_t expected_memory_allocated;
+
+  ArenaImpl arena_impl(bsz);
+
+  // requested size > quarter of a block:
+  //   allocate requested size separately
+  req_sz = 3001;
+  for (int i = 0; i < N; i++) {
+    arena_impl.Allocate(req_sz);
+  }
+  expected_memory_allocated = req_sz * N;
+  ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated);
+
+  // requested size < quarter of a block:
+  //   allocate a block with the default size, then try to use unused part
+  //   of the block. So one new block will be allocated for the first
+  //   Allocate(99) call. All the remaining calls won't lead to new allocation.
+  req_sz = 99;
+  for (int i = 0; i < N; i++) {
+    arena_impl.Allocate(req_sz);
+  }
+  expected_memory_allocated += bsz;
+  ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated);
+
+  // requested size > quarter of a block:
+  //   allocate requested size separately
+  req_sz = 99999999;
+  for (int i = 0; i < N; i++) {
+    arena_impl.Allocate(req_sz);
+  }
+  expected_memory_allocated += req_sz * N;
+  ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated);
+}
+
+// Make sure we didn't count the allocate but not used memory space in
+// Arena::ApproximateMemoryUsage()
+// Make sure we didn't count the allocate but not used memory space in
+// Arena::ApproximateMemoryUsage()
+TEST(ArenaImplTest, ApproximateMemoryUsageTest) {
+  const size_t kBlockSize = 4096;
+  const size_t kEntrySize = kBlockSize / 8;
+	const size_t kZero = 0;
+  ArenaImpl arena(kBlockSize);
+  ASSERT_EQ(kZero, arena.ApproximateMemoryUsage());
+
+  auto num_blocks = kBlockSize / kEntrySize;
+
+  // first allocation
+  arena.AllocateAligned(kEntrySize);
+  auto mem_usage = arena.MemoryAllocatedBytes();
+  ASSERT_EQ(mem_usage, kBlockSize);
+  auto usage = arena.ApproximateMemoryUsage();
+  ASSERT_LT(usage, mem_usage);
+  // Each further allocation should grow usage by exactly kEntrySize while
+  // MemoryAllocatedBytes stays flat (same block is being consumed).
+  for (size_t i = 1; i < num_blocks; ++i) {
+    arena.AllocateAligned(kEntrySize);
+    ASSERT_EQ(mem_usage, arena.MemoryAllocatedBytes());
+    ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize);
+    usage = arena.ApproximateMemoryUsage();
+  }
+  ASSERT_GT(usage, mem_usage);
+}
+
+// Randomized stress test: many mixed-size aligned/unaligned allocations,
+// each filled with a per-allocation pattern and later re-verified, to catch
+// overlapping or reused memory; also bounds the arena's bookkeeping.
+TEST(ArenaImplTest, Simple) {
+  std::vector<std::pair<size_t, char*>> allocated;
+  ArenaImpl arena_impl;
+  const int N = 100000;
+  size_t bytes = 0;
+  Random rnd(301);
+  for (int i = 0; i < N; i++) {
+    size_t s;
+    if (i % (N / 10) == 0) {
+      s = i;
+    } else {
+      s = rnd.OneIn(4000)
+              ? rnd.Uniform(6000)
+              : (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20));
+    }
+    if (s == 0) {
+      // Our arena disallows size 0 allocations.
+      s = 1;
+    }
+    char* r;
+    if (rnd.OneIn(10)) {
+      r = arena_impl.AllocateAligned(s);
+    } else {
+      r = arena_impl.Allocate(s);
+    }
+
+    for (unsigned int b = 0; b < s; b++) {
+      // Fill the "i"th allocation with a known bit pattern
+      r[b] = i % 256;
+    }
+    bytes += s;
+    allocated.push_back(std::make_pair(s, r));
+    ASSERT_GE(arena_impl.ApproximateMemoryUsage(), bytes);
+    if (i > N / 10) {
+      ASSERT_LE(arena_impl.ApproximateMemoryUsage(), bytes * 1.10);
+    }
+  }
+  for (unsigned int i = 0; i < allocated.size(); i++) {
+    size_t num_bytes = allocated[i].first;
+    const char* p = allocated[i].second;
+    for (unsigned int b = 0; b < num_bytes; b++) {
+      // Check the "i"th allocation for the known bit pattern
+      ASSERT_EQ(int(p[b]) & 0xff, (int)(i % 256));
+    }
+  }
+}
+
+}  // namespace rocksdb
+
+// Test harness entry point.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/auto_roll_logger.cc b/util/auto_roll_logger.cc
new file mode 100644 (file)
index 0000000..95f2fae
--- /dev/null
@@ -0,0 +1,108 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "util/auto_roll_logger.h"
+#include "util/mutexlock.h"
+
+using namespace std;
+
+namespace rocksdb {
+
+// -- AutoRollLogger
+Status AutoRollLogger::ResetLogger() {
+  status_ = env_->NewLogger(log_fname_, &logger_);
+
+  if (!status_.ok()) {
+    return status_;
+  }
+
+  if (logger_->GetLogFileSize() ==
+      (size_t)Logger::DO_NOT_SUPPORT_GET_LOG_FILE_SIZE) {
+    status_ = Status::NotSupported(
+        "The underlying logger doesn't support GetLogFileSize()");
+  }
+  if (status_.ok()) {
+    cached_now = static_cast<uint64_t>(env_->NowMicros() * 1e-6);
+    ctime_ = cached_now;
+    cached_now_access_count = 0;
+  }
+
+  return status_;
+}
+
+void AutoRollLogger::RollLogFile() {
+  std::string old_fname = OldInfoLogFileName(
+      dbname_, env_->NowMicros(), db_absolute_path_, db_log_dir_);
+  env_->RenameFile(log_fname_, old_fname);
+}
+
// Writes one record. If either rolling trigger (age or size) has fired, the
// current file is archived and a fresh one opened before the write. Only the
// roll decision is made under the mutex; the write itself is not.
void AutoRollLogger::Logv(const char* format, va_list ap) {
  assert(GetStatus().ok());

  std::shared_ptr<Logger> logger;
  {
    MutexLock l(&mutex_);
    // A threshold of 0 disables the corresponding trigger.
    if ((kLogFileTimeToRoll > 0 && LogExpired()) ||
        (kMaxLogFileSize > 0 && logger_->GetLogFileSize() >= kMaxLogFileSize)) {
      RollLogFile();
      ResetLogger();
    }

    // pin down the current logger_ instance before releasing the mutex.
    logger = logger_;
  }

  // Another thread could have put a new Logger instance into logger_ by now.
  // However, since logger is still hanging on to the previous instance
  // (reference count is not zero), we don't have to worry about it being
  // deleted while we are accessing it.
  // Note that logv itself is not mutex protected to allow maximum concurrency,
  // as thread safety should have been handled by the underlying logger.
  logger->Logv(format, ap);
}
+
// Returns true once the active log file is at least kLogFileTimeToRoll
// seconds old. To avoid calling env_->NowMicros() on every record, the
// current time is cached and only refreshed once every
// call_NowMicros_every_N_records_ invocations.
bool AutoRollLogger::LogExpired() {
  if (cached_now_access_count >= call_NowMicros_every_N_records_) {
    // Refresh the cached wall-clock time (seconds since epoch).
    cached_now = static_cast<uint64_t>(env_->NowMicros() * 1e-6);
    cached_now_access_count = 0;
  }

  ++cached_now_access_count;
  return cached_now >= ctime_ + kLogFileTimeToRoll;
}
+
+Status CreateLoggerFromOptions(
+    const std::string& dbname,
+    const std::string& db_log_dir,
+    Env* env,
+    const Options& options,
+    std::shared_ptr<Logger>* logger) {
+  std::string db_absolute_path;
+  env->GetAbsolutePath(dbname, &db_absolute_path);
+  std::string fname = InfoLogFileName(dbname, db_absolute_path, db_log_dir);
+
+  // Currently we only support roll by time-to-roll and log size
+  if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) {
+    AutoRollLogger* result = new AutoRollLogger(
+        env, dbname, db_log_dir,
+        options.max_log_file_size,
+        options.log_file_time_to_roll);
+    Status s = result->GetStatus();
+    if (!s.ok()) {
+      delete result;
+    } else {
+      logger->reset(result);
+    }
+    return s;
+  } else {
+    // Open a log file in the same directory as the db
+    env->CreateDir(dbname);  // In case it does not exist
+    env->RenameFile(fname, OldInfoLogFileName(dbname, env->NowMicros(),
+                                              db_absolute_path, db_log_dir));
+    return env->NewLogger(fname, logger);
+  }
+}
+
+}  // namespace rocksdb
diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h
new file mode 100644 (file)
index 0000000..db70f15
--- /dev/null
@@ -0,0 +1,90 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Logger implementation that can be shared by all environments
+// where enough posix functionality is available.
+
+#pragma once
+#include "db/filename.h"
+#include "port/port.h"
+#include "util/posix_logger.h"
+
+namespace rocksdb {
+
// Rolls the log file by size and/or time: when the active LOG file exceeds
// log_max_size bytes or log_file_time_to_roll seconds of age, it is renamed
// to a timestamped "old" name and a fresh LOG is opened. A threshold of 0
// disables the corresponding trigger.
class AutoRollLogger : public Logger {
 public:
  AutoRollLogger(Env* env, const std::string& dbname,
                 const std::string& db_log_dir,
                 size_t log_max_size,
                 size_t log_file_time_to_roll):
     dbname_(dbname),
     db_log_dir_(db_log_dir),
     env_(env),
     status_(Status::OK()),
     kMaxLogFileSize(log_max_size),
     kLogFileTimeToRoll(log_file_time_to_roll),
     cached_now(static_cast<uint64_t>(env_->NowMicros() * 1e-6)),
     ctime_(cached_now),
     cached_now_access_count(0),
     call_NowMicros_every_N_records_(100),
     mutex_() {
    env->GetAbsolutePath(dbname, &db_absolute_path_);
    log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_);
    // Archive any LOG left over from a previous run, then open a new one.
    RollLogFile();
    ResetLogger();
  }

  // Rolls (if a trigger fired) and writes one record.
  void Logv(const char* format, va_list ap);

  // check if the logger has encountered any problem.
  Status GetStatus() {
    return status_;
  }

  size_t GetLogFileSize() const {
    return logger_->GetLogFileSize();
  }

  virtual ~AutoRollLogger() {
  }

  // Test hook: how often LogExpired() refreshes its cached clock
  // (0 means refresh on every record).
  void SetCallNowMicrosEveryNRecords(uint64_t call_NowMicros_every_N_records) {
    call_NowMicros_every_N_records_ = call_NowMicros_every_N_records;
  }

 private:

  bool LogExpired();
  Status ResetLogger();
  void RollLogFile();

  std::string log_fname_; // Current active info log's file name.
  std::string dbname_;
  std::string db_log_dir_;
  std::string db_absolute_path_;
  Env* env_;
  std::shared_ptr<Logger> logger_;
  // current status of the logger
  Status status_;
  const size_t kMaxLogFileSize;     // 0 disables size-based rolling
  const size_t kLogFileTimeToRoll;  // seconds; 0 disables time-based rolling
  // to avoid frequent env->NowMicros() calls, we cached the current time
  uint64_t cached_now;
  uint64_t ctime_;  // creation time (seconds) of the active log file
  uint64_t cached_now_access_count;
  uint64_t call_NowMicros_every_N_records_;
  port::Mutex mutex_;
};
+
// Facade to create a logger automatically
+Status CreateLoggerFromOptions(
+    const std::string& dbname,
+    const std::string& db_log_dir,
+    Env* env,
+    const Options& options,
+    std::shared_ptr<Logger>* logger);
+
+}  // namespace rocksdb
diff --git a/util/auto_roll_logger_test.cc b/util/auto_roll_logger_test.cc
new file mode 100755 (executable)
index 0000000..2fd2c51
--- /dev/null
@@ -0,0 +1,262 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <string>
+#include <cmath>
+#include "util/testharness.h"
+#include "util/auto_roll_logger.h"
+#include "rocksdb/db.h"
+#include <sys/stat.h>
+#include <errno.h>
+#include <iostream>
+
+using namespace std;
+
+namespace rocksdb {
+
// Test fixture for AutoRollLogger. Provides helpers that drive a logger past
// its size / time rolling thresholds and verify that a roll happened.
class AutoRollLoggerTest {
 public:
  // Wipes and recreates the test directory so each case starts clean.
  static void InitTestDb() {
    string deleteCmd = "rm -rf " + kTestDir;
    ASSERT_TRUE(system(deleteCmd.c_str()) == 0);
    Env::Default()->CreateDir(kTestDir);
  }

  // Logs messages until the size threshold is crossed, then checks a roll
  // happened.
  void RollLogFileBySizeTest(AutoRollLogger* logger,
                             size_t log_max_size,
                             const string& log_message);
  // Logs, sleeps `time` seconds, then checks a time-based roll happened.
  // Returns the creation time of the freshly rolled file.
  uint64_t RollLogFileByTimeTest(AutoRollLogger* logger,
                                 size_t time,
                                 const string& log_message);

  static const string kSampleMessage;
  static const string kTestDir;
  static const string kLogFile;
  static Env* env;
};
+
// Definitions of the static fixture members (shared by all test cases).
const string AutoRollLoggerTest::kSampleMessage(
    "this is the message to be written to the log file!!");
const string AutoRollLoggerTest::kTestDir(
    test::TmpDir() + "/db_log_test");
const string AutoRollLoggerTest::kLogFile(
    test::TmpDir() + "/db_log_test/LOG");
Env* AutoRollLoggerTest::env = Env::Default();
+
// In these tests we only want to log simple messages with no format
// arguments. LogMessage() provides such an interface and, by passing the
// message through "%s", avoids the [format-security] warning that occurs
// when calling Log(logger, log_message) directly.
void LogMessage(Logger* logger, const char* message) {
  Log(logger, "%s", message);
}
+
// Stores the creation (status-change) time of `fname` into `*file_ctime`.
// If stat() fails (e.g. the file does not exist), *file_ctime is set to 0.
void GetFileCreateTime(const std::string& fname, uint64_t* file_ctime) {
  struct stat s;
  if (stat(fname.c_str(), &s) != 0) {
    *file_ctime = (uint64_t)0;
    // Bug fix: without this return we would fall through and overwrite the
    // 0 sentinel with a value read from the uninitialized stat buffer.
    return;
  }
  *file_ctime = static_cast<uint64_t>(s.st_ctime);
}
+
// Writes messages until the configured max size would be exceeded, then
// verifies that the file is rolled (its size drops back to one message).
void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger,
                                               size_t log_max_size,
                                               const string& log_message) {
  // measure the size of each message, which is supposed
  // to be equal or greater than log_message.size()
  LogMessage(logger, log_message.c_str());
  size_t message_size = logger->GetLogFileSize();
  size_t current_log_size = message_size;

  // Test the cases when the log file will not be rolled.
  while (current_log_size + message_size < log_max_size) {
    LogMessage(logger, log_message.c_str());
    current_log_size += message_size;
    ASSERT_EQ(current_log_size, logger->GetLogFileSize());
  }

  // Now the log file will be rolled
  LogMessage(logger, log_message.c_str());
  // Since rotation is checked before actual logging, we need to
  // trigger the rotation by logging another message.
  LogMessage(logger, log_message.c_str());

  ASSERT_TRUE(message_size == logger->GetLogFileSize());
}
+
// Writes several records quickly (expecting no roll), then sleeps past the
// time threshold and verifies the next write lands in a freshly created
// file. Returns the new file's creation time.
uint64_t AutoRollLoggerTest::RollLogFileByTimeTest(
    AutoRollLogger* logger, size_t time, const string& log_message) {
  uint64_t expected_create_time;
  uint64_t actual_create_time;
  uint64_t total_log_size;
  ASSERT_OK(env->GetFileSize(kLogFile, &total_log_size));
  GetFileCreateTime(kLogFile, &expected_create_time);
  // Force LogExpired() to consult the real clock on every record.
  logger->SetCallNowMicrosEveryNRecords(0);

  // -- Write to the log for several times, which is supposed
  // to be finished before time.
  for (int i = 0; i < 10; ++i) {
     LogMessage(logger, log_message.c_str());
     ASSERT_OK(logger->GetStatus());
     // Make sure we always write to the same log file (by
     // checking the create time);
     GetFileCreateTime(kLogFile, &actual_create_time);

     // Also make sure the log size is increasing.
     ASSERT_EQ(expected_create_time, actual_create_time);
     ASSERT_GT(logger->GetLogFileSize(), total_log_size);
     total_log_size = logger->GetLogFileSize();
  }

  // -- Make the log file expire
  sleep(time);
  LogMessage(logger, log_message.c_str());

  // At this time, the new log file should be created.
  GetFileCreateTime(kLogFile, &actual_create_time);
  ASSERT_GT(actual_create_time, expected_create_time);
  ASSERT_LT(logger->GetLogFileSize(), total_log_size);
  expected_create_time = actual_create_time;

  return expected_create_time;
}
+
// Rolling triggered purely by file size (time trigger disabled with 0).
TEST(AutoRollLoggerTest, RollLogFileBySize) {
    InitTestDb();
    size_t log_max_size = 1024 * 5;

    AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0);

    RollLogFileBySizeTest(&logger, log_max_size,
                          kSampleMessage + ":RollLogFileBySize");

}
+
// Rolling triggered purely by file age; also checks the LOG file is
// created as a side effect of constructing the logger.
TEST(AutoRollLoggerTest, RollLogFileByTime) {
    size_t time = 1;
    size_t log_size = 1024 * 5;

    InitTestDb();
    // -- Test the existence of file during the server restart.
    ASSERT_TRUE(!env->FileExists(kLogFile));
    AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 1);
    ASSERT_TRUE(env->FileExists(kLogFile));

    RollLogFileByTimeTest(&logger, time, kSampleMessage + ":RollLogFileByTime");
}
+
TEST(AutoRollLoggerTest,
     OpenLogFilesMultipleTimesWithOptionLog_max_size) {
  // If only 'log_max_size' options is specified, then every time
  // when rocksdb is restarted, a new empty log file will be created.
  InitTestDb();
  // WORKAROUND:
  // avoid compiler's complaint of "comparison between signed
  // and unsigned integer expressions" because literal 0 is
  // treated as "signed".
  size_t kZero = 0;
  size_t log_size = 1024;

  AutoRollLogger* logger = new AutoRollLogger(
    Env::Default(), kTestDir, "", log_size, 0);

  LogMessage(logger, kSampleMessage.c_str());
  ASSERT_GT(logger->GetLogFileSize(), kZero);
  delete logger;

  // reopens the log file and an empty log file will be created.
  logger = new AutoRollLogger(
    Env::Default(), kTestDir, "", log_size, 0);
  ASSERT_EQ(logger->GetLogFileSize(), kZero);
  delete logger;
}
+
// Both triggers enabled at once: each must still fire independently.
TEST(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) {
  size_t time = 1, log_max_size = 1024 * 5;

  InitTestDb();

  AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, time);

  // Test the ability to roll by size
  RollLogFileBySizeTest(
      &logger, log_max_size,
      kSampleMessage + ":CompositeRollByTimeAndSizeLogger");

  // Test the ability to roll by Time
  RollLogFileByTimeTest( &logger, time,
      kSampleMessage + ":CompositeRollByTimeAndSizeLogger");
}
+
+TEST(AutoRollLoggerTest, CreateLoggerFromOptions) {
+  Options options;
+  shared_ptr<Logger> logger;
+
+  // Normal logger
+  ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger));
+  ASSERT_TRUE(dynamic_cast<PosixLogger*>(logger.get()));
+
+  // Only roll by size
+  InitTestDb();
+  options.max_log_file_size = 1024;
+  ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger));
+  AutoRollLogger* auto_roll_logger =
+    dynamic_cast<AutoRollLogger*>(logger.get());
+  ASSERT_TRUE(auto_roll_logger);
+  RollLogFileBySizeTest(
+      auto_roll_logger, options.max_log_file_size,
+      kSampleMessage + ":CreateLoggerFromOptions - size");
+
+  // Only roll by Time
+  InitTestDb();
+  options.max_log_file_size = 0;
+  options.log_file_time_to_roll = 1;
+  ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger));
+  auto_roll_logger =
+    dynamic_cast<AutoRollLogger*>(logger.get());
+  RollLogFileByTimeTest(
+      auto_roll_logger, options.log_file_time_to_roll,
+      kSampleMessage + ":CreateLoggerFromOptions - time");
+
+  // roll by both Time and size
+  InitTestDb();
+  options.max_log_file_size = 1024 * 5;
+  options.log_file_time_to_roll = 1;
+  ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger));
+  auto_roll_logger =
+    dynamic_cast<AutoRollLogger*>(logger.get());
+  RollLogFileBySizeTest(
+      auto_roll_logger, options.max_log_file_size,
+      kSampleMessage + ":CreateLoggerFromOptions - both");
+  RollLogFileByTimeTest(
+      auto_roll_logger, options.log_file_time_to_roll,
+      kSampleMessage + ":CreateLoggerFromOptions - both");
+}
+
+int OldLogFileCount(const string& dir) {
+  std::vector<std::string> files;
+  Env::Default()->GetChildren(dir, &files);
+  int log_file_count = 0;
+
+  for (std::vector<std::string>::iterator it = files.begin();
+       it != files.end(); ++it) {
+    uint64_t create_time;
+    FileType type;
+    if (!ParseFileName(*it, &create_time, &type)) {
+      continue;
+    }
+    if (type == kInfoLogFile && create_time > 0) {
+      ++log_file_count;
+    }
+  }
+
+  return log_file_count;
+}
+
+}  // namespace rocksdb
+
// Test entry point: runs every TEST registered in this file.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
diff --git a/util/autovector.h b/util/autovector.h
new file mode 100644 (file)
index 0000000..9998e29
--- /dev/null
@@ -0,0 +1,329 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <stdexcept>
+#include <iterator>
+#include <vector>
+
+namespace rocksdb {
+
+// A vector that leverages pre-allocated stack-based array to achieve better
+// performance for array with small amount of items.
+//
+// The interface resembles that of vector, but with less features since we aim
+// to solve the problem that we have in hand, rather than implementing a
+// full-fledged generic container.
+//
+// Currently we don't support:
+//  * reserve()/shrink_to_fit()/resize()
+//     If used correctly, in most cases, people should not touch the
+//     underlying vector at all.
+//  * random insert()/erase(), please only use push_back()/pop_back().
+//  * No move/swap operations. Each autovector instance has a
+//     stack-allocated array and if we want support move/swap operations, we
+//     need to copy the arrays other than just swapping the pointers. In this
+//     case we'll just explicitly forbid these operations since they may
+//     lead users to make false assumption by thinking they are inexpensive
+//     operations.
+//
+// Naming style of public methods almost follows that of the STL's.
template <class T, size_t kSize = 8>
class autovector {
 public:
  // General STL-style container member types.
  typedef T value_type;
  typedef typename std::vector<T>::difference_type difference_type;
  typedef typename std::vector<T>::size_type size_type;
  typedef value_type& reference;
  typedef const value_type& const_reference;
  typedef value_type* pointer;
  typedef const value_type* const_pointer;

  // This class is the base for regular/const iterator. It stores a pointer
  // back to the container plus an index, so it stays valid across the
  // stack-to-heap transition.
  template <class TAutoVector, class TValueType>
  class iterator_impl {
   public:
    // -- iterator traits
    typedef iterator_impl<TAutoVector, TValueType> self_type;
    typedef TValueType value_type;
    typedef TValueType& reference;
    typedef TValueType* pointer;
    typedef typename TAutoVector::difference_type difference_type;
    typedef std::random_access_iterator_tag iterator_category;

    iterator_impl(TAutoVector* vect, size_t index)
      : vect_(vect)
      , index_(index) {
    };
    iterator_impl(const iterator_impl&) = default;
    ~iterator_impl() { }
    iterator_impl& operator=(const iterator_impl&) = default;

    // -- Advancement
    // iterator++
    self_type& operator++() {
      ++index_;
      return *this;
    }

    // ++iterator
    self_type operator++(int) {
      auto old = *this;
      ++index_;
      return old;
    }

    // iterator--
    self_type& operator--() {
      --index_;
      return *this;
    }

    // --iterator
    self_type operator--(int) {
      auto old = *this;
      --index_;
      return old;
    }

    self_type operator-(difference_type len) {
      return self_type(vect_, index_ - len);
    }

    difference_type operator-(const self_type& other) {
      assert(vect_ == other.vect_);
      return index_ - other.index_;
    }

    self_type operator+(difference_type len) {
      return self_type(vect_, index_ + len);
    }

    self_type& operator+=(difference_type len) {
      index_ += len;
      return *this;
    }

    self_type& operator-=(difference_type len) {
      index_ -= len;
      return *this;
    }

    // -- Reference
    reference operator*() {
      assert(vect_->size() >= index_);
      return (*vect_)[index_];
    }
    pointer operator->() {
      assert(vect_->size() >= index_);
      return &(*vect_)[index_];
    }

    // -- Logical Operators
    bool operator==(const self_type& other) const {
      assert(vect_ == other.vect_);
      return index_ == other.index_;
    }

    bool operator!=(const self_type& other) const {
      return !(*this == other);
    }

    bool operator>(const self_type& other) const {
      assert(vect_ == other.vect_);
      return index_ > other.index_;
    }

    bool operator<(const self_type& other) const {
      assert(vect_ == other.vect_);
      return index_ < other.index_;
    }

    bool operator>=(const self_type& other) const {
      assert(vect_ == other.vect_);
      return index_ >= other.index_;
    }

    bool operator<=(const self_type& other) const {
      assert(vect_ == other.vect_);
      return index_ <= other.index_;
    }

   private:
    TAutoVector* vect_ = nullptr;
    size_t index_ = 0;
  };

  typedef iterator_impl<autovector, value_type> iterator;
  typedef iterator_impl<const autovector, const value_type> const_iterator;
  typedef std::reverse_iterator<iterator> reverse_iterator;
  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;

  autovector() = default;
  ~autovector() = default;

  // -- Immutable operations
  // Indicate if all data resides in in-stack data structure.
  bool only_in_stack() const {
    // If no element was inserted at all, the vector's capacity will be `0`.
    return vect_.capacity() == 0;
  }

  size_type size() const {
    return num_stack_items_ + vect_.size();
  }

  bool empty() const {
    return size() == 0;
  }

  // will not check boundary
  const_reference operator[](size_type n) const {
    return n < kSize ? values_[n] : vect_[n - kSize];
  }

  reference operator[](size_type n) {
    return n < kSize ? values_[n] : vect_[n - kSize];
  }

  // will check boundary and throw std::out_of_range on violation
  const_reference at(size_type n) const {
    if (n >= size()) {
      throw std::out_of_range("autovector: index out of range");
    }
    return (*this)[n];
  }

  reference at(size_type n) {
    if (n >= size()) {
      throw std::out_of_range("autovector: index out of range");
    }
    return (*this)[n];
  }

  reference front() {
    assert(!empty());
    return *begin();
  }

  const_reference front() const {
    assert(!empty());
    return *begin();
  }

  reference back() {
    assert(!empty());
    return *(end() - 1);
  }

  const_reference back() const {
    assert(!empty());
    return *(end() - 1);
  }

  // -- Mutable Operations
  void push_back(T&& item) {
    if (num_stack_items_ < kSize) {
      values_[num_stack_items_++] = std::move(item);
    } else {
      // Bug fix: `item` is an lvalue here, so without std::move() the
      // element would be *copied* into the heap vector instead of moved.
      vect_.push_back(std::move(item));
    }
  }

  void push_back(const T& item) {
    push_back(value_type(item));
  }

  template<class... Args>
  void emplace_back(Args&&... args) {
    // Bug fix: perfect-forward the arguments so rvalues are moved rather
    // than copied into the newly constructed element.
    push_back(value_type(std::forward<Args>(args)...));
  }

  void pop_back() {
    assert(!empty());
    if (!vect_.empty()) {
      vect_.pop_back();
    } else {
      --num_stack_items_;
    }
  }

  void clear() {
    num_stack_items_ = 0;
    vect_.clear();
  }

  // -- Copy and Assignment
  autovector& assign(const autovector& other);

  autovector(const autovector& other) {
    assign(other);
  }

  autovector& operator=(const autovector& other) {
    return assign(other);
  }

  // move operation are disallowed since it is very hard to make sure both
  // autovectors are allocated from the same function stack.
  autovector& operator=(autovector&& other) = delete;
  autovector(autovector&& other) = delete;

  // -- Iterator Operations
  iterator begin() {
    return iterator(this, 0);
  }

  const_iterator begin() const {
    return const_iterator(this, 0);
  }

  iterator end() {
    return iterator(this, this->size());
  }

  const_iterator end() const {
    return const_iterator(this, this->size());
  }

  reverse_iterator rbegin() {
    return reverse_iterator(end());
  }

  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(end());
  }

  reverse_iterator rend() {
    return reverse_iterator(begin());
  }

  const_reverse_iterator rend() const {
    return const_reverse_iterator(begin());
  }

 private:
  size_type num_stack_items_ = 0; // current number of items
  value_type values_[kSize]; // the first `kSize` items
  // used only if there are more than `kSize` items.
  std::vector<T> vect_;
};

// Copies `other`'s contents into *this: the heap tail via vector::assign and
// the in-stack prefix via std::copy.
template <class T, size_t kSize>
autovector<T, kSize>& autovector<T, kSize>::assign(const autovector& other) {
  // copy the internal vector
  vect_.assign(other.vect_.begin(), other.vect_.end());

  // copy array
  num_stack_items_ = other.num_stack_items_;
  std::copy(other.values_, other.values_ + num_stack_items_, values_);

  return *this;
}
+
+}  // rocksdb
diff --git a/util/autovector_test.cc b/util/autovector_test.cc
new file mode 100644 (file)
index 0000000..eb244aa
--- /dev/null
@@ -0,0 +1,290 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <atomic>
+#include <iostream>
+
+#include "rocksdb/env.h"
+#include "util/autovector.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+using namespace std;
+
// Empty fixture class required by the TEST macro in util/testharness.h.
class AutoVectorTest { };

// Stack capacity used for every autovector instantiated in these tests.
const unsigned long kSize = 8;
// Verifies size/emptiness/indexing across the stack-to-heap transition and
// that pop_back shrinks the container back down.
TEST(AutoVectorTest, PushBackAndPopBack) {
  autovector<size_t, kSize> vec;
  ASSERT_TRUE(vec.empty());
  ASSERT_EQ(0ul, vec.size());

  for (size_t i = 0; i < 1000 * kSize; ++i) {
    vec.push_back(i);
    ASSERT_TRUE(!vec.empty());
    if (i < kSize) {
      ASSERT_TRUE(vec.only_in_stack());
    } else {
      ASSERT_TRUE(!vec.only_in_stack());
    }
    ASSERT_EQ(i + 1, vec.size());
    ASSERT_EQ(i, vec[i]);
    ASSERT_EQ(i, vec.at(i));
  }

  size_t size = vec.size();
  while (size != 0) {
    vec.pop_back();
    // will always be in heap
    ASSERT_TRUE(!vec.only_in_stack());
    ASSERT_EQ(--size, vec.size());
  }

  ASSERT_TRUE(vec.empty());
}
+
// Verifies emplace_back across the stack-to-heap transition using pair
// elements constructed in place.
TEST(AutoVectorTest, EmplaceBack) {
  typedef std::pair<size_t, std::string> ValueType;
  autovector<ValueType, kSize> vec;

  for (size_t i = 0; i < 1000 * kSize; ++i) {
    vec.emplace_back(i, std::to_string(i + 123));
    ASSERT_TRUE(!vec.empty());
    if (i < kSize) {
      ASSERT_TRUE(vec.only_in_stack());
    } else {
      ASSERT_TRUE(!vec.only_in_stack());
    }

    ASSERT_EQ(i + 1, vec.size());
    ASSERT_EQ(i, vec[i].first);
    ASSERT_EQ(std::to_string(i + 123), vec[i].second);
  }

  vec.clear();
  ASSERT_TRUE(vec.empty());
  ASSERT_TRUE(!vec.only_in_stack());
}
+
// Asserts that two autovectors have identical contents and storage state
// (same size, emptiness, stack/heap placement, and elements).
void AssertEqual(
    const autovector<size_t, kSize>& a, const autovector<size_t, kSize>& b) {
  ASSERT_EQ(a.size(), b.size());
  ASSERT_EQ(a.empty(), b.empty());
  ASSERT_EQ(a.only_in_stack(), b.only_in_stack());
  for (size_t i = 0; i < a.size(); ++i) {
    ASSERT_EQ(a[i], b[i]);
  }
}
+
// Copy construction and copy assignment must both preserve contents and
// storage state, for stack-only and heap-spilled vectors alike.
TEST(AutoVectorTest, CopyAndAssignment) {
  // Test both heap-allocated and stack-allocated cases.
  for (auto size : { kSize / 2, kSize * 1000 }) {
    autovector<size_t, kSize> vec;
    for (size_t i = 0; i < size; ++i) {
      vec.push_back(i);
    }

    {
      autovector<size_t, kSize> other;
      other = vec;
      AssertEqual(other, vec);
    }

    {
      autovector<size_t, kSize> other(vec);
      AssertEqual(other, vec);
    }
  }
}
+
// Exercises the custom random-access iterators: forward/reverse traversal,
// const and non-const variants, and pointer arithmetic.
TEST(AutoVectorTest, Iterators) {
  autovector<std::string, kSize> vec;
  for (size_t i = 0; i < kSize * 1000; ++i) {
    vec.push_back(std::to_string(i));
  }

  // basic operator test
  ASSERT_EQ(vec.front(), *vec.begin());
  ASSERT_EQ(vec.back(), *(vec.end() - 1));
  ASSERT_TRUE(vec.begin() < vec.end());

  // non-const iterator
  size_t index = 0;
  for (const auto& item : vec) {
    ASSERT_EQ(vec[index++], item);
  }

  index = vec.size() - 1;
  for (auto pos = vec.rbegin(); pos != vec.rend(); ++pos) {
    ASSERT_EQ(vec[index--], *pos);
  }

  // const iterator
  const auto& cvec = vec;
  index = 0;
  for (const auto& item : cvec) {
    ASSERT_EQ(cvec[index++], item);
  }

  index = vec.size() - 1;
  for (auto pos = cvec.rbegin(); pos != cvec.rend(); ++pos) {
    ASSERT_EQ(cvec[index--], *pos);
  }

  // forward and backward
  auto pos = vec.begin();
  while (pos != vec.end()) {
    auto old_val = *pos;
    auto old = pos++;
    // HACK: make sure -> works
    ASSERT_TRUE(!old->empty());
    ASSERT_EQ(old_val, *old);
    // every element is distinct, so advancing must change the value
    ASSERT_TRUE(pos == vec.end() || old_val != *pos);
  }

  pos = vec.begin();
  for (size_t i = 0; i < vec.size(); i += 2) {
    // Cannot use ASSERT_EQ since that macro depends on iostream serialization
    ASSERT_TRUE(pos + 2 - 2 == pos);
    pos += 2;
    ASSERT_TRUE(pos >= vec.begin());
    ASSERT_TRUE(pos <= vec.end());

    size_t diff = static_cast<size_t>(pos - vec.begin());
    ASSERT_EQ(i + 2, diff);
  }
}
+
// Produces `size` distinct keys of the form "item-0", "item-1", ...
std::vector<std::string> GetTestKeys(size_t size) {
  std::vector<std::string> keys(size);

  size_t next_id = 0;
  for (auto& key : keys) {
    key = "item-" + std::to_string(next_id++);
  }
  return keys;
}
+
// Benchmark helper: constructs `ops` fresh TVector instances and pushes
// `item_size` pre-generated items into each, then prints the elapsed wall
// time (nanoseconds) to stdout.
template<class TVector>
void BenchmarkVectorCreationAndInsertion(
    string name, size_t ops, size_t item_size,
    const std::vector<typename TVector::value_type>& items) {
  auto env = Env::Default();

  int index = 0;
  auto start_time = env->NowNanos();
  auto ops_remaining = ops;
  while(ops_remaining--) {
    TVector v;
    for (size_t i = 0; i < item_size; ++i) {
      v.push_back(items[index++]);
    }
  }
  auto elapsed = env->NowNanos() - start_time;
  cout << "created " << ops << " " << name << " instances:\n\t"
       << "each was inserted with " << item_size << " elements\n\t"
       << "total time elapsed: " << elapsed << " (ns)" << endl;
}
+
// Benchmark helper: iterates a pre-filled TVector `ops` times, summing the
// element sizes, and prints the elapsed time. Returns the accumulated sum so
// the compiler cannot optimize the loop away.
template <class TVector>
size_t BenchmarkSequenceAccess(string name, size_t ops, size_t elem_size) {
  TVector v;
  for (const auto& item : GetTestKeys(elem_size)) {
    v.push_back(item);
  }
  auto env = Env::Default();

  auto ops_remaining = ops;
  auto start_time = env->NowNanos();
  size_t total = 0;
  while (ops_remaining--) {
    auto end = v.end();
    for (auto pos = v.begin(); pos != end; ++pos) {
      total += pos->size();
    }
  }
  auto elapsed = env->NowNanos() - start_time;
  cout << "performed " << ops << " sequence access against " << name << "\n\t"
       << "size: " << elem_size << "\n\t"
       << "total time elapsed: " << elapsed << " (ns)" << endl;
  // HACK avoid compiler's optimization to ignore total
  return total;
}
+
+// This test case only reports the performance between std::vector<string>
+// and autovector<string>. We chose string for comparison because in most
// of our use cases we used std::vector<string>.
// Informational benchmark (prints timings; has no assertions): compares
// std::vector against autovector for creation+insertion and for sequential
// access, around the kSize stack/heap boundary.
TEST(AutoVectorTest, PerfBench) {
  // We run same operations for kOps times in order to get a more fair result.
  size_t kOps = 100000;

  // Creation and insertion test
  // Test the case when there is:
  //  * no element inserted: internal array of std::vector may not really get
  //    initialize.
  //  * one element inserted: internal array of std::vector must have
  //    initialized.
  //  * kSize elements inserted. This shows the most time we'll spend if we
  //    keep everything in stack.
  //  * 2 * kSize elements inserted. The internal vector of
  //    autovector must have been initialized.
  cout << "=====================================================" << endl;
  cout << "Creation and Insertion Test (value type: std::string)" << endl;
  cout << "=====================================================" << endl;

  // pre-generated unique keys
  auto string_keys = GetTestKeys(kOps * 2 * kSize);
  for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) {
    BenchmarkVectorCreationAndInsertion<vector<string>>(
      "vector<string>", kOps, insertions, string_keys
    );
    BenchmarkVectorCreationAndInsertion<autovector<string, kSize>>(
      "autovector<string>", kOps, insertions, string_keys
    );
    cout << "-----------------------------------" << endl;
  }

  cout << "=====================================================" << endl;
  cout << "Creation and Insertion Test (value type: uint64_t)" << endl;
  cout << "=====================================================" << endl;

  // pre-generated unique keys
  vector<uint64_t> int_keys(kOps * 2 * kSize);
  for (size_t i = 0; i < kOps * 2 * kSize; ++i) {
    int_keys[i] = i;
  }
  for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) {
    BenchmarkVectorCreationAndInsertion<vector<uint64_t>>(
      "vector<uint64_t>", kOps, insertions, int_keys
    );
    BenchmarkVectorCreationAndInsertion<autovector<uint64_t, kSize>>(
      "autovector<uint64_t>", kOps, insertions, int_keys
    );
    cout << "-----------------------------------" << endl;
  }

  // Sequence Access Test
  cout << "=====================================================" << endl;
  cout << "Sequence Access Test" << endl;
  cout << "=====================================================" << endl;
  for (auto elem_size : { kSize / 2, kSize, 2 * kSize }) {
    BenchmarkSequenceAccess<vector<string>>(
        "vector", kOps, elem_size
    );
    BenchmarkSequenceAccess<autovector<string, kSize>>(
        "autovector", kOps, elem_size
    );
    cout << "-----------------------------------" << endl;
  }
}
+
+}  // namespace rocksdb
+
// Test entry point: runs every TEST registered in this file.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
diff --git a/util/bit_set.h b/util/bit_set.h
new file mode 100644 (file)
index 0000000..0172706
--- /dev/null
@@ -0,0 +1,71 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include <cassert>
+#include <cstddef>
+#include <vector>
+
+namespace rocksdb {
+
+class BitSet {
+ public:
+  /**
+   * Create a bit set of numBits, with the bits set to either true or false.
+   */
+  explicit BitSet(size_t numBits, bool initial=false)
+    : numBits_(numBits),
+      data_(numWords(), initial ? ~0UL : 0UL) {
+  }
+
+  /**
+   * Set bit b to 1.
+   */
+  void set(size_t b) {
+    assert(b >= 0 && b < numBits_);
+    data_[word(b)] |= wordOffsetMask(b);
+  }
+
+  /**
+   * Set bit b to 0;
+   */
+  void reset(size_t b) {
+    assert(b >= 0 && b < numBits_);
+    data_[word(b)] &= ~wordOffsetMask(b);
+  }
+
+  /**
+   * Get a bit.
+   */
+  bool test(int b) const {
+    return data_[word(b)] & wordOffsetMask(b);
+  }
+
+ /**
+   * Return the size of the BitSet, in bits.
+   */
+  size_t size() const {
+    return numBits_;
+  }
+
+ private:
+
+ inline size_t numWords() const {
+    if (numBits_ == 0) return 0;
+    return 1 + (numBits_-1) / (8*sizeof(unsigned long));
+  }
+  inline static size_t word(int b) {
+    return b / (8*sizeof(unsigned long));
+  }
+  inline static int wordOffset(int b) {
+    return b % (8*sizeof(unsigned long));
+  }
+  inline static unsigned long wordOffsetMask(int b) {
+    return 1UL << wordOffset(b);
+  }
+
+  size_t numBits_;
+  std::vector<unsigned long> data_;
+};
+
+}  // namespace rocksdb
diff --git a/util/blob_store.cc b/util/blob_store.cc
new file mode 100644 (file)
index 0000000..9f06712
--- /dev/null
@@ -0,0 +1,264 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/blob_store.h"
+
+namespace rocksdb {
+
+using namespace std;
+
+// BlobChunk
+// True iff this chunk ends exactly where 'chunk' begins, in the same
+// bucket. The chunks must not overlap (debug-asserted).
+bool BlobChunk::ImmediatelyBefore(const BlobChunk& chunk) const {
+  assert(!Overlap(chunk));
+  if (size == 0) {
+    // a zero-sized chunk is a marker, not a real block
+    return false;
+  }
+  return bucket_id == chunk.bucket_id && offset + size == chunk.offset;
+}
+
+// True iff the two chunks share at least one block.
+bool BlobChunk::Overlap(const BlobChunk &chunk) const {
+  // zero-sized chunks are markers and never overlap anything;
+  // chunks in different buckets cannot overlap
+  if (size == 0 || chunk.size == 0 || bucket_id != chunk.bucket_id) {
+    return false;
+  }
+  // two ranges intersect iff either one's start falls inside the other
+  const bool this_starts_inside =
+      offset >= chunk.offset && offset < chunk.offset + chunk.size;
+  const bool other_starts_inside =
+      chunk.offset >= offset && chunk.offset < offset + size;
+  return this_starts_inside || other_starts_inside;
+}
+
+// Blob
+// Serializes the chunk list as consecutive fixed32 triples:
+// bucket_id, offset, size for each chunk.
+string Blob::ToString() const {
+  string encoded;
+  for (const auto& chunk : chunks) {
+    PutFixed32(&encoded, chunk.bucket_id);
+    PutFixed32(&encoded, chunk.offset);
+    PutFixed32(&encoded, chunk.size);
+  }
+  return encoded;
+}
+
+Blob::Blob(const std::string& blob) {
+  for (uint32_t i = 0; i < blob.size(); ) {
+    uint32_t t[3] = {0};
+    for (int j = 0; j < 3 && i + sizeof(uint32_t) - 1 < blob.size();
+                    ++j, i += sizeof(uint32_t)) {
+      t[j] = DecodeFixed32(blob.data() + i);
+    }
+    chunks.push_back(BlobChunk(t[0], t[1], t[2]));
+  }
+}
+
+// FreeList
+// Returns every chunk of the blob to the free list, coalescing a chunk
+// into the list's tail when it directly extends it. Externally
+// synchronized.
+Status FreeList::Free(const Blob& blob) {
+  for (const auto& chunk : blob.chunks) {
+    free_blocks_ += chunk.size;
+    bool merged = false;
+    if (!fifo_free_chunks_.empty()) {
+      auto& tail = fifo_free_chunks_.back();
+      if (tail.ImmediatelyBefore(chunk)) {
+        // extend the tail chunk instead of growing the list
+        tail.size += chunk.size;
+        merged = true;
+      }
+    }
+    if (!merged) {
+      fifo_free_chunks_.push_back(chunk);
+    }
+  }
+
+  return Status::OK();
+}
+
+// Carves 'blocks' blocks off the front of the FIFO free list, splitting
+// the allocation across several free chunks when no single chunk is big
+// enough. Returns Status::Incomplete() when the list holds fewer than
+// 'blocks' free blocks in total. Externally synchronized.
+Status FreeList::Allocate(uint32_t blocks, Blob* blob) {
+  if (free_blocks_ < blocks) {
+    return Status::Incomplete("");
+  }
+
+  blob->chunks.clear();
+  free_blocks_ -= blocks;
+
+  while (blocks > 0) {
+    assert(fifo_free_chunks_.size() > 0);
+    auto& front = fifo_free_chunks_.front();
+    if (front.size > blocks) {
+      // front chunk is bigger than needed: take a prefix and shrink it
+      blob->chunks.push_back(BlobChunk(front.bucket_id, front.offset, blocks));
+      front.offset += blocks;
+      front.size -= blocks;
+      blocks = 0;
+    } else {
+      // consume the whole front chunk and keep going
+      blob->chunks.push_back(front);
+      blocks -= front.size;
+      fifo_free_chunks_.pop_front();
+    }
+  }
+  assert(blocks == 0);
+
+  return Status::OK();
+}
+
+// Debug helper: true when any chunk of 'blob' intersects any chunk
+// currently sitting on the free list (i.e. the blob was freed).
+bool FreeList::Overlap(const Blob &blob) const {
+  for (const auto& chunk : blob.chunks) {
+    for (const auto& free_chunk : fifo_free_chunks_) {
+      if (free_chunk.Overlap(chunk)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// BlobStore
+// Creates the backing directory (if missing) and the first bucket file.
+// Each bucket is a RandomRWFile of blocks_per_bucket * block_size bytes.
+BlobStore::BlobStore(const string& directory,
+                     uint64_t block_size,
+                     uint32_t blocks_per_bucket,
+                     uint32_t max_buckets,
+                     Env* env) :
+    directory_(directory),
+    block_size_(block_size),
+    blocks_per_bucket_(blocks_per_bucket),
+    env_(env),
+    max_buckets_(max_buckets) {
+  env_->CreateDirIfMissing(directory_);
+
+  // mmap I/O is explicitly disabled for bucket files
+  storage_options_.use_mmap_writes = false;
+  storage_options_.use_mmap_reads = false;
+
+  buckets_size_ = 0;
+  buckets_ = new unique_ptr<RandomRWFile>[max_buckets_];
+
+  // NOTE(review): CreateNewBucket's status is ignored here, so
+  // construction "succeeds" even if the first bucket cannot be created.
+  CreateNewBucket();
+}
+
+BlobStore::~BlobStore() {
+  // Frees only the in-memory bucket handle array; on-disk state is
+  // left behind.
+  // TODO we don't care about recovery for now
+  delete [] buckets_;
+}
+
+Status BlobStore::Put(const Slice& value, Blob* blob) {
+  // convert size to number of blocks
+  Status s = Allocate((value.size() + block_size_ - 1) / block_size_, blob);
+  if (!s.ok()) {
+    return s;
+  }
+  auto size_left = (uint64_t) value.size();
+
+  uint64_t offset = 0; // in bytes, not blocks
+  for (auto chunk : blob->chunks) {
+    uint64_t write_size = min(chunk.size * block_size_, size_left);
+    assert(chunk.bucket_id < buckets_size_);
+    s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_,
+                                               Slice(value.data() + offset,
+                                                     write_size));
+    if (!s.ok()) {
+      Delete(*blob);
+      return s;
+    }
+    offset += write_size;
+    size_left -= write_size;
+    if (write_size < chunk.size * block_size_) {
+      // if we have any space left in the block, fill it up with zeros
+      string zero_string(chunk.size * block_size_ - write_size, 0);
+      s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_ +
+                                                    write_size,
+                                                 Slice(zero_string));
+    }
+  }
+
+  if (size_left > 0) {
+    Delete(*blob);
+    return Status::IOError("Tried to write more data than fits in the blob");
+  }
+
+  return Status::OK();
+}
+
+Status BlobStore::Get(const Blob& blob,
+                      string* value) const {
+  {
+    // assert that it doesn't overlap with free list
+    // it will get compiled out for release
+    MutexLock l(&free_list_mutex_);
+    assert(!free_list_.Overlap(blob));
+  }
+
+  value->resize(blob.Size() * block_size_);
+
+  uint64_t offset = 0; // in bytes, not blocks
+  for (auto chunk : blob.chunks) {
+    Slice result;
+    assert(chunk.bucket_id < buckets_size_);
+    Status s;
+    s = buckets_[chunk.bucket_id].get()->Read(chunk.offset * block_size_,
+                                              chunk.size * block_size_,
+                                              &result,
+                                              &value->at(offset));
+    if (!s.ok() || result.size() < chunk.size * block_size_) {
+      value->clear();
+      return Status::IOError("Could not read in from file");
+    }
+    offset += chunk.size * block_size_;
+  }
+
+  // remove the '\0's at the end of the string
+  value->erase(find(value->begin(), value->end(), '\0'), value->end());
+
+  return Status::OK();
+}
+
+// Returns the blob's blocks to the free list for reuse. The bytes on
+// disk are not erased.
+Status BlobStore::Delete(const Blob& blob) {
+  MutexLock l(&free_list_mutex_);
+  return free_list_.Free(blob);
+}
+
+// Flushes every bucket file, stopping at the first failure.
+Status BlobStore::Sync() {
+  for (size_t i = 0; i < buckets_size_; ++i) {
+    const Status s = buckets_[i]->Sync();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+
+// Tries to satisfy the allocation from the free list; on failure, grows
+// the store by one bucket and retries exactly once.
+Status BlobStore::Allocate(uint32_t blocks, Blob* blob) {
+  MutexLock l(&free_list_mutex_);
+  Status s;
+
+  s = free_list_.Allocate(blocks, blob);
+  if (!s.ok()) {
+    s = CreateNewBucket();
+    if (!s.ok()) {
+      return s;
+    }
+    s = free_list_.Allocate(blocks, blob);
+  }
+
+  // NOTE(review): a single new bucket may still be too small for
+  // 'blocks'; in that case the second Allocate fails and its status
+  // is returned.
+  return s;
+}
+
+// called with free_list_mutex_ held
+Status BlobStore::CreateNewBucket() {
+  MutexLock l(&buckets_mutex_);
+
+  if (buckets_size_ >= max_buckets_) {
+    return Status::IOError("Max size exceeded\n");
+  }
+
+  int new_bucket_id = buckets_size_;
+
+  char fname[200];
+  sprintf(fname, "%s/%d.bs", directory_.c_str(), new_bucket_id);
+
+  Status s = env_->NewRandomRWFile(string(fname),
+                                   &buckets_[new_bucket_id],
+                                   storage_options_);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // whether Allocate succeeds or not, does not affect the overall correctness
+  // of this function - calling Allocate is really optional
+  // (also, tmpfs does not support allocate)
+  buckets_[new_bucket_id].get()->Allocate(0, block_size_ * blocks_per_bucket_);
+
+  buckets_size_ = new_bucket_id + 1;
+
+  return free_list_.Free(Blob(new_bucket_id, 0, blocks_per_bucket_));
+}
+
+} // namespace rocksdb
diff --git a/util/blob_store.h b/util/blob_store.h
new file mode 100644 (file)
index 0000000..0a81d01
--- /dev/null
@@ -0,0 +1,161 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+#include "util/coding.h"
+
+#include <list>
+#include <deque>
+#include <cstdint>
+#include <iostream>
+#include <stdexcept>
+#include <algorithm>
+#include <cstdio>
+
+namespace rocksdb {
+
+// A contiguous run of blocks inside one bucket file. All quantities
+// are expressed in blocks, not bytes.
+struct BlobChunk {
+  uint32_t bucket_id;
+  uint32_t offset; // in blocks
+  uint32_t size; // in blocks
+  BlobChunk() {}
+  BlobChunk(uint32_t bucket_id, uint32_t offset, uint32_t size) :
+    bucket_id(bucket_id), offset(offset), size(size) {}
+
+  // returns true if it's immediately before chunk
+  bool ImmediatelyBefore(const BlobChunk& chunk) const;
+  // returns true if chunks overlap
+  bool Overlap(const BlobChunk &chunk) const;
+};
+
+// We represent each Blob as a string in format:
+// bucket_id offset size|bucket_id offset size...
+// The string can be used to reference the Blob stored on external
+// device/file
+// Not thread-safe!
+struct Blob {
+  // Generates the string
+  std::string ToString() const;
+  // Parses the previously generated string
+  explicit Blob(const std::string& blob);
+  // Creates unfragmented Blob
+  Blob(uint32_t bucket_id, uint32_t offset, uint32_t size) {
+    SetOneChunk(bucket_id, offset, size);
+  }
+  Blob() {}
+
+  // Replaces the chunk list with a single contiguous chunk.
+  void SetOneChunk(uint32_t bucket_id, uint32_t offset, uint32_t size) {
+    chunks.clear();
+    chunks.push_back(BlobChunk(bucket_id, offset, size));
+  }
+
+  // Total size summed over all chunks.
+  // NOTE(review): asserts the total is non-zero -- calling Size() on a
+  // default-constructed Blob trips this assert in debug builds.
+  uint32_t Size() const { // in blocks
+    uint32_t ret = 0;
+    for (auto chunk : chunks) {
+      ret += chunk.size;
+    }
+    assert(ret > 0);
+    return ret;
+  }
+
+  // bucket_id, offset, size
+  std::vector<BlobChunk> chunks;
+};
+
+// Keeps a list of free chunks
+// NOT thread-safe. Externally synchronized: BlobStore guards every call
+// with free_list_mutex_.
+class FreeList {
+ public:
+  FreeList() :
+    free_blocks_(0) {}
+  ~FreeList() {}
+
+  // Allocates a blob. Stores the allocated blob in
+  // 'blob'. Returns non-OK status if it failed to allocate.
+  // Caller must hold the external mutex.
+  Status Allocate(uint32_t blocks, Blob* blob);
+  // Frees the blob for reuse. Caller must hold the external mutex.
+  Status Free(const Blob& blob);
+
+  // returns true if blob is overlapping with any of the
+  // chunks stored in free list
+  bool Overlap(const Blob &blob) const;
+
+ private:
+  std::deque<BlobChunk> fifo_free_chunks_;
+  uint32_t free_blocks_;
+  // NOTE(review): this mutex appears unused by the implementation in
+  // blob_store.cc (synchronization is external); confirm and consider
+  // removing it.
+  mutable port::Mutex mutex_;
+};
+
+// thread-safe
+class BlobStore {
+ public:
+   // directory - wherever the blobs should be stored. It will be created
+   //   if missing
+   // block_size - self explanatory
+   // blocks_per_bucket - how many blocks we want to keep in one bucket.
+   //   Bucket is a device or a file that we use to store the blobs.
+   //   If we don't have enough blocks to allocate a new blob, we will
+   //   try to create a new file or device.
+   // max_buckets - maximum number of buckets BlobStore will create
+   //   BlobStore max size in bytes is
+   //     max_buckets * blocks_per_bucket * block_size
+   // env - env for creating new files
+  BlobStore(const std::string& directory,
+            uint64_t block_size,
+            uint32_t blocks_per_bucket,
+            uint32_t max_buckets,
+            Env* env);
+  ~BlobStore();
+
+  // Allocates space for value.size bytes (rounded up to be multiple of
+  // block size) and writes value.size bytes from value.data to a backing store.
+  // Sets Blob blob that can than be used for addressing the
+  // stored value. Returns non-OK status on error.
+  Status Put(const Slice& value, Blob* blob);
+  // Value needs to have enough space to store all the loaded stuff.
+  // This function is thread safe!
+  // NOTE(review): the implementation strips '\0' bytes when
+  // reconstructing the value (see blob_store.cc), so values containing
+  // null bytes are not round-tripped faithfully.
+  Status Get(const Blob& blob, std::string* value) const;
+  // Frees the blob for reuse, but does not delete the data
+  // on the backing store.
+  Status Delete(const Blob& blob);
+  // Sync all opened files that are modified
+  Status Sync();
+
+ private:
+  const std::string directory_;
+  // block_size_ is uint64_t because when we multiply with
+  // blocks_size_ we want the result to be uint64_t or
+  // we risk overflowing
+  const uint64_t block_size_;
+  const uint32_t blocks_per_bucket_;
+  Env* env_;
+  EnvOptions storage_options_;
+  // protected by free_list_mutex_
+  FreeList free_list_;
+  // free_list_mutex_ is locked BEFORE buckets_mutex_
+  mutable port::Mutex free_list_mutex_;
+  // protected by buckets_mutex_
+  // array of buckets
+  unique_ptr<RandomRWFile>* buckets_;
+  // number of buckets in the array
+  uint32_t buckets_size_;
+  uint32_t max_buckets_;
+  mutable port::Mutex buckets_mutex_;
+
+  // Calls FreeList allocate. If free list can't allocate
+  // new blob, creates new bucket and tries again
+  // Thread-safe
+  Status Allocate(uint32_t blocks, Blob* blob);
+
+  // Creates a new backing store and adds all the blocks
+  // from the new backing store to the free list
+  Status CreateNewBucket();
+};
+
+} // namespace rocksdb
diff --git a/util/blob_store_test.cc b/util/blob_store_test.cc
new file mode 100644 (file)
index 0000000..f199f5d
--- /dev/null
@@ -0,0 +1,200 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/blob_store.h"
+
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "util/random.h"
+
+#include <cstdlib>
+#include <string>
+
+namespace rocksdb {
+
+using namespace std;
+
+class BlobStoreTest { };
+
+// Round-trip: serializing a Blob and parsing it back must reproduce
+// the identical encoding.
+TEST(BlobStoreTest, RangeParseTest) {
+  Blob e;
+  for (int i = 0; i < 5; ++i) {
+    e.chunks.push_back(BlobChunk(rand(), rand(), rand()));
+  }
+  string x = e.ToString();
+  Blob nx(x);
+
+  ASSERT_EQ(nx.ToString(), x);
+}
+
+// make sure we're reusing the freed space
+// With 20 blocks per bucket, allocations land in bucket 0 until it is
+// exhausted; freed space must be reused before a new bucket is opened.
+TEST(BlobStoreTest, SanityTest) {
+  const uint64_t block_size = 10;
+  const uint32_t blocks_per_file = 20;
+  Random random(5);
+
+  BlobStore blob_store(test::TmpDir() + "/blob_store_test",
+                       block_size,
+                       blocks_per_file,
+                       1000,
+                       Env::Default());
+
+  string buf;
+
+  // put string of size 170
+  test::RandomString(&random, 170, &buf);
+  Blob r1;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r1));
+  // use the first file
+  for (size_t i = 0; i < r1.chunks.size(); ++i) {
+    ASSERT_EQ(r1.chunks[0].bucket_id, 0u);
+  }
+
+  // put string of size 30
+  test::RandomString(&random, 30, &buf);
+  Blob r2;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r2));
+  // use the first file
+  for (size_t i = 0; i < r2.chunks.size(); ++i) {
+    ASSERT_EQ(r2.chunks[0].bucket_id, 0u);
+  }
+
+  // delete blob of size 170
+  ASSERT_OK(blob_store.Delete(r1));
+
+  // put a string of size 100
+  test::RandomString(&random, 100, &buf);
+  Blob r3;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r3));
+  // use the first file
+  for (size_t i = 0; i < r3.chunks.size(); ++i) {
+    ASSERT_EQ(r3.chunks[0].bucket_id, 0u);
+  }
+
+  // put a string of size 70
+  test::RandomString(&random, 70, &buf);
+  Blob r4;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r4));
+  // use the first file
+  for (size_t i = 0; i < r4.chunks.size(); ++i) {
+    ASSERT_EQ(r4.chunks[0].bucket_id, 0u);
+  }
+
+  // put a string of size 5
+  test::RandomString(&random, 5, &buf);
+  Blob r5;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r5));
+  // now you get to use the second file
+  for (size_t i = 0; i < r5.chunks.size(); ++i) {
+    ASSERT_EQ(r5.chunks[0].bucket_id, 1u);
+  }
+}
+
+// A new allocation must be able to span several non-contiguous freed
+// chunks within the same bucket.
+TEST(BlobStoreTest, FragmentedChunksTest) {
+  const uint64_t block_size = 10;
+  const uint32_t blocks_per_file = 20;
+  Random random(5);
+
+  BlobStore blob_store(test::TmpDir() + "/blob_store_test",
+                       block_size,
+                       blocks_per_file,
+                       1000,
+                       Env::Default());
+
+  string buf;
+
+  vector <Blob> r(4);
+
+  // put 4 strings of size 50
+  for (int k = 0; k < 4; ++k)  {
+    test::RandomString(&random, 50, &buf);
+    ASSERT_OK(blob_store.Put(Slice(buf), &r[k]));
+    // use the first file
+    for (size_t i = 0; i < r[k].chunks.size(); ++i) {
+      ASSERT_EQ(r[k].chunks[0].bucket_id, 0u);
+    }
+  }
+
+  // delete the first and third
+  ASSERT_OK(blob_store.Delete(r[0]));
+  ASSERT_OK(blob_store.Delete(r[2]));
+
+  // put string of size 100. it should reuse the space that we freed
+  // by deleting the first and third strings of size 50
+  test::RandomString(&random, 100, &buf);
+  Blob r2;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r2));
+  // use the first file
+  for (size_t i = 0; i < r2.chunks.size(); ++i) {
+    ASSERT_EQ(r2.chunks[0].bucket_id, 0u);
+  }
+}
+
+// Randomized workload: interleaves Puts (60%), Gets that must return
+// the exact bytes written (20%), and Deletes (20%).
+TEST(BlobStoreTest, CreateAndStoreTest) {
+  const uint64_t block_size = 10;
+  const uint32_t blocks_per_file = 1000;
+  const int max_blurb_size = 300;
+  Random random(5);
+
+  BlobStore blob_store(test::TmpDir() + "/blob_store_test",
+                       block_size,
+                       blocks_per_file,
+                       10000,
+                       Env::Default());
+  vector<pair<Blob, string>> ranges;
+
+  for (int i = 0; i < 2000; ++i) {
+    int decision = rand() % 5;
+    if (decision <= 2 || ranges.size() == 0) {
+      string buf;
+      int size_blocks = (rand() % max_blurb_size + 1);
+      int string_size = size_blocks * block_size - (rand() % block_size);
+      test::RandomString(&random, string_size, &buf);
+      Blob r;
+      ASSERT_OK(blob_store.Put(Slice(buf), &r));
+      ranges.push_back(make_pair(r, buf));
+    } else if (decision == 3) {
+      int ti = rand() % ranges.size();
+      string out_buf;
+      ASSERT_OK(blob_store.Get(ranges[ti].first, &out_buf));
+      ASSERT_EQ(ranges[ti].second, out_buf);
+    } else {
+      int ti = rand() % ranges.size();
+      ASSERT_OK(blob_store.Delete(ranges[ti].first));
+      ranges.erase(ranges.begin() + ti);
+    }
+  }
+  ASSERT_OK(blob_store.Sync());
+}
+
+// Once max_buckets buckets are full, further Puts must fail rather
+// than grow the store.
+TEST(BlobStoreTest, MaxSizeTest) {
+  const uint64_t block_size = 10;
+  const uint32_t blocks_per_file = 100;
+  const int max_buckets = 10;
+  Random random(5);
+
+  BlobStore blob_store(test::TmpDir() + "/blob_store_test",
+                       block_size,
+                       blocks_per_file,
+                       max_buckets,
+                       Env::Default());
+  string buf;
+  // each 1000-byte value fills one 100-block bucket exactly
+  for (int i = 0; i < max_buckets; ++i) {
+    test::RandomString(&random, 1000, &buf);
+    Blob r;
+    ASSERT_OK(blob_store.Put(Slice(buf), &r));
+  }
+
+  test::RandomString(&random, 1000, &buf);
+  Blob r;
+  // should fail because max size
+  Status s = blob_store.Put(Slice(buf), &r);
+  ASSERT_EQ(s.ok(), false);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  // Test-harness entry point: runs every TEST registered in this file.
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/bloom.cc b/util/bloom.cc
new file mode 100644 (file)
index 0000000..78ae04a
--- /dev/null
@@ -0,0 +1,111 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/filter_policy.h"
+
+#include "rocksdb/slice.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+namespace {
+// Default hash function for the builtin bloom filter (fixed seed).
+static uint32_t BloomHash(const Slice& key) {
+  return Hash(key.data(), key.size(), 0xbc9f1d34);
+}
+
+// Standard bloom filter using double hashing (one base hash plus a
+// rotated delta) to derive k_ probe positions per key. The filter
+// encoding is: bit array followed by one byte holding k_.
+class BloomFilterPolicy : public FilterPolicy {
+ private:
+  size_t bits_per_key_;
+  size_t k_;   // number of probes per key, derived from bits_per_key_
+  uint32_t (*hash_func_)(const Slice& key);
+
+  void initialize() {
+    // We intentionally round down to reduce probing cost a little bit
+    k_ = static_cast<size_t>(bits_per_key_ * 0.69);  // 0.69 =~ ln(2)
+    if (k_ < 1) k_ = 1;
+    if (k_ > 30) k_ = 30;
+  }
+
+ public:
+  explicit BloomFilterPolicy(int bits_per_key,
+                             uint32_t (*hash_func)(const Slice& key))
+      : bits_per_key_(bits_per_key), hash_func_(hash_func) {
+    initialize();
+  }
+  explicit BloomFilterPolicy(int bits_per_key)
+      : bits_per_key_(bits_per_key) {
+    hash_func_ = BloomHash;
+    initialize();
+  }
+
+  virtual const char* Name() const {
+    return "rocksdb.BuiltinBloomFilter";
+  }
+
+  // Appends the filter for keys[0..n-1] to *dst.
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+    // Compute bloom filter size (in both bits and bytes)
+    size_t bits = n * bits_per_key_;
+
+    // For small n, we can see a very high false positive rate.  Fix it
+    // by enforcing a minimum bloom filter length.
+    if (bits < 64) bits = 64;
+
+    size_t bytes = (bits + 7) / 8;
+    bits = bytes * 8;
+
+    const size_t init_size = dst->size();
+    dst->resize(init_size + bytes, 0);
+    dst->push_back(static_cast<char>(k_));  // Remember # of probes in filter
+    char* array = &(*dst)[init_size];
+    for (size_t i = 0; i < (size_t)n; i++) {
+      // Use double-hashing to generate a sequence of hash values.
+      // See analysis in [Kirsch,Mitzenmacher 2006].
+      uint32_t h = hash_func_(keys[i]);
+      const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+      for (size_t j = 0; j < k_; j++) {
+        const uint32_t bitpos = h % bits;
+        array[bitpos/8] |= (1 << (bitpos % 8));
+        h += delta;
+      }
+    }
+  }
+
+  // May return false positives but never false negatives for keys that
+  // were added to 'bloom_filter' by CreateFilter.
+  virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const {
+    const size_t len = bloom_filter.size();
+    if (len < 2) return false;
+
+    const char* array = bloom_filter.data();
+    const size_t bits = (len - 1) * 8;
+
+    // Use the encoded k so that we can read filters generated by
+    // bloom filters created using different parameters.
+    const size_t k = array[len-1];
+    if (k > 30) {
+      // Reserved for potentially new encodings for short bloom filters.
+      // Consider it a match.
+      return true;
+    }
+
+    uint32_t h = hash_func_(key);
+    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+    for (size_t j = 0; j < k; j++) {
+      const uint32_t bitpos = h % bits;
+      if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false;
+      h += delta;
+    }
+    return true;
+  }
+};
+}
+
+// Factory for the builtin bloom filter policy; the caller owns the
+// returned object.
+const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) {
+  auto* policy = new BloomFilterPolicy(bits_per_key);
+  return policy;
+}
+
+}  // namespace rocksdb
diff --git a/util/bloom_test.cc b/util/bloom_test.cc
new file mode 100644 (file)
index 0000000..9dbd5d2
--- /dev/null
@@ -0,0 +1,164 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/filter_policy.h"
+
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+static const int kVerbose = 1;
+
+// Encodes i into 'buffer' as its raw native-endian bytes and returns a
+// Slice over it; 'buffer' must outlive the returned Slice.
+static Slice Key(int i, char* buffer) {
+  memcpy(buffer, &i, sizeof(i));
+  return Slice(buffer, sizeof(i));
+}
+
+// Test fixture that accumulates keys, builds a single bloom filter from
+// them, and queries it.
+class BloomTest {
+ private:
+  const FilterPolicy* policy_;
+  std::string filter_;    // the built filter bytes
+  std::vector<std::string> keys_;   // keys pending a Build()
+
+ public:
+  BloomTest() : policy_(NewBloomFilterPolicy(10)) { }
+
+  ~BloomTest() {
+    delete policy_;
+  }
+
+  void Reset() {
+    keys_.clear();
+    filter_.clear();
+  }
+
+  void Add(const Slice& s) {
+    keys_.push_back(s.ToString());
+  }
+
+  // Builds the filter from the accumulated keys and clears them.
+  void Build() {
+    std::vector<Slice> key_slices;
+    for (size_t i = 0; i < keys_.size(); i++) {
+      key_slices.push_back(Slice(keys_[i]));
+    }
+    filter_.clear();
+    policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_);
+    keys_.clear();
+    if (kVerbose >= 2) DumpFilter();
+  }
+
+  size_t FilterSize() const {
+    return filter_.size();
+  }
+
+  // Prints the filter's bit pattern (excluding the trailing k byte).
+  void DumpFilter() {
+    fprintf(stderr, "F(");
+    for (size_t i = 0; i+1 < filter_.size(); i++) {
+      const unsigned int c = static_cast<unsigned int>(filter_[i]);
+      for (int j = 0; j < 8; j++) {
+        fprintf(stderr, "%c", (c & (1 <<j)) ? '1' : '.');
+      }
+    }
+    fprintf(stderr, ")\n");
+  }
+
+  // Builds first if keys are pending, then queries the filter.
+  bool Matches(const Slice& s) {
+    if (!keys_.empty()) {
+      Build();
+    }
+    return policy_->KeyMayMatch(s, filter_);
+  }
+
+  // Fraction of 10000 never-added keys that the filter claims to match.
+  double FalsePositiveRate() {
+    char buffer[sizeof(int)];
+    int result = 0;
+    for (int i = 0; i < 10000; i++) {
+      if (Matches(Key(i + 1000000000, buffer))) {
+        result++;
+      }
+    }
+    return result / 10000.0;
+  }
+};
+
+// A filter built from no keys must reject everything.
+TEST(BloomTest, EmptyFilter) {
+  ASSERT_TRUE(! Matches("hello"));
+  ASSERT_TRUE(! Matches("world"));
+}
+
+// Added keys always match; these two absent keys happen not to.
+TEST(BloomTest, Small) {
+  Add("hello");
+  Add("world");
+  ASSERT_TRUE(Matches("hello"));
+  ASSERT_TRUE(Matches("world"));
+  ASSERT_TRUE(! Matches("x"));
+  ASSERT_TRUE(! Matches("foo"));
+}
+
+// Advances 'length' with a step that grows with its magnitude:
+// +1 below 10, +10 below 100, +100 below 1000, +1000 otherwise.
+static int NextLength(int length) {
+  int step;
+  if (length < 10) {
+    step = 1;
+  } else if (length < 100) {
+    step = 10;
+  } else if (length < 1000) {
+    step = 100;
+  } else {
+    step = 1000;
+  }
+  return length + step;
+}
+
+// Builds filters for key counts from 1 to 10000 and checks size bounds,
+// zero false negatives, and an acceptable false positive rate.
+TEST(BloomTest, VaryingLengths) {
+  char buffer[sizeof(int)];
+
+  // Count number of filters that significantly exceed the false positive rate
+  int mediocre_filters = 0;
+  int good_filters = 0;
+
+  for (int length = 1; length <= 10000; length = NextLength(length)) {
+    Reset();
+    for (int i = 0; i < length; i++) {
+      Add(Key(i, buffer));
+    }
+    Build();
+
+    // 10 bits per key plus bounded overhead
+    ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 40)) << length;
+
+    // All added keys must match
+    for (int i = 0; i < length; i++) {
+      ASSERT_TRUE(Matches(Key(i, buffer)))
+          << "Length " << length << "; key " << i;
+    }
+
+    // Check false positive rate
+    double rate = FalsePositiveRate();
+    if (kVerbose >= 1) {
+      fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
+              rate*100.0, length, static_cast<int>(FilterSize()));
+    }
+    ASSERT_LE(rate, 0.02);   // Must not be over 2%
+    if (rate > 0.0125) mediocre_filters++;  // Allowed, but not too often
+    else good_filters++;
+  }
+  if (kVerbose >= 1) {
+    fprintf(stderr, "Filters: %d good, %d mediocre\n",
+            good_filters, mediocre_filters);
+  }
+  ASSERT_LE(mediocre_filters, good_filters/5);
+}
+
+// Different bits-per-byte
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  // Test-harness entry point: runs every TEST registered in this file.
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/build_version.h b/util/build_version.h
new file mode 100644 (file)
index 0000000..3d2ed29
--- /dev/null
@@ -0,0 +1,13 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+// these variables tell us about the git config and time
+extern const char* rocksdb_build_git_sha;
+
+// these variables tell us when the compilation occurred
+extern const char* rocksdb_build_compile_time;
+extern const char* rocksdb_build_compile_date;
+
diff --git a/util/cache.cc b/util/cache.cc
new file mode 100644 (file)
index 0000000..8fa0362
--- /dev/null
@@ -0,0 +1,434 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+
+#include "rocksdb/cache.h"
+#include "port/port.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+
+namespace rocksdb {
+
+// Out-of-line (empty) destructor definition; anchors Cache's vtable in
+// this translation unit.
+Cache::~Cache() {
+}
+
+namespace {
+
+// LRU cache implementation
+
+// An entry is a variable length heap-allocated structure.  Entries
+// are kept in a circular doubly linked list ordered by access time.
+struct LRUHandle {
+  void* value;          // User payload; passed to "deleter" on destruction.
+  void (*deleter)(const Slice&, void* value);  // Invoked by FreeEntry().
+  LRUHandle* next_hash; // Next entry in the HandleTable bucket chain.
+  LRUHandle* next;      // Neighbors in the circular LRU list.
+  LRUHandle* prev;
+  size_t charge;      // TODO(opt): Only allow uint32_t?
+  size_t key_length;  // Length of key_data.
+  uint32_t refs;      // Reference count; the cache itself holds one ref.
+  uint32_t hash;      // Hash of key(); used for fast sharding and comparisons
+  char key_data[1];   // Beginning of key
+
+  // Returns the key. The struct is malloc'ed with extra trailing bytes so
+  // that key_data holds the full key inline (see LRUCache::Insert).
+  Slice key() const {
+    // For cheaper lookups, we allow a temporary Handle object
+    // to store a pointer to a key in "value".
+    if (next == this) {
+      return *(reinterpret_cast<Slice*>(value));
+    } else {
+      return Slice(key_data, key_length);
+    }
+  }
+};
+
+// We provide our own simple hash table since it removes a whole bunch
+// of porting hacks and is also faster than some of the built-in hash
+// table implementations in some of the compiler/runtime combinations
+// we have tested.  E.g., readrandom speeds up by ~5% over the g++
+// 4.4.3's builtin hashtable.
+// Open-chaining hash table of LRUHandle*, keyed by (hash, key).
+// Buckets are singly linked via LRUHandle::next_hash; the bucket count is
+// always a power of two so masking with (length_ - 1) selects a bucket.
+class HandleTable {
+ public:
+  HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); }
+  ~HandleTable() { delete[] list_; }
+
+  // Returns the entry matching key/hash, or nullptr if absent.
+  LRUHandle* Lookup(const Slice& key, uint32_t hash) {
+    return *FindPointer(key, hash);
+  }
+
+  // Inserts h. If an entry with the same key/hash already existed, it is
+  // unlinked from the table and returned (caller owns unref/free);
+  // otherwise returns nullptr.
+  LRUHandle* Insert(LRUHandle* h) {
+    LRUHandle** ptr = FindPointer(h->key(), h->hash);
+    LRUHandle* old = *ptr;
+    h->next_hash = (old == nullptr ? nullptr : old->next_hash);
+    *ptr = h;
+    if (old == nullptr) {
+      ++elems_;
+      if (elems_ > length_) {
+        // Since each cache entry is fairly large, we aim for a small
+        // average linked list length (<= 1).
+        Resize();
+      }
+    }
+    return old;
+  }
+
+  // Unlinks and returns the entry matching key/hash, or nullptr if absent.
+  // Does not free the entry.
+  LRUHandle* Remove(const Slice& key, uint32_t hash) {
+    LRUHandle** ptr = FindPointer(key, hash);
+    LRUHandle* result = *ptr;
+    if (result != nullptr) {
+      *ptr = result->next_hash;
+      --elems_;
+    }
+    return result;
+  }
+
+ private:
+  // The table consists of an array of buckets where each bucket is
+  // a linked list of cache entries that hash into the bucket.
+  uint32_t length_;   // Number of buckets (power of two).
+  uint32_t elems_;    // Number of entries stored.
+  LRUHandle** list_;  // Bucket array.
+
+  // Return a pointer to slot that points to a cache entry that
+  // matches key/hash.  If there is no such cache entry, return a
+  // pointer to the trailing slot in the corresponding linked list.
+  LRUHandle** FindPointer(const Slice& key, uint32_t hash) {
+    LRUHandle** ptr = &list_[hash & (length_ - 1)];
+    while (*ptr != nullptr &&
+           ((*ptr)->hash != hash || key != (*ptr)->key())) {
+      ptr = &(*ptr)->next_hash;
+    }
+    return ptr;
+  }
+
+  // Grows the bucket array (doubling from 16) until the load factor is
+  // below ~0.67, then rehashes every entry into the new array.
+  void Resize() {
+    uint32_t new_length = 16;
+    while (new_length < elems_ * 1.5) {
+      new_length *= 2;
+    }
+    LRUHandle** new_list = new LRUHandle*[new_length];
+    memset(new_list, 0, sizeof(new_list[0]) * new_length);
+    uint32_t count = 0;
+    for (uint32_t i = 0; i < length_; i++) {
+      LRUHandle* h = list_[i];
+      while (h != nullptr) {
+        LRUHandle* next = h->next_hash;
+        uint32_t hash = h->hash;
+        LRUHandle** ptr = &new_list[hash & (new_length - 1)];
+        h->next_hash = *ptr;
+        *ptr = h;
+        h = next;
+        count++;
+      }
+    }
+    assert(elems_ == count);
+    delete[] list_;
+    list_ = new_list;
+    length_ = new_length;
+  }
+};
+
+// A single shard of sharded cache.
+class LRUCache {
+ public:
+  LRUCache();
+  ~LRUCache();
+
+  // Separate from constructor so caller can easily make an array of LRUCache
+  void SetCapacity(size_t capacity) { capacity_ = capacity; }
+  void SetRemoveScanCountLimit(size_t remove_scan_count_limit) {
+    remove_scan_count_limit_ = remove_scan_count_limit;
+  }
+
+  // Like Cache methods, but with an extra "hash" parameter.
+  Cache::Handle* Insert(const Slice& key, uint32_t hash,
+                        void* value, size_t charge,
+                        void (*deleter)(const Slice& key, void* value));
+  Cache::Handle* Lookup(const Slice& key, uint32_t hash);
+  void Release(Cache::Handle* handle);
+  void Erase(const Slice& key, uint32_t hash);
+
+ private:
+  void LRU_Remove(LRUHandle* e);
+  void LRU_Append(LRUHandle* e);
+  // Just reduce the reference count by 1.
+  // Return true if last reference
+  bool Unref(LRUHandle* e);
+  // Call deleter and free
+  void FreeEntry(LRUHandle* e);
+
+  // Initialized before use.
+  size_t capacity_;
+  uint32_t remove_scan_count_limit_;
+
+  // mutex_ protects the following state.
+  port::Mutex mutex_;
+  size_t usage_;
+
+  // Dummy head of LRU list.
+  // lru.prev is newest entry, lru.next is oldest entry.
+  LRUHandle lru_;
+
+  HandleTable table_;
+};
+
+// Constructs an empty shard. capacity_ and remove_scan_count_limit_ are
+// zero-initialized here so that Insert() never reads indeterminate values
+// if a caller forgets SetCapacity()/SetRemoveScanCountLimit(); callers are
+// still expected to set both before use (ShardedLRUCache::init does).
+LRUCache::LRUCache()
+    : capacity_(0), remove_scan_count_limit_(0), usage_(0) {
+  // Make empty circular linked list
+  lru_.next = &lru_;
+  lru_.prev = &lru_;
+}
+
+// Destroys the shard, dropping the cache's own reference on every resident
+// entry and freeing it. Asserts that no caller still holds a handle.
+LRUCache::~LRUCache() {
+  for (LRUHandle* e = lru_.next; e != &lru_; ) {
+    LRUHandle* next = e->next;
+    assert(e->refs == 1);  // Error if caller has an unreleased handle
+    if (Unref(e)) {
+      FreeEntry(e);
+    }
+    e = next;
+  }
+}
+
+// Drops one reference. Returns true when this was the last reference,
+// in which case the caller must FreeEntry(e) (outside the mutex).
+bool LRUCache::Unref(LRUHandle* e) {
+  assert(e->refs > 0);
+  e->refs--;
+  return e->refs == 0;
+}
+
+// Runs the user deleter and releases the malloc'ed handle. Must only be
+// called once refs has dropped to zero.
+void LRUCache::FreeEntry(LRUHandle* e) {
+  assert(e->refs == 0);
+  (*e->deleter)(e->key(), e->value);
+  free(e);
+}
+
+// Caller must hold mutex_ (usage_ is mutated).
+void LRUCache::LRU_Remove(LRUHandle* e) {
+  e->next->prev = e->prev;
+  e->prev->next = e->next;
+  usage_ -= e->charge;
+}
+
+// Caller must hold mutex_ (usage_ is mutated).
+void LRUCache::LRU_Append(LRUHandle* e) {
+  // Make "e" newest entry by inserting just before lru_
+  e->next = &lru_;
+  e->prev = lru_.prev;
+  e->prev->next = e;
+  e->next->prev = e;
+  usage_ += e->charge;
+}
+
+// Looks up key. On a hit, takes a reference for the caller and moves the
+// entry to the newest end of the LRU list. Returns nullptr on a miss;
+// the caller must Release() a non-null handle.
+Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) {
+  MutexLock l(&mutex_);
+  LRUHandle* e = table_.Lookup(key, hash);
+  if (e != nullptr) {
+    e->refs++;
+    LRU_Remove(e);
+    LRU_Append(e);
+  }
+  return reinterpret_cast<Cache::Handle*>(e);
+}
+
+// Drops the caller's reference; frees the entry (user deleter included)
+// outside the mutex if this was the last reference.
+void LRUCache::Release(Cache::Handle* handle) {
+  LRUHandle* e = reinterpret_cast<LRUHandle*>(handle);
+  bool last_reference = false;
+  {
+    MutexLock l(&mutex_);
+    last_reference = Unref(e);
+  }
+  if (last_reference) {
+    FreeEntry(e);
+  }
+}
+
+// Inserts key->value with the given charge, displacing any existing entry
+// with the same key. Returns a handle (refs = 2: one held by the cache,
+// one by the caller, who must Release() it). Eviction runs in two phases:
+// (1) if remove_scan_count_limit_ > 0, scan from the oldest end evicting
+// only entries not referenced outside the cache (refs <= 1), up to the
+// scan limit; (2) evict strictly from the oldest end — including entries
+// with outstanding handles — until usage_ <= capacity_ or the list is
+// empty. All freed entries' deleters run outside the mutex.
+Cache::Handle* LRUCache::Insert(
+    const Slice& key, uint32_t hash, void* value, size_t charge,
+    void (*deleter)(const Slice& key, void* value)) {
+
+  // The -1 accounts for LRUHandle::key_data[1]; the key bytes are stored
+  // inline at the end of the allocation.
+  LRUHandle* e = reinterpret_cast<LRUHandle*>(
+      malloc(sizeof(LRUHandle)-1 + key.size()));
+  std::vector<LRUHandle*> last_reference_list;
+  last_reference_list.reserve(1);
+
+  e->value = value;
+  e->deleter = deleter;
+  e->charge = charge;
+  e->key_length = key.size();
+  e->hash = hash;
+  e->refs = 2;  // One from LRUCache, one for the returned handle
+  memcpy(e->key_data, key.data(), key.size());
+
+  {
+    MutexLock l(&mutex_);
+
+    LRU_Append(e);
+
+    // Displace any previous entry with the same key.
+    LRUHandle* old = table_.Insert(e);
+    if (old != nullptr) {
+      LRU_Remove(old);
+      if (Unref(old)) {
+        last_reference_list.push_back(old);
+      }
+    }
+
+    if (remove_scan_count_limit_ > 0) {
+      // Try to free the space by evicting the entries that are only
+      // referenced by the cache first.
+      LRUHandle* cur = lru_.next;
+      for (unsigned int scanCount = 0;
+           usage_ > capacity_ && cur != &lru_
+           && scanCount < remove_scan_count_limit_; scanCount++) {
+        LRUHandle* next = cur->next;
+        if (cur->refs <= 1) {
+          LRU_Remove(cur);
+          table_.Remove(cur->key(), cur->hash);
+          if (Unref(cur)) {
+            last_reference_list.push_back(cur);
+          }
+        }
+        cur = next;
+      }
+    }
+
+    // Free the space following strict LRU policy until enough space
+    // is freed.
+    while (usage_ > capacity_ && lru_.next != &lru_) {
+      LRUHandle* old = lru_.next;
+      LRU_Remove(old);
+      table_.Remove(old->key(), old->hash);
+      if (Unref(old)) {
+        last_reference_list.push_back(old);
+      }
+    }
+  }
+
+  // we free the entries here outside of mutex for
+  // performance reasons
+  for (auto entry : last_reference_list) {
+    FreeEntry(entry);
+  }
+
+  return reinterpret_cast<Cache::Handle*>(e);
+}
+
+// Removes key from the table and LRU list if present. The entry is freed
+// (deleter runs, outside the mutex) only when the cache held the last
+// reference; otherwise outstanding handles keep it alive until Release().
+void LRUCache::Erase(const Slice& key, uint32_t hash) {
+  LRUHandle* e;
+  bool last_reference = false;
+  {
+    MutexLock l(&mutex_);
+    e = table_.Remove(key, hash);
+    if (e != nullptr) {
+      LRU_Remove(e);
+      last_reference = Unref(e);
+    }
+  }
+  // mutex not held here
+  // last_reference will only be true if e != nullptr
+  if (last_reference) {
+    FreeEntry(e);
+  }
+}
+
+static int kNumShardBits = 4;          // default values, can be overridden
+static int kRemoveScanCountLimit = 0; // default values, can be overridden
+
+// Public Cache implementation: an array of 2^numShardBits LRUCache shards.
+// Keys are routed to a shard by the top numShardBits bits of their hash,
+// so per-key operations only contend on that shard's mutex.
+class ShardedLRUCache : public Cache {
+ private:
+  LRUCache* shard_;       // Array of 1 << numShardBits shards.
+  port::Mutex id_mutex_;  // Guards last_id_ only.
+  uint64_t last_id_;
+  int numShardBits;
+  size_t capacity_;       // Total capacity across all shards.
+
+  static inline uint32_t HashSlice(const Slice& s) {
+    return Hash(s.data(), s.size(), 0);
+  }
+
+  // Picks a shard from the high bits of the hash.
+  uint32_t Shard(uint32_t hash) {
+    // Note, hash >> 32 yields hash in gcc, not the zero we expect!
+    return (numShardBits > 0) ? (hash >> (32 - numShardBits)) : 0;
+  }
+
+  // Shared constructor body: divides capacity across shards, rounding up
+  // so the per-shard total is >= the requested capacity.
+  void init(size_t capacity, int numbits, int removeScanCountLimit) {
+    numShardBits = numbits;
+    capacity_ = capacity;
+    int numShards = 1 << numShardBits;
+    shard_ = new LRUCache[numShards];
+    const size_t per_shard = (capacity + (numShards - 1)) / numShards;
+    for (int s = 0; s < numShards; s++) {
+      shard_[s].SetCapacity(per_shard);
+      shard_[s].SetRemoveScanCountLimit(removeScanCountLimit);
+    }
+  }
+
+ public:
+  explicit ShardedLRUCache(size_t capacity)
+      : last_id_(0) {
+    init(capacity, kNumShardBits, kRemoveScanCountLimit);
+  }
+  ShardedLRUCache(size_t capacity, int numShardBits,
+                  int removeScanCountLimit)
+     : last_id_(0) {
+    init(capacity, numShardBits, removeScanCountLimit);
+  }
+  virtual ~ShardedLRUCache() {
+    delete[] shard_;
+  }
+  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value)) {
+    const uint32_t hash = HashSlice(key);
+    return shard_[Shard(hash)].Insert(key, hash, value, charge, deleter);
+  }
+  virtual Handle* Lookup(const Slice& key) {
+    const uint32_t hash = HashSlice(key);
+    return shard_[Shard(hash)].Lookup(key, hash);
+  }
+  virtual void Release(Handle* handle) {
+    // Route via the hash stored in the handle itself; no re-hash needed.
+    LRUHandle* h = reinterpret_cast<LRUHandle*>(handle);
+    shard_[Shard(h->hash)].Release(handle);
+  }
+  virtual void Erase(const Slice& key) {
+    const uint32_t hash = HashSlice(key);
+    shard_[Shard(hash)].Erase(key, hash);
+  }
+  virtual void* Value(Handle* handle) {
+    return reinterpret_cast<LRUHandle*>(handle)->value;
+  }
+  virtual uint64_t NewId() {
+    MutexLock l(&id_mutex_);
+    return ++(last_id_);
+  }
+  virtual size_t GetCapacity() {
+    return capacity_;
+  }
+};
+
+}  // end anonymous namespace
+
+// Factory overloads: each forwards to the fullest form with the
+// file-level defaults (kNumShardBits, kRemoveScanCountLimit).
+shared_ptr<Cache> NewLRUCache(size_t capacity) {
+  return NewLRUCache(capacity, kNumShardBits);
+}
+
+shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits) {
+  return NewLRUCache(capacity, numShardBits, kRemoveScanCountLimit);
+}
+
+// Returns nullptr when numShardBits >= 20 (too many shards); callers must
+// check the result.
+shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
+                              int removeScanCountLimit) {
+  if (numShardBits >= 20) {
+    return nullptr;  // the cache cannot be sharded into too many fine pieces
+  }
+  return std::make_shared<ShardedLRUCache>(capacity,
+                                           numShardBits,
+                                           removeScanCountLimit);
+}
+
+}  // namespace rocksdb
diff --git a/util/cache_test.cc b/util/cache_test.cc
new file mode 100644 (file)
index 0000000..87ab913
--- /dev/null
@@ -0,0 +1,391 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/cache.h"
+
+#include <vector>
+#include <string>
+#include <iostream>
+#include "util/coding.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+// Conversions between numeric keys/values and the types expected by Cache.
+// Keys are 4-byte fixed32 encodings of ints; values are ints smuggled
+// through the cache's void* slot (never dereferenced as pointers).
+static std::string EncodeKey(int k) {
+  std::string result;
+  PutFixed32(&result, k);
+  return result;
+}
+static int DecodeKey(const Slice& k) {
+  assert(k.size() == 4);
+  return DecodeFixed32(k.data());
+}
+static void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); }
+static int DecodeValue(void* v) { return reinterpret_cast<uintptr_t>(v); }
+
+// Test fixture. Owns two caches: cache_ (scan limit smaller than size)
+// and cache2_ (scan limit larger than its size, to hit the boundary case
+// in EvictionPolicyRefLargeScanLimit). Deleter records every eviction in
+// deleted_keys_/deleted_values_ via the current_ back-pointer.
+class CacheTest {
+ public:
+  static CacheTest* current_;  // Set by the constructor of each test.
+
+  // Cache deleter callback: logs the (key, value) pair being destroyed.
+  static void Deleter(const Slice& key, void* v) {
+    current_->deleted_keys_.push_back(DecodeKey(key));
+    current_->deleted_values_.push_back(DecodeValue(v));
+  }
+
+  static const int kCacheSize = 1000;
+  static const int kNumShardBits = 4;
+  static const int kRemoveScanCountLimit = 16;
+
+  static const int kCacheSize2 = 100;
+  static const int kNumShardBits2 = 2;
+  static const int kRemoveScanCountLimit2 = 200;
+
+  std::vector<int> deleted_keys_;    // Keys seen by Deleter, in order.
+  std::vector<int> deleted_values_;  // Matching values, in order.
+  shared_ptr<Cache> cache_;
+  shared_ptr<Cache> cache2_;
+
+  CacheTest() :
+      cache_(NewLRUCache(kCacheSize, kNumShardBits, kRemoveScanCountLimit)),
+      cache2_(NewLRUCache(kCacheSize2, kNumShardBits2,
+                          kRemoveScanCountLimit2)) {
+    current_ = this;
+  }
+
+  ~CacheTest() {
+  }
+
+  // Returns the cached value for key, or -1 on a miss.
+  int Lookup(shared_ptr<Cache> cache, int key) {
+    Cache::Handle* handle = cache->Lookup(EncodeKey(key));
+    const int r = (handle == nullptr) ? -1 : DecodeValue(cache->Value(handle));
+    if (handle != nullptr) {
+      cache->Release(handle);
+    }
+    return r;
+  }
+
+  // Inserts and immediately releases the handle, leaving only the
+  // cache's own reference.
+  void Insert(shared_ptr<Cache> cache, int key, int value, int charge = 1) {
+    cache->Release(cache->Insert(EncodeKey(key), EncodeValue(value), charge,
+                                  &CacheTest::Deleter));
+  }
+
+  void Erase(shared_ptr<Cache> cache, int key) {
+    cache->Erase(EncodeKey(key));
+  }
+
+
+  // Convenience wrappers bound to cache_ / cache2_.
+  int Lookup(int key) {
+    return Lookup(cache_, key);
+  }
+
+  void Insert(int key, int value, int charge = 1) {
+    Insert(cache_, key, value, charge);
+  }
+
+  void Erase(int key) {
+    Erase(cache_, key);
+  }
+
+  int Lookup2(int key) {
+    return Lookup(cache2_, key);
+  }
+
+  void Insert2(int key, int value, int charge = 1) {
+    Insert(cache2_, key, value, charge);
+  }
+
+  void Erase2(int key) {
+    Erase(cache2_, key);
+  }
+};
+CacheTest* CacheTest::current_;
+
+// Basic hit/miss behavior, and that re-inserting a key replaces the old
+// value and fires its deleter.
+TEST(CacheTest, HitAndMiss) {
+  ASSERT_EQ(-1, Lookup(100));
+
+  Insert(100, 101);
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(-1,  Lookup(200));
+  ASSERT_EQ(-1,  Lookup(300));
+
+  Insert(200, 201);
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(-1,  Lookup(300));
+
+  Insert(100, 102);
+  ASSERT_EQ(102, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(-1,  Lookup(300));
+
+  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+}
+
+// Erase removes the entry and fires the deleter exactly once; erasing a
+// missing key is a no-op.
+TEST(CacheTest, Erase) {
+  Erase(200);
+  ASSERT_EQ(0U, deleted_keys_.size());
+
+  Insert(100, 101);
+  Insert(200, 201);
+  Erase(100);
+  ASSERT_EQ(-1,  Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+
+  Erase(100);
+  ASSERT_EQ(-1,  Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(1U, deleted_keys_.size());
+}
+
+// An entry displaced or erased while a handle is outstanding is only
+// deleted when that handle is released.
+TEST(CacheTest, EntriesArePinned) {
+  Insert(100, 101);
+  Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(101, DecodeValue(cache_->Value(h1)));
+
+  Insert(100, 102);
+  Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
+  ASSERT_EQ(0U, deleted_keys_.size());
+
+  cache_->Release(h1);
+  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+
+  Erase(100);
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(1U, deleted_keys_.size());
+
+  cache_->Release(h2);
+  ASSERT_EQ(2U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[1]);
+  ASSERT_EQ(102, deleted_values_[1]);
+}
+
+// A frequently-touched entry survives an overflow of inserts, while an
+// untouched one is evicted.
+TEST(CacheTest, EvictionPolicy) {
+  Insert(100, 101);
+  Insert(200, 201);
+
+  // Frequently used entry must be kept around
+  for (int i = 0; i < kCacheSize + 100; i++) {
+    Insert(1000+i, 2000+i);
+    ASSERT_EQ(2000+i, Lookup(1000+i));
+    ASSERT_EQ(101, Lookup(100));
+  }
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(-1, Lookup(200));
+}
+
+// With the scan-count limit active, entries with outstanding handles are
+// skipped by eviction while unreferenced ones are evicted.
+TEST(CacheTest, EvictionPolicyRef) {
+  Insert(100, 101);
+  Insert(101, 102);
+  Insert(102, 103);
+  Insert(103, 104);
+  Insert(200, 101);
+  Insert(201, 102);
+  Insert(202, 103);
+  Insert(203, 104);
+  Cache::Handle* h201 = cache_->Lookup(EncodeKey(200));
+  Cache::Handle* h202 = cache_->Lookup(EncodeKey(201));
+  Cache::Handle* h203 = cache_->Lookup(EncodeKey(202));
+  Cache::Handle* h204 = cache_->Lookup(EncodeKey(203));
+  Insert(300, 101);
+  Insert(301, 102);
+  Insert(302, 103);
+  Insert(303, 104);
+
+  // Insert entries much more than Cache capacity
+  for (int i = 0; i < kCacheSize + 100; i++) {
+    Insert(1000 + i, 2000 + i);
+  }
+
+  // Check whether the entries inserted in the beginning
+  // are evicted. Ones without extra ref are evicted and
+  // those with are not.
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(-1, Lookup(101));
+  ASSERT_EQ(-1, Lookup(102));
+  ASSERT_EQ(-1, Lookup(103));
+
+  ASSERT_EQ(-1, Lookup(300));
+  ASSERT_EQ(-1, Lookup(301));
+  ASSERT_EQ(-1, Lookup(302));
+  ASSERT_EQ(-1, Lookup(303));
+
+  ASSERT_EQ(101, Lookup(200));
+  ASSERT_EQ(102, Lookup(201));
+  ASSERT_EQ(103, Lookup(202));
+  ASSERT_EQ(104, Lookup(203));
+
+  // Cleaning up all the handles
+  cache_->Release(h201);
+  cache_->Release(h202);
+  cache_->Release(h203);
+  cache_->Release(h204);
+}
+
+// When nearly everything is pinned, strict-LRU fallback still evicts
+// referenced entries from the table so usage can shrink.
+TEST(CacheTest, EvictionPolicyRef2) {
+  std::vector<Cache::Handle*> handles;
+
+  Insert(100, 101);
+  // Insert entries much more than Cache capacity
+  for (int i = 0; i < kCacheSize + 100; i++) {
+    Insert(1000 + i, 2000 + i);
+    if (i < kCacheSize ) {
+      handles.push_back(cache_->Lookup(EncodeKey(1000 + i)));
+    }
+  }
+
+  // Make sure referenced keys are also possible to be deleted
+  // if there are not sufficient non-referenced keys
+  for (int i = 0; i < 5; i++) {
+    ASSERT_EQ(-1, Lookup(1000 + i));
+  }
+
+  for (int i = kCacheSize; i < kCacheSize + 100; i++) {
+    ASSERT_EQ(2000 + i, Lookup(1000 + i));
+  }
+  ASSERT_EQ(-1, Lookup(100));
+
+  // Cleaning up all the handles
+  while (handles.size() > 0) {
+    cache_->Release(handles.back());
+    handles.pop_back();
+  }
+}
+
+TEST(CacheTest, EvictionPolicyRefLargeScanLimit) {
+  std::vector<Cache::Handle*> handles2;
+
+  // Cache2 has a cache RemoveScanCountLimit higher than cache size
+  // so it would trigger a boundary condition.
+
+  // Populate the cache with 10 more keys than its size.
+  // Reference all keys except one close to the end.
+  for (int i = 0; i < kCacheSize2 + 10; i++) {
+    Insert2(1000 + i, 2000+i);
+    if (i != kCacheSize2 ) {
+      handles2.push_back(cache2_->Lookup(EncodeKey(1000 + i)));
+    }
+  }
+
+  // Make sure referenced keys are also possible to be deleted
+  // if there are not sufficient non-referenced keys
+  for (int i = 0; i < 3; i++) {
+    ASSERT_EQ(-1, Lookup2(1000 + i));
+  }
+  // The non-referenced value is deleted even if it's accessed
+  // recently.
+  ASSERT_EQ(-1, Lookup2(1000 + kCacheSize2));
+  // Other values recently accessed are not deleted since they
+  // are referenced.
+  for (int i = kCacheSize2 - 10; i < kCacheSize2 + 10; i++) {
+    if (i != kCacheSize2) {
+      ASSERT_EQ(2000 + i, Lookup2(1000 + i));
+    }
+  }
+
+  // Cleaning up all the handles
+  while (handles2.size() > 0) {
+    cache2_->Release(handles2.back());
+    handles2.pop_back();
+  }
+}
+
+
+
+TEST(CacheTest, HeavyEntries) {
+  // Add a bunch of light and heavy entries and then count the combined
+  // size of items still in the cache, which must be approximately the
+  // same as the total capacity.
+  const int kLight = 1;
+  const int kHeavy = 10;
+  int added = 0;
+  int index = 0;
+  while (added < 2*kCacheSize) {
+    const int weight = (index & 1) ? kLight : kHeavy;
+    Insert(index, 1000+index, weight);
+    added += weight;
+    index++;
+  }
+
+  int cached_weight = 0;
+  for (int i = 0; i < index; i++) {
+    const int weight = (i & 1 ? kLight : kHeavy);
+    int r = Lookup(i);
+    if (r >= 0) {
+      cached_weight += weight;
+      ASSERT_EQ(1000+i, r);
+    }
+  }
+  // Allow 10% slack for per-shard rounding of capacity.
+  ASSERT_LE(cached_weight, kCacheSize + kCacheSize/10);
+}
+
+// NewId must hand out distinct ids on successive calls.
+TEST(CacheTest, NewId) {
+  uint64_t a = cache_->NewId();
+  uint64_t b = cache_->NewId();
+  ASSERT_NE(a, b);
+}
+
+
+// Heap value whose destructor logs, used to observe evictions below.
+class Value {
+ private:
+  int v_;
+ public:
+  explicit Value(int v) : v_(v) { }
+
+  ~Value() { std::cout << v_ << " is destructed\n"; }
+};
+
+// Deleter for heap-allocated Value entries.
+void deleter(const Slice& key, void* value) {
+  delete (Value *)value;
+}
+
+
+// Overfilling an unsharded cache while holding every handle: only the
+// oldest entry should be evicted from the table (it stays alive until its
+// handle is released, since eviction cannot free a referenced entry).
+TEST(CacheTest, BadEviction) {
+  int n = 10;
+
+  // a LRUCache with n entries and one shard only
+  std::shared_ptr<Cache> cache = NewLRUCache(n, 0);
+
+  std::vector<Cache::Handle*> handles(n+1);
+
+  // Insert n+1 entries, but not releasing.
+  for (int i = 0; i < n+1; i++) {
+    std::string key = std::to_string(i+1);
+    handles[i] = cache->Insert(key, new Value(i+1), 1, &deleter);
+  }
+
+  // Guess what's in the cache now?
+  for (int i = 0; i < n+1; i++) {
+    std::string key = std::to_string(i+1);
+    auto h = cache->Lookup(key);
+    std::cout << key << (h?" found\n":" not found\n");
+    // Only the first entry should be missing
+    ASSERT_TRUE(h || i == 0);
+    if (h) cache->Release(h);
+  }
+
+  for (int i = 0; i < n+1; i++) {
+    cache->Release(handles[i]);
+  }
+  std::cout << "Poor entries\n";
+}
+
+}  // namespace rocksdb
+
+// Test entry point: runs every TEST registered in this binary.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/coding.cc b/util/coding.cc
new file mode 100644 (file)
index 0000000..ce67fa4
--- /dev/null
@@ -0,0 +1,329 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/coding.h"
+
+#include <algorithm>
+
+namespace rocksdb {
+
+// Writes value into buf[0..3] in little-endian byte order. On
+// little-endian hosts this is a plain memcpy.
+void EncodeFixed32(char* buf, uint32_t value) {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+  memcpy(buf, &value, sizeof(value));
+#else
+  buf[0] = value & 0xff;
+  buf[1] = (value >> 8) & 0xff;
+  buf[2] = (value >> 16) & 0xff;
+  buf[3] = (value >> 24) & 0xff;
+#endif
+}
+
+// Writes value into buf[0..7] in little-endian byte order.
+void EncodeFixed64(char* buf, uint64_t value) {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+  memcpy(buf, &value, sizeof(value));
+#else
+  buf[0] = value & 0xff;
+  buf[1] = (value >> 8) & 0xff;
+  buf[2] = (value >> 16) & 0xff;
+  buf[3] = (value >> 24) & 0xff;
+  buf[4] = (value >> 32) & 0xff;
+  buf[5] = (value >> 40) & 0xff;
+  buf[6] = (value >> 48) & 0xff;
+  buf[7] = (value >> 56) & 0xff;
+#endif
+}
+
+// Appends the 4-byte little-endian encoding of value to *dst.
+void PutFixed32(std::string* dst, uint32_t value) {
+  char buf[sizeof(value)];
+  EncodeFixed32(buf, value);
+  dst->append(buf, sizeof(buf));
+}
+
+// Appends the 8-byte little-endian encoding of value to *dst.
+void PutFixed64(std::string* dst, uint64_t value) {
+  char buf[sizeof(value)];
+  EncodeFixed64(buf, value);
+  dst->append(buf, sizeof(buf));
+}
+
+// Encodes v as a varint (7 bits per byte, MSB = continuation) into dst,
+// which must have room for up to 5 bytes. Returns one past the last byte
+// written. Branches are unrolled per output length.
+char* EncodeVarint32(char* dst, uint32_t v) {
+  // Operate on characters as unsigneds
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  static const int B = 128;
+  if (v < (1<<7)) {
+    *(ptr++) = v;
+  } else if (v < (1<<14)) {
+    *(ptr++) = v | B;
+    *(ptr++) = v>>7;
+  } else if (v < (1<<21)) {
+    *(ptr++) = v | B;
+    *(ptr++) = (v>>7) | B;
+    *(ptr++) = v>>14;
+  } else if (v < (1<<28)) {
+    *(ptr++) = v | B;
+    *(ptr++) = (v>>7) | B;
+    *(ptr++) = (v>>14) | B;
+    *(ptr++) = v>>21;
+  } else {
+    *(ptr++) = v | B;
+    *(ptr++) = (v>>7) | B;
+    *(ptr++) = (v>>14) | B;
+    *(ptr++) = (v>>21) | B;
+    *(ptr++) = v>>28;
+  }
+  return reinterpret_cast<char*>(ptr);
+}
+
+// Appends the varint encoding of v (1-5 bytes) to *dst.
+void PutVarint32(std::string* dst, uint32_t v) {
+  char buf[5];
+  char* ptr = EncodeVarint32(buf, v);
+  dst->append(buf, ptr - buf);
+}
+
+// 64-bit varint encode; dst must have room for up to 10 bytes.
+// Returns one past the last byte written.
+char* EncodeVarint64(char* dst, uint64_t v) {
+  static const unsigned int B = 128;
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  while (v >= B) {
+    *(ptr++) = (v & (B-1)) | B;
+    v >>= 7;
+  }
+  *(ptr++) = static_cast<unsigned char>(v);
+  return reinterpret_cast<char*>(ptr);
+}
+
+// Appends the varint encoding of v (1-10 bytes) to *dst.
+void PutVarint64(std::string* dst, uint64_t v) {
+  char buf[10];
+  char* ptr = EncodeVarint64(buf, v);
+  dst->append(buf, ptr - buf);
+}
+
+// Appends value to *dst, preceded by its length as a varint32.
+// NOTE(review): value.size() is a size_t narrowed to uint32_t here, so
+// slices >= 4GB would encode a truncated length — confirm callers never
+// pass such slices.
+void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
+  PutVarint32(dst, value.size());
+  dst->append(value.data(), value.size());
+}
+
+// Same as above, but for a value split across slice_parts.parts[0..n):
+// one varint32 of the total byte count, then each part in order.
+void PutLengthPrefixedSliceParts(std::string* dst,
+                                 const SliceParts& slice_parts) {
+  uint32_t total_bytes = 0;
+  for (int i = 0; i < slice_parts.num_parts; ++i) {
+    total_bytes += slice_parts.parts[i].size();
+  }
+  PutVarint32(dst, total_bytes);
+  for (int i = 0; i < slice_parts.num_parts; ++i) {
+    dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size());
+  }
+}
+
+// Returns the number of bytes (1-10) the varint encoding of v occupies.
+int VarintLength(uint64_t v) {
+  int len = 1;
+  while (v >= 128) {
+    v >>= 7;
+    len++;
+  }
+  return len;
+}
+
+// Slow path for GetVarint32Ptr: decodes a multi-byte varint32 from
+// [p, limit). Returns one past the last consumed byte and stores the
+// value in *value, or returns nullptr on truncated/over-long input.
+const char* GetVarint32PtrFallback(const char* p,
+                                   const char* limit,
+                                   uint32_t* value) {
+  uint32_t result = 0;
+  for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
+    uint32_t byte = *(reinterpret_cast<const unsigned char*>(p));
+    p++;
+    if (byte & 128) {
+      // More bytes are present
+      result |= ((byte & 127) << shift);
+    } else {
+      result |= (byte << shift);
+      *value = result;
+      return reinterpret_cast<const char*>(p);
+    }
+  }
+  return nullptr;
+}
+
+// Decodes a varint32 from the front of *input, advancing it past the
+// consumed bytes. Returns false on malformed/truncated input.
+bool GetVarint32(Slice* input, uint32_t* value) {
+  const char* p = input->data();
+  const char* limit = p + input->size();
+  const char* q = GetVarint32Ptr(p, limit, value);
+  if (q == nullptr) {
+    return false;
+  } else {
+    *input = Slice(q, limit - q);
+    return true;
+  }
+}
+
+// 64-bit analogue of GetVarint32PtrFallback.
+const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
+  uint64_t result = 0;
+  for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) {
+    uint64_t byte = *(reinterpret_cast<const unsigned char*>(p));
+    p++;
+    if (byte & 128) {
+      // More bytes are present
+      result |= ((byte & 127) << shift);
+    } else {
+      result |= (byte << shift);
+      *value = result;
+      return reinterpret_cast<const char*>(p);
+    }
+  }
+  return nullptr;
+}
+
+// Decodes a varint64 from the front of *input, advancing it past the
+// consumed bytes. Returns false on malformed/truncated input.
+bool GetVarint64(Slice* input, uint64_t* value) {
+  const char* p = input->data();
+  const char* limit = p + input->size();
+  const char* q = GetVarint64Ptr(p, limit, value);
+  if (q == nullptr) {
+    return false;
+  } else {
+    *input = Slice(q, limit - q);
+    return true;
+  }
+}
+
+// Parses a varint32 length followed by that many bytes from [p, limit).
+// Stores the payload (unowned view) in *result and returns one past it,
+// or nullptr if the length or payload runs past limit.
+const char* GetLengthPrefixedSlice(const char* p, const char* limit,
+                                   Slice* result) {
+  uint32_t len;
+  p = GetVarint32Ptr(p, limit, &len);
+  if (p == nullptr) return nullptr;
+  if (p + len > limit) return nullptr;
+  *result = Slice(p, len);
+  return p + len;
+}
+
+// Slice-based variant: consumes the prefix and payload from *input on
+// success; leaves *input partially advanced past the varint on failure.
+bool GetLengthPrefixedSlice(Slice* input, Slice* result) {
+  uint32_t len;
+  if (GetVarint32(input, &len) &&
+      input->size() >= len) {
+    *result = Slice(input->data(), len);
+    input->remove_prefix(len);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Unchecked variant for known-good buffers.
+// NOTE(review): if the varint at data is malformed, GetVarint32Ptr
+// returns nullptr and Slice(nullptr, len) is constructed — callers must
+// guarantee well-formed input, as the comment below assumes.
+Slice GetLengthPrefixedSlice(const char* data) {
+  uint32_t len;
+  const char* p = data;
+  p = GetVarint32Ptr(p, p + 5, &len);  // +5: we assume "p" is not corrupted
+  return Slice(p, len);
+}
+
+// Splits off and returns the prefix of *slice up to (not including) the
+// first occurrence of delimiter; *slice is advanced past the delimiter,
+// or to the end if the delimiter is absent.
+Slice GetSliceUntil(Slice* slice, char delimiter) {
+  uint32_t len;
+  for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) {
+    // nothing
+  }
+
+  Slice ret(slice->data(), len);
+  slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0));
+  return ret;
+}
+
+// Writes the low "bits" bits of value into dst starting at bit position
+// "offset" (bit 0 of each byte is the least significant). Bytes outside
+// the written bit range are preserved. dst must be pre-zeroed or hold
+// valid prior contents; bounds are assert-checked only.
+void BitStreamPutInt(char* dst, size_t dstlen, size_t offset,
+                     uint32_t bits, uint64_t value) {
+  assert((offset + bits + 7)/8 <= dstlen);
+  assert(bits <= 64);
+
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+
+  size_t byteOffset = offset / 8;
+  size_t bitOffset = offset % 8;
+
+  // This prevents unused variable warnings when compiling.
+#ifndef NDEBUG
+  // Store truncated value.
+  uint64_t origValue = (bits < 64)?(value & (((uint64_t)1 << bits) - 1)):value;
+  uint32_t origBits = bits;
+#endif
+
+  while (bits > 0) {
+    // Write up to a byte boundary per iteration; only the first
+    // iteration can start mid-byte.
+    size_t bitsToGet = std::min<size_t>(bits, 8 - bitOffset);
+    unsigned char mask = ((1 << bitsToGet) - 1);
+
+    // Clear the target bits, then OR-in (via +) the new ones.
+    ptr[byteOffset] = (ptr[byteOffset] & ~(mask << bitOffset)) +
+                      ((value & mask) << bitOffset);
+
+    value >>= bitsToGet;
+    byteOffset += 1;
+    bitOffset = 0;
+    bits -= bitsToGet;
+  }
+
+  // Round-trip check in debug builds.
+  assert(origValue == BitStreamGetInt(dst, dstlen, offset, origBits));
+}
+
+// Reads "bits" bits from src starting at bit position "offset" and
+// returns them as the low bits of the result. Inverse of BitStreamPutInt.
+uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset,
+                         uint32_t bits) {
+  assert((offset + bits + 7)/8 <= srclen);
+  assert(bits <= 64);
+
+  const unsigned char* ptr = reinterpret_cast<const unsigned char*>(src);
+
+  uint64_t result = 0;
+
+  size_t byteOffset = offset / 8;
+  size_t bitOffset = offset % 8;
+  size_t shift = 0;
+
+  while (bits > 0) {
+    size_t bitsToGet = std::min<size_t>(bits, 8 - bitOffset);
+    unsigned char mask = ((1 << bitsToGet) - 1);
+
+    result += (uint64_t)((ptr[byteOffset] >> bitOffset) & mask) << shift;
+
+    shift += bitsToGet;
+    byteOffset += 1;
+    bitOffset = 0;
+    bits -= bitsToGet;
+  }
+
+  return result;
+}
+
+// std::string variant of BitStreamPutInt. Copies the affected bytes into
+// a stack buffer, writes the bits there, and copies them back — avoiding
+// bit-level manipulation through the string's storage directly.
+void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits,
+                     uint64_t value) {
+  assert((offset + bits + 7)/8 <= dst->size());
+
+  const size_t kTmpBufLen = sizeof(value) + 1;
+  char tmpBuf[kTmpBufLen];
+
+  // Number of bytes of tmpBuf being used
+  const size_t kUsedBytes = (offset%8 + bits)/8;
+
+  // Copy relevant parts of dst to tmpBuf
+  for (size_t idx = 0; idx <= kUsedBytes; ++idx) {
+    tmpBuf[idx] = (*dst)[offset/8 + idx];
+  }
+
+  // Only the intra-byte offset remains; the byte offset was applied above.
+  BitStreamPutInt(tmpBuf, kTmpBufLen, offset%8, bits, value);
+
+  // Copy tmpBuf back to dst
+  for (size_t idx = 0; idx <= kUsedBytes; ++idx) {
+    (*dst)[offset/8 + idx] = tmpBuf[idx];
+  }
+
+  // Do the check here too as we are working with a buffer.
+  assert(((bits < 64)?(value & (((uint64_t)1 << bits) - 1)):value) ==
+         BitStreamGetInt(dst, offset, bits));
+}
+
+// Convenience readers forwarding to the raw-buffer BitStreamGetInt.
+uint64_t BitStreamGetInt(const std::string* src, size_t offset,
+                         uint32_t bits) {
+  return BitStreamGetInt(src->data(), src->size(), offset, bits);
+}
+
+uint64_t BitStreamGetInt(const Slice* src, size_t offset,
+                         uint32_t bits) {
+  return BitStreamGetInt(src->data(), src->size(), offset, bits);
+}
+
+}  // namespace rocksdb
diff --git a/util/coding.h b/util/coding.h
new file mode 100644 (file)
index 0000000..4477dc7
--- /dev/null
@@ -0,0 +1,139 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Endian-neutral encoding:
+// * Fixed-length numbers are encoded with least-significant byte first
+// * In addition we support variable length "varint" encoding
+// * Strings are encoded prefixed by their length in varint format
+
+#pragma once
+#include <stdint.h>
+#include <string.h>
+#include <string>
+#include "port/port.h"
+
+namespace rocksdb {
+
+// The maximum length of a varint in bytes for 32 and 64 bits respectively.
+const unsigned int kMaxVarint32Length = 5;
+const unsigned int kMaxVarint64Length = 10;
+
+// Standard Put... routines append to a string
+extern void PutFixed32(std::string* dst, uint32_t value);
+extern void PutFixed64(std::string* dst, uint64_t value);
+extern void PutVarint32(std::string* dst, uint32_t value);
+extern void PutVarint64(std::string* dst, uint64_t value);
+extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value);
+extern void PutLengthPrefixedSliceParts(std::string* dst,
+                                        const SliceParts& slice_parts);
+
+// Standard Get... routines parse a value from the beginning of a Slice
+// and advance the slice past the parsed value.
+extern bool GetVarint32(Slice* input, uint32_t* value);
+extern bool GetVarint64(Slice* input, uint64_t* value);
+extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
+extern Slice GetLengthPrefixedSlice(const char* data);
+
+extern Slice GetSliceUntil(Slice* slice, char delimiter);
+
+// Pointer-based variants of GetVarint...  These either store a value
+// in *v and return a pointer just past the parsed value, or return
+// nullptr on error.  These routines only look at bytes in the range
+// [p..limit-1]
+extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v);
+extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v);
+
+// Returns the length of the varint32 or varint64 encoding of "v"
+extern int VarintLength(uint64_t v);
+
+// Lower-level versions of Put... that write directly into a character buffer
+// REQUIRES: dst has enough space for the value being written
+extern void EncodeFixed32(char* dst, uint32_t value);
+extern void EncodeFixed64(char* dst, uint64_t value);
+
+// Lower-level versions of Put... that write directly into a character buffer
+// and return a pointer just past the last byte written.
+// REQUIRES: dst has enough space for the value being written
+extern char* EncodeVarint32(char* dst, uint32_t value);
+extern char* EncodeVarint64(char* dst, uint64_t value);
+
+// Lower-level versions of Get... that read directly from a character buffer
+// without any bounds checking.
+
+// Decodes a 32-bit little-endian value from ptr (no bounds checking).
+inline uint32_t DecodeFixed32(const char* ptr) {
+  if (port::kLittleEndian) {
+    // Load the raw bytes
+    uint32_t result;
+    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    return result;
+  } else {
+    // Big-endian host: assemble the value byte by byte, LSB first.
+    return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0])))
+        | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8)
+        | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16)
+        | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24));
+  }
+}
+
+// Decodes a 64-bit little-endian value from ptr (no bounds checking).
+inline uint64_t DecodeFixed64(const char* ptr) {
+  if (port::kLittleEndian) {
+    // Load the raw bytes
+    uint64_t result;
+    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    return result;
+  } else {
+    // Big-endian host: decode the two 32-bit halves and recombine.
+    uint64_t lo = DecodeFixed32(ptr);
+    uint64_t hi = DecodeFixed32(ptr + 4);
+    return (hi << 32) | lo;
+  }
+}
+
+// Internal routine for use by fallback path of GetVarint32Ptr
+extern const char* GetVarint32PtrFallback(const char* p,
+                                          const char* limit,
+                                          uint32_t* value);
+// Fast path for the common case of a one-byte varint; defers to
+// GetVarint32PtrFallback() for multi-byte encodings or empty input.
+inline const char* GetVarint32Ptr(const char* p,
+                                  const char* limit,
+                                  uint32_t* value) {
+  if (p < limit) {
+    uint32_t result = *(reinterpret_cast<const unsigned char*>(p));
+    if ((result & 128) == 0) {
+      // High bit clear: the varint is exactly this single byte.
+      *value = result;
+      return p + 1;
+    }
+  }
+  return GetVarint32PtrFallback(p, limit, value);
+}
+
+// Writes an unsigned integer with bits number of bits with its least
+// significant bit at offset.
+// Bits are numbered from 0 to 7 in the first byte, 8 to 15 in the second and
+// so on.
+// value is truncated to the bits number of least significant bits.
+// REQUIRES: (offset+bits+7)/8 <= dstlen
+// REQUIRES: bits <= 64
+extern void BitStreamPutInt(char* dst, size_t dstlen, size_t offset,
+                            uint32_t bits, uint64_t value);
+
+// Reads an unsigned integer with bits number of bits with its least
+// significant bit at offset.
+// Bits are numbered in the same way as BitStreamPutInt().
+// REQUIRES: (offset+bits+7)/8 <= srclen
+// REQUIRES: bits <= 64
+extern uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset,
+                                uint32_t bits);
+
+// Convenience functions
+extern void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits,
+                            uint64_t value);
+extern uint64_t BitStreamGetInt(const std::string* src, size_t offset,
+                                uint32_t bits);
+extern uint64_t BitStreamGetInt(const Slice* src, size_t offset,
+                                uint32_t bits);
+
+}  // namespace rocksdb
diff --git a/util/coding_test.cc b/util/coding_test.cc
new file mode 100644 (file)
index 0000000..fb06132
--- /dev/null
@@ -0,0 +1,296 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/coding.h"
+
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+// Tag type used by the TEST macro to group the coding tests.
+class Coding { };
+
+// Round-trips 100k consecutive values through PutFixed32/DecodeFixed32.
+TEST(Coding, Fixed32) {
+  std::string s;
+  for (uint32_t v = 0; v < 100000; v++) {
+    PutFixed32(&s, v);
+  }
+
+  const char* p = s.data();
+  for (uint32_t v = 0; v < 100000; v++) {
+    uint32_t actual = DecodeFixed32(p);
+    ASSERT_EQ(v, actual);
+    p += sizeof(uint32_t);
+  }
+}
+
+// Round-trips values near every power of two through
+// PutFixed64/DecodeFixed64.
+TEST(Coding, Fixed64) {
+  std::string s;
+  for (int power = 0; power <= 63; power++) {
+    uint64_t v = static_cast<uint64_t>(1) << power;
+    PutFixed64(&s, v - 1);
+    PutFixed64(&s, v + 0);
+    PutFixed64(&s, v + 1);
+  }
+
+  const char* p = s.data();
+  for (int power = 0; power <= 63; power++) {
+    uint64_t v = static_cast<uint64_t>(1) << power;
+    uint64_t actual;
+    actual = DecodeFixed64(p);
+    ASSERT_EQ(v-1, actual);
+    p += sizeof(uint64_t);
+
+    actual = DecodeFixed64(p);
+    ASSERT_EQ(v+0, actual);
+    p += sizeof(uint64_t);
+
+    actual = DecodeFixed64(p);
+    ASSERT_EQ(v+1, actual);
+    p += sizeof(uint64_t);
+  }
+}
+
+// Test that encoding routines generate little-endian encodings
+TEST(Coding, EncodingOutput) {
+  std::string dst;
+  PutFixed32(&dst, 0x04030201);
+  ASSERT_EQ(4U, dst.size());
+  ASSERT_EQ(0x01, static_cast<int>(dst[0]));
+  ASSERT_EQ(0x02, static_cast<int>(dst[1]));
+  ASSERT_EQ(0x03, static_cast<int>(dst[2]));
+  ASSERT_EQ(0x04, static_cast<int>(dst[3]));
+
+  dst.clear();
+  PutFixed64(&dst, 0x0807060504030201ull);
+  ASSERT_EQ(8U, dst.size());
+  ASSERT_EQ(0x01, static_cast<int>(dst[0]));
+  ASSERT_EQ(0x02, static_cast<int>(dst[1]));
+  ASSERT_EQ(0x03, static_cast<int>(dst[2]));
+  ASSERT_EQ(0x04, static_cast<int>(dst[3]));
+  ASSERT_EQ(0x05, static_cast<int>(dst[4]));
+  ASSERT_EQ(0x06, static_cast<int>(dst[5]));
+  ASSERT_EQ(0x07, static_cast<int>(dst[6]));
+  ASSERT_EQ(0x08, static_cast<int>(dst[7]));
+}
+
+// Round-trips varint32 values of every encoded length and verifies
+// VarintLength() agrees with the bytes actually consumed.
+TEST(Coding, Varint32) {
+  std::string s;
+  for (uint32_t i = 0; i < (32 * 32); i++) {
+    uint32_t v = (i / 32) << (i % 32);
+    PutVarint32(&s, v);
+  }
+
+  const char* p = s.data();
+  const char* limit = p + s.size();
+  for (uint32_t i = 0; i < (32 * 32); i++) {
+    uint32_t expected = (i / 32) << (i % 32);
+    uint32_t actual;
+    const char* start = p;
+    p = GetVarint32Ptr(p, limit, &actual);
+    ASSERT_TRUE(p != nullptr);
+    ASSERT_EQ(expected, actual);
+    ASSERT_EQ(VarintLength(actual), p - start);
+  }
+  ASSERT_EQ(p, s.data() + s.size());
+}
+
+// Round-trips varint64 values near every power of two plus some
+// special values (0, 100, max, max-1).
+TEST(Coding, Varint64) {
+  // Construct the list of values to check
+  std::vector<uint64_t> values;
+  // Some special values
+  values.push_back(0);
+  values.push_back(100);
+  values.push_back(~static_cast<uint64_t>(0));
+  values.push_back(~static_cast<uint64_t>(0) - 1);
+  for (uint32_t k = 0; k < 64; k++) {
+    // Test values near powers of two
+    const uint64_t power = 1ull << k;
+    values.push_back(power);
+    values.push_back(power-1);
+    values.push_back(power+1);
+  };
+
+  std::string s;
+  for (unsigned int i = 0; i < values.size(); i++) {
+    PutVarint64(&s, values[i]);
+  }
+
+  const char* p = s.data();
+  const char* limit = p + s.size();
+  for (unsigned int i = 0; i < values.size(); i++) {
+    ASSERT_TRUE(p < limit);
+    uint64_t actual;
+    const char* start = p;
+    p = GetVarint64Ptr(p, limit, &actual);
+    ASSERT_TRUE(p != nullptr);
+    ASSERT_EQ(values[i], actual);
+    ASSERT_EQ(VarintLength(actual), p - start);
+  }
+  ASSERT_EQ(p, limit);
+
+}
+
+// A 6-byte continuation sequence exceeds varint32's 5-byte maximum, so
+// decoding must fail (return nullptr).
+TEST(Coding, Varint32Overflow) {
+  uint32_t result;
+  std::string input("\x81\x82\x83\x84\x85\x11");
+  ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result)
+              == nullptr);
+}
+
+// Every strict prefix of a maximal varint32 encoding must fail to decode;
+// the full encoding must succeed.
+TEST(Coding, Varint32Truncation) {
+  uint32_t large_value = (1u << 31) + 100;
+  std::string s;
+  PutVarint32(&s, large_value);
+  uint32_t result;
+  for (unsigned int len = 0; len < s.size() - 1; len++) {
+    ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == nullptr);
+  }
+  ASSERT_TRUE(
+      GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != nullptr);
+  ASSERT_EQ(large_value, result);
+}
+
+// An 11-byte continuation sequence exceeds varint64's 10-byte maximum, so
+// decoding must fail (return nullptr).
+TEST(Coding, Varint64Overflow) {
+  uint64_t result;
+  std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11");
+  ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result)
+              == nullptr);
+}
+
+// Same truncation check as Varint32Truncation, for 64-bit varints.
+TEST(Coding, Varint64Truncation) {
+  uint64_t large_value = (1ull << 63) + 100ull;
+  std::string s;
+  PutVarint64(&s, large_value);
+  uint64_t result;
+  for (unsigned int len = 0; len < s.size() - 1; len++) {
+    ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == nullptr);
+  }
+  ASSERT_TRUE(
+      GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != nullptr);
+  ASSERT_EQ(large_value, result);
+}
+
+// Round-trips several length-prefixed slices (including an empty one and a
+// 200-byte one whose length needs a multi-byte varint) and checks the input
+// is fully consumed.
+TEST(Coding, Strings) {
+  std::string s;
+  PutLengthPrefixedSlice(&s, Slice(""));
+  PutLengthPrefixedSlice(&s, Slice("foo"));
+  PutLengthPrefixedSlice(&s, Slice("bar"));
+  PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x')));
+
+  Slice input(s);
+  Slice v;
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ("", v.ToString());
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ("foo", v.ToString());
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ("bar", v.ToString());
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ(std::string(200, 'x'), v.ToString());
+  ASSERT_EQ("", input.ToString());
+}
+
+// Exercises the raw-buffer BitStreamPutInt/BitStreamGetInt pair: byte-aligned
+// writes, unaligned (bit-offset) writes, sub-byte packing, and a full 64-bit
+// value. bytes[kNumBytes] acts as a guard byte that must stay untouched.
+TEST(Coding, BitStream) {
+  const int kNumBytes = 10;
+  char bytes[kNumBytes+1];
+  for (int i = 0; i < kNumBytes + 1; ++i) {
+      bytes[i] = '\0';
+  }
+
+  // Simple byte aligned test.
+  for (int i = 0; i < kNumBytes; ++i) {
+    BitStreamPutInt(bytes, kNumBytes, i*8, 8, 255-i);
+
+    ASSERT_EQ((unsigned char)bytes[i], (unsigned char)(255-i));
+  }
+  for (int i = 0; i < kNumBytes; ++i) {
+    ASSERT_EQ(BitStreamGetInt(bytes, kNumBytes, i*8, 8), (uint32_t)(255-i));
+  }
+  ASSERT_EQ(bytes[kNumBytes], '\0');
+
+  // Write and read back at strange offsets
+  for (int i = 0; i < kNumBytes + 1; ++i) {
+      bytes[i] = '\0';
+  }
+  for (int i = 0; i < kNumBytes; ++i) {
+    BitStreamPutInt(bytes, kNumBytes, i*5+1, 4, (i * 7) % (1 << 4));
+  }
+  for (int i = 0; i < kNumBytes; ++i) {
+    ASSERT_EQ(BitStreamGetInt(bytes, kNumBytes, i*5+1, 4),
+              (uint32_t)((i * 7) % (1 << 4)));
+  }
+  ASSERT_EQ(bytes[kNumBytes], '\0');
+
+  // Create 11011011 as a bit pattern
+  // (three 2-bit writes of 3 at bit offsets 0, 3 and 6 of each byte).
+  for (int i = 0; i < kNumBytes + 1; ++i) {
+      bytes[i] = '\0';
+  }
+  for (int i = 0; i < kNumBytes; ++i) {
+    BitStreamPutInt(bytes, kNumBytes, i*8, 2, 3);
+    BitStreamPutInt(bytes, kNumBytes, i*8+3, 2, 3);
+    BitStreamPutInt(bytes, kNumBytes, i*8+6, 2, 3);
+
+    ASSERT_EQ((unsigned char)bytes[i],
+              (unsigned char)(3 + (3 << 3) + (3 << 6)));
+  }
+  ASSERT_EQ(bytes[kNumBytes], '\0');
+
+
+  // Test large values
+  for (int i = 0; i < kNumBytes + 1; ++i) {
+      bytes[i] = '\0';
+  }
+  BitStreamPutInt(bytes, kNumBytes, 0, 64, (uint64_t)(-1));
+  for (int i = 0; i < 64/8; ++i) {
+    ASSERT_EQ((unsigned char)bytes[i],
+              (unsigned char)(255));
+  }
+  ASSERT_EQ(bytes[64/8], '\0');
+
+
+}
+
+// Exercises the std::string/Slice convenience overloads: independent sub-byte
+// writes, a write that crosses a byte boundary, and a full 64-bit value.
+TEST(Coding, BitStreamConvenienceFuncs) {
+  std::string bytes(1, '\0');
+
+  // Check that independent changes to byte are preserved.
+  BitStreamPutInt(&bytes, 0, 2, 3);
+  BitStreamPutInt(&bytes, 3, 2, 3);
+  BitStreamPutInt(&bytes, 6, 2, 3);
+  ASSERT_EQ((unsigned char)bytes[0], (unsigned char)(3 + (3 << 3) + (3 << 6)));
+  ASSERT_EQ(BitStreamGetInt(&bytes, 0, 2), 3u);
+  ASSERT_EQ(BitStreamGetInt(&bytes, 3, 2), 3u);
+  ASSERT_EQ(BitStreamGetInt(&bytes, 6, 2), 3u);
+  Slice slice(bytes);
+  ASSERT_EQ(BitStreamGetInt(&slice, 0, 2), 3u);
+  ASSERT_EQ(BitStreamGetInt(&slice, 3, 2), 3u);
+  ASSERT_EQ(BitStreamGetInt(&slice, 6, 2), 3u);
+
+  // Test overlapping crossing over byte boundaries
+  // (2 bits land in byte 0, the other 2 in byte 1).
+  bytes = std::string(2, '\0');
+  BitStreamPutInt(&bytes, 6, 4, 15);
+  ASSERT_EQ((unsigned char)bytes[0], 3 << 6);
+  ASSERT_EQ((unsigned char)bytes[1], 3);
+  ASSERT_EQ(BitStreamGetInt(&bytes, 6, 4), 15u);
+  slice = Slice(bytes);
+  ASSERT_EQ(BitStreamGetInt(&slice, 6, 4), 15u);
+
+  // Test 64-bit number
+  bytes = std::string(64/8, '\0');
+  BitStreamPutInt(&bytes, 0, 64, (uint64_t)(-1));
+  ASSERT_EQ(BitStreamGetInt(&bytes, 0, 64), (uint64_t)(-1));
+  slice = Slice(bytes);
+  ASSERT_EQ(BitStreamGetInt(&slice, 0, 64), (uint64_t)(-1));
+}
+
+}  // namespace rocksdb
+
+// Test entry point: runs every TEST registered above via the test harness.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/comparator.cc b/util/comparator.cc
new file mode 100644 (file)
index 0000000..adeacac
--- /dev/null
@@ -0,0 +1,86 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <stdint.h>
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "port/port.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+Comparator::~Comparator() { }
+
+namespace {
+// Default comparator: orders slices by raw byte-wise comparison
+// (Slice::compare semantics).
+class BytewiseComparatorImpl : public Comparator {
+ public:
+  BytewiseComparatorImpl() { }
+
+  virtual const char* Name() const {
+    // Kept as "leveldb." for compatibility with existing databases.
+    return "leveldb.BytewiseComparator";
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    return a.compare(b);
+  }
+
+  // If possible, shortens *start to a shorter string that is still >= *start
+  // and < limit; leaves *start unchanged when no safe shortening exists.
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+    // Find length of common prefix
+    size_t min_length = std::min(start->size(), limit.size());
+    size_t diff_index = 0;
+    while ((diff_index < min_length) &&
+           ((*start)[diff_index] == limit[diff_index])) {
+      diff_index++;
+    }
+
+    if (diff_index >= min_length) {
+      // Do not shorten if one string is a prefix of the other
+    } else {
+      uint8_t diff_byte = static_cast<uint8_t>((*start)[diff_index]);
+      // Only shorten when bumping the first differing byte keeps the result
+      // strictly below limit.
+      if (diff_byte < static_cast<uint8_t>(0xff) &&
+          diff_byte + 1 < static_cast<uint8_t>(limit[diff_index])) {
+        (*start)[diff_index]++;
+        start->resize(diff_index + 1);
+        assert(Compare(*start, limit) < 0);
+      }
+    }
+  }
+
+  // Changes *key to a short string >= *key by incrementing the first
+  // non-0xff byte and truncating after it.
+  virtual void FindShortSuccessor(std::string* key) const {
+    // Find first character that can be incremented
+    size_t n = key->size();
+    for (size_t i = 0; i < n; i++) {
+      const uint8_t byte = (*key)[i];
+      if (byte != static_cast<uint8_t>(0xff)) {
+        (*key)[i] = byte + 1;
+        key->resize(i+1);
+        return;
+      }
+    }
+    // *key is a run of 0xffs.  Leave it alone.
+  }
+};
+}  // namespace
+
+// Process-wide singleton; intentionally never deleted.
+static port::OnceType once = LEVELDB_ONCE_INIT;
+static const Comparator* bytewise;
+
+static void InitModule() {
+  bytewise = new BytewiseComparatorImpl;
+}
+
+// Returns the shared bytewise comparator. First-call construction is
+// serialized through port::InitOnce, so concurrent callers are safe.
+const Comparator* BytewiseComparator() {
+  port::InitOnce(&once, InitModule);
+  return bytewise;
+}
+
+}  // namespace rocksdb
diff --git a/util/crc32c.cc b/util/crc32c.cc
new file mode 100644 (file)
index 0000000..bca955a
--- /dev/null
@@ -0,0 +1,393 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A portable implementation of crc32c, optimized to handle
+// four bytes at a time.
+
+#include "util/crc32c.h"
+
+#include <stdint.h>
+#ifdef __SSE4_2__
+#include <nmmintrin.h>
+#endif
+#include "util/coding.h"
+
+namespace rocksdb {
+namespace crc32c {
+
+static const uint32_t table0_[256] = {
+  0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4,
+  0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
+  0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
+  0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
+  0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b,
+  0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
+  0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54,
+  0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
+  0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a,
+  0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
+  0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5,
+  0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
+  0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45,
+  0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
+  0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a,
+  0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
+  0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48,
+  0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
+  0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687,
+  0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
+  0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927,
+  0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
+  0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8,
+  0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
+  0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096,
+  0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
+  0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859,
+  0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
+  0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9,
+  0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
+  0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36,
+  0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
+  0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c,
+  0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
+  0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043,
+  0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
+  0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3,
+  0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
+  0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c,
+  0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
+  0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652,
+  0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
+  0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d,
+  0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
+  0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d,
+  0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
+  0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2,
+  0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
+  0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530,
+  0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
+  0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff,
+  0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
+  0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f,
+  0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
+  0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90,
+  0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
+  0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee,
+  0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
+  0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321,
+  0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
+  0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81,
+  0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
+  0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e,
+  0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351
+};
+static const uint32_t table1_[256] = {
+  0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899,
+  0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945,
+  0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21,
+  0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd,
+  0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918,
+  0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4,
+  0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0,
+  0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c,
+  0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b,
+  0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47,
+  0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823,
+  0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff,
+  0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a,
+  0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6,
+  0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2,
+  0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e,
+  0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d,
+  0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41,
+  0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25,
+  0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9,
+  0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c,
+  0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0,
+  0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4,
+  0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78,
+  0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f,
+  0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43,
+  0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27,
+  0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb,
+  0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e,
+  0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2,
+  0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6,
+  0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a,
+  0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260,
+  0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc,
+  0x66d73941, 0x7575a136, 0x419209af, 0x523091d8,
+  0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004,
+  0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1,
+  0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d,
+  0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059,
+  0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185,
+  0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162,
+  0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be,
+  0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da,
+  0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306,
+  0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3,
+  0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f,
+  0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b,
+  0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287,
+  0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464,
+  0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8,
+  0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc,
+  0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600,
+  0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5,
+  0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439,
+  0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d,
+  0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781,
+  0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766,
+  0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba,
+  0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de,
+  0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502,
+  0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7,
+  0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b,
+  0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f,
+  0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483
+};
+static const uint32_t table2_[256] = {
+  0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073,
+  0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469,
+  0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6,
+  0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac,
+  0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9,
+  0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3,
+  0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c,
+  0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726,
+  0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67,
+  0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d,
+  0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2,
+  0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8,
+  0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed,
+  0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7,
+  0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828,
+  0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32,
+  0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa,
+  0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0,
+  0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f,
+  0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75,
+  0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20,
+  0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a,
+  0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5,
+  0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff,
+  0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe,
+  0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4,
+  0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b,
+  0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161,
+  0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634,
+  0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e,
+  0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1,
+  0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb,
+  0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730,
+  0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a,
+  0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5,
+  0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def,
+  0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba,
+  0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0,
+  0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f,
+  0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065,
+  0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24,
+  0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e,
+  0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1,
+  0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb,
+  0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae,
+  0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4,
+  0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b,
+  0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71,
+  0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9,
+  0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3,
+  0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c,
+  0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36,
+  0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63,
+  0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79,
+  0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6,
+  0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc,
+  0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd,
+  0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7,
+  0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238,
+  0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622,
+  0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177,
+  0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d,
+  0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2,
+  0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8
+};
+static const uint32_t table3_[256] = {
+  0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939,
+  0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca,
+  0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf,
+  0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c,
+  0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804,
+  0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7,
+  0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2,
+  0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11,
+  0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2,
+  0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41,
+  0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54,
+  0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7,
+  0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f,
+  0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c,
+  0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69,
+  0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a,
+  0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de,
+  0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d,
+  0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538,
+  0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb,
+  0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3,
+  0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610,
+  0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405,
+  0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6,
+  0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255,
+  0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6,
+  0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3,
+  0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040,
+  0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368,
+  0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b,
+  0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e,
+  0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d,
+  0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006,
+  0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5,
+  0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0,
+  0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213,
+  0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b,
+  0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8,
+  0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd,
+  0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e,
+  0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d,
+  0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e,
+  0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b,
+  0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698,
+  0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0,
+  0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443,
+  0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656,
+  0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5,
+  0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1,
+  0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12,
+  0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07,
+  0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4,
+  0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc,
+  0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f,
+  0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a,
+  0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9,
+  0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a,
+  0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99,
+  0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c,
+  0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f,
+  0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57,
+  0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4,
+  0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1,
+  0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842
+};
+
+// Used to fetch a naturally-aligned 32-bit word in little endian byte-order
+static inline uint32_t LE_LOAD32(const uint8_t *p) {
+  return DecodeFixed32(reinterpret_cast<const char*>(p));
+}
+
+// 64-bit little-endian load; feeds the 8-bytes-at-a-time SSE4.2 path.
+static inline uint64_t LE_LOAD64(const uint8_t *p) {
+  return DecodeFixed64(reinterpret_cast<const char*>(p));
+}
+
+// Table-driven (slice-by-4) CRC32C update: folds the next 8 input bytes into
+// *l and advances *p by 8. Software fallback for Fast_CRC32().
+static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) {
+  uint32_t c = *l ^ LE_LOAD32(*p);
+  *p += 4;
+  *l = table3_[c & 0xff] ^
+  table2_[(c >> 8) & 0xff] ^
+  table1_[(c >> 16) & 0xff] ^
+  table0_[c >> 24];
+  // Do it twice so each call consumes 8 bytes, matching Fast_CRC32().
+  c = *l ^ LE_LOAD32(*p);
+  *p += 4;
+  *l = table3_[c & 0xff] ^
+  table2_[(c >> 8) & 0xff] ^
+  table1_[(c >> 16) & 0xff] ^
+  table0_[c >> 24];
+}
+
+// Hardware CRC32C update over 8 bytes via the SSE4.2 crc32 instruction when
+// compiled in; otherwise delegates to the table-driven implementation.
+static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) {
+  #ifdef __SSE4_2__
+  *l = _mm_crc32_u64(*l, LE_LOAD64(*p));
+  *p += 8;
+  #else
+  Slow_CRC32(l, p);
+  #endif
+}
+
+// Runtime detection of SSE4.2 support (CPUID leaf 1, ECX bit 20).
+// Non-GCC builds conservatively report false.
+static bool isSSE42() {
+  #ifdef __GNUC__
+  uint32_t c_;
+  uint32_t d_;
+  __asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx");
+  return c_ & (1U << 20); // copied from CpuId.h in Folly.
+  #else
+  return false;
+  #endif
+}
+
+// Signature shared by the fast and slow 8-byte CRC update routines.
+typedef void (*Function)(uint64_t*, uint8_t const**);
+static Function func = nullptr;
+
+// Picks the hardware path when the CPU supports SSE4.2.
+static inline Function Choose_CRC32() {
+  return isSSE42() ? Fast_CRC32 : Slow_CRC32;
+}
+
+// Dispatches to the CPU-appropriate CRC update, caching the choice in `func`
+// on first use.
+// NOTE(review): `func` is read and written without synchronization; racing
+// initializers store the same value, but this is formally a data race —
+// confirm single-threaded first use or switch to once-style init.
+static inline void CRC32(uint64_t* l, uint8_t const **p) {
+  if (func != nullptr) {
+    return func(l, p);
+  }
+  func = Choose_CRC32();
+  func(l, p);
+}
+
+// Extends `crc` with buf[0, size-1]: handles unaligned leading bytes one at
+// a time, consumes 16 then 8 bytes per iteration through CRC32(), and
+// finishes the tail byte-wise.
+uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
+  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
+  const uint8_t *e = p + size;
+  uint64_t l = crc ^ 0xffffffffu;  // CRC32C pre-inversion
+
+// Align n to (1 << m) byte boundary
+#define ALIGN(n, m)     ((n + ((1 << m) - 1)) & ~((1 << m) - 1))
+
+// One-byte table-driven CRC step for unaligned prefix/suffix bytes.
+#define STEP1 do {                              \
+    int c = (l & 0xff) ^ *p++;                  \
+    l = table0_[c] ^ (l >> 8);                  \
+} while (0)
+
+
+  // Point x at first 16-byte aligned byte in string.  This might be
+  // just past the end of the string.
+  const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
+  const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4));
+  if (x <= e) {
+    // Process bytes until finished or p is 16-byte aligned
+    while (p != x) {
+      STEP1;
+    }
+  }
+  // Process bytes 16 at a time
+  while ((e-p) >= 16) {
+    CRC32(&l, &p);
+    CRC32(&l, &p);
+  }
+  // Process bytes 8 at a time
+  while ((e-p) >= 8) {
+    CRC32(&l, &p);
+  }
+  // Process the last few bytes
+  while (p != e) {
+    STEP1;
+  }
+#undef STEP1
+#undef ALIGN
+  return l ^ 0xffffffffu;  // final inversion
+}
+
+}  // namespace crc32c
+}  // namespace rocksdb
diff --git a/util/crc32c.h b/util/crc32c.h
new file mode 100644 (file)
index 0000000..e5e6e14
--- /dev/null
@@ -0,0 +1,46 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+namespace rocksdb {
+namespace crc32c {
+
+// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the
+// crc32c of some string A.  Extend() is often used to maintain the
+// crc32c of a stream of data.
+extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n);
+
// Return the crc32c of data[0,n-1].
// Equivalent to Extend() starting from the CRC of the empty string (0).
inline uint32_t Value(const char* data, size_t n) {
  return Extend(0, data, n);
}
+
// Constant added during masking so that Mask() is not its own inverse.
static const uint32_t kMaskDelta = 0xa282ead8ul;

// Return a masked representation of crc.
//
// Motivation: it is problematic to compute the CRC of a string that
// contains embedded CRCs.  Therefore we recommend that CRCs stored
// somewhere (e.g., in files) should be masked before being stored.
inline uint32_t Mask(uint32_t crc) {
  // Rotate right by 15 bits, then add the constant.
  const uint32_t rotated = (crc << 17) | (crc >> 15);
  return rotated + kMaskDelta;
}

// Inverse of Mask(): return the crc whose masked representation is
// masked_crc.
inline uint32_t Unmask(uint32_t masked_crc) {
  // Subtract the constant, then rotate left by 15 bits.
  const uint32_t unrotated = masked_crc - kMaskDelta;
  return (unrotated << 15) | (unrotated >> 17);
}
+
+}  // namespace crc32c
+}  // namespace rocksdb
diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc
new file mode 100644 (file)
index 0000000..300c9d3
--- /dev/null
@@ -0,0 +1,77 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/crc32c.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+namespace crc32c {
+
// Empty fixture class required by the test harness's TEST macro naming.
class CRC { };

TEST(CRC, StandardResults) {
  // From rfc3720 section B.4.
  char buf[32];

  // 32 zero bytes
  memset(buf, 0, sizeof(buf));
  ASSERT_EQ(0x8a9136aaU, Value(buf, sizeof(buf)));

  // 32 bytes of 0xff
  memset(buf, 0xff, sizeof(buf));
  ASSERT_EQ(0x62a8ab43U, Value(buf, sizeof(buf)));

  // ascending bytes 0x00..0x1f
  for (int i = 0; i < 32; i++) {
    buf[i] = i;
  }
  ASSERT_EQ(0x46dd794eU, Value(buf, sizeof(buf)));

  // descending bytes 0x1f..0x00
  for (int i = 0; i < 32; i++) {
    buf[i] = 31 - i;
  }
  ASSERT_EQ(0x113fdb5cU, Value(buf, sizeof(buf)));

  // 48-byte example vector, also from rfc3720 section B.4.
  unsigned char data[48] = {
    0x01, 0xc0, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
    0x14, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x04, 0x00,
    0x00, 0x00, 0x00, 0x14,
    0x00, 0x00, 0x00, 0x18,
    0x28, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
    0x02, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
  };
  ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));
}
+
// Different inputs should produce different CRCs.
TEST(CRC, Values) {
  ASSERT_NE(Value("a", 1), Value("foo", 3));
}

// Extending the CRC of a prefix with the suffix must equal the CRC of the
// whole string.
TEST(CRC, Extend) {
  ASSERT_EQ(Value("hello world", 11),
            Extend(Value("hello ", 6), "world", 5));
}

// Mask must change the value, not be self-inverse, and round-trip exactly
// through Unmask (including when nested twice).
TEST(CRC, Mask) {
  uint32_t crc = Value("foo", 3);
  ASSERT_NE(crc, Mask(crc));
  ASSERT_NE(crc, Mask(Mask(crc)));
  ASSERT_EQ(crc, Unmask(Mask(crc)));
  ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc)))));
}
+
+}  // namespace crc32c
+}  // namespace rocksdb
+
// Test entry point: run every TEST registered through the harness.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
diff --git a/util/env.cc b/util/env.cc
new file mode 100644 (file)
index 0000000..bd19d48
--- /dev/null
@@ -0,0 +1,142 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
// Out-of-line empty destructors for the abstract interfaces, anchoring
// their vtables in this translation unit.
Env::~Env() {
}

SequentialFile::~SequentialFile() {
}

RandomAccessFile::~RandomAccessFile() {
}

WritableFile::~WritableFile() {
}

Logger::~Logger() {
}

FileLock::~FileLock() {
}
+
+void LogFlush(Logger *info_log) {
+  if (info_log) {
+    info_log->Flush();
+  }
+}
+
+void Log(Logger* info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(format, ap);
+    va_end(ap);
+  }
+}
+
+void LogFlush(const shared_ptr<Logger>& info_log) {
+  if (info_log) {
+    info_log->Flush();
+  }
+}
+
+void Log(const shared_ptr<Logger>& info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(format, ap);
+    va_end(ap);
+  }
+}
+
// Shared implementation for WriteStringToFile{,Sync}: create fname, append
// data, optionally Sync, and on any failure delete the (possibly partial)
// file so no corrupt file is left behind.
static Status DoWriteStringToFile(Env* env, const Slice& data,
                                  const std::string& fname,
                                  bool should_sync) {
  unique_ptr<WritableFile> file;
  EnvOptions soptions;
  Status s = env->NewWritableFile(fname, &file, soptions);
  if (!s.ok()) {
    return s;
  }
  s = file->Append(data);
  if (s.ok() && should_sync) {
    s = file->Sync();
  }
  if (!s.ok()) {
    // best-effort cleanup; the DeleteFile status is intentionally ignored
    env->DeleteFile(fname);
  }
  return s;
}

// Write `data` to `fname` without forcing it to stable storage.
Status WriteStringToFile(Env* env, const Slice& data,
                         const std::string& fname) {
  return DoWriteStringToFile(env, data, fname, false);
}

// Write `data` to `fname` and Sync() before returning.
Status WriteStringToFileSync(Env* env, const Slice& data,
                             const std::string& fname) {
  return DoWriteStringToFile(env, data, fname, true);
}
+
+Status ReadFileToString(Env* env, const std::string& fname, std::string* data) {
+  EnvOptions soptions;
+  data->clear();
+  unique_ptr<SequentialFile> file;
+  Status s = env->NewSequentialFile(fname, &file, soptions);
+  if (!s.ok()) {
+    return s;
+  }
+  static const int kBufferSize = 8192;
+  char* space = new char[kBufferSize];
+  while (true) {
+    Slice fragment;
+    s = file->Read(kBufferSize, &fragment, space);
+    if (!s.ok()) {
+      break;
+    }
+    data->append(fragment.data(), fragment.size());
+    if (fragment.empty()) {
+      break;
+    }
+  }
+  delete[] space;
+  return s;
+}
+
// Out-of-line empty destructor anchoring EnvWrapper's vtable.
EnvWrapper::~EnvWrapper() {
}

namespace {  // anonymous namespace

// Copy the Env-related knobs from Options into an EnvOptions struct.
void AssignEnvOptions(EnvOptions* env_options, const Options& options) {
  env_options->use_os_buffer = options.allow_os_buffer;
  env_options->use_mmap_reads = options.allow_mmap_reads;
  env_options->use_mmap_writes = options.allow_mmap_writes;
  env_options->set_fd_cloexec = options.is_fd_close_on_exec;
  env_options->bytes_per_sync = options.bytes_per_sync;
}

}  // anonymous namespace

// Derive EnvOptions from an explicit Options instance.
EnvOptions::EnvOptions(const Options& options) {
  AssignEnvOptions(this, options);
}

// Default EnvOptions: derived from a default-constructed Options.
EnvOptions::EnvOptions() {
  Options options;
  AssignEnvOptions(this, options);
}


+
+
+}  // namespace rocksdb
diff --git a/util/env_hdfs.cc b/util/env_hdfs.cc
new file mode 100644 (file)
index 0000000..0f8fe0d
--- /dev/null
@@ -0,0 +1,517 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifdef USE_HDFS
+#ifndef ROCKSDB_HDFS_FILE_C
+#define ROCKSDB_HDFS_FILE_C
+
+#include <algorithm>
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include <iostream>
+#include <sstream>
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "hdfs/hdfs.h"
+#include "hdfs/env_hdfs.h"
+
+//
+// This file defines an HDFS environment for rocksdb. It uses the libhdfs
+// api to access HDFS. All HDFS files created by one instance of rocksdb
+// will reside on the same HDFS cluster.
+//
+
+namespace rocksdb {
+
namespace {

// Wrap a POSIX errno as a Status::IOError carrying its strerror text.
// NOTE(review): strerror is not thread-safe on all platforms — confirm.
static Status IOError(const std::string& context, int err_number) {
  return Status::IOError(context, strerror(err_number));
}

// assume that there is one global logger for now. It is not thread-safe,
// but need not be because the logger is initialized at db-open time.
static Logger* mylog = nullptr;
+
+// Used for reading a file from HDFS. It implements both sequential-read
+// access methods as well as random read access methods.
+class HdfsReadableFile: virtual public SequentialFile, virtual public RandomAccessFile {
+ private:
+  hdfsFS fileSys_;
+  std::string filename_;
+  hdfsFile hfile_;
+
+ public:
+  HdfsReadableFile(hdfsFS fileSys, const std::string& fname)
+      : fileSys_(fileSys), filename_(fname), hfile_(nullptr) {
+    Log(mylog, "[hdfs] HdfsReadableFile opening file %s\n",
+        filename_.c_str());
+    hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_RDONLY, 0, 0, 0);
+    Log(mylog, "[hdfs] HdfsReadableFile opened file %s hfile_=0x%p\n",
+            filename_.c_str(), hfile_);
+  }
+
+  virtual ~HdfsReadableFile() {
+    Log(mylog, "[hdfs] HdfsReadableFile closing file %s\n",
+       filename_.c_str());
+    hdfsCloseFile(fileSys_, hfile_);
+    Log(mylog, "[hdfs] HdfsReadableFile closed file %s\n",
+        filename_.c_str());
+    hfile_ = nullptr;
+  }
+
+  bool isValid() {
+    return hfile_ != nullptr;
+  }
+
+  // sequential access, read data at current offset in file
+  virtual Status Read(size_t n, Slice* result, char* scratch) {
+    Status s;
+    Log(mylog, "[hdfs] HdfsReadableFile reading %s %ld\n",
+        filename_.c_str(), n);
+    size_t bytes_read = hdfsRead(fileSys_, hfile_, scratch, (tSize)n);
+    Log(mylog, "[hdfs] HdfsReadableFile read %s\n", filename_.c_str());
+    *result = Slice(scratch, bytes_read);
+    if (bytes_read < n) {
+      if (feof()) {
+        // We leave status as ok if we hit the end of the file
+      } else {
+        // A partial read with an error: return a non-ok status
+        s = IOError(filename_, errno);
+      }
+    }
+    return s;
+  }
+
+  // random access, read data from specified offset in file
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    Status s;
+    Log(mylog, "[hdfs] HdfsReadableFile preading %s\n", filename_.c_str());
+    ssize_t bytes_read = hdfsPread(fileSys_, hfile_, offset,
+                                   (void*)scratch, (tSize)n);
+    Log(mylog, "[hdfs] HdfsReadableFile pread %s\n", filename_.c_str());
+    *result = Slice(scratch, (bytes_read < 0) ? 0 : bytes_read);
+    if (bytes_read < 0) {
+      // An error: return a non-ok status
+      s = IOError(filename_, errno);
+    }
+    return s;
+  }
+
+  virtual Status Skip(uint64_t n) {
+    Log(mylog, "[hdfs] HdfsReadableFile skip %s\n", filename_.c_str());
+    // get current offset from file
+    tOffset current = hdfsTell(fileSys_, hfile_);
+    if (current < 0) {
+      return IOError(filename_, errno);
+    }
+    // seek to new offset in file
+    tOffset newoffset = current + n;
+    int val = hdfsSeek(fileSys_, hfile_, newoffset);
+    if (val < 0) {
+      return IOError(filename_, errno);
+    }
+    return Status::OK();
+  }
+
+ private:
+
+  // returns true if we are at the end of file, false otherwise
+  bool feof() {
+    Log(mylog, "[hdfs] HdfsReadableFile feof %s\n", filename_.c_str());
+    if (hdfsTell(fileSys_, hfile_) == fileSize()) {
+      return true;
+    }
+    return false;
+  }
+
+  // the current size of the file
+  tOffset fileSize() {
+    Log(mylog, "[hdfs] HdfsReadableFile fileSize %s\n", filename_.c_str());
+    hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, filename_.c_str());
+    tOffset size = 0L;
+    if (pFileInfo != nullptr) {
+      size = pFileInfo->mSize;
+      hdfsFreeFileInfo(pFileInfo, 1);
+    } else {
+      throw rocksdb::HdfsFatalException("fileSize on unknown file " +
+                                            filename_);
+    }
+    return size;
+  }
+};
+
// Appends to an existing file in HDFS.
class HdfsWritableFile: public WritableFile {
 private:
  hdfsFS fileSys_;        // HDFS connection handle (not owned)
  std::string filename_;
  hdfsFile hfile_;        // open handle; nullptr after Close() or failure

 public:
  HdfsWritableFile(hdfsFS fileSys, const std::string& fname)
      : fileSys_(fileSys), filename_(fname) , hfile_(nullptr) {
    Log(mylog, "[hdfs] HdfsWritableFile opening %s\n", filename_.c_str());
    hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_WRONLY, 0, 0, 0);
    Log(mylog, "[hdfs] HdfsWritableFile opened %s\n", filename_.c_str());
    assert(hfile_ != nullptr);
  }
  virtual ~HdfsWritableFile() {
    // Close() resets hfile_ to nullptr, so a file already closed by the
    // caller is not closed a second time here.
    if (hfile_ != nullptr) {
      Log(mylog, "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str());
      hdfsCloseFile(fileSys_, hfile_);
      Log(mylog, "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str());
      hfile_ = nullptr;
    }
  }

  // If the file was successfully created, then this returns true.
  // Otherwise returns false.
  bool isValid() {
    return hfile_ != nullptr;
  }

  // The name of the file, mostly needed for debug logging.
  const std::string& getName() {
    return filename_;
  }

  // Append data.size() bytes; any short write is reported as an IOError.
  virtual Status Append(const Slice& data) {
    Log(mylog, "[hdfs] HdfsWritableFile Append %s\n", filename_.c_str());
    const char* src = data.data();
    size_t left = data.size();
    // hdfsWrite returns -1 on error; converted to size_t it cannot equal
    // `left`, so the inequality below still detects the failure.
    size_t ret = hdfsWrite(fileSys_, hfile_, src, left);
    Log(mylog, "[hdfs] HdfsWritableFile Appended %s\n", filename_.c_str());
    if (ret != left) {
      return IOError(filename_, errno);
    }
    return Status::OK();
  }

  // No user-space buffering is done here, so Flush is a no-op.
  virtual Status Flush() {
    return Status::OK();
  }

  // Push buffered data to the datanodes: flush then sync.
  virtual Status Sync() {
    Status s;
    Log(mylog, "[hdfs] HdfsWritableFile Sync %s\n", filename_.c_str());
    if (hdfsFlush(fileSys_, hfile_) == -1) {
      return IOError(filename_, errno);
    }
    if (hdfsSync(fileSys_, hfile_) == -1) {
      return IOError(filename_, errno);
    }
    Log(mylog, "[hdfs] HdfsWritableFile Synced %s\n", filename_.c_str());
    return Status::OK();
  }

  // This is used by HdfsLogger to write data to the debug log file
  virtual Status Append(const char* src, size_t size) {
    if (hdfsWrite(fileSys_, hfile_, src, size) != (tSize)size) {
      return IOError(filename_, errno);
    }
    return Status::OK();
  }

  // Explicit close; after success the destructor becomes a no-op.
  virtual Status Close() {
    Log(mylog, "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str());
    if (hdfsCloseFile(fileSys_, hfile_) != 0) {
      return IOError(filename_, errno);
    }
    Log(mylog, "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str());
    hfile_ = nullptr;
    return Status::OK();
  }
};
+
// The object that implements the debug logs to reside in HDFS.
class HdfsLogger : public Logger {
 private:
  HdfsWritableFile* file_;  // owned; deleted in the destructor
  uint64_t (*gettid_)();  // Return the thread id for the current thread

 public:
  HdfsLogger(HdfsWritableFile* f, uint64_t (*gettid)())
    : file_(f), gettid_(gettid) {
    Log(mylog, "[hdfs] HdfsLogger opened %s\n",
            file_->getName().c_str());
  }

  virtual ~HdfsLogger() {
    Log(mylog, "[hdfs] HdfsLogger closed %s\n",
            file_->getName().c_str());
    delete file_;
    // Clear the global logger pointer if it referred to this instance,
    // so later Log(mylog, ...) calls do not use a dangling pointer.
    if (mylog != nullptr && mylog == this) {
      mylog = nullptr;
    }
  }

  // Format one log record (timestamp, thread id, message) and append it
  // to the HDFS file, flushing after each record.
  virtual void Logv(const char* format, va_list ap) {
    const uint64_t thread_id = (*gettid_)();

    // We try twice: the first time with a fixed-size stack allocated buffer,
    // and the second time with a much larger dynamically allocated buffer.
    char buffer[500];
    for (int iter = 0; iter < 2; iter++) {
      char* base;
      int bufsize;
      if (iter == 0) {
        bufsize = sizeof(buffer);
        base = buffer;
      } else {
        bufsize = 30000;
        base = new char[bufsize];
      }
      char* p = base;
      char* limit = base + bufsize;

      struct timeval now_tv;
      gettimeofday(&now_tv, nullptr);
      const time_t seconds = now_tv.tv_sec;
      struct tm t;
      localtime_r(&seconds, &t);
      p += snprintf(p, limit - p,
                    "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
                    t.tm_year + 1900,
                    t.tm_mon + 1,
                    t.tm_mday,
                    t.tm_hour,
                    t.tm_min,
                    t.tm_sec,
                    static_cast<int>(now_tv.tv_usec),
                    static_cast<long long unsigned int>(thread_id));

      // Print the message
      if (p < limit) {
        va_list backup_ap;
        // va_copy: `ap` may be consumed again on the second iteration.
        va_copy(backup_ap, ap);
        p += vsnprintf(p, limit - p, format, backup_ap);
        va_end(backup_ap);
      }

      // Truncate to available space if necessary
      if (p >= limit) {
        if (iter == 0) {
          continue;       // Try again with larger buffer
        } else {
          p = limit - 1;
        }
      }

      // Add newline if necessary
      if (p == base || p[-1] != '\n') {
        *p++ = '\n';
      }

      assert(p <= limit);
      file_->Append(base, p-base);
      file_->Flush();
      if (base != buffer) {
        delete[] base;
      }
      break;
    }
  }
};
+
+}  // namespace
+
+// Finally, the hdfs environment
+
// open a file for sequential reading
Status HdfsEnv::NewSequentialFile(const std::string& fname,
                                 SequentialFile** result) {
  HdfsReadableFile* f = new HdfsReadableFile(fileSys_, fname);
  // NOTE(review): `new` throws on failure rather than returning nullptr,
  // so this check can never fire — presumably kept for symmetry; consider
  // checking f->isValid() instead.
  if (f == nullptr) {
    *result = nullptr;
    return IOError(fname, errno);
  }
  *result = dynamic_cast<SequentialFile*>(f);
  return Status::OK();
}

// open a file for random reading
Status HdfsEnv::NewRandomAccessFile(const std::string& fname,
                                   RandomAccessFile** result) {
  HdfsReadableFile* f = new HdfsReadableFile(fileSys_, fname);
  // NOTE(review): same dead nullptr check as NewSequentialFile above.
  if (f == nullptr) {
    *result = nullptr;
    return IOError(fname, errno);
  }
  *result = dynamic_cast<RandomAccessFile*>(f);
  return Status::OK();
}

// create a new file for writing
Status HdfsEnv::NewWritableFile(const std::string& fname,
                               WritableFile** result) {
  Status s;
  HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname);
  // isValid() catches a failed hdfsOpenFile inside the constructor.
  if (f == nullptr || !f->isValid()) {
    *result = nullptr;
    return IOError(fname, errno);
  }
  *result = dynamic_cast<WritableFile*>(f);
  return Status::OK();
}
+
// Random read-write files are not provided by libhdfs.
Status HdfsEnv::NewRandomRWFile(const std::string& fname,
                                unique_ptr<RandomRWFile>* result,
                                const EnvOptions& options) {
  return Status::NotSupported("NewRandomRWFile not supported on HdfsEnv");
}

// Existence check: hdfsExists returns 0 iff the path exists.
bool HdfsEnv::FileExists(const std::string& fname) {
  int value = hdfsExists(fileSys_, fname.c_str());
  if (value == 0) {
    return true;
  }
  return false;
}

// List the bare filenames (path prefix stripped) under `path` into
// *result. A missing directory is not an error and yields an empty list;
// a listing failure throws HdfsFatalException.
Status HdfsEnv::GetChildren(const std::string& path,
                            std::vector<std::string>* result) {
  int value = hdfsExists(fileSys_, path.c_str());
  switch (value) {
  case 0: {
    int numEntries = 0;
    hdfsFileInfo* pHdfsFileInfo = 0;
    pHdfsFileInfo = hdfsListDirectory(fileSys_, path.c_str(), &numEntries);
    if (numEntries >= 0) {
      for(int i = 0; i < numEntries; i++) {
        char* pathname = pHdfsFileInfo[i].mName;
        // keep only the component after the last '/'
        char* filename = rindex(pathname, '/');
        if (filename != nullptr) {
          result->push_back(filename+1);
        }
      }
      if (pHdfsFileInfo != nullptr) {
        hdfsFreeFileInfo(pHdfsFileInfo, numEntries);
      }
    } else {
      // numEntries < 0 indicates error
      Log(mylog, "hdfsListDirectory call failed with error ");
      throw HdfsFatalException("hdfsListDirectory call failed negative error.\n");
    }
    break;
  }
  case 1:           // directory does not exist, exit
    break;
  default:          // anything else should be an error
    Log(mylog, "hdfsListDirectory call failed with error ");
    throw HdfsFatalException("hdfsListDirectory call failed with error.\n");
  }
  return Status::OK();
}
+
// Delete a file; errno-based IOError on failure.
Status HdfsEnv::DeleteFile(const std::string& fname) {
  if (hdfsDelete(fileSys_, fname.c_str()) == 0) {
    return Status::OK();
  }
  return IOError(fname, errno);
};

// Create a directory (and, per libhdfs semantics, any missing parents).
Status HdfsEnv::CreateDir(const std::string& name) {
  if (hdfsCreateDirectory(fileSys_, name.c_str()) == 0) {
    return Status::OK();
  }
  return IOError(name, errno);
};

Status HdfsEnv::CreateDirIfMissing(const std::string& name) {
  const int value = hdfsExists(fileSys_, name.c_str());
  //  Not atomic. state might change b/w hdfsExists and CreateDir.
  if (value == 0) {
    return Status::OK();
  } else {
    return CreateDir(name);
  }
};

// hdfsDelete handles directories as well, so DeleteDir just delegates.
Status HdfsEnv::DeleteDir(const std::string& name) {
  return DeleteFile(name);
};

// Report the size of `fname` in bytes via a path-info lookup.
Status HdfsEnv::GetFileSize(const std::string& fname, uint64_t* size) {
  *size = 0L;
  hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, fname.c_str());
  if (pFileInfo != nullptr) {
    *size = pFileInfo->mSize;
    hdfsFreeFileInfo(pFileInfo, 1);
    return Status::OK();
  }
  return IOError(fname, errno);
}

// Report the last-modification time of `fname` (seconds, per libhdfs).
Status HdfsEnv::GetFileModificationTime(const std::string& fname,
                                        uint64_t* time) {
  hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, fname.c_str());
  if (pFileInfo != nullptr) {
    *time = static_cast<uint64_t>(pFileInfo->mLastMod);
    hdfsFreeFileInfo(pFileInfo, 1);
    return Status::OK();
  }
  return IOError(fname, errno);

}
+
// The rename is not atomic. HDFS does not allow a renaming if the
// target already exists. So, we delete the target before attemting the
// rename.
Status HdfsEnv::RenameFile(const std::string& src, const std::string& target) {
  hdfsDelete(fileSys_, target.c_str());  // best-effort; failure ignored
  if (hdfsRename(fileSys_, src.c_str(), target.c_str()) == 0) {
    return Status::OK();
  }
  return IOError(src, errno);
}

// Locking is a no-op on HDFS: always succeeds with a null lock handle.
Status HdfsEnv::LockFile(const std::string& fname, FileLock** lock) {
  // there isn's a very good way to atomically check and create
  // a file via libhdfs
  *lock = nullptr;
  return Status::OK();
}

// Matching no-op for the no-op LockFile above.
Status HdfsEnv::UnlockFile(FileLock* lock) {
  return Status::OK();
}

// Create a Logger whose output is written to an HDFS file.
Status HdfsEnv::NewLogger(const std::string& fname,
                          shared_ptr<Logger>* result) {
  HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname);
  if (f == nullptr || !f->isValid()) {
    *result = nullptr;
    return IOError(fname, errno);
  }
  // HdfsLogger takes ownership of f.
  HdfsLogger* h = new HdfsLogger(f, &HdfsEnv::gettid);
  *result = h;
  if (mylog == nullptr) {
    // mylog = h; // uncomment this for detailed logging
  }
  return Status::OK();
}
+
+}  // namespace rocksdb
+
+#endif // ROCKSDB_HDFS_FILE_C
+
+#else // USE_HDFS
+
+// dummy placeholders used when HDFS is not available
+#include "rocksdb/env.h"
+#include "hdfs/env_hdfs.h"
+namespace rocksdb {
 // Stub used when rocksdb is built without USE_HDFS: the HDFS Env
 // reports NotSupported for file creation.
 Status HdfsEnv::NewSequentialFile(const std::string& fname,
                                   unique_ptr<SequentialFile>* result,
                                   const EnvOptions& options) {
   return Status::NotSupported("Not compiled with hdfs support");
 }
+
+#endif
diff --git a/util/env_posix.cc b/util/env_posix.cc
new file mode 100644 (file)
index 0000000..2be524e
--- /dev/null
@@ -0,0 +1,1511 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <deque>
+#include <set>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#ifdef OS_LINUX
+#include <sys/statfs.h>
+#endif
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+#if defined(OS_LINUX)
+#include <linux/fs.h>
+#include <fcntl.h>
+#endif
+#if defined(LEVELDB_PLATFORM_ANDROID)
+#include <sys/stat.h>
+#endif
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/posix_logger.h"
+#include "util/random.h"
+#include <signal.h>
+
+// Get nano time for mach systems
+#ifdef __MACH__
+#include <mach/clock.h>
+#include <mach/mach.h>
+#endif
+
+#if !defined(TMPFS_MAGIC)
+#define TMPFS_MAGIC 0x01021994
+#endif
+#if !defined(XFS_SUPER_MAGIC)
+#define XFS_SUPER_MAGIC 0x58465342
+#endif
+#if !defined(EXT4_SUPER_MAGIC)
+#define EXT4_SUPER_MAGIC 0xEF53
+#endif
+
+// For non linux platform, the following macros are used only as place
+// holder.
+#ifndef OS_LINUX
+#define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */
+#define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */
+#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
+#define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */
+#define POSIX_FADV_DONTNEED 4 /* [MC1] dont need these pages */
+#endif
+
+// This is only set from db_stress.cc and for testing only.
+// If non-zero, kill at various points in source code with probability 1/this
+int rocksdb_kill_odds = 0;
+
+namespace rocksdb {
+
+namespace {
+
// A wrapper for posix_fadvise. On platforms without it (non-Linux) this
// is a no-op that reports success by returning 0.  (The old comment
// claiming it returned Status::NotSupport was wrong — it returns an int.)
int Fadvise(int fd, off_t offset, size_t len, int advice) {
#ifdef OS_LINUX
  return posix_fadvise(fd, offset, len, advice);
#else
  return 0;  // simply do nothing.
#endif
}

// list of pathnames that are locked, guarded by mutex_lockedFiles
static std::set<std::string> lockedFiles;
static port::Mutex mutex_lockedFiles;

// Wrap a POSIX errno as a Status::IOError with its strerror text.
static Status IOError(const std::string& context, int err_number) {
  return Status::IOError(context, strerror(err_number));
}
+
#ifdef NDEBUG
// empty in release build
#define TEST_KILL_RANDOM(rocksdb_kill_odds)
#else

// Kill the process with probablity 1/odds for testing.
// NOTE(review): a fresh Random is seeded from time(nullptr) on every
// call, so all calls within the same second draw the same value —
// confirm this matches the intended crash distribution.
static void TestKillRandom(int odds, const std::string& srcfile,
                           int srcline) {
  time_t curtime = time(nullptr);
  Random r((uint32_t)curtime);

  assert(odds > 0);
  bool crash = r.OneIn(odds);
  if (crash) {
    fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline);
    fflush(stdout);
    kill(getpid(), SIGTERM);
  }
}

// To avoid crashing always at some frequently executed codepaths (during
// kill random test), use this factor to reduce odds
#define REDUCE_ODDS 2
#define REDUCE_ODDS2 4

// Maybe crash, with probability 1/rocksdb_kill_odds; 0 disables killing.
#define TEST_KILL_RANDOM(rocksdb_kill_odds) {   \
  if (rocksdb_kill_odds > 0) { \
    TestKillRandom(rocksdb_kill_odds, __FILE__, __LINE__);     \
  } \
}

#endif
+
#if defined(OS_LINUX)
namespace {
  // Build a unique id for the open file `fd` from (device, inode,
  // FS_IOC_GETVERSION generation number), varint-encoded into `id`.
  // Returns the encoded length, or 0 when no id can be produced
  // (buffer too small, fstat failure, or fs without GETVERSION support).
  static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
    if (max_size < kMaxVarint64Length*3) {
      return 0;
    }

    struct stat buf;
    int result = fstat(fd, &buf);
    if (result == -1) {
      return 0;
    }

    long version = 0;
    result = ioctl(fd, FS_IOC_GETVERSION, &version);
    if (result == -1) {
      return 0;
    }
    uint64_t uversion = (uint64_t)version;

    char* rid = id;
    rid = EncodeVarint64(rid, buf.st_dev);
    rid = EncodeVarint64(rid, buf.st_ino);
    rid = EncodeVarint64(rid, uversion);
    assert(rid >= id);
    return static_cast<size_t>(rid-id);
  }
}
#endif
+
// fread()-based sequential file, with optional page-cache eviction when
// OS buffering is disabled.
class PosixSequentialFile: public SequentialFile {
 private:
  std::string filename_;
  FILE* file_;          // owned; closed in the destructor
  int fd_;              // descriptor underlying file_, used for fadvise
  bool use_os_buffer_;  // if false, drop pages from the OS cache after reads

 public:
  PosixSequentialFile(const std::string& fname, FILE* f,
      const EnvOptions& options)
      : filename_(fname), file_(f), fd_(fileno(f)),
        use_os_buffer_(options.use_os_buffer) {
  }
  virtual ~PosixSequentialFile() { fclose(file_); }

  // Read up to n bytes at the current position into scratch; *result
  // references the bytes actually read. A short read at EOF is ok; any
  // other short read reports errno.
  virtual Status Read(size_t n, Slice* result, char* scratch) {
    Status s;
    size_t r = fread_unlocked(scratch, 1, n, file_);
    *result = Slice(scratch, r);
    if (r < n) {
      if (feof(file_)) {
        // We leave status as ok if we hit the end of the file
      } else {
        // A partial read with an error: return a non-ok status
        s = IOError(filename_, errno);
      }
    }
    if (!use_os_buffer_) {
      // we need to fadvise away the entire range of pages because
      // we do not want readahead pages to be cached.
      Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
    }
    return s;
  }

  // Advance the read position by n bytes.
  virtual Status Skip(uint64_t n) {
    if (fseek(file_, n, SEEK_CUR)) {
      return IOError(filename_, errno);
    }
    return Status::OK();
  }

  // Drop [offset, offset+length) from the OS page cache (Linux only; a
  // successful no-op elsewhere).
  virtual Status InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
    return Status::OK();
#else
    // free OS pages
    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
    if (ret == 0) {
      return Status::OK();
    }
    return IOError(filename_, errno);
#endif
  }
};
+
// pread() based random-access
class PosixRandomAccessFile: public RandomAccessFile {
 private:
  std::string filename_;
  int fd_;              // owned descriptor; closed in the destructor
  bool use_os_buffer_;  // if false, drop pages from the OS cache after reads

 public:
  PosixRandomAccessFile(const std::string& fname, int fd,
                        const EnvOptions& options)
      : filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) {
    // mmap-based reads are served by PosixMmapReadableFile instead.
    assert(!options.use_mmap_reads);
  }
  virtual ~PosixRandomAccessFile() { close(fd_); }

  // Read up to n bytes starting at `offset` into scratch; *result
  // references the bytes read (empty on error).
  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const {
    Status s;
    ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset));
    *result = Slice(scratch, (r < 0) ? 0 : r);
    if (r < 0) {
      // An error: return a non-ok status
      s = IOError(filename_, errno);
    }
    if (!use_os_buffer_) {
      // we need to fadvise away the entire range of pages because
      // we do not want readahead pages to be cached.
      Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
    }
    return s;
  }

#ifdef OS_LINUX
  virtual size_t GetUniqueId(char* id, size_t max_size) const {
    return GetUniqueIdFromFile(fd_, id, max_size);
  }
#endif

  // Translate the access-pattern hint into the matching fadvise call.
  virtual void Hint(AccessPattern pattern) {
    switch(pattern) {
      case NORMAL:
        Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
        break;
      case RANDOM:
        Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
        break;
      case SEQUENTIAL:
        Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
        break;
      case WILLNEED:
        Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
        break;
      case DONTNEED:
        Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
        break;
      default:
        assert(false);
        break;
    }
  }

  // Drop [offset, offset+length) from the OS page cache (Linux only; a
  // successful no-op elsewhere).
  virtual Status InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
    return Status::OK();
#else
    // free OS pages
    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
    if (ret == 0) {
      return Status::OK();
    }
    return IOError(filename_, errno);
#endif
  }
};
+
// mmap() based random-access
class PosixMmapReadableFile: public RandomAccessFile {
 private:
  int fd_;                // kept only for fadvise-style cache hints
  std::string filename_;
  void* mmapped_region_;  // start of the mapping; unmapped in the dtor
  size_t length_;         // size of the mapping in bytes

 public:
  // base[0,length-1] contains the mmapped contents of the file.
  PosixMmapReadableFile(const int fd, const std::string& fname,
                        void* base, size_t length,
                        const EnvOptions& options)
      : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
    fd_ = fd_ + 0;  // suppress the warning for used variables
    assert(options.use_mmap_reads);
    assert(options.use_os_buffer);
  }
  virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); }

  // Zero-copy read: *result points directly into the mapped region, so it
  // stays valid only for the lifetime of this object. Out-of-range requests
  // fail with EINVAL.
  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const {
    Status s;
    if (offset + n > length_) {
      *result = Slice();
      s = IOError(filename_, EINVAL);
    } else {
      *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
    }
    return s;
  }
  // Drops the given byte range from the OS page cache (Linux only;
  // no-op success elsewhere).
  virtual Status InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
    return Status::OK();
#else
    // free OS pages
    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
    if (ret == 0) {
      return Status::OK();
    }
    return IOError(filename_, errno);
#endif
  }
};
+
+// We preallocate up to an extra megabyte and use memcpy to append new
+// data to the file.  This is safe since we either properly close the
+// file before reading from it, or for log files, the reading code
+// knows enough to skip zero suffixes.
class PosixMmapFile : public WritableFile {
 private:
  std::string filename_;
  int fd_;
  size_t page_size_;
  size_t map_size_;       // How much extra memory to map at a time
  char* base_;            // The mapped region
  char* limit_;           // Limit of the mapped region
  char* dst_;             // Where to write next  (in range [base_,limit_])
  char* last_sync_;       // Where have we synced up to
  uint64_t file_offset_;  // Offset of base_ in file

  // Have we done an munmap of unsynced data?
  bool pending_sync_;

  // Roundup x to a multiple of y
  static size_t Roundup(size_t x, size_t y) {
    return ((x + y - 1) / y) * y;
  }

  // Rounds s down to a page boundary; assumes page_size_ is a power of two
  // (checked in the constructor).
  size_t TruncateToPageBoundary(size_t s) {
    s -= (s & (page_size_ - 1));
    assert((s % page_size_) == 0);
    return s;
  }

  // Unmaps the current region (if any), advancing file_offset_ past it and
  // doubling map_size_ (capped at 1MB) so subsequent maps are larger.
  // Returns false if munmap() itself failed.
  bool UnmapCurrentRegion() {
    bool result = true;
    TEST_KILL_RANDOM(rocksdb_kill_odds);
    if (base_ != nullptr) {
      if (last_sync_ < limit_) {
        // Defer syncing this data until next Sync() call, if any
        pending_sync_ = true;
      }
      if (munmap(base_, limit_ - base_) != 0) {
        result = false;
      }
      file_offset_ += limit_ - base_;
      base_ = nullptr;
      limit_ = nullptr;
      last_sync_ = nullptr;
      dst_ = nullptr;

      // Increase the amount we map the next time, but capped at 1MB
      if (map_size_ < (1<<20)) {
        map_size_ *= 2;
      }
    }
    return result;
  }

  // Extends the file by map_size_ bytes (via posix_fallocate) and maps the
  // new region at file_offset_. Only supported where fallocate exists.
  Status MapNewRegion() {
#ifdef ROCKSDB_FALLOCATE_PRESENT
    assert(base_ == nullptr);

    TEST_KILL_RANDOM(rocksdb_kill_odds);
    int alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
    if (alloc_status != 0) {
      return Status::IOError("Error allocating space to file : " + filename_ +
        "Error : " + strerror(alloc_status));
    }

    TEST_KILL_RANDOM(rocksdb_kill_odds);
    void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
                     fd_, file_offset_);
    if (ptr == MAP_FAILED) {
      return Status::IOError("MMap failed on " + filename_);
    }

    TEST_KILL_RANDOM(rocksdb_kill_odds);

    base_ = reinterpret_cast<char*>(ptr);
    limit_ = base_ + map_size_;
    dst_ = base_;
    last_sync_ = base_;
    return Status::OK();
#else
    return Status::NotSupported("This platform doesn't support fallocate()");
#endif
  }

 public:
  // Takes ownership of `fd`. `page_size` must be a power of two; the
  // initial mapping size is 64KB rounded up to a page multiple.
  PosixMmapFile(const std::string& fname, int fd, size_t page_size,
                const EnvOptions& options)
      : filename_(fname),
        fd_(fd),
        page_size_(page_size),
        map_size_(Roundup(65536, page_size)),
        base_(nullptr),
        limit_(nullptr),
        dst_(nullptr),
        last_sync_(nullptr),
        file_offset_(0),
        pending_sync_(false) {
    assert((page_size & (page_size - 1)) == 0);
    assert(options.use_mmap_writes);
  }


  ~PosixMmapFile() {
    if (fd_ >= 0) {
      PosixMmapFile::Close();
    }
  }

  // Appends by memcpy into the mapped region, unmapping and mapping a new
  // (larger) region whenever the current one fills up.
  virtual Status Append(const Slice& data) {
    const char* src = data.data();
    size_t left = data.size();
    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS);
    PrepareWrite(GetFileSize(), left);
    while (left > 0) {
      assert(base_ <= dst_);
      assert(dst_ <= limit_);
      size_t avail = limit_ - dst_;
      if (avail == 0) {
        if (UnmapCurrentRegion()) {
          Status s = MapNewRegion();
          if (!s.ok()) {
            return s;
          }
          TEST_KILL_RANDOM(rocksdb_kill_odds);
        }
      }

      size_t n = (left <= avail) ? left : avail;
      memcpy(dst_, src, n);
      dst_ += n;
      src += n;
      left -= n;
    }
    TEST_KILL_RANDOM(rocksdb_kill_odds);
    return Status::OK();
  }

  // Unmaps the live region, trims the preallocated-but-unused tail of the
  // file with ftruncate, and closes the descriptor.
  virtual Status Close() {
    Status s;
    size_t unused = limit_ - dst_;

    TEST_KILL_RANDOM(rocksdb_kill_odds);

    if (!UnmapCurrentRegion()) {
      s = IOError(filename_, errno);
    } else if (unused > 0) {
      // Trim the extra space at the end of the file
      if (ftruncate(fd_, file_offset_ - unused) < 0) {
        s = IOError(filename_, errno);
      }
    }

    TEST_KILL_RANDOM(rocksdb_kill_odds);

    if (close(fd_) < 0) {
      if (s.ok()) {
        s = IOError(filename_, errno);
      }
    }

    fd_ = -1;
    base_ = nullptr;
    limit_ = nullptr;
    return s;
  }

  // No-op: appends go straight into the mapping, so there is no user-space
  // buffer to flush.
  virtual Status Flush() {
    TEST_KILL_RANDOM(rocksdb_kill_odds);
    return Status::OK();
  }

  // Syncs previously unmapped-but-unsynced data with fdatasync, then
  // msync()s the dirty page range of the current mapping.
  virtual Status Sync() {
    Status s;

    if (pending_sync_) {
      // Some unmapped data was not synced
      TEST_KILL_RANDOM(rocksdb_kill_odds);
      pending_sync_ = false;
      if (fdatasync(fd_) < 0) {
        s = IOError(filename_, errno);
      }
      TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS);
    }

    if (dst_ > last_sync_) {
      // Find the beginnings of the pages that contain the first and last
      // bytes to be synced.
      size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
      size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
      last_sync_ = dst_;
      TEST_KILL_RANDOM(rocksdb_kill_odds);
      if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
        s = IOError(filename_, errno);
      }
      TEST_KILL_RANDOM(rocksdb_kill_odds);
    }

    return s;
  }

  /**
   * Flush data as well as metadata to stable storage.
   */
  virtual Status Fsync() {
    if (pending_sync_) {
      // Some unmapped data was not synced
      TEST_KILL_RANDOM(rocksdb_kill_odds);
      pending_sync_ = false;
      if (fsync(fd_) < 0) {
        return IOError(filename_, errno);
      }
      TEST_KILL_RANDOM(rocksdb_kill_odds);
    }
    // This invocation to Sync will not issue the call to
    // fdatasync because pending_sync_ has already been cleared.
    return Sync();
  }

  /**
   * Get the size of valid data in the file. This will not match the
   * size that is returned from the filesystem because we use mmap
   * to extend file by map_size every time.
   */
  virtual uint64_t GetFileSize() {
    size_t used = dst_ - base_;
    return file_offset_ + used;
  }

  // Drops the given byte range from the OS page cache (Linux only;
  // no-op success elsewhere).
  virtual Status InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
    return Status::OK();
#else
    // free OS pages
    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
    if (ret == 0) {
      return Status::OK();
    }
    return IOError(filename_, errno);
#endif
  }

#ifdef ROCKSDB_FALLOCATE_PRESENT
  // Preallocates disk space without changing the reported file size
  // (FALLOC_FL_KEEP_SIZE), reducing fragmentation for known-size writes.
  virtual Status Allocate(off_t offset, off_t len) {
    TEST_KILL_RANDOM(rocksdb_kill_odds);
    if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
      return Status::OK();
    } else {
      return IOError(filename_, errno);
    }
  }
#endif
};
+
+// Use posix write to write data to a file.
+class PosixWritableFile : public WritableFile {
+ private:
+  const std::string filename_;
+  int fd_;
+  size_t cursize_;      // current size of cached data in buf_
+  size_t capacity_;     // max size of buf_
+  unique_ptr<char[]> buf_;           // a buffer to cache writes
+  uint64_t filesize_;
+  bool pending_sync_;
+  bool pending_fsync_;
+  uint64_t last_sync_size_;
+  uint64_t bytes_per_sync_;
+
+ public:
+  PosixWritableFile(const std::string& fname, int fd, size_t capacity,
+                    const EnvOptions& options) :
+    filename_(fname),
+    fd_(fd),
+    cursize_(0),
+    capacity_(capacity),
+    buf_(new char[capacity]),
+    filesize_(0),
+    pending_sync_(false),
+    pending_fsync_(false),
+    last_sync_size_(0),
+    bytes_per_sync_(options.bytes_per_sync) {
+    assert(!options.use_mmap_writes);
+  }
+
+  ~PosixWritableFile() {
+    if (fd_ >= 0) {
+      PosixWritableFile::Close();
+    }
+  }
+
+  virtual Status Append(const Slice& data) {
+    const char* src = data.data();
+    size_t left = data.size();
+    Status s;
+    pending_sync_ = true;
+    pending_fsync_ = true;
+
+    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
+
+    PrepareWrite(GetFileSize(), left);
+    // if there is no space in the cache, then flush
+    if (cursize_ + left > capacity_) {
+      s = Flush();
+      if (!s.ok()) {
+        return s;
+      }
+      // Increase the buffer size, but capped at 1MB
+      if (capacity_ < (1<<20)) {
+        capacity_ *= 2;
+        buf_.reset(new char[capacity_]);
+      }
+      assert(cursize_ == 0);
+    }
+
+    // if the write fits into the cache, then write to cache
+    // otherwise do a write() syscall to write to OS buffers.
+    if (cursize_ + left <= capacity_) {
+      memcpy(buf_.get()+cursize_, src, left);
+      cursize_ += left;
+    } else {
+      while (left != 0) {
+        ssize_t done = write(fd_, src, left);
+        if (done < 0) {
+          return IOError(filename_, errno);
+        }
+        TEST_KILL_RANDOM(rocksdb_kill_odds);
+
+        left -= done;
+        src += done;
+      }
+    }
+    filesize_ += data.size();
+    return Status::OK();
+  }
+
+  virtual Status Close() {
+    Status s;
+    s = Flush(); // flush cache to OS
+    if (!s.ok()) {
+    }
+
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+
+    if (close(fd_) < 0) {
+      if (s.ok()) {
+        s = IOError(filename_, errno);
+      }
+    }
+    fd_ = -1;
+    return s;
+  }
+
+  // write out the cached data to the OS cache
+  virtual Status Flush() {
+    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
+    size_t left = cursize_;
+    char* src = buf_.get();
+    while (left != 0) {
+      ssize_t done = write(fd_, src, left);
+      if (done < 0) {
+        return IOError(filename_, errno);
+      }
+      TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
+      left -= done;
+      src += done;
+    }
+    cursize_ = 0;
+
+    // sync OS cache to disk for every bytes_per_sync_
+    // TODO: give log file and sst file different options (log
+    // files could be potentially cached in OS for their whole
+    // life time, thus we might not want to flush at all).
+    if (bytes_per_sync_ &&
+        filesize_ - last_sync_size_ >= bytes_per_sync_) {
+      RangeSync(last_sync_size_, filesize_ - last_sync_size_);
+      last_sync_size_ = filesize_;
+    }
+
+    return Status::OK();
+  }
+
+  virtual Status Sync() {
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    if (pending_sync_ && fdatasync(fd_) < 0) {
+      return IOError(filename_, errno);
+    }
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    pending_sync_ = false;
+    return Status::OK();
+  }
+
+  virtual Status Fsync() {
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    if (pending_fsync_ && fsync(fd_) < 0) {
+      return IOError(filename_, errno);
+    }
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    pending_fsync_ = false;
+    pending_sync_ = false;
+    return Status::OK();
+  }
+
+  virtual uint64_t GetFileSize() {
+    return filesize_;
+  }
+
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+#ifndef OS_LINUX
+    return Status::OK();
+#else
+    // free OS pages
+    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+    if (ret == 0) {
+      return Status::OK();
+    }
+    return IOError(filename_, errno);
+#endif
+  }
+
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  virtual Status Allocate(off_t offset, off_t len) {
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
+      return Status::OK();
+    } else {
+      return IOError(filename_, errno);
+    }
+  }
+
+  virtual Status RangeSync(off64_t offset, off64_t nbytes) {
+    if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) {
+      return Status::OK();
+    } else {
+      return IOError(filename_, errno);
+    }
+  }
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return GetUniqueIdFromFile(fd_, id, max_size);
+  }
+#endif
+};
+
// pread()/pwrite()-based file supporting both random reads and random
// writes on the same descriptor (no mmap).
class PosixRandomRWFile : public RandomRWFile {
 private:
  const std::string filename_;
  int fd_;              // owned; closed in Close()/dtor
  bool pending_sync_;   // data written since last fdatasync
  bool pending_fsync_;  // data written since last fsync

 public:
  // Takes ownership of `fd`. mmap reads and writes must both be disabled.
  PosixRandomRWFile(const std::string& fname, int fd,
                    const EnvOptions& options) :
      filename_(fname),
      fd_(fd),
      pending_sync_(false),
      pending_fsync_(false) {
    assert(!options.use_mmap_writes && !options.use_mmap_reads);
  }

  ~PosixRandomRWFile() {
    if (fd_ >= 0) {
      Close();
    }
  }

  // Writes all of `data` at `offset`, looping on short pwrite()s.
  virtual Status Write(uint64_t offset, const Slice& data) {
    const char* src = data.data();
    size_t left = data.size();
    Status s;
    pending_sync_ = true;
    pending_fsync_ = true;

    while (left != 0) {
      ssize_t done = pwrite(fd_, src, left, offset);
      if (done < 0) {
        return IOError(filename_, errno);
      }

      left -= done;
      src += done;
      offset += done;
    }

    return Status::OK();
  }

  // Reads up to `n` bytes at `offset` into `scratch`; *result reflects the
  // bytes actually read (empty on error).
  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const {
    Status s;
    ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset));
    *result = Slice(scratch, (r < 0) ? 0 : r);
    if (r < 0) {
      s = IOError(filename_, errno);
    }
    return s;
  }

  // Closes the descriptor; safe to call more than once.
  virtual Status Close() {
    Status s = Status::OK();
    if (fd_ >= 0 && close(fd_) < 0) {
      s = IOError(filename_, errno);
    }
    fd_ = -1;
    return s;
  }

  // Syncs file data to stable storage (skipped if nothing was written).
  virtual Status Sync() {
    if (pending_sync_ && fdatasync(fd_) < 0) {
      return IOError(filename_, errno);
    }
    pending_sync_ = false;
    return Status::OK();
  }

  // Syncs file data and metadata; an fsync satisfies any pending fdatasync.
  virtual Status Fsync() {
    if (pending_fsync_ && fsync(fd_) < 0) {
      return IOError(filename_, errno);
    }
    pending_fsync_ = false;
    pending_sync_ = false;
    return Status::OK();
  }

#ifdef ROCKSDB_FALLOCATE_PRESENT
  // Preallocates disk space without changing the reported file size.
  virtual Status Allocate(off_t offset, off_t len) {
    if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
      return Status::OK();
    } else {
      return IOError(filename_, errno);
    }
  }
#endif
};
+
// Acquires (lock=true) or releases (lock=false) an advisory whole-file
// write lock on `fd` via fcntl(F_SETLK), while also tracking locked paths
// in the process-wide `lockedFiles` set (guarded by `mutex_lockedFiles`,
// both declared earlier in this file). The in-process set is needed
// because fcntl locks do not conflict within a single process.
// Returns fcntl's result (0 on success, -1 on failure with errno set;
// ENOLCK if the in-process bookkeeping rejects the request).
static int LockOrUnlock(const std::string& fname, int fd, bool lock) {
  mutex_lockedFiles.Lock();
  if (lock) {
    // If it already exists in the lockedFiles set, then it is already locked,
    // and fail this lock attempt. Otherwise, insert it into lockedFiles.
    // This check is needed because fcntl() does not detect lock conflict
    // if the fcntl is issued by the same thread that earlier acquired
    // this lock.
    if (lockedFiles.insert(fname).second == false) {
      mutex_lockedFiles.Unlock();
      errno = ENOLCK;
      return -1;
    }
  } else {
    // If we are unlocking, then verify that we had locked it earlier,
    // it should already exist in lockedFiles. Remove it from lockedFiles.
    if (lockedFiles.erase(fname) != 1) {
      mutex_lockedFiles.Unlock();
      errno = ENOLCK;
      return -1;
    }
  }
  errno = 0;
  struct flock f;
  memset(&f, 0, sizeof(f));
  f.l_type = (lock ? F_WRLCK : F_UNLCK);
  f.l_whence = SEEK_SET;
  f.l_start = 0;
  f.l_len = 0;        // Lock/unlock entire file
  int value = fcntl(fd, F_SETLK, &f);
  if (value == -1 && lock) {
    // if there is an error in locking, then remove the pathname from lockedfiles
    lockedFiles.erase(fname);
  }
  mutex_lockedFiles.Unlock();
  return value;
}
+
// Opaque token returned by LockFile() and consumed by UnlockFile();
// carries the descriptor holding the fcntl lock and the locked path.
class PosixFileLock : public FileLock {
 public:
  int fd_;               // descriptor the advisory lock was taken on
  std::string filename;  // path, used to update the lockedFiles set
};
+
+
namespace {
// Aborts the process when a pthread primitive fails. `result` is the
// errno-style code returned by the pthread_* call and `label` names the
// call site for the diagnostic printed to stderr.
void PthreadCall(const char* label, int result) {
  if (result == 0) {
    return;
  }
  fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
  exit(1);
}
}
+
+class PosixEnv : public Env {
+ public:
+  PosixEnv();
+
+  virtual ~PosixEnv(){
+    for (const auto tid : threads_to_join_) {
+      pthread_join(tid, nullptr);
+    }
+  }
+
+  void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
+    if ((options == nullptr || options->set_fd_cloexec) && fd > 0) {
+      fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
+    }
+  }
+
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options) {
+    result->reset();
+    FILE* f = fopen(fname.c_str(), "r");
+    if (f == nullptr) {
+      *result = nullptr;
+      return IOError(fname, errno);
+    } else {
+      int fd = fileno(f);
+      SetFD_CLOEXEC(fd, &options);
+      result->reset(new PosixSequentialFile(fname, f, options));
+      return Status::OK();
+    }
+  }
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options) {
+    result->reset();
+    Status s;
+    int fd = open(fname.c_str(), O_RDONLY);
+    SetFD_CLOEXEC(fd, &options);
+    if (fd < 0) {
+      s = IOError(fname, errno);
+    } else if (options.use_mmap_reads && sizeof(void*) >= 8) {
+      // Use of mmap for random reads has been removed because it
+      // kills performance when storage is fast.
+      // Use mmap when virtual address-space is plentiful.
+      uint64_t size;
+      s = GetFileSize(fname, &size);
+      if (s.ok()) {
+        void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
+        if (base != MAP_FAILED) {
+          result->reset(new PosixMmapReadableFile(fd, fname, base,
+                                                  size, options));
+        } else {
+          s = IOError(fname, errno);
+        }
+      }
+      close(fd);
+    } else {
+      result->reset(new PosixRandomAccessFile(fname, fd, options));
+    }
+    return s;
+  }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) {
+    result->reset();
+    Status s;
+    const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+    if (fd < 0) {
+      s = IOError(fname, errno);
+    } else {
+      SetFD_CLOEXEC(fd, &options);
+      if (options.use_mmap_writes) {
+        if (!checkedDiskForMmap_) {
+          // this will be executed once in the program's lifetime.
+          // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
+          if (!SupportsFastAllocate(fname)) {
+            forceMmapOff = true;
+          }
+          checkedDiskForMmap_ = true;
+        }
+      }
+      if (options.use_mmap_writes && !forceMmapOff) {
+        result->reset(new PosixMmapFile(fname, fd, page_size_, options));
+      } else {
+        // disable mmap writes
+        EnvOptions no_mmap_writes_options = options;
+        no_mmap_writes_options.use_mmap_writes = false;
+
+        result->reset(
+            new PosixWritableFile(fname, fd, 65536, no_mmap_writes_options)
+        );
+      }
+    }
+    return s;
+  }
+
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options) {
+    result->reset();
+    Status s;
+    const int fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644);
+    if (fd < 0) {
+      s = IOError(fname, errno);
+    } else {
+      SetFD_CLOEXEC(fd, &options);
+      // no support for mmap yet
+      if (options.use_mmap_writes || options.use_mmap_reads) {
+        return Status::NotSupported("No support for mmap read/write yet");
+      }
+      result->reset(new PosixRandomRWFile(fname, fd, options));
+    }
+    return s;
+  }
+
+  virtual bool FileExists(const std::string& fname) {
+    return access(fname.c_str(), F_OK) == 0;
+  }
+
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) {
+    result->clear();
+    DIR* d = opendir(dir.c_str());
+    if (d == nullptr) {
+      return IOError(dir, errno);
+    }
+    struct dirent* entry;
+    while ((entry = readdir(d)) != nullptr) {
+      result->push_back(entry->d_name);
+    }
+    closedir(d);
+    return Status::OK();
+  }
+
+  virtual Status DeleteFile(const std::string& fname) {
+    Status result;
+    if (unlink(fname.c_str()) != 0) {
+      result = IOError(fname, errno);
+    }
+    return result;
+  };
+
+  virtual Status CreateDir(const std::string& name) {
+    Status result;
+    if (mkdir(name.c_str(), 0755) != 0) {
+      result = IOError(name, errno);
+    }
+    return result;
+  };
+
+  virtual Status CreateDirIfMissing(const std::string& name) {
+    Status result;
+    if (mkdir(name.c_str(), 0755) != 0) {
+      if (errno != EEXIST) {
+        result = IOError(name, errno);
+      } else if (!DirExists(name)) { // Check that name is actually a
+                                     // directory.
+        // Message is taken from mkdir
+        result = Status::IOError("`"+name+"' exists but is not a directory");
+      }
+    }
+    return result;
+  };
+
+  virtual Status DeleteDir(const std::string& name) {
+    Status result;
+    if (rmdir(name.c_str()) != 0) {
+      result = IOError(name, errno);
+    }
+    return result;
+  };
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
+    Status s;
+    struct stat sbuf;
+    if (stat(fname.c_str(), &sbuf) != 0) {
+      *size = 0;
+      s = IOError(fname, errno);
+    } else {
+      *size = sbuf.st_size;
+    }
+    return s;
+  }
+
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime) {
+    struct stat s;
+    if (stat(fname.c_str(), &s) !=0) {
+      return IOError(fname, errno);
+    }
+    *file_mtime = static_cast<uint64_t>(s.st_mtime);
+    return Status::OK();
+  }
+  virtual Status RenameFile(const std::string& src, const std::string& target) {
+    Status result;
+    if (rename(src.c_str(), target.c_str()) != 0) {
+      result = IOError(src, errno);
+    }
+    return result;
+  }
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock) {
+    *lock = nullptr;
+    Status result;
+    int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
+    if (fd < 0) {
+      result = IOError(fname, errno);
+    } else if (LockOrUnlock(fname, fd, true) == -1) {
+      result = IOError("lock " + fname, errno);
+      close(fd);
+    } else {
+      SetFD_CLOEXEC(fd, nullptr);
+      PosixFileLock* my_lock = new PosixFileLock;
+      my_lock->fd_ = fd;
+      my_lock->filename = fname;
+      *lock = my_lock;
+    }
+    return result;
+  }
+
+  virtual Status UnlockFile(FileLock* lock) {
+    PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
+    Status result;
+    if (LockOrUnlock(my_lock->filename, my_lock->fd_, false) == -1) {
+      result = IOError("unlock", errno);
+    }
+    close(my_lock->fd_);
+    delete my_lock;
+    return result;
+  }
+
+  virtual void Schedule(void (*function)(void*), void* arg, Priority pri = LOW);
+
+  virtual void StartThread(void (*function)(void* arg), void* arg);
+
+  virtual Status GetTestDirectory(std::string* result) {
+    const char* env = getenv("TEST_TMPDIR");
+    if (env && env[0] != '\0') {
+      *result = env;
+    } else {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid()));
+      *result = buf;
+    }
+    // Directory may already exist
+    CreateDir(*result);
+    return Status::OK();
+  }
+
+  static uint64_t gettid() {
+    pthread_t tid = pthread_self();
+    uint64_t thread_id = 0;
+    memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
+    return thread_id;
+  }
+
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result) {
+    FILE* f = fopen(fname.c_str(), "w");
+    if (f == nullptr) {
+      result->reset();
+      return IOError(fname, errno);
+    } else {
+      int fd = fileno(f);
+      SetFD_CLOEXEC(fd, nullptr);
+      result->reset(new PosixLogger(f, &PosixEnv::gettid, this));
+      return Status::OK();
+    }
+  }
+
+  virtual uint64_t NowMicros() {
+    struct timeval tv;
+    // TODO(kailiu) MAC DON'T HAVE THIS
+    gettimeofday(&tv, nullptr);
+    return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+  }
+
+  virtual uint64_t NowNanos() {
+#ifdef OS_LINUX
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+#elif __MACH__
+    clock_serv_t cclock;
+    mach_timespec_t ts;
+    host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
+    clock_get_time(cclock, &ts);
+    mach_port_deallocate(mach_task_self(), cclock);
+#endif
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+  }
+
+  virtual void SleepForMicroseconds(int micros) {
+    usleep(micros);
+  }
+
+  virtual Status GetHostName(char* name, uint64_t len) {
+    int ret = gethostname(name, len);
+    if (ret < 0) {
+      if (errno == EFAULT || errno == EINVAL)
+        return Status::InvalidArgument(strerror(errno));
+      else
+        return IOError("GetHostName", errno);
+    }
+    return Status::OK();
+  }
+
+  virtual Status GetCurrentTime(int64_t* unix_time) {
+    time_t ret = time(nullptr);
+    if (ret == (time_t) -1) {
+      return IOError("GetCurrentTime", errno);
+    }
+    *unix_time = (int64_t) ret;
+    return Status::OK();
+  }
+
+  virtual Status GetAbsolutePath(const std::string& db_path,
+      std::string* output_path) {
+    if (db_path.find('/') == 0) {
+      *output_path = db_path;
+      return Status::OK();
+    }
+
+    char the_path[256];
+    char* ret = getcwd(the_path, 256);
+    if (ret == nullptr) {
+      return Status::IOError(strerror(errno));
+    }
+
+    *output_path = ret;
+    return Status::OK();
+  }
+
+  // Allow increasing the number of worker threads.
+  virtual void SetBackgroundThreads(int num, Priority pri) {
+    assert(pri >= Priority::LOW && pri <= Priority::HIGH);
+    thread_pools_[pri].SetBackgroundThreads(num);
+  }
+
+  virtual std::string TimeToString(uint64_t secondsSince1970) {
+    const time_t seconds = (time_t)secondsSince1970;
+    struct tm t;
+    int maxsize = 64;
+    std::string dummy;
+    dummy.reserve(maxsize);
+    dummy.resize(maxsize);
+    char* p = &dummy[0];
+    localtime_r(&seconds, &t);
+    snprintf(p, maxsize,
+             "%04d/%02d/%02d-%02d:%02d:%02d ",
+             t.tm_year + 1900,
+             t.tm_mon + 1,
+             t.tm_mday,
+             t.tm_hour,
+             t.tm_min,
+             t.tm_sec);
+    return dummy;
+  }
+
+ private:
+  bool checkedDiskForMmap_;
+  bool forceMmapOff; // do we override Env options?
+
+
+  // Returns true iff the named directory exists and is a directory.
+  virtual bool DirExists(const std::string& dname) {
+    struct stat statbuf;
+    if (stat(dname.c_str(), &statbuf) == 0) {
+      return S_ISDIR(statbuf.st_mode);
+    }
+    return false; // stat() failed return false
+  }
+
+  bool SupportsFastAllocate(const std::string& path) {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+    struct statfs s;
+    if (statfs(path.c_str(), &s)){
+      return false;
+    }
+    switch (s.f_type) {
+      case EXT4_SUPER_MAGIC:
+        return true;
+      case XFS_SUPER_MAGIC:
+        return true;
+      case TMPFS_MAGIC:
+        return true;
+      default:
+        return false;
+    }
+#else
+    return false;
+#endif
+  }
+
+  size_t page_size_;
+
+
+  class ThreadPool {
+   public:
+
+    ThreadPool() :
+        total_threads_limit_(1),
+        bgthreads_(0),
+        queue_(),
+        exit_all_threads_(false) {
+      PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
+      PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, nullptr));
+    }
+
+    ~ThreadPool() {
+      PthreadCall("lock", pthread_mutex_lock(&mu_));
+      assert(!exit_all_threads_);
+      exit_all_threads_ = true;
+      PthreadCall("signalall", pthread_cond_broadcast(&bgsignal_));
+      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+      for (const auto tid : bgthreads_) {
+        pthread_join(tid, nullptr);
+      }
+    }
+
+    void BGThread() {
+      while (true) {
+        // Wait until there is an item that is ready to run
+        PthreadCall("lock", pthread_mutex_lock(&mu_));
+        while (queue_.empty() && !exit_all_threads_) {
+          PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_));
+        }
+        if (exit_all_threads_) { // mechanism to let BG threads exit safely
+          PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+          break;
+        }
+        void (*function)(void*) = queue_.front().function;
+        void* arg = queue_.front().arg;
+        queue_.pop_front();
+
+        PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+        (*function)(arg);
+      }
+    }
+
+    static void* BGThreadWrapper(void* arg) {
+      reinterpret_cast<ThreadPool*>(arg)->BGThread();
+      return nullptr;
+    }
+
+    void SetBackgroundThreads(int num) {
+      PthreadCall("lock", pthread_mutex_lock(&mu_));
+      if (num > total_threads_limit_) {
+        total_threads_limit_ = num;
+      }
+      assert(total_threads_limit_ > 0);
+      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+    }
+
+    void Schedule(void (*function)(void*), void* arg) {
+      PthreadCall("lock", pthread_mutex_lock(&mu_));
+
+      if (exit_all_threads_) {
+        PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+        return;
+      }
+      // Start background thread if necessary
+      while ((int)bgthreads_.size() < total_threads_limit_) {
+        pthread_t t;
+        PthreadCall(
+          "create thread",
+          pthread_create(&t,
+                         nullptr,
+                         &ThreadPool::BGThreadWrapper,
+                         this));
+        fprintf(stdout,
+                "Created bg thread 0x%lx\n",
+                (unsigned long)t);
+
+        // Set the thread name to aid debugging
+#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 12)
+        char name_buf[16];
+        snprintf(name_buf, sizeof name_buf, "rocksdb:bg%zu", bgthreads_.size());
+        name_buf[sizeof name_buf - 1] = '\0';
+        pthread_setname_np(t, name_buf);
+#endif
+#endif
+
+        bgthreads_.push_back(t);
+      }
+
+      // Add to priority queue
+      queue_.push_back(BGItem());
+      queue_.back().function = function;
+      queue_.back().arg = arg;
+
+      // always wake up at least one waiting thread.
+      PthreadCall("signal", pthread_cond_signal(&bgsignal_));
+
+      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+    }
+
+   private:
+    // Entry per Schedule() call
+    struct BGItem { void* arg; void (*function)(void*); };
+    typedef std::deque<BGItem> BGQueue;
+
+    pthread_mutex_t mu_;
+    pthread_cond_t bgsignal_;
+    int total_threads_limit_;
+    std::vector<pthread_t> bgthreads_;
+    BGQueue queue_;
+    bool exit_all_threads_;
+  };
+
+  std::vector<ThreadPool> thread_pools_;
+
+  pthread_mutex_t mu_;
+  std::vector<pthread_t> threads_to_join_;
+
+};
+
// Constructs the default POSIX environment: one thread pool per priority
// level and a mutex guarding the StartThread() bookkeeping.
PosixEnv::PosixEnv() : checkedDiskForMmap_(false),
                       forceMmapOff(false),
                       page_size_(getpagesize()),
                       thread_pools_(Priority::TOTAL) {
  PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
}
+
// Dispatches a background task to the thread pool of the given priority.
void PosixEnv::Schedule(void (*function)(void*), void* arg, Priority pri) {
  assert(pri >= Priority::LOW && pri <= Priority::HIGH);
  thread_pools_[pri].Schedule(function, arg);
}
+
namespace {
// Heap-allocated trampoline payload for StartThread(); owned by the new
// thread and freed in StartThreadWrapper after the user function returns.
struct StartThreadState {
  void (*user_function)(void*);
  void* arg;
};
}
// pthread entry point: runs the user function, then frees the state.
static void* StartThreadWrapper(void* arg) {
  StartThreadState* state = reinterpret_cast<StartThreadState*>(arg);
  state->user_function(state->arg);
  delete state;
  return nullptr;
}
+
// Launches a dedicated pthread running `function(arg)` and records it so
// the destructor can join it before the Env is destroyed.
void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
  pthread_t t;
  StartThreadState* state = new StartThreadState;
  state->user_function = function;
  state->arg = arg;
  PthreadCall("start thread",
              pthread_create(&t, nullptr,  &StartThreadWrapper, state));
  PthreadCall("lock", pthread_mutex_lock(&mu_));
  threads_to_join_.push_back(t);
  PthreadCall("unlock", pthread_mutex_unlock(&mu_));
}
+
+}  // namespace
+
// Produces a unique id string: the kernel's random UUID when
// /proc/sys/kernel/random/uuid is readable, otherwise a "nanos-random"
// pair formatted as two hex words. The fallback is not a standard UUID.
std::string Env::GenerateUniqueId() {
  std::string uuid_file = "/proc/sys/kernel/random/uuid";
  if (FileExists(uuid_file)) {
    std::string uuid;
    Status s = ReadFileToString(this, uuid_file, &uuid);
    if (s.ok()) {
      return uuid;
    }
  }
  // Could not read uuid_file - generate uuid using "nanos-random"
  Random64 r(time(nullptr));
  uint64_t random_uuid_portion =
    r.Uniform(std::numeric_limits<uint64_t>::max());
  uint64_t nanos_uuid_portion = NowNanos();
  char uuid2[200];
  snprintf(uuid2,
           200,
           "%lx-%lx",
           (unsigned long)nanos_uuid_portion,
           (unsigned long)random_uuid_portion);
  return uuid2;
}
+
// Returns the process-wide PosixEnv singleton; never deleted, so it is
// safe to use during program shutdown.
Env* Env::Default() {
  static PosixEnv default_env;
  return &default_env;
}
+
+}  // namespace rocksdb
diff --git a/util/env_test.cc b/util/env_test.cc
new file mode 100644 (file)
index 0000000..828b49a
--- /dev/null
@@ -0,0 +1,397 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+
+#include <iostream>
+#include <unordered_set>
+
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+static const int kDelayMicros = 100000;
+
+class EnvPosixTest {
+ private:
+  port::Mutex mu_;
+  std::string events_;
+
+ public:
+  Env* env_;
+  EnvPosixTest() : env_(Env::Default()) { }
+};
+
+static void SetBool(void* ptr) {
+  reinterpret_cast<port::AtomicPointer*>(ptr)->NoBarrier_Store(ptr);
+}
+
+TEST(EnvPosixTest, RunImmediately) {
+  port::AtomicPointer called (nullptr);
+  env_->Schedule(&SetBool, &called);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(called.NoBarrier_Load() != nullptr);
+}
+
+TEST(EnvPosixTest, RunMany) {
+  port::AtomicPointer last_id (nullptr);
+
+  struct CB {
+    port::AtomicPointer* last_id_ptr;   // Pointer to shared slot
+    uintptr_t id;             // Order# for the execution of this callback
+
+    CB(port::AtomicPointer* p, int i) : last_id_ptr(p), id(i) { }
+
+    static void Run(void* v) {
+      CB* cb = reinterpret_cast<CB*>(v);
+      void* cur = cb->last_id_ptr->NoBarrier_Load();
+      ASSERT_EQ(cb->id-1, reinterpret_cast<uintptr_t>(cur));
+      cb->last_id_ptr->Release_Store(reinterpret_cast<void*>(cb->id));
+    }
+  };
+
+  // Schedule in different order than start time
+  CB cb1(&last_id, 1);
+  CB cb2(&last_id, 2);
+  CB cb3(&last_id, 3);
+  CB cb4(&last_id, 4);
+  env_->Schedule(&CB::Run, &cb1);
+  env_->Schedule(&CB::Run, &cb2);
+  env_->Schedule(&CB::Run, &cb3);
+  env_->Schedule(&CB::Run, &cb4);
+
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  void* cur = last_id.Acquire_Load();
+  ASSERT_EQ(4U, reinterpret_cast<uintptr_t>(cur));
+}
+
+struct State {
+  port::Mutex mu;
+  int val;
+  int num_running;
+};
+
+static void ThreadBody(void* arg) {
+  State* s = reinterpret_cast<State*>(arg);
+  s->mu.Lock();
+  s->val += 1;
+  s->num_running -= 1;
+  s->mu.Unlock();
+}
+
+TEST(EnvPosixTest, StartThread) {
+  State state;
+  state.val = 0;
+  state.num_running = 3;
+  for (int i = 0; i < 3; i++) {
+    env_->StartThread(&ThreadBody, &state);
+  }
+  while (true) {
+    state.mu.Lock();
+    int num = state.num_running;
+    state.mu.Unlock();
+    if (num == 0) {
+      break;
+    }
+    Env::Default()->SleepForMicroseconds(kDelayMicros);
+  }
+  ASSERT_EQ(state.val, 3);
+}
+
+TEST(EnvPosixTest, TwoPools) {
+
+  class CB {
+   public:
+    CB(const std::string& pool_name, int pool_size)
+        : mu_(),
+          num_running_(0),
+          num_finished_(0),
+          pool_size_(pool_size),
+          pool_name_(pool_name) { }
+
+    static void Run(void* v) {
+      CB* cb = reinterpret_cast<CB*>(v);
+      cb->Run();
+    }
+
+    void Run() {
+      {
+        MutexLock l(&mu_);
+        num_running_++;
+        std::cout << "Pool " << pool_name_ << ": "
+                  << num_running_ << " running threads.\n";
+        // make sure we don't have more than pool_size_ jobs running.
+        ASSERT_LE(num_running_, pool_size_);
+      }
+
+      // sleep for 1 sec
+      Env::Default()->SleepForMicroseconds(1000000);
+
+      {
+        MutexLock l(&mu_);
+        num_running_--;
+        num_finished_++;
+      }
+    }
+
+    int NumFinished() {
+      MutexLock l(&mu_);
+      return num_finished_;
+    }
+
+   private:
+    port::Mutex mu_;
+    int num_running_;
+    int num_finished_;
+    int pool_size_;
+    std::string pool_name_;
+  };
+
+  const int kLowPoolSize = 2;
+  const int kHighPoolSize = 4;
+  const int kJobs = 8;
+
+  CB low_pool_job("low", kLowPoolSize);
+  CB high_pool_job("high", kHighPoolSize);
+
+  env_->SetBackgroundThreads(kLowPoolSize);
+  env_->SetBackgroundThreads(kHighPoolSize, Env::Priority::HIGH);
+
+  // schedule same number of jobs in each pool
+  for (int i = 0; i < kJobs; i++) {
+    env_->Schedule(&CB::Run, &low_pool_job);
+    env_->Schedule(&CB::Run, &high_pool_job, Env::Priority::HIGH);
+  }
+
+  // wait for all jobs to finish
+  while (low_pool_job.NumFinished() < kJobs ||
+         high_pool_job.NumFinished() < kJobs) {
+    env_->SleepForMicroseconds(kDelayMicros);
+  }
+}
+
+bool IsSingleVarint(const std::string& s) {
+  Slice slice(s);
+
+  uint64_t v;
+  if (!GetVarint64(&slice, &v)) {
+    return false;
+  }
+
+  return slice.size() == 0;
+}
+
+#ifdef OS_LINUX
+bool IsUniqueIDValid(const std::string& s) {
+  return !s.empty() && !IsSingleVarint(s);
+}
+
+const size_t MAX_ID_SIZE = 100;
+char temp_id[MAX_ID_SIZE];
+
+// Only works on Linux platforms
+TEST(EnvPosixTest, RandomAccessUniqueID) {
+  // Create file.
+  const EnvOptions soptions;
+  std::string fname = test::TmpDir() + "/" + "testfile";
+  unique_ptr<WritableFile> wfile;
+  ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+
+  unique_ptr<RandomAccessFile> file;
+
+  // Get Unique ID
+  ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+  size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+  ASSERT_TRUE(id_size > 0);
+  std::string unique_id1(temp_id, id_size);
+  ASSERT_TRUE(IsUniqueIDValid(unique_id1));
+
+  // Get Unique ID again
+  ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+  id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+  ASSERT_TRUE(id_size > 0);
+  std::string unique_id2(temp_id, id_size);
+  ASSERT_TRUE(IsUniqueIDValid(unique_id2));
+
+  // Get Unique ID again after waiting some time.
+  env_->SleepForMicroseconds(1000000);
+  ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+  id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+  ASSERT_TRUE(id_size > 0);
+  std::string unique_id3(temp_id, id_size);
+  ASSERT_TRUE(IsUniqueIDValid(unique_id3));
+
+  // Check IDs are the same.
+  ASSERT_EQ(unique_id1, unique_id2);
+  ASSERT_EQ(unique_id2, unique_id3);
+
+  // Delete the file
+  env_->DeleteFile(fname);
+}
+
+// Returns true if any of the strings in ss are the prefix of another string.
+bool HasPrefix(const std::unordered_set<std::string>& ss) {
+  for (const std::string& s: ss) {
+    if (s.empty()) {
+      return true;
+    }
+    for (size_t i = 1; i < s.size(); ++i) {
+      if (ss.count(s.substr(0, i)) != 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// Only works on Linux platforms
+TEST(EnvPosixTest, RandomAccessUniqueIDConcurrent) {
+  // Check whether a bunch of concurrently existing files have unique IDs.
+  const EnvOptions soptions;
+
+  // Create the files
+  std::vector<std::string> fnames;
+  for (int i = 0; i < 1000; ++i) {
+    fnames.push_back(test::TmpDir() + "/" + "testfile" + std::to_string(i));
+
+    // Create file.
+    unique_ptr<WritableFile> wfile;
+    ASSERT_OK(env_->NewWritableFile(fnames[i], &wfile, soptions));
+  }
+
+  // Collect and check whether the IDs are unique.
+  std::unordered_set<std::string> ids;
+  for (const std::string fname: fnames) {
+    unique_ptr<RandomAccessFile> file;
+    std::string unique_id;
+    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+    size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+    ASSERT_TRUE(id_size > 0);
+    unique_id = std::string(temp_id, id_size);
+    ASSERT_TRUE(IsUniqueIDValid(unique_id));
+
+    ASSERT_TRUE(ids.count(unique_id) == 0);
+    ids.insert(unique_id);
+  }
+
+  // Delete the files
+  for (const std::string fname: fnames) {
+    ASSERT_OK(env_->DeleteFile(fname));
+  }
+
+  ASSERT_TRUE(!HasPrefix(ids));
+}
+
+// Only works on Linux platforms
+TEST(EnvPosixTest, RandomAccessUniqueIDDeletes) {
+  const EnvOptions soptions;
+  std::string fname = test::TmpDir() + "/" + "testfile";
+
+  // Check that after file is deleted we don't get same ID again in a new file.
+  std::unordered_set<std::string> ids;
+  for (int i = 0; i < 1000; ++i) {
+    // Create file.
+    {
+      unique_ptr<WritableFile> wfile;
+      ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+    }
+
+    // Get Unique ID
+    std::string unique_id;
+    {
+      unique_ptr<RandomAccessFile> file;
+      ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+      size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+      ASSERT_TRUE(id_size > 0);
+      unique_id = std::string(temp_id, id_size);
+    }
+
+    ASSERT_TRUE(IsUniqueIDValid(unique_id));
+    ASSERT_TRUE(ids.count(unique_id) == 0);
+    ids.insert(unique_id);
+
+    // Delete the file
+    ASSERT_OK(env_->DeleteFile(fname));
+  }
+
+  ASSERT_TRUE(!HasPrefix(ids));
+}
+
+// Only works on Linux platforms
+TEST(EnvPosixTest, InvalidateCache) {
+  const EnvOptions soptions;
+  std::string fname = test::TmpDir() + "/" + "testfile";
+
+  // Create file.
+  {
+    unique_ptr<WritableFile> wfile;
+    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+    ASSERT_OK(wfile.get()->Append(Slice("Hello world")));
+    ASSERT_OK(wfile.get()->InvalidateCache(0, 0));
+    ASSERT_OK(wfile.get()->Close());
+  }
+
+  // Random Read
+  {
+    unique_ptr<RandomAccessFile> file;
+    char scratch[100];
+    Slice result;
+    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+    ASSERT_OK(file.get()->Read(0, 11, &result, scratch));
+    ASSERT_EQ(memcmp(scratch, "Hello world", 11), 0);
+    ASSERT_OK(file.get()->InvalidateCache(0, 11));
+    ASSERT_OK(file.get()->InvalidateCache(0, 0));
+  }
+
+  // Sequential Read
+  {
+    unique_ptr<SequentialFile> file;
+    char scratch[100];
+    Slice result;
+    ASSERT_OK(env_->NewSequentialFile(fname, &file, soptions));
+    ASSERT_OK(file.get()->Read(11, &result, scratch));
+    ASSERT_EQ(memcmp(scratch, "Hello world", 11), 0);
+    ASSERT_OK(file.get()->InvalidateCache(0, 11));
+    ASSERT_OK(file.get()->InvalidateCache(0, 0));
+  }
+  // Delete the file
+  ASSERT_OK(env_->DeleteFile(fname));
+}
+#endif
+
+TEST(EnvPosixTest, PosixRandomRWFileTest) {
+  EnvOptions soptions;
+  soptions.use_mmap_writes = soptions.use_mmap_reads = false;
+  std::string fname = test::TmpDir() + "/" + "testfile";
+
+  unique_ptr<RandomRWFile> file;
+  ASSERT_OK(env_->NewRandomRWFile(fname, &file, soptions));
+  // If you run the unit test on tmpfs, then tmpfs might not
+  // support fallocate. It is still better to trigger that
+  // code-path instead of eliminating it completely.
+  file.get()->Allocate(0, 10*1024*1024);
+  ASSERT_OK(file.get()->Write(100, Slice("Hello world")));
+  ASSERT_OK(file.get()->Write(105, Slice("Hello world")));
+  ASSERT_OK(file.get()->Sync());
+  ASSERT_OK(file.get()->Fsync());
+  char scratch[100];
+  Slice result;
+  ASSERT_OK(file.get()->Read(100, 16, &result, scratch));
+  ASSERT_EQ(result.compare("HelloHello world"), 0);
+  ASSERT_OK(file.get()->Close());
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/filelock_test.cc b/util/filelock_test.cc
new file mode 100644 (file)
index 0000000..a9e30a5
--- /dev/null
@@ -0,0 +1,58 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "rocksdb/status.h"
+#include "rocksdb/env.h"
+
+#include <vector>
+#include "util/coding.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class LockTest {
+ public:
+  static LockTest* current_;
+  std::string file_;
+  rocksdb::Env* env_;
+
+  LockTest() : file_(test::TmpDir() + "/db_testlock_file"),
+               env_(rocksdb::Env::Default()) {
+    current_ = this;
+  }
+
+  ~LockTest() {
+  }
+
+  Status LockFile(FileLock** db_lock) {
+    return env_->LockFile(file_, db_lock);
+  }
+
+  Status UnlockFile(FileLock* db_lock) {
+    return env_->UnlockFile(db_lock);
+  }
+};
+LockTest* LockTest::current_;
+
+TEST(LockTest, LockBySameThread) {
+  FileLock* lock1;
+  FileLock* lock2;
+
+  // acquire a lock on a file
+  ASSERT_OK(LockFile(&lock1));
+
+  // re-acquire the lock on the same file. This should fail.
+  ASSERT_TRUE(LockFile(&lock2).IsIOError());
+
+  // release the lock
+  ASSERT_OK(UnlockFile(lock1));
+
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/filter_policy.cc b/util/filter_policy.cc
new file mode 100644 (file)
index 0000000..e950b75
--- /dev/null
@@ -0,0 +1,16 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/filter_policy.h"
+
+namespace rocksdb {
+
+FilterPolicy::~FilterPolicy() { }
+
+}  // namespace rocksdb
diff --git a/util/hash.cc b/util/hash.cc
new file mode 100644 (file)
index 0000000..6f0e9cc
--- /dev/null
@@ -0,0 +1,50 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <string.h>
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+uint32_t Hash(const char* data, size_t n, uint32_t seed) {
+  // Similar to murmur hash
+  const uint32_t m = 0xc6a4a793;
+  const uint32_t r = 24;
+  const char* limit = data + n;
+  uint32_t h = seed ^ (n * m);
+
+  // Pick up four bytes at a time
+  while (data + 4 <= limit) {
+    uint32_t w = DecodeFixed32(data);
+    data += 4;
+    h += w;
+    h *= m;
+    h ^= (h >> 16);
+  }
+
+  // Pick up remaining bytes
+  switch (limit - data) {
+    case 3:
+      h += data[2] << 16;
+      // fall through
+    case 2:
+      h += data[1] << 8;
+      // fall through
+    case 1:
+      h += data[0];
+      h *= m;
+      h ^= (h >> r);
+      break;
+  }
+  return h;
+}
+
+
+}  // namespace rocksdb
diff --git a/util/hash.h b/util/hash.h
new file mode 100644 (file)
index 0000000..c9eb659
--- /dev/null
@@ -0,0 +1,20 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Simple hash function used for internal data structures
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+namespace rocksdb {
+
+extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
+
+}
diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc
new file mode 100644 (file)
index 0000000..c669769
--- /dev/null
@@ -0,0 +1,313 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#include "util/hash_skiplist_rep.h"
+
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/arena.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "port/port.h"
+#include "port/atomic_pointer.h"
+#include "util/murmurhash.h"
+#include "db/skiplist.h"
+
+namespace rocksdb {
+namespace {
+
+class HashSkipListRep : public MemTableRep {
+ public:
+  HashSkipListRep(MemTableRep::KeyComparator& compare, Arena* arena,
+    const SliceTransform* transform, size_t bucket_size);
+
+  virtual void Insert(const char* key) override;
+
+  virtual bool Contains(const char* key) const override;
+
+  virtual size_t ApproximateMemoryUsage() override;
+
+  virtual ~HashSkipListRep();
+
+  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override;
+
+  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator(
+      const Slice& slice) override;
+
+  virtual std::shared_ptr<MemTableRep::Iterator> GetPrefixIterator(
+      const Slice& prefix) override;
+
+  virtual std::shared_ptr<MemTableRep::Iterator> GetDynamicPrefixIterator()
+      override;
+
+ private:
+  friend class DynamicIterator;
+  typedef SkipList<const char*, MemTableRep::KeyComparator&> Bucket;
+
+  size_t bucket_size_;
+
+  // Maps slices (which are transformed user keys) to buckets of keys sharing
+  // the same transform.
+  port::AtomicPointer* buckets_;
+
+  // The user-supplied transform whose domain is the user keys.
+  const SliceTransform* transform_;
+
+  MemTableRep::KeyComparator& compare_;
+  // immutable after construction
+  Arena* const arena_;
+
+  inline size_t GetHash(const Slice& slice) const {
+    return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_;
+  }
+  inline Bucket* GetBucket(size_t i) const {
+    return static_cast<Bucket*>(buckets_[i].Acquire_Load());
+  }
+  inline Bucket* GetBucket(const Slice& slice) const {
+    return GetBucket(GetHash(slice));
+  }
+  // Get a bucket from buckets_. If the bucket hasn't been initialized yet,
+  // initialize it before returning.
+  Bucket* GetInitializedBucket(const Slice& transformed);
+
+  class Iterator : public MemTableRep::Iterator {
+   public:
+    explicit Iterator(Bucket* list, bool own_list = true)
+      : list_(list),
+        iter_(list),
+        own_list_(own_list) {}
+
+    virtual ~Iterator() {
+      // if we own the list, we should also delete it
+      if (own_list_) {
+        assert(list_ != nullptr);
+        delete list_;
+      }
+    }
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const {
+      return list_ != nullptr && iter_.Valid();
+    }
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const {
+      assert(Valid());
+      return iter_.key();
+    }
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() {
+      assert(Valid());
+      iter_.Next();
+    }
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() {
+      assert(Valid());
+      iter_.Prev();
+    }
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const char* target) {
+      if (list_ != nullptr) {
+        iter_.Seek(target);
+      }
+    }
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() {
+      if (list_ != nullptr) {
+        iter_.SeekToFirst();
+      }
+    }
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() {
+      if (list_ != nullptr) {
+        iter_.SeekToLast();
+      }
+    }
+   protected:
+    void Reset(Bucket* list) {
+      if (own_list_) {
+        assert(list_ != nullptr);
+        delete list_;
+      }
+      list_ = list;
+      iter_.SetList(list);
+      own_list_ = false;
+    }
+   private:
+    // if list_ is nullptr, we should NEVER call any methods on iter_
+    // if list_ is nullptr, this Iterator is not Valid()
+    Bucket* list_;
+    Bucket::Iterator iter_;
+    // here we track if we own list_. If we own it, we are also
+    // responsible for its cleanup. This is a poor man's shared_ptr
+    bool own_list_;
+  };
+
+  class DynamicIterator : public HashSkipListRep::Iterator {
+   public:
+    explicit DynamicIterator(const HashSkipListRep& memtable_rep)
+      : HashSkipListRep::Iterator(nullptr, false),
+        memtable_rep_(memtable_rep) {}
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const char* target) {
+      auto transformed = memtable_rep_.transform_->Transform(
+        memtable_rep_.UserKey(target));
+      Reset(memtable_rep_.GetBucket(transformed));
+      HashSkipListRep::Iterator::Seek(target);
+    }
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() {
+      // Prefix iterator does not support total order.
+      // We simply set the iterator to invalid state
+      Reset(nullptr);
+    }
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() {
+      // Prefix iterator does not support total order.
+      // We simply set the iterator to invalid state
+      Reset(nullptr);
+    }
+   private:
+    // the underlying memtable
+    const HashSkipListRep& memtable_rep_;
+  };
+
+  class EmptyIterator : public MemTableRep::Iterator {
+    // This is used when there wasn't a bucket. It is cheaper than
+    // instantiating an empty bucket over which to iterate.
+   public:
+    EmptyIterator() { }
+    virtual bool Valid() const {
+      return false;
+    }
+    virtual const char* key() const {
+      assert(false);
+      return nullptr;
+    }
+    virtual void Next() { }
+    virtual void Prev() { }
+    virtual void Seek(const char* target) { }
+    virtual void SeekToFirst() { }
+    virtual void SeekToLast() { }
+   private:
+  };
+
+  std::shared_ptr<EmptyIterator> empty_iterator_;
+};
+
+HashSkipListRep::HashSkipListRep(MemTableRep::KeyComparator& compare,
+    Arena* arena, const SliceTransform* transform, size_t bucket_size)
+  : bucket_size_(bucket_size),
+    transform_(transform),
+    compare_(compare),
+    arena_(arena),
+    empty_iterator_(std::make_shared<EmptyIterator>()) {
+
+  buckets_ = new port::AtomicPointer[bucket_size];
+
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    buckets_[i].NoBarrier_Store(nullptr);
+  }
+}
+
+HashSkipListRep::~HashSkipListRep() {
+  delete[] buckets_;
+}
+
+HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket(
+    const Slice& transformed) {
+  size_t hash = GetHash(transformed);
+  auto bucket = GetBucket(hash);
+  if (bucket == nullptr) {
+    auto addr = arena_->AllocateAligned(sizeof(Bucket));
+    bucket = new (addr) Bucket(compare_, arena_);
+    buckets_[hash].Release_Store(static_cast<void*>(bucket));
+  }
+  return bucket;
+}
+
+void HashSkipListRep::Insert(const char* key) {
+  assert(!Contains(key));
+  auto transformed = transform_->Transform(UserKey(key));
+  auto bucket = GetInitializedBucket(transformed);
+  bucket->Insert(key);
+}
+
+bool HashSkipListRep::Contains(const char* key) const {
+  auto transformed = transform_->Transform(UserKey(key));
+  auto bucket = GetBucket(transformed);
+  if (bucket == nullptr) {
+    return false;
+  }
+  return bucket->Contains(key);
+}
+
+size_t HashSkipListRep::ApproximateMemoryUsage() {
+  return sizeof(buckets_);
+}
+
+std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetIterator() {
+  auto list = new Bucket(compare_, arena_);
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    auto bucket = GetBucket(i);
+    if (bucket != nullptr) {
+      Bucket::Iterator itr(bucket);
+      for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
+        list->Insert(itr.key());
+      }
+    }
+  }
+  return std::make_shared<Iterator>(list);
+}
+
+std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetPrefixIterator(
+  const Slice& prefix) {
+  auto bucket = GetBucket(prefix);
+  if (bucket == nullptr) {
+    return empty_iterator_;
+  }
+  return std::make_shared<Iterator>(bucket, false);
+}
+
+std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetIterator(
+    const Slice& slice) {
+  return GetPrefixIterator(transform_->Transform(slice));
+}
+
+std::shared_ptr<MemTableRep::Iterator>
+    HashSkipListRep::GetDynamicPrefixIterator() {
+  return std::make_shared<DynamicIterator>(*this);
+}
+
+} // anon namespace
+
+std::shared_ptr<MemTableRep>
+HashSkipListRepFactory::CreateMemTableRep(MemTableRep::KeyComparator &compare,
+                                          Arena *arena) {
+  return std::make_shared<HashSkipListRep>(compare, arena, transform_,
+      bucket_count_);
+}
+
+MemTableRepFactory* NewHashSkipListRepFactory(
+    const SliceTransform* transform, size_t bucket_count) {
+  return new HashSkipListRepFactory(transform, bucket_count);
+}
+
+} // namespace rocksdb
diff --git a/util/hash_skiplist_rep.h b/util/hash_skiplist_rep.h
new file mode 100644 (file)
index 0000000..b946cf0
--- /dev/null
@@ -0,0 +1,38 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/memtablerep.h"
+
+namespace rocksdb {
+
+class HashSkipListRepFactory : public MemTableRepFactory {
+ public:
+  explicit HashSkipListRepFactory(const SliceTransform* transform,
+      size_t bucket_count = 1000000)
+    : transform_(transform),
+      bucket_count_(bucket_count) { }
+
+  virtual ~HashSkipListRepFactory() { delete transform_; }
+
+  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
+      MemTableRep::KeyComparator& compare, Arena* arena) override;
+
+  virtual const char* Name() const override {
+    return "HashSkipListRepFactory";
+  }
+
+  const SliceTransform* GetTransform() { return transform_; }
+
+ private:
+  const SliceTransform* transform_;
+  const size_t bucket_count_;
+};
+
+}
diff --git a/util/histogram.cc b/util/histogram.cc
new file mode 100644 (file)
index 0000000..e839980
--- /dev/null
@@ -0,0 +1,194 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/histogram.h"
+
+#include <cassert>
+#include <math.h>
+#include <stdio.h>
+#include "port/port.h"
+
+namespace rocksdb {
+
+HistogramBucketMapper::HistogramBucketMapper() :
+  // Add newer bucket index here.
+  // Should always be added in sorted order.
+  bucketValues_({
+  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45,
+  50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450,
+  500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000,
+  3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000,
+  16000, 18000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000,
+  70000, 80000, 90000, 100000, 120000, 140000, 160000, 180000, 200000,
+  250000, 300000, 350000, 400000, 450000, 500000, 600000, 700000, 800000,
+  900000, 1000000, 1200000, 1400000, 1600000, 1800000, 2000000, 2500000,
+  3000000, 3500000, 4000000, 4500000, 5000000, 6000000, 7000000, 8000000,
+  9000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000,
+  25000000, 30000000, 35000000, 40000000, 45000000, 50000000, 60000000,
+  70000000, 80000000, 90000000, 100000000, 120000000, 140000000, 160000000,
+  180000000, 200000000, 250000000, 300000000, 350000000, 400000000,
+  450000000, 500000000, 600000000, 700000000, 800000000, 900000000,
+  1000000000}),
+  maxBucketValue_(bucketValues_.back()),
+  minBucketValue_(bucketValues_.front()) {
+  for (size_t i =0; i < bucketValues_.size(); ++i) {
+    valueIndexMap_[bucketValues_[i]] = i;
+  }
+}
+
+const size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const {
+  if (value >= maxBucketValue_) {
+    return bucketValues_.size() - 1;
+  } else if ( value >= minBucketValue_ ) {
+    std::map<uint64_t, uint64_t>::const_iterator lowerBound =
+      valueIndexMap_.lower_bound(value);
+    if (lowerBound != valueIndexMap_.end()) {
+      return lowerBound->second;
+    } else {
+      return 0;
+    }
+  } else {
+    return 0;
+  }
+}
+
+namespace {
+  const HistogramBucketMapper bucketMapper;
+}
+
+
+HistogramImpl::HistogramImpl() :
+  min_(bucketMapper.LastValue()),
+  max_(0),
+  num_(0),
+  sum_(0),
+  sum_squares_(0),
+  buckets_(std::vector<uint64_t>(bucketMapper.BucketCount(), 0)) {}
+
+void HistogramImpl::Clear() {
+  min_ = bucketMapper.LastValue();
+  max_ = 0;
+  num_ = 0;
+  sum_ = 0;
+  sum_squares_ = 0;
+  buckets_.resize(bucketMapper.BucketCount(), 0);
+}
+
+void HistogramImpl::Add(uint64_t value) {
+  const size_t index = bucketMapper.IndexForValue(value);
+  buckets_[index] += 1;
+  if (min_ > value) min_ = value;
+  if (max_ < value) max_ = value;
+  num_++;
+  sum_ += value;
+  sum_squares_ += (value * value);
+}
+
+void HistogramImpl::Merge(const HistogramImpl& other) {
+  if (other.min_ < min_) min_ = other.min_;
+  if (other.max_ > max_) max_ = other.max_;
+  num_ += other.num_;
+  sum_ += other.sum_;
+  sum_squares_ += other.sum_squares_;
+  for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) {
+    buckets_[b] += other.buckets_[b];
+  }
+}
+
+double HistogramImpl::Median() const {
+  return Percentile(50.0);
+}
+
+double HistogramImpl::Percentile(double p) const {
+  double threshold = num_ * (p / 100.0);
+  double sum = 0;
+  for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) {
+    sum += buckets_[b];
+    if (sum >= threshold) {
+      // Scale linearly within this bucket
+      double left_point = (b == 0) ? 0 : bucketMapper.BucketLimit(b-1);
+      double right_point = bucketMapper.BucketLimit(b);
+      double left_sum = sum - buckets_[b];
+      double right_sum = sum;
+      double pos = 0;
+      double right_left_diff = right_sum - left_sum;
+      if (right_left_diff != 0) {
+       pos = (threshold - left_sum) / (right_sum - left_sum);
+      }
+      double r = left_point + (right_point - left_point) * pos;
+      if (r < min_) r = min_;
+      if (r > max_) r = max_;
+      return r;
+    }
+  }
+  return max_;
+}
+
+double HistogramImpl::Average() const {
+  if (num_ == 0.0) return 0;
+  return sum_ / num_;
+}
+
+double HistogramImpl::StandardDeviation() const {
+  if (num_ == 0.0) return 0;
+  double variance = (sum_squares_ * num_ - sum_ * sum_) / (num_ * num_);
+  return sqrt(variance);
+}
+
+std::string HistogramImpl::ToString() const {
+  std::string r;
+  char buf[200];
+  snprintf(buf, sizeof(buf),
+           "Count: %.0f  Average: %.4f  StdDev: %.2f\n",
+           num_, Average(), StandardDeviation());
+  r.append(buf);
+  snprintf(buf, sizeof(buf),
+           "Min: %.4f  Median: %.4f  Max: %.4f\n",
+           (num_ == 0.0 ? 0.0 : min_), Median(), max_);
+  r.append(buf);
+  snprintf(buf, sizeof(buf),
+           "Percentiles: "
+           "P50: %.2f P75: %.2f P99: %.2f P99.9: %.2f P99.99: %.2f\n",
+           Percentile(50), Percentile(75), Percentile(99), Percentile(99.9),
+           Percentile(99.99));
+  r.append(buf);
+  r.append("------------------------------------------------------\n");
+  const double mult = 100.0 / num_;
+  double sum = 0;
+  for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) {
+    if (buckets_[b] <= 0.0) continue;
+    sum += buckets_[b];
+    snprintf(buf, sizeof(buf),
+             "[ %7lu, %7lu ) %8lu %7.3f%% %7.3f%% ",
+             // left
+             (unsigned long)((b == 0) ? 0 : bucketMapper.BucketLimit(b-1)),
+             (unsigned long)bucketMapper.BucketLimit(b), // right
+             (unsigned long)buckets_[b],                 // count
+             (mult * buckets_[b]),        // percentage
+             (mult * sum));               // cumulative percentage
+    r.append(buf);
+
+    // Add hash marks based on percentage; 20 marks for 100%.
+    int marks = static_cast<int>(20*(buckets_[b] / num_) + 0.5);
+    r.append(marks, '#');
+    r.push_back('\n');
+  }
+  return r;
+}
+
+void HistogramImpl::Data(HistogramData * const data) const {
+  assert(data);
+  data->median = Median();
+  data->percentile95 = Percentile(95);
+  data->percentile99 = Percentile(99);
+  data->average = Average();
+  data->standard_deviation = StandardDeviation();
+}
+
+} // namespace rocksdb
diff --git a/util/histogram.h b/util/histogram.h
new file mode 100644 (file)
index 0000000..c01594d
--- /dev/null
@@ -0,0 +1,79 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/statistics.h"
+
+#include <cassert>
+#include <string>
+#include <vector>
+#include <map>
+
+namespace rocksdb {
+
+// Maps sample values onto a fixed, increasing set of bucket upper
+// bounds shared by all histograms. The bounds are presumably built by
+// the constructor in histogram.cc (not visible here).
+class HistogramBucketMapper {
+ public:
+
+  HistogramBucketMapper();
+
+  // converts a value to the bucket index.
+  // NOTE(review): the `const` on these by-value `size_t` returns has no
+  // effect and is conventionally omitted.
+  const size_t IndexForValue(const uint64_t value) const;
+  // number of buckets required.
+
+  const size_t BucketCount() const {
+    return bucketValues_.size();
+  }
+
+  // Upper bound of the last (largest) bucket.
+  uint64_t LastValue() const {
+    return maxBucketValue_;
+  }
+
+  // Upper bound of the first (smallest) bucket.
+  uint64_t FirstValue() const {
+    return minBucketValue_;
+  }
+
+  // Upper limit of bucket `bucketNumber`; asserts the index is in range.
+  uint64_t BucketLimit(const uint64_t bucketNumber) const {
+    assert(bucketNumber < BucketCount());
+    return bucketValues_[bucketNumber];
+  }
+
+ private:
+  const std::vector<uint64_t> bucketValues_;    // sorted bucket upper bounds
+  const uint64_t maxBucketValue_;
+  const uint64_t minBucketValue_;
+  // Presumably maps a bucket limit to its index; populated by the ctor.
+  std::map<uint64_t, uint64_t> valueIndexMap_;
+};
+
+// Concrete histogram: keeps min/max/count/sum aggregates plus a
+// per-bucket count vector indexed via the shared bucket mapper.
+class HistogramImpl {
+ public:
+  HistogramImpl();
+  virtual ~HistogramImpl() {}
+  virtual void Clear();
+  virtual void Add(uint64_t value);
+  // Folds other's samples into this histogram. NOTE(review): unlike the
+  // surrounding methods this one is not virtual — confirm intentional.
+  void Merge(const HistogramImpl& other);
+
+  // Multi-line human-readable dump of the distribution.
+  virtual std::string ToString() const;
+
+  virtual double Median() const;
+  virtual double Percentile(double p) const;
+  virtual double Average() const;
+  virtual double StandardDeviation() const;
+  // Fills median/p95/p99/average/stddev into *data.
+  virtual void Data(HistogramData * const data) const;
+
+ private:
+  double min_;
+  double max_;
+  double num_;   // sample count, stored as double (used as divisor in ToString)
+  double sum_;
+  double sum_squares_;
+  std::vector<uint64_t> buckets_;  // per-bucket sample counts
+
+};
+
+}  // namespace rocksdb
diff --git a/util/histogram_test.cc b/util/histogram_test.cc
new file mode 100644 (file)
index 0000000..065f957
--- /dev/null
@@ -0,0 +1,62 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "util/histogram.h"
+
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+// Empty fixture type; the TEST macro below requires a named test class.
+class HistogramTest { };
+
+// Adds 1..100 and sanity-checks median, percentiles and the exact mean.
+TEST(HistogramTest, BasicOperation) {
+
+  HistogramImpl histogram;
+  for (uint64_t i = 1; i <= 100; i++) {
+    histogram.Add(i);
+  }
+
+  {
+    // Only a loose lower bound: bucketing makes the median approximate.
+    double median = histogram.Median();
+    // ASSERT_LE(median, 50);
+    ASSERT_GT(median, 0);
+  }
+
+  {
+    // Percentiles must be positive, bounded and monotonic.
+    double percentile100 = histogram.Percentile(100.0);
+    ASSERT_LE(percentile100, 100.0);
+    ASSERT_GT(percentile100, 0.0);
+    double percentile99 = histogram.Percentile(99.0);
+    double percentile85 = histogram.Percentile(85.0);
+    ASSERT_LE(percentile99, 99.0);
+    ASSERT_TRUE(percentile99 >= percentile85);
+  }
+
+  ASSERT_EQ(histogram.Average(), 50.5); // avg is accurately calculated.
+}
+
+// An empty histogram must report zero for all derived statistics.
+TEST(HistogramTest, EmptyHistogram) {
+  HistogramImpl histogram;
+  ASSERT_EQ(histogram.Median(), 0.0);
+  ASSERT_EQ(histogram.Percentile(85.0), 0.0);
+  ASSERT_EQ(histogram.Average(), 0.0);
+}
+
+// Clear() must reset a populated histogram back to the empty state.
+TEST(HistogramTest, ClearHistogram) {
+  HistogramImpl histogram;
+  for (uint64_t i = 1; i <= 100; i++) {
+    histogram.Add(i);
+  }
+  histogram.Clear();
+  ASSERT_EQ(histogram.Median(), 0);
+  ASSERT_EQ(histogram.Percentile(85.0), 0);
+  ASSERT_EQ(histogram.Average(), 0);
+}
+
+}  // namespace rocksdb
+
+// Test-harness entry point: runs every TEST registered above.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc
new file mode 100644 (file)
index 0000000..65ecd61
--- /dev/null
@@ -0,0 +1,1764 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "util/ldb_cmd.h"
+
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/log_reader.h"
+#include "db/filename.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/write_batch.h"
+#include "util/coding.h"
+
+#include <ctime>
+#include <dirent.h>
+#include <sstream>
+#include <string>
+#include <stdexcept>
+
+namespace rocksdb {
+
+using namespace std;
+
+// Canonical names of the command-line options/flags shared by the ldb
+// commands; commands declare which subset they accept via
+// BuildCmdLineOptions().
+const string LDBCommand::ARG_DB = "db";
+const string LDBCommand::ARG_HEX = "hex";
+const string LDBCommand::ARG_KEY_HEX = "key_hex";
+const string LDBCommand::ARG_VALUE_HEX = "value_hex";
+const string LDBCommand::ARG_TTL = "ttl";
+const string LDBCommand::ARG_TTL_START = "start_time";
+const string LDBCommand::ARG_TTL_END = "end_time";
+const string LDBCommand::ARG_TIMESTAMP = "timestamp";
+const string LDBCommand::ARG_FROM = "from";
+const string LDBCommand::ARG_TO = "to";
+const string LDBCommand::ARG_MAX_KEYS = "max_keys";
+const string LDBCommand::ARG_BLOOM_BITS = "bloom_bits";
+const string LDBCommand::ARG_COMPRESSION_TYPE = "compression_type";
+const string LDBCommand::ARG_BLOCK_SIZE = "block_size";
+const string LDBCommand::ARG_AUTO_COMPACTION = "auto_compaction";
+const string LDBCommand::ARG_WRITE_BUFFER_SIZE = "write_buffer_size";
+const string LDBCommand::ARG_FILE_SIZE = "file_size";
+const string LDBCommand::ARG_CREATE_IF_MISSING = "create_if_missing";
+
+// Separator printed/parsed between key and value in dump/load output.
+const char* LDBCommand::DELIM = " ==> ";
+
+// Convenience overload: wraps argv[1..argc-1] (argv[0], the program
+// name, is skipped) into a vector and delegates to the vector overload.
+LDBCommand* LDBCommand::InitFromCmdLineArgs(
+  int argc,
+  char** argv,
+  const Options& options
+) {
+  vector<string> args;
+  for (int i = 1; i < argc; i++) {
+    args.push_back(argv[i]);
+  }
+  return InitFromCmdLineArgs(args, options);
+}
+
+/**
+ * Parse the command-line arguments and create the appropriate LDBCommand
+ * instance.
+ * The command line arguments must be in the following format:
+ * ./ldb --db=PATH_TO_DB [--commonOpt1=commonOpt1Val] ..
+ *        COMMAND <PARAM1> <PARAM2> ... [-cmdSpecificOpt1=cmdSpecificOpt1Val] ..
+ * This is similar to the command line format used by HBaseClientTool.
+ * Command name is not included in args.
+ * Returns nullptr if the command-line cannot be parsed.
+ */
+LDBCommand* LDBCommand::InitFromCmdLineArgs(
+  const vector<string>& args,
+  const Options& options
+) {
+  // --x=y command line arguments are added as x->y map entries.
+  map<string, string> option_map;
+
+  // Command-line arguments of the form --hex end up in this array as hex
+  vector<string> flags;
+
+  // Everything other than option_map and flags. Represents commands
+  // and their parameters.  For eg: put key1 value1 go into this vector.
+  vector<string> cmdTokens;
+
+  const string OPTION_PREFIX = "--";
+
+  for (const auto& arg : args) {
+    if (arg[0] == '-' && arg[1] == '-'){
+      // NOTE(review): any argument with zero or more than one '=' is
+      // treated as a flag (stringSplit is defined elsewhere — confirm
+      // how it handles values that themselves contain '=').
+      vector<string> splits = stringSplit(arg, '=');
+      if (splits.size() == 2) {
+        string optionKey = splits[0].substr(OPTION_PREFIX.size());
+        option_map[optionKey] = splits[1];
+      } else {
+        string optionKey = splits[0].substr(OPTION_PREFIX.size());
+        flags.push_back(optionKey);
+      }
+    } else {
+      cmdTokens.push_back(arg);
+    }
+  }
+
+  if (cmdTokens.size() < 1) {
+    fprintf(stderr, "Command not specified!");
+    return nullptr;
+  }
+
+  // First bare token is the command name; the rest are its parameters.
+  string cmd = cmdTokens[0];
+  vector<string> cmdParams(cmdTokens.begin()+1, cmdTokens.end());
+  LDBCommand* command = LDBCommand::SelectCommand(
+    cmd,
+    cmdParams,
+    option_map,
+    flags
+  );
+
+  if (command) {
+    command->SetOptions(options);
+  }
+  return command;
+}
+
+// Factory: maps a command name to a newly allocated (caller-owned)
+// command object, or nullptr if the name is unknown.
+LDBCommand* LDBCommand::SelectCommand(
+    const std::string& cmd,
+    const vector<string>& cmdParams,
+    const map<string, string>& option_map,
+    const vector<string>& flags
+  ) {
+
+  if (cmd == GetCommand::Name()) {
+    return new GetCommand(cmdParams, option_map, flags);
+  } else if (cmd == PutCommand::Name()) {
+    return new PutCommand(cmdParams, option_map, flags);
+  } else if (cmd == BatchPutCommand::Name()) {
+    return new BatchPutCommand(cmdParams, option_map, flags);
+  } else if (cmd == ScanCommand::Name()) {
+    return new ScanCommand(cmdParams, option_map, flags);
+  } else if (cmd == DeleteCommand::Name()) {
+    return new DeleteCommand(cmdParams, option_map, flags);
+  } else if (cmd == ApproxSizeCommand::Name()) {
+    return new ApproxSizeCommand(cmdParams, option_map, flags);
+  } else if (cmd == DBQuerierCommand::Name()) {
+    return new DBQuerierCommand(cmdParams, option_map, flags);
+  } else if (cmd == CompactorCommand::Name()) {
+    return new CompactorCommand(cmdParams, option_map, flags);
+  } else if (cmd == WALDumperCommand::Name()) {
+    return new WALDumperCommand(cmdParams, option_map, flags);
+  } else if (cmd == ReduceDBLevelsCommand::Name()) {
+    return new ReduceDBLevelsCommand(cmdParams, option_map, flags);
+  } else if (cmd == ChangeCompactionStyleCommand::Name()) {
+    return new ChangeCompactionStyleCommand(cmdParams, option_map, flags);
+  } else if (cmd == DBDumperCommand::Name()) {
+    return new DBDumperCommand(cmdParams, option_map, flags);
+  } else if (cmd == DBLoaderCommand::Name()) {
+    return new DBLoaderCommand(cmdParams, option_map, flags);
+  } else if (cmd == ManifestDumpCommand::Name()) {
+    return new ManifestDumpCommand(cmdParams, option_map, flags);
+  } else if (cmd == InternalDumpCommand::Name()) {
+    return new InternalDumpCommand(cmdParams, option_map, flags);
+  }
+  return nullptr;
+}
+
+
+/**
+ * Parses the specific integer option and fills in the value.
+ * Returns true if the option is found.
+ * Returns false if the option is not found or if there is an error parsing the
+ * value.  If there is an error, the specified exec_state is also
+ * updated.
+ */
+bool LDBCommand::ParseIntOption(const map<string, string>& options,
+                                const string& option, int& value,
+                                LDBCommandExecuteResult& exec_state) {
+
+  map<string, string>::const_iterator itr = option_map_.find(option);
+  if (itr != option_map_.end()) {
+    try {
+      value = stoi(itr->second);
+      return true;
+    } catch(const invalid_argument&) {
+      exec_state = LDBCommandExecuteResult::FAILED(option +
+                      " has an invalid value.");
+    } catch(const out_of_range&) {
+      exec_state = LDBCommandExecuteResult::FAILED(option +
+                      " has a value out-of-range.");
+    }
+  }
+  return false;
+}
+
+/**
+ * Parses the specified option out of `options` and fills in the value.
+ * Returns true if the option is found.
+ * Returns false otherwise.
+ */
+bool LDBCommand::ParseStringOption(const map<string, string>& options,
+                                   const string& option, string* value) {
+  // Fix: consult the `options` parameter instead of the option_map_
+  // member, which the previous code read unconditionally — the
+  // parameter was a dead argument.
+  auto itr = options.find(option);
+  if (itr != options.end()) {
+    *value = itr->second;
+    return true;
+  }
+  return false;
+}
+
+// Builds the Options used to open the DB, starting from the options the
+// caller supplied via SetOptions() and overlaying the values parsed from
+// the command line. Invalid values record a FAILED exec_state_ but
+// parsing continues.
+Options LDBCommand::PrepareOptionsForOpenDB() {
+
+  Options opt = options_;
+  opt.create_if_missing = false;
+
+  map<string, string>::const_iterator itr;
+
+  int bits;
+  if (ParseIntOption(option_map_, ARG_BLOOM_BITS, bits, exec_state_)) {
+    if (bits > 0) {
+      // NOTE(review): NewBloomFilterPolicy returns a raw pointer whose
+      // ownership is not released here — acceptable for a one-shot tool.
+      opt.filter_policy = NewBloomFilterPolicy(bits);
+    } else {
+      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOOM_BITS +
+                      " must be > 0.");
+    }
+  }
+
+  int block_size;
+  if (ParseIntOption(option_map_, ARG_BLOCK_SIZE, block_size, exec_state_)) {
+    if (block_size > 0) {
+      opt.block_size = block_size;
+    } else {
+      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOCK_SIZE +
+                      " must be > 0.");
+    }
+  }
+
+  // --auto_compaction=true/false (note the inversion to "disable").
+  itr = option_map_.find(ARG_AUTO_COMPACTION);
+  if (itr != option_map_.end()) {
+    opt.disable_auto_compactions = ! StringToBool(itr->second);
+  }
+
+  itr = option_map_.find(ARG_COMPRESSION_TYPE);
+  if (itr != option_map_.end()) {
+    string comp = itr->second;
+    if (comp == "no") {
+      opt.compression = kNoCompression;
+    } else if (comp == "snappy") {
+      opt.compression = kSnappyCompression;
+    } else if (comp == "zlib") {
+      opt.compression = kZlibCompression;
+    } else if (comp == "bzip2") {
+      opt.compression = kBZip2Compression;
+    } else {
+      // Unknown compression.
+      exec_state_ = LDBCommandExecuteResult::FAILED(
+                      "Unknown compression level: " + comp);
+    }
+  }
+
+  int write_buffer_size;
+  if (ParseIntOption(option_map_, ARG_WRITE_BUFFER_SIZE, write_buffer_size,
+        exec_state_)) {
+    if (write_buffer_size > 0) {
+      opt.write_buffer_size = write_buffer_size;
+    } else {
+      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_WRITE_BUFFER_SIZE +
+                      " must be > 0.");
+    }
+  }
+
+  int file_size;
+  if (ParseIntOption(option_map_, ARG_FILE_SIZE, file_size, exec_state_)) {
+    if (file_size > 0) {
+      opt.target_file_size_base = file_size;
+    } else {
+      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_FILE_SIZE +
+                      " must be > 0.");
+    }
+  }
+
+  return opt;
+}
+
+// Splits a "key ==> value" line (DELIM-separated) into *key and *value,
+// hex-decoding either side on request. Returns false when DELIM is
+// absent, leaving the outputs untouched.
+bool LDBCommand::ParseKeyValue(const string& line, string* key, string* value,
+                              bool is_key_hex, bool is_value_hex) {
+  size_t pos = line.find(DELIM);
+  if (pos != string::npos) {
+    *key = line.substr(0, pos);
+    *value = line.substr(pos + strlen(DELIM));
+    if (is_key_hex) {
+      *key = HexToString(*key);
+    }
+    if (is_value_hex) {
+      *value = HexToString(*value);
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+/**
+ * Make sure that ONLY the command-line options and flags expected by this
+ * command are specified on the command-line.  Extraneous options are usually
+ * the result of user error.
+ * Returns true if all checks pass.  Else returns false, and prints an
+ * appropriate error msg to stderr.
+ */
+bool LDBCommand::ValidateCmdLineOptions() {
+
+  // Every --key=value option must be in this command's accepted set.
+  for (map<string, string>::const_iterator itr = option_map_.begin();
+        itr != option_map_.end(); itr++) {
+    if (find(valid_cmd_line_options_.begin(),
+          valid_cmd_line_options_.end(), itr->first) ==
+          valid_cmd_line_options_.end()) {
+      fprintf(stderr, "Invalid command-line option %s\n", itr->first.c_str());
+      return false;
+    }
+  }
+
+  // Likewise every bare --flag.
+  for (vector<string>::const_iterator itr = flags_.begin();
+        itr != flags_.end(); itr++) {
+    if (find(valid_cmd_line_options_.begin(),
+          valid_cmd_line_options_.end(), *itr) ==
+          valid_cmd_line_options_.end()) {
+      fprintf(stderr, "Invalid command-line flag %s\n", itr->c_str());
+      return false;
+    }
+  }
+
+  // Commands that open a DB must be told which one via --db=.
+  if (!NoDBOpen() && option_map_.find(ARG_DB) == option_map_.end()) {
+    fprintf(stderr, "%s must be specified\n", ARG_DB.c_str());
+    return false;
+  }
+
+  return true;
+}
+
+// Reads the optional --from/--to range bounds; a missing bound is
+// recorded via null_from_/null_to_ and means "open-ended". Bounds are
+// hex-decoded when --key_hex/--hex is in effect.
+CompactorCommand::CompactorCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+    LDBCommand(options, flags, false,
+               BuildCmdLineOptions({ARG_FROM, ARG_TO, ARG_HEX, ARG_KEY_HEX,
+                                    ARG_VALUE_HEX, ARG_TTL})),
+    null_from_(true), null_to_(true) {
+
+  map<string, string>::const_iterator itr = options.find(ARG_FROM);
+  if (itr != options.end()) {
+    null_from_ = false;
+    from_ = itr->second;
+  }
+
+  itr = options.find(ARG_TO);
+  if (itr != options.end()) {
+    null_to_ = false;
+    to_ = itr->second;
+  }
+
+  if (is_key_hex_) {
+    if (!null_from_) {
+      from_ = HexToString(from_);
+    }
+    if (!null_to_) {
+      to_ = HexToString(to_);
+    }
+  }
+}
+
+// Appends this command's one-line usage string to ret.
+void CompactorCommand::Help(string& ret) {
+  ret.append("  ");
+  ret.append(CompactorCommand::Name());
+  ret.append(HelpRangeCmdArgs());
+  ret.append("\n");
+}
+
+// Runs CompactRange over [from_, to_); a null bound compacts from the
+// start / to the end of the keyspace.
+void CompactorCommand::DoCommand() {
+
+  // Heap-allocated only so that "unbounded" can be expressed as nullptr.
+  Slice* begin = nullptr;
+  Slice* end = nullptr;
+  if (!null_from_) {
+    begin = new Slice(from_);
+  }
+  if (!null_to_) {
+    end = new Slice(to_);
+  }
+
+  db_->CompactRange(begin, end);
+  exec_state_ = LDBCommandExecuteResult::SUCCEED("");
+
+  delete begin;
+  delete end;
+}
+
+// Options specific to the "load" command.
+const string DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal";
+const string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load";
+const string DBLoaderCommand::ARG_COMPACT = "compact";
+
+// Captures the loader flags; all four booleans default to false and are
+// set from the presence of their corresponding command-line flags.
+DBLoaderCommand::DBLoaderCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+    LDBCommand(options, flags, false,
+               BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX,
+                                    ARG_FROM, ARG_TO, ARG_CREATE_IF_MISSING,
+                                    ARG_DISABLE_WAL, ARG_BULK_LOAD,
+                                    ARG_COMPACT})),
+    create_if_missing_(false), disable_wal_(false), bulk_load_(false),
+    compact_(false) {
+
+  create_if_missing_ = IsFlagPresent(flags, ARG_CREATE_IF_MISSING);
+  disable_wal_ = IsFlagPresent(flags, ARG_DISABLE_WAL);
+  bulk_load_ = IsFlagPresent(flags, ARG_BULK_LOAD);
+  compact_ = IsFlagPresent(flags, ARG_COMPACT);
+}
+
+// Appends this command's one-line usage string to ret.
+void DBLoaderCommand::Help(string& ret) {
+  ret.append("  ");
+  ret.append(DBLoaderCommand::Name());
+  ret.append(" [--" + ARG_CREATE_IF_MISSING + "]");
+  ret.append(" [--" + ARG_DISABLE_WAL + "]");
+  ret.append(" [--" + ARG_BULK_LOAD + "]");
+  ret.append(" [--" + ARG_COMPACT + "]");
+  ret.append("\n");
+}
+
+// Extends the base open options with the loader's create-if-missing and
+// bulk-load tweaks.
+Options DBLoaderCommand::PrepareOptionsForOpenDB() {
+  Options opt = LDBCommand::PrepareOptionsForOpenDB();
+  opt.create_if_missing = create_if_missing_;
+  if (bulk_load_) {
+    opt.PrepareForBulkLoad();
+  }
+  return opt;
+}
+
+// Reads "key ==> value" lines from stdin and Put()s them into the DB.
+// Lines matching known informational prefixes are skipped silently;
+// anything else unparsable is counted and reported as a warning.
+void DBLoaderCommand::DoCommand() {
+  if (!db_) {
+    return;
+  }
+
+  WriteOptions write_options;
+  if (disable_wal_) {
+    write_options.disableWAL = true;
+  }
+
+  int bad_lines = 0;
+  string line;
+  while (getline(cin, line, '\n')) {
+    string key;
+    string value;
+    if (ParseKeyValue(line, &key, &value, is_key_hex_, is_value_hex_)) {
+      // NOTE(review): the Put() status is ignored; a failed write is not
+      // counted among the bad lines.
+      db_->Put(write_options, Slice(key), Slice(value));
+    } else if (0 == line.find("Keys in range:")) {
+      // ignore this line
+    } else if (0 == line.find("Created bg thread 0x")) {
+      // ignore this line
+    } else {
+      bad_lines ++;
+    }
+  }
+
+  if (bad_lines > 0) {
+    cout << "Warning: " << bad_lines << " bad lines ignored." << endl;
+  }
+  if (compact_) {
+    // Full-range compaction after the load, when requested.
+    db_->CompactRange(nullptr, nullptr);
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+// Options specific to the manifest-dump command.
+const string ManifestDumpCommand::ARG_VERBOSE = "verbose";
+const string ManifestDumpCommand::ARG_PATH    = "path";
+
+// Appends this command's one-line usage string to ret.
+void ManifestDumpCommand::Help(string& ret) {
+  ret.append("  ");
+  ret.append(ManifestDumpCommand::Name());
+  ret.append(" [--" + ARG_VERBOSE + "]");
+  ret.append(" [--" + ARG_PATH + "=<path_to_manifest_file>]");
+  ret.append("\n");
+}
+
+// Reads --verbose and the optional --path override for the manifest
+// file location; an explicitly empty --path= is an immediate failure.
+ManifestDumpCommand::ManifestDumpCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+    LDBCommand(options, flags, false,
+               BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX})),
+    verbose_(false),
+    path_("")
+{
+  verbose_ = IsFlagPresent(flags, ARG_VERBOSE);
+
+  map<string, string>::const_iterator itr = options.find(ARG_PATH);
+  if (itr != options.end()) {
+    path_ = itr->second;
+    if (path_.empty()) {
+      exec_state_ = LDBCommandExecuteResult::FAILED("--path: missing pathname");
+    }
+  }
+}
+
+void ManifestDumpCommand::DoCommand() {
+
+  std::string manifestfile;
+
+  if (!path_.empty()) {
+    manifestfile = path_;
+  } else {
+    bool found = false;
+    // We need to find the manifest file by searching the directory
+    // containing the db for files of the form MANIFEST_[0-9]+
+    DIR* d = opendir(db_path_.c_str());
+    if (d == nullptr) {
+      exec_state_ = LDBCommandExecuteResult::FAILED(
+        db_path_ + " is not a directory");
+      return;
+    }
+    struct dirent* entry;
+    while ((entry = readdir(d)) != nullptr) {
+      unsigned int match;
+      unsigned long long num;
+      if (sscanf(entry->d_name,
+                 "MANIFEST-%ln%ln",
+                 (unsigned long*)&num,
+                 (unsigned long*)&match)
+          && match == strlen(entry->d_name)) {
+        if (!found) {
+          manifestfile = db_path_ + "/" + std::string(entry->d_name);
+          found = true;
+        } else {
+          exec_state_ = LDBCommandExecuteResult::FAILED(
+            "Multiple MANIFEST files found; use --path to select one");
+          return;
+        }
+      }
+    }
+    closedir(d);
+  }
+
+  if (verbose_) {
+    printf("Processing Manifest file %s\n", manifestfile.c_str());
+  }
+
+  Options options;
+  EnvOptions sopt;
+  std::string file(manifestfile);
+  std::string dbname("dummy");
+  TableCache* tc = new TableCache(dbname, &options, sopt, 10);
+  const InternalKeyComparator* cmp =
+    new InternalKeyComparator(options.comparator);
+
+  VersionSet* versions = new VersionSet(dbname, &options, sopt, tc, cmp);
+  Status s = versions->DumpManifest(options, file, verbose_, is_key_hex_);
+  if (!s.ok()) {
+    printf("Error in processing file %s %s\n", manifestfile.c_str(),
+           s.ToString().c_str());
+  }
+  if (verbose_) {
+    printf("Processing Manifest file %s done\n", manifestfile.c_str());
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+// Formats a unix timestamp with the locale's "%c" date/time format.
+// NOTE(review): localtime() returns a shared static buffer, so this is
+// not thread-safe; fine for the single-threaded ldb tool.
+string ReadableTime(int unixtime) {
+  char time_buffer [80];
+  time_t rawtime = unixtime;
+  struct tm * timeinfo = localtime(&rawtime);
+  strftime(time_buffer, 80, "%c", timeinfo);
+  return string(time_buffer);
+}
+
+// This function only called when it's the sane case of >1 buckets in time-range
+// Also called only when timekv falls between ttl_start and ttl_end provided
+// Increments the count of the time bucket that timekv falls into; the
+// preconditions above are enforced by the assert.
+void IncBucketCounts(vector<uint64_t>& bucket_counts, int ttl_start,
+      int time_range, int bucket_size, int timekv, int num_buckets) {
+  assert(time_range > 0 && timekv >= ttl_start && bucket_size > 0 &&
+    timekv < (ttl_start + time_range) && num_buckets > 1);
+  int bucket = (timekv - ttl_start) / bucket_size;
+  bucket_counts[bucket]++;
+}
+
+// Prints one line per time bucket; the final bucket is printed
+// separately because its upper bound is ttl_end rather than a full
+// bucket_size step.
+void PrintBucketCounts(const vector<uint64_t>& bucket_counts, int ttl_start,
+      int ttl_end, int bucket_size, int num_buckets) {
+  int time_point = ttl_start;
+  for(int i = 0; i < num_buckets - 1; i++, time_point += bucket_size) {
+    fprintf(stdout, "Keys in range %s to %s : %lu\n",
+            ReadableTime(time_point).c_str(),
+            ReadableTime(time_point + bucket_size).c_str(),
+            (unsigned long)bucket_counts[i]);
+  }
+  fprintf(stdout, "Keys in range %s to %s : %lu\n",
+          ReadableTime(time_point).c_str(),
+          ReadableTime(ttl_end).c_str(),
+          (unsigned long)bucket_counts[num_buckets - 1]);
+}
+
+// Options specific to the internal-dump command.
+const string InternalDumpCommand::ARG_COUNT_ONLY = "count_only";
+const string InternalDumpCommand::ARG_COUNT_DELIM = "count_delim";
+const string InternalDumpCommand::ARG_STATS = "stats";
+const string InternalDumpCommand::ARG_INPUT_KEY_HEX = "input_key_hex";
+
+// Parses range bounds, key limit and the count/stats flags. count_delim
+// mode groups keys by the prefix up to the delimiter character.
+InternalDumpCommand::InternalDumpCommand(const vector<string>& params,
+                                         const map<string, string>& options,
+                                         const vector<string>& flags) :
+    LDBCommand(options, flags, true,
+               BuildCmdLineOptions({ ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX,
+                                     ARG_FROM, ARG_TO, ARG_MAX_KEYS,
+                                     ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS,
+                                     ARG_INPUT_KEY_HEX})),
+    has_from_(false),
+    has_to_(false),
+    max_keys_(-1),
+    delim_("."),
+    count_only_(false),
+    count_delim_(false),
+    print_stats_(false),
+    is_input_key_hex_(false) {
+
+  has_from_ = ParseStringOption(options, ARG_FROM, &from_);
+  has_to_ = ParseStringOption(options, ARG_TO, &to_);
+
+  ParseIntOption(options, ARG_MAX_KEYS, max_keys_, exec_state_);
+  // --count_delim may be given with a value (the delimiter) or as a
+  // bare flag (delimiter defaults to ".").
+  map<string, string>::const_iterator itr = options.find(ARG_COUNT_DELIM);
+  if (itr != options.end()) {
+    delim_ = itr->second;
+    count_delim_ = true;
+   // fprintf(stdout,"delim = %c\n",delim_[0]);
+  } else {
+    count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM);
+    delim_=".";
+  }
+
+  print_stats_ = IsFlagPresent(flags, ARG_STATS);
+  count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY);
+  is_input_key_hex_ = IsFlagPresent(flags, ARG_INPUT_KEY_HEX);
+
+  if (is_input_key_hex_) {
+    if (has_from_) {
+      from_ = HexToString(from_);
+    }
+    if (has_to_) {
+      to_ = HexToString(to_);
+    }
+  }
+}
+
+// Appends this command's one-line usage string to ret.
+void InternalDumpCommand::Help(string& ret) {
+  ret.append("  ");
+  ret.append(InternalDumpCommand::Name());
+  ret.append(HelpRangeCmdArgs());
+  ret.append(" [--" + ARG_INPUT_KEY_HEX + "]");
+  ret.append(" [--" + ARG_MAX_KEYS + "=<N>]");
+  ret.append(" [--" + ARG_COUNT_ONLY + "]");
+  ret.append(" [--" + ARG_COUNT_DELIM + "=<char>]");
+  ret.append(" [--" + ARG_STATS + "]");
+  ret.append("\n");
+}
+
+// Walks the DB's *internal* key iterator (via DBImpl) and either dumps
+// each internal key/value, or in count_delim mode aggregates
+// count/size per key prefix (up to delim_[0]).
+void InternalDumpCommand::DoCommand() {
+  if (!db_) {
+    return;
+  }
+
+  if (print_stats_) {
+    string stats;
+    if (db_->GetProperty("rocksdb.stats", &stats)) {
+      fprintf(stdout, "%s\n", stats.c_str());
+    }
+  }
+
+  // Cast as DBImpl to get internal iterator
+  DBImpl* idb = dynamic_cast<DBImpl*>(db_);
+  if (!idb) {
+    exec_state_ = LDBCommandExecuteResult::FAILED("DB is not DBImpl");
+    return;
+  }
+  // rtype1/rtype2: current and previous key prefix; c/s1/s2: running
+  // count and sizes for the count_delim aggregation.
+  string rtype1,rtype2,row,val;
+  rtype2 = "";
+  uint64_t c=0;
+  uint64_t s1=0,s2=0;
+  // Setup internal key iterator
+  auto iter = unique_ptr<Iterator>(idb->TEST_NewInternalIterator());
+  Status st = iter->status();
+  if (!st.ok()) {
+    exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error:"
+                                                  + st.ToString());
+  }
+
+  if (has_from_) {
+    // Seek to the first internal entry >= from_.
+    InternalKey ikey(from_, kMaxSequenceNumber, kValueTypeForSeek);
+    iter->Seek(ikey.Encode());
+  } else {
+    iter->SeekToFirst();
+  }
+
+  long long count = 0;
+  for (; iter->Valid(); iter->Next()) {
+    ParsedInternalKey ikey;
+    if (!ParseInternalKey(iter->key(), &ikey)) {
+      fprintf(stderr, "Internal Key [%s] parse error!\n",
+              iter->key().ToString(true /* in hex*/).data());
+      // TODO: add error counter
+      continue;
+    }
+
+    // If end marker was specified, we stop before it
+    if (has_to_ && options_.comparator->Compare(ikey.user_key, to_) >= 0) {
+      break;
+    }
+
+    ++count;
+    int k;
+    if (count_delim_) {
+      rtype1 = "";
+      s1=0;
+      row = iter->key().ToString();
+      val = iter->value().ToString();
+      // Size counts characters up to the first \x01 or NUL in key+value.
+      for(k=0;row[k]!='\x01' && row[k]!='\0';k++)
+        s1++;
+      for(k=0;val[k]!='\x01' && val[k]!='\0';k++)
+        s1++;
+      for(int j=0;row[j]!=delim_[0] && row[j]!='\0' && row[j]!='\x01';j++)
+        rtype1+=row[j];
+      // Prefix changed: flush the previous group's totals.
+      if(rtype2.compare("") && rtype2.compare(rtype1)!=0) {
+        fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(),
+            (long long)c,(long long)s2);
+        c=1;
+        s2=s1;
+        rtype2 = rtype1;
+      } else {
+        c++;
+        s2+=s1;
+        rtype2=rtype1;
+    }
+  }
+
+    // NOTE(review): the two braces above close the else branch and the
+    // if (count_delim_) block despite their indentation — everything
+    // below is still inside the for loop.
+    if (!count_only_ && !count_delim_) {
+      string key = ikey.DebugString(is_key_hex_);
+      string value = iter->value().ToString(is_value_hex_);
+      std::cout << key << " => " << value << "\n";
+    }
+
+    // Terminate if maximum number of keys have been dumped
+    if (max_keys_ > 0 && count >= max_keys_) break;
+  }
+  // Flush the final group (count_delim mode) or print the total.
+  if(count_delim_) {
+    fprintf(stdout,"%s => count:%lld\tsize:%lld\n", rtype2.c_str(),
+        (long long)c,(long long)s2);
+  } else
+  fprintf(stdout, "Internal keys in range: %lld\n", (long long) count);
+}
+
+
+// Options specific to the dump command.
+const string DBDumperCommand::ARG_COUNT_ONLY = "count_only";
+const string DBDumperCommand::ARG_COUNT_DELIM = "count_delim";
+const string DBDumperCommand::ARG_STATS = "stats";
+const string DBDumperCommand::ARG_TTL_BUCKET = "bucket";
+
+// Parses range bounds, key limit, count/stats flags and the TTL
+// bucketing options for the user-level dump.
+DBDumperCommand::DBDumperCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+    LDBCommand(options, flags, true,
+               BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
+                                    ARG_VALUE_HEX, ARG_FROM, ARG_TO,
+                                    ARG_MAX_KEYS, ARG_COUNT_ONLY,
+                                    ARG_COUNT_DELIM, ARG_STATS, ARG_TTL_START,
+                                    ARG_TTL_END, ARG_TTL_BUCKET,
+                                    ARG_TIMESTAMP})),
+    null_from_(true),
+    null_to_(true),
+    max_keys_(-1),
+    count_only_(false),
+    count_delim_(false),
+    print_stats_(false) {
+
+  map<string, string>::const_iterator itr = options.find(ARG_FROM);
+  if (itr != options.end()) {
+    null_from_ = false;
+    from_ = itr->second;
+  }
+
+  itr = options.find(ARG_TO);
+  if (itr != options.end()) {
+    null_to_ = false;
+    to_ = itr->second;
+  }
+
+  // NOTE(review): this duplicates the stoi parsing that ParseIntOption
+  // already provides; consider reusing the helper.
+  itr = options.find(ARG_MAX_KEYS);
+  if (itr != options.end()) {
+    try {
+      max_keys_ = stoi(itr->second);
+    } catch(const invalid_argument&) {
+      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS +
+                        " has an invalid value");
+    } catch(const out_of_range&) {
+      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS +
+                        " has a value out-of-range");
+    }
+  }
+  // --count_delim with a value sets the delimiter; as a bare flag the
+  // delimiter defaults to ".".
+  itr = options.find(ARG_COUNT_DELIM);
+  if (itr != options.end()) {
+    delim_ = itr->second;
+    count_delim_ = true;
+  } else {
+    count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM);
+    delim_=".";
+  }
+
+  print_stats_ = IsFlagPresent(flags, ARG_STATS);
+  count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY);
+
+  if (is_key_hex_) {
+    if (!null_from_) {
+      from_ = HexToString(from_);
+    }
+    if (!null_to_) {
+      to_ = HexToString(to_);
+    }
+  }
+}
+
+// Appends this command's one-line usage string to ret.
+void DBDumperCommand::Help(string& ret) {
+  ret.append("  ");
+  ret.append(DBDumperCommand::Name());
+  ret.append(HelpRangeCmdArgs());
+  ret.append(" [--" + ARG_TTL + "]");
+  ret.append(" [--" + ARG_MAX_KEYS + "=<N>]");
+  ret.append(" [--" + ARG_TIMESTAMP + "]");
+  ret.append(" [--" + ARG_COUNT_ONLY + "]");
+  ret.append(" [--" + ARG_COUNT_DELIM + "=<char>]");
+  ret.append(" [--" + ARG_STATS + "]");
+  ret.append(" [--" + ARG_TTL_BUCKET + "=<N>]");
+  ret.append(" [--" + ARG_TTL_START + "=<N>:- is inclusive]");
+  ret.append(" [--" + ARG_TTL_END + "=<N>:- is exclusive]");
+  ret.append("\n");
+}
+
+// Dumps key/value pairs to stdout, optionally restricted to a key range
+// ([from_, to_)) and, for a TTL db, to a timestamp window
+// [ttl_start, ttl_end).  With --count_only or --count_delim only counts
+// (and sizes) are printed.  No-op when the db failed to open.
+void DBDumperCommand::DoCommand() {
+  if (!db_) {
+    return;
+  }
+  // Parse command line args
+  uint64_t count = 0;
+  if (print_stats_) {
+    string stats;
+    if (db_->GetProperty("rocksdb.stats", &stats)) {
+      fprintf(stdout, "%s\n", stats.c_str());
+    }
+  }
+
+  // Setup key iterator
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  Status st = iter->status();
+  if (!st.ok()) {
+    // NOTE(review): execution deliberately continues here; an invalid
+    // iterator simply makes the dump loop below produce nothing.
+    exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error."
+        + st.ToString());
+  }
+
+  if (!null_from_) {
+    iter->Seek(from_);
+  } else {
+    iter->SeekToFirst();
+  }
+
+  int max_keys = max_keys_;
+  // TTL window defaults to the full valid range when not specified.
+  int ttl_start;
+  if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) {
+    ttl_start = DBWithTTL::kMinTimestamp; // TTL introduction time
+  }
+  int ttl_end;
+  if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) {
+    ttl_end = DBWithTTL::kMaxTimestamp; // Max time allowed by TTL feature
+  }
+  if (ttl_end < ttl_start) {
+    fprintf(stderr, "Error: End time can't be less than start time\n");
+    delete iter;
+    return;
+  }
+  int time_range = ttl_end - ttl_start;
+  int bucket_size;
+  if (!ParseIntOption(option_map_, ARG_TTL_BUCKET, bucket_size, exec_state_) ||
+      bucket_size <= 0) {
+    bucket_size = time_range; // Will have just 1 bucket by default
+  }
+  // Per-key-prefix accumulators used by --count_delim: rtype1/rtype2 hold
+  // the current and previous prefix, c counts rows, s1/s2 accumulate sizes.
+  string rtype1,rtype2,row,val;
+  rtype2 = "";
+  uint64_t c=0;
+  uint64_t s1=0,s2=0;
+
+  // At this point, bucket_size=0 => time_range=0, so the ">=" branch below
+  // also guards the division against bucket_size == 0.
+  uint64_t num_buckets = (bucket_size >= time_range) ? 1 :
+    ((time_range + bucket_size - 1) / bucket_size);
+  vector<uint64_t> bucket_counts(num_buckets, 0);
+  if (is_db_ttl_ && !count_only_ && timestamp_ && !count_delim_) {
+    fprintf(stdout, "Dumping key-values from %s to %s\n",
+            ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str());
+  }
+
+  for (; iter->Valid(); iter->Next()) {
+    int rawtime = 0;
+    // If end marker was specified, we stop before it
+    if (!null_to_ && (iter->key().ToString() >= to_))
+      break;
+    // Terminate if maximum number of keys have been dumped
+    if (max_keys == 0)
+      break;
+    if (is_db_ttl_) {
+      // Keys outside the requested timestamp window are skipped and do not
+      // count toward max_keys.
+      TtlIterator* it_ttl = dynamic_cast<TtlIterator*>(iter);
+      assert(it_ttl);
+      rawtime = it_ttl->timestamp();
+      if (rawtime < ttl_start || rawtime >= ttl_end) {
+        continue;
+      }
+    }
+    if (max_keys > 0) {
+      --max_keys;  // negative max_keys_ means "unlimited"
+    }
+    if (is_db_ttl_ && num_buckets > 1) {
+      IncBucketCounts(bucket_counts, ttl_start, time_range, bucket_size,
+                      rawtime, num_buckets);
+    }
+    ++count;
+    if (count_delim_) {
+      // Extract the key prefix up to the first delimiter character; whenever
+      // the prefix changes, flush the previous prefix's count and size.
+      rtype1 = "";
+      row = iter->key().ToString();
+      val = iter->value().ToString();
+      s1 = row.size()+val.size();
+      for(int j=0;row[j]!=delim_[0] && row[j]!='\0';j++)
+        rtype1+=row[j];
+      if(rtype2.compare("") && rtype2.compare(rtype1)!=0) {
+        fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(),
+            (long long )c,(long long)s2);
+        c=1;
+        s2=s1;
+        rtype2 = rtype1;
+      } else {
+          c++;
+          s2+=s1;
+          rtype2=rtype1;
+      }
+
+    }
+
+
+
+    if (!count_only_ && !count_delim_) {
+      if (is_db_ttl_ && timestamp_) {
+        fprintf(stdout, "%s ", ReadableTime(rawtime).c_str());
+      }
+      string str = PrintKeyValue(iter->key().ToString(),
+                                 iter->value().ToString(), is_key_hex_,
+                                 is_value_hex_);
+      fprintf(stdout, "%s\n", str.c_str());
+    }
+  }
+
+  // Final report: bucketed TTL histogram, last delim-prefix tally, or the
+  // plain key count, depending on the mode.
+  if (num_buckets > 1 && is_db_ttl_) {
+    PrintBucketCounts(bucket_counts, ttl_start, ttl_end, bucket_size,
+                      num_buckets);
+  } else if(count_delim_) {
+    fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(),
+        (long long )c,(long long)s2);
+  } else {
+    fprintf(stdout, "Keys in range: %lld\n", (long long) count);
+  }
+  // Clean up
+  delete iter;
+}
+
+// Command-line option names specific to the reduce_levels command.
+const string ReduceDBLevelsCommand::ARG_NEW_LEVELS = "new_levels";
+const string  ReduceDBLevelsCommand::ARG_PRINT_OLD_LEVELS = "print_old_levels";
+
+// Parses --new_levels (required, must be > 0) and --print_old_levels.
+// old_levels_ starts at a large sentinel (1 << 16) and is replaced by the
+// real level count discovered in DoCommand().
+ReduceDBLevelsCommand::ReduceDBLevelsCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+    LDBCommand(options, flags, false,
+               BuildCmdLineOptions({ARG_NEW_LEVELS, ARG_PRINT_OLD_LEVELS})),
+    old_levels_(1 << 16),
+    new_levels_(-1),
+    print_old_levels_(false) {
+
+
+  ParseIntOption(option_map_, ARG_NEW_LEVELS, new_levels_, exec_state_);
+  print_old_levels_ = IsFlagPresent(flags, ARG_PRINT_OLD_LEVELS);
+
+  if(new_levels_ <= 0) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+           " Use --" + ARG_NEW_LEVELS + " to specify a new level number\n");
+  }
+}
+
+// Builds the argv-style vector needed to invoke reduce_levels
+// programmatically (command name first, then its options).
+vector<string> ReduceDBLevelsCommand::PrepareArgs(const string& db_path,
+    int new_levels, bool print_old_level) {
+  vector<string> args;
+  args.push_back("reduce_levels");
+  args.push_back("--" + ARG_DB + "=" + db_path);
+  args.push_back("--" + ARG_NEW_LEVELS + "=" + to_string(new_levels));
+  if (print_old_level) {
+    args.push_back("--" + ARG_PRINT_OLD_LEVELS);
+  }
+  return args;
+}
+
+// Appends the one-line usage summary for reduce_levels to 'ret'.
+void ReduceDBLevelsCommand::Help(string& ret) {
+  ret += "  ";
+  ret += ReduceDBLevelsCommand::Name();
+  ret += " --" + ARG_NEW_LEVELS + "=<New number of levels>";
+  ret += " [--" + ARG_PRINT_OLD_LEVELS + "]";
+  ret += "\n";
+}
+
+// Opens the db with the (old) level count and with size-triggered
+// compactions effectively disabled, so the manual CompactRange() in
+// DoCommand() is the only compaction that runs.
+Options ReduceDBLevelsCommand::PrepareOptionsForOpenDB() {
+  Options opt = LDBCommand::PrepareOptionsForOpenDB();
+  opt.num_levels = old_levels_;
+  opt.max_bytes_for_level_multiplier_additional.resize(opt.num_levels, 1);
+  // Disable size compaction
+  opt.max_bytes_for_level_base = 1UL << 50;
+  opt.max_bytes_for_level_multiplier = 1;
+  opt.max_mem_compaction_level = 0;
+  return opt;
+}
+
+// Reads the MANIFEST (without opening the db) and returns via *levels the
+// number of levels actually in use: highest non-empty level + 1.
+Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt,
+    int* levels) {
+  EnvOptions soptions;
+  TableCache tc(db_path_, &opt, soptions, 10);
+  const InternalKeyComparator cmp(opt.comparator);
+  VersionSet versions(db_path_, &opt, soptions, &tc, &cmp);
+  // We rely the VersionSet::Recover to tell us the internal data structures
+  // in the db. And the Recover() should never do any change
+  // (like LogAndApply) to the manifest file.
+  Status st = versions.Recover();
+  if (!st.ok()) {
+    return st;
+  }
+  // Find the deepest level that still holds at least one file.
+  int max = -1;
+  for (int i = 0; i < versions.NumberLevels(); i++) {
+    if (versions.current()->NumLevelFiles(i)) {
+      max = i;
+    }
+  }
+
+  *levels = max + 1;
+  return st;
+}
+
+// Shrinks the db to new_levels_ levels: discover the old level count from
+// the MANIFEST, compact everything into the bottom level, then rewrite the
+// manifest via VersionSet::ReduceNumberOfLevels().
+void ReduceDBLevelsCommand::DoCommand() {
+  // NOTE(review): the constructor only rejects new_levels_ <= 0; the
+  // stricter <= 1 requirement is enforced here.
+  if (new_levels_ <= 1) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+        "Invalid number of levels.\n");
+    return;
+  }
+
+  Status st;
+  Options opt = PrepareOptionsForOpenDB();
+  int old_level_num = -1;
+  st = GetOldNumOfLevels(opt, &old_level_num);
+  if (!st.ok()) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+    return;
+  }
+
+  if (print_old_levels_) {
+    fprintf(stdout, "The old number of levels in use is %d\n", old_level_num);
+  }
+
+  // Nothing to do if the db already fits within the requested level count.
+  if (old_level_num <= new_levels_) {
+    return;
+  }
+
+  // Re-open with the real old level count so every existing file is visible.
+  old_levels_ = old_level_num;
+
+  OpenDB();
+  if (!db_) {
+    return;
+  }
+  // Compact the whole DB to put all files to the highest level.
+  fprintf(stdout, "Compacting the db...\n");
+  db_->CompactRange(nullptr, nullptr);
+  CloseDB();
+
+  EnvOptions soptions;
+  TableCache tc(db_path_, &opt, soptions, 10);
+  const InternalKeyComparator cmp(opt.comparator);
+  VersionSet versions(db_path_, &opt, soptions, &tc, &cmp);
+  // We rely the VersionSet::Recover to tell us the internal data structures
+  // in the db. And the Recover() should never do any change (like LogAndApply)
+  // to the manifest file.
+  st = versions.Recover();
+  if (!st.ok()) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+    return;
+  }
+
+  // ReduceNumberOfLevels() takes a locked mutex; presumably it expects the
+  // (here private) db mutex to be held — TODO confirm against VersionSet.
+  port::Mutex mu;
+  mu.Lock();
+  st = versions.ReduceNumberOfLevels(new_levels_, &mu);
+  mu.Unlock();
+
+  if (!st.ok()) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+    return;
+  }
+}
+
+// Command-line option names specific to the change_compaction_style command.
+const string ChangeCompactionStyleCommand::ARG_OLD_COMPACTION_STYLE =
+  "old_compaction_style";
+const string ChangeCompactionStyleCommand::ARG_NEW_COMPACTION_STYLE =
+  "new_compaction_style";
+
+// Validates --old_compaction_style/--new_compaction_style.  Both must be a
+// known style (level=0 or universal=1), must differ, and the only supported
+// conversion is level -> universal.
+ChangeCompactionStyleCommand::ChangeCompactionStyleCommand(
+      const vector<string>& params, const map<string, string>& options,
+      const vector<string>& flags) :
+    LDBCommand(options, flags, false,
+               BuildCmdLineOptions({ARG_OLD_COMPACTION_STYLE,
+                                    ARG_NEW_COMPACTION_STYLE})),
+    old_compaction_style_(-1),
+    new_compaction_style_(-1) {
+
+  ParseIntOption(option_map_, ARG_OLD_COMPACTION_STYLE, old_compaction_style_,
+    exec_state_);
+  if (old_compaction_style_ != kCompactionStyleLevel &&
+     old_compaction_style_ != kCompactionStyleUniversal) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+      "Use --" + ARG_OLD_COMPACTION_STYLE + " to specify old compaction " +
+      "style. Check ldb help for proper compaction style value.\n");
+    return;
+  }
+
+  ParseIntOption(option_map_, ARG_NEW_COMPACTION_STYLE, new_compaction_style_,
+    exec_state_);
+  if (new_compaction_style_ != kCompactionStyleLevel &&
+     new_compaction_style_ != kCompactionStyleUniversal) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+      "Use --" + ARG_NEW_COMPACTION_STYLE + " to specify new compaction " +
+      "style. Check ldb help for proper compaction style value.\n");
+    return;
+  }
+
+  if (new_compaction_style_ == old_compaction_style_) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+      "Old compaction style is the same as new compaction style. "
+      "Nothing to do.\n");
+    return;
+  }
+
+  // Universal -> level needs no work: a universal LSM shape is already a
+  // valid level-style shape.
+  if (old_compaction_style_ == kCompactionStyleUniversal &&
+      new_compaction_style_ == kCompactionStyleLevel) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+      "Convert from universal compaction to level compaction. "
+      "Nothing to do.\n");
+    return;
+  }
+}
+
+// Appends the one-line usage summary for change_compaction_style to 'ret'.
+void ChangeCompactionStyleCommand::Help(string& ret) {
+  ret += "  ";
+  ret += ChangeCompactionStyleCommand::Name();
+  ret += " --" + ARG_OLD_COMPACTION_STYLE + "=<Old compaction style: 0 " +
+         "for level compaction, 1 for universal compaction>";
+  ret += " --" + ARG_NEW_COMPACTION_STYLE + "=<New compaction style: 0 " +
+         "for level compaction, 1 for universal compaction>";
+  ret += "\n";
+}
+
+// For the level -> universal conversion, opens the db with auto compactions
+// off and file/level size limits maxed out so the manual compaction in
+// DoCommand() can collapse everything into one file.
+Options ChangeCompactionStyleCommand::PrepareOptionsForOpenDB() {
+  Options opt = LDBCommand::PrepareOptionsForOpenDB();
+
+  if (old_compaction_style_ == kCompactionStyleLevel &&
+      new_compaction_style_ == kCompactionStyleUniversal) {
+    // In order to convert from level compaction to universal compaction, we
+    // need to compact all data into a single file and move it to level 0.
+    opt.disable_auto_compactions = true;
+    opt.target_file_size_base = INT_MAX;
+    opt.target_file_size_multiplier = 1;
+    opt.max_bytes_for_level_base = INT_MAX;
+    opt.max_bytes_for_level_multiplier = 1;
+  }
+
+  return opt;
+}
+
+void ChangeCompactionStyleCommand::DoCommand() {
+  // print db stats before we have made any change
+  std::string property;
+  std::string files_per_level;
+  for (int i = 0; i < db_->NumberLevels(); i++) {
+    db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(i),
+                     &property);
+
+    // format print string
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%s%s", (i ? "," : ""), property.c_str());
+    files_per_level += buf;
+  }
+  fprintf(stdout, "files per level before compaction: %s\n",
+          files_per_level.c_str());
+
+  // manual compact into a single file and move the file to level 0
+  db_->CompactRange(nullptr, nullptr,
+                    true /* reduce level */,
+                    0    /* reduce to level 0 */);
+
+  // verify compaction result
+  files_per_level = "";
+  int num_files = 0;
+  for (int i = 0; i < db_->NumberLevels(); i++) {
+    db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(i),
+                     &property);
+
+    // format print string
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%s%s", (i ? "," : ""), property.c_str());
+    files_per_level += buf;
+
+    num_files = atoi(property.c_str());
+
+    // level 0 should have only 1 file
+    if (i == 0 && num_files != 1) {
+      exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at "
+        "level 0 after compaction is " + std::to_string(num_files) +
+        ", not 1.\n");
+      return;
+    }
+    // other levels should have no file
+    if (i > 0 && num_files != 0) {
+      exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at "
+        "level " + std::to_string(i) + " after compaction is " +
+        std::to_string(num_files) + ", not 0.\n");
+      return;
+    }
+  }
+
+  fprintf(stdout, "files per level after compaction: %s\n",
+          files_per_level.c_str());
+}
+
+// WriteBatch::Handler that renders each batch operation as hex-encoded text
+// into the stringstream supplied at construction.  Values are only printed
+// when print_values is true.
+class InMemoryHandler : public WriteBatch::Handler {
+ public:
+  InMemoryHandler(stringstream& row, bool print_values) : Handler(),row_(row) {
+    print_values_ = print_values;
+  }
+
+  // Shared formatting for Put and Merge: "<hexkey> : <hexvalue> " or just
+  // "<hexkey> " when values are suppressed.
+  void commonPutMerge(const Slice& key, const Slice& value) {
+    string k = LDBCommand::StringToHex(key.ToString());
+    if (print_values_) {
+      string v = LDBCommand::StringToHex(value.ToString());
+      row_ << k << " : ";
+      row_ << v << " ";
+    } else {
+      row_ << k << " ";
+    }
+  }
+
+  virtual void Put(const Slice& key, const Slice& value) {
+    row_ << "PUT : ";
+    commonPutMerge(key, value);
+  }
+
+  virtual void Merge(const Slice& key, const Slice& value) {
+    row_ << "MERGE : ";
+    commonPutMerge(key, value);
+  }
+
+  // NOTE(review): the leading comma in ",DELETE : " is inconsistent with the
+  // "PUT : "/"MERGE : " prefixes and looks accidental; preserved as-is since
+  // it is user-visible output.
+  virtual void Delete(const Slice& key) {
+    row_ <<",DELETE : ";
+    row_ << LDBCommand::StringToHex(key.ToString()) << " ";
+  }
+
+  virtual ~InMemoryHandler() { };
+
+ private:
+  stringstream & row_;   // output sink; owned by the caller
+  bool print_values_;    // include values in the rendered output?
+};
+
+// Command-line option names specific to the WAL dump command.
+const string WALDumperCommand::ARG_WAL_FILE = "walfile";
+const string WALDumperCommand::ARG_PRINT_VALUE = "print_value";
+const string WALDumperCommand::ARG_PRINT_HEADER = "header";
+
+// Parses --walfile (required), --header and --print_value.  Runs without
+// opening the db (third LDBCommand argument is true).
+WALDumperCommand::WALDumperCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+    LDBCommand(options, flags, true,
+               BuildCmdLineOptions(
+                {ARG_WAL_FILE, ARG_PRINT_HEADER, ARG_PRINT_VALUE})),
+    print_header_(false), print_values_(false) {
+
+  wal_file_.clear();
+
+  map<string, string>::const_iterator itr = options.find(ARG_WAL_FILE);
+  if (itr != options.end()) {
+    wal_file_ = itr->second;
+  }
+
+
+  print_header_ = IsFlagPresent(flags, ARG_PRINT_HEADER);
+  print_values_ = IsFlagPresent(flags, ARG_PRINT_VALUE);
+  if (wal_file_.empty()) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+                    "Argument " + ARG_WAL_FILE + " must be specified.");
+  }
+}
+
+// Appends the one-line usage summary for the WAL dump command to 'ret'.
+void WALDumperCommand::Help(string& ret) {
+  ret += "  ";
+  ret += WALDumperCommand::Name();
+  ret += " --" + ARG_WAL_FILE + "=<write_ahead_log_file_path>";
+  ret += " [--" + ARG_PRINT_HEADER + "] ";
+  ret += " [--" + ARG_PRINT_VALUE + "] ";
+  ret += "\n";
+}
+
+// Reads the WAL file record by record and prints, per record:
+// sequence, op count, byte size, physical offset, and the keys (and
+// optionally values) rendered by InMemoryHandler.
+void WALDumperCommand::DoCommand() {
+  struct StdErrReporter : public log::Reader::Reporter {
+    virtual void Corruption(size_t bytes, const Status& s) {
+      cerr<<"Corruption detected in log file "<<s.ToString()<<"\n";
+    }
+  };
+
+  unique_ptr<SequentialFile> file;
+  // NOTE(review): 'env_' is a local here despite the member-style name.
+  Env* env_ = Env::Default();
+  EnvOptions soptions;
+  Status status = env_->NewSequentialFile(wal_file_, &file, soptions);
+  if (!status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::FAILED("Failed to open WAL file " +
+      status.ToString());
+  } else {
+    StdErrReporter reporter;
+    log::Reader reader(move(file), &reporter, true, 0);
+    string scratch;
+    WriteBatch batch;
+    Slice record;
+    stringstream row;
+    if (print_header_) {
+      cout<<"Sequence,Count,ByteSize,Physical Offset,Key(s)";
+      if (print_values_) {
+        cout << " : value ";
+      }
+      cout << "\n";
+    }
+    while(reader.ReadRecord(&record, &scratch)) {
+      row.str("");  // reset the per-record output buffer
+      // A record smaller than the 12-byte WriteBatch header (8-byte sequence
+      // + 4-byte count read below) cannot be valid.
+      if (record.size() < 12) {
+        reporter.Corruption(
+            record.size(), Status::Corruption("log record too small"));
+      } else {
+        WriteBatchInternal::SetContents(&batch, record);
+        row<<WriteBatchInternal::Sequence(&batch)<<",";
+        row<<WriteBatchInternal::Count(&batch)<<",";
+        row<<WriteBatchInternal::ByteSize(&batch)<<",";
+        row<<reader.LastRecordOffset()<<",";
+        InMemoryHandler handler(row, print_values_);
+        batch.Iterate(&handler);
+        row<<"\n";
+      }
+      cout<<row.str();
+    }
+  }
+}
+
+
+// Expects exactly one positional parameter: the key to look up.  The key is
+// hex-decoded when --key_hex/--hex is in effect.
+GetCommand::GetCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+  LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_TTL, ARG_HEX,
+                                                        ARG_KEY_HEX,
+                                                        ARG_VALUE_HEX})) {
+
+  if (params.size() != 1) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+                    "<key> must be specified for the get command");
+  } else {
+    key_ = params.at(0);
+  }
+
+  // Runs even on the failure path above, where key_ is still empty.
+  if (is_key_hex_) {
+    key_ = HexToString(key_);
+  }
+}
+
+// Appends the one-line usage summary for the get command to 'ret'.
+void GetCommand::Help(string& ret) {
+  ret += "  ";
+  ret += GetCommand::Name();
+  ret += " <key>";
+  ret += " [--" + ARG_TTL + "]";
+  ret += "\n";
+}
+
+// Looks up key_ and prints its value (hex-encoded when --value_hex),
+// recording a failure state if the lookup does not succeed.
+void GetCommand::DoCommand() {
+  string value;
+  Status status = db_->Get(ReadOptions(), key_, &value);
+  if (!status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(status.ToString());
+    return;
+  }
+  fprintf(stdout, "%s\n",
+            (is_value_hex_ ? StringToHex(value) : value).c_str());
+}
+
+
+ApproxSizeCommand::ApproxSizeCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+  LDBCommand(options, flags, true,
+             BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX,
+                                  ARG_FROM, ARG_TO})) {
+
+  if (options.find(ARG_FROM) != options.end()) {
+    start_key_ = options.find(ARG_FROM)->second;
+  } else {
+    exec_state_ = LDBCommandExecuteResult::FAILED(ARG_FROM +
+                    " must be specified for approxsize command");
+    return;
+  }
+
+  if (options.find(ARG_TO) != options.end()) {
+    end_key_ = options.find(ARG_TO)->second;
+  } else {
+    exec_state_ = LDBCommandExecuteResult::FAILED(ARG_TO +
+                    " must be specified for approxsize command");
+    return;
+  }
+
+  if (is_key_hex_) {
+    start_key_ = HexToString(start_key_);
+    end_key_ = HexToString(end_key_);
+  }
+}
+
+// Appends the one-line usage summary for the approxsize command to 'ret'.
+void ApproxSizeCommand::Help(string& ret) {
+  ret += "  ";
+  ret += ApproxSizeCommand::Name();
+  ret += HelpRangeCmdArgs();
+  ret += "\n";
+}
+
+// Prints the approximate on-disk size of the key range
+// [start_key_, end_key_).
+void ApproxSizeCommand::DoCommand() {
+
+  Range ranges[1];
+  ranges[0] = Range(start_key_, end_key_);
+  uint64_t sizes[1];
+  db_->GetApproximateSizes(ranges, 1, sizes);
+  fprintf(stdout, "%lu\n", (unsigned long)sizes[0]);
+  /* Weird that GetApproximateSizes() returns void, although documentation
+   * says that it returns a Status object.
+  if (!st.ok()) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+  }
+  */
+}
+
+
+BatchPutCommand::BatchPutCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+  LDBCommand(options, flags, false,
+             BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX,
+                                  ARG_CREATE_IF_MISSING})) {
+
+  if (params.size() < 2) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+        "At least one <key> <value> pair must be specified batchput.");
+  } else if (params.size() % 2 != 0) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+        "Equal number of <key>s and <value>s must be specified for batchput.");
+  } else {
+    for (size_t i = 0; i < params.size(); i += 2) {
+      string key = params.at(i);
+      string value = params.at(i+1);
+      key_values_.push_back(pair<string, string>(
+                    is_key_hex_ ? HexToString(key) : key,
+                    is_value_hex_ ? HexToString(value) : value));
+    }
+  }
+}
+
+// Appends the one-line usage summary for the batchput command to 'ret'.
+void BatchPutCommand::Help(string& ret) {
+  ret += "  ";
+  ret += BatchPutCommand::Name();
+  ret += " <key> <value> [<key> <value>] [..]";
+  ret += " [--" + ARG_TTL + "]";
+  ret += "\n";
+}
+
+// Writes all parsed key/value pairs to the db in a single atomic WriteBatch.
+void BatchPutCommand::DoCommand() {
+  WriteBatch batch;
+
+  for (const pair<string, string>& kv : key_values_) {
+    batch.Put(kv.first, kv.second);
+  }
+  Status status = db_->Write(WriteOptions(), &batch);
+  if (!status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(status.ToString());
+    return;
+  }
+  fprintf(stdout, "OK\n");
+}
+
+// Like the base implementation, but honours --create_if_missing.
+Options BatchPutCommand::PrepareOptionsForOpenDB() {
+  Options db_options = LDBCommand::PrepareOptionsForOpenDB();
+  db_options.create_if_missing = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING);
+  return db_options;
+}
+
+
+// Parses optional --from/--to range bounds (hex-decoded when --key_hex) and
+// an optional --max_keys scan limit; -1 means unlimited.
+ScanCommand::ScanCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+    LDBCommand(options, flags, true,
+               BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_TO,
+                                    ARG_VALUE_HEX, ARG_FROM, ARG_TIMESTAMP,
+                                    ARG_MAX_KEYS, ARG_TTL_START, ARG_TTL_END})),
+    start_key_specified_(false),
+    end_key_specified_(false),
+    max_keys_scanned_(-1) {
+
+  map<string, string>::const_iterator itr = options.find(ARG_FROM);
+  if (itr != options.end()) {
+    start_key_ = itr->second;
+    if (is_key_hex_) {
+      start_key_ = HexToString(start_key_);
+    }
+    start_key_specified_ = true;
+  }
+  itr = options.find(ARG_TO);
+  if (itr != options.end()) {
+    end_key_ = itr->second;
+    if (is_key_hex_) {
+      end_key_ = HexToString(end_key_);
+    }
+    end_key_specified_ = true;
+  }
+
+  itr = options.find(ARG_MAX_KEYS);
+  if (itr != options.end()) {
+    // stoi throws on malformed or out-of-range input; convert both cases
+    // into a FAILED execute state instead of propagating the exception.
+    try {
+      max_keys_scanned_ = stoi(itr->second);
+    } catch(const invalid_argument&) {
+      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS +
+                        " has an invalid value");
+    } catch(const out_of_range&) {
+      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS +
+                        " has a value out-of-range");
+    }
+  }
+}
+
+// Appends the one-line usage summary for the scan command to 'ret'.
+void ScanCommand::Help(string& ret) {
+  ret.append("  ");
+  ret.append(ScanCommand::Name());
+  ret.append(HelpRangeCmdArgs());
+  ret.append(" [--" + ARG_TTL + "]");
+  ret.append(" [--" + ARG_TIMESTAMP + "]");
+  // Fix: help text previously read "=<N>q] " — stray 'q' and trailing
+  // space; now matches the "=<N>]" form used by the other commands.
+  ret.append(" [--" + ARG_MAX_KEYS + "=<N>]");
+  ret.append(" [--" + ARG_TTL_START + "=<N>:- is inclusive]");
+  ret.append(" [--" + ARG_TTL_END + "=<N>:- is exclusive]");
+  ret.append("\n");
+}
+
+// Iterates from start_key_ (or the first key) up to end_key_ (exclusive, if
+// given), printing "key : value" lines; honours the TTL timestamp window
+// and stops after max_keys_scanned_ keys when that limit is >= 0.
+void ScanCommand::DoCommand() {
+
+  int num_keys_scanned = 0;
+  Iterator* it = db_->NewIterator(ReadOptions());
+  if (start_key_specified_) {
+    it->Seek(start_key_);
+  } else {
+    it->SeekToFirst();
+  }
+  // TTL window defaults to the full valid range when not specified.
+  int ttl_start;
+  if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) {
+    ttl_start = DBWithTTL::kMinTimestamp; // TTL introduction time
+  }
+  int ttl_end;
+  if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) {
+    ttl_end = DBWithTTL::kMaxTimestamp; // Max time allowed by TTL feature
+  }
+  if (ttl_end < ttl_start) {
+    fprintf(stderr, "Error: End time can't be less than start time\n");
+    delete it;
+    return;
+  }
+  if (is_db_ttl_ && timestamp_) {
+    fprintf(stdout, "Scanning key-values from %s to %s\n",
+            ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str());
+  }
+  for ( ;
+        it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_);
+        it->Next()) {
+    string key = it->key().ToString();
+    if (is_db_ttl_) {
+      // Keys outside the timestamp window are skipped and do not count
+      // toward max_keys_scanned_.
+      TtlIterator* it_ttl = dynamic_cast<TtlIterator*>(it);
+      assert(it_ttl);
+      int rawtime = it_ttl->timestamp();
+      if (rawtime < ttl_start || rawtime >= ttl_end) {
+        continue;
+      }
+      if (timestamp_) {
+        fprintf(stdout, "%s ", ReadableTime(rawtime).c_str());
+      }
+    }
+    string value = it->value().ToString();
+    fprintf(stdout, "%s : %s\n",
+          (is_key_hex_ ? StringToHex(key) : key).c_str(),
+          (is_value_hex_ ? StringToHex(value) : value).c_str()
+        );
+    num_keys_scanned++;
+    if (max_keys_scanned_ >= 0 && num_keys_scanned >= max_keys_scanned_) {
+      break;
+    }
+  }
+  if (!it->status().ok()) {  // Check for any errors found during the scan
+    exec_state_ = LDBCommandExecuteResult::FAILED(it->status().ToString());
+  }
+  delete it;
+}
+
+
+DeleteCommand::DeleteCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+  LDBCommand(options, flags, false,
+             BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
+
+  if (params.size() != 1) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+                    "KEY must be specified for the delete command");
+  } else {
+    key_ = params.at(0);
+    if (is_key_hex_) {
+      key_ = HexToString(key_);
+    }
+  }
+}
+
+// Appends the one-line usage summary for the delete command to 'ret'.
+void DeleteCommand::Help(string& ret) {
+  ret += "  ";
+  ret += DeleteCommand::Name() + " <key>";
+  ret += "\n";
+}
+
+void DeleteCommand::DoCommand() {
+  Status st = db_->Delete(WriteOptions(), key_);
+  if (st.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+  }
+}
+
+
+// Expects exactly two positional params: <key> and <value>, each hex-decoded
+// per --key_hex/--value_hex.
+PutCommand::PutCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+  LDBCommand(options, flags, false,
+             BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX,
+                                  ARG_CREATE_IF_MISSING})) {
+
+  if (params.size() != 2) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+                    "<key> and <value> must be specified for the put command");
+  } else {
+    key_ = params.at(0);
+    value_ = params.at(1);
+  }
+
+  // These run even on the failure path above, where key_/value_ are empty.
+  if (is_key_hex_) {
+    key_ = HexToString(key_);
+  }
+
+  if (is_value_hex_) {
+    value_ = HexToString(value_);
+  }
+}
+
+// Appends the one-line usage summary for the put command to 'ret'.
+void PutCommand::Help(string& ret) {
+  ret += "  ";
+  ret += PutCommand::Name();
+  ret += " <key> <value> ";
+  ret += " [--" + ARG_TTL + "]";
+  ret += "\n";
+}
+
+void PutCommand::DoCommand() {
+  Status st = db_->Put(WriteOptions(), key_, value_);
+  if (st.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+  }
+}
+
+// Like the base implementation, but honours --create_if_missing.
+Options PutCommand::PrepareOptionsForOpenDB() {
+  Options db_options = LDBCommand::PrepareOptionsForOpenDB();
+  db_options.create_if_missing = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING);
+  return db_options;
+}
+
+
+// Verbs recognised by the interactive query shell.
+const char* DBQuerierCommand::HELP_CMD = "help";
+const char* DBQuerierCommand::GET_CMD = "get";
+const char* DBQuerierCommand::PUT_CMD = "put";
+const char* DBQuerierCommand::DELETE_CMD = "delete";
+
+// The REPL command takes no positional parameters; option parsing is fully
+// handled by the LDBCommand base constructor.
+DBQuerierCommand::DBQuerierCommand(const vector<string>& params,
+    const map<string, string>& options, const vector<string>& flags) :
+  LDBCommand(options, flags, false,
+             BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
+                                  ARG_VALUE_HEX})) {
+
+}
+
+// Appends the usage summary and a short description of the REPL to 'ret'.
+void DBQuerierCommand::Help(string& ret) {
+  ret += "  ";
+  ret += DBQuerierCommand::Name();
+  ret += " [--" + ARG_TTL + "]";
+  ret += "\n";
+  ret += "    Starts a REPL shell.  Type help for list of available "
+         "commands.";
+  ret += "\n";
+}
+
+// Interactive read-eval-print loop: reads one command per line from stdin
+// (get/put/delete/help) until EOF.  Keys/values are hex-decoded per
+// --key_hex/--value_hex before touching the db.  No-op if the db failed to
+// open.
+void DBQuerierCommand::DoCommand() {
+  if (!db_) {
+    return;
+  }
+
+  ReadOptions read_options;
+  WriteOptions write_options;
+
+  string line;
+  string key;
+  string value;
+  while (getline(cin, line, '\n')) {
+
+    // Parse line into vector<string>
+    // (split on single spaces; an empty line yields one empty token).
+    vector<string> tokens;
+    size_t pos = 0;
+    while (true) {
+      size_t pos2 = line.find(' ', pos);
+      if (pos2 == string::npos) {
+        break;
+      }
+      tokens.push_back(line.substr(pos, pos2-pos));
+      pos = pos2 + 1;
+    }
+    tokens.push_back(line.substr(pos));
+
+    const string& cmd = tokens[0];
+
+    if (cmd == HELP_CMD) {
+      fprintf(stdout,
+              "get <key>\n"
+              "put <key> <value>\n"
+              "delete <key>\n");
+    } else if (cmd == DELETE_CMD && tokens.size() == 2) {
+      key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]);
+      // NOTE(review): the Status returned by Delete/Put below is ignored;
+      // "Successfully ..." is printed regardless of the outcome.
+      db_->Delete(write_options, Slice(key));
+      fprintf(stdout, "Successfully deleted %s\n", tokens[1].c_str());
+    } else if (cmd == PUT_CMD && tokens.size() == 3) {
+      key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]);
+      value = (is_value_hex_ ? HexToString(tokens[2]) : tokens[2]);
+      db_->Put(write_options, Slice(key), Slice(value));
+      fprintf(stdout, "Successfully put %s %s\n",
+              tokens[1].c_str(), tokens[2].c_str());
+    } else if (cmd == GET_CMD && tokens.size() == 2) {
+      key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]);
+      if (db_->Get(read_options, Slice(key), &value).ok()) {
+        fprintf(stdout, "%s\n", PrintKeyValue(key, value,
+              is_key_hex_, is_value_hex_).c_str());
+      } else {
+        fprintf(stdout, "Not found %s\n", tokens[1].c_str());
+      }
+    } else {
+      fprintf(stdout, "Unknown command %s\n", line.c_str());
+    }
+  }
+}
+
+
+}
diff --git a/util/ldb_cmd.h b/util/ldb_cmd.h
new file mode 100644 (file)
index 0000000..022f5fa
--- /dev/null
@@ -0,0 +1,689 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <stdlib.h>
+#include <algorithm>
+#include <stdio.h>
+
+#include "db/version_set.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "util/logging.h"
+#include "util/ldb_cmd_execute_result.h"
+#include "util/string_util.h"
+#include "utilities/utility_db.h"
+#include "utilities/ttl/db_ttl.h"
+
+using std::string;
+using std::map;
+using std::vector;
+using std::ostringstream;
+
+namespace rocksdb {
+
+class LDBCommand {
+public:
+
+  // Command-line arguments
+  static const string ARG_DB;
+  static const string ARG_HEX;
+  static const string ARG_KEY_HEX;
+  static const string ARG_VALUE_HEX;
+  static const string ARG_TTL;
+  static const string ARG_TTL_START;
+  static const string ARG_TTL_END;
+  static const string ARG_TIMESTAMP;
+  static const string ARG_FROM;
+  static const string ARG_TO;
+  static const string ARG_MAX_KEYS;
+  static const string ARG_BLOOM_BITS;
+  static const string ARG_COMPRESSION_TYPE;
+  static const string ARG_BLOCK_SIZE;
+  static const string ARG_AUTO_COMPACTION;
+  static const string ARG_WRITE_BUFFER_SIZE;
+  static const string ARG_FILE_SIZE;
+  static const string ARG_CREATE_IF_MISSING;
+
+  static LDBCommand* InitFromCmdLineArgs(
+    const vector<string>& args,
+    const Options& options = Options()
+  );
+
+  static LDBCommand* InitFromCmdLineArgs(
+    int argc,
+    char** argv,
+    const Options& options = Options()
+  );
+
+  bool ValidateCmdLineOptions();
+
+  virtual Options PrepareOptionsForOpenDB();
+
+  virtual void SetOptions(Options options) {
+    options_ = options;
+  }
+
+  virtual bool NoDBOpen() {
+    return false;
+  }
+
+  virtual ~LDBCommand() {
+    if (db_ != nullptr) {
+      delete db_;
+      db_ = nullptr;
+    }
+  }
+
+  /* Run the command, and return the execute result. */
+  void Run() {
+    if (!exec_state_.IsNotStarted()) {
+      return;
+    }
+
+    if (db_ == nullptr && !NoDBOpen()) {
+      OpenDB();
+      if (!exec_state_.IsNotStarted()) {
+        return;
+      }
+    }
+
+    DoCommand();
+    if (exec_state_.IsNotStarted()) {
+      exec_state_ = LDBCommandExecuteResult::SUCCEED("");
+    }
+
+    if (db_ != nullptr) {
+      CloseDB ();
+    }
+  }
+
+  virtual void DoCommand() = 0;
+
+  LDBCommandExecuteResult GetExecuteState() {
+    return exec_state_;
+  }
+
+  void ClearPreviousRunState() {
+    exec_state_.Reset();
+  }
+
+  static string HexToString(const string& str) {
+    string parsed;
+    if (str[0] != '0' || str[1] != 'x') {
+      fprintf(stderr, "Invalid hex input %s.  Must start with 0x\n",
+              str.c_str());
+      throw "Invalid hex input";
+    }
+
+    for (unsigned int i = 2; i < str.length();) {
+      int c;
+      sscanf(str.c_str() + i, "%2X", &c);
+      parsed.push_back(c);
+      i += 2;
+    }
+    return parsed;
+  }
+
+  static string StringToHex(const string& str) {
+    string result = "0x";
+    char buf[10];
+    for (size_t i = 0; i < str.length(); i++) {
+      snprintf(buf, 10, "%02X", (unsigned char)str[i]);
+      result += buf;
+    }
+    return result;
+  }
+
+  static const char* DELIM;
+
+protected:
+
+  LDBCommandExecuteResult exec_state_;
+  string db_path_;
+  DB* db_;
+  StackableDB* sdb_;
+
+  /**
+   * true implies that this command can work if the db is opened in read-only
+   * mode.
+   */
+  bool is_read_only_;
+
+  /** If true, the key is input/output as hex in get/put/scan/delete etc. */
+  bool is_key_hex_;
+
+  /** If true, the value is input/output as hex in get/put/scan/delete etc. */
+  bool is_value_hex_;
+
+  /** If true, the value is treated as timestamp suffixed */
+  bool is_db_ttl_;
+
+  // If true, the kvs are output with their insert/modify timestamp in a ttl db
+  bool timestamp_;
+
+  /**
+   * Map of options passed on the command-line.
+   */
+  const map<string, string> option_map_;
+
+  /**
+   * Flags passed on the command-line.
+   */
+  const vector<string> flags_;
+
+  /** List of command-line options valid for this command */
+  const vector<string> valid_cmd_line_options_;
+
+  bool ParseKeyValue(const string& line, string* key, string* value,
+                      bool is_key_hex, bool is_value_hex);
+
+  LDBCommand(const map<string, string>& options, const vector<string>& flags,
+             bool is_read_only, const vector<string>& valid_cmd_line_options) :
+      db_(nullptr),
+      is_read_only_(is_read_only),
+      is_key_hex_(false),
+      is_value_hex_(false),
+      is_db_ttl_(false),
+      timestamp_(false),
+      option_map_(options),
+      flags_(flags),
+      valid_cmd_line_options_(valid_cmd_line_options) {
+
+    map<string, string>::const_iterator itr = options.find(ARG_DB);
+    if (itr != options.end()) {
+      db_path_ = itr->second;
+    }
+
+    is_key_hex_ = IsKeyHex(options, flags);
+    is_value_hex_ = IsValueHex(options, flags);
+    is_db_ttl_ = IsFlagPresent(flags, ARG_TTL);
+    timestamp_ = IsFlagPresent(flags, ARG_TIMESTAMP);
+  }
+
+  void OpenDB() {
+    Options opt = PrepareOptionsForOpenDB();
+    if (!exec_state_.IsNotStarted()) {
+      return;
+    }
+    // Open the DB.
+    Status st;
+    if (is_db_ttl_) {
+      if (is_read_only_) {
+        st = UtilityDB::OpenTtlDB(opt, db_path_, &sdb_, 0, true);
+      } else {
+        st = UtilityDB::OpenTtlDB(opt, db_path_, &sdb_);
+      }
+      db_ = sdb_;
+    } else if (is_read_only_) {
+      st = DB::OpenForReadOnly(opt, db_path_, &db_);
+    } else {
+      st = DB::Open(opt, db_path_, &db_);
+    }
+    if (!st.ok()) {
+      string msg = st.ToString();
+      exec_state_ = LDBCommandExecuteResult::FAILED(msg);
+    }
+
+    options_ = opt;
+  }
+
+  void CloseDB () {
+    if (db_ != nullptr) {
+      delete db_;
+      db_ = nullptr;
+    }
+  }
+
+  static string PrintKeyValue(const string& key, const string& value,
+        bool is_key_hex, bool is_value_hex) {
+    string result;
+    result.append(is_key_hex ? StringToHex(key) : key);
+    result.append(DELIM);
+    result.append(is_value_hex ? StringToHex(value) : value);
+    return result;
+  }
+
+  static string PrintKeyValue(const string& key, const string& value,
+        bool is_hex) {
+    return PrintKeyValue(key, value, is_hex, is_hex);
+  }
+
+  /**
+   * Return true if the specified flag is present in the specified flags vector
+   */
+  static bool IsFlagPresent(const vector<string>& flags, const string& flag) {
+    return (std::find(flags.begin(), flags.end(), flag) != flags.end());
+  }
+
+  static string HelpRangeCmdArgs() {
+    ostringstream str_stream;
+    str_stream << " ";
+    str_stream << "[--" << ARG_FROM << "] ";
+    str_stream << "[--" << ARG_TO << "] ";
+    return str_stream.str();
+  }
+
+  /**
+   * A helper function that returns a list of command line options
+   * used by this command.  It includes the common options and the ones
+   * passed in.
+   */
+  vector<string> BuildCmdLineOptions(vector<string> options) {
+    vector<string> ret = {ARG_DB, ARG_BLOOM_BITS, ARG_BLOCK_SIZE,
+                          ARG_AUTO_COMPACTION, ARG_COMPRESSION_TYPE,
+                          ARG_WRITE_BUFFER_SIZE, ARG_FILE_SIZE};
+    ret.insert(ret.end(), options.begin(), options.end());
+    return ret;
+  }
+
+  bool ParseIntOption(const map<string, string>& options, const string& option,
+                      int& value, LDBCommandExecuteResult& exec_state);
+
+  bool ParseStringOption(const map<string, string>& options,
+                         const string& option, string* value);
+
+  Options options_;
+
+private:
+
+  /**
+   * Interpret command line options and flags to determine if the key
+   * should be input/output in hex.
+   */
+  bool IsKeyHex(const map<string, string>& options,
+      const vector<string>& flags) {
+    return (IsFlagPresent(flags, ARG_HEX) ||
+        IsFlagPresent(flags, ARG_KEY_HEX) ||
+        ParseBooleanOption(options, ARG_HEX, false) ||
+        ParseBooleanOption(options, ARG_KEY_HEX, false));
+  }
+
+  /**
+   * Interpret command line options and flags to determine if the value
+   * should be input/output in hex.
+   */
+  bool IsValueHex(const map<string, string>& options,
+      const vector<string>& flags) {
+    return (IsFlagPresent(flags, ARG_HEX) ||
+          IsFlagPresent(flags, ARG_VALUE_HEX) ||
+          ParseBooleanOption(options, ARG_HEX, false) ||
+          ParseBooleanOption(options, ARG_VALUE_HEX, false));
+  }
+
+  /**
+   * Returns the value of the specified option as a boolean.
+   * default_val is used if the option is not found in options.
+   * Throws an exception if the value of the option is not
+   * "true" or "false" (case insensitive).
+   */
+  bool ParseBooleanOption(const map<string, string>& options,
+      const string& option, bool default_val) {
+
+    map<string, string>::const_iterator itr = options.find(option);
+    if (itr != options.end()) {
+      string option_val = itr->second;
+      return StringToBool(itr->second);
+    }
+    return default_val;
+  }
+
+  /**
+   * Converts val to a boolean.
+   * val must be either true or false (case insensitive).
+   * Otherwise an exception is thrown.
+   */
+  bool StringToBool(string val) {
+    std::transform(val.begin(), val.end(), val.begin(), ::tolower);
+    if (val == "true") {
+      return true;
+    } else if (val == "false") {
+      return false;
+    } else {
+      throw "Invalid value for boolean argument";
+    }
+  }
+
+  static LDBCommand* SelectCommand(
+    const string& cmd,
+    const vector<string>& cmdParams,
+    const map<string, string>& option_map,
+    const vector<string>& flags
+  );
+
+};
+
// ldb "compact" command: triggers a manual compaction over an optional key
// range; null_from_/null_to_ mark an unbounded end of the range.
class CompactorCommand: public LDBCommand {
public:
  static string Name() { return "compact"; }

  CompactorCommand(const vector<string>& params,
      const map<string, string>& options, const vector<string>& flags);

  static void Help(string& ret);

  virtual void DoCommand();

private:
  bool null_from_;
  string from_;
  bool null_to_;
  string to_;
};
+
// ldb "dump" command: prints (or merely counts) the key-value pairs in an
// optional key range, with optional DB stats and TTL bucket aggregation.
class DBDumperCommand: public LDBCommand {
public:
  static string Name() { return "dump"; }

  DBDumperCommand(const vector<string>& params,
      const map<string, string>& options, const vector<string>& flags);

  static void Help(string& ret);

  virtual void DoCommand();

private:
  bool null_from_;
  string from_;
  bool null_to_;
  string to_;
  int max_keys_;
  string delim_;
  bool count_only_;
  bool count_delim_;
  bool print_stats_;

  static const string ARG_COUNT_ONLY;
  static const string ARG_COUNT_DELIM;
  static const string ARG_STATS;
  static const string ARG_TTL_BUCKET;
};
+
// ldb "idump" command: like "dump" but iterates the internal key space
// (including sequence numbers and value types) rather than the user view.
class InternalDumpCommand: public LDBCommand {
public:
  static string Name() { return "idump"; }

  InternalDumpCommand(const vector<string>& params,
                      const map<string, string>& options,
                      const vector<string>& flags);

  static void Help(string& ret);

  virtual void DoCommand();

private:
  bool has_from_;
  string from_;
  bool has_to_;
  string to_;
  int max_keys_;
  string delim_;
  bool count_only_;
  bool count_delim_;
  bool print_stats_;
  bool is_input_key_hex_;

  static const string ARG_DELIM;
  static const string ARG_COUNT_ONLY;
  static const string ARG_COUNT_DELIM;
  static const string ARG_STATS;
  static const string ARG_INPUT_KEY_HEX;
};
+
// ldb "load" command: reads key-value pairs from stdin and inserts them into
// the database, optionally disabling the WAL, bulk-loading, and/or running a
// full compaction afterwards.
class DBLoaderCommand: public LDBCommand {
public:
  static string Name() { return "load"; }

  DBLoaderCommand(string& db_name, vector<string>& args);

  DBLoaderCommand(const vector<string>& params,
      const map<string, string>& options, const vector<string>& flags);

  static void Help(string& ret);
  virtual void DoCommand();

  virtual Options PrepareOptionsForOpenDB();

private:
  bool create_if_missing_;
  bool disable_wal_;
  bool bulk_load_;
  bool compact_;

  static const string ARG_DISABLE_WAL;
  static const string ARG_BULK_LOAD;
  static const string ARG_COMPACT;
};
+
// ldb "manifest_dump" command: prints the contents of a MANIFEST file.
// Operates directly on the file, so no database is opened (NoDBOpen).
class ManifestDumpCommand: public LDBCommand {
public:
  static string Name() { return "manifest_dump"; }

  ManifestDumpCommand(const vector<string>& params,
      const map<string, string>& options, const vector<string>& flags);

  static void Help(string& ret);
  virtual void DoCommand();

  virtual bool NoDBOpen() {
    return true;
  }

private:
  bool verbose_;
  string path_;

  static const string ARG_VERBOSE;
  static const string ARG_PATH;
};
+
// ldb "reduce_levels" command: compacts the database down to new_levels_
// levels.  Manages its own DB opening, hence NoDBOpen() returns true.
class ReduceDBLevelsCommand : public LDBCommand {
public:
  static string Name() { return "reduce_levels"; }

  ReduceDBLevelsCommand(const vector<string>& params,
      const map<string, string>& options, const vector<string>& flags);

  virtual Options PrepareOptionsForOpenDB();

  virtual void DoCommand();

  virtual bool NoDBOpen() {
    return true;
  }

  static void Help(string& msg);

  // Builds the argument vector to invoke this command programmatically.
  static vector<string> PrepareArgs(const string& db_path, int new_levels,
      bool print_old_level = false);

private:
  int old_levels_;
  int new_levels_;
  bool print_old_levels_;

  static const string ARG_NEW_LEVELS;
  static const string ARG_PRINT_OLD_LEVELS;

  // Reads the current number of levels from the manifest into *levels.
  Status GetOldNumOfLevels(Options& opt, int* levels);
};
+
// ldb "change_compaction_style" command: converts the database between
// compaction styles (e.g. level <-> universal).
class ChangeCompactionStyleCommand : public LDBCommand {
public:
  static string Name() { return "change_compaction_style"; }

  ChangeCompactionStyleCommand(const vector<string>& params,
      const map<string, string>& options, const vector<string>& flags);

  virtual Options PrepareOptionsForOpenDB();

  virtual void DoCommand();

  static void Help(string& msg);

private:
  int old_compaction_style_;
  int new_compaction_style_;

  static const string ARG_OLD_COMPACTION_STYLE;
  static const string ARG_NEW_COMPACTION_STYLE;
};
+
// ldb "dump_wal" command: prints the records of a single write-ahead-log
// file.  Reads the file directly, so no database is opened.
class WALDumperCommand : public LDBCommand {
public:
  static string Name() { return "dump_wal"; }

  WALDumperCommand(const vector<string>& params,
      const map<string, string>& options, const vector<string>& flags);

  virtual bool  NoDBOpen() {
    return true;
  }

  static void Help(string& ret);
  virtual void DoCommand();

private:
  bool print_header_;
  string wal_file_;
  bool print_values_;

  static const string ARG_WAL_FILE;
  static const string ARG_PRINT_HEADER;
  static const string ARG_PRINT_VALUE;
};
+
+
// ldb "get" command: looks up a single key and prints its value.
class GetCommand : public LDBCommand {
public:
  static string Name() { return "get"; }

  GetCommand(const vector<string>& params, const map<string, string>& options,
      const vector<string>& flags);

  virtual void DoCommand();

  static void Help(string& ret);

private:
  string key_;
};
+
// ldb "approxsize" command: prints the approximate on-disk size of the data
// in [start_key_, end_key_].
class ApproxSizeCommand : public LDBCommand {
public:
  static string Name() { return "approxsize"; }

  ApproxSizeCommand(const vector<string>& params,
      const map<string, string>& options, const vector<string>& flags);

  virtual void DoCommand();

  static void Help(string& ret);

private:
  string start_key_;
  string end_key_;
};
+
// ldb "batchput" command: inserts several key-value pairs, given on the
// command line, in one write batch.
class BatchPutCommand : public LDBCommand {
public:
  static string Name() { return "batchput"; }

  BatchPutCommand(const vector<string>& params,
      const map<string, string>& options, const vector<string>& flags);

  virtual void DoCommand();

  static void Help(string& ret);

  virtual Options PrepareOptionsForOpenDB();

private:
  /**
   * The key-values to be inserted.
   */
  vector<std::pair<string, string>> key_values_;
};
+
// ldb "scan" command: iterates the database within an optional key range,
// printing up to max_keys_scanned_ entries.
class ScanCommand : public LDBCommand {
public:
  static string Name() { return "scan"; }

  ScanCommand(const vector<string>& params, const map<string, string>& options,
      const vector<string>& flags);

  virtual void DoCommand();

  static void Help(string& ret);

private:
  string start_key_;
  string end_key_;
  bool start_key_specified_;
  bool end_key_specified_;
  int max_keys_scanned_;
};
+
// ldb "delete" command: removes a single key from the database.
class DeleteCommand : public LDBCommand {
public:
  static string Name() { return "delete"; }

  DeleteCommand(const vector<string>& params,
      const map<string, string>& options, const vector<string>& flags);

  virtual void DoCommand();

  static void Help(string& ret);

private:
  string key_;
};
+
// ldb "put" command: inserts a single key-value pair.
class PutCommand : public LDBCommand {
public:
  static string Name() { return "put"; }

  PutCommand(const vector<string>& params, const map<string, string>& options,
      const vector<string>& flags);

  virtual void DoCommand();

  static void Help(string& ret);

  virtual Options PrepareOptionsForOpenDB();

private:
  string key_;
  string value_;
};
+
+/**
+ * Command that starts up a REPL shell that allows
+ * get/put/delete.
+ */
class DBQuerierCommand: public LDBCommand {
public:
  static string Name() { return "query"; }

  DBQuerierCommand(const vector<string>& params,
      const map<string, string>& options, const vector<string>& flags);

  static void Help(string& ret);

  virtual void DoCommand();

private:
  // Verbs accepted by the REPL loop in DoCommand().
  static const char* HELP_CMD;
  static const char* GET_CMD;
  static const char* PUT_CMD;
  static const char* DELETE_CMD;
};
+
+} // namespace rocksdb
diff --git a/util/ldb_cmd_execute_result.h b/util/ldb_cmd_execute_result.h
new file mode 100644 (file)
index 0000000..b9121b2
--- /dev/null
@@ -0,0 +1,76 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+
+namespace rocksdb {
+
/**
 * Outcome of running an LDBCommand: not-started, succeeded, or failed,
 * with an optional human-readable message.
 */
class LDBCommandExecuteResult {
public:
  enum State {
    EXEC_NOT_STARTED = 0, EXEC_SUCCEED = 1, EXEC_FAILED = 2,
  };

  LDBCommandExecuteResult() {
    state_ = EXEC_NOT_STARTED;
    message_ = "";
  }

  // Takes msg by const reference (was a non-const reference, which rejected
  // temporaries for no benefit).
  LDBCommandExecuteResult(State state, const std::string& msg) {
    state_ = state;
    message_ = msg;
  }

  // Human-readable rendering: "" / "Failed: <msg>" / "Not started: <msg>".
  std::string ToString() const {
    std::string ret;
    switch (state_) {
    case EXEC_SUCCEED:
      break;
    case EXEC_FAILED:
      ret.append("Failed: ");
      break;
    case EXEC_NOT_STARTED:
      ret.append("Not started: ");
      break;
    }
    if (!message_.empty()) {
      ret.append(message_);
    }
    return ret;
  }

  // Returns the object to its initial not-started state.
  void Reset() {
    state_ = EXEC_NOT_STARTED;
    message_ = "";
  }

  bool IsSucceed() const {
    return state_ == EXEC_SUCCEED;
  }

  bool IsNotStarted() const {
    return state_ == EXEC_NOT_STARTED;
  }

  bool IsFailed() const {
    return state_ == EXEC_FAILED;
  }

  static LDBCommandExecuteResult SUCCEED(std::string msg) {
    return LDBCommandExecuteResult(EXEC_SUCCEED, msg);
  }

  static LDBCommandExecuteResult FAILED(std::string msg) {
    return LDBCommandExecuteResult(EXEC_FAILED, msg);
  }

private:
  State state_;
  std::string message_;

  // Comparison is intentionally not supported: declared private, undefined.
  bool operator==(const LDBCommandExecuteResult&);
  bool operator!=(const LDBCommandExecuteResult&);
};
+
+}
diff --git a/util/ldb_tool.cc b/util/ldb_tool.cc
new file mode 100644 (file)
index 0000000..2dbbbf8
--- /dev/null
@@ -0,0 +1,103 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "rocksdb/ldb_tool.h"
+#include "util/ldb_cmd.h"
+
+namespace rocksdb {
+
// Drives the ldb tool: prints the global help text and runs a single
// command parsed from argv, turning its result into the process exit code.
class LDBCommandRunner {
public:

  static void PrintHelp(const char* exec_name) {
    string ret;

    ret.append("ldb - LevelDB Tool");
    ret.append("\n\n");
    ret.append("commands MUST specify --" + LDBCommand::ARG_DB +
        "=<full_path_to_db_directory> when necessary\n");
    ret.append("\n");
    ret.append("The following optional parameters control if keys/values are "
        "input/output as hex or as plain strings:\n");
    ret.append("  --" + LDBCommand::ARG_KEY_HEX +
        " : Keys are input/output as hex\n");
    ret.append("  --" + LDBCommand::ARG_VALUE_HEX +
        " : Values are input/output as hex\n");
    ret.append("  --" + LDBCommand::ARG_HEX +
        " : Both keys and values are input/output as hex\n");
    ret.append("\n");

    ret.append("The following optional parameters control the database "
        "internals:\n");
    ret.append("  --" + LDBCommand::ARG_TTL +
        " with 'put','get','scan','dump','query','batchput'"
        " : DB supports ttl and value is internally timestamp-suffixed\n");
    ret.append("  --" + LDBCommand::ARG_BLOOM_BITS + "=<int,e.g.:14>\n");
    ret.append("  --" + LDBCommand::ARG_COMPRESSION_TYPE +
        "=<no|snappy|zlib|bzip2>\n");
    ret.append("  --" + LDBCommand::ARG_BLOCK_SIZE +
        "=<block_size_in_bytes>\n");
    ret.append("  --" + LDBCommand::ARG_AUTO_COMPACTION + "=<true|false>\n");
    ret.append("  --" + LDBCommand::ARG_WRITE_BUFFER_SIZE +
        "=<int,e.g.:4194304>\n");
    ret.append("  --" + LDBCommand::ARG_FILE_SIZE + "=<int,e.g.:2097152>\n");

    // Each command contributes its own usage line.
    ret.append("\n\n");
    ret.append("Data Access Commands:\n");
    PutCommand::Help(ret);
    GetCommand::Help(ret);
    BatchPutCommand::Help(ret);
    ScanCommand::Help(ret);
    DeleteCommand::Help(ret);
    DBQuerierCommand::Help(ret);
    ApproxSizeCommand::Help(ret);

    ret.append("\n\n");
    ret.append("Admin Commands:\n");
    WALDumperCommand::Help(ret);
    CompactorCommand::Help(ret);
    ReduceDBLevelsCommand::Help(ret);
    ChangeCompactionStyleCommand::Help(ret);
    DBDumperCommand::Help(ret);
    DBLoaderCommand::Help(ret);
    ManifestDumpCommand::Help(ret);
    InternalDumpCommand::Help(ret);

    fprintf(stderr, "%s\n", ret.c_str());
  }

  // Parses argv, validates options, runs the command, reports its result on
  // stderr, and exits the process (never returns).
  static void RunCommand(int argc, char** argv, Options options) {
    if (argc <= 2) {
      PrintHelp(argv[0]);
      exit(1);
    }

    LDBCommand* cmdObj = LDBCommand::InitFromCmdLineArgs(argc, argv, options);
    if (cmdObj == nullptr) {
      fprintf(stderr, "Unknown command\n");
      PrintHelp(argv[0]);
      exit(1);
    }

    if (!cmdObj->ValidateCmdLineOptions()) {
      exit(1);
    }

    cmdObj->Run();
    LDBCommandExecuteResult ret = cmdObj->GetExecuteState();
    fprintf(stderr, "%s\n", ret.ToString().c_str());
    delete cmdObj;

    // Exit code 1 iff the command failed.
    exit(ret.IsFailed());
  }

};
+
+
// Public entry point of the ldb tool: delegates to the command runner,
// which parses argv and exits the process with the command's status.
void LDBTool::Run(int argc, char** argv, Options options) {
  LDBCommandRunner::RunCommand(argc, argv, options);
}
+} // namespace rocksdb
+
diff --git a/util/logging.cc b/util/logging.cc
new file mode 100644 (file)
index 0000000..6973413
--- /dev/null
@@ -0,0 +1,86 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/logging.h"
+
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
// Appends the decimal representation of "num" to *str.
void AppendNumberTo(std::string* str, uint64_t num) {
  str->append(std::to_string(num));
}
+
+void AppendEscapedStringTo(std::string* str, const Slice& value) {
+  for (size_t i = 0; i < value.size(); i++) {
+    char c = value[i];
+    if (c >= ' ' && c <= '~') {
+      str->push_back(c);
+    } else {
+      char buf[10];
+      snprintf(buf, sizeof(buf), "\\x%02x",
+               static_cast<unsigned int>(c) & 0xff);
+      str->append(buf);
+    }
+  }
+}
+
+std::string NumberToString(uint64_t num) {
+  std::string r;
+  AppendNumberTo(&r, num);
+  return r;
+}
+
+std::string EscapeString(const Slice& value) {
+  std::string r;
+  AppendEscapedStringTo(&r, value);
+  return r;
+}
+
+bool ConsumeChar(Slice* in, char c) {
+  if (!in->empty() && (*in)[0] == c) {
+    in->remove_prefix(1);
+    return true;
+  } else {
+    return false;
+  }
+}
+
// Parses a leading run of decimal digits from *in into *val, consuming the
// digits it reads.  Returns false if there were no digits at all or if the
// value would overflow uint64_t (in which case *in is partially consumed).
bool ConsumeDecimalNumber(Slice* in, uint64_t* val) {
  uint64_t v = 0;
  int digits = 0;
  while (!in->empty()) {
    char c = (*in)[0];
    if (c >= '0' && c <= '9') {
      ++digits;
      const unsigned int delta = (c - '0');
      static const uint64_t kMaxUint64 = ~static_cast<uint64_t>(0);
      // Reject the digit if v*10 + delta would exceed kMaxUint64; the check
      // must run before the multiply so v never wraps.
      if (v > kMaxUint64/10 ||
          (v == kMaxUint64/10 && delta > kMaxUint64%10)) {
        // Overflow
        return false;
      }
      v = (v * 10) + delta;
      in->remove_prefix(1);
    } else {
      break;
    }
  }
  *val = v;
  return (digits > 0);
}
+
+}  // namespace rocksdb
diff --git a/util/logging.h b/util/logging.h
new file mode 100644 (file)
index 0000000..411c83b
--- /dev/null
@@ -0,0 +1,48 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Must not be included from any .h files to avoid polluting the namespace
+// with macros.
+
+#pragma once
+#include <stdio.h>
+#include <stdint.h>
+#include <string>
+#include "port/port.h"
+
+namespace rocksdb {
+
+class Slice;
+class WritableFile;
+
+// Append a human-readable printout of "num" to *str
+extern void AppendNumberTo(std::string* str, uint64_t num);
+
+// Append a human-readable printout of "value" to *str.
+// Escapes any non-printable characters found in "value".
+extern void AppendEscapedStringTo(std::string* str, const Slice& value);
+
+// Return a human-readable printout of "num"
+extern std::string NumberToString(uint64_t num);
+
+// Return a human-readable version of "value".
+// Escapes any non-printable characters found in "value".
+extern std::string EscapeString(const Slice& value);
+
+// If *in starts with "c", advances *in past the first character and
+// returns true.  Otherwise, returns false.
+extern bool ConsumeChar(Slice* in, char c);
+
+// Parse a human-readable number from "*in" into *value.  On success,
+// advances "*in" past the consumed number and sets "*val" to the
+// numeric value.  Otherwise, returns false and leaves *in in an
+// unspecified state.
+extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val);
+
+}  // namespace rocksdb
diff --git a/util/manual_compaction_test.cc b/util/manual_compaction_test.cc
new file mode 100644 (file)
index 0000000..dd615f0
--- /dev/null
@@ -0,0 +1,156 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Test for issue 178: a manual compaction causes deleted data to reappear.
+#include <iostream>
+#include <sstream>
+#include <cstdlib>
+
+#include "rocksdb/db.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/write_batch.h"
+#include "util/testharness.h"
+
+using namespace rocksdb;
+
+namespace {
+
+const int kNumKeys = 1100000;
+
// Primary test key for index i: "my_key_<i>".
std::string Key1(int i) {
  return "my_key_" + std::to_string(i);
}
+
+std::string Key2(int i) {
+  return Key1(i) + "_xxx";
+}
+
// Test fixture: provides a scratch database path and wipes any database
// left behind by a previous run.
class ManualCompactionTest {
 public:
  ManualCompactionTest() {
    // Get rid of any state from an old run.
    dbname_ = rocksdb::test::TmpDir() + "/rocksdb_cbug_test";
    DestroyDB(dbname_, rocksdb::Options());
  }

  // Path of the scratch database used by each test.
  std::string dbname_;
};
+
// Compaction filter that drops every entry whose value is exactly "destroy".
class DestroyAllCompactionFilter : public CompactionFilter {
 public:
  DestroyAllCompactionFilter() {}

  virtual bool Filter(int level,
                      const Slice& key,
                      const Slice& existing_value,
                      std::string* new_value,
                      bool* value_changed) const {
    // Returning true removes the entry from the compaction output.
    return existing_value.ToString() == "destroy";
  }

  virtual const char* Name() const {
    return "DestroyAllCompactionFilter";
  }
};
+
// Verifies that a manual CompactRange passes every key in range through the
// compaction filter, for both level and universal compaction styles.
TEST(ManualCompactionTest, CompactTouchesAllKeys) {
  for (int iter = 0; iter < 2; ++iter) {
    DB* db;
    Options options;
    if (iter == 0) { // level compaction
      options.num_levels = 3;
      options.compaction_style = kCompactionStyleLevel;
    } else { // universal compaction
      options.compaction_style = kCompactionStyleUniversal;
    }
    options.create_if_missing = true;
    options.compression = rocksdb::kNoCompression;
    options.compaction_filter = new DestroyAllCompactionFilter();
    ASSERT_OK(DB::Open(options, dbname_, &db));

    db->Put(WriteOptions(), Slice("key1"), Slice("destroy"));
    db->Put(WriteOptions(), Slice("key2"), Slice("destroy"));
    db->Put(WriteOptions(), Slice("key3"), Slice("value3"));
    db->Put(WriteOptions(), Slice("key4"), Slice("destroy"));

    // Compact everything up to "key4"; the filter should drop every entry
    // whose value is "destroy", leaving only key3.
    Slice key4("key4");
    db->CompactRange(nullptr, &key4);
    Iterator* itr = db->NewIterator(ReadOptions());
    itr->SeekToFirst();
    ASSERT_TRUE(itr->Valid());
    ASSERT_EQ("key3", itr->key().ToString());
    itr->Next();
    ASSERT_TRUE(!itr->Valid());
    delete itr;

    delete options.compaction_filter;
    delete db;
    DestroyDB(dbname_, options);
  }
}
+
// Regression test for issue 178: a manual compaction over a range whose
// deletions live in a separate key range must not resurrect deleted data.
TEST(ManualCompactionTest, Test) {

  // Open database.  Disable compression since it affects the creation
  // of layers and the code below is trying to test against a very
  // specific scenario.
  rocksdb::DB* db;
  rocksdb::Options db_options;
  db_options.create_if_missing = true;
  db_options.compression = rocksdb::kNoCompression;
  ASSERT_OK(rocksdb::DB::Open(db_options, dbname_, &db));

  // create first key range
  rocksdb::WriteBatch batch;
  for (int i = 0; i < kNumKeys; i++) {
    batch.Put(Key1(i), "value for range 1 key");
  }
  ASSERT_OK(db->Write(rocksdb::WriteOptions(), &batch));

  // create second key range
  batch.Clear();
  for (int i = 0; i < kNumKeys; i++) {
    batch.Put(Key2(i), "value for range 2 key");
  }
  ASSERT_OK(db->Write(rocksdb::WriteOptions(), &batch));

  // delete second key range
  batch.Clear();
  for (int i = 0; i < kNumKeys; i++) {
    batch.Delete(Key2(i));
  }
  ASSERT_OK(db->Write(rocksdb::WriteOptions(), &batch));

  // compact database over the first key range only
  std::string start_key = Key1(0);
  std::string end_key = Key1(kNumKeys - 1);
  rocksdb::Slice least(start_key.data(), start_key.size());
  rocksdb::Slice greatest(end_key.data(), end_key.size());

  // commenting out the line below causes the example to work correctly
  db->CompactRange(&least, &greatest);

  // count the keys: only range-1 keys should remain after the deletions.
  rocksdb::Iterator* iter = db->NewIterator(rocksdb::ReadOptions());
  int num_keys = 0;
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    num_keys++;
  }
  delete iter;
  ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys";

  // close database
  delete db;
  DestroyDB(dbname_, rocksdb::Options());
}
+
+}  // anonymous namespace
+
int main(int argc, char** argv) {
  // Delegates to the test harness, which discovers and runs all TEST cases.
  return rocksdb::test::RunAllTests();
}
diff --git a/util/murmurhash.cc b/util/murmurhash.cc
new file mode 100644 (file)
index 0000000..d9d8b70
--- /dev/null
@@ -0,0 +1,183 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+/*
+  Murmurhash from http://sites.google.com/site/murmurhash/
+
+  All code is released to the public domain. For business purposes, Murmurhash is
+  under the MIT license.
+*/
+#include "murmurhash.h"
+
+#if defined(__x86_64__)
+
+// -------------------------------------------------------------------
+//
+// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
+// and endian-ness issues if used across multiple platforms.
+//
+// 64-bit hash for 64-bit platforms
+
+uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed )
+{
+    const uint64_t m = 0xc6a4a7935bd1e995;
+    const int r = 47;
+
+    uint64_t h = seed ^ (len * m);
+
+    const uint64_t * data = (const uint64_t *)key;
+    const uint64_t * end = data + (len/8);
+
+    while(data != end)
+    {
+        uint64_t k = *data++;
+
+        k *= m;
+        k ^= k >> r;
+        k *= m;
+
+        h ^= k;
+        h *= m;
+    }
+
+    const unsigned char * data2 = (const unsigned char*)data;
+
+    switch(len & 7)
+    {
+    case 7: h ^= ((uint64_t)data2[6]) << 48;
+    case 6: h ^= ((uint64_t)data2[5]) << 40;
+    case 5: h ^= ((uint64_t)data2[4]) << 32;
+    case 4: h ^= ((uint64_t)data2[3]) << 24;
+    case 3: h ^= ((uint64_t)data2[2]) << 16;
+    case 2: h ^= ((uint64_t)data2[1]) << 8;
+    case 1: h ^= ((uint64_t)data2[0]);
+        h *= m;
+    };
+
+    h ^= h >> r;
+    h *= m;
+    h ^= h >> r;
+
+    return h;
+}
+
+#elif defined(__i386__)
+
+// -------------------------------------------------------------------
+//
+// Note - This code makes a few assumptions about how your machine behaves -
+//
+// 1. We can read a 4-byte value from any address without crashing
+// 2. sizeof(int) == 4
+//
+// And it has a few limitations -
+//
+// 1. It will not work incrementally.
+// 2. It will not produce the same results on little-endian and big-endian
+//    machines.
+
+unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
+{
+    // 'm' and 'r' are mixing constants generated offline.
+    // They're not really 'magic', they just happen to work well.
+
+    const unsigned int m = 0x5bd1e995;
+    const int r = 24;
+
+    // Initialize the hash to a 'random' value
+
+    unsigned int h = seed ^ len;
+
+    // Mix 4 bytes at a time into the hash
+
+    const unsigned char * data = (const unsigned char *)key;
+
+    while(len >= 4)
+    {
+        unsigned int k = *(unsigned int *)data;
+
+        k *= m;
+        k ^= k >> r;
+        k *= m;
+
+        h *= m;
+        h ^= k;
+
+        data += 4;
+        len -= 4;
+    }
+
+    // Handle the last few bytes of the input array
+
+    switch(len)
+    {
+    case 3: h ^= data[2] << 16;
+    case 2: h ^= data[1] << 8;
+    case 1: h ^= data[0];
+        h *= m;
+    };
+
+    // Do a few final mixes of the hash to ensure the last few
+    // bytes are well-incorporated.
+
+    h ^= h >> 13;
+    h *= m;
+    h ^= h >> 15;
+
+    return h;
+}
+
+#else
+
+// -------------------------------------------------------------------
+//
+// Same as MurmurHash2, but endian- and alignment-neutral.
+// Half the speed though, alas.
+
+unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed )
+{
+    const unsigned int m = 0x5bd1e995;
+    const int r = 24;
+
+    unsigned int h = seed ^ len;
+
+    const unsigned char * data = (const unsigned char *)key;
+
+    while(len >= 4)
+    {
+        unsigned int k;
+
+        k  = data[0];
+        k |= data[1] << 8;
+        k |= data[2] << 16;
+        k |= data[3] << 24;
+
+        k *= m;
+        k ^= k >> r;
+        k *= m;
+
+        h *= m;
+        h ^= k;
+
+        data += 4;
+        len -= 4;
+    }
+
+    switch(len)
+    {
+    case 3: h ^= data[2] << 16;
+    case 2: h ^= data[1] << 8;
+    case 1: h ^= data[0];
+        h *= m;
+    };
+
+    h ^= h >> 13;
+    h *= m;
+    h ^= h >> 15;
+
+    return h;
+}
+
+#endif
diff --git a/util/murmurhash.h b/util/murmurhash.h
new file mode 100644 (file)
index 0000000..9707e56
--- /dev/null
@@ -0,0 +1,33 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+/*
+  Murmurhash from http://sites.google.com/site/murmurhash/
+
+  All code is released to the public domain. For business purposes, Murmurhash is
+  under the MIT license.
+*/
+#pragma once
+#include <stdint.h>
+
+#if defined(__x86_64__)
+#define MURMUR_HASH MurmurHash64A
+uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed );
+#define MurmurHash MurmurHash64A
+typedef uint64_t murmur_t;
+
+#elif defined(__i386__)
+#define MURMUR_HASH MurmurHash2
+unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed );
+#define MurmurHash MurmurHash2
+typedef unsigned int murmur_t;
+
+#else
+#define MURMUR_HASH MurmurHashNeutral2
+unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed );
+#define MurmurHash MurmurHashNeutral2
+typedef unsigned int murmur_t;
+
+#endif
diff --git a/util/mutexlock.h b/util/mutexlock.h
new file mode 100644 (file)
index 0000000..0f4e5c8
--- /dev/null
@@ -0,0 +1,78 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "port/port.h"
+
+namespace rocksdb {
+
+// Helper class that locks a mutex on construction and unlocks the mutex when
+// the destructor of the MutexLock object is invoked.
+//
+// Typical usage:
+//
+//   void MyClass::MyMethod() {
+//     MutexLock l(&mu_);       // mu_ is an instance variable
+//     ... some complex code, possibly with multiple return paths ...
+//   }
+
+class MutexLock {
+ public:
+  explicit MutexLock(port::Mutex *mu) : mu_(mu) {
+    this->mu_->Lock();
+  }
+  ~MutexLock() { this->mu_->Unlock(); }
+
+ private:
+  port::Mutex *const mu_;
+  // No copying allowed
+  MutexLock(const MutexLock&);
+  void operator=(const MutexLock&);
+};
+
+//
+// Acquire a ReadLock on the specified RWMutex.
+// The Lock will be automatically released when the
+// object goes out of scope.
+//
+class ReadLock {
+ public:
+  explicit ReadLock(port::RWMutex *mu) : mu_(mu) {
+    this->mu_->ReadLock();
+  }
+  ~ReadLock() { this->mu_->Unlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+  // No copying allowed
+  ReadLock(const ReadLock&);
+  void operator=(const ReadLock&);
+};
+
+
+//
+// Acquire a WriteLock on the specified RWMutex.
+// The Lock will be automatically released when the
+// object goes out of scope.
+//
+class WriteLock {
+ public:
+  explicit WriteLock(port::RWMutex *mu) : mu_(mu) {
+    this->mu_->WriteLock();
+  }
+  ~WriteLock() { this->mu_->Unlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+  // No copying allowed
+  WriteLock(const WriteLock&);
+  void operator=(const WriteLock&);
+};
+
+}  // namespace rocksdb
diff --git a/util/options.cc b/util/options.cc
new file mode 100644 (file)
index 0000000..64cabc8
--- /dev/null
@@ -0,0 +1,337 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/options.h"
+
+#include <limits>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/merge_operator.h"
+#include "table/block_based_table_factory.h"
+
+namespace rocksdb {
+
+Options::Options()
+    : comparator(BytewiseComparator()),
+      merge_operator(nullptr),
+      compaction_filter(nullptr),
+      compaction_filter_factory(
+          std::shared_ptr<CompactionFilterFactory>(
+            new DefaultCompactionFilterFactory())),
+      create_if_missing(false),
+      error_if_exists(false),
+      paranoid_checks(false),
+      env(Env::Default()),
+      info_log(nullptr),
+      write_buffer_size(4<<20),
+      max_write_buffer_number(2),
+      min_write_buffer_number_to_merge(1),
+      max_open_files(1000),
+      block_cache(nullptr),
+      block_cache_compressed(nullptr),
+      block_size(4096),
+      block_restart_interval(16),
+      compression(kSnappyCompression),
+      filter_policy(nullptr),
+      prefix_extractor(nullptr),
+      whole_key_filtering(true),
+      num_levels(7),
+      level0_file_num_compaction_trigger(4),
+      level0_slowdown_writes_trigger(8),
+      level0_stop_writes_trigger(12),
+      max_mem_compaction_level(2),
+      target_file_size_base(2 * 1048576),
+      target_file_size_multiplier(1),
+      max_bytes_for_level_base(10 * 1048576),
+      max_bytes_for_level_multiplier(10),
+      max_bytes_for_level_multiplier_additional(num_levels, 1),
+      expanded_compaction_factor(25),
+      source_compaction_factor(1),
+      max_grandparent_overlap_factor(10),
+      disableDataSync(false),
+      use_fsync(false),
+      db_stats_log_interval(1800),
+      db_log_dir(""),
+      wal_dir(""),
+      disable_seek_compaction(false),
+      delete_obsolete_files_period_micros(6 * 60 * 60 * 1000000UL),
+      max_background_compactions(1),
+      max_background_flushes(0),
+      max_log_file_size(0),
+      log_file_time_to_roll(0),
+      keep_log_file_num(1000),
+      soft_rate_limit(0.0),
+      hard_rate_limit(0.0),
+      rate_limit_delay_max_milliseconds(1000),
+      max_manifest_file_size(std::numeric_limits<uint64_t>::max()),
+      no_block_cache(false),
+      table_cache_numshardbits(4),
+      table_cache_remove_scan_count_limit(16),
+      arena_block_size(0),
+      disable_auto_compactions(false),
+      WAL_ttl_seconds(0),
+      WAL_size_limit_MB(0),
+      manifest_preallocation_size(4 * 1024 * 1024),
+      purge_redundant_kvs_while_flush(true),
+      allow_os_buffer(true),
+      allow_mmap_reads(false),
+      allow_mmap_writes(true),
+      is_fd_close_on_exec(true),
+      skip_log_error_on_recovery(false),
+      stats_dump_period_sec(3600),
+      block_size_deviation (10),
+      advise_random_on_open(true),
+      access_hint_on_compaction_start(NORMAL),
+      use_adaptive_mutex(false),
+      bytes_per_sync(0),
+      compaction_style(kCompactionStyleLevel),
+      filter_deletes(false),
+      max_sequential_skip_in_iterations(8),
+      memtable_factory(std::shared_ptr<SkipListFactory>(new SkipListFactory)),
+      table_factory(
+        std::shared_ptr<TableFactory>(new BlockBasedTableFactory())),
+      inplace_update_support(false),
+      inplace_update_num_locks(10000),
+      max_successive_merges(0) {
+  assert(memtable_factory.get() != nullptr);
+}
+
+static const char* const access_hints[] = {
+  "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED"
+};
+
+void
+Options::Dump(Logger* log) const
+{
+    Log(log,"              Options.comparator: %s", comparator->Name());
+    Log(log,"          Options.merge_operator: %s",
+        merge_operator? merge_operator->Name() : "None");
+    Log(log,"       Options.compaction_filter: %s",
+        compaction_filter? compaction_filter->Name() : "None");
+    Log(log,"       Options.compaction_filter_factory: %s",
+        compaction_filter_factory->Name());
+    Log(log,"        Options.memtable_factory: %s",
+        memtable_factory->Name());
+    Log(log,"           Options.table_factory: %s", table_factory->Name());
+    Log(log,"         Options.error_if_exists: %d", error_if_exists);
+    Log(log,"       Options.create_if_missing: %d", create_if_missing);
+    Log(log,"         Options.paranoid_checks: %d", paranoid_checks);
+    Log(log,"                     Options.env: %p", env);
+    Log(log,"                Options.info_log: %p", info_log.get());
+    Log(log,"       Options.write_buffer_size: %zd", write_buffer_size);
+    Log(log," Options.max_write_buffer_number: %d", max_write_buffer_number);
+    Log(log,"          Options.max_open_files: %d", max_open_files);
+    Log(log,"             Options.block_cache: %p", block_cache.get());
+    Log(log,"  Options.block_cache_compressed: %p",
+        block_cache_compressed.get());
+    if (block_cache) {
+      Log(log,"        Options.block_cache_size: %zd",
+          block_cache->GetCapacity());
+    }
+    if (block_cache_compressed) {
+      Log(log,"Options.block_cache_compressed_size: %zd",
+          block_cache_compressed->GetCapacity());
+    }
+    Log(log,"              Options.block_size: %zd", block_size);
+    Log(log,"  Options.block_restart_interval: %d", block_restart_interval);
+    if (!compression_per_level.empty()) {
+      for (unsigned int i = 0; i < compression_per_level.size(); i++) {
+          Log(log,"       Options.compression[%d]: %d",
+              i, compression_per_level[i]);
+       }
+    } else {
+      Log(log,"         Options.compression: %d", compression);
+    }
+    Log(log,"         Options.filter_policy: %s",
+        filter_policy == nullptr ? "nullptr" : filter_policy->Name());
+    Log(log,"      Options.prefix_extractor: %s",
+        prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name());
+    Log(log,"   Options.whole_key_filtering: %d", whole_key_filtering);
+    Log(log,"            Options.num_levels: %d", num_levels);
+    Log(log,"       Options.disableDataSync: %d", disableDataSync);
+    Log(log,"             Options.use_fsync: %d", use_fsync);
+    Log(log,"     Options.max_log_file_size: %ld", max_log_file_size);
+    Log(log,"Options.max_manifest_file_size: %lu",
+        (unsigned long)max_manifest_file_size);
+    Log(log,"     Options.log_file_time_to_roll: %ld", log_file_time_to_roll);
+    Log(log,"     Options.keep_log_file_num: %ld", keep_log_file_num);
+    Log(log," Options.db_stats_log_interval: %d",
+        db_stats_log_interval);
+    Log(log,"       Options.allow_os_buffer: %d", allow_os_buffer);
+    Log(log,"      Options.allow_mmap_reads: %d", allow_mmap_reads);
+    Log(log,"     Options.allow_mmap_writes: %d", allow_mmap_writes);
+    Log(log,"       Options.min_write_buffer_number_to_merge: %d",
+        min_write_buffer_number_to_merge);
+    Log(log,"        Options.purge_redundant_kvs_while_flush: %d",
+         purge_redundant_kvs_while_flush);
+    Log(log,"           Options.compression_opts.window_bits: %d",
+        compression_opts.window_bits);
+    Log(log,"                 Options.compression_opts.level: %d",
+        compression_opts.level);
+    Log(log,"              Options.compression_opts.strategy: %d",
+        compression_opts.strategy);
+    Log(log,"     Options.level0_file_num_compaction_trigger: %d",
+        level0_file_num_compaction_trigger);
+    Log(log,"         Options.level0_slowdown_writes_trigger: %d",
+        level0_slowdown_writes_trigger);
+    Log(log,"             Options.level0_stop_writes_trigger: %d",
+        level0_stop_writes_trigger);
+    Log(log,"               Options.max_mem_compaction_level: %d",
+        max_mem_compaction_level);
+    Log(log,"                  Options.target_file_size_base: %d",
+        target_file_size_base);
+    Log(log,"            Options.target_file_size_multiplier: %d",
+        target_file_size_multiplier);
+    Log(log,"               Options.max_bytes_for_level_base: %lu",
+        (unsigned long)max_bytes_for_level_base);
+    Log(log,"         Options.max_bytes_for_level_multiplier: %d",
+        max_bytes_for_level_multiplier);
+    for (int i = 0; i < num_levels; i++) {
+      Log(log,"Options.max_bytes_for_level_multiplier_addtl[%d]: %d",
+          i, max_bytes_for_level_multiplier_additional[i]);
+    }
+    Log(log,"      Options.max_sequential_skip_in_iterations: %lu",
+        (unsigned long)max_sequential_skip_in_iterations);
+    Log(log,"             Options.expanded_compaction_factor: %d",
+        expanded_compaction_factor);
+    Log(log,"               Options.source_compaction_factor: %d",
+        source_compaction_factor);
+    Log(log,"         Options.max_grandparent_overlap_factor: %d",
+        max_grandparent_overlap_factor);
+    Log(log,"                             Options.db_log_dir: %s",
+        db_log_dir.c_str());
+    Log(log,"                             Options.wal_dir: %s",
+        wal_dir.c_str());
+    Log(log,"                Options.disable_seek_compaction: %d",
+        disable_seek_compaction);
+    Log(log,"                         Options.no_block_cache: %d",
+        no_block_cache);
+    Log(log,"               Options.table_cache_numshardbits: %d",
+        table_cache_numshardbits);
+    Log(log,"    Options.table_cache_remove_scan_count_limit: %d",
+        table_cache_remove_scan_count_limit);
+    Log(log,"                       Options.arena_block_size: %ld",
+        arena_block_size);
+    Log(log,"    Options.delete_obsolete_files_period_micros: %lu",
+        (unsigned long)delete_obsolete_files_period_micros);
+    Log(log,"             Options.max_background_compactions: %d",
+        max_background_compactions);
+    Log(log,"                 Options.max_background_flushes: %d",
+        max_background_flushes);
+    Log(log,"                      Options.soft_rate_limit: %.2f",
+        soft_rate_limit);
+    Log(log,"                      Options.hard_rate_limit: %.2f",
+        hard_rate_limit);
+    Log(log,"      Options.rate_limit_delay_max_milliseconds: %u",
+        rate_limit_delay_max_milliseconds);
+    Log(log,"               Options.disable_auto_compactions: %d",
+        disable_auto_compactions);
+    Log(log,"                        Options.WAL_ttl_seconds: %lu",
+        (unsigned long)WAL_ttl_seconds);
+    Log(log,"                      Options.WAL_size_limit_MB: %lu",
+        (unsigned long)WAL_size_limit_MB);
+    Log(log,"            Options.manifest_preallocation_size: %ld",
+        manifest_preallocation_size);
+    Log(log,"         Options.purge_redundant_kvs_while_flush: %d",
+        purge_redundant_kvs_while_flush);
+    Log(log,"                         Options.allow_os_buffer: %d",
+        allow_os_buffer);
+    Log(log,"                        Options.allow_mmap_reads: %d",
+        allow_mmap_reads);
+    Log(log,"                       Options.allow_mmap_writes: %d",
+        allow_mmap_writes);
+    Log(log,"                     Options.is_fd_close_on_exec: %d",
+        is_fd_close_on_exec);
+    Log(log,"              Options.skip_log_error_on_recovery: %d",
+        skip_log_error_on_recovery);
+    Log(log,"                   Options.stats_dump_period_sec: %u",
+        stats_dump_period_sec);
+    Log(log,"                    Options.block_size_deviation: %d",
+        block_size_deviation);
+    Log(log,"                   Options.advise_random_on_open: %d",
+        advise_random_on_open);
+    Log(log,"         Options.access_hint_on_compaction_start: %s",
+        access_hints[access_hint_on_compaction_start]);
+    Log(log,"                      Options.use_adaptive_mutex: %d",
+        use_adaptive_mutex);
+    Log(log,"                          Options.bytes_per_sync: %lu",
+        (unsigned long)bytes_per_sync);
+    Log(log,"                          Options.filter_deletes: %d",
+        filter_deletes);
+    Log(log,"                        Options.compaction_style: %d",
+        compaction_style);
+    Log(log," Options.compaction_options_universal.size_ratio: %u",
+        compaction_options_universal.size_ratio);
+    Log(log,"Options.compaction_options_universal.min_merge_width: %u",
+        compaction_options_universal.min_merge_width);
+    Log(log,"Options.compaction_options_universal.max_merge_width: %u",
+        compaction_options_universal.max_merge_width);
+    Log(log,"Options.compaction_options_universal."
+            "max_size_amplification_percent: %u",
+        compaction_options_universal.max_size_amplification_percent);
+    Log(log,
+        "Options.compaction_options_universal.compression_size_percent: %u",
+        compaction_options_universal.compression_size_percent);
+    std::string collector_names;
+    for (auto collector : table_properties_collectors) {
+      collector_names.append(collector->Name());
+      collector_names.append("; ");
+    }
+    Log(log, "                  Options.table_properties_collectors: %s",
+        collector_names.c_str());
+    Log(log, "                  Options.inplace_update_support: %d",
+        inplace_update_support);
+    Log(log, "                Options.inplace_update_num_locks: %zd",
+        inplace_update_num_locks);
+    Log(log, "                   Options.max_successive_merges: %zd",
+        max_successive_merges);
+}   // Options::Dump
+
+//
+// The goal of this method is to create a configuration that
+// allows an application to write all files into L0 and
+// then do a single compaction to output all files into L1.
+Options*
+Options::PrepareForBulkLoad()
+{
+  // never slowdown ingest.
+  level0_file_num_compaction_trigger = (1<<30);
+  level0_slowdown_writes_trigger = (1<<30);
+  level0_stop_writes_trigger = (1<<30);
+
+  // no auto compactions please. The application should issue a
+  // manual compaction after all data is loaded into L0.
+  disable_auto_compactions = true;
+  disable_seek_compaction = true;
+  disableDataSync = true;
+
+  // A manual compaction run should pick all files in L0 in
+  // a single compaction run.
+  source_compaction_factor = (1<<30);
+
+  // It is better to have only 2 levels, otherwise a manual
+  // compaction would compact at every possible level, thereby
+  // increasing the total time needed for compactions.
+  num_levels = 2;
+
+  // Prevent a memtable flush from automatically promoting
+  // files to L1. This is helpful so that all files that are
+  // input to the manual compaction are all at L0.
+  max_background_compactions = 2;
+
+  // The compaction would create large files in L1.
+  target_file_size_base = 256 * 1024 * 1024;
+  return this;
+}
+
+}  // namespace rocksdb
diff --git a/util/perf_context.cc b/util/perf_context.cc
new file mode 100644 (file)
index 0000000..1e8ddfb
--- /dev/null
@@ -0,0 +1,30 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "util/perf_context_imp.h"
+
+namespace rocksdb {
+
+// by default, enable counts only
+PerfLevel perf_level = kEnableCount;
+
+void SetPerfLevel(PerfLevel level) { perf_level = level; }
+
+void PerfContext::Reset() {
+  user_key_comparison_count = 0;
+  block_cache_hit_count = 0;
+  block_read_count = 0;
+  block_read_byte = 0;
+  block_read_time = 0;
+  block_checksum_time = 0;
+  block_decompress_time = 0;
+  internal_key_skipped_count = 0;
+  internal_delete_skipped_count = 0;
+  wal_write_time = 0;
+}
+
+__thread PerfContext perf_context;
+
+}
diff --git a/util/perf_context_imp.h b/util/perf_context_imp.h
new file mode 100644 (file)
index 0000000..f7818e6
--- /dev/null
@@ -0,0 +1,34 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "rocksdb/perf_context.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+extern enum PerfLevel perf_level;
+
+inline void StartPerfTimer(StopWatchNano* timer) {
+  if (perf_level >= PerfLevel::kEnableTime) {
+    timer->Start();
+  }
+}
+
+inline void BumpPerfCount(uint64_t* count, uint64_t delta = 1) {
+  if (perf_level >= PerfLevel::kEnableCount) {
+    *count += delta;
+  }
+}
+
+inline void BumpPerfTime(uint64_t* time,
+                         StopWatchNano* timer,
+                         bool reset = true) {
+  if (perf_level >= PerfLevel::kEnableTime) {
+    *time += timer->ElapsedNanos(reset);
+  }
+}
+
+}
diff --git a/util/posix_logger.h b/util/posix_logger.h
new file mode 100644 (file)
index 0000000..8f7463c
--- /dev/null
@@ -0,0 +1,154 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Logger implementation that can be shared by all environments
+// where enough posix functionality is available.
+
+#pragma once
+#include <algorithm>
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include <fcntl.h>
+#include <unistd.h>
+#ifdef OS_LINUX
+#include <linux/falloc.h>
+#endif
+#include "rocksdb/env.h"
+#include <atomic>
+
+namespace rocksdb {
+
+const int kDebugLogChunkSize = 128 * 1024;
+
+class PosixLogger : public Logger {
+ private:
+  FILE* file_;
+  uint64_t (*gettid_)();  // Return the thread id for the current thread
+  std::atomic_size_t log_size_;
+  int fd_;
+  const static uint64_t flush_every_seconds_ = 5;
+  std::atomic_uint_fast64_t last_flush_micros_;
+  Env* env_;
+  bool flush_pending_;
+ public:
+  PosixLogger(FILE* f, uint64_t (*gettid)(), Env* env) :
+    file_(f), gettid_(gettid), log_size_(0), fd_(fileno(f)),
+    last_flush_micros_(0), env_(env), flush_pending_(false) { }
+  virtual ~PosixLogger() {
+    fclose(file_);
+  }
+  virtual void Flush() {
+    if (flush_pending_) {
+      flush_pending_ = false;
+      fflush(file_);
+    }
+    last_flush_micros_ = env_->NowMicros();
+  }
+  virtual void Logv(const char* format, va_list ap) {
+    const uint64_t thread_id = (*gettid_)();
+
+    // We try twice: the first time with a fixed-size stack allocated buffer,
+    // and the second time with a much larger dynamically allocated buffer.
+    char buffer[500];
+    for (int iter = 0; iter < 2; iter++) {
+      char* base;
+      int bufsize;
+      if (iter == 0) {
+        bufsize = sizeof(buffer);
+        base = buffer;
+      } else {
+        bufsize = 30000;
+        base = new char[bufsize];
+      }
+      char* p = base;
+      char* limit = base + bufsize;
+
+      struct timeval now_tv;
+      gettimeofday(&now_tv, nullptr);
+      const time_t seconds = now_tv.tv_sec;
+      struct tm t;
+      localtime_r(&seconds, &t);
+      p += snprintf(p, limit - p,
+                    "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
+                    t.tm_year + 1900,
+                    t.tm_mon + 1,
+                    t.tm_mday,
+                    t.tm_hour,
+                    t.tm_min,
+                    t.tm_sec,
+                    static_cast<int>(now_tv.tv_usec),
+                    static_cast<long long unsigned int>(thread_id));
+
+      // Print the message
+      if (p < limit) {
+        va_list backup_ap;
+        va_copy(backup_ap, ap);
+        p += vsnprintf(p, limit - p, format, backup_ap);
+        va_end(backup_ap);
+      }
+
+      // Truncate to available space if necessary
+      if (p >= limit) {
+        if (iter == 0) {
+          continue;       // Try again with larger buffer
+        } else {
+          p = limit - 1;
+        }
+      }
+
+      // Add newline if necessary
+      if (p == base || p[-1] != '\n') {
+        *p++ = '\n';
+      }
+
+      assert(p <= limit);
+      const size_t write_size = p - base;
+
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+      // If this write would cross a boundary of kDebugLogChunkSize
+      // space, pre-allocate more space to avoid overly large
+      // allocations from filesystem allocsize options.
+      const size_t log_size = log_size_;
+      const int last_allocation_chunk =
+        ((kDebugLogChunkSize - 1 + log_size) / kDebugLogChunkSize);
+      const int desired_allocation_chunk =
+        ((kDebugLogChunkSize - 1 + log_size + write_size) /
+           kDebugLogChunkSize);
+      if (last_allocation_chunk != desired_allocation_chunk) {
+        fallocate(fd_, FALLOC_FL_KEEP_SIZE, 0,
+                  desired_allocation_chunk * kDebugLogChunkSize);
+      }
+#endif
+
+      size_t sz = fwrite(base, 1, write_size, file_);
+      flush_pending_ = true;
+      assert(sz == write_size);
+      if (sz > 0) {
+        log_size_ += write_size;
+      }
+      uint64_t now_micros = static_cast<uint64_t>(now_tv.tv_sec) * 1000000 +
+        now_tv.tv_usec;
+      if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) {
+        flush_pending_ = false;
+        fflush(file_);
+        last_flush_micros_ = now_micros;
+      }
+      if (base != buffer) {
+        delete[] base;
+      }
+      break;
+    }
+  }
+  size_t GetLogFileSize() const {
+    return log_size_;
+  }
+};
+
+}  // namespace rocksdb
diff --git a/util/random.h b/util/random.h
new file mode 100644 (file)
index 0000000..e5b3315
--- /dev/null
@@ -0,0 +1,90 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <random>
+#include <stdint.h>
+
+namespace rocksdb {
+
+// A very simple random number generator.  Not especially good at
+// generating truly random bits, but good enough for our needs in this
+// package.
+class Random {
+ private:
+  uint32_t seed_;
+ public:
+  // Masks the seed down to 31 bits.
+  // NOTE(review): a seed of 0 (or any multiple of 2^31-1) degenerates --
+  // every Next() then returns the same value, per the comment in Next();
+  // callers are expected to supply a nonzero seed.
+  explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { }
+  // One step of the Park-Miller-style linear congruential generator
+  // seed_ = (seed_ * A) % M with M = 2^31-1, A = 16807.
+  uint32_t Next() {
+    static const uint32_t M = 2147483647L;   // 2^31-1
+    static const uint64_t A = 16807;  // bits 14, 8, 7, 5, 2, 1, 0
+    // We are computing
+    //       seed_ = (seed_ * A) % M,    where M = 2^31-1
+    //
+    // seed_ must not be zero or M, or else all subsequent computed values
+    // will be zero or M respectively.  For all other values, seed_ will end
+    // up cycling through every number in [1,M-1]
+    uint64_t product = seed_ * A;
+
+    // Compute (product % M) using the fact that ((x << 31) % M) == x.
+    seed_ = static_cast<uint32_t>((product >> 31) + (product & M));
+    // The first reduction may overflow by 1 bit, so we may need to
+    // repeat.  mod == M is not possible; using > allows the faster
+    // sign-bit-based test.
+    if (seed_ > M) {
+      seed_ -= M;
+    }
+    return seed_;
+  }
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint32_t Uniform(int n) { return Next() % n; }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(int n) { return (Next() % n) == 0; }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits.  The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  // NOTE(review): "1 << base" shifts an int, so max_log must be <= 30
+  // to avoid undefined behavior -- confirm callers respect this.
+  uint32_t Skewed(int max_log) {
+    return Uniform(1 << Uniform(max_log + 1));
+  }
+};
+
+// A simple 64bit random number generator based on std::mt19937_64
+class Random64 {
+ private:
+  std::mt19937_64 generator_;
+
+ public:
+  explicit Random64(uint64_t s) : generator_(s) { }
+
+  // Generates the next random number
+  uint64_t Next() { return generator_(); }
+
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint64_t Uniform(uint64_t n) {
+    return std::uniform_int_distribution<uint64_t>(0, n - 1)(generator_);
+  }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(uint64_t n) { return Uniform(n) == 0; }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits.  The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint64_t Skewed(int max_log) {
+    return Uniform(1 << Uniform(max_log + 1));
+  }
+};
+
+}  // namespace rocksdb
diff --git a/util/signal_test.cc b/util/signal_test.cc
new file mode 100644 (file)
index 0000000..bffc298
--- /dev/null
@@ -0,0 +1,32 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "util/stack_trace.h"
+#include <assert.h>
+
+// Deliberately dereferences a null pointer so the installed handler can
+// demonstrate printing the call stack.
+void f0() {
+  char *p = nullptr;
+  *p = 10;  /* SIGSEGV here!! */
+}
+
+// Trivial nested wrappers that deepen the call stack so the printed
+// trace has several frames to show.
+void f1() {
+  f0();
+}
+
+void f2() {
+  f1();
+}
+
+void f3() {
+  f2();
+}
+
+// Installs the stack-trace signal handler, then triggers a SIGSEGV via
+// f3(); the expected outcome is a printed stack trace, not a clean return.
+int main() {
+  rocksdb::InstallStackTraceHandler();
+
+  f3();
+
+  return 0;
+}
diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc
new file mode 100644 (file)
index 0000000..955d754
--- /dev/null
@@ -0,0 +1,104 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "rocksdb/memtablerep.h"
+#include "db/memtable.h"
+#include "db/skiplist.h"
+
+namespace rocksdb {
+namespace {
+// MemTableRep implementation that stores the encoded entries in a
+// SkipList; all node memory comes from the Arena passed at construction.
+class SkipListRep : public MemTableRep {
+  SkipList<const char*, MemTableRep::KeyComparator&> skip_list_;
+public:
+  explicit SkipListRep(MemTableRep::KeyComparator& compare, Arena* arena)
+    : skip_list_(compare, arena) {
+}
+
+  // Insert key into the list.
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  virtual void Insert(const char* key) override {
+    skip_list_.Insert(key);
+  }
+
+  // Returns true iff an entry that compares equal to key is in the list.
+  virtual bool Contains(const char* key) const override {
+    return skip_list_.Contains(key);
+  }
+
+  virtual size_t ApproximateMemoryUsage() override {
+    // All memory is allocated through arena; nothing to report here
+    return 0;
+  }
+
+  virtual ~SkipListRep() override { }
+
+  // Iteration over the contents of a skip list; every operation simply
+  // delegates to the underlying SkipList iterator.
+  class Iterator : public MemTableRep::Iterator {
+    SkipList<const char*, MemTableRep::KeyComparator&>::Iterator iter_;
+   public:
+    // Initialize an iterator over the specified list.
+    // The returned iterator is not valid.
+    explicit Iterator(
+      const SkipList<const char*, MemTableRep::KeyComparator&>* list
+    ) : iter_(list) { }
+
+    virtual ~Iterator() override { }
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const override {
+      return iter_.Valid();
+    }
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const override {
+      return iter_.key();
+    }
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() override {
+      iter_.Next();
+    }
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() override {
+      iter_.Prev();
+    }
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const char* target) override {
+      iter_.Seek(target);
+    }
+
+    // Position at the first entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    virtual void SeekToFirst() override {
+      iter_.SeekToFirst();
+    }
+
+    // Position at the last entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    virtual void SeekToLast() override {
+      iter_.SeekToLast();
+    }
+  };
+
+  // Unhide default implementations of GetIterator
+  using MemTableRep::GetIterator;
+
+  // Returns a freshly heap-allocated iterator over the skip list.
+  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override {
+    return std::make_shared<SkipListRep::Iterator>(&skip_list_);
+  }
+};
+}
+
+// Factory entry point: builds a skip-list-backed memtable representation.
+// All node memory is drawn from "arena"; "compare" orders the encoded keys.
+std::shared_ptr<MemTableRep> SkipListFactory::CreateMemTableRep (
+  MemTableRep::KeyComparator& compare, Arena* arena) {
+    // make_shared places the control block and object in one allocation,
+    // unlike shared_ptr(new ...) which performs two.
+    return std::make_shared<SkipListRep>(compare, arena);
+}
+
+} // namespace rocksdb
diff --git a/util/slice.cc b/util/slice.cc
new file mode 100644 (file)
index 0000000..55f561f
--- /dev/null
@@ -0,0 +1,73 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+namespace {
+
+class FixedPrefixTransform : public SliceTransform {
+ private:
+  size_t prefix_len_;
+
+ public:
+  explicit FixedPrefixTransform(size_t prefix_len) : prefix_len_(prefix_len) { }
+
+  virtual const char* Name() const {
+    return "rocksdb.FixedPrefix";
+  }
+
+  virtual Slice Transform(const Slice& src) const {
+    assert(InDomain(src));
+    return Slice(src.data(), prefix_len_);
+  }
+
+  virtual bool InDomain(const Slice& src) const {
+    return (src.size() >= prefix_len_);
+  }
+
+  virtual bool InRange(const Slice& dst) const {
+    return (dst.size() == prefix_len_);
+  }
+};
+
+// Identity SliceTransform: every slice maps to itself.
+// "override" added for consistency with the other new files in this
+// change (e.g. skiplistrep.cc).
+class NoopTransform : public SliceTransform {
+ public:
+  explicit NoopTransform() { }
+
+  virtual const char* Name() const override {
+    return "rocksdb.Noop";
+  }
+
+  virtual Slice Transform(const Slice& src) const override {
+    return src;
+  }
+
+  virtual bool InDomain(const Slice& src) const override {
+    return true;
+  }
+
+  virtual bool InRange(const Slice& dst) const override {
+    return true;
+  }
+};
+
+}
+
+// Returns a heap-allocated FixedPrefixTransform; the caller owns the
+// returned pointer.
+const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) {
+  return new FixedPrefixTransform(prefix_len);
+}
+
+// Returns a heap-allocated NoopTransform; the caller owns the returned
+// pointer.
+const SliceTransform* NewNoopTransform() {
+  return new NoopTransform;
+}
+
+}  // namespace rocksdb
diff --git a/util/stack_trace.h b/util/stack_trace.h
new file mode 100644 (file)
index 0000000..3b06e1d
--- /dev/null
@@ -0,0 +1,17 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+namespace rocksdb {
+
+// Install a signal handler to print callstack on the following signals:
+// SIGILL SIGSEGV SIGBUS SIGABRT
+// Currently supports linux only. No-op otherwise.
+void InstallStackTraceHandler();
+
+// Prints the current call stack, skipping the first
+// "first_frames_to_skip" frames.
+void PrintStack(int first_frames_to_skip = 0);
+
+}   // namespace rocksdb
diff --git a/util/statistics.cc b/util/statistics.cc
new file mode 100644 (file)
index 0000000..5f7a5ba
--- /dev/null
@@ -0,0 +1,60 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "rocksdb/statistics.h"
+#include <cstdio>
+
+namespace rocksdb {
+
+namespace {
+// a buffer size used for temp string buffers
+const int kBufferSize = 200;
+
+// Renders one histogram as a single line:
+// "<name> statistics Percentiles :=> 50 : <p50> 95 : <p95> 99 : <p99>".
+std::string HistogramToString (
+    Statistics* dbstats,
+    const Histograms& histogram_type,
+    const std::string& name) {
+
+  char buffer[kBufferSize];
+  HistogramData histogramData;
+  dbstats->histogramData(histogram_type, &histogramData);
+  snprintf(
+      buffer,
+      kBufferSize,
+      "%s statistics Percentiles :=> 50 : %f 95 : %f 99 : %f\n",
+      name.c_str(),
+      histogramData.median,
+      histogramData.percentile95,
+      histogramData.percentile99
+  );
+  return std::string(buffer);
+}  // (dropped stray ';' after the function body)
+
+// Renders one ticker counter as "<name> COUNT : <value>\n".
+// NOTE(review): "%ld" assumes getTickerCount() returns a long; confirm the
+// format specifier matches its actual return type on all targets.
+std::string TickerToString (
+    Statistics* dbstats,
+    const Tickers& ticker,
+    const std::string& name) {
+
+  char buffer[kBufferSize];
+  snprintf(buffer, kBufferSize, "%s COUNT : %ld\n",
+            name.c_str(), dbstats->getTickerCount(ticker));
+  return std::string(buffer);
+}  // (dropped stray ';' after the function body)
+} // namespace
+
+// Renders every ticker and every histogram, one per line, into a single
+// string suitable for logging.
+std::string Statistics::ToString() {
+  std::string res;
+  res.reserve(20000);  // one up-front allocation; avoids repeated growth
+  for (const auto& t : TickersNameMap) {
+    res.append(TickerToString(this, t.first, t.second));
+  }
+  for (const auto& h : HistogramsNameMap) {
+    res.append(HistogramToString(this, h.first, h.second));
+  }
+  res.shrink_to_fit();  // give back the unused part of the reservation
+  return res;
+}
+
+} // namespace rocksdb
diff --git a/util/statistics_imp.h b/util/statistics_imp.h
new file mode 100644 (file)
index 0000000..0dc8884
--- /dev/null
@@ -0,0 +1,32 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "rocksdb/statistics.h"
+
+namespace rocksdb {
+
+// Utility functions
+// Adds "count" to "ticker" on "statistics"; a null statistics pointer is
+// a no-op.  The asserts check that the name maps stay in sync with the
+// ticker/histogram enums.
+inline void RecordTick(Statistics* statistics,
+                       Tickers ticker,
+                       uint64_t count = 1) {
+  assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX);
+  assert(TickersNameMap.size() == TICKER_ENUM_MAX);
+  if (statistics) {
+    statistics->recordTick(ticker, count);
+  }
+}
+
+// Overwrites "ticker"'s value on "statistics" with "count"; a null
+// statistics pointer is a no-op.  The asserts check that the name maps
+// stay in sync with the ticker/histogram enums.
+inline void SetTickerCount(Statistics* statistics,
+                           Tickers ticker,
+                           uint64_t count) {
+  assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX);
+  assert(TickersNameMap.size() == TICKER_ENUM_MAX);
+  if (statistics) {
+    statistics->setTickerCount(ticker, count);
+  }
+}
+
+}
diff --git a/util/stats_logger.h b/util/stats_logger.h
new file mode 100644 (file)
index 0000000..f0b4540
--- /dev/null
@@ -0,0 +1,26 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+
+namespace rocksdb {
+
+// Interface for exporting periodic deployment statistics (sizes, file
+// counts, per-level breakdowns) to an external sink.
+// NOTE(review): this header names std::string and fixed-width integers but
+// includes neither <string> nor <stdint.h>; it relies on includers --
+// confirm or add the includes.
+class StatsLogger {
+
+ public:
+
+  virtual void Log_Deploy_Stats(const std::string& db_version,
+                                const std::string& machine_info,
+                                const std::string& data_dir,
+                                const uint64_t data_size,
+                                const uint32_t file_number,
+                                const std::string& data_size_per_level,
+                                const std::string& file_number_per_level,
+                                const int64_t& ts_unix) = 0;
+  virtual ~StatsLogger() {}
+
+};
+
+}
diff --git a/util/status.cc b/util/status.cc
new file mode 100644 (file)
index 0000000..69060a7
--- /dev/null
@@ -0,0 +1,80 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+#include "port/port.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+// state_ buffer layout: bytes [0..3] hold the message length as a
+// uint32_t; the message itself follows with no terminator.
+// Returns a heap-allocated copy of "state"; the caller owns the result.
+const char* Status::CopyState(const char* state) {
+  uint32_t size;
+  memcpy(&size, state, sizeof(size));
+  char* result = new char[size + 4];  // 4 = length header, see above
+  memcpy(result, state, size + 4);
+  return result;
+}
+
+// Builds the packed state_ buffer: [length:4][msg]([": "][msg2] when msg2
+// is non-empty).  The stored length covers everything after the 4-byte
+// header.  Only non-OK statuses carry a state_ buffer.
+Status::Status(Code code, const Slice& msg, const Slice& msg2) :
+    code_(code) {
+  assert(code != kOk);
+  const uint32_t len1 = msg.size();
+  const uint32_t len2 = msg2.size();
+  const uint32_t size = len1 + (len2 ? (2 + len2) : 0);  // 2 = ": "
+  char* result = new char[size + 4];
+  memcpy(result, &size, sizeof(size));
+  memcpy(result + 4, msg.data(), len1);
+  if (len2) {
+    result[4 + len1] = ':';
+    result[5 + len1] = ' ';
+    memcpy(result + 6 + len1, msg2.data(), len2);
+  }
+  state_ = result;
+}
+
+// Human-readable rendering: "OK" for success, otherwise
+// "<type>: <message>", where the message is decoded from the packed
+// state_ buffer (see CopyState for the layout).
+std::string Status::ToString() const {
+  char tmp[30];
+  const char* type;
+  switch (code_) {
+    case kOk:
+      return "OK";
+    case kNotFound:
+      type = "NotFound: ";
+      break;
+    case kCorruption:
+      type = "Corruption: ";
+      break;
+    case kNotSupported:
+      type = "Not implemented: ";
+      break;
+    case kInvalidArgument:
+      type = "Invalid argument: ";
+      break;
+    case kIOError:
+      type = "IO error: ";
+      break;
+    case kMergeInProgress:
+      type = "Merge In Progress: ";
+      break;
+    default:
+      // tmp is function-scoped, so "type" may safely point at it after
+      // the switch.
+      snprintf(tmp, sizeof(tmp), "Unknown code(%d): ",
+               static_cast<int>(code()));
+      type = tmp;
+      break;
+  }
+  std::string result(type);
+  if (state_ != nullptr) {
+    uint32_t length;
+    memcpy(&length, state_, sizeof(length));
+    result.append(state_ + 4, length);  // skip the 4-byte length header
+  }
+  return result;
+}
+
+}  // namespace rocksdb
diff --git a/util/stl_wrappers.h b/util/stl_wrappers.h
new file mode 100644 (file)
index 0000000..b4c14b4
--- /dev/null
@@ -0,0 +1,32 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+
+#include "util/murmurhash.h"
+#include "util/coding.h"
+
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+namespace stl_wrappers {
+  // Shared base that stores the key comparator used by the wrappers below.
+  class Base {
+   protected:
+    const MemTableRep::KeyComparator& compare_;
+    explicit Base(const MemTableRep::KeyComparator& compare)
+      : compare_(compare) { }
+  };
+
+  // Strict-weak-ordering functor over encoded keys, usable with std::set,
+  // std::sort and friends.
+  struct Compare : private Base {
+    explicit Compare(const MemTableRep::KeyComparator& compare)
+      : Base(compare) { }
+    inline bool operator()(const char* a, const char* b) const {
+      return compare_(a, b) < 0;
+    }
+  };
+
+}
+}
diff --git a/util/stop_watch.h b/util/stop_watch.h
new file mode 100644 (file)
index 0000000..6325a74
--- /dev/null
@@ -0,0 +1,71 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "rocksdb/env.h"
+#include "util/statistics_imp.h"
+
+namespace rocksdb {
+// Auto-scoped.
+// Records the statistic into the corresponding histogram.
+class StopWatch {
+ public:
+  // NOTE(review): the start time is captured when auto_start is true OR a
+  // statistics object is supplied (the destructor always measures in the
+  // latter case).  With auto_start == false and no statistics, start_time_
+  // stays 0 and ElapsedMicros() is meaningless -- there is no later
+  // Start() method; confirm this is intended.
+  explicit StopWatch(
+    Env * const env,
+    Statistics* statistics = nullptr,
+    const Histograms histogram_name = DB_GET,
+    bool auto_start = true) :
+      env_(env),
+      start_time_((!auto_start && !statistics) ? 0 : env->NowMicros()),
+      statistics_(statistics),
+      histogram_name_(histogram_name) {}
+
+
+
+  // Microseconds since construction (when the start time was captured).
+  uint64_t ElapsedMicros() {
+    return env_->NowMicros() - start_time_;
+  }
+
+  // Records the elapsed time into the configured histogram, if any.
+  ~StopWatch() {
+    if (statistics_) {
+      statistics_->measureTime(histogram_name_, ElapsedMicros());
+    }
+  }
+
+ private:
+  Env* const env_;
+  const uint64_t start_time_;
+  Statistics* statistics_;
+  const Histograms histogram_name_;
+
+};
+
+// a nano second precision stopwatch
+class StopWatchNano {
+ public:
+  explicit StopWatchNano(Env* const env, bool auto_start = false)
+      : env_(env), start_(0) {
+    if (auto_start) {
+      Start();
+    }
+  }
+
+  void Start() { start_ = env_->NowNanos(); }
+
+  uint64_t ElapsedNanos(bool reset = false) {
+    auto now = env_->NowNanos();
+    auto elapsed = now - start_;
+    if (reset) {
+      start_ = now;
+    }
+    return elapsed;
+  }
+
+ private:
+  Env* const env_;
+  uint64_t start_;
+};
+
+} // namespace rocksdb
diff --git a/util/string_util.cc b/util/string_util.cc
new file mode 100644 (file)
index 0000000..33f84d9
--- /dev/null
@@ -0,0 +1,26 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+// Splits "arg" at every occurrence of "delim" and returns the pieces in
+// order.  An empty input yields an empty vector; consecutive delimiters
+// yield empty strings, matching std::getline semantics.
+// (Removed the namespace-scope "using namespace std;" and the redundant
+// per-name using declarations; names are now qualified explicitly.)
+std::vector<std::string> stringSplit(std::string arg, char delim) {
+  std::vector<std::string> splits;
+  std::stringstream ss(arg);
+  std::string item;
+  while (std::getline(ss, item, delim)) {
+    splits.push_back(item);
+  }
+  return splits;
+}
+}
diff --git a/util/string_util.h b/util/string_util.h
new file mode 100644 (file)
index 0000000..7dfd68a
--- /dev/null
@@ -0,0 +1,11 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+// NOTE(review): this header names std::vector and std::string but includes
+// neither <vector> nor <string>; it currently relies on includers having
+// pulled those in first -- confirm or add the includes.
+namespace rocksdb {
+
+// Splits "arg" at every occurrence of "delim"; defined in string_util.cc.
+extern std::vector<std::string> stringSplit(std::string arg, char delim);
+
+}
diff --git a/util/testharness.cc b/util/testharness.cc
new file mode 100644 (file)
index 0000000..85716cd
--- /dev/null
@@ -0,0 +1,82 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/testharness.h"
+
+#include <string>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+namespace rocksdb {
+namespace test {
+
+namespace {
+// One registered test case: "base.name" plus the function to invoke.
+struct Test {
+  const char* base;
+  const char* name;
+  void (*func)();
+};
+// Global registry, lazily allocated by RegisterTest(); intentionally
+// never freed (lives for the whole test run).
+std::vector<Test>* tests;
+}
+
+// Registers a test so RunAllTests() can find it.  Invoked via the TEST()
+// macro through a global bool initializer; always returns true.
+bool RegisterTest(const char* base, const char* name, void (*func)()) {
+  if (tests == nullptr) {
+    tests = new std::vector<Test>;
+  }
+  Test entry = {base, name, func};
+  tests->push_back(entry);
+  return true;
+}
+
+int RunAllTests() {
+  const char* matcher = getenv("ROCKSDB_TESTS");
+
+  int num = 0;
+  if (tests != nullptr) {
+    for (unsigned int i = 0; i < tests->size(); i++) {
+      const Test& t = (*tests)[i];
+      if (matcher != nullptr) {
+        std::string name = t.base;
+        name.push_back('.');
+        name.append(t.name);
+        if (strstr(name.c_str(), matcher) == nullptr) {
+          continue;
+        }
+      }
+      fprintf(stderr, "==== Test %s.%s\n", t.base, t.name);
+      (*t.func)();
+      ++num;
+    }
+  }
+  fprintf(stderr, "==== PASSED %d tests\n", num);
+  return 0;
+}
+
+// Returns the directory to use for temporary test storage, as chosen by
+// the default Env; dies (via ASSERT_TRUE) if none can be provided.
+std::string TmpDir() {
+  std::string dir;
+  Status s = Env::Default()->GetTestDirectory(&dir);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  return dir;
+}
+
+// Returns the randomization seed for this run: the TEST_RANDOM_SEED
+// environment variable when set to a positive integer, 301 otherwise.
+int RandomSeed() {
+  static const int kDefaultSeed = 301;
+  const char* env = getenv("TEST_RANDOM_SEED");
+  int seed = (env != nullptr) ? atoi(env) : kDefaultSeed;
+  if (seed <= 0) {
+    seed = kDefaultSeed;
+  }
+  return seed;
+}
+
+}  // namespace test
+}  // namespace rocksdb
diff --git a/util/testharness.h b/util/testharness.h
new file mode 100644 (file)
index 0000000..f159178
--- /dev/null
@@ -0,0 +1,142 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sstream>
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "util/random.h"
+#include "util/stack_trace.h"
+
+namespace rocksdb {
+namespace test {
+
+// Run some of the tests registered by the TEST() macro.  If the
+// environment variable "ROCKSDB_TESTS" is not set, runs all tests.
+// Otherwise, runs only the tests whose name contains the value of
+// "ROCKSDB_TESTS" as a substring.  E.g., suppose the tests are:
+//    TEST(Foo, Hello) { ... }
+//    TEST(Foo, World) { ... }
+// ROCKSDB_TESTS=Hello will run the first test
+// ROCKSDB_TESTS=o     will run both tests
+// ROCKSDB_TESTS=Junk  will run no tests
+//
+// Returns 0 if all tests pass.
+// Dies or returns a non-zero value if some test fails.
+extern int RunAllTests();
+
+// Return the directory to use for temporary storage.
+extern std::string TmpDir();
+
+// Return a randomization seed for this run.  Typically returns the
+// same number on repeated invocations of this binary, but automated
+// runs may be able to vary the seed.
+extern int RandomSeed();
+
+// An instance of Tester is allocated to hold temporary state during
+// the execution of an assertion.
+// Holds the pass/fail state for one assertion site; the ASSERT_* macros
+// construct a temporary Tester, so failure is reported (and the process
+// exited) from the destructor at the end of the full expression.
+class Tester {
+ private:
+  bool ok_;
+  const char* fname_;
+  int line_;
+  std::stringstream ss_;
+
+ public:
+  Tester(const char* f, int l)
+      : ok_(true), fname_(f), line_(l) {
+  }
+
+  // On failure: print "file:line: message", dump the stack (skipping the
+  // Tester/PrintStack frames), and exit non-zero.
+  ~Tester() {
+    if (!ok_) {
+      fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str());
+      PrintStack(2);
+      exit(1);
+    }
+  }
+
+  // Records a boolean check; "msg" is the stringified condition.
+  Tester& Is(bool b, const char* msg) {
+    if (!b) {
+      ss_ << " Assertion failure " << msg;
+      ok_ = false;
+    }
+    return *this;
+  }
+
+  // Records a Status check; failure text is the status description.
+  Tester& IsOk(const Status& s) {
+    if (!s.ok()) {
+      ss_ << " " << s.ToString();
+      ok_ = false;
+    }
+    return *this;
+  }
+
+// Generates one comparison method (IsEq, IsNe, ...) per operator.
+#define BINARY_OP(name,op)                              \
+  template <class X, class Y>                           \
+  Tester& name(const X& x, const Y& y) {                \
+    if (! (x op y)) {                                   \
+      ss_ << " failed: " << x << (" " #op " ") << y;    \
+      ok_ = false;                                      \
+    }                                                   \
+    return *this;                                       \
+  }
+
+  BINARY_OP(IsEq, ==)
+  BINARY_OP(IsNe, !=)
+  BINARY_OP(IsGe, >=)
+  BINARY_OP(IsGt, >)
+  BINARY_OP(IsLe, <=)
+  BINARY_OP(IsLt, <)
+#undef BINARY_OP
+
+  // Attach the specified value to the error message if an error has occurred
+  template <class V>
+  Tester& operator<<(const V& value) {
+    if (!ok_) {
+      ss_ << " " << value;
+    }
+    return *this;
+  }
+};
+
+// Assertion entry points for tests; each evaluates to a Tester temporary,
+// so extra context can be streamed in with operator<<.
+#define ASSERT_TRUE(c) ::rocksdb::test::Tester(__FILE__, __LINE__).Is((c), #c)
+#define ASSERT_OK(s) ::rocksdb::test::Tester(__FILE__, __LINE__).IsOk((s))
+#define ASSERT_EQ(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsEq((a),(b))
+#define ASSERT_NE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsNe((a),(b))
+#define ASSERT_GE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsGe((a),(b))
+#define ASSERT_GT(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsGt((a),(b))
+#define ASSERT_LE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsLe((a),(b))
+#define ASSERT_LT(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsLt((a),(b))
+
+// Token pasting through one level of indirection so macro arguments are
+// expanded before concatenation.
+#define TCONCAT(a,b) TCONCAT1(a,b)
+#define TCONCAT1(a,b) a##b
+
+// Declares a test subclass of "base", registers it through a global bool
+// initializer, and opens the body of its _Run() method.
+#define TEST(base,name)                                                 \
+class TCONCAT(_Test_,name) : public base {                              \
+ public:                                                                \
+  void _Run();                                                          \
+  static void _RunIt() {                                                \
+    TCONCAT(_Test_,name) t;                                             \
+    t._Run();                                                           \
+  }                                                                     \
+};                                                                      \
+bool TCONCAT(_Test_ignored_,name) =                                     \
+  ::rocksdb::test::RegisterTest(#base, #name, &TCONCAT(_Test_,name)::_RunIt); \
+void TCONCAT(_Test_,name)::_Run()
+
+// Register the specified test.  Typically not used directly, but
+// invoked via the macro expansion of TEST.
+extern bool RegisterTest(const char* base, const char* name, void (*func)());
+
+
+}  // namespace test
+}  // namespace rocksdb
diff --git a/util/testutil.cc b/util/testutil.cc
new file mode 100644 (file)
index 0000000..13e781e
--- /dev/null
@@ -0,0 +1,56 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/testutil.h"
+
+#include "util/random.h"
+
+namespace rocksdb {
+namespace test {
+
+// Fills *dst with "len" random printable characters (' '..'~') and
+// returns a Slice referencing *dst's storage.
+Slice RandomString(Random* rnd, int len, std::string* dst) {
+  dst->resize(len);
+  for (char& ch : *dst) {
+    ch = static_cast<char>(' ' + rnd->Uniform(95));  // printable ASCII
+  }
+  return Slice(*dst);
+}
+
+// Returns a random key of length "len" drawn from a 10-character alphabet
+// that includes extreme byte values (\0, \xff, ...).
+std::string RandomKey(Random* rnd, int len) {
+  // Make sure to generate a wide variety of characters so we
+  // test the boundary conditions for short-key optimizations.
+  static const char kTestChars[] = {
+    '\0', '\1', 'a', 'b', 'c', 'd', 'e', '\xfd', '\xfe', '\xff'
+  };
+  std::string result;
+  for (int i = 0; i < len; i++) {
+    result += kTestChars[rnd->Uniform(sizeof(kTestChars))];
+  }
+  return result;
+}
+
+
+// Fills *dst with "len" bytes that compress to roughly
+// len * compressed_fraction bytes, by repeating a random prefix of that
+// size; returns a Slice referencing *dst's storage.
+extern Slice CompressibleString(Random* rnd, double compressed_fraction,
+                                int len, std::string* dst) {
+  int raw = static_cast<int>(len * compressed_fraction);
+  if (raw < 1) raw = 1;  // always at least one byte of random data
+  std::string raw_data;
+  RandomString(rnd, raw, &raw_data);
+
+  // Duplicate the random data until we have filled "len" bytes
+  dst->clear();
+  while (dst->size() < (unsigned int)len) {
+    dst->append(raw_data);
+  }
+  dst->resize(len);
+  return Slice(*dst);
+}
+
+}  // namespace test
+}  // namespace rocksdb
diff --git a/util/testutil.h b/util/testutil.h
new file mode 100644 (file)
index 0000000..c73210f
--- /dev/null
@@ -0,0 +1,55 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "util/random.h"
+
+namespace rocksdb {
+namespace test {
+
+// Store in *dst a random string of length "len" and return a Slice that
+// references the generated data.
+extern Slice RandomString(Random* rnd, int len, std::string* dst);
+
+// Return a random key with the specified length that may contain interesting
+// characters (e.g. \x00, \xff, etc.).
+extern std::string RandomKey(Random* rnd, int len);
+
+// Store in *dst a string of length "len" that will compress to
+// "N*compressed_fraction" bytes and return a Slice that references
+// the generated data.
+extern Slice CompressibleString(Random* rnd, double compressed_fraction,
+                                int len, std::string* dst);
+
+// A wrapper that allows injection of errors.
+class ErrorEnv : public EnvWrapper {
+ public:
+  bool writable_file_error_;      // when true, NewWritableFile fails
+  int num_writable_file_errors_;  // number of injected failures so far
+
+  ErrorEnv() : EnvWrapper(Env::Default()),
+               writable_file_error_(false),
+               num_writable_file_errors_(0) { }
+
+  // Returns a fake IOError while writable_file_error_ is set; otherwise
+  // forwards to the wrapped Env.
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& soptions) {
+    result->reset();
+    if (writable_file_error_) {
+      ++num_writable_file_errors_;
+      return Status::IOError(fname, "fake error");
+    }
+    return target()->NewWritableFile(fname, result, soptions);
+  }
+};
+
+}  // namespace test
+}  // namespace rocksdb
diff --git a/util/vectorrep.cc b/util/vectorrep.cc
new file mode 100644 (file)
index 0000000..8d3ccc9
--- /dev/null
@@ -0,0 +1,249 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "rocksdb/memtablerep.h"
+
+#include <unordered_set>
+#include <set>
+#include <memory>
+#include <algorithm>
+#include <type_traits>
+
+#include "rocksdb/arena.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+#include "util/stl_wrappers.h"
+
+namespace rocksdb {
+namespace {
+
+using namespace stl_wrappers;
+
+// MemTableRep backed by an append-only std::vector of encoded keys.
+// Writes simply push_back under a write lock; the vector is sorted lazily,
+// the first time an iterator needs ordered access (see Iterator::DoSort).
+class VectorRep : public MemTableRep {
+ public:
+  VectorRep(const KeyComparator& compare, Arena* arena, size_t count);
+
+  // Insert key into the collection. (The caller will pack key and value into a
+  // single buffer and pass that in as the parameter to Insert)
+  // REQUIRES: nothing that compares equal to key is currently in the
+  // collection.
+  virtual void Insert(const char* key) override;
+
+  // Returns true iff an entry that compares equal to key is in the collection.
+  virtual bool Contains(const char* key) const override;
+
+  // Marks the rep immutable; after this, iterators may sort the shared
+  // bucket in place instead of copying it.
+  virtual void MarkReadOnly() override;
+
+  virtual size_t ApproximateMemoryUsage() override;
+
+  virtual ~VectorRep() override { }
+
+  class Iterator : public MemTableRep::Iterator {
+    // Owning rep when iterating an immutable memtable; nullptr when this
+    // iterator holds a private snapshot of the bucket.
+    class VectorRep* vrep_;
+    std::shared_ptr<std::vector<const char*>> bucket_;
+    typename std::vector<const char*>::const_iterator mutable cit_;
+    const KeyComparator& compare_;
+    // Tracks whether this iterator has observed the bucket in sorted order;
+    // mutable because positioning is done lazily from const methods.
+    bool mutable sorted_;
+    void DoSort() const;
+   public:
+    explicit Iterator(class VectorRep* vrep,
+      std::shared_ptr<std::vector<const char*>> bucket,
+      const KeyComparator& compare);
+
+    // Initialize an iterator over the specified collection.
+    // The returned iterator is not valid.
+    // explicit Iterator(const MemTableRep* collection);
+    virtual ~Iterator() override { };
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const override;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const override;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() override;
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() override;
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const char* target) override;
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() override;
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() override;
+  };
+
+  // Unhide default implementations of GetIterator()
+  using MemTableRep::GetIterator;
+
+  // Return an iterator over the keys in this representation.
+  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override;
+
+ private:
+  friend class Iterator;
+  typedef std::vector<const char*> Bucket;
+  std::shared_ptr<Bucket> bucket_;   // the keys; shared with iterators
+  mutable port::RWMutex rwlock_;     // guards bucket_/immutable_/sorted_
+  bool immutable_;                   // set once by MarkReadOnly()
+  bool sorted_;                      // true once the shared bucket is sorted
+  const KeyComparator& compare_;
+};
+
+// Appends the encoded key to the (unsorted) vector under the write lock.
+// Inserting a duplicate or inserting into an immutable rep is a programming
+// error, enforced only by asserts (no-ops in release builds).
+void VectorRep::Insert(const char* key) {
+  assert(!Contains(key));
+  WriteLock l(&rwlock_);
+  assert(!immutable_);
+  bucket_->push_back(key);
+}
+
+// Returns true iff an entry that compares equal to key is in the collection.
+// NOTE(review): the comparison here is raw pointer identity, not a key
+// comparison via compare_ — this matches its use from the Insert() assert,
+// where the same encoded-key buffer pointer would be handed in twice.
+bool VectorRep::Contains(const char* key) const {
+  ReadLock l(&rwlock_);
+  for (const char* entry : *bucket_) {
+    if (entry == key) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Flips the rep to immutable under the write lock. After this point no
+// further Insert() calls are allowed, and GetIterator() hands out the
+// shared bucket instead of a copy.
+void VectorRep::MarkReadOnly() {
+  WriteLock l(&rwlock_);
+  immutable_ = true;
+}
+
+// Rough estimate: the shared_ptr handle, the vector object itself, and one
+// stored pointer per entry. Does not count the key bytes (owned elsewhere)
+// or any excess vector capacity.
+size_t VectorRep::ApproximateMemoryUsage() {
+  const size_t per_entry = sizeof(Bucket::value_type);
+  return sizeof(bucket_) + sizeof(*bucket_) + bucket_->size() * per_entry;
+}
+
+// Pre-sizes the vector to `count` entries so the common insert path never
+// reallocates. `arena` is accepted for interface parity with other reps but
+// is not used here.
+VectorRep::VectorRep(const KeyComparator& compare, Arena* arena, size_t count)
+  : bucket_(new Bucket()),
+    immutable_(false),
+    sorted_(false),
+    compare_(compare) {
+  bucket_->reserve(count);
+}
+
+// vrep is non-null when iterating an immutable rep's shared bucket (DoSort
+// then synchronizes through the rep); nullptr when `bucket` is this
+// iterator's private snapshot. cit_ starts at end(), i.e. the iterator is
+// not Valid() until the first lazy sort repositions it.
+VectorRep::Iterator::Iterator(class VectorRep* vrep,
+                   std::shared_ptr<std::vector<const char*>> bucket,
+                   const KeyComparator& compare)
+: vrep_(vrep),
+  bucket_(bucket),
+  cit_(bucket_->end()),
+  compare_(compare),
+  sorted_(false) { }
+
+// Lazily sorts the bucket the first time ordered access is needed.
+// Two cases:
+//  * vrep_ != nullptr: the bucket is shared with an immutable rep. Sort
+//    under the rep's write lock and publish via vrep_->sorted_ so other
+//    iterators over the same rep skip the sort. If another iterator already
+//    sorted it, only this iterator's sorted_ flag is set (cit_ untouched).
+//  * vrep_ == nullptr: the bucket is a private snapshot (see GetIterator on
+//    a mutable rep); sort it without taking any lock.
+// When this call performs the sort, cit_ is reset to begin().
+void VectorRep::Iterator::DoSort() const {
+  // vrep is non-null means that we are working on an immutable memtable
+  if (!sorted_ && vrep_ != nullptr) {
+    WriteLock l(&vrep_->rwlock_);
+    if (!vrep_->sorted_) {
+      std::sort(bucket_->begin(), bucket_->end(), Compare(compare_));
+      cit_ = bucket_->begin();
+      vrep_->sorted_ = true;
+    }
+    sorted_ = true;
+  }
+  if (!sorted_) {
+    std::sort(bucket_->begin(), bucket_->end(), Compare(compare_));
+    cit_ = bucket_->begin();
+    sorted_ = true;
+  }
+  assert(sorted_);
+  assert(vrep_ == nullptr || vrep_->sorted_);
+}
+
+// Returns true iff the iterator is positioned at a valid node.
+// Triggers the lazy sort, so a freshly constructed iterator (cit_ == end())
+// whose call performs the sort ends up positioned at the first element.
+bool VectorRep::Iterator::Valid() const {
+  DoSort();
+  return cit_ != bucket_->end();
+}
+
+// Returns the key at the current position.
+// The returned pointer references the caller-packed key/value buffer that
+// was handed to Insert(); this iterator does not own it.
+// REQUIRES: Valid()
+const char* VectorRep::Iterator::key() const {
+  assert(Valid());
+  return *cit_;
+}
+
+// Advances to the next position.
+// Calling Next() on an already past-the-end iterator is a no-op, which keeps
+// release builds (where the assert vanishes) well-defined.
+// REQUIRES: Valid()
+void VectorRep::Iterator::Next() {
+  assert(Valid());
+  if (cit_ != bucket_->end()) {
+    ++cit_;
+  }
+}
+
+// Advances to the previous position.
+// Stepping back from the first element parks the iterator at past-the-end
+// (i.e. invalidates it), so the container can be treated circularly.
+// REQUIRES: Valid()
+void VectorRep::Iterator::Prev() {
+  assert(Valid());
+  // Vector iterators are random-access, so plain iterator arithmetic works.
+  cit_ = (cit_ == bucket_->begin()) ? bucket_->end() : cit_ - 1;
+}
+
+// Advance to the first entry with a key >= target
+void VectorRep::Iterator::Seek(const char* target) {
+  DoSort();
+  // Do binary search to find first value not less than the target
+  cit_ = std::equal_range(bucket_->begin(),
+                          bucket_->end(),
+                          target,
+                          [this] (const char* a, const char* b) {
+                            return compare_(a, b) < 0;
+                          }).first;
+}
+
+// Position at the first entry in collection.
+// Final state of iterator is Valid() iff collection is not empty.
+// Triggers the lazy sort so "first" means smallest per compare_.
+void VectorRep::Iterator::SeekToFirst() {
+  DoSort();
+  cit_ = bucket_->begin();
+}
+
+// Position at the last entry in collection.
+// Final state of iterator is Valid() iff collection is not empty.
+void VectorRep::Iterator::SeekToLast() {
+  DoSort();
+  if (bucket_->empty()) {
+    cit_ = bucket_->end();
+  } else {
+    // Random-access iterator: end() - 1 is the last element.
+    cit_ = bucket_->end() - 1;
+  }
+}
+
+// Returns an iterator over the keys.
+// Immutable rep: hand out the shared bucket; sorting is deferred to the
+// first positioning call and published back through the rep (see DoSort).
+// Mutable rep: snapshot the current contents into a private copy so that
+// concurrent inserts cannot invalidate the iterator; nullptr is passed as
+// the rep so DoSort sorts the copy without taking the lock.
+std::shared_ptr<MemTableRep::Iterator> VectorRep::GetIterator() {
+  ReadLock l(&rwlock_);
+  // Do not sort here. The sorting would be done the first time
+  // a Seek is performed on the iterator.
+  if (immutable_) {
+    return std::make_shared<Iterator>(this, bucket_, compare_);
+  } else {
+    std::shared_ptr<Bucket> tmp;
+    tmp.reset(new Bucket(*bucket_)); // make a copy
+    return std::make_shared<Iterator>(nullptr, tmp, compare_);
+  }
+}
+} // anon namespace
+
+// Factory hook: builds a VectorRep pre-sized to count_ entries.
+// `arena` is forwarded but unused by VectorRep.
+std::shared_ptr<MemTableRep> VectorRepFactory::CreateMemTableRep(
+  MemTableRep::KeyComparator& compare, Arena* arena) {
+  return std::make_shared<VectorRep>(compare, arena, count_);
+}
+} // namespace rocksdb
diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc
new file mode 100644 (file)
index 0000000..26bdd25
--- /dev/null
@@ -0,0 +1,912 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "utilities/backupable_db.h"
+#include "db/filename.h"
+#include "util/coding.h"
+#include "rocksdb/transaction_log.h"
+
+#define __STDC_FORMAT_MACROS
+
+#include <inttypes.h>
+#include <algorithm>
+#include <vector>
+#include <map>
+#include <string>
+#include <limits>
+#include <atomic>
+
+namespace rocksdb {
+
+// -------- BackupEngine class ---------
+// Manages a directory of backups of a rocksdb DB. Layout under
+// options_.backup_dir:
+//   shared/           — table files, refcounted across backups
+//   private/<id>/     — per-backup files (manifest, CURRENT, WALs)
+//   meta/<id>         — per-backup metadata file
+//   LATEST_BACKUP     — ASCII id of the newest installed backup
+// NOTE(review): no internal locking is visible here — presumably callers
+// serialize access except for StopBackup(); confirm before concurrent use.
+class BackupEngine {
+ public:
+  BackupEngine(Env* db_env, const BackupableDBOptions& options);
+  ~BackupEngine();
+  Status CreateNewBackup(DB* db, bool flush_before_backup = false);
+  Status PurgeOldBackups(uint32_t num_backups_to_keep);
+  Status DeleteBackup(BackupID backup_id);
+  // May be called from another thread; CopyFile polls this flag.
+  void StopBackup() {
+    stop_backup_.store(true, std::memory_order_release);
+  }
+
+  void GetBackupInfo(std::vector<BackupInfo>* backup_info);
+  Status RestoreDBFromBackup(BackupID backup_id, const std::string &db_dir,
+                             const std::string &wal_dir);
+  Status RestoreDBFromLatestBackup(const std::string &db_dir,
+                                   const std::string &wal_dir) {
+    return RestoreDBFromBackup(latest_backup_id_, db_dir, wal_dir);
+  }
+
+  void DeleteBackupsNewerThan(uint64_t sequence_number);
+
+ private:
+  // In-memory + on-disk metadata for one backup: timestamp, approximate
+  // sequence number, total size, and the list of (relative) file paths.
+  // Shares the engine-wide file refcount map via file_refs_.
+  class BackupMeta {
+   public:
+    BackupMeta(const std::string& meta_filename,
+        std::unordered_map<std::string, int>* file_refs, Env* env)
+      : timestamp_(0), size_(0), meta_filename_(meta_filename),
+        file_refs_(file_refs), env_(env) {}
+
+    ~BackupMeta() {}
+
+    void RecordTimestamp() {
+      env_->GetCurrentTime(&timestamp_);
+    }
+    int64_t GetTimestamp() const {
+      return timestamp_;
+    }
+    uint64_t GetSize() const {
+      return size_;
+    }
+    void SetSequenceNumber(uint64_t sequence_number) {
+      sequence_number_ = sequence_number;
+    }
+    uint64_t GetSequenceNumber() {
+      return sequence_number_;
+    }
+
+    void AddFile(const std::string& filename, uint64_t size);
+    void Delete();
+
+    bool Empty() {
+      return files_.empty();
+    }
+
+    const std::vector<std::string>& GetFiles() {
+      return files_;
+    }
+
+    Status LoadFromFile(const std::string& backup_dir);
+    Status StoreToFile(bool sync);
+
+   private:
+    int64_t timestamp_;
+    // sequence number is only approximate, should not be used
+    // by clients
+    uint64_t sequence_number_;
+    uint64_t size_;
+    std::string const meta_filename_;
+    // files with relative paths (without "/" prefix!!)
+    std::vector<std::string> files_;
+    std::unordered_map<std::string, int>* file_refs_;
+    Env* env_;
+
+    static const size_t max_backup_meta_file_size_ = 10 * 1024 * 1024; // 10MB
+  }; // BackupMeta
+
+  // Path helpers. "Rel" variants return paths relative to backup_dir; the
+  // "tmp" flag selects the staging name used before an atomic rename.
+  inline std::string GetAbsolutePath(
+      const std::string &relative_path = "") const {
+    assert(relative_path.size() == 0 || relative_path[0] != '/');
+    return options_.backup_dir + "/" + relative_path;
+  }
+  inline std::string GetPrivateDirRel() const {
+    return "private";
+  }
+  inline std::string GetPrivateFileRel(BackupID backup_id,
+                                       bool tmp = false,
+                                       const std::string& file = "") const {
+    assert(file.size() == 0 || file[0] != '/');
+    return GetPrivateDirRel() + "/" + std::to_string(backup_id) +
+           (tmp ? ".tmp" : "") + "/" + file;
+  }
+  inline std::string GetSharedFileRel(const std::string& file = "",
+                                      bool tmp = false) const {
+    assert(file.size() == 0 || file[0] != '/');
+    return "shared/" + file + (tmp ? ".tmp" : "");
+  }
+  inline std::string GetLatestBackupFile(bool tmp = false) const {
+    return GetAbsolutePath(std::string("LATEST_BACKUP") + (tmp ? ".tmp" : ""));
+  }
+  inline std::string GetBackupMetaDir() const {
+    return GetAbsolutePath("meta");
+  }
+  inline std::string GetBackupMetaFile(BackupID backup_id) const {
+    return GetBackupMetaDir() + "/" + std::to_string(backup_id);
+  }
+
+  Status GetLatestBackupFileContents(uint32_t* latest_backup);
+  Status PutLatestBackupFileContents(uint32_t latest_backup);
+  // if size_limit == 0, there is no size limit, copy everything
+  Status CopyFile(const std::string& src,
+                  const std::string& dst,
+                  Env* src_env,
+                  Env* dst_env,
+                  bool sync,
+                  uint64_t* size = nullptr,
+                  uint64_t size_limit = 0);
+  // if size_limit == 0, there is no size limit, copy everything
+  Status BackupFile(BackupID backup_id,
+                    BackupMeta* backup,
+                    bool shared,
+                    const std::string& src_dir,
+                    const std::string& src_fname, // starts with "/"
+                    uint64_t size_limit = 0);
+  // Will delete all the files we don't need anymore
+  // If full_scan == true, it will do the full scan of files/ directory
+  // and delete all the files that are not referenced from backuped_file_refs_
+  void GarbageCollection(bool full_scan);
+
+  // backup state data
+  BackupID latest_backup_id_;
+  std::map<BackupID, BackupMeta> backups_;
+  std::unordered_map<std::string, int> backuped_file_refs_;
+  std::vector<BackupID> obsolete_backups_;
+  std::atomic<bool> stop_backup_;
+
+  // options data
+  BackupableDBOptions options_;
+  Env* db_env_;
+  Env* backup_env_;
+
+  static const size_t copy_file_buffer_size_ = 5 * 1024 * 1024LL; // 5MB
+};
+
+// Opens (or initializes) the backup directory: creates the directory layout,
+// scans meta/ to rebuild the backups_ map, loads or destroys existing backup
+// metadata per options, reconciles LATEST_BACKUP with what actually exists,
+// deletes backups claiming to be newer than latest, and garbage-collects.
+BackupEngine::BackupEngine(Env* db_env, const BackupableDBOptions& options)
+    : stop_backup_(false),
+      options_(options),
+      db_env_(db_env),
+      backup_env_(options.backup_env != nullptr ? options.backup_env
+                                                : db_env_) {
+
+  // create all the dirs we need
+  backup_env_->CreateDirIfMissing(GetAbsolutePath());
+  if (options_.share_table_files) {
+    backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel()));
+  }
+  backup_env_->CreateDirIfMissing(GetAbsolutePath(GetPrivateDirRel()));
+  backup_env_->CreateDirIfMissing(GetBackupMetaDir());
+
+  std::vector<std::string> backup_meta_files;
+  backup_env_->GetChildren(GetBackupMetaDir(), &backup_meta_files);
+  // create backups_ structure
+  for (auto& file : backup_meta_files) {
+    BackupID backup_id = 0;
+    sscanf(file.c_str(), "%u", &backup_id);
+    // Round-trip check: the file name must be exactly the decimal id
+    // (rejects "12foo", "012", and non-numeric names).
+    if (backup_id == 0 || file != std::to_string(backup_id)) {
+      // invalid file name, delete that
+      backup_env_->DeleteFile(GetBackupMetaDir() + "/" + file);
+      continue;
+    }
+    assert(backups_.find(backup_id) == backups_.end());
+    backups_.insert(std::make_pair(
+        backup_id, BackupMeta(GetBackupMetaFile(backup_id),
+                              &backuped_file_refs_, backup_env_)));
+  }
+
+  if (options_.destroy_old_data) { // Destroy old data
+    for (auto& backup : backups_) {
+      backup.second.Delete();
+      obsolete_backups_.push_back(backup.first);
+    }
+    backups_.clear();
+    // start from beginning
+    latest_backup_id_ = 0;
+    // GarbageCollection() will do the actual deletion
+  } else { // Load data from storage
+    // load the backups if any
+    for (auto& backup : backups_) {
+      Status s = backup.second.LoadFromFile(options_.backup_dir);
+      if (!s.ok()) {
+        Log(options_.info_log, "Backup %u corrupted - deleting -- %s",
+            backup.first, s.ToString().c_str());
+        backup.second.Delete();
+        obsolete_backups_.push_back(backup.first);
+      }
+    }
+    // delete obsolete backups from the structure
+    for (auto ob : obsolete_backups_) {
+      backups_.erase(ob);
+    }
+
+    Status s = GetLatestBackupFileContents(&latest_backup_id_);
+    // If latest backup file is corrupted or non-existent
+    // set latest backup as the biggest backup we have
+    // or 0 if we have no backups
+    if (!s.ok() ||
+        backups_.find(latest_backup_id_) == backups_.end()) {
+      auto itr = backups_.end();
+      latest_backup_id_ = (itr == backups_.begin()) ? 0 : (--itr)->first;
+    }
+  }
+
+  // delete any backups that claim to be later than latest
+  for (auto itr = backups_.upper_bound(latest_backup_id_);
+       itr != backups_.end();) {
+    itr->second.Delete();
+    obsolete_backups_.push_back(itr->first);
+    itr = backups_.erase(itr);
+  }
+
+  PutLatestBackupFileContents(latest_backup_id_); // Ignore errors
+  GarbageCollection(true);
+  Log(options_.info_log,
+      "Initialized BackupEngine, the latest backup is %u.",
+      latest_backup_id_);
+}
+
+// Flush any buffered log messages before the engine (and possibly the
+// logger's owner) goes away.
+BackupEngine::~BackupEngine() {
+  LogFlush(options_.info_log);
+}
+
+void BackupEngine::DeleteBackupsNewerThan(uint64_t sequence_number) {
+  for (auto backup : backups_) {
+    if (backup.second.GetSequenceNumber() > sequence_number) {
+      Log(options_.info_log,
+          "Deleting backup %u because sequence number (%" PRIu64
+          ") is newer than %" PRIu64 "",
+          backup.first, backup.second.GetSequenceNumber(), sequence_number);
+      backup.second.Delete();
+      obsolete_backups_.push_back(backup.first);
+    }
+  }
+  for (auto ob : obsolete_backups_) {
+    backups_.erase(backups_.find(ob));
+  }
+  auto itr = backups_.end();
+  latest_backup_id_ = (itr == backups_.begin()) ? 0 : (--itr)->first;
+  PutLatestBackupFileContents(latest_backup_id_); // Ignore errors
+  GarbageCollection(false);
+}
+
+// Creates backup latest_backup_id_ + 1 from a live DB:
+//  1. disable file deletions and collect live files (+ WALs if not flushed),
+//  2. copy everything into private/<id>.tmp (table files into shared/ if
+//     sharing is enabled),
+//  3. rename the tmp dir, persist the meta file, then atomically install the
+//     new id in LATEST_BACKUP.
+// On any failure the partially-written backup is garbage-collected.
+Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) {
+  Status s;
+  std::vector<std::string> live_files;
+  VectorLogPtr live_wal_files;
+  uint64_t manifest_file_size = 0;
+  uint64_t sequence_number = db->GetLatestSequenceNumber();
+
+  s = db->DisableFileDeletions();
+  if (s.ok()) {
+    // this will return live_files prefixed with "/"
+    s = db->GetLiveFiles(live_files, &manifest_file_size, flush_before_backup);
+  }
+  // if we didn't flush before backup, we need to also get WAL files
+  if (s.ok() && !flush_before_backup) {
+    // returns file names prefixed with "/"
+    s = db->GetSortedWalFiles(live_wal_files);
+  }
+  if (!s.ok()) {
+    db->EnableFileDeletions();
+    return s;
+  }
+
+  BackupID new_backup_id = latest_backup_id_ + 1;
+  assert(backups_.find(new_backup_id) == backups_.end());
+  auto ret = backups_.insert(std::make_pair(
+      new_backup_id, BackupMeta(GetBackupMetaFile(new_backup_id),
+                                &backuped_file_refs_, backup_env_)));
+  assert(ret.second == true);
+  auto& new_backup = ret.first->second;
+  new_backup.RecordTimestamp();
+  new_backup.SetSequenceNumber(sequence_number);
+
+  Log(options_.info_log, "Started the backup process -- creating backup %u",
+      new_backup_id);
+
+  // create temporary private dir
+  s = backup_env_->CreateDir(
+      GetAbsolutePath(GetPrivateFileRel(new_backup_id, true)));
+
+  // copy live_files
+  for (size_t i = 0; s.ok() && i < live_files.size(); ++i) {
+    uint64_t number;
+    FileType type;
+    bool ok = ParseFileName(live_files[i], &number, &type);
+    if (!ok) {
+      // NOTE(review): early return here leaves file deletions disabled and
+      // the half-built backup in backups_ — acceptable only because this
+      // path is considered unreachable (assert above). Verify.
+      assert(false);
+      return Status::Corruption("Can't parse file name. This is very bad");
+    }
+    // we should only get sst, manifest and current files here
+    assert(type == kTableFile ||
+             type == kDescriptorFile ||
+             type == kCurrentFile);
+
+    // rules:
+    // * if it's kTableFile, than it's shared
+    // * if it's kDescriptorFile, limit the size to manifest_file_size
+    s = BackupFile(new_backup_id,
+                   &new_backup,
+                   options_.share_table_files && type == kTableFile,
+                   db->GetName(),            /* src_dir */
+                   live_files[i],            /* src_fname */
+                   (type == kDescriptorFile) ? manifest_file_size : 0);
+  }
+
+  // copy WAL files
+  for (size_t i = 0; s.ok() && i < live_wal_files.size(); ++i) {
+    if (live_wal_files[i]->Type() == kAliveLogFile) {
+      // we only care about live log files
+      // copy the file into backup_dir/files/<new backup>/
+      s = BackupFile(new_backup_id,
+                     &new_backup,
+                     false, /* not shared */
+                     db->GetOptions().wal_dir,
+                     live_wal_files[i]->PathName());
+    }
+  }
+
+  // we copied all the files, enable file deletions
+  db->EnableFileDeletions();
+
+  if (s.ok()) {
+    // move tmp private backup to real backup folder
+    s = backup_env_->RenameFile(
+        GetAbsolutePath(GetPrivateFileRel(new_backup_id, true)), // tmp
+        GetAbsolutePath(GetPrivateFileRel(new_backup_id, false)));
+  }
+
+  if (s.ok()) {
+    // persist the backup metadata on the disk
+    s = new_backup.StoreToFile(options_.sync);
+  }
+  if (s.ok()) {
+    // install the newly created backup meta! (atomic)
+    s = PutLatestBackupFileContents(new_backup_id);
+  }
+  if (!s.ok()) {
+    // clean all the files we might have created
+    Log(options_.info_log, "Backup failed -- %s", s.ToString().c_str());
+    backups_.erase(new_backup_id);
+    GarbageCollection(true);
+    return s;
+  }
+
+  // here we know that we succeeded and installed the new backup
+  // in the LATEST_BACKUP file
+  latest_backup_id_ = new_backup_id;
+  Log(options_.info_log, "Backup DONE. All is good");
+  return s;
+}
+
+// Deletes the oldest backups (backups_ is ordered by id, so begin() is the
+// oldest) until only num_backups_to_keep remain, then garbage-collects.
+Status BackupEngine::PurgeOldBackups(uint32_t num_backups_to_keep) {
+  Log(options_.info_log, "Purging old backups, keeping %u",
+      num_backups_to_keep);
+  while (backups_.size() > num_backups_to_keep) {
+    auto oldest = backups_.begin();
+    Log(options_.info_log, "Deleting backup %u", oldest->first);
+    oldest->second.Delete();
+    obsolete_backups_.push_back(oldest->first);
+    backups_.erase(oldest);
+  }
+  GarbageCollection(false);
+  return Status::OK();
+}
+
+// Deletes one backup by id: drops its file references, queues it for
+// garbage collection, and removes it from the in-memory map.
+// Returns NotFound if no such backup exists.
+Status BackupEngine::DeleteBackup(BackupID backup_id) {
+  Log(options_.info_log, "Deleting backup %u", backup_id);
+  auto itr = backups_.find(backup_id);
+  if (itr == backups_.end()) {
+    return Status::NotFound("Backup not found");
+  }
+  itr->second.Delete();
+  obsolete_backups_.push_back(backup_id);
+  backups_.erase(itr);
+  GarbageCollection(false);
+  return Status::OK();
+}
+
+// Appends one BackupInfo (id, timestamp, total size) per non-empty backup,
+// in ascending id order. Empty (deleted-but-not-yet-erased) entries are
+// skipped.
+void BackupEngine::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
+  backup_info->reserve(backups_.size());
+  for (auto& entry : backups_) {
+    BackupMeta& meta = entry.second;
+    if (meta.Empty()) {
+      continue;
+    }
+    backup_info->push_back(
+        BackupInfo(entry.first, meta.GetTimestamp(), meta.GetSize()));
+  }
+}
+
+// Restores the given backup into db_dir/wal_dir: wipes both target
+// directories (so stale WALs can't be replayed into the restored DB), then
+// copies every file listed in the backup's metadata, routing kLogFile to
+// wal_dir and everything else to db_dir. Stops at the first copy failure.
+Status BackupEngine::RestoreDBFromBackup(BackupID backup_id,
+                                         const std::string &db_dir,
+                                         const std::string &wal_dir) {
+  auto backup_itr = backups_.find(backup_id);
+  if (backup_itr == backups_.end()) {
+    return Status::NotFound("Backup not found");
+  }
+  auto& backup = backup_itr->second;
+  if (backup.Empty()) {
+    return Status::NotFound("Backup not found");
+  }
+
+  Log(options_.info_log, "Restoring backup id %u\n", backup_id);
+
+  // just in case. Ignore errors
+  db_env_->CreateDirIfMissing(db_dir);
+  db_env_->CreateDirIfMissing(wal_dir);
+
+  // delete log files that might have been already in wal_dir.
+  // This is important since they might get replayed to the restored DB,
+  // which will then differ from the backuped DB
+  std::vector<std::string> delete_children;
+  db_env_->GetChildren(wal_dir, &delete_children); // ignore errors
+  for (auto f : delete_children) {
+    db_env_->DeleteFile(wal_dir + "/" + f); // ignore errors
+  }
+  // Also delete all the db_dir children. This is not so important
+  // because obsolete files will be deleted by DBImpl::PurgeObsoleteFiles()
+  delete_children.clear();
+  db_env_->GetChildren(db_dir, &delete_children); // ignore errors
+  for (auto f : delete_children) {
+    db_env_->DeleteFile(db_dir + "/" + f); // ignore errors
+  }
+
+  Status s;
+  for (auto& file : backup.GetFiles()) {
+    std::string dst;
+    // 1. extract the filename
+    size_t slash = file.find_last_of('/');
+    // file will either be shared/<file> or private/<number>/<file>
+    assert(slash != std::string::npos);
+    dst = file.substr(slash + 1);
+
+    // 2. find the filetype
+    uint64_t number;
+    FileType type;
+    bool ok = ParseFileName(dst, &number, &type);
+    if (!ok) {
+      return Status::Corruption("Backup corrupted");
+    }
+    // 3. Construct the final path
+    // kLogFile lives in wal_dir and all the rest live in db_dir
+    dst = ((type == kLogFile) ? wal_dir : db_dir) +
+      "/" + dst;
+
+    Log(options_.info_log, "Restoring %s to %s\n", file.c_str(), dst.c_str());
+    s = CopyFile(GetAbsolutePath(file), dst, backup_env_, db_env_, false);
+    if (!s.ok()) {
+      break;
+    }
+  }
+
+  Log(options_.info_log, "Restoring done -- %s\n", s.ToString().c_str());
+  return s;
+}
+
+// latest backup id is an ASCII representation of latest backup id
+Status BackupEngine::GetLatestBackupFileContents(uint32_t* latest_backup) {
+  Status s;
+  unique_ptr<SequentialFile> file;
+  s = backup_env_->NewSequentialFile(GetLatestBackupFile(),
+                                     &file,
+                                     EnvOptions());
+  if (!s.ok()) {
+    return s;
+  }
+
+  char buf[11];
+  Slice data;
+  s = file->Read(10, &data, buf);
+  if (!s.ok() || data.size() == 0) {
+    return s.ok() ? Status::Corruption("Latest backup file corrupted") : s;
+  }
+  buf[data.size()] = 0;
+
+  *latest_backup = 0;
+  sscanf(data.data(), "%u", latest_backup);
+  if (backup_env_->FileExists(GetBackupMetaFile(*latest_backup)) == false) {
+    s = Status::Corruption("Latest backup file corrupted");
+  }
+  return Status::OK();
+}
+
+// this operation HAS to be atomic
+// writing 4 bytes to the file is atomic alright, but we should *never*
+// do something like 1. delete file, 2. write new file
+// We write to a tmp file and then atomically rename
+Status BackupEngine::PutLatestBackupFileContents(uint32_t latest_backup) {
+  Status s;
+  unique_ptr<WritableFile> file;
+  EnvOptions env_options;
+  env_options.use_mmap_writes = false;
+  s = backup_env_->NewWritableFile(GetLatestBackupFile(true),
+                                   &file,
+                                   env_options);
+  if (!s.ok()) {
+    backup_env_->DeleteFile(GetLatestBackupFile(true));
+    return s;
+  }
+
+  char file_contents[10];
+  int len = sprintf(file_contents, "%u\n", latest_backup);
+  s = file->Append(Slice(file_contents, len));
+  if (s.ok() && options_.sync) {
+    file->Sync();
+  }
+  if (s.ok()) {
+    s = file->Close();
+  }
+  if (s.ok()) {
+    // atomically replace real file with new tmp
+    s = backup_env_->RenameFile(GetLatestBackupFile(true),
+                                GetLatestBackupFile(false));
+  }
+  return s;
+}
+
+// Copies src (read via src_env) to dst (written via dst_env) in 5MB chunks,
+// up to size_limit bytes (0 means unlimited). Honors StopBackup() between
+// chunks, optionally Syncs the destination, and reports the number of bytes
+// copied through *size when non-null. The destination file is not Closed
+// here; it is flushed/closed when dst_file goes out of scope.
+Status BackupEngine::CopyFile(const std::string& src,
+                              const std::string& dst,
+                              Env* src_env,
+                              Env* dst_env,
+                              bool sync,
+                              uint64_t* size,
+                              uint64_t size_limit) {
+  Status s;
+  unique_ptr<WritableFile> dst_file;
+  unique_ptr<SequentialFile> src_file;
+  EnvOptions env_options;
+  env_options.use_mmap_writes = false;
+  if (size != nullptr) {
+    *size = 0;
+  }
+
+  // Check if size limit is set. if not, set it to very big number
+  if (size_limit == 0) {
+    size_limit = std::numeric_limits<uint64_t>::max();
+  }
+
+  s = src_env->NewSequentialFile(src, &src_file, env_options);
+  if (s.ok()) {
+    s = dst_env->NewWritableFile(dst, &dst_file, env_options);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  unique_ptr<char[]> buf(new char[copy_file_buffer_size_]);
+  Slice data;
+
+  do {
+    if (stop_backup_.load(std::memory_order_acquire)) {
+      return Status::Incomplete("Backup stopped");
+    }
+    // Never read past the remaining size_limit, so a truncated copy (e.g.
+    // MANIFEST capped at manifest_file_size) stops at exactly the limit.
+    size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ?
+      copy_file_buffer_size_ : size_limit;
+    s = src_file->Read(buffer_to_read, &data, buf.get());
+    size_limit -= data.size();
+    if (size != nullptr) {
+      *size += data.size();
+    }
+    if (s.ok()) {
+      s = dst_file->Append(data);
+    }
+  } while (s.ok() && data.size() > 0 && size_limit > 0);
+
+  if (s.ok() && sync) {
+    s = dst_file->Sync();
+  }
+
+  return s;
+}
+
+// src_fname will always start with "/"
+Status BackupEngine::BackupFile(BackupID backup_id,
+                                BackupMeta* backup,
+                                bool shared,
+                                const std::string& src_dir,
+                                const std::string& src_fname,
+                                uint64_t size_limit) {
+
+  assert(src_fname.size() > 0 && src_fname[0] == '/');
+  std::string dst_relative = src_fname.substr(1);
+  std::string dst_relative_tmp;
+  if (shared) {
+    dst_relative_tmp = GetSharedFileRel(dst_relative, true);
+    dst_relative = GetSharedFileRel(dst_relative, false);
+  } else {
+    dst_relative_tmp = GetPrivateFileRel(backup_id, true, dst_relative);
+    dst_relative = GetPrivateFileRel(backup_id, false, dst_relative);
+  }
+  std::string dst_path = GetAbsolutePath(dst_relative);
+  std::string dst_path_tmp = GetAbsolutePath(dst_relative_tmp);
+  Status s;
+  uint64_t size;
+
+  // if it's shared, we also need to check if it exists -- if it does,
+  // no need to copy it again
+  if (shared && backup_env_->FileExists(dst_path)) {
+    backup_env_->GetFileSize(dst_path, &size); // Ignore error
+    Log(options_.info_log, "%s already present", src_fname.c_str());
+  } else {
+    Log(options_.info_log, "Copying %s", src_fname.c_str());
+    s = CopyFile(src_dir + src_fname,
+                 dst_path_tmp,
+                 db_env_,
+                 backup_env_,
+                 options_.sync,
+                 &size,
+                 size_limit);
+    if (s.ok() && shared) {
+      s = backup_env_->RenameFile(dst_path_tmp, dst_path);
+    }
+  }
+  if (s.ok()) {
+    backup->AddFile(dst_relative, size);
+  }
+  return s;
+}
+
+// Deletes files that no live backup references:
+//  1. any file whose refcount in backuped_file_refs_ dropped to 0,
+//  2. (when !full_scan) the private dirs of backups queued in
+//     obsolete_backups_,
+//  3. (when full_scan) everything in shared/ and private/ that is not
+//     referenced by a live backup, including stray .tmp dirs.
+// Always clears obsolete_backups_. All deletion errors are logged, never
+// propagated — missing files are the expected case on retry.
+void BackupEngine::GarbageCollection(bool full_scan) {
+  Log(options_.info_log, "Starting garbage collection");
+  std::vector<std::string> to_delete;
+  for (auto& itr : backuped_file_refs_) {
+    if (itr.second == 0) {
+      Status s = backup_env_->DeleteFile(GetAbsolutePath(itr.first));
+      Log(options_.info_log, "Deleting %s -- %s", itr.first.c_str(),
+          s.ToString().c_str());
+      to_delete.push_back(itr.first);
+    }
+  }
+  // Erase outside the loop above: erasing while iterating the map would
+  // invalidate the iterator.
+  for (auto& td : to_delete) {
+    backuped_file_refs_.erase(td);
+  }
+  if (!full_scan) {
+    // take care of private dirs -- if full_scan == true, then full_scan will
+    // take care of them
+    for (auto backup_id : obsolete_backups_) {
+      std::string private_dir = GetPrivateFileRel(backup_id);
+      Status s = backup_env_->DeleteDir(GetAbsolutePath(private_dir));
+      Log(options_.info_log, "Deleting private dir %s -- %s",
+          private_dir.c_str(), s.ToString().c_str());
+    }
+  }
+  obsolete_backups_.clear();
+
+  if (full_scan) {
+    Log(options_.info_log, "Starting full scan garbage collection");
+    // delete obsolete shared files
+    std::vector<std::string> shared_children;
+    backup_env_->GetChildren(GetAbsolutePath(GetSharedFileRel()),
+                             &shared_children);
+    for (auto& child : shared_children) {
+      std::string rel_fname = GetSharedFileRel(child);
+      // if it's not refcounted, delete it
+      if (backuped_file_refs_.find(rel_fname) == backuped_file_refs_.end()) {
+        // this might be a directory, but DeleteFile will just fail in that
+        // case, so we're good
+        Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname));
+        if (s.ok()) {
+          Log(options_.info_log, "Deleted %s", rel_fname.c_str());
+        }
+      }
+    }
+
+    // delete obsolete private files
+    std::vector<std::string> private_children;
+    backup_env_->GetChildren(GetAbsolutePath(GetPrivateDirRel()),
+                             &private_children);
+    for (auto& child : private_children) {
+      BackupID backup_id = 0;
+      bool tmp_dir = child.find(".tmp") != std::string::npos;
+      sscanf(child.c_str(), "%u", &backup_id);
+      if (!tmp_dir && // if it's tmp_dir, delete it
+          (backup_id == 0 || backups_.find(backup_id) != backups_.end())) {
+        // it's either not a number or it's still alive. continue
+        continue;
+      }
+      // here we have to delete the dir and all its children
+      std::string full_private_path =
+          GetAbsolutePath(GetPrivateFileRel(backup_id, tmp_dir));
+      std::vector<std::string> subchildren;
+      backup_env_->GetChildren(full_private_path, &subchildren);
+      for (auto& subchild : subchildren) {
+        Status s = backup_env_->DeleteFile(full_private_path + subchild);
+        if (s.ok()) {
+          Log(options_.info_log, "Deleted %s",
+              (full_private_path + subchild).c_str());
+        }
+      }
+      // finally delete the private dir
+      Status s = backup_env_->DeleteDir(full_private_path);
+      Log(options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(),
+          s.ToString().c_str());
+    }
+  }
+}
+
+// ------- BackupMeta class --------
+
+// Records 'filename' (of 'size' bytes) as part of this backup and bumps the
+// shared refcount map so garbage collection knows the file is still in use.
+void BackupEngine::BackupMeta::AddFile(const std::string& filename,
+                                       uint64_t size) {
+  size_ += size;
+  files_.push_back(filename);
+  auto itr = file_refs_->find(filename);
+  if (itr == file_refs_->end()) {
+    file_refs_->insert(std::make_pair(filename, 1));
+  } else {
+    ++itr->second; // increase refcount if already present
+  }
+}
+
+// Drops this backup: decrements the refcount of every file it referenced
+// (actual file deletion happens later in GarbageCollection) and removes the
+// meta file. The DeleteFile status is intentionally best-effort.
+void BackupEngine::BackupMeta::Delete() {
+  for (auto& file : files_) {
+    auto itr = file_refs_->find(file);
+    assert(itr != file_refs_->end());
+    --(itr->second); // decrease refcount
+  }
+  files_.clear();
+  // delete meta file
+  env_->DeleteFile(meta_filename_);
+  timestamp_ = 0;
+}
+
+// each backup meta file is of the format:
+// <timestamp>
+// <seq number>
+// <number of files>
+// <file1>
+// <file2>
+// ...
+// TODO: maybe add checksum?
+Status BackupEngine::BackupMeta::LoadFromFile(const std::string& backup_dir) {
+  assert(Empty());
+  Status s;
+  unique_ptr<SequentialFile> backup_meta_file;
+  s = env_->NewSequentialFile(meta_filename_, &backup_meta_file, EnvOptions());
+  if (!s.ok()) {
+    return s;
+  }
+
+  unique_ptr<char[]> buf(new char[max_backup_meta_file_size_ + 1]);
+  Slice data;
+  s = backup_meta_file->Read(max_backup_meta_file_size_, &data, buf.get());
+
+  if (!s.ok() || data.size() == max_backup_meta_file_size_) {
+    return s.ok() ? Status::IOError("File size too big") : s;
+  }
+  buf[data.size()] = 0;
+
+  uint32_t num_files = 0;
+  int bytes_read = 0;
+  sscanf(data.data(), "%" PRId64 "%n", &timestamp_, &bytes_read);
+  data.remove_prefix(bytes_read + 1); // +1 for '\n'
+  sscanf(data.data(), "%" PRIu64 "%n", &sequence_number_, &bytes_read);
+  data.remove_prefix(bytes_read + 1); // +1 for '\n'
+  sscanf(data.data(), "%u%n", &num_files, &bytes_read);
+  data.remove_prefix(bytes_read + 1); // +1 for '\n'
+
+  std::vector<std::pair<std::string, uint64_t>> files;
+
+  for (uint32_t i = 0; s.ok() && i < num_files; ++i) {
+    std::string filename = GetSliceUntil(&data, '\n').ToString();
+    uint64_t size;
+    s = env_->GetFileSize(backup_dir + "/" + filename, &size);
+    files.push_back(std::make_pair(filename, size));
+  }
+
+  if (s.ok()) {
+    for (auto file : files) {
+      AddFile(file.first, file.second);
+    }
+  }
+
+  return s;
+}
+
+Status BackupEngine::BackupMeta::StoreToFile(bool sync) {
+  Status s;
+  unique_ptr<WritableFile> backup_meta_file;
+  EnvOptions env_options;
+  env_options.use_mmap_writes = false;
+  s = env_->NewWritableFile(meta_filename_ + ".tmp", &backup_meta_file,
+                            env_options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  unique_ptr<char[]> buf(new char[max_backup_meta_file_size_]);
+  int len = 0, buf_size = max_backup_meta_file_size_;
+  len += snprintf(buf.get(), buf_size, "%" PRId64 "\n", timestamp_);
+  len += snprintf(buf.get() + len, buf_size - len, "%" PRIu64 "\n",
+                  sequence_number_);
+  len += snprintf(buf.get() + len, buf_size - len, "%zu\n", files_.size());
+  for (size_t i = 0; i < files_.size(); ++i) {
+    len += snprintf(buf.get() + len, buf_size - len, "%s\n", files_[i].c_str());
+  }
+
+  s = backup_meta_file->Append(Slice(buf.get(), (size_t)len));
+  if (s.ok() && sync) {
+    s = backup_meta_file->Sync();
+  }
+  if (s.ok()) {
+    s = backup_meta_file->Close();
+  }
+  if (s.ok()) {
+    s = env_->RenameFile(meta_filename_ + ".tmp", meta_filename_);
+  }
+  return s;
+}
+
+// --- BackupableDB methods --------
+
+// Wraps 'db' with backup capability; owns a BackupEngine running on the
+// db's own Env.
+BackupableDB::BackupableDB(DB* db, const BackupableDBOptions& options)
+    : StackableDB(db), backup_engine_(new BackupEngine(db->GetEnv(), options)) {
+  if (options.share_table_files) {
+    // with shared table files, backups taken "in the future" relative to
+    // this DB's state would corrupt sharing -- drop them up front
+    backup_engine_->DeleteBackupsNewerThan(GetLatestSequenceNumber());
+  }
+}
+
+BackupableDB::~BackupableDB() {
+  delete backup_engine_;
+}
+
+// The methods below simply forward to the owned BackupEngine.
+Status BackupableDB::CreateNewBackup(bool flush_before_backup) {
+  return backup_engine_->CreateNewBackup(this, flush_before_backup);
+}
+
+void BackupableDB::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
+  backup_engine_->GetBackupInfo(backup_info);
+}
+
+Status BackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) {
+  return backup_engine_->PurgeOldBackups(num_backups_to_keep);
+}
+
+Status BackupableDB::DeleteBackup(BackupID backup_id) {
+  return backup_engine_->DeleteBackup(backup_id);
+}
+
+void BackupableDB::StopBackup() {
+  backup_engine_->StopBackup();
+}
+
+// --- RestoreBackupableDB methods ------
+
+// Restore-only handle: owns a BackupEngine but no DB; all methods forward
+// to the engine.
+RestoreBackupableDB::RestoreBackupableDB(Env* db_env,
+                                         const BackupableDBOptions& options)
+    : backup_engine_(new BackupEngine(db_env, options)) {}
+
+RestoreBackupableDB::~RestoreBackupableDB() {
+  delete backup_engine_;
+}
+
+void
+RestoreBackupableDB::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
+  backup_engine_->GetBackupInfo(backup_info);
+}
+
+Status RestoreBackupableDB::RestoreDBFromBackup(BackupID backup_id,
+                                                const std::string& db_dir,
+                                                const std::string& wal_dir) {
+  return backup_engine_->RestoreDBFromBackup(backup_id, db_dir, wal_dir);
+}
+
+Status
+RestoreBackupableDB::RestoreDBFromLatestBackup(const std::string& db_dir,
+                                               const std::string& wal_dir) {
+  return backup_engine_->RestoreDBFromLatestBackup(db_dir, wal_dir);
+}
+
+Status RestoreBackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) {
+  return backup_engine_->PurgeOldBackups(num_backups_to_keep);
+}
+
+Status RestoreBackupableDB::DeleteBackup(BackupID backup_id) {
+  return backup_engine_->DeleteBackup(backup_id);
+}
+
+}  // namespace rocksdb
diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc
new file mode 100644 (file)
index 0000000..de24055
--- /dev/null
@@ -0,0 +1,702 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/types.h"
+#include "rocksdb/transaction_log.h"
+#include "utilities/utility_db.h"
+#include "utilities/backupable_db.h"
+#include "util/testharness.h"
+#include "util/random.h"
+#include "util/testutil.h"
+#include "util/auto_roll_logger.h"
+
+#include <string>
+#include <algorithm>
+
+namespace rocksdb {
+
+namespace {
+
+using std::unique_ptr;
+
+// Fake DB used to test BackupEngine without touching a real database.
+// The test sets live_files_/wal_files_ directly; the asserts on
+// deletions_enabled_ verify that the backup code follows the
+// Disable/EnableFileDeletions protocol around GetLiveFiles and
+// GetSortedWalFiles.
+class DummyDB : public StackableDB {
+ public:
+  /* implicit */
+  DummyDB(const Options& options, const std::string& dbname)
+     : StackableDB(nullptr), options_(options), dbname_(dbname),
+       deletions_enabled_(true), sequence_number_(0) {}
+
+  // Increments on every call (sequence_number_ is mutable) to simulate a
+  // DB whose sequence number keeps advancing.
+  virtual SequenceNumber GetLatestSequenceNumber() const {
+    return ++sequence_number_;
+  }
+
+  virtual const std::string& GetName() const override {
+    return dbname_;
+  }
+
+  virtual Env* GetEnv() const override {
+    return options_.env;
+  }
+
+  virtual const Options& GetOptions() const override {
+    return options_;
+  }
+
+  virtual Status EnableFileDeletions(bool force) override {
+    ASSERT_TRUE(!deletions_enabled_);
+    deletions_enabled_ = true;
+    return Status::OK();
+  }
+
+  virtual Status DisableFileDeletions() override {
+    ASSERT_TRUE(deletions_enabled_);
+    deletions_enabled_ = false;
+    return Status::OK();
+  }
+
+  // Must only be called while deletions are disabled; reports a fixed
+  // manifest size of 100 (checked by the NoDoubleCopy test).
+  virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
+                              bool flush_memtable = true) override {
+    ASSERT_TRUE(!deletions_enabled_);
+    vec = live_files_;
+    *mfs = 100;
+    return Status::OK();
+  }
+
+  // Minimal LogFile stand-in: only PathName() and Type() are expected to
+  // be used by the backup code; the rest assert if called.
+  class DummyLogFile : public LogFile {
+   public:
+    /* implicit */
+     DummyLogFile(const std::string& path, bool alive = true)
+         : path_(path), alive_(alive) {}
+
+    virtual std::string PathName() const override {
+      return path_;
+    }
+
+    virtual uint64_t LogNumber() const {
+      // what business do you have calling this method?
+      ASSERT_TRUE(false);
+      return 0;
+    }
+
+    virtual WalFileType Type() const override {
+      return alive_ ? kAliveLogFile : kArchivedLogFile;
+    }
+
+    virtual SequenceNumber StartSequence() const {
+      // backupabledb should not need this method
+      ASSERT_TRUE(false);
+      return 0;
+    }
+
+    virtual uint64_t SizeFileBytes() const {
+      // backupabledb should not need this method
+      ASSERT_TRUE(false);
+      return 0;
+    }
+
+   private:
+    std::string path_;
+    bool alive_;
+  }; // DummyLogFile
+
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
+    ASSERT_TRUE(!deletions_enabled_);
+    files.resize(wal_files_.size());
+    for (size_t i = 0; i < files.size(); ++i) {
+      files[i].reset(
+          new DummyLogFile(wal_files_[i].first, wal_files_[i].second));
+    }
+    return Status::OK();
+  }
+
+  // set by tests before calling CreateNewBackup
+  std::vector<std::string> live_files_;
+  // pair<filename, alive?>
+  std::vector<std::pair<std::string, bool>> wal_files_;
+ private:
+  Options options_;
+  std::string dbname_;
+  bool deletions_enabled_;
+  mutable SequenceNumber sequence_number_;
+}; // DummyDB
+
+class TestEnv : public EnvWrapper {
+ public:
+  explicit TestEnv(Env* t) : EnvWrapper(t) {}
+
+  class DummySequentialFile : public SequentialFile {
+   public:
+    DummySequentialFile() : SequentialFile(), rnd_(5) {}
+    virtual Status Read(size_t n, Slice* result, char* scratch) {
+      size_t read_size = (n > size_left) ? size_left : n;
+      for (size_t i = 0; i < read_size; ++i) {
+        scratch[i] = rnd_.Next() & 255;
+      }
+      *result = Slice(scratch, read_size);
+      size_left -= read_size;
+      return Status::OK();
+    }
+
+    virtual Status Skip(uint64_t n) {
+      size_left = (n > size_left) ? size_left - n : 0;
+      return Status::OK();
+    }
+   private:
+    size_t size_left = 200;
+    Random rnd_;
+  };
+
+  Status NewSequentialFile(const std::string& f,
+                           unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) {
+    opened_files_.push_back(f);
+    if (dummy_sequential_file_) {
+      r->reset(new TestEnv::DummySequentialFile());
+      return Status::OK();
+    } else {
+      return EnvWrapper::NewSequentialFile(f, r, options);
+    }
+  }
+
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) {
+    if (limit_written_files_ <= 0) {
+      return Status::IOError("Sorry, can't do this");
+    }
+    limit_written_files_--;
+    return EnvWrapper::NewWritableFile(f, r, options);
+  }
+
+  void AssertOpenedFiles(std::vector<std::string>& should_have_opened) {
+    sort(should_have_opened.begin(), should_have_opened.end());
+    sort(opened_files_.begin(), opened_files_.end());
+    ASSERT_TRUE(opened_files_ == should_have_opened);
+  }
+
+  void ClearOpenedFiles() {
+    opened_files_.clear();
+  }
+
+  void SetLimitWrittenFiles(uint64_t limit) {
+    limit_written_files_ = limit;
+  }
+
+  void SetDummySequentialFile(bool dummy_sequential_file) {
+    dummy_sequential_file_ = dummy_sequential_file;
+  }
+
+ private:
+  bool dummy_sequential_file_ = false;
+  std::vector<std::string> opened_files_;
+  uint64_t limit_written_files_ = 1000000;
+}; // TestEnv
+
+// Env wrapper with test-only helpers for damaging backup state on disk:
+// deleting a random file from a directory, flipping random bytes in a
+// file, and writing a small file wholesale.
+class FileManager : public EnvWrapper {
+ public:
+  explicit FileManager(Env* t) : EnvWrapper(t), rnd_(5) {}
+
+  // Deletes one randomly chosen regular entry of 'dir'.
+  // Returns NotFound if the directory has no entries besides "." and "..".
+  Status DeleteRandomFileInDir(const std::string dir) {
+    std::vector<std::string> children;
+    GetChildren(dir, &children);
+    if (children.size() <= 2) { // . and ..
+      return Status::NotFound("");
+    }
+    // loop terminates because at least one non-dot entry exists
+    while (true) {
+      int i = rnd_.Next() % children.size();
+      if (children[i] != "." && children[i] != "..") {
+        return DeleteFile(dir + "/" + children[i]);
+      }
+    }
+    // should never get here
+    assert(false);
+    return Status::NotFound("");
+  }
+
+  // Overwrites 'bytes_to_corrupt' random positions of 'fname' with random
+  // bytes, in place, without changing the file size.
+  Status CorruptFile(const std::string& fname, uint64_t bytes_to_corrupt) {
+    uint64_t size;
+    Status s = GetFileSize(fname, &size);
+    if (!s.ok()) {
+      return s;
+    }
+    unique_ptr<RandomRWFile> file;
+    EnvOptions env_options;
+    env_options.use_mmap_writes = false;
+    s = NewRandomRWFile(fname, &file, env_options);
+    if (!s.ok()) {
+      return s;
+    }
+
+    for (uint64_t i = 0; s.ok() && i < bytes_to_corrupt; ++i) {
+      std::string tmp;
+      // write one random byte to a random position
+      s = file->Write(rnd_.Next() % size, test::RandomString(&rnd_, 1, &tmp));
+    }
+    return s;
+  }
+
+  // Creates (or truncates) 'fname' and writes 'data' to it.
+  Status WriteToFile(const std::string& fname, const std::string& data) {
+    unique_ptr<WritableFile> file;
+    EnvOptions env_options;
+    env_options.use_mmap_writes = false;
+    Status s = EnvWrapper::NewWritableFile(fname, &file, env_options);
+    if (!s.ok()) {
+      return s;
+    }
+    return file->Append(Slice(data));
+  }
+ private:
+  Random rnd_;
+}; // FileManager
+
+// utility functions
+// Puts keys testkey<from>..testkey<to-1> (with matching testvalue<i>
+// values) into db; asserts every Put succeeds.
+static void FillDB(DB* db, int from, int to) {
+  for (int i = from; i < to; ++i) {
+    std::string key = "testkey" + std::to_string(i);
+    std::string value = "testvalue" + std::to_string(i);
+
+    ASSERT_OK(db->Put(WriteOptions(), Slice(key), Slice(value)));
+  }
+}
+
+// Asserts that keys testkey<from>..testkey<to-1> are all present in db
+// with their expected testvalue<i> values.
+static void AssertExists(DB* db, int from, int to) {
+  for (int i = from; i < to; ++i) {
+    std::string key = "testkey" + std::to_string(i);
+    std::string value;
+    Status s = db->Get(ReadOptions(), Slice(key), &value);
+    // BUG FIX: the Get() status used to be ignored, so a NotFound was only
+    // caught indirectly via the stale (empty) 'value'. Check it explicitly.
+    ASSERT_OK(s);
+    ASSERT_EQ(value, "testvalue" + std::to_string(i));
+  }
+}
+
+// Asserts that none of keys testkey<from>..testkey<to-1> exist in db.
+static void AssertEmpty(DB* db, int from, int to) {
+  for (int i = from; i < to; ++i) {
+    std::string key = "testkey" + std::to_string(i);
+    std::string value = "testvalue" + std::to_string(i);
+
+    Status s = db->Get(ReadOptions(), Slice(key), &value);
+    ASSERT_TRUE(s.IsNotFound());
+  }
+}
+
+// Test fixture: owns the temp db/backup directories, the instrumented
+// TestEnv/FileManager wrappers, and helpers to open/close the backupable
+// and restore handles and to verify the contents of a restored backup.
+class BackupableDBTest {
+ public:
+  BackupableDBTest() {
+    // set up files
+    dbname_ = test::TmpDir() + "/backupable_db";
+    backupdir_ = test::TmpDir() + "/backupable_db_backup";
+
+    // set up envs
+    env_ = Env::Default();
+    test_db_env_.reset(new TestEnv(env_));
+    test_backup_env_.reset(new TestEnv(env_));
+    file_manager_.reset(new FileManager(env_));
+
+    // set up db options
+    options_.create_if_missing = true;
+    options_.paranoid_checks = true;
+    options_.write_buffer_size = 1 << 17; // 128KB
+    options_.env = test_db_env_.get();
+    options_.wal_dir = dbname_;
+    // set up backup db options
+    CreateLoggerFromOptions(dbname_, backupdir_, env_,
+                            Options(), &logger_);
+    backupable_options_.reset(new BackupableDBOptions(
+        backupdir_, test_backup_env_.get(), true, logger_.get(), true));
+
+    // delete old files in db
+    DestroyDB(dbname_, Options());
+  }
+
+  // Opens a plain (non-backupable) DB on dbname_; caller owns the pointer.
+  DB* OpenDB() {
+    DB* db;
+    ASSERT_OK(DB::Open(options_, dbname_, &db));
+    return db;
+  }
+
+  // Opens db_ as a BackupableDB. With dummy == true a DummyDB is wrapped
+  // instead of a real DB, and reads are served by DummySequentialFile.
+  void OpenBackupableDB(bool destroy_old_data = false, bool dummy = false,
+                        bool share_table_files = true) {
+    // reset all the defaults
+    test_backup_env_->SetLimitWrittenFiles(1000000);
+    test_db_env_->SetLimitWrittenFiles(1000000);
+    test_db_env_->SetDummySequentialFile(dummy);
+
+    DB* db;
+    if (dummy) {
+      dummy_db_ = new DummyDB(options_, dbname_);
+      db = dummy_db_;
+    } else {
+      ASSERT_OK(DB::Open(options_, dbname_, &db));
+    }
+    backupable_options_->destroy_old_data = destroy_old_data;
+    backupable_options_->share_table_files = share_table_files;
+    db_.reset(new BackupableDB(db, *backupable_options_));
+  }
+
+  void CloseBackupableDB() {
+    db_.reset(nullptr);
+  }
+
+  void OpenRestoreDB() {
+    backupable_options_->destroy_old_data = false;
+    restore_db_.reset(
+        new RestoreBackupableDB(test_db_env_.get(), *backupable_options_));
+  }
+
+  void CloseRestoreDB() {
+    restore_db_.reset(nullptr);
+  }
+
+  // restores backup backup_id and asserts the existence of
+  // [start_exist, end_exist> and not-existence of
+  // [end_exist, end>
+  //
+  // if backup_id == 0, it means restore from latest
+  // if end == 0, don't check AssertEmpty
+  void AssertBackupConsistency(BackupID backup_id, uint32_t start_exist,
+                               uint32_t end_exist, uint32_t end = 0) {
+    bool opened_restore = false;
+    if (restore_db_.get() == nullptr) {
+      // open a restore handle on demand and close it again on exit
+      opened_restore = true;
+      OpenRestoreDB();
+    }
+    if (backup_id > 0) {
+      ASSERT_OK(restore_db_->RestoreDBFromBackup(backup_id, dbname_, dbname_));
+    } else {
+      ASSERT_OK(restore_db_->RestoreDBFromLatestBackup(dbname_, dbname_));
+    }
+    DB* db = OpenDB();
+    AssertExists(db, start_exist, end_exist);
+    if (end != 0) {
+      AssertEmpty(db, end_exist, end);
+    }
+    delete db;
+    if (opened_restore) {
+      CloseRestoreDB();
+    }
+  }
+
+  // files
+  std::string dbname_;
+  std::string backupdir_;
+
+  // envs
+  Env* env_;
+  unique_ptr<TestEnv> test_db_env_;
+  unique_ptr<TestEnv> test_backup_env_;
+  unique_ptr<FileManager> file_manager_;
+
+  // all the dbs!
+  DummyDB* dummy_db_; // BackupableDB owns dummy_db_
+  unique_ptr<BackupableDB> db_;
+  unique_ptr<RestoreBackupableDB> restore_db_;
+
+  // options
+  Options options_;
+  unique_ptr<BackupableDBOptions> backupable_options_;
+  std::shared_ptr<Logger> logger_;
+}; // BackupableDBTest
+
+// Prefixes every entry of 'v' with 'path', in place.
+void AppendPath(const std::string& path, std::vector<std::string>& v) {
+  for (size_t i = 0; i < v.size(); ++i) {
+    v[i] = path + v[i];
+  }
+}
+
+// this will make sure that backup does not copy the same file twice
+TEST(BackupableDBTest, NoDoubleCopy) {
+  OpenBackupableDB(true, true);
+
+  // should write 5 DB files + LATEST_BACKUP + one meta file
+  test_backup_env_->SetLimitWrittenFiles(7);
+  test_db_env_->ClearOpenedFiles();
+  test_db_env_->SetLimitWrittenFiles(0);
+  dummy_db_->live_files_ = { "/00010.sst", "/00011.sst",
+                             "/CURRENT",   "/MANIFEST-01" };
+  dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}};
+  ASSERT_OK(db_->CreateNewBackup(false));
+  std::vector<std::string> should_have_openened = dummy_db_->live_files_;
+  should_have_openened.push_back("/00011.log");
+  AppendPath(dbname_, should_have_openened);
+  test_db_env_->AssertOpenedFiles(should_have_openened);
+
+  // should write 4 new DB files + LATEST_BACKUP + one meta file
+  // should not write/copy 00010.sst, since it's already there!
+  test_backup_env_->SetLimitWrittenFiles(6);
+  test_db_env_->ClearOpenedFiles();
+  dummy_db_->live_files_ = { "/00010.sst", "/00015.sst",
+                             "/CURRENT",   "/MANIFEST-01" };
+  dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}};
+  ASSERT_OK(db_->CreateNewBackup(false));
+  // should not open 00010.sst - it's already there
+  should_have_openened = { "/00015.sst",   "/CURRENT",
+                           "/MANIFEST-01", "/00011.log" };
+  AppendPath(dbname_, should_have_openened);
+  test_db_env_->AssertOpenedFiles(should_have_openened);
+
+  ASSERT_OK(db_->DeleteBackup(1));
+  ASSERT_EQ(true,
+            test_backup_env_->FileExists(backupdir_ + "/shared/00010.sst"));
+  // 00011.sst was only in backup 1, should be deleted
+  ASSERT_EQ(false,
+            test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst"));
+  ASSERT_EQ(true,
+            test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst"));
+
+  // MANIFEST file size should be only 100
+  uint64_t size;
+  test_backup_env_->GetFileSize(backupdir_ + "/private/2/MANIFEST-01", &size);
+  ASSERT_EQ(100UL, size);
+  test_backup_env_->GetFileSize(backupdir_ + "/shared/00015.sst", &size);
+  ASSERT_EQ(200UL, size);
+
+  CloseBackupableDB();
+}
+
+// test various kind of corruptions that may happen:
+// 1. Not able to write a file for backup - that backup should fail,
+//      everything else should work
+// 2. Corrupted/deleted LATEST_BACKUP - everything should work fine
+// 3. Corrupted backup meta file or missing backuped file - we should
+//      not be able to open that backup, but all other backups should be
+//      fine
+TEST(BackupableDBTest, CorruptionsTest) {
+  const int keys_iteration = 5000;
+  Random rnd(6);
+  Status s;
+
+  OpenBackupableDB(true);
+  // create five backups
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    // randomize flush_before_backup to cover both code paths
+    ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2)));
+  }
+
+  // ---------- case 1. - fail a write -----------
+  // try creating backup 6, but fail a write
+  FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6);
+  test_backup_env_->SetLimitWrittenFiles(2);
+  // should fail
+  s = db_->CreateNewBackup(!!(rnd.Next() % 2));
+  ASSERT_TRUE(!s.ok());
+  test_backup_env_->SetLimitWrittenFiles(1000000);
+  // latest backup should have all the keys
+  CloseBackupableDB();
+  AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6);
+
+  // ---------- case 2. - corrupt/delete latest backup -----------
+  ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/LATEST_BACKUP", 2));
+  AssertBackupConsistency(0, 0, keys_iteration * 5);
+  ASSERT_OK(file_manager_->DeleteFile(backupdir_ + "/LATEST_BACKUP"));
+  AssertBackupConsistency(0, 0, keys_iteration * 5);
+  // create backup 6, point LATEST_BACKUP to 5
+  OpenBackupableDB();
+  FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6);
+  ASSERT_OK(db_->CreateNewBackup(false));
+  CloseBackupableDB();
+  ASSERT_OK(file_manager_->WriteToFile(backupdir_ + "/LATEST_BACKUP", "5"));
+  AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6);
+  // assert that all 6 data is gone!
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/6") == false);
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/6") == false);
+
+  // --------- case 3. corrupted backup meta or missing backuped file ----
+  ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/5", 3));
+  // since 5 meta is now corrupted, latest backup should be 4
+  AssertBackupConsistency(0, 0, keys_iteration * 4, keys_iteration * 5);
+  OpenRestoreDB();
+  s = restore_db_->RestoreDBFromBackup(5, dbname_, dbname_);
+  ASSERT_TRUE(!s.ok());
+  CloseRestoreDB();
+  ASSERT_OK(file_manager_->DeleteRandomFileInDir(backupdir_ + "/private/4"));
+  // 4 is corrupted, 3 is the latest backup now
+  AssertBackupConsistency(0, 0, keys_iteration * 3, keys_iteration * 5);
+  OpenRestoreDB();
+  s = restore_db_->RestoreDBFromBackup(4, dbname_, dbname_);
+  CloseRestoreDB();
+  ASSERT_TRUE(!s.ok());
+
+  // new backup should be 4!
+  OpenBackupableDB();
+  FillDB(db_.get(), keys_iteration * 3, keys_iteration * 4);
+  ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2)));
+  CloseBackupableDB();
+  AssertBackupConsistency(4, 0, keys_iteration * 4, keys_iteration * 5);
+}
+
+// open DB, write, close DB, backup, restore, repeat
+TEST(BackupableDBTest, OfflineIntegrationTest) {
+  // has to be a big number, so that it triggers the memtable flush
+  const int keys_iteration = 5000;
+  const int max_key = keys_iteration * 4 + 10;
+  // first iter -- flush before backup
+  // second iter -- don't flush before backup
+  for (int iter = 0; iter < 2; ++iter) {
+    // delete old data
+    DestroyDB(dbname_, Options());
+    bool destroy_data = true;
+
+    // every iteration --
+    // 1. insert new data in the DB
+    // 2. backup the DB
+    // 3. destroy the db
+    // 4. restore the db, check everything is still there
+    for (int i = 0; i < 5; ++i) {
+      // in last iteration, put smaller amount of data,
+      int fill_up_to = std::min(keys_iteration * (i + 1), max_key);
+      // ---- insert new data and back up ----
+      OpenBackupableDB(destroy_data);
+      // only destroy old backups on the first pass of the inner loop
+      destroy_data = false;
+      FillDB(db_.get(), keys_iteration * i, fill_up_to);
+      ASSERT_OK(db_->CreateNewBackup(iter == 0));
+      CloseBackupableDB();
+      DestroyDB(dbname_, Options());
+
+      // ---- make sure it's empty ----
+      DB* db = OpenDB();
+      AssertEmpty(db, 0, fill_up_to);
+      delete db;
+
+      // ---- restore the DB ----
+      OpenRestoreDB();
+      if (i >= 3) { // test purge old backups
+        // when i == 4, purge to only 1 backup
+        // when i == 3, purge to 2 backups
+        ASSERT_OK(restore_db_->PurgeOldBackups(5 - i));
+      }
+      // ---- make sure the data is there ---
+      AssertBackupConsistency(0, 0, fill_up_to, max_key);
+      CloseRestoreDB();
+    }
+  }
+}
+
+// open DB, write, backup, write, backup, close, restore
+TEST(BackupableDBTest, OnlineIntegrationTest) {
+  // has to be a big number, so that it triggers the memtable flush
+  const int keys_iteration = 5000;
+  const int max_key = keys_iteration * 4 + 10;
+  Random rnd(7);
+  // delete old data
+  DestroyDB(dbname_, Options());
+
+  OpenBackupableDB(true);
+  // write some data, backup, repeat
+  for (int i = 0; i < 5; ++i) {
+    if (i == 4) {
+      // delete backup number 2, online delete!
+      OpenRestoreDB();
+      ASSERT_OK(restore_db_->DeleteBackup(2));
+      CloseRestoreDB();
+    }
+    // in last iteration, put smaller amount of data,
+    // so that backups can share sst files
+    int fill_up_to = std::min(keys_iteration * (i + 1), max_key);
+    FillDB(db_.get(), keys_iteration * i, fill_up_to);
+    // we should get consistent results with flush_before_backup
+    // set to both true and false
+    ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2)));
+  }
+  // close and destroy
+  CloseBackupableDB();
+  DestroyDB(dbname_, Options());
+
+  // ---- make sure it's empty ----
+  DB* db = OpenDB();
+  AssertEmpty(db, 0, max_key);
+  delete db;
+
+  // ---- restore every backup and verify all the data is there ----
+  OpenRestoreDB();
+  for (int i = 1; i <= 5; ++i) {
+    if (i == 2) {
+      // we deleted backup 2
+      Status s = restore_db_->RestoreDBFromBackup(2, dbname_, dbname_);
+      ASSERT_TRUE(!s.ok());
+    } else {
+      int fill_up_to = std::min(keys_iteration * i, max_key);
+      AssertBackupConsistency(i, 0, fill_up_to, max_key);
+    }
+  }
+
+  // delete some backups -- this should leave only backups 3 and 5 alive
+  ASSERT_OK(restore_db_->DeleteBackup(4));
+  ASSERT_OK(restore_db_->PurgeOldBackups(2));
+
+  std::vector<BackupInfo> backup_info;
+  restore_db_->GetBackupInfo(&backup_info);
+  ASSERT_EQ(2UL, backup_info.size());
+
+  // check backup 3
+  AssertBackupConsistency(3, 0, 3 * keys_iteration, max_key);
+  // check backup 5
+  AssertBackupConsistency(5, 0, max_key);
+
+  CloseRestoreDB();
+}
+
+// Re-opening a BackupableDB whose DB state is older than some existing
+// backups must delete those newer backups (sharing would otherwise break).
+TEST(BackupableDBTest, DeleteNewerBackups) {
+  // create backups 1, 2, 3, 4, 5
+  OpenBackupableDB(true);
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), 100 * i, 100 * (i + 1));
+    ASSERT_OK(db_->CreateNewBackup(!!(i % 2)));
+  }
+  CloseBackupableDB();
+
+  // backup 3 is fine
+  AssertBackupConsistency(3, 0, 300, 500);
+  // this should delete backups 4 and 5
+  OpenBackupableDB();
+  CloseBackupableDB();
+  // backups 4 and 5 don't exist
+  OpenRestoreDB();
+  Status s = restore_db_->RestoreDBFromBackup(4, dbname_, dbname_);
+  ASSERT_TRUE(s.IsNotFound());
+  s = restore_db_->RestoreDBFromBackup(5, dbname_, dbname_);
+  ASSERT_TRUE(s.IsNotFound());
+  CloseRestoreDB();
+}
+
+// With share_table_files == false every backup keeps its own copies;
+// each backup must still restore consistently.
+TEST(BackupableDBTest, NoShareTableFiles) {
+  const int keys_iteration = 5000;
+  OpenBackupableDB(true, false, false);
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(db_->CreateNewBackup(!!(i % 2)));
+  }
+  CloseBackupableDB();
+
+  for (int i = 0; i < 5; ++i) {
+    AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+                            keys_iteration * 6);
+  }
+}
+
+// Leftover *.tmp files/dirs (from interrupted backups) must be cleaned up
+// the next time a BackupableDB is opened.
+TEST(BackupableDBTest, DeleteTmpFiles) {
+  OpenBackupableDB();
+  CloseBackupableDB();
+  // plant fake leftovers from an interrupted backup
+  std::string shared_tmp = backupdir_ + "/shared/00006.sst.tmp";
+  std::string private_tmp_dir = backupdir_ + "/private/10.tmp";
+  std::string private_tmp_file = private_tmp_dir + "/00003.sst";
+  file_manager_->WriteToFile(shared_tmp, "tmp");
+  file_manager_->CreateDir(private_tmp_dir);
+  file_manager_->WriteToFile(private_tmp_file, "tmp");
+  ASSERT_EQ(true, file_manager_->FileExists(private_tmp_dir));
+  OpenBackupableDB();
+  CloseBackupableDB();
+  ASSERT_EQ(false, file_manager_->FileExists(shared_tmp));
+  ASSERT_EQ(false, file_manager_->FileExists(private_tmp_file));
+  ASSERT_EQ(false, file_manager_->FileExists(private_tmp_dir));
+}
+
+} // anon namespace
+
+} //  namespace rocksdb
+
+// Test entry point -- runs every TEST(...) registered above.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/utilities/merge_operators.h b/utilities/merge_operators.h
new file mode 100644 (file)
index 0000000..fdf0664
--- /dev/null
@@ -0,0 +1,45 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef MERGE_OPERATORS_H
+#define MERGE_OPERATORS_H
+
+#include <memory>
+#include <stdio.h>
+
+#include "rocksdb/merge_operator.h"
+
+namespace rocksdb {
+
+class MergeOperators {
+ public:
+  static std::shared_ptr<MergeOperator> CreatePutOperator();
+  static std::shared_ptr<MergeOperator> CreateUInt64AddOperator();
+  static std::shared_ptr<MergeOperator> CreateStringAppendOperator();
+  static std::shared_ptr<MergeOperator> CreateStringAppendTESTOperator();
+
+  // Will return a different merge operator depending on the string.
+  // TODO: Hook the "name" up to the actual Name() of the MergeOperators?
+  static std::shared_ptr<MergeOperator> CreateFromStringId(
+      const std::string& name) {
+    if (name == "put") {
+      return CreatePutOperator();
+    } else if ( name == "uint64add") {
+      return CreateUInt64AddOperator();
+    } else if (name == "stringappend") {
+      return CreateStringAppendOperator();
+    } else if (name == "stringappendtest") {
+      return CreateStringAppendTESTOperator();
+    } else {
+      // Empty or unknown, just return nullptr
+      return nullptr;
+    }
+  }
+
+};
+
+} // namespace rocksdb
+
+#endif
diff --git a/utilities/merge_operators/put.cc b/utilities/merge_operators/put.cc
new file mode 100644 (file)
index 0000000..e77449d
--- /dev/null
@@ -0,0 +1,54 @@
+#include <memory>
+#include "rocksdb/slice.h"
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+
+using namespace rocksdb;
+
+namespace { // anonymous namespace
+
+// A merge operator that mimics Put semantics
+// Since this merge-operator will not be used in production,
+// it is implemented as a non-associative merge operator to illustrate the
+// new interface and for testing purposes. (That is, we inherit from
+// the MergeOperator class rather than the AssociativeMergeOperator
+// which would be simpler in this case).
+//
+// From the client-perspective, semantics are the same.
+class PutOperator : public MergeOperator {
+ public:
+  virtual bool FullMerge(const Slice& key,
+                         const Slice* existing_value,
+                         const std::deque<std::string>& operand_sequence,
+                         std::string* new_value,
+                         Logger* logger) const override {
+    // Put basically only looks at the current/latest value
+    assert(!operand_sequence.empty());
+    assert(new_value != nullptr);
+    new_value->assign(operand_sequence.back());
+    return true;
+  }
+
+  virtual bool PartialMerge(const Slice& key,
+                            const Slice& left_operand,
+                            const Slice& right_operand,
+                            std::string* new_value,
+                            Logger* logger) const override {
+    new_value->assign(right_operand.data(), right_operand.size());
+    return true;
+  }
+
+  virtual const char* Name() const override {
+    return "PutOperator";
+  }
+};
+
+} // end of anonymous namespace
+
+namespace rocksdb {
+
+std::shared_ptr<MergeOperator> MergeOperators::CreatePutOperator() {
+  return std::make_shared<PutOperator>();
+}
+
+}
diff --git a/utilities/merge_operators/string_append/stringappend.cc b/utilities/merge_operators/string_append/stringappend.cc
new file mode 100644 (file)
index 0000000..38cd22e
--- /dev/null
@@ -0,0 +1,60 @@
+/**
+ * A MergeOperator for rocksdb that implements string append.
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#include "stringappend.h"
+
+#include <memory>
+#include <assert.h>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+
+namespace rocksdb {
+
+// Constructor: also specify the delimiter character.
+StringAppendOperator::StringAppendOperator(char delim_char)
+    : delim_(delim_char) {
+}
+
+// Implementation for the merge operation (concatenates two strings)
+bool StringAppendOperator::Merge(const Slice& key,
+                                 const Slice* existing_value,
+                                 const Slice& value,
+                                 std::string* new_value,
+                                 Logger* logger) const {
+
+  // Clear the *new_value for writing.
+  assert(new_value);
+  new_value->clear();
+
+  if (!existing_value) {
+    // No existing_value. Set *new_value = value
+    new_value->assign(value.data(),value.size());
+  } else {
+    // Generic append (existing_value != null).
+    // Reserve *new_value to correct size, and apply concatenation.
+    new_value->reserve(existing_value->size() + 1 + value.size());
+    new_value->assign(existing_value->data(),existing_value->size());
+    new_value->append(1,delim_);
+    new_value->append(value.data(), value.size());
+  }
+
+  return true;
+}
+
+const char* StringAppendOperator::Name() const  {
+  return "StringAppendOperator";
+}
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateStringAppendOperator() {
+  return std::make_shared<StringAppendOperator>(',');
+}
+
+} // namespace rocksdb
+
+
+
diff --git a/utilities/merge_operators/string_append/stringappend.h b/utilities/merge_operators/string_append/stringappend.h
new file mode 100644 (file)
index 0000000..ca5b97e
--- /dev/null
@@ -0,0 +1,31 @@
+/**
+ * A MergeOperator for rocksdb that implements string append.
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#pragma once
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+class StringAppendOperator : public AssociativeMergeOperator {
+ public:
+  StringAppendOperator(char delim_char);    /// Constructor: specify delimiter
+
+  virtual bool Merge(const Slice& key,
+                     const Slice* existing_value,
+                     const Slice& value,
+                     std::string* new_value,
+                     Logger* logger) const override;
+
+  virtual const char* Name() const override;
+
+ private:
+  char delim_;         // The delimiter is inserted between elements
+
+};
+
+} // namespace rocksdb
+
diff --git a/utilities/merge_operators/string_append/stringappend2.cc b/utilities/merge_operators/string_append/stringappend2.cc
new file mode 100644 (file)
index 0000000..e153a38
--- /dev/null
@@ -0,0 +1,104 @@
+/**
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#include "stringappend2.h"
+
+#include <memory>
+#include <assert.h>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+
+namespace rocksdb {
+
+// Constructor: also specify the delimiter character.
+StringAppendTESTOperator::StringAppendTESTOperator(char delim_char)
+    : delim_(delim_char) {
+}
+
+// Implementation for the merge operation (concatenates two strings)
+bool StringAppendTESTOperator::FullMerge(
+    const Slice& key,
+    const Slice* existing_value,
+    const std::deque<std::string>& operands,
+    std::string* new_value,
+    Logger* logger) const {
+
+  // Clear the *new_value for writing.
+  assert(new_value);
+  new_value->clear();
+
+  // Compute the space needed for the final result.
+  int numBytes = 0;
+  for(auto it = operands.begin(); it != operands.end(); ++it) {
+    numBytes += it->size() + 1;   // Plus 1 for the delimiter
+  }
+
+  // Only print the delimiter after the first entry has been printed
+  bool printDelim = false;
+
+  // Prepend the *existing_value if one exists.
+  if (existing_value) {
+    new_value->reserve(numBytes + existing_value->size());
+    new_value->append(existing_value->data(), existing_value->size());
+    printDelim = true;
+  } else if (numBytes) {
+    new_value->reserve(numBytes-1); // Minus 1 since we have one less delimiter
+  }
+
+  // Concatenate the sequence of strings (and add a delimiter between each)
+  for(auto it = operands.begin(); it != operands.end(); ++it) {
+    if (printDelim) {
+      new_value->append(1,delim_);
+    }
+    new_value->append(*it);
+    printDelim = true;
+  }
+
+  return true;
+}
+
+bool StringAppendTESTOperator::PartialMerge(const Slice& key,
+                                            const Slice& left_operand,
+                                            const Slice& right_operand,
+                                            std::string* new_value,
+                                            Logger* logger) const {
+  return false;
+}
+
+// A version of PartialMerge that actually performs "partial merging".
+// Use this to simulate the exact behaviour of the StringAppendOperator.
+bool StringAppendTESTOperator::_AssocPartialMerge(const Slice& key,
+                                            const Slice& left_operand,
+                                            const Slice& right_operand,
+                                            std::string* new_value,
+                                            Logger* logger) const {
+  // Clear the *new_value for writing.
+  assert(new_value);
+  new_value->clear();
+
+  // Generic append
+  // Reserve correct size for *new_value, and apply concatenation.
+  new_value->reserve(left_operand.size() + 1 + right_operand.size());
+  new_value->assign(left_operand.data(), left_operand.size());
+  new_value->append(1,delim_);
+  new_value->append(right_operand.data(), right_operand.size());
+
+  return true;
+}
+
+const char* StringAppendTESTOperator::Name() const  {
+  return "StringAppendTESTOperator";
+}
+
+
+std::shared_ptr<MergeOperator>
+MergeOperators::CreateStringAppendTESTOperator() {
+  return std::make_shared<StringAppendTESTOperator>(',');
+}
+
+} // namespace rocksdb
+
diff --git a/utilities/merge_operators/string_append/stringappend2.h b/utilities/merge_operators/string_append/stringappend2.h
new file mode 100644 (file)
index 0000000..01a4be4
--- /dev/null
@@ -0,0 +1,51 @@
+/**
+ * A TEST MergeOperator for rocksdb that implements string append.
+ * It is built using the MergeOperator interface rather than the simpler
+ * AssociativeMergeOperator interface. This is useful for testing/benchmarking.
+ * While the two operators are semantically the same, all production code
+ * should use the StringAppendOperator defined in stringappend.{h,cc}. The
+ * operator defined in the present file is primarily for testing.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#pragma once
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+class StringAppendTESTOperator : public MergeOperator {
+ public:
+
+  StringAppendTESTOperator(char delim_char);    /// Constructor with delimiter
+
+  virtual bool FullMerge(const Slice& key,
+                         const Slice* existing_value,
+                         const std::deque<std::string>& operand_sequence,
+                         std::string* new_value,
+                         Logger* logger) const override;
+
+  virtual bool PartialMerge(const Slice& key,
+                            const Slice& left_operand,
+                            const Slice& right_operand,
+                            std::string* new_value,
+                            Logger* logger) const override;
+
+  virtual const char* Name() const override;
+
+ private:
+  // A version of PartialMerge that actually performs "partial merging".
+  // Use this to simulate the exact behaviour of the StringAppendOperator.
+  bool _AssocPartialMerge(const Slice& key,
+                          const Slice& left_operand,
+                          const Slice& right_operand,
+                          std::string* new_value,
+                          Logger* logger) const;
+
+  char delim_;         // The delimiter is inserted between elements
+
+};
+
+} // namespace rocksdb
diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc
new file mode 100644 (file)
index 0000000..81af646
--- /dev/null
@@ -0,0 +1,593 @@
+/**
+ * An persistent map : key -> (list of strings), using rocksdb merge.
+ * This file is a test-harness / use-case for the StringAppendOperator.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook, Inc.
+*/
+
+#include <iostream>
+#include <map>
+
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+#include "utilities/ttl/db_ttl.h"
+#include "util/testharness.h"
+#include "util/random.h"
+
+using namespace rocksdb;
+
+namespace rocksdb {
+
+// Path to the database on file system
+const std::string kDbName = "/tmp/mergetestdb";
+
+// OpenDb opens a (possibly new) rocksdb database with a StringAppendOperator
+std::shared_ptr<DB> OpenNormalDb(char delim_char) {
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator.reset(new StringAppendOperator(delim_char));
+  ASSERT_OK(DB::Open(options, kDbName,  &db));
+  return std::shared_ptr<DB>(db);
+}
+
+// Open a TtlDB with a non-associative StringAppendTESTOperator
+std::shared_ptr<DB> OpenTtlDb(char delim_char) {
+  StackableDB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator.reset(new StringAppendTESTOperator(delim_char));
+  ASSERT_OK(UtilityDB::OpenTtlDB(options, kDbName, &db, 123456));
+  return std::shared_ptr<DB>(db);
+}
+
+/// StringLists represents a set of string-lists, each with a key-index.
+/// Supports Append(list, string) and Get(list)
+class StringLists {
+ public:
+
+  //Constructor: specifies the rocksdb db
+  /* implicit */
+  StringLists(std::shared_ptr<DB> db)
+      : db_(db),
+        merge_option_(),
+        get_option_() {
+    assert(db);
+  }
+
+  // Append string val onto the list defined by key; return true on success
+  bool Append(const std::string& key, const std::string& val){
+    Slice valSlice(val.data(), val.size());
+    auto s = db_->Merge(merge_option_, key, valSlice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << "ERROR " << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // Returns the list of strings associated with key (or "" if does not exist)
+  bool Get(const std::string& key, std::string* const result){
+    assert(result != nullptr); // we should have a place to store the result
+    auto s = db_->Get(get_option_, key, result);
+
+    if (s.ok()) {
+      return true;
+    }
+
+    // Either key does not exist, or there is some error.
+    *result = "";       // Always return empty string (just for convention)
+
+    //NotFound is okay; just return empty (similar to std::map)
+    //But network or db errors, etc, should fail the test (or at least yell)
+    if (!s.IsNotFound()) {
+      std::cerr << "ERROR " << s.ToString() << std::endl;
+    }
+
+    // Always return false if s.ok() was not true
+    return false;
+  }
+
+
+ private:
+  std::shared_ptr<DB> db_;
+  WriteOptions merge_option_;
+  ReadOptions get_option_;
+
+};
+
+
+// The class for unit-testing
+class StringAppendOperatorTest {
+ public:
+  StringAppendOperatorTest() {
+    DestroyDB(kDbName, Options());    // Start each test with a fresh DB
+  }
+
+  typedef std::shared_ptr<DB> (* OpenFuncPtr)(char);
+
+  // Allows user to open databases with different configurations.
+  // e.g.: Can open a DB or a TtlDB, etc.
+  static void SetOpenDbFunction(OpenFuncPtr func) {
+    OpenDb = func;
+  }
+
+ protected:
+  static OpenFuncPtr OpenDb;
+};
+StringAppendOperatorTest::OpenFuncPtr StringAppendOperatorTest::OpenDb = nullptr;
+
+// THE TEST CASES BEGIN HERE
+
+TEST(StringAppendOperatorTest, IteratorTest) {
+  auto db_ = OpenDb(',');
+  StringLists slists(db_);
+
+  slists.Append("k1", "v1");
+  slists.Append("k1", "v2");
+  slists.Append("k1", "v3");
+
+  slists.Append("k2", "a1");
+  slists.Append("k2", "a2");
+  slists.Append("k2", "a3");
+
+  std::string res;
+  std::unique_ptr<rocksdb::Iterator> it(db_->NewIterator(ReadOptions()));
+  std::string k1("k1");
+  std::string k2("k2");
+  bool first = true;
+  for (it->Seek(k1); it->Valid(); it->Next()) {
+    res = it->value().ToString();
+    if (first) {
+      ASSERT_EQ(res, "v1,v2,v3");
+      first = false;
+    } else {
+      ASSERT_EQ(res, "a1,a2,a3");
+    }
+  }
+  slists.Append("k2", "a4");
+  slists.Append("k1", "v4");
+
+  // Snapshot should still be the same. Should ignore a4 and v4.
+  first = true;
+  for (it->Seek(k1); it->Valid(); it->Next()) {
+    res = it->value().ToString();
+    if (first) {
+      ASSERT_EQ(res, "v1,v2,v3");
+      first = false;
+    } else {
+      ASSERT_EQ(res, "a1,a2,a3");
+    }
+  }
+
+
+  // Should release the snapshot and be aware of the new stuff now
+  it.reset(db_->NewIterator(ReadOptions()));
+  first = true;
+  for (it->Seek(k1); it->Valid(); it->Next()) {
+    res = it->value().ToString();
+    if (first) {
+      ASSERT_EQ(res, "v1,v2,v3,v4");
+      first = false;
+    } else {
+      ASSERT_EQ(res, "a1,a2,a3,a4");
+    }
+  }
+
+  // start from k2 this time.
+  for (it->Seek(k2); it->Valid(); it->Next()) {
+    res = it->value().ToString();
+    if (first) {
+      ASSERT_EQ(res, "v1,v2,v3,v4");
+      first = false;
+    } else {
+      ASSERT_EQ(res, "a1,a2,a3,a4");
+    }
+  }
+
+  slists.Append("k3", "g1");
+
+  it.reset(db_->NewIterator(ReadOptions()));
+  first = true;
+  std::string k3("k3");
+  for(it->Seek(k2); it->Valid(); it->Next()) {
+    res = it->value().ToString();
+    if (first) {
+      ASSERT_EQ(res, "a1,a2,a3,a4");
+      first = false;
+    } else {
+      ASSERT_EQ(res, "g1");
+    }
+  }
+  for(it->Seek(k3); it->Valid(); it->Next()) {
+    res = it->value().ToString();
+    if (first) {
+      // should not be hit
+      ASSERT_EQ(res, "a1,a2,a3,a4");
+      first = false;
+    } else {
+      ASSERT_EQ(res, "g1");
+    }
+  }
+
+}
+
+TEST(StringAppendOperatorTest, SimpleTest) {
+  auto db = OpenDb(',');
+  StringLists slists(db);
+
+  slists.Append("k1", "v1");
+  slists.Append("k1", "v2");
+  slists.Append("k1", "v3");
+
+  std::string res;
+  bool status = slists.Get("k1", &res);
+
+  ASSERT_TRUE(status);
+  ASSERT_EQ(res, "v1,v2,v3");
+}
+
+TEST(StringAppendOperatorTest, SimpleDelimiterTest) {
+  auto db = OpenDb('|');
+  StringLists slists(db);
+
+  slists.Append("k1", "v1");
+  slists.Append("k1", "v2");
+  slists.Append("k1", "v3");
+
+  std::string res;
+  slists.Get("k1", &res);
+  ASSERT_EQ(res, "v1|v2|v3");
+}
+
+TEST(StringAppendOperatorTest, OneValueNoDelimiterTest) {
+  auto db = OpenDb('!');
+  StringLists slists(db);
+
+  slists.Append("random_key", "single_val");
+
+  std::string res;
+  slists.Get("random_key", &res);
+  ASSERT_EQ(res, "single_val");
+}
+
+TEST(StringAppendOperatorTest, VariousKeys) {
+  auto db = OpenDb('\n');
+  StringLists slists(db);
+
+  slists.Append("c", "asdasd");
+  slists.Append("a", "x");
+  slists.Append("b", "y");
+  slists.Append("a", "t");
+  slists.Append("a", "r");
+  slists.Append("b", "2");
+  slists.Append("c", "asdasd");
+
+  std::string a, b, c;
+  bool sa, sb, sc;
+  sa = slists.Get("a", &a);
+  sb = slists.Get("b", &b);
+  sc = slists.Get("c", &c);
+
+  ASSERT_TRUE(sa && sb && sc); // All three keys should have been found
+
+  ASSERT_EQ(a, "x\nt\nr");
+  ASSERT_EQ(b, "y\n2");
+  ASSERT_EQ(c, "asdasd\nasdasd");
+}
+
+// Generate semi random keys/words from a small distribution.
+TEST(StringAppendOperatorTest, RandomMixGetAppend) {
+  auto db = OpenDb(' ');
+  StringLists slists(db);
+
+  // Generate a list of random keys and values
+  const int kWordCount = 15;
+  std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf", "342839",
+                         "dsuha", "mabuais", "sadajsid", "jf9834hf", "2d9j89",
+                         "dj9823jd", "a", "dk02ed2dh", "$(jd4h984$(*", "mabz"};
+  const int kKeyCount = 6;
+  std::string keys[] = {"dhaiusdhu", "denidw", "daisda", "keykey", "muki",
+                        "shzassdianmd"};
+
+  // Will store a local copy of all data in order to verify correctness
+  std::map<std::string, std::string> parallel_copy;
+
+  // Generate a bunch of random queries (Append and Get)!
+  enum query_t  { APPEND_OP, GET_OP, NUM_OPS };
+  Random randomGen(1337);       //deterministic seed; always get same results!
+
+  const int kNumQueries = 30;
+  for (int q=0; q<kNumQueries; ++q) {
+    // Generate a random query (Append or Get) and random parameters
+    query_t query = (query_t)randomGen.Uniform((int)NUM_OPS);
+    std::string key = keys[randomGen.Uniform((int)kKeyCount)];
+    std::string word = words[randomGen.Uniform((int)kWordCount)];
+
+    // Apply the query and any checks.
+    if (query == APPEND_OP) {
+
+      // Apply the rocksdb test-harness Append defined above
+      slists.Append(key, word);  //apply the rocksdb append
+
+      // Apply the similar "Append" to the parallel copy
+      if (parallel_copy[key].size() > 0) {
+        parallel_copy[key] += " " + word;
+      } else {
+        parallel_copy[key] = word;
+      }
+
+    } else if (query == GET_OP) {
+      // Assumes that a non-existent key just returns <empty>
+      std::string res;
+      slists.Get(key, &res);
+      ASSERT_EQ(res, parallel_copy[key]);
+    }
+
+  }
+
+}
+
+TEST(StringAppendOperatorTest, BIGRandomMixGetAppend) {
+  auto db = OpenDb(' ');
+  StringLists slists(db);
+
+  // Generate a list of random keys and values
+  const int kWordCount = 15;
+  std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf", "342839",
+                         "dsuha", "mabuais", "sadajsid", "jf9834hf", "2d9j89",
+                         "dj9823jd", "a", "dk02ed2dh", "$(jd4h984$(*", "mabz"};
+  const int kKeyCount = 6;
+  std::string keys[] = {"dhaiusdhu", "denidw", "daisda", "keykey", "muki",
+                        "shzassdianmd"};
+
+  // Will store a local copy of all data in order to verify correctness
+  std::map<std::string, std::string> parallel_copy;
+
+  // Generate a bunch of random queries (Append and Get)!
+  enum query_t  { APPEND_OP, GET_OP, NUM_OPS };
+  Random randomGen(9138204);       // deterministic seed
+
+  const int kNumQueries = 1000;
+  for (int q=0; q<kNumQueries; ++q) {
+    // Generate a random query (Append or Get) and random parameters
+    query_t query = (query_t)randomGen.Uniform((int)NUM_OPS);
+    std::string key = keys[randomGen.Uniform((int)kKeyCount)];
+    std::string word = words[randomGen.Uniform((int)kWordCount)];
+
+    //Apply the query and any checks.
+    if (query == APPEND_OP) {
+
+      // Apply the rocksdb test-harness Append defined above
+      slists.Append(key, word);  //apply the rocksdb append
+
+      // Apply the similar "Append" to the parallel copy
+      if (parallel_copy[key].size() > 0) {
+        parallel_copy[key] += " " + word;
+      } else {
+        parallel_copy[key] = word;
+      }
+
+    } else if (query == GET_OP) {
+      // Assumes that a non-existent key just returns <empty>
+      std::string res;
+      slists.Get(key, &res);
+      ASSERT_EQ(res, parallel_copy[key]);
+    }
+
+  }
+
+}
+
+
+TEST(StringAppendOperatorTest, PersistentVariousKeys) {
+  // Perform the following operations in limited scope
+  {
+    auto db = OpenDb('\n');
+    StringLists slists(db);
+
+    slists.Append("c", "asdasd");
+    slists.Append("a", "x");
+    slists.Append("b", "y");
+    slists.Append("a", "t");
+    slists.Append("a", "r");
+    slists.Append("b", "2");
+    slists.Append("c", "asdasd");
+
+    std::string a, b, c;
+    slists.Get("a", &a);
+    slists.Get("b", &b);
+    slists.Get("c", &c);
+
+    ASSERT_EQ(a, "x\nt\nr");
+    ASSERT_EQ(b, "y\n2");
+    ASSERT_EQ(c, "asdasd\nasdasd");
+  }
+
+  // Reopen the database (the previous changes should persist / be remembered)
+  {
+    auto db = OpenDb('\n');
+    StringLists slists(db);
+
+    slists.Append("c", "bbnagnagsx");
+    slists.Append("a", "sa");
+    slists.Append("b", "df");
+    slists.Append("a", "gh");
+    slists.Append("a", "jk");
+    slists.Append("b", "l;");
+    slists.Append("c", "rogosh");
+
+    // The previous changes should be on disk (L0)
+    // The most recent changes should be in memory (MemTable)
+    // Hence, this will test both Get() paths.
+    std::string a, b, c;
+    slists.Get("a", &a);
+    slists.Get("b", &b);
+    slists.Get("c", &c);
+
+    ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
+    ASSERT_EQ(b, "y\n2\ndf\nl;");
+    ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");
+  }
+
+  // Reopen the database (the previous changes should persist / be remembered)
+  {
+    auto db = OpenDb('\n');
+    StringLists slists(db);
+
+    // All changes should be on disk. This will test VersionSet Get()
+    std::string a, b, c;
+    slists.Get("a", &a);
+    slists.Get("b", &b);
+    slists.Get("c", &c);
+
+    ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
+    ASSERT_EQ(b, "y\n2\ndf\nl;");
+    ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");
+  }
+}
+
+TEST(StringAppendOperatorTest, PersistentFlushAndCompaction) {
+  // Perform the following operations in limited scope
+  {
+    auto db = OpenDb('\n');
+    StringLists slists(db);
+    std::string a, b, c;
+    bool success;
+
+    // Append, Flush, Get
+    slists.Append("c", "asdasd");
+    db->Flush(rocksdb::FlushOptions());
+    success = slists.Get("c", &c);
+    ASSERT_TRUE(success);
+    ASSERT_EQ(c, "asdasd");
+
+    // Append, Flush, Append, Get
+    slists.Append("a", "x");
+    slists.Append("b", "y");
+    db->Flush(rocksdb::FlushOptions());
+    slists.Append("a", "t");
+    slists.Append("a", "r");
+    slists.Append("b", "2");
+
+    success = slists.Get("a", &a);
+    assert(success == true);
+    ASSERT_EQ(a, "x\nt\nr");
+
+    success = slists.Get("b", &b);
+    assert(success == true);
+    ASSERT_EQ(b, "y\n2");
+
+    // Append, Get
+    success = slists.Append("c", "asdasd");
+    assert(success);
+    success = slists.Append("b", "monkey");
+    assert(success);
+
+    // I omit the "assert(success)" checks here.
+    slists.Get("a", &a);
+    slists.Get("b", &b);
+    slists.Get("c", &c);
+
+    ASSERT_EQ(a, "x\nt\nr");
+    ASSERT_EQ(b, "y\n2\nmonkey");
+    ASSERT_EQ(c, "asdasd\nasdasd");
+  }
+
+  // Reopen the database (the previous changes should persist / be remembered)
+  {
+    auto db = OpenDb('\n');
+    StringLists slists(db);
+    std::string a, b, c;
+
+    // Get (Quick check for persistence of previous database)
+    slists.Get("a", &a);
+    ASSERT_EQ(a, "x\nt\nr");
+
+    //Append, Compact, Get
+    slists.Append("c", "bbnagnagsx");
+    slists.Append("a", "sa");
+    slists.Append("b", "df");
+    db->CompactRange(nullptr, nullptr);
+    slists.Get("a", &a);
+    slists.Get("b", &b);
+    slists.Get("c", &c);
+    ASSERT_EQ(a, "x\nt\nr\nsa");
+    ASSERT_EQ(b, "y\n2\nmonkey\ndf");
+    ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx");
+
+    // Append, Get
+    slists.Append("a", "gh");
+    slists.Append("a", "jk");
+    slists.Append("b", "l;");
+    slists.Append("c", "rogosh");
+    slists.Get("a", &a);
+    slists.Get("b", &b);
+    slists.Get("c", &c);
+    ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
+    ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;");
+    ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");
+
+    // Compact, Get
+    db->CompactRange(nullptr, nullptr);
+    ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
+    ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;");
+    ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");
+
+    // Append, Flush, Compact, Get
+    slists.Append("b", "afcg");
+    db->Flush(rocksdb::FlushOptions());
+    db->CompactRange(nullptr, nullptr);
+    slists.Get("b", &b);
+    ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;\nafcg");
+  }
+}
+
+TEST(StringAppendOperatorTest, SimpleTestNullDelimiter) {
+  auto db = OpenDb('\0');
+  StringLists slists(db);
+
+  slists.Append("k1", "v1");
+  slists.Append("k1", "v2");
+  slists.Append("k1", "v3");
+
+  std::string res;
+  bool status = slists.Get("k1", &res);
+  ASSERT_TRUE(status);
+
+  // Construct the desired string. Default constructor doesn't like '\0' chars.
+  std::string checker("v1,v2,v3");    // Verify that the string is right size.
+  checker[2] = '\0';                  // Use null delimiter instead of comma.
+  checker[5] = '\0';
+  assert(checker.size() == 8);        // Verify it is still the correct size
+
+  // Check that the rocksdb result string matches the desired string
+  assert(res.size() == checker.size());
+  ASSERT_EQ(res, checker);
+}
+
+} // namespace rocksdb
+
+int main(int arc, char** argv) {
+  // Run with regular database
+  {
+    fprintf(stderr, "Running tests with regular db and operator.\n");
+    StringAppendOperatorTest::SetOpenDbFunction(&OpenNormalDb);
+    rocksdb::test::RunAllTests();
+  }
+
+  // Run with TTL
+  {
+    fprintf(stderr, "Running tests with ttl db and generic operator.\n");
+    StringAppendOperatorTest::SetOpenDbFunction(&OpenTtlDb);
+    rocksdb::test::RunAllTests();
+  }
+
+  return 0;
+}
diff --git a/utilities/merge_operators/uint64add.cc b/utilities/merge_operators/uint64add.cc
new file mode 100644 (file)
index 0000000..9d78651
--- /dev/null
@@ -0,0 +1,65 @@
+#include <memory>
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+using namespace rocksdb;
+
+namespace { // anonymous namespace
+
+// A 'model' merge operator with uint64 addition semantics
+// Implemented as an AssociativeMergeOperator for simplicity and example.
+class UInt64AddOperator : public AssociativeMergeOperator {
+ public:
+  virtual bool Merge(const Slice& key,
+                     const Slice* existing_value,
+                     const Slice& value,
+                     std::string* new_value,
+                     Logger* logger) const override {
+    uint64_t orig_value = 0;
+    if (existing_value){
+      orig_value = DecodeInteger(*existing_value, logger);
+    }
+    uint64_t operand = DecodeInteger(value, logger);
+
+    assert(new_value);
+    new_value->clear();
+    PutFixed64(new_value, orig_value + operand);
+
+    return true;  // Return true always since corruption will be treated as 0
+  }
+
+  virtual const char* Name() const override {
+    return "UInt64AddOperator";
+  }
+
+ private:
+  // Takes the string and decodes it into a uint64_t
+  // On error, prints a message and returns 0
+  uint64_t DecodeInteger(const Slice& value, Logger* logger) const {
+    uint64_t result = 0;
+
+    if (value.size() == sizeof(uint64_t)) {
+      result = DecodeFixed64(value.data());
+    } else if (logger != nullptr) {
+      // If value is corrupted, treat it as 0
+      Log(logger, "uint64 value corruption, size: %zu > %zu",
+          value.size(), sizeof(uint64_t));
+    }
+
+    return result;
+  }
+
+};
+
+}
+
+namespace rocksdb {
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateUInt64AddOperator() {
+  return std::make_shared<UInt64AddOperator>();
+}
+
+}
diff --git a/utilities/redis/README b/utilities/redis/README
new file mode 100644 (file)
index 0000000..8b17bc0
--- /dev/null
@@ -0,0 +1,14 @@
+This folder defines a REDIS-style interface for Rocksdb.
+Right now it is written as a simple tag-on in the rocksdb::RedisLists class.
+It implements Redis Lists, and supports only the "non-blocking operations".
+
+Internally, the set of lists are stored in a rocksdb database, mapping keys to
+values. Each "value" is the list itself, storing a sequence of "elements".
+Each element is stored as a 32-bit-integer, followed by a sequence of bytes.
+The 32-bit-integer represents the length of the element (that is, the number
+of bytes that follow). And then that many bytes follow.
+
+
+NOTE: This README file may be old. See the actual redis_lists.cc file for
+definitive details on the implementation. There should be a header at the top
+of that file, explaining a bit of the implementation details.
diff --git a/utilities/redis/redis_list_exception.h b/utilities/redis/redis_list_exception.h
new file mode 100644 (file)
index 0000000..d409095
--- /dev/null
@@ -0,0 +1,20 @@
+/**
+ * A simple structure for exceptions in RedisLists.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#pragma once
+#include <exception>
+
+namespace rocksdb {
+
+class RedisListException: public std::exception {
+ public:
+  const char* what() const throw() {
+    return "Invalid operation or corrupt data in Redis List.";
+  }
+};
+
+} // namespace rocksdb
diff --git a/utilities/redis/redis_list_iterator.h b/utilities/redis/redis_list_iterator.h
new file mode 100644 (file)
index 0000000..d57f8ac
--- /dev/null
@@ -0,0 +1,308 @@
+/**
+ * RedisListIterator:
+ * An abstraction over the "list" concept (e.g.: for redis lists).
+ * Provides functionality to read, traverse, edit, and write these lists.
+ *
+ * Upon construction, the RedisListIterator is given a block of list data.
+ * Internally, it stores a pointer to the data and a pointer to current item.
+ * It also stores a "result" list that will be mutated over time.
+ *
+ * Traversal and mutation are done by "forward iteration".
+ * The Push() and Skip() methods will advance the iterator to the next item.
+ * However, Push() will also "write the current item to the result".
+ * Skip() will simply move to next item, causing current item to be dropped.
+ *
+ * Upon completion, the result (accessible by WriteResult()) will be saved.
+ * All "skipped" items will be gone; all "pushed" items will remain.
+ *
+ * @throws Any of the operations may throw a RedisListException if an invalid
+ *          operation is performed or if the data is found to be corrupt.
+ *
+ * @notes By default, if WriteResult() is called part-way through iteration,
+ *        it will automatically advance the iterator to the end, and Keep()
+ *        all items that haven't been traversed yet. This may be subject
+ *        to review.
+ *
+ * @notes Can access the "current" item via GetCurrent(), and other
+ *        list-specific information such as Length().
+ *
+ * @notes The internal representation is due to change at any time. Presently,
+ *        the list is represented as follows:
+ *          - 32-bit integer header: the number of items in the list
+ *          - For each item:
+ *              - 32-bit int (n): the number of bytes representing this item
+ *              - n bytes of data: the actual data.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "redis_list_exception.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+/// An abstraction over the "list" concept.
+/// All operations may throw a RedisListException
+class RedisListIterator {
+ public:
+  /// Construct a redis-list-iterator based on data.
+  /// If the data is non-empty, it must formatted according to @notes above.
+  ///
+  /// If the data is valid, we can assume the following invariant(s):
+  ///  a) length_, num_bytes_ are set correctly.
+  ///  b) cur_byte_ always refers to the start of the current element,
+  ///       just before the bytes that specify element length.
+  ///  c) cur_elem_ is always the index of the current element.
+  ///  d) cur_elem_length_ is always the number of bytes in current element,
+  ///       excluding the 4-byte header itself.
+  ///  e) result_ will always contain data_[0..cur_byte_) and a header
+  ///  f) Whenever corrupt data is encountered or an invalid operation is
+  ///      attempted, a RedisListException will immediately be thrown.
+  RedisListIterator(const std::string& list_data)
+      : data_(list_data.data()),
+        num_bytes_(list_data.size()),
+        cur_byte_(0),
+        cur_elem_(0),
+        cur_elem_length_(0),
+        length_(0),
+        result_() {
+
+    // Initialize the result_ (reserve enough space for header)
+    InitializeResult();
+
+    // Parse the data only if it is not empty.
+    if (num_bytes_ == 0) {
+      return;
+    }
+
+    // If non-empty, but less than 4 bytes, data must be corrupt
+    if (num_bytes_ < sizeof(length_)) {
+      ThrowError("Corrupt header.");    // Will break control flow
+    }
+
+    // Good. The first bytes specify the number of elements
+    length_ = DecodeFixed32(data_);
+    cur_byte_ = sizeof(length_);
+
+    // If we have at least one element, point to that element.
+    // Also, read the first integer of the element (specifying the size),
+    //   if possible.
+    if (length_ > 0) {
+      if (cur_byte_ + sizeof(cur_elem_length_) <= num_bytes_) {
+        cur_elem_length_ = DecodeFixed32(data_+cur_byte_);
+      } else {
+        ThrowError("Corrupt data for first element.");
+      }
+    }
+
+    // At this point, we are fully set-up.
+    // The invariants described in the header should now be true.
+  }
+
+  /// Reserve some space for the result_.
+  /// Equivalent to result_.reserve(bytes).
+  void Reserve(int bytes) {
+    result_.reserve(bytes);
+  }
+
+  /// Go to next element in data file.
+  /// Also writes the current element to result_.
+  RedisListIterator& Push() {
+    WriteCurrentElement();
+    MoveNext();
+    return *this;
+  }
+
+  /// Go to next element in data file.
+  /// Drops/skips the current element. It will not be written to result_.
+  RedisListIterator& Skip() {
+    MoveNext();
+    --length_;          // One less item
+    --cur_elem_;        // We moved one forward, but index did not change
+    return *this;
+  }
+
+  /// Insert elem into the result_ (just BEFORE the current element / byte)
+  /// Note: if Done() (i.e.: iterator points to end), this will append elem.
+  void InsertElement(const Slice& elem) {
+    // Ensure we are in a valid state
+    CheckErrors();
+
+    const int kOrigSize = result_.size();
+    result_.resize(kOrigSize + SizeOf(elem));
+    EncodeFixed32(result_.data() + kOrigSize, elem.size());
+    memcpy(result_.data() + kOrigSize + sizeof(uint32_t),
+           elem.data(),
+           elem.size());
+    ++length_;
+    ++cur_elem_;
+  }
+
+  /// Access the current element, and save the result into *curElem
+  void GetCurrent(Slice* curElem) {
+    // Ensure we are in a valid state
+    CheckErrors();
+
+    // Ensure that we are not past the last element.
+    if (Done()) {
+      ThrowError("Invalid dereferencing.");
+    }
+
+    // Dereference the element
+    *curElem = Slice(data_+cur_byte_+sizeof(cur_elem_length_),
+                     cur_elem_length_);
+  }
+
+  // Number of elements
+  int Length() const {
+    return length_;
+  }
+
+  // Number of bytes in the final representation (i.e: WriteResult().size())
+  int Size() const {
+    // result_ holds the currently written data
+    // data_[cur_byte..num_bytes-1] is the remainder of the data
+    return result_.size() + (num_bytes_ - cur_byte_);
+  }
+
+  // Reached the end?
+  bool Done() const {
+    return cur_byte_ >= num_bytes_ || cur_elem_ >= length_;
+  }
+
+  /// Returns a string representing the final, edited, data.
+  /// Assumes that all bytes of data_ in the range [0,cur_byte_) have been read
+  ///  and that result_ contains this data.
+  /// The rest of the data must still be written.
+  /// So, this method ADVANCES THE ITERATOR TO THE END before writing.
+  Slice WriteResult() {
+    CheckErrors();
+
+    // The header should currently be filled with dummy data (0's)
+    // Correctly update the header.
+    // Note, this is safe since result_ is a vector (guaranteed contiguous)
+    EncodeFixed32(&result_[0],length_);
+
+    // Append the remainder of the data to the result.
+    result_.insert(result_.end(),data_+cur_byte_, data_ +num_bytes_);
+
+    // Seek to end of file
+    cur_byte_ = num_bytes_;
+    cur_elem_ = length_;
+    cur_elem_length_ = 0;
+
+    // Return the result
+    return Slice(result_.data(),result_.size());
+  }
+
+ public: // Static public functions
+
+  /// An upper-bound on the amount of bytes needed to store this element.
+  /// This is used to hide representation information from the client.
+  /// E.G. This can be used to compute the bytes we want to Reserve().
+  static uint32_t SizeOf(const Slice& elem) {
+    // [Integer Length . Data]
+    return sizeof(uint32_t) + elem.size();
+  }
+
+ private: // Private functions
+
+  /// Initializes the result_ string.
+  /// It will fill the first few bytes with 0's so that there is
+  ///  enough space for header information when we need to write later.
+  /// Currently, "header information" means: the length (number of elements)
+  /// Assumes that result_ is empty to begin with
+  void InitializeResult() {
+    assert(result_.empty());            // Should always be true.
+    result_.resize(sizeof(uint32_t),0); // Put a block of 0's as the header
+  }
+
+  /// Go to the next element (used in Push() and Skip())
+  void MoveNext() {
+    CheckErrors();
+
+    // Check to make sure we are not already in a finished state
+    if (Done()) {
+      ThrowError("Attempting to iterate past end of list.");
+    }
+
+    // Move forward one element.
+    cur_byte_ += sizeof(cur_elem_length_) + cur_elem_length_;
+    ++cur_elem_;
+
+    // If we are at the end, finish
+    if (Done()) {
+      cur_elem_length_ = 0;
+      return;
+    }
+
+    // Otherwise, we should be able to read the new element's length
+    if (cur_byte_ + sizeof(cur_elem_length_) > num_bytes_) {
+      ThrowError("Corrupt element data.");
+    }
+
+    // Set the new element's length
+    cur_elem_length_ = DecodeFixed32(data_+cur_byte_);
+
+    return;
+  }
+
+  /// Append the current element (pointed to by cur_byte_) to result_
+  /// Assumes result_ has already been reserved appropriately.
+  void WriteCurrentElement() {
+    // First verify that the iterator is still valid.
+    CheckErrors();
+    if (Done()) {
+      ThrowError("Attempting to write invalid element.");
+    }
+
+    // Append the cur element.
+    result_.insert(result_.end(),
+                   data_+cur_byte_,
+                   data_+cur_byte_+ sizeof(uint32_t) + cur_elem_length_);
+  }
+
+  /// Will ThrowError() if neccessary.
+  /// Checks for common/ubiquitous errors that can arise after most operations.
+  /// This method should be called before any reading operation.
+  /// If this function succeeds, then we are guaranteed to be in a valid state.
+  /// Other member functions should check for errors and ThrowError() also
+  ///  if an error occurs that is specific to it even while in a valid state.
+  void CheckErrors() {
+    // Check if any crazy thing has happened recently
+    if ((cur_elem_ > length_) ||                              // Bad index
+        (cur_byte_ > num_bytes_) ||                           // No more bytes
+        (cur_byte_ + cur_elem_length_ > num_bytes_) ||        // Item too large
+        (cur_byte_ == num_bytes_ && cur_elem_ != length_) ||  // Too many items
+        (cur_elem_ == length_ && cur_byte_ != num_bytes_)) {  // Too many bytes
+      ThrowError("Corrupt data.");
+    }
+  }
+
+  /// Will throw an exception based on the passed-in message.
+  /// This function is guaranteed to STOP THE CONTROL-FLOW.
+  /// (i.e.: you do not have to call "return" after calling ThrowError)
+  void ThrowError(const char* const msg = NULL) {
+    // TODO: For now we ignore the msg parameter. This can be expanded later.
+    throw RedisListException();
+  }
+
+ private:
+  const char* const data_;      // A pointer to the data (the first byte)
+  const uint32_t num_bytes_;    // The number of bytes in this list
+
+  uint32_t cur_byte_;           // The current byte being read
+  uint32_t cur_elem_;           // The current element being read
+  uint32_t cur_elem_length_;    // The number of bytes in current element
+
+  uint32_t length_;             // The number of elements in this list
+  std::vector<char> result_;    // The output data
+};
+
+} // namespace rocksdb
diff --git a/utilities/redis/redis_lists.cc b/utilities/redis/redis_lists.cc
new file mode 100644 (file)
index 0000000..50c544a
--- /dev/null
@@ -0,0 +1,551 @@
+/**
+ * A (persistent) Redis API built using the rocksdb backend.
+ * Implements Redis Lists as described on: http://redis.io/commands#list
+ *
+ * @throws All functions may throw a RedisListException on error/corruption.
+ *
+ * @notes Internally, the set of lists is stored in a rocksdb database,
+ *        mapping keys to values. Each "value" is the list itself, storing
+ *        some kind of internal representation of the data. All the
+ *        representation details are handled by the RedisListIterator class.
+ *        The present file should be oblivious to the representation details,
+ *        handling only the client (Redis) API, and the calls to rocksdb.
+ *
+ * @TODO  Presently, all operations take at least O(NV) time where
+ *        N is the number of elements in the list, and V is the average
+ *        number of bytes per value in the list. So maybe, with merge operator
+ *        we can improve this to an optimal O(V) amortized time, since we
+ *        wouldn't have to read and re-write the entire list.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#include "redis_lists.h"
+
+#include <iostream>
+#include <memory>
+#include <cmath>
+
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+
+namespace rocksdb
+{
+
+/// Constructors
+
+// Open (optionally after wiping) the rocksdb database at db_path that backs
+// all the lists. db_path is remembered in db_name_ for DestroyDB.
+RedisLists::RedisLists(const std::string& db_path,
+                       Options options, bool destructive)
+    : put_option_(),
+      get_option_() {
+
+  // Store the name of the database
+  db_name_ = db_path;
+
+  // If destructive, destroy the DB before re-opening it.
+  if (destructive) {
+    DestroyDB(db_name_, Options());
+  }
+
+  // Now open and deal with the db
+  DB* db;
+  Status s = DB::Open(options, db_name_, &db);
+  if (!s.ok()) {
+    std::cerr << "ERROR " << s.ToString() << std::endl;
+    assert(false);
+    // NOTE(review): assert is a no-op under NDEBUG, so a failed open falls
+    // through and db_ below wraps an indeterminate pointer -- confirm whether
+    // this should throw RedisListException instead.
+  }
+
+  db_ = std::unique_ptr<DB>(db);
+}
+
+
+/// Accessors
+
+// Number of elements in the list associated with key
+//   : throws RedisListException
+int RedisLists::Length(const std::string& key) {
+  // Extract the string data representing the list.
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Return the length
+  RedisListIterator it(data);
+  return it.Length();
+}
+
+// Get the element at the specified index in the (list: key)
+// Returns <empty> ("") on out-of-bounds
+//   : throws RedisListException
+bool RedisLists::Index(const std::string& key, int32_t index,
+                       std::string* result) {
+  // Extract the string data representing the list.
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Handle REDIS negative indices (from the end); fast iff Length() takes O(1)
+  if (index < 0) {
+    index = Length(key) - (-index);  //replace (-i) with (N-i).
+  }
+
+  // Iterate through the list until the desired index is found.
+  int curIndex = 0;
+  RedisListIterator it(data);
+  while(curIndex < index && !it.Done()) {
+    ++curIndex;
+    it.Skip();
+  }
+
+  // If we actually found the index
+  if (curIndex == index && !it.Done()) {
+    Slice elem;
+    it.GetCurrent(&elem);
+    if (result != NULL) {
+      *result = elem.ToString();
+    }
+
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Return a truncated version of the list.
+// First, negative values for first/last are interpreted as "end of list".
+// So, if first == -1, then it is re-set to index: (Length(key) - 1)
+// Then, return exactly those indices i such that first <= i <= last.
+//   : throws RedisListException
+std::vector<std::string> RedisLists::Range(const std::string& key,
+                                           int32_t first, int32_t last) {
+  // Extract the string data representing the list.
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Handle negative bounds (-1 means last element, etc.)
+  int listLen = Length(key);
+  if (first < 0) {
+    first = listLen - (-first);           // Replace (-x) with (N-x)
+  }
+  if (last < 0) {
+    last = listLen - (-last);
+  }
+
+  // Verify bounds (and truncate the range so that it is valid)
+  first = std::max(first, 0);
+  last = std::min(last, listLen-1);
+  int len = std::max(last-first+1, 0);
+
+  // Initialize the resulting list
+  std::vector<std::string> result(len);
+
+  // Traverse the list and update the vector
+  int curIdx = 0;
+  Slice elem;
+  for (RedisListIterator it(data); !it.Done() && curIdx<=last; it.Skip()) {
+    if (first <= curIdx && curIdx <= last) {
+      it.GetCurrent(&elem);
+      result[curIdx-first].assign(elem.data(),elem.size());
+    }
+
+    ++curIdx;
+  }
+
+  // Return the result. Might be empty
+  return result;
+}
+
+// Print the (list: key) out to stdout. For debugging mostly. Public for now.
+void RedisLists::Print(const std::string& key) {
+  // Extract the string data representing the list.
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Iterate through the list and print the items
+  Slice elem;
+  for (RedisListIterator it(data); !it.Done(); it.Skip()) {
+    it.GetCurrent(&elem);
+    std::cout << "ITEM " << elem.ToString() << std::endl;
+  }
+
+  //Now print the byte data
+  RedisListIterator it(data);
+  std::cout << "==Printing data==" << std::endl;
+  std::cout << data.size() << std::endl;
+  std::cout << it.Size() << " " << it.Length() << std::endl;
+  // Note: WriteResult() advances this iterator to the end as a side effect.
+  Slice result = it.WriteResult();
+  // NOTE(review): result.data() is not NUL-terminated; streaming it as a
+  // C-string can read past the buffer -- confirm (debug-only code path).
+  std::cout << result.data() << std::endl;
+  // Debug toggle: set to false to skip the per-byte dump below.
+  if (true) {
+    std::cout << "size: " << result.size() << std::endl;
+    const char* val = result.data();
+    for(int i=0; i<(int)result.size(); ++i) {
+      std::cout << (int)val[i] << " " << (val[i]>=32?val[i]:' ') << std::endl;
+    }
+    std::cout << std::endl;
+  }
+}
+
+/// Insert/Update Functions
+/// Note: The "real" insert function is private. See below.
+
+// InsertBefore and InsertAfter are simply wrappers around the Insert function.
+int RedisLists::InsertBefore(const std::string& key, const std::string& pivot,
+                             const std::string& value) {
+  return Insert(key, pivot, value, false);
+}
+
+int RedisLists::InsertAfter(const std::string& key, const std::string& pivot,
+                            const std::string& value) {
+  return Insert(key, pivot, value, true);
+}
+
+// Prepend value onto beginning of (list: key)
+//   : throws RedisListException
+int RedisLists::PushLeft(const std::string& key, const std::string& value) {
+  // Get the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Construct the result
+  RedisListIterator it(data);
+  it.Reserve(it.Size() + it.SizeOf(value));
+  it.InsertElement(value);
+
+  // Push the data back to the db and return the length
+  db_->Put(put_option_, key, it.WriteResult());
+  return it.Length();
+}
+
+// Append value onto end of (list: key)
+// TODO: Make this O(1) time. Might require MergeOperator.
+//   : throws RedisListException
+int RedisLists::PushRight(const std::string& key, const std::string& value) {
+  // Get the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Create an iterator to the data and seek to the end.
+  RedisListIterator it(data);
+  it.Reserve(it.Size() + it.SizeOf(value));
+  while (!it.Done()) {
+    it.Push();    // Write each element as we go
+  }
+
+  // Insert the new element at the current position (the end)
+  it.InsertElement(value);
+
+  // Push it back to the db, and return length
+  db_->Put(put_option_, key, it.WriteResult());
+  return it.Length();
+}
+
+// Set (list: key)[idx] = val. Return true on success, false on fail.
+//   : throws RedisListException
+bool RedisLists::Set(const std::string& key, int32_t index,
+                     const std::string& value) {
+  // Get the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Handle negative index for REDIS (meaning -index from end of list)
+  if (index < 0) {
+    index = Length(key) - (-index);
+  }
+
+  // Iterate through the list until we find the element we want
+  int curIndex = 0;
+  RedisListIterator it(data);
+  it.Reserve(it.Size() + it.SizeOf(value));  // Over-estimate is fine
+  while(curIndex < index && !it.Done()) {
+    it.Push();
+    ++curIndex;
+  }
+
+  // If not found, return false (this occurs when index was invalid)
+  if (it.Done() || curIndex != index) {
+    return false;
+  }
+
+  // Write the new element value, and drop the previous element value
+  it.InsertElement(value);
+  it.Skip();
+
+  // Write the data to the database
+  // Check status, since it needs to return true/false guarantee
+  Status s = db_->Put(put_option_, key, it.WriteResult());
+
+  // Success
+  return s.ok();
+}
+
+/// Delete / Remove / Pop functions
+
+// Trim (list: key) so that it will only contain the indices from start..stop
+//  Invalid indices will not generate an error, just empty,
+//  or the portion of the list that fits in this interval
+//   : throws RedisListException
+bool RedisLists::Trim(const std::string& key, int32_t start, int32_t stop) {
+  // Get the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Handle negative indices in REDIS
+  int listLen = Length(key);
+  if (start < 0) {
+    start = listLen - (-start);
+  }
+  if (stop < 0) {
+    stop = listLen - (-stop);
+  }
+
+  // Truncate bounds to only fit in the list
+  start = std::max(start, 0);
+  stop = std::min(stop, listLen-1);
+
+  // Construct an iterator for the list. Drop all undesired elements.
+  int curIndex = 0;
+  RedisListIterator it(data);
+  it.Reserve(it.Size());          // Over-estimate
+  while(!it.Done()) {
+    // If not within the range, just skip the item (drop it).
+    // Otherwise, continue as usual.
+    if (start <= curIndex && curIndex <= stop) {
+      it.Push();
+    } else {
+      it.Skip();
+    }
+
+    // Increment the current index
+    ++curIndex;
+  }
+
+  // Write the (possibly empty) result to the database
+  Status s = db_->Put(put_option_, key, it.WriteResult());
+
+  // Return true as long as the write succeeded
+  return s.ok();
+}
+
+// Return and remove the first element in the list (or "" if empty)
+//   : throws RedisListException
+bool RedisLists::PopLeft(const std::string& key, std::string* result) {
+  // Get the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Point to first element in the list (if it exists), and get its value/size
+  RedisListIterator it(data);
+  if (it.Length() > 0) {            // Proceed only if list is non-empty
+    Slice elem;
+    it.GetCurrent(&elem);           // Store the value of the first element
+    it.Reserve(it.Size() - it.SizeOf(elem));
+    it.Skip();                      // DROP the first item and move to next
+
+    // Update the db
+    db_->Put(put_option_, key, it.WriteResult());
+
+    // Return the value
+    if (result != NULL) {
+      *result = elem.ToString();
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Remove and return the last element in the list (or "" if empty)
+// TODO: Make this O(1). Might require MergeOperator.
+//   : throws RedisListException
+bool RedisLists::PopRight(const std::string& key, std::string* result) {
+  // Extract the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Construct an iterator to the data and move to last element
+  RedisListIterator it(data);
+  it.Reserve(it.Size());
+  int len = it.Length();
+  int curIndex = 0;
+  while(curIndex < (len-1) && !it.Done()) {
+    it.Push();
+    ++curIndex;
+  }
+
+  // Extract and drop/skip the last element
+  if (curIndex == len-1) {
+    assert(!it.Done());         // Sanity check. Should not have ended here.
+
+    // Extract and pop the element
+    Slice elem;
+    it.GetCurrent(&elem);       // Save value of element.
+    it.Skip();                  // Skip the element
+
+    // Write the result to the database
+    db_->Put(put_option_, key, it.WriteResult());
+
+    // Return the value
+    if (result != NULL) {
+      *result = elem.ToString();
+    }
+    return true;
+  } else {
+    // Must have been an empty list
+    assert(it.Done() && len==0 && curIndex == 0);
+    return false;
+  }
+}
+
+// Remove the (first or last) "num" occurrences of value in (list: key)
+//   : throws RedisListException
+int RedisLists::Remove(const std::string& key, int32_t num,
+                       const std::string& value) {
+  // Negative num ==> RemoveLast; Positive num ==> Remove First
+  if (num < 0) {
+    return RemoveLast(key, -num, value);
+  } else if (num > 0) {
+    return RemoveFirst(key, num, value);
+  } else {
+    return RemoveFirst(key, Length(key), value);
+  }
+}
+
+// Remove the first "num" occurrences of value in (list: key).
+//   : throws RedisListException
+int RedisLists::RemoveFirst(const std::string& key, int32_t num,
+                            const std::string& value) {
+  // Ensure that the number is positive
+  assert(num >= 0);
+
+  // Extract the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Traverse the list, appending all but the desired occurrences of value
+  int numSkipped = 0;         // Keep track of the number of times value is seen
+  Slice elem;
+  RedisListIterator it(data);
+  it.Reserve(it.Size());
+  while (!it.Done()) {
+    it.GetCurrent(&elem);
+
+    if (elem == value && numSkipped < num) {
+      // Drop this item if desired
+      it.Skip();
+      ++numSkipped;
+    } else {
+      // Otherwise keep the item and proceed as normal
+      it.Push();
+    }
+  }
+
+  // Put the result back to the database
+  db_->Put(put_option_, key, it.WriteResult());
+
+  // Return the number of elements removed
+  return numSkipped;
+}
+
+
+// Remove the last "num" occurrences of value in (list: key).
+// TODO: I traverse the list 2x. Make faster. Might require MergeOperator.
+//   : throws RedisListException
+int RedisLists::RemoveLast(const std::string& key, int32_t num,
+                           const std::string& value) {
+  // Ensure that the number is positive
+  assert(num >= 0);
+
+  // Extract the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Temporary variable to hold the "current element" in the blocks below
+  Slice elem;
+
+  // Pass 1: count the total number of occurrences of value.
+  // (This iterator is discarded afterwards, so Skip()'s internal
+  //  bookkeeping is irrelevant here.)
+  int totalOccs = 0;
+  for (RedisListIterator it(data); !it.Done(); it.Skip()) {
+    it.GetCurrent(&elem);
+    if (elem == value) {
+      ++totalOccs;
+    }
+  }
+
+  // Construct an iterator to the data. Reserve enough space for the result.
+  // Every removed element equals `value`, so this reservation is exact.
+  RedisListIterator it(data);
+  int bytesRemoved = std::min(num,totalOccs)*it.SizeOf(value);
+  it.Reserve(it.Size() - bytesRemoved);
+
+  // Pass 2: traverse the list, appending all but the desired occurrences.
+  // Note: "Drop the last k occurrences" is equivalent to
+  //  "keep only the first n-k occurrences", where n is total occurrences.
+  int numKept = 0;          // Keep track of the number of times value is kept
+  while(!it.Done()) {
+    it.GetCurrent(&elem);
+
+    // If we are within the deletion range and equal to value, drop it.
+    // Otherwise, append/keep/push it.
+    if (elem == value) {
+      if (numKept < totalOccs - num) {
+        it.Push();
+        ++numKept;
+      } else {
+        it.Skip();
+      }
+    } else {
+      // Always append the others
+      it.Push();
+    }
+  }
+
+  // Put the result back to the database
+  db_->Put(put_option_, key, it.WriteResult());
+
+  // Return the number of elements removed
+  return totalOccs - numKept;
+}
+
+/// Private functions
+
+// Insert element value into (list: key), right before/after
+//  the first occurrence of pivot
+// NOTE(review): when the pivot is absent nothing is written and the
+//  unchanged length is returned; redis LINSERT reports -1 in that case --
+//  confirm whether callers depend on the current behavior.
+//   : throws RedisListException
+int RedisLists::Insert(const std::string& key, const std::string& pivot,
+                       const std::string& value, bool insert_after) {
+  // Get the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Construct an iterator to the data and reserve enough space for result.
+  RedisListIterator it(data);
+  it.Reserve(it.Size() + it.SizeOf(value));
+
+  // Iterate through the list until we find the element we want
+  Slice elem;
+  bool found = false;
+  while(!it.Done() && !found) {
+    it.GetCurrent(&elem);
+
+    // When we find the element, insert the element and mark found
+    if (elem == pivot) {                // Found it!
+      found = true;
+      if (insert_after == true) {       // Keep the pivot too, so the new
+        it.Push();                      //  element lands just after it.
+      }
+      it.InsertElement(value);
+    } else {
+      it.Push();
+    }
+
+  }
+
+  // Put the data (string) into the database.
+  // (WriteResult() appends any untraversed tail of the original list.)
+  if (found) {
+    db_->Put(put_option_, key, it.WriteResult());
+  }
+
+  // Returns the new (possibly unchanged) length of the list
+  return it.Length();
+}
+
+
+}
diff --git a/utilities/redis/redis_lists.h b/utilities/redis/redis_lists.h
new file mode 100644 (file)
index 0000000..8c149bc
--- /dev/null
@@ -0,0 +1,106 @@
+/**
+ * A (persistent) Redis API built using the rocksdb backend.
+ * Implements Redis Lists as described on: http://redis.io/commands#list
+ *
+ * @throws All functions may throw a RedisListException
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#pragma once
+
+#include <string>
+#include "rocksdb/db.h"
+#include "redis_list_iterator.h"
+#include "redis_list_exception.h"
+
+namespace rocksdb {
+
+/// The Redis functionality (see http://redis.io/commands#list)
+/// All functions may THROW a RedisListException
+/// NOTE(review): no synchronization is visible in this class; concurrent
+/// use presumably requires external serialization -- confirm.
+class RedisLists {
+ public: // Constructors / Destructors
+  /// Construct a new RedisLists database, with name/path of db.
+  /// Will clear the database on open iff destructive is true (default false).
+  /// Otherwise, it will restore saved changes.
+  /// May throw RedisListException
+  RedisLists(const std::string& db_path,
+             Options options, bool destructive = false);
+
+ public:  // Accessors
+  /// The number of items in (list: key)
+  int Length(const std::string& key);
+
+  /// Search the list for the (index)'th item (0-based) in (list:key)
+  /// A negative index indicates: "from end-of-list"
+  /// If index is within range: return true, and return the value in *result.
+  /// If (index < -length OR index>=length), then index is out of range:
+  ///   return false (and *result is left unchanged)
+  /// May throw RedisListException
+  bool Index(const std::string& key, int32_t index,
+             std::string* result);
+
+  /// Return (list: key)[first..last] (inclusive)
+  /// May throw RedisListException
+  std::vector<std::string> Range(const std::string& key,
+                                 int32_t first, int32_t last);
+
+  /// Prints the entire (list: key), for debugging.
+  void Print(const std::string& key);
+
+ public: // Insert/Update
+  /// Insert value before/after pivot in (list: key). Return the length.
+  /// NOTE(review): when pivot is not found the unchanged length is returned;
+  /// redis LINSERT returns -1 in that case -- confirm intent.
+  /// May throw RedisListException
+  int InsertBefore(const std::string& key, const std::string& pivot,
+                   const std::string& value);
+  int InsertAfter(const std::string& key, const std::string& pivot,
+                  const std::string& value);
+
+  /// Push / Insert value at beginning/end of the list. Return the length.
+  /// May throw RedisListException
+  int PushLeft(const std::string& key, const std::string& value);
+  int PushRight(const std::string& key, const std::string& value);
+
+  /// Set (list: key)[idx] = val. Return true on success, false on fail
+  /// May throw RedisListException
+  bool Set(const std::string& key, int32_t index, const std::string& value);
+
+ public: // Delete / Remove / Pop / Trim
+  /// Trim (list: key) so that it will only contain the indices from start..stop
+  /// Returns true on success
+  /// May throw RedisListException
+  bool Trim(const std::string& key, int32_t start, int32_t stop);
+
+  /// If list is empty, return false and leave *result unchanged.
+  /// Else, remove the first/last elem, store it in *result, and return true
+  bool PopLeft(const std::string& key, std::string* result);  // First
+  bool PopRight(const std::string& key, std::string* result); // Last
+
+  /// Remove the first (or last) num occurrences of value from the list (key)
+  /// Return the number of elements removed.
+  /// May throw RedisListException
+  int Remove(const std::string& key, int32_t num,
+             const std::string& value);
+  int RemoveFirst(const std::string& key, int32_t num,
+                  const std::string& value);
+  int RemoveLast(const std::string& key, int32_t num,
+                 const std::string& value);
+
+ private: // Private Functions
+  /// Calls InsertBefore or InsertAfter
+  int Insert(const std::string& key, const std::string& pivot,
+             const std::string& value, bool insert_after);
+ private:
+  std::string db_name_;       // The actual database name/path
+  WriteOptions put_option_;
+  ReadOptions get_option_;
+
+  /// The backend rocksdb database.
+  /// Map : key --> list
+  ///       where a list is a sequence of elements
+  ///       and an element is a 4-byte integer (n), followed by n bytes of data
+  std::unique_ptr<DB> db_;
+};
+
+} // namespace rocksdb
diff --git a/utilities/redis/redis_lists_test.cc b/utilities/redis/redis_lists_test.cc
new file mode 100644 (file)
index 0000000..0600e0e
--- /dev/null
@@ -0,0 +1,875 @@
+/**
+ * A test harness for the Redis API built on rocksdb.
+ *
+ * USAGE: Build with: "make redis_test" (in rocksdb directory).
+ *        Run unit tests with: "./redis_test"
+ *        Manual/Interactive user testing: "./redis_test -m"
+ *        Manual user testing + restart database: "./redis_test -m -d"
+ *
+ * TODO:  Add LARGE random test cases to verify efficiency and scalability
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+
+#include <iostream>
+#include <cctype>
+
+#include "redis_lists.h"
+#include "util/testharness.h"
+#include "util/random.h"
+
+using namespace rocksdb;
+using namespace std;
+
+namespace rocksdb {
+
+class RedisListsTest {
+ public:
+  static const string kDefaultDbName;
+  static Options options;
+
+  RedisListsTest() {
+    options.create_if_missing = true;
+  }
+};
+
+const string RedisListsTest::kDefaultDbName = "/tmp/redisdefaultdb/";
+Options RedisListsTest::options = Options();
+
+// operator== and operator<< are defined below for vectors (lists)
+// Needed for ASSERT_EQ
+
+void AssertListEq(const std::vector<std::string>& result,
+                  const std::vector<std::string>& expected_result) {
+  ASSERT_EQ(result.size(), expected_result.size());
+  for (size_t i = 0; i < result.size(); ++i) {
+    ASSERT_EQ(result[i], expected_result[i]);
+  }
+}
+
+// PushRight, Length, Index, Range
+TEST(RedisListsTest, SimpleTest) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // Simple PushRight (should return the new length each time)
+  ASSERT_EQ(redis.PushRight("k1", "v1"), 1);
+  ASSERT_EQ(redis.PushRight("k1", "v2"), 2);
+  ASSERT_EQ(redis.PushRight("k1", "v3"), 3);
+
+  // Check Length and Index() functions
+  ASSERT_EQ(redis.Length("k1"), 3);        // Check length
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "v1");   // Check valid indices
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "v2");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "v3");
+
+  // Check range function and vectors
+  std::vector<std::string> result = redis.Range("k1", 0, 2);   // Get the list
+  std::vector<std::string> expected_result(3);
+  expected_result[0] = "v1";
+  expected_result[1] = "v2";
+  expected_result[2] = "v3";
+  AssertListEq(result, expected_result);
+}
+
+// PushLeft, Length, Index, Range
+TEST(RedisListsTest, SimpleTest2) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // Simple PushLeft (pushed in reverse order, so the list reads v1, v2, v3)
+  ASSERT_EQ(redis.PushLeft("k1", "v3"), 1);
+  ASSERT_EQ(redis.PushLeft("k1", "v2"), 2);
+  ASSERT_EQ(redis.PushLeft("k1", "v1"), 3);
+
+  // Check Length and Index() functions
+  ASSERT_EQ(redis.Length("k1"), 3);        // Check length
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "v1");   // Check valid indices
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "v2");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "v3");
+
+  // Check range function and vectors
+  std::vector<std::string> result = redis.Range("k1", 0, 2);   // Get the list
+  std::vector<std::string> expected_result(3);
+  expected_result[0] = "v1";
+  expected_result[1] = "v2";
+  expected_result[2] = "v3";
+  AssertListEq(result, expected_result);
+}
+
+// Exhaustive test of the Index() function
+TEST(RedisListsTest, IndexTest) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // Empty-list Index check (should return false and leave tempv unmodified)
+  tempv = "yo";
+  ASSERT_TRUE(!redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "yo");
+  ASSERT_TRUE(!redis.Index("fda", 3, &tempv));
+  ASSERT_EQ(tempv, "yo");
+  ASSERT_TRUE(!redis.Index("random", -12391, &tempv));
+  ASSERT_EQ(tempv, "yo");
+
+  // Simple Pushes (will yield: [v6, v4, v4, v1, v2, v3])
+  redis.PushRight("k1", "v1");
+  redis.PushRight("k1", "v2");
+  redis.PushRight("k1", "v3");
+  redis.PushLeft("k1", "v4");
+  redis.PushLeft("k1", "v4");
+  redis.PushLeft("k1", "v6");
+
+  // Simple, non-negative indices
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "v6");
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "v4");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "v4");
+  ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+  ASSERT_EQ(tempv, "v1");
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "v2");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "v3");
+
+  // Negative indices
+  ASSERT_TRUE(redis.Index("k1", -6, &tempv));
+  ASSERT_EQ(tempv, "v6");
+  ASSERT_TRUE(redis.Index("k1", -5, &tempv));
+  ASSERT_EQ(tempv, "v4");
+  ASSERT_TRUE(redis.Index("k1", -4, &tempv));
+  ASSERT_EQ(tempv, "v4");
+  ASSERT_TRUE(redis.Index("k1", -3, &tempv));
+  ASSERT_EQ(tempv, "v1");
+  ASSERT_TRUE(redis.Index("k1", -2, &tempv));
+  ASSERT_EQ(tempv, "v2");
+  ASSERT_TRUE(redis.Index("k1", -1, &tempv));
+  ASSERT_EQ(tempv, "v3");
+
+  // Out of bounds (returns false, no crash)
+  ASSERT_TRUE(!redis.Index("k1", 6, &tempv));
+  ASSERT_TRUE(!redis.Index("k1", 123219, &tempv));
+  ASSERT_TRUE(!redis.Index("k1", -7, &tempv));
+  ASSERT_TRUE(!redis.Index("k1", -129, &tempv));
+}
+
+
+// Exhaustive test of the Range() function
+TEST(RedisListsTest, RangeTest) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // Simple Pushes (will yield: [v6, v4, v4, v1, v2, v3])
+  redis.PushRight("k1", "v1");
+  redis.PushRight("k1", "v2");
+  redis.PushRight("k1", "v3");
+  redis.PushLeft("k1", "v4");
+  redis.PushLeft("k1", "v4");
+  redis.PushLeft("k1", "v6");
+
+  // Sanity check (check the length;  make sure it's 6)
+  ASSERT_EQ(redis.Length("k1"), 6);
+
+  // Simple range
+  std::vector<std::string> res = redis.Range("k1", 1, 4);
+  ASSERT_EQ((int)res.size(), 4);
+  ASSERT_EQ(res[0], "v4");
+  ASSERT_EQ(res[1], "v4");
+  ASSERT_EQ(res[2], "v1");
+  ASSERT_EQ(res[3], "v2");
+
+  // Negative indices (i.e.: measured from the end)
+  res = redis.Range("k1", 2, -1);
+  ASSERT_EQ((int)res.size(), 4);
+  ASSERT_EQ(res[0], "v4");
+  ASSERT_EQ(res[1], "v1");
+  ASSERT_EQ(res[2], "v2");
+  ASSERT_EQ(res[3], "v3");
+
+  res = redis.Range("k1", -6, -4);
+  ASSERT_EQ((int)res.size(), 3);
+  ASSERT_EQ(res[0], "v6");
+  ASSERT_EQ(res[1], "v4");
+  ASSERT_EQ(res[2], "v4");
+
+  res = redis.Range("k1", -1, 5);
+  ASSERT_EQ((int)res.size(), 1);
+  ASSERT_EQ(res[0], "v3");
+
+  // Partial / Broken indices
+  res = redis.Range("k1", -3, 1000000);
+  ASSERT_EQ((int)res.size(), 3);
+  ASSERT_EQ(res[0], "v1");
+  ASSERT_EQ(res[1], "v2");
+  ASSERT_EQ(res[2], "v3");
+
+  res = redis.Range("k1", -1000000, 1);
+  ASSERT_EQ((int)res.size(), 2);
+  ASSERT_EQ(res[0], "v6");
+  ASSERT_EQ(res[1], "v4");
+
+  // Invalid indices
+  res = redis.Range("k1", 7, 9);
+  ASSERT_EQ((int)res.size(), 0);
+
+  res = redis.Range("k1", -8, -7);
+  ASSERT_EQ((int)res.size(), 0);
+
+  res = redis.Range("k1", 3, 2);
+  ASSERT_EQ((int)res.size(), 0);
+
+  res = redis.Range("k1", 5, -2);
+  ASSERT_EQ((int)res.size(), 0);
+
+  // Range matches Index
+  res = redis.Range("k1", -6, -4);
+  ASSERT_TRUE(redis.Index("k1", -6, &tempv));
+  ASSERT_EQ(tempv, res[0]);
+  ASSERT_TRUE(redis.Index("k1", -5, &tempv));
+  ASSERT_EQ(tempv, res[1]);
+  ASSERT_TRUE(redis.Index("k1", -4, &tempv));
+  ASSERT_EQ(tempv, res[2]);
+
+  // Last check
+  res = redis.Range("k1", 0, -6);
+  ASSERT_EQ((int)res.size(), 1);
+  ASSERT_EQ(res[0], "v6");
+}
+
+// Exhaustive test for InsertBefore(), and InsertAfter()
+TEST(RedisListsTest, InsertTest) {
+  RedisLists redis(kDefaultDbName, options, true);
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // Insert on empty list (return 0, and do not crash)
+  ASSERT_EQ(redis.InsertBefore("k1", "non-exist", "a"), 0);
+  ASSERT_EQ(redis.InsertAfter("k1", "other-non-exist", "c"), 0);
+  ASSERT_EQ(redis.Length("k1"), 0);
+
+  // Push some preliminary stuff [g, f, e, d, c, b, a]
+  redis.PushLeft("k1", "a");
+  redis.PushLeft("k1", "b");
+  redis.PushLeft("k1", "c");
+  redis.PushLeft("k1", "d");
+  redis.PushLeft("k1", "e");
+  redis.PushLeft("k1", "f");
+  redis.PushLeft("k1", "g");
+  ASSERT_EQ(redis.Length("k1"), 7);
+
+  // Test InsertBefore
+  int newLength = redis.InsertBefore("k1", "e", "hello");
+  ASSERT_EQ(newLength, 8);
+  ASSERT_EQ(redis.Length("k1"), newLength);
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "f");
+  ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+  ASSERT_EQ(tempv, "e");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "hello");
+
+  // Test InsertAfter
+  newLength =  redis.InsertAfter("k1", "c", "bye");
+  ASSERT_EQ(newLength, 9);
+  ASSERT_EQ(redis.Length("k1"), newLength);
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "bye");
+
+  // Test bad value on InsertBefore
+  newLength = redis.InsertBefore("k1", "yo", "x");
+  ASSERT_EQ(newLength, 9);
+  ASSERT_EQ(redis.Length("k1"), newLength);
+
+  // Test bad value on InsertAfter
+  newLength = redis.InsertAfter("k1", "xxxx", "y");
+  ASSERT_EQ(newLength, 9);
+  ASSERT_EQ(redis.Length("k1"), newLength);
+
+  // Test InsertBefore beginning
+  newLength = redis.InsertBefore("k1", "g", "begggggggggggggggg");
+  ASSERT_EQ(newLength, 10);
+  ASSERT_EQ(redis.Length("k1"), newLength);
+
+  // Test InsertAfter end
+  newLength = redis.InsertAfter("k1", "a", "enddd");
+  ASSERT_EQ(newLength, 11);
+  ASSERT_EQ(redis.Length("k1"), newLength);
+
+  // Make sure nothing weird happened.
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "begggggggggggggggg");
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "g");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "f");
+  ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+  ASSERT_EQ(tempv, "hello");
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "e");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "d");
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "c");
+  ASSERT_TRUE(redis.Index("k1", 7, &tempv));
+  ASSERT_EQ(tempv, "bye");
+  ASSERT_TRUE(redis.Index("k1", 8, &tempv));
+  ASSERT_EQ(tempv, "b");
+  ASSERT_TRUE(redis.Index("k1", 9, &tempv));
+  ASSERT_EQ(tempv, "a");
+  ASSERT_TRUE(redis.Index("k1", 10, &tempv));
+  ASSERT_EQ(tempv, "enddd");
+}
+
+// Exhaustive test of Set function
+TEST(RedisListsTest, SetTest) {
+  RedisLists redis(kDefaultDbName, options, true);
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // Set on empty list (return false, and do not crash)
+  ASSERT_EQ(redis.Set("k1", 7, "a"), false);
+  ASSERT_EQ(redis.Set("k1", 0, "a"), false);
+  ASSERT_EQ(redis.Set("k1", -49, "cx"), false);
+  ASSERT_EQ(redis.Length("k1"), 0);
+
+  // Push some preliminary stuff [g, f, e, d, c, b, a]
+  redis.PushLeft("k1", "a");
+  redis.PushLeft("k1", "b");
+  redis.PushLeft("k1", "c");
+  redis.PushLeft("k1", "d");
+  redis.PushLeft("k1", "e");
+  redis.PushLeft("k1", "f");
+  redis.PushLeft("k1", "g");
+  ASSERT_EQ(redis.Length("k1"), 7);
+
+  // Test Regular Set
+  ASSERT_TRUE(redis.Set("k1", 0, "0"));
+  ASSERT_TRUE(redis.Set("k1", 3, "3"));
+  ASSERT_TRUE(redis.Set("k1", 6, "6"));
+  ASSERT_TRUE(redis.Set("k1", 2, "2"));
+  ASSERT_TRUE(redis.Set("k1", 5, "5"));
+  ASSERT_TRUE(redis.Set("k1", 1, "1"));
+  ASSERT_TRUE(redis.Set("k1", 4, "4"));
+
+  ASSERT_EQ(redis.Length("k1"), 7); // Size should not change
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "0");
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "1");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "2");
+  ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+  ASSERT_EQ(tempv, "3");
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "4");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "5");
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "6");
+
+  // Set with negative indices
+  ASSERT_TRUE(redis.Set("k1", -7, "a"));
+  ASSERT_TRUE(redis.Set("k1", -4, "d"));
+  ASSERT_TRUE(redis.Set("k1", -1, "g"));
+  ASSERT_TRUE(redis.Set("k1", -5, "c"));
+  ASSERT_TRUE(redis.Set("k1", -2, "f"));
+  ASSERT_TRUE(redis.Set("k1", -6, "b"));
+  ASSERT_TRUE(redis.Set("k1", -3, "e"));
+
+  ASSERT_EQ(redis.Length("k1"), 7); // Size should not change
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "a");
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "b");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "c");
+  ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+  ASSERT_EQ(tempv, "d");
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "e");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "f");
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "g");
+
+  // Bad indices (just out-of-bounds / off-by-one check)
+  ASSERT_EQ(redis.Set("k1", -8, "off-by-one in negative index"), false);
+  ASSERT_EQ(redis.Set("k1", 7, "off-by-one-error in positive index"), false);
+  ASSERT_EQ(redis.Set("k1", 43892, "big random index should fail"), false);
+  ASSERT_EQ(redis.Set("k1", -21391, "large negative index should fail"), false);
+
+  // One last check (to make sure nothing weird happened)
+  ASSERT_EQ(redis.Length("k1"), 7); // Size should not change
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "a");
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "b");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "c");
+  ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+  ASSERT_EQ(tempv, "d");
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "e");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "f");
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "g");
+}
+
+// Testing Insert, Push, and Set, in a mixed environment
+TEST(RedisListsTest, InsertPushSetTest) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // A series of pushes and insertions
+  // Will result in [newbegin, z, a, aftera, x, newend]
+  // Also, check the return value sometimes (should return length)
+  int lengthCheck;
+  lengthCheck = redis.PushLeft("k1", "a");
+  ASSERT_EQ(lengthCheck, 1);
+  redis.PushLeft("k1", "z");
+  redis.PushRight("k1", "x");
+  lengthCheck = redis.InsertAfter("k1", "a", "aftera");
+  ASSERT_EQ(lengthCheck , 4);
+  redis.InsertBefore("k1", "z", "newbegin");  // InsertBefore beginning of list
+  redis.InsertAfter("k1", "x", "newend");     // InsertAfter end of list
+
+  // Check
+  std::vector<std::string> res = redis.Range("k1", 0, -1); // Get the list
+  ASSERT_EQ((int)res.size(), 6);
+  ASSERT_EQ(res[0], "newbegin");
+  ASSERT_EQ(res[5], "newend");
+  ASSERT_EQ(res[3], "aftera");
+
+  // Testing duplicate values/pivots (multiple occurrences of 'a')
+  ASSERT_TRUE(redis.Set("k1", 0, "a"));     // [a, z, a, aftera, x, newend]
+  redis.InsertAfter("k1", "a", "happy");    // [a, happy, z, a, aftera, ...]
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "happy");
+  redis.InsertBefore("k1", "a", "sad");     // [sad, a, happy, z, a, aftera, ...]
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "sad");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "happy");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "aftera");
+  redis.InsertAfter("k1", "a", "zz");         // [sad, a, zz, happy, z, a, aftera, ...]
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "zz");
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "aftera");
+  ASSERT_TRUE(redis.Set("k1", 1, "nota"));    // [sad, nota, zz, happy, z, a, ...]
+  redis.InsertBefore("k1", "a", "ba");        // [sad, nota, zz, happy, z, ba, a, ...]
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "z");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "ba");
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "a");
+
+  // We currently have: [sad, nota, zz, happy, z, ba, a, aftera, x, newend]
+  // redis.Print("k1");   // manually check
+
+  // Test Inserting before/after non-existent values
+  lengthCheck = redis.Length("k1"); // Ensure that the length doesn't change
+  ASSERT_EQ(lengthCheck, 10);
+  ASSERT_EQ(redis.InsertBefore("k1", "non-exist", "randval"), lengthCheck);
+  ASSERT_EQ(redis.InsertAfter("k1", "nothing", "a"), lengthCheck);
+  ASSERT_EQ(redis.InsertAfter("randKey", "randVal", "ranValue"), 0); // Empty
+  ASSERT_EQ(redis.Length("k1"), lengthCheck); // The length should not change
+
+  // Simply Test the Set() function
+  redis.Set("k1", 5, "ba2");
+  redis.InsertBefore("k1", "ba2", "beforeba2");
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "z");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "beforeba2");
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "ba2");
+  ASSERT_TRUE(redis.Index("k1", 7, &tempv));
+  ASSERT_EQ(tempv, "a");
+
+  // We have: [sad, nota, zz, happy, z, beforeba2, ba2, a, aftera, x, newend]
+
+  // Set() with negative indices
+  redis.Set("k1", -1, "endprank");
+  ASSERT_TRUE(!redis.Index("k1", 11, &tempv));
+  ASSERT_TRUE(redis.Index("k1", 10, &tempv));
+  ASSERT_EQ(tempv, "endprank"); // Ensure Set worked correctly
+  redis.Set("k1", -11, "t");
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "t");
+
+  // Test out of bounds Set
+  ASSERT_EQ(redis.Set("k1", -12, "ssd"), false);
+  ASSERT_EQ(redis.Set("k1", 11, "sasd"), false);
+  ASSERT_EQ(redis.Set("k1", 1200, "big"), false);
+}
+
+// Testing Trim, Pop
+TEST(RedisListsTest, TrimPopTest) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // A series of pushes and insertions
+  // Will result in [newbegin, z, a, aftera, x, newend]
+  redis.PushLeft("k1", "a");
+  redis.PushLeft("k1", "z");
+  redis.PushRight("k1", "x");
+  redis.InsertBefore("k1", "z", "newbegin");    // InsertBefore start of list
+  redis.InsertAfter("k1", "x", "newend");       // InsertAfter end of list
+  redis.InsertAfter("k1", "a", "aftera");
+
+  // Simple PopLeft/Right test
+  ASSERT_TRUE(redis.PopLeft("k1", &tempv));
+  ASSERT_EQ(tempv, "newbegin");
+  ASSERT_EQ(redis.Length("k1"), 5);
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "z");
+  ASSERT_TRUE(redis.PopRight("k1", &tempv));
+  ASSERT_EQ(tempv, "newend");
+  ASSERT_EQ(redis.Length("k1"), 4);
+  ASSERT_TRUE(redis.Index("k1", -1, &tempv));
+  ASSERT_EQ(tempv, "x");
+
+  // Now have: [z, a, aftera, x]
+
+  // Test Trim
+  ASSERT_TRUE(redis.Trim("k1", 0, -1));       // [z, a, aftera, x] (do nothing)
+  ASSERT_EQ(redis.Length("k1"), 4);
+  ASSERT_TRUE(redis.Trim("k1", 0, 2));                     // [z, a, aftera]
+  ASSERT_EQ(redis.Length("k1"), 3);
+  ASSERT_TRUE(redis.Index("k1", -1, &tempv));
+  ASSERT_EQ(tempv, "aftera");
+  ASSERT_TRUE(redis.Trim("k1", 1, 1));                     // [a]
+  ASSERT_EQ(redis.Length("k1"), 1);
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "a");
+
+  // Test out of bounds (empty) trim
+  ASSERT_TRUE(redis.Trim("k1", 1, 0));
+  ASSERT_EQ(redis.Length("k1"), 0);
+
+  // Popping from an empty list (should return false without error)
+  ASSERT_TRUE(!redis.PopLeft("k1", &tempv));
+  ASSERT_TRUE(!redis.PopRight("k1", &tempv));
+  ASSERT_TRUE(redis.Trim("k1", 0, 5));
+
+  // Exhaustive Trim test (negative and invalid indices)
+  // Will start in [newbegin, z, a, aftera, x, newend]
+  redis.PushLeft("k1", "a");
+  redis.PushLeft("k1", "z");
+  redis.PushRight("k1", "x");
+  redis.InsertBefore("k1", "z", "newbegin");    // InsertBefore start of list
+  redis.InsertAfter("k1", "x", "newend");       // InsertAfter end of list
+  redis.InsertAfter("k1", "a", "aftera");
+  ASSERT_TRUE(redis.Trim("k1", -6, -1));                     // Should do nothing
+  ASSERT_EQ(redis.Length("k1"), 6);
+  ASSERT_TRUE(redis.Trim("k1", 1, -2));
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "z");
+  ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+  ASSERT_EQ(tempv, "x");
+  ASSERT_EQ(redis.Length("k1"), 4);
+  ASSERT_TRUE(redis.Trim("k1", -3, -2));
+  ASSERT_EQ(redis.Length("k1"), 2);
+}
+
+// Testing Remove, RemoveFirst, RemoveLast
+TEST(RedisListsTest, RemoveTest) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // A series of pushes and insertions
+  // Will result in [newbegin, z, a, aftera, x, newend, a, a]
+  redis.PushLeft("k1", "a");
+  redis.PushLeft("k1", "z");
+  redis.PushRight("k1", "x");
+  redis.InsertBefore("k1", "z", "newbegin");    // InsertBefore start of list
+  redis.InsertAfter("k1", "x", "newend");       // InsertAfter end of list
+  redis.InsertAfter("k1", "a", "aftera");
+  redis.PushRight("k1", "a");
+  redis.PushRight("k1", "a");
+
+  // Verify
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "newbegin");
+  ASSERT_TRUE(redis.Index("k1", -1, &tempv));
+  ASSERT_EQ(tempv, "a");
+
+  // Check Remove with a positive count (removes the first two occurrences of 'a')
+  // Results in [newbegin, z, aftera, x, newend, a]
+  int numRemoved = redis.Remove("k1", 2, "a");
+  ASSERT_EQ(numRemoved, 2);
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "newbegin");
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "z");
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "newend");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "a");
+  ASSERT_EQ(redis.Length("k1"), 6);
+
+  // Repopulate some stuff
+  // Results in: [x, x, x, x, x, newbegin, z, x, aftera, x, newend, a, x]
+  redis.PushLeft("k1", "x");
+  redis.PushLeft("k1", "x");
+  redis.PushLeft("k1", "x");
+  redis.PushLeft("k1", "x");
+  redis.PushLeft("k1", "x");
+  redis.PushRight("k1", "x");
+  redis.InsertAfter("k1", "z", "x");
+
+  // Test removal from end
+  numRemoved = redis.Remove("k1", -2, "x");
+  ASSERT_EQ(numRemoved, 2);
+  ASSERT_TRUE(redis.Index("k1", 8, &tempv));
+  ASSERT_EQ(tempv, "aftera");
+  ASSERT_TRUE(redis.Index("k1", 9, &tempv));
+  ASSERT_EQ(tempv, "newend");
+  ASSERT_TRUE(redis.Index("k1", 10, &tempv));
+  ASSERT_EQ(tempv, "a");
+  ASSERT_TRUE(!redis.Index("k1", 11, &tempv));
+  numRemoved = redis.Remove("k1", -2, "x");
+  ASSERT_EQ(numRemoved, 2);
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "newbegin");
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "aftera");
+
+  // We now have: [x, x, x, x, newbegin, z, aftera, newend, a]
+  ASSERT_EQ(redis.Length("k1"), 9);
+  ASSERT_TRUE(redis.Index("k1", -1, &tempv));
+  ASSERT_EQ(tempv, "a");
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "x");
+
+  // Test over-shooting (removing more than there exists)
+  numRemoved = redis.Remove("k1", -9000, "x");
+  ASSERT_EQ(numRemoved , 4);    // Only really removed 4
+  ASSERT_EQ(redis.Length("k1"), 5);
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "newbegin");
+  numRemoved = redis.Remove("k1", 1, "x");
+  ASSERT_EQ(numRemoved, 0);
+
+  // Try removing ALL!
+  numRemoved = redis.Remove("k1", 0, "newbegin");   // REMOVE 0 will remove all!
+  ASSERT_EQ(numRemoved, 1);
+
+  // Removal from an empty-list
+  ASSERT_TRUE(redis.Trim("k1", 1, 0));
+  numRemoved = redis.Remove("k1", 1, "z");
+  ASSERT_EQ(numRemoved, 0);
+}
+
+
+// Test Multiple keys and Persistence
+TEST(RedisListsTest, PersistenceMultiKeyTest) {
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // Block one: populate a single key in the database
+  {
+    RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+    // A series of pushes and insertions
+    // Will result in [newbegin, z, a, aftera, x, newend, a, a]
+    redis.PushLeft("k1", "a");
+    redis.PushLeft("k1", "z");
+    redis.PushRight("k1", "x");
+    redis.InsertBefore("k1", "z", "newbegin");    // InsertBefore start of list
+    redis.InsertAfter("k1", "x", "newend");       // InsertAfter end of list
+    redis.InsertAfter("k1", "a", "aftera");
+    redis.PushRight("k1", "a");
+    redis.PushRight("k1", "a");
+
+    ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+    ASSERT_EQ(tempv, "aftera");
+  }
+
+  // Block two: make sure changes were saved and add some other key
+  {
+    RedisLists redis(kDefaultDbName, options, false); // Persistent, non-destructive
+
+    // Check
+    ASSERT_EQ(redis.Length("k1"), 8);
+    ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+    ASSERT_EQ(tempv, "aftera");
+
+    redis.PushRight("k2", "randomkey");
+    redis.PushLeft("k2", "sas");
+
+    redis.PopLeft("k1", &tempv);
+  }
+
+  // Block three: Verify the changes from block 2
+  {
+    RedisLists redis(kDefaultDbName, options, false); // Persistent, non-destructive
+
+    // Check
+    ASSERT_EQ(redis.Length("k1"), 7);
+    ASSERT_EQ(redis.Length("k2"), 2);
+    ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+    ASSERT_EQ(tempv, "z");
+    ASSERT_TRUE(redis.Index("k2", -2, &tempv));
+    ASSERT_EQ(tempv, "sas");
+  }
+}
+
+/// THE manual REDIS TEST begins here
+/// THIS WILL ONLY OCCUR IF YOU RUN: ./redis_test -m
+
+void MakeUpper(std::string* const s) {
+  int len = s->length();
+  for(int i=0; i<len; ++i) {
+    (*s)[i] = toupper((*s)[i]); // C-version defined in <ctype.h>
+  }
+}
+
+/// Allows the user to enter in REDIS commands into the command-line.
+/// This is useful for manual / interactive testing / debugging.
+///  Use destructive=true to clean the database before use.
+///  Use destructive=false to remember the previous state (i.e.: persistent)
+/// Should be called from main function.
+int manual_redis_test(bool destructive){
+  RedisLists redis(RedisListsTest::kDefaultDbName,
+                   RedisListsTest::options,
+                   destructive);
+
+  // TODO: Right now, please use spaces to separate each word.
+  //  In actual redis, you can use quotes to specify compound values
+  //  Example: RPUSH mylist "this is a compound value"
+
+  std::string command;
+  while(true) {
+    cin >> command;
+    MakeUpper(&command);
+
+    if (command == "LINSERT") {
+      std::string k, t, p, v;
+      cin >> k >> t >> p >> v;
+      MakeUpper(&t);
+      if (t=="BEFORE") {
+        std::cout << redis.InsertBefore(k, p, v) << std::endl;
+      } else if (t=="AFTER") {
+        std::cout << redis.InsertAfter(k, p, v) << std::endl;
+      }
+    } else if (command == "LPUSH") {
+      std::string k, v;
+      std::cin >> k >> v;
+      redis.PushLeft(k, v);
+    } else if (command == "RPUSH") {
+      std::string k, v;
+      std::cin >> k >> v;
+      redis.PushRight(k, v);
+    } else if (command == "LPOP") {
+      std::string k;
+      std::cin >> k;
+      string res;
+      redis.PopLeft(k, &res);
+      std::cout << res << std::endl;
+    } else if (command == "RPOP") {
+      std::string k;
+      std::cin >> k;
+      string res;
+      redis.PopRight(k, &res);
+      std::cout << res << std::endl;
+    } else if (command == "LREM") {
+      std::string k;
+      int amt;
+      std::string v;
+
+      std::cin >> k >> amt >> v;
+      std::cout << redis.Remove(k, amt, v) << std::endl;
+    } else if (command == "LLEN") {
+      std::string k;
+      std::cin >> k;
+      std::cout << redis.Length(k) << std::endl;
+    } else if (command == "LRANGE") {
+      std::string k;
+      int i, j;
+      std::cin >> k >> i >> j;
+      std::vector<std::string> res = redis.Range(k, i, j);
+      for (auto it = res.begin(); it != res.end(); ++it) {
+        std::cout << " " << (*it);
+      }
+      std::cout << std::endl;
+    } else if (command == "LTRIM") {
+      std::string k;
+      int i, j;
+      std::cin >> k >> i >> j;
+      redis.Trim(k, i, j);
+    } else if (command == "LSET") {
+      std::string k;
+      int idx;
+      std::string v;
+      cin >> k >> idx >> v;
+      redis.Set(k, idx, v);
+    } else if (command == "LINDEX") {
+      std::string k;
+      int idx;
+      std::cin >> k >> idx;
+      string res;
+      redis.Index(k, idx, &res);
+      std::cout << res << std::endl;
+    } else if (command == "PRINT") {      // Added by Deon
+      std::string k;
+      cin >> k;
+      redis.Print(k);
+    } else if (command == "QUIT") {
+      return 0;
+    } else {
+      std::cout << "unknown command: " << command << std::endl;
+    }
+  }
+}
+
+} // namespace rocksdb
+
+
+// USAGE: "./redis_test" for default (unit tests)
+//        "./redis_test -m" for manual testing (redis command api)
+//        "./redis_test -m -d" for destructive manual test (erase db before use)
+
+
+// Check for "want" argument in the argument list
+bool found_arg(int argc, char* argv[], const char* want){
+  for(int i=1; i<argc; ++i){
+    if (strcmp(argv[i], want) == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Will run unit tests.
+// However, if -m is specified, it will do user manual/interactive testing
+// -m -d is manual and destructive (will clear the database before use)
+int main(int argc, char* argv[]) {
+  if (found_arg(argc, argv, "-m")) {
+    bool destructive = found_arg(argc, argv, "-d");
+    return rocksdb::manual_redis_test(destructive);
+  } else {
+    return rocksdb::test::RunAllTests();
+  }
+}
+
diff --git a/utilities/ttl/db_ttl.cc b/utilities/ttl/db_ttl.cc
new file mode 100644 (file)
index 0000000..5b70493
--- /dev/null
@@ -0,0 +1,219 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "utilities/ttl/db_ttl.h"
+#include "db/filename.h"
+#include "util/coding.h"
+#include "include/rocksdb/env.h"
+#include "include/rocksdb/iterator.h"
+
+namespace rocksdb {
+
+// Installs the TTL wrappers into *options:
+// - a user-supplied compaction filter is wrapped in a TtlCompactionFilter
+//   (heap-allocated here; deleted in ~DBWithTTL), otherwise the factory is
+//   wrapped in a TtlCompactionFilterFactory -- exactly one branch runs;
+// - any merge operator is wrapped so timestamps are stripped/re-added
+//   around user merges.
+void DBWithTTL::SanitizeOptions(int32_t ttl, Options* options) {
+  if (options->compaction_filter) {
+    // Ownership of this wrapper passes to the db; see ~DBWithTTL().
+    options->compaction_filter =
+        new TtlCompactionFilter(ttl, options->compaction_filter);
+  } else {
+    options->compaction_filter_factory =
+        std::shared_ptr<CompactionFilterFactory>(new TtlCompactionFilterFactory(
+            ttl, options->compaction_filter_factory));
+  }
+
+  if (options->merge_operator) {
+    options->merge_operator.reset(
+        new TtlMergeOperator(options->merge_operator));
+  }
+}
+
+// Open the db inside DBWithTTL because options needs pointer to its ttl
+DBWithTTL::DBWithTTL(DB* db) : StackableDB(db) {}
+
+DBWithTTL::~DBWithTTL() {
+  // Deletes the TtlCompactionFilter allocated by SanitizeOptions(); when the
+  // factory path was taken this pointer is null and delete is a no-op.
+  delete GetOptions().compaction_filter;
+}
+
+Status UtilityDB::OpenTtlDB(
+    const Options& options,
+    const std::string& dbname,
+    StackableDB** dbptr,
+    int32_t ttl,
+    bool read_only) {
+  Status st;
+  Options options_to_open = options;
+  DBWithTTL::SanitizeOptions(ttl, &options_to_open);
+  DB* db;
+
+  if (read_only) {
+    st = DB::OpenForReadOnly(options_to_open, dbname, &db);
+  } else {
+    st = DB::Open(options_to_open, dbname, &db);
+  }
+  if (st.ok()) {
+    *dbptr = new DBWithTTL(db);
+  } else {
+    delete db;
+  }
+  return st;
+}
+
+// Gives back the current time by delegating to the default Env; forwards
+// Env's status to the caller.
+Status DBWithTTL::GetCurrentTime(int64_t& curtime) {
+  return Env::Default()->GetCurrentTime(&curtime);
+}
+
+// Appends the current timestamp to the string: val_with_ts = val + 4-byte TS.
+// Returns a non-OK status (and appends nothing) if the current time could
+// not be obtained; OK on success.
+Status DBWithTTL::AppendTS(const Slice& val, std::string& val_with_ts) {
+  val_with_ts.reserve(kTSLength + val.size());
+  char ts_string[kTSLength];
+  int64_t curtime;
+  Status st = GetCurrentTime(curtime);
+  if (!st.ok()) {
+    return st;
+  }
+  // The 64-bit time is deliberately truncated to 32 bits -- kTSLength is 4
+  // bytes and kMaxTimestamp documents the 2038 ceiling.
+  EncodeFixed32(ts_string, (int32_t)curtime);
+  val_with_ts.append(val.data(), val.size());
+  val_with_ts.append(ts_string, kTSLength);
+  return st;
+}
+
+// Returns corruption if the length of the string is lesser than timestamp, or
+// timestamp refers to a time lesser than ttl-feature release time
+Status DBWithTTL::SanityCheckTimestamp(const Slice& str) {
+  if (str.size() < kTSLength) {
+    return Status::Corruption("Error: value's length less than timestamp's\n");
+  }
+  // Checks that TS is not lesser than kMinTimestamp
+  // Guards against corruption & normal database opened incorrectly in ttl mode
+  int32_t timestamp_value =
+    DecodeFixed32(str.data() + str.size() - kTSLength);
+  if (timestamp_value < kMinTimestamp){
+    return Status::Corruption("Error: Timestamp < ttl feature release time!\n");
+  }
+  return Status::OK();
+}
+
+// Checks if the value is stale according to the TTL provided: stale iff
+// (stored timestamp + ttl) is earlier than now. Non-positive ttl, or
+// failure to read the clock, means the value is treated as fresh.
+bool DBWithTTL::IsStale(const Slice& value, int32_t ttl) {
+  if (ttl <= 0) { // Data is fresh if TTL is non-positive
+    return false;
+  }
+  int64_t curtime;
+  if (!GetCurrentTime(curtime).ok()) {
+    return false; // Treat the data as fresh if could not get current time
+  }
+  int32_t timestamp_value =
+    DecodeFixed32(value.data() + value.size() - kTSLength);
+  // Widen before adding: timestamp_value + ttl in 32-bit arithmetic can
+  // overflow (UB) for large ttl values and wrongly classify fresh data.
+  return (static_cast<int64_t>(timestamp_value) + ttl) < curtime;
+}
+
+// Removes the trailing kTSLength-byte timestamp from *str in place.
+// Returns Corruption when the string is too short to hold a timestamp.
+Status DBWithTTL::StripTS(std::string* str) {
+  if (str->length() < kTSLength) {
+    return Status::Corruption("Bad timestamp in key-value");
+  }
+  // Chop off the timestamp suffix.
+  str->resize(str->length() - kTSLength);
+  return Status::OK();
+}
+
+// Put is routed through a one-entry WriteBatch so that Write() -- which
+// rewrites every Put/Merge with a timestamp suffix -- stamps this value too.
+Status DBWithTTL::Put(const WriteOptions& opt, const Slice& key,
+                      const Slice& val) {
+  WriteBatch batch;
+  batch.Put(key, val);
+  return Write(opt, &batch);
+}
+
+// Reads the raw value for 'key', validates its embedded timestamp, and
+// strips the timestamp before the value reaches the caller. The first
+// failing step's status is returned.
+Status DBWithTTL::Get(const ReadOptions& options,
+                      const Slice& key,
+                      std::string* value) {
+  Status st = db_->Get(options, key, value);
+  if (st.ok()) {
+    st = SanityCheckTimestamp(*value);
+  }
+  if (st.ok()) {
+    st = StripTS(value);
+  }
+  return st;
+}
+
+std::vector<Status> DBWithTTL::MultiGet(const ReadOptions& options,
+                                        const std::vector<Slice>& keys,
+                                        std::vector<std::string>* values) {
+  return std::vector<Status>(keys.size(),
+                             Status::NotSupported("MultiGet not\
+                               supported with TTL"));
+}
+
+// Same contract as DB::KeyMayExist, but when a value was actually fetched
+// its timestamp is validated and stripped; a short/corrupt timestamp makes
+// this answer false (key treated as not present).
+bool DBWithTTL::KeyMayExist(const ReadOptions& options,
+                            const Slice& key,
+                            std::string* value,
+                            bool* value_found) {
+  bool ret = db_->KeyMayExist(options, key, value, value_found);
+  // Only touch *value when the underlying db reports it actually filled it.
+  if (ret && value != nullptr && value_found != nullptr && *value_found) {
+    if (!SanityCheckTimestamp(*value).ok() || !StripTS(value).ok()) {
+      return false;
+    }
+  }
+  return ret;
+}
+
+// Merge is routed through a one-entry WriteBatch so that Write() appends
+// the current timestamp to the operand, mirroring Put().
+Status DBWithTTL::Merge(const WriteOptions& opt,
+                        const Slice& key,
+                        const Slice& value) {
+  WriteBatch batch;
+  batch.Merge(key, value);
+  return Write(opt, &batch);
+}
+
+// Rewrites the caller's batch into updates_ttl: every Put/Merge value gets
+// the current timestamp appended; Delete and log-data records pass through
+// unchanged. If stamping any record fails, nothing is written to the db and
+// a failure status (the last one recorded) is returned.
+Status DBWithTTL::Write(const WriteOptions& opts, WriteBatch* updates) {
+  class Handler : public WriteBatch::Handler {
+   public:
+    WriteBatch updates_ttl;          // the rewritten, timestamped batch
+    Status batch_rewrite_status;     // non-OK if any AppendTS failed
+    virtual void Put(const Slice& key, const Slice& value) {
+      std::string value_with_ts;
+      Status st = AppendTS(value, value_with_ts);  // static member of DBWithTTL
+      if (!st.ok()) {
+        batch_rewrite_status = st;
+      } else {
+        updates_ttl.Put(key, value_with_ts);
+      }
+    }
+    virtual void Merge(const Slice& key, const Slice& value) {
+      std::string value_with_ts;
+      Status st = AppendTS(value, value_with_ts);
+      if (!st.ok()) {
+        batch_rewrite_status = st;
+      } else {
+        updates_ttl.Merge(key, value_with_ts);
+      }
+    }
+    virtual void Delete(const Slice& key) {
+      // Deletes need no timestamp; copy through verbatim.
+      updates_ttl.Delete(key);
+    }
+    virtual void LogData(const Slice& blob) {
+      updates_ttl.PutLogData(blob);
+    }
+  };
+  Handler handler;
+  updates->Iterate(&handler);
+  if (!handler.batch_rewrite_status.ok()) {
+    return handler.batch_rewrite_status;
+  } else {
+    return db_->Write(opts, &(handler.updates_ttl));
+  }
+}
+
+// Wraps the underlying iterator so values are returned without their
+// timestamp suffix; the TtlIterator takes ownership of the inner iterator.
+Iterator* DBWithTTL::NewIterator(const ReadOptions& opts) {
+  return new TtlIterator(db_->NewIterator(opts));
+}
+
+// Test hook: simulates a crash by destroying the wrapped db without a clean
+// close. Downcasts db_ to DBImpl -- valid only for dbs opened via OpenTtlDB.
+void DBWithTTL::TEST_Destroy_DBWithTtl() {
+  ((DBImpl*) db_)->TEST_Destroy_DBImpl();
+}
+
+}  // namespace rocksdb
diff --git a/utilities/ttl/db_ttl.h b/utilities/ttl/db_ttl.h
new file mode 100644 (file)
index 0000000..2fdc664
--- /dev/null
@@ -0,0 +1,315 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/merge_operator.h"
+#include "utilities/utility_db.h"
+#include "db/db_impl.h"
+
+namespace rocksdb {
+
+// A StackableDB that expires key-values 'ttl' seconds after they were last
+// written. Values are stored with a 4-byte timestamp suffix; reads validate
+// and strip it, writes append it, and compaction drops stale entries.
+class DBWithTTL : public StackableDB {
+ public:
+  // Installs TTL wrappers (compaction filter/factory, merge operator) into
+  // *options; called by UtilityDB::OpenTtlDB before opening the db.
+  static void SanitizeOptions(int32_t ttl, Options* options);
+
+  explicit DBWithTTL(DB* db);
+
+  virtual ~DBWithTTL();
+
+  virtual Status Put(const WriteOptions& o, const Slice& key,
+                     const Slice& val) override;
+
+  virtual Status Get(const ReadOptions& options, const Slice& key,
+                     std::string* value) override;
+
+  // Not supported: returns NotSupported for every key.
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options, const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           const Slice& key,
+                           std::string* value,
+                           bool* value_found = nullptr) override;
+
+  virtual Status Merge(const WriteOptions& options, const Slice& key,
+                       const Slice& value) override;
+
+  // Rewrites the batch so all Put/Merge values carry a timestamp suffix.
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+
+  virtual Iterator* NewIterator(const ReadOptions& opts) override;
+
+  // Simulate a db crash, no elegant closing of database.
+  void TEST_Destroy_DBWithTtl();
+
+  virtual DB* GetBaseDB() {
+    return db_;
+  }
+
+  // Helpers for the timestamp suffix; static so the iterator, compaction
+  // filter, and merge operator classes below can share them.
+  static bool IsStale(const Slice& value, int32_t ttl);
+
+  static Status AppendTS(const Slice& val, std::string& val_with_ts);
+
+  static Status SanityCheckTimestamp(const Slice& str);
+
+  static Status StripTS(std::string* str);
+
+  static Status GetCurrentTime(int64_t& curtime);
+
+  static const uint32_t kTSLength = sizeof(int32_t); // size of timestamp
+
+  static const int32_t kMinTimestamp = 1368146402; // 05/09/2013:5:40PM GMT-8
+
+  static const int32_t kMaxTimestamp = 2147483647; // 01/18/2038:7:14PM GMT-8
+};
+
+class TtlIterator : public Iterator {
+
+ public:
+  explicit TtlIterator(Iterator* iter)
+    : iter_(iter) {
+    assert(iter_);
+  }
+
+  ~TtlIterator() {
+    delete iter_;
+  }
+
+  bool Valid() const {
+    return iter_->Valid();
+  }
+
+  void SeekToFirst() {
+    iter_->SeekToFirst();
+  }
+
+  void SeekToLast() {
+    iter_->SeekToLast();
+  }
+
+  void Seek(const Slice& target) {
+    iter_->Seek(target);
+  }
+
+  void Next() {
+    iter_->Next();
+  }
+
+  void Prev() {
+    iter_->Prev();
+  }
+
+  Slice key() const {
+    return iter_->key();
+  }
+
+  int32_t timestamp() const {
+    return DecodeFixed32(
+      iter_->value().data() + iter_->value().size() - DBWithTTL::kTSLength);
+  }
+
+  Slice value() const {
+    //TODO: handle timestamp corruption like in general iterator semantics
+    assert(DBWithTTL::SanityCheckTimestamp(iter_->value()).ok());
+    Slice trimmed_value = iter_->value();
+    trimmed_value.size_ -= DBWithTTL::kTSLength;
+    return trimmed_value;
+  }
+
+  Status status() const {
+    return iter_->status();
+  }
+
+ private:
+  Iterator* iter_;
+};
+
+// Compaction filter that first drops values whose timestamp makes them
+// stale, then (if a user filter exists) delegates to it with the timestamp
+// stripped from the value. Optionally owns a filter obtained from the
+// user's factory.
+class TtlCompactionFilter : public CompactionFilter {
+
+ public:
+  TtlCompactionFilter(
+      int32_t ttl,
+      const CompactionFilter* user_comp_filter,
+      std::unique_ptr<const CompactionFilter>
+      user_comp_filter_from_factory = nullptr)
+    : ttl_(ttl),
+      user_comp_filter_(user_comp_filter),
+      user_comp_filter_from_factory_(std::move(user_comp_filter_from_factory)) {
+    // Unlike the merge operator, compaction filter is necessary for TTL, hence
+    // this would be called even if user doesn't specify any compaction-filter
+    if (!user_comp_filter_) {
+      user_comp_filter_ = user_comp_filter_from_factory_.get();
+    }
+  }
+
+  virtual bool Filter(int level,
+                      const Slice& key,
+                      const Slice& old_val,
+                      std::string* new_val,
+                      bool* value_changed) const override {
+    // TTL staleness takes precedence over the user filter.
+    if (DBWithTTL::IsStale(old_val, ttl_)) {
+      return true;
+    }
+    if (user_comp_filter_ == nullptr) {
+      return false;
+    }
+    assert(old_val.size() >= DBWithTTL::kTSLength);
+    // The user filter never sees the timestamp suffix.
+    Slice old_val_without_ts(old_val.data(),
+                             old_val.size() - DBWithTTL::kTSLength);
+    if (user_comp_filter_->Filter(level, key, old_val_without_ts, new_val,
+                                  value_changed)) {
+      return true;
+    }
+    // If the user replaced the value, re-attach the OLD timestamp so the
+    // stored format stays intact (the entry's age is not reset here).
+    if (*value_changed) {
+      new_val->append(old_val.data() + old_val.size() - DBWithTTL::kTSLength,
+                      DBWithTTL::kTSLength);
+    }
+    return false;
+  }
+
+  virtual const char* Name() const override {
+    return "Delete By TTL";
+  }
+
+ private:
+  int32_t ttl_;
+  const CompactionFilter* user_comp_filter_;
+  std::unique_ptr<const CompactionFilter> user_comp_filter_from_factory_;
+};
+
+class TtlCompactionFilterFactory : public CompactionFilterFactory {
+  public:
+    TtlCompactionFilterFactory(
+        int32_t ttl,
+        std::shared_ptr<CompactionFilterFactory> comp_filter_factory)
+    : ttl_(ttl),
+      user_comp_filter_factory_(comp_filter_factory) { }
+
+    virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+        const CompactionFilter::Context& context) {
+      return std::unique_ptr<TtlCompactionFilter>(
+        new TtlCompactionFilter(
+          ttl_,
+          nullptr,
+          std::move(user_comp_filter_factory_->CreateCompactionFilter(context))
+        )
+      );
+    }
+
+    virtual const char* Name() const override {
+      return "TtlCompactionFilterFactory";
+    }
+
+  private:
+    int32_t ttl_;
+    std::shared_ptr<CompactionFilterFactory> user_comp_filter_factory_;
+};
+
+// Merge operator wrapper: strips the timestamp suffix from the existing
+// value and every operand before calling the user's operator, then appends
+// a fresh current timestamp to the merged result.
+class TtlMergeOperator : public MergeOperator {
+
+ public:
+  explicit TtlMergeOperator(const std::shared_ptr<MergeOperator> merge_op)
+    : user_merge_op_(merge_op) {
+    assert(merge_op);
+  }
+
+  virtual bool FullMerge(const Slice& key,
+                         const Slice* existing_value,
+                         const std::deque<std::string>& operands,
+                         std::string* new_value,
+                         Logger* logger) const override {
+    const uint32_t ts_len = DBWithTTL::kTSLength;
+    // Any value too short to carry a timestamp is treated as a merge failure.
+    if (existing_value && existing_value->size() < ts_len) {
+      Log(logger, "Error: Could not remove timestamp from existing value.");
+      return false;
+    }
+
+    // Extract time-stamp from each operand to be passed to user_merge_op_
+    std::deque<std::string> operands_without_ts;
+    for (const auto &operand : operands) {
+      if (operand.size() < ts_len) {
+        Log(logger, "Error: Could not remove timestamp from operand value.");
+        return false;
+      }
+      operands_without_ts.push_back(operand.substr(0, operand.size() - ts_len));
+    }
+
+    // Apply the user merge operator (store result in *new_value)
+    bool good = true;
+    if (existing_value) {
+      Slice existing_value_without_ts(existing_value->data(),
+                                      existing_value->size() - ts_len);
+      good = user_merge_op_->FullMerge(key, &existing_value_without_ts,
+                                       operands_without_ts, new_value, logger);
+    } else {
+      good = user_merge_op_->FullMerge(key, nullptr, operands_without_ts,
+                                       new_value, logger);
+    }
+
+    // Return false if the user merge operator returned false
+    if (!good) {
+      return false;
+    }
+
+    // Augment the *new_value with the ttl time-stamp
+    int64_t curtime;
+    if (!DBWithTTL::GetCurrentTime(curtime).ok()) {
+      Log(logger, "Error: Could not get current time to be attached internally "
+                  "to the new value.");
+      return false;
+    } else {
+      // ts_len is a constant expression (initialized from kTSLength), so
+      // this is a fixed-size array, not a VLA.
+      char ts_string[ts_len];
+      EncodeFixed32(ts_string, (int32_t)curtime);
+      new_value->append(ts_string, ts_len);
+      return true;
+    }
+  }
+
+  virtual bool PartialMerge(const Slice& key,
+                            const Slice& left_operand,
+                            const Slice& right_operand,
+                            std::string* new_value,
+                            Logger* logger) const override {
+    const uint32_t ts_len = DBWithTTL::kTSLength;
+
+    if (left_operand.size() < ts_len || right_operand.size() < ts_len) {
+      Log(logger, "Error: Could not remove timestamp from value.");
+      return false;
+    }
+
+    // Apply the user partial-merge operator (store result in *new_value)
+    assert(new_value);
+    Slice left_without_ts(left_operand.data(), left_operand.size() - ts_len);
+    Slice right_without_ts(right_operand.data(), right_operand.size() - ts_len);
+    if (!user_merge_op_->PartialMerge(key, left_without_ts, right_without_ts,
+                                      new_value, logger)) {
+      return false;
+    }
+
+    // Augment the *new_value with the ttl time-stamp
+    int64_t curtime;
+    if (!DBWithTTL::GetCurrentTime(curtime).ok()) {
+      Log(logger, "Error: Could not get current time to be attached internally "
+                  "to the new value.");
+      return false;
+    } else {
+      char ts_string[ts_len];
+      EncodeFixed32(ts_string, (int32_t)curtime);
+      new_value->append(ts_string, ts_len);
+      return true;
+    }
+
+  }
+
+  virtual const char* Name() const override {
+    return "Merge By TTL";
+  }
+
+ private:
+  std::shared_ptr<MergeOperator> user_merge_op_;
+};
+
+}
diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc
new file mode 100644 (file)
index 0000000..8804d89
--- /dev/null
@@ -0,0 +1,505 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <memory>
+#include "rocksdb/compaction_filter.h"
+#include "utilities/utility_db.h"
+#include "util/testharness.h"
+#include "util/logging.h"
+#include <map>
+#include <unistd.h>
+
+namespace rocksdb {
+
+namespace {
+
+typedef std::map<std::string, std::string> KVMap;
+
+enum BatchOperation {
+  PUT = 0,
+  DELETE = 1
+};
+
+}
+
+// Test fixture: owns a TTL-wrapped db (db_ttl_) plus a reference kv-map
+// (kvmap_) that the helpers below write into the db and then verify -- via
+// Get, iterator, or KeyMayExist -- after sleeps and manual compactions.
+class TtlTest {
+ public:
+  TtlTest() {
+    dbname_ = test::TmpDir() + "/db_ttl";
+    options_.create_if_missing = true;
+    // ensure that compaction is kicked in to always strip timestamp from kvs
+    options_.max_grandparent_overlap_factor = 0;
+    // compaction should take place always from level0 for determinism
+    options_.max_mem_compaction_level = 0;
+    db_ttl_ = nullptr;
+    DestroyDB(dbname_, Options());
+  }
+
+  ~TtlTest() {
+    CloseTtl();
+    DestroyDB(dbname_, Options());
+  }
+
+  // Open database with TTL support when TTL not provided with db_ttl_ pointer
+  void OpenTtl() {
+    assert(db_ttl_ == nullptr); //  db should be closed before opening again
+    ASSERT_OK(UtilityDB::OpenTtlDB(options_, dbname_, &db_ttl_));
+  }
+
+  // Open database with TTL support when TTL provided with db_ttl_ pointer
+  void OpenTtl(int32_t ttl) {
+    assert(db_ttl_ == nullptr);
+    ASSERT_OK(UtilityDB::OpenTtlDB(options_, dbname_, &db_ttl_, ttl));
+  }
+
+  // Open with TestFilter compaction filter
+  void OpenTtlWithTestCompaction(int32_t ttl) {
+    options_.compaction_filter_factory =
+      std::shared_ptr<CompactionFilterFactory>(
+          new TestFilterFactory(kSampleSize_, kNewValue_));
+    OpenTtl(ttl);
+  }
+
+  // Open database with TTL support in read_only mode
+  void OpenReadOnlyTtl(int32_t ttl) {
+    assert(db_ttl_ == nullptr);
+    ASSERT_OK(UtilityDB::OpenTtlDB(options_, dbname_, &db_ttl_, ttl, true));
+  }
+
+  void CloseTtl() {
+    delete db_ttl_;
+    db_ttl_ = nullptr;
+  }
+
+  // Populates and returns a kv-map
+  void MakeKVMap(int64_t num_entries) {
+    kvmap_.clear();
+    // digits = decimal width of num_entries; keys/values get zero-padded to
+    // a fixed width so they sort lexicographically in the map and the db.
+    int digits = 1;
+    for (int dummy = num_entries; dummy /= 10 ; ++digits);
+    int digits_in_i = 1;
+    for (int64_t i = 0; i < num_entries; i++) {
+      std::string key = "key";
+      std::string value = "value";
+      // note: this also bumps at i == 0, which gives the first decade the
+      // correct padding (e.g. "key00" when num_entries == 100)
+      if (i % 10 == 0) {
+        digits_in_i++;
+      }
+      for(int j = digits_in_i; j < digits; j++) {
+        key.append("0");
+        value.append("0");
+      }
+      AppendNumberTo(&key, i);
+      AppendNumberTo(&value, i);
+      kvmap_[key] = value;
+    }
+    ASSERT_EQ((int)kvmap_.size(), num_entries);//check all insertions done
+  }
+
+  // Makes a write-batch with key-vals from kvmap_ and 'Write''s it
+  void MakePutWriteBatch(const BatchOperation* batch_ops, int num_ops) {
+    assert(num_ops <= (int)kvmap_.size());
+    static WriteOptions wopts;
+    static FlushOptions flush_opts;
+    WriteBatch batch;
+    kv_it_ = kvmap_.begin();
+    // batch_ops[i] decides whether the i-th smallest key is Put or Deleted.
+    for (int i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, kv_it_++) {
+      switch (batch_ops[i]) {
+        case PUT:
+          batch.Put(kv_it_->first, kv_it_->second);
+          break;
+        case DELETE:
+          batch.Delete(kv_it_->first);
+          break;
+        default:
+          assert(false);
+      }
+    }
+    db_ttl_->Write(wopts, &batch);
+    db_ttl_->Flush(flush_opts);
+  }
+
+  // Puts num_entries starting from start_pos_map from kvmap_ into the database
+  void PutValues(int start_pos_map, int num_entries, bool flush = true) {
+    assert(db_ttl_);
+    ASSERT_LE(start_pos_map + num_entries, (int)kvmap_.size());
+    static WriteOptions wopts;
+    static FlushOptions flush_opts;
+    kv_it_ = kvmap_.begin();
+    advance(kv_it_, start_pos_map);
+    for (int i = 0; kv_it_ != kvmap_.end() && i < num_entries; i++, kv_it_++) {
+      ASSERT_OK(db_ttl_->Put(wopts, kv_it_->first, kv_it_->second));
+    }
+    // Put a mock kv at the end because CompactionFilter doesn't delete last key
+    ASSERT_OK(db_ttl_->Put(wopts, "keymock", "valuemock"));
+    if (flush) {
+      db_ttl_->Flush(flush_opts);
+    }
+  }
+
+  // Runs a manual compaction
+  void ManualCompact() {
+    db_ttl_->CompactRange(nullptr, nullptr);
+  }
+
+  // checks the whole kvmap_ to return correct values using KeyMayExist
+  void SimpleKeyMayExistCheck() {
+    static ReadOptions ropts;
+    bool value_found;
+    std::string val;
+    for(auto &kv : kvmap_) {
+      bool ret = db_ttl_->KeyMayExist(ropts, kv.first, &val, &value_found);
+      if (ret == false || value_found == false) {
+        fprintf(stderr, "KeyMayExist could not find key=%s in the database but"
+                        " should have\n", kv.first.c_str());
+        assert(false);
+      } else if (val.compare(kv.second) != 0) {
+        fprintf(stderr, " value for key=%s present in database is %s but"
+                        " should be %s\n", kv.first.c_str(), val.c_str(),
+                        kv.second.c_str());
+        assert(false);
+      }
+    }
+  }
+
+  // Sleeps for slp_tim then runs a manual compaction
+  // Checks span starting from st_pos from kvmap_ in the db and
+  // Gets should return true if check is true and false otherwise
+  // Also checks that value that we got is the same as inserted; and =kNewValue
+  //   if test_compaction_change is true
+  void SleepCompactCheck(int slp_tim, int st_pos, int span, bool check = true,
+                         bool test_compaction_change = false) {
+    assert(db_ttl_);
+    sleep(slp_tim);
+    ManualCompact();
+    static ReadOptions ropts;
+    kv_it_ = kvmap_.begin();
+    advance(kv_it_, st_pos);
+    std::string v;
+    for (int i = 0; kv_it_ != kvmap_.end() && i < span; i++, kv_it_++) {
+      Status s = db_ttl_->Get(ropts, kv_it_->first, &v);
+      if (s.ok() != check) {
+        fprintf(stderr, "key=%s ", kv_it_->first.c_str());
+        if (!s.ok()) {
+          fprintf(stderr, "is absent from db but was expected to be present\n");
+        } else {
+          fprintf(stderr, "is present in db but was expected to be absent\n");
+        }
+        assert(false);
+      } else if (s.ok()) {
+          if (test_compaction_change && v.compare(kNewValue_) != 0) {
+            fprintf(stderr, " value for key=%s present in database is %s but "
+                            " should be %s\n", kv_it_->first.c_str(), v.c_str(),
+                            kNewValue_.c_str());
+            assert(false);
+          } else if (!test_compaction_change && v.compare(kv_it_->second) !=0) {
+            fprintf(stderr, " value for key=%s present in database is %s but "
+                            " should be %s\n", kv_it_->first.c_str(), v.c_str(),
+                            kv_it_->second.c_str());
+            assert(false);
+          }
+      }
+    }
+  }
+
+  // Similar as SleepCompactCheck but uses TtlIterator to read from db
+  void SleepCompactCheckIter(int slp, int st_pos, int span, bool check=true) {
+    assert(db_ttl_);
+    sleep(slp);
+    ManualCompact();
+    static ReadOptions ropts;
+    Iterator *dbiter = db_ttl_->NewIterator(ropts);
+    kv_it_ = kvmap_.begin();
+    advance(kv_it_, st_pos);
+
+    dbiter->Seek(kv_it_->first);
+    if (!check) {
+      if (dbiter->Valid()) {
+        ASSERT_NE(dbiter->value().compare(kv_it_->second), 0);
+      }
+    } else {  // dbiter should have found out kvmap_[st_pos]
+      for (int i = st_pos;
+           kv_it_ != kvmap_.end() && i < st_pos + span;
+           i++, kv_it_++)  {
+        ASSERT_TRUE(dbiter->Valid());
+        ASSERT_EQ(dbiter->value().compare(kv_it_->second), 0);
+        dbiter->Next();
+      }
+    }
+    delete dbiter;
+  }
+
+  class TestFilter : public CompactionFilter {
+   public:
+    TestFilter(const int64_t kSampleSize, const std::string kNewValue)
+      : kSampleSize_(kSampleSize),
+        kNewValue_(kNewValue) {
+    }
+
+    // Works on keys of the form "key<number>"
+    // Drops key if number at the end of key is in [0, kSampleSize_/3),
+    // Keeps key if it is in [kSampleSize_/3, 2*kSampleSize_/3),
+    // Change value if it is in [2*kSampleSize_/3, kSampleSize_)
+    // Eg. kSampleSize_=6. Drop:key0-1...Keep:key2-3...Change:key4-5...
+    virtual bool Filter(int level, const Slice& key,
+                        const Slice& value, std::string* new_value,
+                        bool* value_changed) const override {
+      assert(new_value != nullptr);
+
+      std::string search_str = "0123456789";
+      std::string key_string = key.ToString();
+      size_t pos = key_string.find_first_of(search_str);
+      int num_key_end;
+      if (pos != std::string::npos) {
+        num_key_end = stoi(key_string.substr(pos, key.size() - pos));
+      } else {
+        return false; // Keep keys not matching the format "key<NUMBER>"
+      }
+
+      int partition = kSampleSize_ / 3;
+      if (num_key_end < partition) {
+        return true;
+      } else if (num_key_end < partition * 2) {
+        return false;
+      } else {
+        *new_value = kNewValue_;
+        *value_changed = true;
+        return false;
+      }
+    }
+
+    virtual const char* Name() const override {
+      return "TestFilter";
+    }
+
+   private:
+    const int64_t kSampleSize_;
+    const std::string kNewValue_;
+  };
+
+  class TestFilterFactory : public CompactionFilterFactory {
+    public:
+      TestFilterFactory(const int64_t kSampleSize, const std::string kNewValue)
+        : kSampleSize_(kSampleSize),
+          kNewValue_(kNewValue) {
+      }
+
+      virtual std::unique_ptr<CompactionFilter>
+      CreateCompactionFilter(
+          const CompactionFilter::Context& context) override {
+        return std::unique_ptr<CompactionFilter>(
+            new TestFilter(kSampleSize_, kNewValue_));
+      }
+
+      virtual const char* Name() const override {
+        return "TestFilterFactory";
+      }
+
+    private:
+      const int64_t kSampleSize_;
+      const std::string kNewValue_;
+  };
+
+
+  // Choose carefully so that Put, Gets & Compaction complete in 1 second buffer
+  const int64_t kSampleSize_ = 100;
+
+ private:
+  std::string dbname_;
+  StackableDB* db_ttl_;
+  Options options_;
+  KVMap kvmap_;
+  KVMap::iterator kv_it_;
+  const std::string kNewValue_ = "new_value";
+  // NOTE(review): appears unused within this file -- confirm before removing
+  unique_ptr<CompactionFilter> test_comp_filter_;
+}; // class TtlTest
+
+// If TTL is non positive or not provided, the behaviour is TTL = infinity
+// This test opens the db 3 times with such default behavior and inserts a
+// bunch of kvs each time. All kvs should accumulate in the db till the end
+// Partitions the sample-size provided into 3 sets over boundary1 and boundary2
+// (all three opens operate on the same on-disk db)
+TEST(TtlTest, NoEffect) {
+  MakeKVMap(kSampleSize_);
+  int boundary1 = kSampleSize_ / 3;
+  int boundary2 = 2 * boundary1;
+
+  OpenTtl();
+  PutValues(0, boundary1);                       //T=0: Set1 never deleted
+  SleepCompactCheck(1, 0, boundary1);            //T=1: Set1 still there
+  CloseTtl();
+
+  OpenTtl(0);
+  PutValues(boundary1, boundary2 - boundary1);   //T=1: Set2 never deleted
+  SleepCompactCheck(1, 0, boundary2);            //T=2: Sets1 & 2 still there
+  CloseTtl();
+
+  OpenTtl(-1);
+  PutValues(boundary2, kSampleSize_ - boundary2); //T=3: Set3 never deleted
+  SleepCompactCheck(1, 0, kSampleSize_, true);    //T=4: Sets 1,2,3 still there
+  CloseTtl();
+}
+
+// Puts a set of values and checks its presence using Get before ttl expires
+TEST(TtlTest, PresentDuringTTL) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(2);                                 // T=0:Open the db with ttl = 2
+  PutValues(0, kSampleSize_);                  // T=0:Insert Set1. Delete at t=2
+  SleepCompactCheck(1, 0, kSampleSize_, true); // T=1:Set1 should still be there
+  CloseTtl();
+}
+
+// Puts a set of values and checks its absence using Get after ttl
+TEST(TtlTest, AbsentAfterTTL) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);                                  // T=0:Open the db with ttl = 1
+  PutValues(0, kSampleSize_);                  // T=0:Insert Set1. Delete at t=1
+  SleepCompactCheck(2, 0, kSampleSize_, false); // T=2:Set1 should not be there
+  CloseTtl();
+}
+
+// Resets the timestamp of a set of kvs by updating them and checks that they
+// are not deleted according to the old timestamp
+TEST(TtlTest, ResetTimestamp) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(3);
+  PutValues(0, kSampleSize_);            // T=0: Insert Set1. Delete at t=3
+  sleep(2);                             // T=2
+  // Re-inserting identical kvs refreshes the embedded timestamp.
+  PutValues(0, kSampleSize_);            // T=2: Insert Set1. Delete at t=5
+  SleepCompactCheck(2, 0, kSampleSize_); // T=4: Set1 should still be there
+  CloseTtl();
+}
+
+// Similar to PresentDuringTTL but reads back through the Iterator interface
+TEST(TtlTest, IterPresentDuringTTL) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(2);
+  PutValues(0, kSampleSize_);                 // T=0: Insert. Delete at t=2
+  SleepCompactCheckIter(1, 0, kSampleSize_);  // T=1: Set should be there
+  CloseTtl();
+}
+
+// Similar to AbsentAfterTTL but reads back through the Iterator interface
+TEST(TtlTest, IterAbsentAfterTTL) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);
+  PutValues(0, kSampleSize_);                      // T=0: Insert. Delete at t=1
+  SleepCompactCheckIter(2, 0, kSampleSize_, false); // T=2: Should not be there
+  CloseTtl();
+}
+
+// Checks presence while opening the same db more than once with the same ttl
+// Note: The second open will open the same db (data survives close/reopen)
+TEST(TtlTest, MultiOpenSamePresent) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(2);
+  PutValues(0, kSampleSize_);                   // T=0: Insert. Delete at t=2
+  CloseTtl();
+
+  OpenTtl(2);                                  // T=0. Delete at t=2
+  SleepCompactCheck(1, 0, kSampleSize_);        // T=1: Set should be there
+  CloseTtl();
+}
+
+// Checks absence while opening the same db more than once with the same ttl
+// Note: The second open will open the same db; expiry still applies across
+// the close/reopen
+TEST(TtlTest, MultiOpenSameAbsent) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);
+  PutValues(0, kSampleSize_);                   // T=0: Insert. Delete at t=1
+  CloseTtl();
+
+  OpenTtl(1);                                  // T=0.Delete at t=1
+  SleepCompactCheck(2, 0, kSampleSize_, false); // T=2: Set should not be there
+  CloseTtl();
+}
+
+// Checks presence while opening the same db more than once with bigger ttl:
+// the ttl in effect is the one from the most recent open
+TEST(TtlTest, MultiOpenDifferent) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);
+  PutValues(0, kSampleSize_);            // T=0: Insert. Delete at t=1
+  CloseTtl();
+
+  OpenTtl(3);                           // T=0: Set deleted at t=3
+  SleepCompactCheck(2, 0, kSampleSize_); // T=2: Set should be there
+  CloseTtl();
+}
+
+// Checks presence during ttl in read_only mode: a read-only open cannot
+// compact, so expired kvs remain visible
+TEST(TtlTest, ReadOnlyPresentForever) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);                                 // T=0:Open the db normally
+  PutValues(0, kSampleSize_);                  // T=0:Insert Set1. Delete at t=1
+  CloseTtl();
+
+  OpenReadOnlyTtl(1);
+  SleepCompactCheck(2, 0, kSampleSize_);       // T=2:Set1 should still be there
+  CloseTtl();
+}
+
+// Checks whether WriteBatch works well with TTL
+// Puts all kvs in kvmap_ in a batch and writes first, then deletes first half
+TEST(TtlTest, WriteBatchTest) {
+  MakeKVMap(kSampleSize_);
+  BatchOperation batch_ops[kSampleSize_];
+  for (int i = 0; i < kSampleSize_; i++) {
+    batch_ops[i] = PUT;
+  }
+
+  OpenTtl(2);
+  MakePutWriteBatch(batch_ops, kSampleSize_);
+  // Turn the first half of the ops into deletes and re-apply them.
+  for (int i = 0; i < kSampleSize_ / 2; i++) {
+    batch_ops[i] = DELETE;
+  }
+  MakePutWriteBatch(batch_ops, kSampleSize_ / 2);
+  SleepCompactCheck(0, 0, kSampleSize_ / 2, false);
+  SleepCompactCheck(0, kSampleSize_ / 2, kSampleSize_ - kSampleSize_ / 2);
+  CloseTtl();
+}
+
+// Checks user's compaction filter for correctness with TTL logic
+TEST(TtlTest, CompactionFilter) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtlWithTestCompaction(1);
+  PutValues(0, kSampleSize_);                  // T=0:Insert Set1. Delete at t=1
+  // T=2: TTL logic takes precedence over TestFilter:-Set1 should not be there
+  SleepCompactCheck(2, 0, kSampleSize_, false);
+  CloseTtl();
+
+  OpenTtlWithTestCompaction(3);
+  PutValues(0, kSampleSize_);                   // T=0:Insert Set1.
+  // With ttl not yet expired, TestFilter's three-way partition applies:
+  int partition = kSampleSize_ / 3;
+  SleepCompactCheck(1, 0, partition, false);   // Part dropped
+  SleepCompactCheck(0, partition, partition);  // Part kept
+  SleepCompactCheck(0, 2 * partition, partition, true, true); // Part changed
+  CloseTtl();
+}
+
+// Insert some key-values which KeyMayExist should be able to get and check
+// that values returned are fine (timestamps validated and stripped).
+// Note: PutValues is called with flush=false so the kvs stay in the memtable.
+TEST(TtlTest, KeyMayExist) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl();
+  PutValues(0, kSampleSize_, false);
+
+  SimpleKeyMayExistCheck();
+
+  CloseTtl();
+}
+
+} //  namespace rocksdb
+
+// A black-box test for the ttl wrapper around rocksdb
+int main(int argc, char** argv) {
+  // Runs every TEST(TtlTest, ...) registered above via the test harness.
+  return rocksdb::test::RunAllTests();
+}