]> git-server-git.apps.pok.os.sepia.ceph.com Git - rocksdb.git/commitdiff
Declare all DB methods virtual so that StackableDB can override them
authorIgor Canadi <icanadi@fb.com>
Sun, 4 May 2014 18:39:49 +0000 (11:39 -0700)
committerIgor Canadi <icanadi@fb.com>
Sun, 4 May 2014 18:39:49 +0000 (11:39 -0700)
360 files changed:
.arcconfig [new file with mode: 0644]
.clang-format [new file with mode: 0644]
.gitignore [new file with mode: 0644]
CONTRIBUTING.md [new file with mode: 0644]
HISTORY.md [new file with mode: 0644]
INSTALL.md [new file with mode: 0644]
LICENSE [new file with mode: 0644]
Makefile [new file with mode: 0644]
PATENTS [new file with mode: 0644]
README [new file with mode: 0644]
ROCKSDB_LITE.md [new file with mode: 0644]
build_tools/build_detect_platform [new file with mode: 0755]
build_tools/build_detect_version [new file with mode: 0755]
build_tools/fbcode.clang31.sh [new file with mode: 0644]
build_tools/fbcode.gcc471.sh [new file with mode: 0644]
build_tools/fbcode.gcc481.sh [new file with mode: 0644]
build_tools/format-diff.sh [new file with mode: 0755]
build_tools/mac-install-gflags.sh [new file with mode: 0755]
build_tools/make_new_version.sh [new file with mode: 0755]
build_tools/regression_build_test.sh [new file with mode: 0755]
build_tools/valgrind_test.sh [new file with mode: 0755]
coverage/coverage_test.sh [new file with mode: 0755]
coverage/parse_gcov_output.py [new file with mode: 0644]
db/builder.cc [new file with mode: 0644]
db/builder.h [new file with mode: 0644]
db/c.cc [new file with mode: 0644]
db/c_test.c [new file with mode: 0644]
db/column_family.cc [new file with mode: 0644]
db/column_family.h [new file with mode: 0644]
db/column_family_test.cc [new file with mode: 0644]
db/compaction.cc [new file with mode: 0644]
db/compaction.h [new file with mode: 0644]
db/compaction_picker.cc [new file with mode: 0644]
db/compaction_picker.h [new file with mode: 0644]
db/corruption_test.cc [new file with mode: 0644]
db/db_bench.cc [new file with mode: 0644]
db/db_filesnapshot.cc [new file with mode: 0644]
db/db_impl.cc [new file with mode: 0644]
db/db_impl.h [new file with mode: 0644]
db/db_impl_debug.cc [new file with mode: 0644]
db/db_impl_readonly.cc [new file with mode: 0644]
db/db_impl_readonly.h [new file with mode: 0644]
db/db_iter.cc [new file with mode: 0644]
db/db_iter.h [new file with mode: 0644]
db/db_stats_logger.cc [new file with mode: 0644]
db/db_test.cc [new file with mode: 0644]
db/dbformat.cc [new file with mode: 0644]
db/dbformat.h [new file with mode: 0644]
db/dbformat_test.cc [new file with mode: 0644]
db/deletefile_test.cc [new file with mode: 0644]
db/file_indexer.cc [new file with mode: 0644]
db/file_indexer.h [new file with mode: 0644]
db/file_indexer_test.cc [new file with mode: 0644]
db/filename.cc [new file with mode: 0644]
db/filename.h [new file with mode: 0644]
db/filename_test.cc [new file with mode: 0644]
db/internal_stats.cc [new file with mode: 0644]
db/internal_stats.h [new file with mode: 0644]
db/log_format.h [new file with mode: 0644]
db/log_reader.cc [new file with mode: 0644]
db/log_reader.h [new file with mode: 0644]
db/log_test.cc [new file with mode: 0644]
db/log_writer.cc [new file with mode: 0644]
db/log_writer.h [new file with mode: 0644]
db/memtable.cc [new file with mode: 0644]
db/memtable.h [new file with mode: 0644]
db/memtable_list.cc [new file with mode: 0644]
db/memtable_list.h [new file with mode: 0644]
db/merge_context.h [new file with mode: 0644]
db/merge_helper.cc [new file with mode: 0644]
db/merge_helper.h [new file with mode: 0644]
db/merge_operator.cc [new file with mode: 0644]
db/merge_test.cc [new file with mode: 0644]
db/perf_context_test.cc [new file with mode: 0644]
db/plain_table_db_test.cc [new file with mode: 0644]
db/prefix_test.cc [new file with mode: 0644]
db/repair.cc [new file with mode: 0644]
db/simple_table_db_test.cc [new file with mode: 0644]
db/skiplist.h [new file with mode: 0644]
db/skiplist_test.cc [new file with mode: 0644]
db/snapshot.h [new file with mode: 0644]
db/table_cache.cc [new file with mode: 0644]
db/table_cache.h [new file with mode: 0644]
db/table_properties_collector.cc [new file with mode: 0644]
db/table_properties_collector.h [new file with mode: 0644]
db/table_properties_collector_test.cc [new file with mode: 0644]
db/tailing_iter.cc [new file with mode: 0644]
db/tailing_iter.h [new file with mode: 0644]
db/transaction_log_impl.cc [new file with mode: 0644]
db/transaction_log_impl.h [new file with mode: 0644]
db/version_edit.cc [new file with mode: 0644]
db/version_edit.h [new file with mode: 0644]
db/version_edit_test.cc [new file with mode: 0644]
db/version_set.cc [new file with mode: 0644]
db/version_set.h [new file with mode: 0644]
db/version_set_test.cc [new file with mode: 0644]
db/write_batch.cc [new file with mode: 0644]
db/write_batch_internal.h [new file with mode: 0644]
db/write_batch_test.cc [new file with mode: 0644]
doc/doc.css [new file with mode: 0644]
doc/index.html [new file with mode: 0644]
doc/log_format.txt [new file with mode: 0644]
doc/rockslogo.jpg [new file with mode: 0644]
doc/rockslogo.png [new file with mode: 0644]
hdfs/README [new file with mode: 0644]
hdfs/env_hdfs.h [new file with mode: 0644]
hdfs/hdfs.h [new file with mode: 0644]
hdfs/libhdfs.a [new file with mode: 0644]
helpers/memenv/memenv.cc [new file with mode: 0644]
helpers/memenv/memenv_test.cc [new file with mode: 0644]
include/rocksdb/c.h [new file with mode: 0644]
include/rocksdb/cache.h [new file with mode: 0644]
include/rocksdb/compaction_filter.h [new file with mode: 0644]
include/rocksdb/comparator.h [new file with mode: 0644]
include/rocksdb/db.h [new file with mode: 0644]
include/rocksdb/env.h [new file with mode: 0644]
include/rocksdb/filter_policy.h [new file with mode: 0644]
include/rocksdb/flush_block_policy.h [new file with mode: 0644]
include/rocksdb/iterator.h [new file with mode: 0644]
include/rocksdb/ldb_tool.h [new file with mode: 0644]
include/rocksdb/memtablerep.h [new file with mode: 0644]
include/rocksdb/merge_operator.h [new file with mode: 0644]
include/rocksdb/options.h [new file with mode: 0644]
include/rocksdb/perf_context.h [new file with mode: 0644]
include/rocksdb/slice.h [new file with mode: 0644]
include/rocksdb/slice_transform.h [new file with mode: 0644]
include/rocksdb/statistics.h [new file with mode: 0644]
include/rocksdb/status.h [new file with mode: 0644]
include/rocksdb/table.h [new file with mode: 0644]
include/rocksdb/table_properties.h [new file with mode: 0644]
include/rocksdb/transaction_log.h [new file with mode: 0644]
include/rocksdb/types.h [new file with mode: 0644]
include/rocksdb/universal_compaction.h [new file with mode: 0644]
include/rocksdb/version.h [new file with mode: 0644]
include/rocksdb/write_batch.h [new file with mode: 0644]
include/utilities/backupable_db.h [new file with mode: 0644]
include/utilities/db_ttl.h [new file with mode: 0644]
include/utilities/geo_db.h [new file with mode: 0644]
include/utilities/stackable_db.h [new file with mode: 0644]
include/utilities/utility_db.h [new file with mode: 0644]
java/Makefile [new file with mode: 0644]
java/RocksDBSample.java [new file with mode: 0644]
java/jdb_bench.sh [new file with mode: 0755]
java/org/rocksdb/BackupableDB.java [new file with mode: 0644]
java/org/rocksdb/BackupableDBOptions.java [new file with mode: 0644]
java/org/rocksdb/BloomFilter.java [new file with mode: 0644]
java/org/rocksdb/Filter.java [new file with mode: 0644]
java/org/rocksdb/HashLinkedListMemTableConfig.java [new file with mode: 0644]
java/org/rocksdb/HashSkipListMemTableConfig.java [new file with mode: 0644]
java/org/rocksdb/HistogramData.java [new file with mode: 0644]
java/org/rocksdb/HistogramType.java [new file with mode: 0644]
java/org/rocksdb/Iterator.java [new file with mode: 0644]
java/org/rocksdb/MemTableConfig.java [new file with mode: 0644]
java/org/rocksdb/Options.java [new file with mode: 0644]
java/org/rocksdb/PlainTableConfig.java [new file with mode: 0644]
java/org/rocksdb/ReadOptions.java [new file with mode: 0644]
java/org/rocksdb/RocksDB.java [new file with mode: 0644]
java/org/rocksdb/RocksDBException.java [new file with mode: 0644]
java/org/rocksdb/RocksObject.java [new file with mode: 0644]
java/org/rocksdb/SkipListMemTableConfig.java [new file with mode: 0644]
java/org/rocksdb/Statistics.java [new file with mode: 0644]
java/org/rocksdb/TableFormatConfig.java [new file with mode: 0644]
java/org/rocksdb/TickerType.java [new file with mode: 0644]
java/org/rocksdb/VectorMemTableConfig.java [new file with mode: 0644]
java/org/rocksdb/WriteBatch.java [new file with mode: 0644]
java/org/rocksdb/WriteBatchTest.java [new file with mode: 0644]
java/org/rocksdb/WriteOptions.java [new file with mode: 0644]
java/org/rocksdb/benchmark/DbBenchmark.java [new file with mode: 0644]
java/org/rocksdb/test/BackupableDBTest.java [new file with mode: 0644]
java/org/rocksdb/test/OptionsTest.java [new file with mode: 0644]
java/org/rocksdb/test/ReadOptionsTest.java [new file with mode: 0644]
java/org/rocksdb/util/Environment.java [new file with mode: 0644]
java/org/rocksdb/util/SizeUnit.java [new file with mode: 0644]
java/rocksjni/backupablejni.cc [new file with mode: 0644]
java/rocksjni/filter.cc [new file with mode: 0644]
java/rocksjni/iterator.cc [new file with mode: 0644]
java/rocksjni/memtablejni.cc [new file with mode: 0644]
java/rocksjni/options.cc [new file with mode: 0644]
java/rocksjni/portal.h [new file with mode: 0644]
java/rocksjni/rocksjni.cc [new file with mode: 0644]
java/rocksjni/statistics.cc [new file with mode: 0644]
java/rocksjni/table.cc [new file with mode: 0644]
java/rocksjni/write_batch.cc [new file with mode: 0644]
linters/__phutil_library_init__.php [new file with mode: 0644]
linters/__phutil_library_map__.php [new file with mode: 0644]
linters/cpp_linter/ArcanistCpplintLinter.php [new file with mode: 0644]
linters/cpp_linter/FbcodeCppLinter.php [new file with mode: 0644]
linters/cpp_linter/PfffCppLinter.php [new file with mode: 0644]
linters/cpp_linter/cpplint.py [new file with mode: 0755]
linters/lint_engine/FacebookFbcodeLintEngine.php [new file with mode: 0644]
port/README [new file with mode: 0644]
port/atomic_pointer.h [new file with mode: 0644]
port/likely.h [new file with mode: 0644]
port/port.h [new file with mode: 0644]
port/port_example.h [new file with mode: 0644]
port/port_posix.cc [new file with mode: 0644]
port/port_posix.h [new file with mode: 0644]
port/stack_trace.cc [new file with mode: 0644]
port/stack_trace.h [new file with mode: 0644]
port/win/stdint.h [new file with mode: 0644]
table/block.cc [new file with mode: 0644]
table/block.h [new file with mode: 0644]
table/block_based_table_builder.cc [new file with mode: 0644]
table/block_based_table_builder.h [new file with mode: 0644]
table/block_based_table_factory.cc [new file with mode: 0644]
table/block_based_table_factory.h [new file with mode: 0644]
table/block_based_table_reader.cc [new file with mode: 0644]
table/block_based_table_reader.h [new file with mode: 0644]
table/block_builder.cc [new file with mode: 0644]
table/block_builder.h [new file with mode: 0644]
table/block_hash_index.cc [new file with mode: 0644]
table/block_hash_index.h [new file with mode: 0644]
table/block_hash_index_test.cc [new file with mode: 0644]
table/block_test.cc [new file with mode: 0644]
table/filter_block.cc [new file with mode: 0644]
table/filter_block.h [new file with mode: 0644]
table/filter_block_test.cc [new file with mode: 0644]
table/flush_block_policy.cc [new file with mode: 0644]
table/format.cc [new file with mode: 0644]
table/format.h [new file with mode: 0644]
table/iter_heap.h [new file with mode: 0644]
table/iterator.cc [new file with mode: 0644]
table/iterator_wrapper.h [new file with mode: 0644]
table/merger.cc [new file with mode: 0644]
table/merger.h [new file with mode: 0644]
table/meta_blocks.cc [new file with mode: 0644]
table/meta_blocks.h [new file with mode: 0644]
table/plain_table_builder.cc [new file with mode: 0644]
table/plain_table_builder.h [new file with mode: 0644]
table/plain_table_factory.cc [new file with mode: 0644]
table/plain_table_factory.h [new file with mode: 0644]
table/plain_table_reader.cc [new file with mode: 0644]
table/plain_table_reader.h [new file with mode: 0644]
table/table_builder.h [new file with mode: 0644]
table/table_properties.cc [new file with mode: 0644]
table/table_reader.h [new file with mode: 0644]
table/table_reader_bench.cc [new file with mode: 0644]
table/table_test.cc [new file with mode: 0644]
table/two_level_iterator.cc [new file with mode: 0644]
table/two_level_iterator.h [new file with mode: 0644]
tools/auto_sanity_test.sh [new file with mode: 0755]
tools/blob_store_bench.cc [new file with mode: 0644]
tools/db_crashtest.py [new file with mode: 0644]
tools/db_crashtest2.py [new file with mode: 0644]
tools/db_repl_stress.cc [new file with mode: 0644]
tools/db_sanity_test.cc [new file with mode: 0644]
tools/db_stress.cc [new file with mode: 0644]
tools/ldb.cc [new file with mode: 0644]
tools/ldb_test.py [new file with mode: 0644]
tools/reduce_levels_test.cc [new file with mode: 0644]
tools/sst_dump.cc [new file with mode: 0644]
util/arena.cc [new file with mode: 0644]
util/arena.h [new file with mode: 0644]
util/arena_test.cc [new file with mode: 0644]
util/auto_roll_logger.cc [new file with mode: 0644]
util/auto_roll_logger.h [new file with mode: 0644]
util/auto_roll_logger_test.cc [new file with mode: 0755]
util/autovector.h [new file with mode: 0644]
util/autovector_test.cc [new file with mode: 0644]
util/benchharness.cc [new file with mode: 0644]
util/benchharness.h [new file with mode: 0644]
util/benchharness_test.cc [new file with mode: 0644]
util/blob_store.cc [new file with mode: 0644]
util/blob_store.h [new file with mode: 0644]
util/blob_store_test.cc [new file with mode: 0644]
util/bloom.cc [new file with mode: 0644]
util/bloom_test.cc [new file with mode: 0644]
util/build_version.h [new file with mode: 0644]
util/cache.cc [new file with mode: 0644]
util/cache_test.cc [new file with mode: 0644]
util/coding.cc [new file with mode: 0644]
util/coding.h [new file with mode: 0644]
util/coding_test.cc [new file with mode: 0644]
util/comparator.cc [new file with mode: 0644]
util/crc32c.cc [new file with mode: 0644]
util/crc32c.h [new file with mode: 0644]
util/crc32c_test.cc [new file with mode: 0644]
util/dynamic_bloom.cc [new file with mode: 0644]
util/dynamic_bloom.h [new file with mode: 0644]
util/dynamic_bloom_test.cc [new file with mode: 0644]
util/env.cc [new file with mode: 0644]
util/env_hdfs.cc [new file with mode: 0644]
util/env_posix.cc [new file with mode: 0644]
util/env_test.cc [new file with mode: 0644]
util/filelock_test.cc [new file with mode: 0644]
util/filter_policy.cc [new file with mode: 0644]
util/hash.cc [new file with mode: 0644]
util/hash.h [new file with mode: 0644]
util/hash_cuckoo_rep.cc [new file with mode: 0644]
util/hash_cuckoo_rep.h [new file with mode: 0644]
util/hash_linklist_rep.cc [new file with mode: 0644]
util/hash_linklist_rep.h [new file with mode: 0644]
util/hash_skiplist_rep.cc [new file with mode: 0644]
util/hash_skiplist_rep.h [new file with mode: 0644]
util/histogram.cc [new file with mode: 0644]
util/histogram.h [new file with mode: 0644]
util/histogram_test.cc [new file with mode: 0644]
util/ldb_cmd.cc [new file with mode: 0644]
util/ldb_cmd.h [new file with mode: 0644]
util/ldb_cmd_execute_result.h [new file with mode: 0644]
util/ldb_tool.cc [new file with mode: 0644]
util/log_buffer.cc [new file with mode: 0644]
util/log_buffer.h [new file with mode: 0644]
util/log_write_bench.cc [new file with mode: 0644]
util/logging.cc [new file with mode: 0644]
util/logging.h [new file with mode: 0644]
util/manual_compaction_test.cc [new file with mode: 0644]
util/murmurhash.cc [new file with mode: 0644]
util/murmurhash.h [new file with mode: 0644]
util/mutexlock.h [new file with mode: 0644]
util/options.cc [new file with mode: 0644]
util/perf_context.cc [new file with mode: 0644]
util/perf_context_imp.h [new file with mode: 0644]
util/posix_logger.h [new file with mode: 0644]
util/random.h [new file with mode: 0644]
util/signal_test.cc [new file with mode: 0644]
util/skiplistrep.cc [new file with mode: 0644]
util/slice.cc [new file with mode: 0644]
util/statistics.cc [new file with mode: 0644]
util/statistics.h [new file with mode: 0644]
util/stats_logger.h [new file with mode: 0644]
util/status.cc [new file with mode: 0644]
util/stl_wrappers.h [new file with mode: 0644]
util/stop_watch.h [new file with mode: 0644]
util/string_util.cc [new file with mode: 0644]
util/string_util.h [new file with mode: 0644]
util/sync_point.cc [new file with mode: 0644]
util/sync_point.h [new file with mode: 0644]
util/testharness.cc [new file with mode: 0644]
util/testharness.h [new file with mode: 0644]
util/testutil.cc [new file with mode: 0644]
util/testutil.h [new file with mode: 0644]
util/thread_local.cc [new file with mode: 0644]
util/thread_local.h [new file with mode: 0644]
util/thread_local_test.cc [new file with mode: 0644]
util/vectorrep.cc [new file with mode: 0644]
util/xxhash.cc [new file with mode: 0644]
util/xxhash.h [new file with mode: 0644]
utilities/backupable/backupable_db.cc [new file with mode: 0644]
utilities/backupable/backupable_db_test.cc [new file with mode: 0644]
utilities/geodb/geodb_impl.cc [new file with mode: 0644]
utilities/geodb/geodb_impl.h [new file with mode: 0644]
utilities/geodb/geodb_test.cc [new file with mode: 0644]
utilities/merge_operators.h [new file with mode: 0644]
utilities/merge_operators/put.cc [new file with mode: 0644]
utilities/merge_operators/string_append/stringappend.cc [new file with mode: 0644]
utilities/merge_operators/string_append/stringappend.h [new file with mode: 0644]
utilities/merge_operators/string_append/stringappend2.cc [new file with mode: 0644]
utilities/merge_operators/string_append/stringappend2.h [new file with mode: 0644]
utilities/merge_operators/string_append/stringappend_test.cc [new file with mode: 0644]
utilities/merge_operators/uint64add.cc [new file with mode: 0644]
utilities/redis/README [new file with mode: 0644]
utilities/redis/redis_list_exception.h [new file with mode: 0644]
utilities/redis/redis_list_iterator.h [new file with mode: 0644]
utilities/redis/redis_lists.cc [new file with mode: 0644]
utilities/redis/redis_lists.h [new file with mode: 0644]
utilities/redis/redis_lists_test.cc [new file with mode: 0644]
utilities/ttl/db_ttl_impl.cc [new file with mode: 0644]
utilities/ttl/db_ttl_impl.h [new file with mode: 0644]
utilities/ttl/ttl_test.cc [new file with mode: 0644]

diff --git a/.arcconfig b/.arcconfig
new file mode 100644 (file)
index 0000000..85ca38f
--- /dev/null
@@ -0,0 +1,10 @@
+{
+  "project_id" : "rocksdb",
+  "conduit_uri" : "https://reviews.facebook.net/",
+  "copyright_holder" : "Facebook",
+  "load" : [
+    "linters"
+  ],
+  "lint.engine" : "FacebookFbcodeLintEngine",
+  "lint.engine.single.linter" : "FbcodeCppLinter"
+}
diff --git a/.clang-format b/.clang-format
new file mode 100644 (file)
index 0000000..7c27981
--- /dev/null
@@ -0,0 +1,5 @@
+# Complete list of style options can be found at: 
+# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
+---
+BasedOnStyle: Google
+...
diff --git a/.gitignore b/.gitignore
new file mode 100644 (file)
index 0000000..0c7fd4f
--- /dev/null
@@ -0,0 +1,33 @@
+TARGETS
+build_config.mk
+
+*.a
+*.arc
+*.d
+*.dylib*
+*.gcda
+*.gcno
+*.o
+*.so
+*.so.*
+*_test
+*_bench
+*_stress
+*.out
+*.class
+*.jar
+*.*jnilib*
+*.d-e
+*.o-*
+
+ldb
+manifest_dump
+sst_dump
+util/build_version.cc
+build_tools/VALGRIND_LOGS/
+coverage/COVERAGE_REPORT
+.gdbhistory
+.phutil_module_cache
+tags
+java/*.log
+java/include/org_rocksdb_*.h
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644 (file)
index 0000000..7270d0c
--- /dev/null
@@ -0,0 +1,20 @@
+# Contributing to RocksDB
+
+## Contributor License Agreement ("CLA")
+
+In order to accept your pull request, we need you to submit a CLA. You
+only need to do this once, so if you've done this for another Facebook
+open source project, you're good to go. If you are submitting a pull
+request for the first time, just let us know that you have completed
+the CLA and we can cross-check with your GitHub username.
+
+Complete your CLA here: <https://code.facebook.com/cla>
+
+If you don't have a Facebook account, we can send you a PDF that you can
+sign offline. Send us an e-mail or create a new github issue to
+request the CLA in PDF format.
+
+## License
+
+By contributing to RocksDB, you agree that your contributions will be
+licensed under the [BSD License](LICENSE).
diff --git a/HISTORY.md b/HISTORY.md
new file mode 100644 (file)
index 0000000..a46a51a
--- /dev/null
@@ -0,0 +1,78 @@
+# Rocksdb Change Log
+
+## Unreleased (will be released in 3.0)
+
+### Public API changes
+* Added _LEVEL to all InfoLogLevel enums
+* Deprecated ReadOptions.prefix and ReadOptions.prefix_seek. Seek() defaults to prefix-based seek when Options.prefix_extractor is supplied. More detail is documented in https://github.com/facebook/rocksdb/wiki/Prefix-Seek-API-Changes
+
+### New Features
+* Column family support
+* Added an option to use different checksum functions in BlockBasedTableOptions
+
+## 2.8.0 (04/04/2014)
+
+* Removed arena.h from public header files.
+* By default, checksums are verified on every read from database
+* Change default value of several options, including: paranoid_checks=true, max_open_files=5000, level0_slowdown_writes_trigger=20, level0_stop_writes_trigger=24, disable_seek_compaction=true, max_background_flushes=1 and allow_mmap_writes=false
+* Added is_manual_compaction to CompactionFilter::Context
+* Added "virtual void WaitForJoin()" in class Env. Default operation is no-op.
+* Removed BackupEngine::DeleteBackupsNewerThan() function
+* Added new option -- verify_checksums_in_compaction
+* Changed Options.prefix_extractor from raw pointer to shared_ptr (take ownership)
+  Changed HashSkipListRepFactory and HashLinkListRepFactory constructor to not take SliceTransform object (use Options.prefix_extractor implicitly)
+* Added Env::GetThreadPoolQueueLen(), which returns the waiting queue length of thread pools
+* Added a command "checkconsistency" in ldb tool, which checks
+  if file system state matches DB state (file existence and file sizes)
+* Separate options related to block based table to a new struct BlockBasedTableOptions.
+* WriteBatch has a new function Count() to return total size in the batch, and Data() now returns a reference instead of a copy
+* Add more counters to perf context.
+* Supports several more DB properties: compaction-pending, background-errors and cur-size-active-mem-table.
+
+### New Features
+* If we find one truncated record at the end of the MANIFEST or WAL files,
+  we will ignore it. We assume that writers of these records were interrupted
+  and that we can safely ignore it.
+* A new SST format "PlainTable" is added, which is optimized for memory-only workloads. It can be created through NewPlainTableFactory() or NewTotalOrderPlainTableFactory().
+* A new mem table implementation hash linked list optimizing for the case that there are only few keys for each prefix, which can be created through NewHashLinkListRepFactory().
+* Merge operator supports a new function PartialMergeMulti() to allow users to do partial merges against multiple operands.
+* Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1.
+* Geo-spatial support for locations and radial-search.
+
+## 2.7.0 (01/28/2014)
+
+### Public API changes
+
+* Renamed `StackableDB::GetRawDB()` to `StackableDB::GetBaseDB()`.
+* Renamed `WriteBatch::Data()` `const std::string& Data() const`.
+* Renamed class `TableStats` to `TableProperties`.
+* Deleted class `PrefixHashRepFactory`. Please use `NewHashSkipListRepFactory()` instead.
+* Supported multi-threaded `EnableFileDeletions()` and `DisableFileDeletions()`.
+* Added `DB::GetOptions()`.
+* Added `DB::GetDbIdentity()`.
+
+### New Features
+
+* Added [BackupableDB](https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F)
+* Implemented [TailingIterator](https://github.com/facebook/rocksdb/wiki/Tailing-Iterator), a special type of iterator that
+  doesn't create a snapshot (can be used to read newly inserted data)
+  and is optimized for doing sequential reads.
+* Added property block for table, which allows (1) a table to store
+  its metadata and (2) end user to collect and store properties they
+  are interested in.
+* Enabled caching index and filter block in block cache (turned off by default).
+* Supported error report when doing manual compaction.
+* Supported additional Linux platform flavors and Mac OS.
+* Put with `SliceParts` - Variant of `Put()` that gathers output like `writev(2)`
+* Bug fixes and code refactor for compatibility with upcoming Column
+  Family feature.
+
+### Performance Improvements
+
+* Huge benchmark performance improvements by multiple efforts. For example, increase in readonly QPS from about 530k in 2.6 release to 1.1 million in 2.7 [1]
+* Speeding up a way RocksDB deleted obsolete files - no longer listing the whole directory under a lock -- decrease in p99
+* Use raw pointer instead of shared pointer for statistics: [5b825d](https://github.com/facebook/rocksdb/commit/5b825d6964e26ec3b4bb6faa708ebb1787f1d7bd) -- huge increase in performance -- shared pointers are slow
+* Optimized locking for `Get()` -- [1fdb3f](https://github.com/facebook/rocksdb/commit/1fdb3f7dc60e96394e3e5b69a46ede5d67fb976c) -- 1.5x QPS increase for some workloads
+* Cache speedup - [e8d40c3](https://github.com/facebook/rocksdb/commit/e8d40c31b3cca0c3e1ae9abe9b9003b1288026a9)
+* Implemented autovector, which allocates first N elements on stack. Most of vectors in RocksDB are small. Also, we never want to allocate heap objects while holding a mutex. -- [c01676e4](https://github.com/facebook/rocksdb/commit/c01676e46d3be08c3c140361ef1f5884f47d3b3c)
+* Lots of efforts to move malloc, memcpy and IO outside of locks
diff --git a/INSTALL.md b/INSTALL.md
new file mode 100644 (file)
index 0000000..2a91be6
--- /dev/null
@@ -0,0 +1,80 @@
+## Dependencies
+
+RocksDB is developed on Linux (CentOS release 5.2), with gcc 4.8.1.
+It depends on gcc with C++11 support.
+
+* RocksDB depends on the following libraries:
+  - [zlib](http://www.zlib.net/) - a library for data compression.
+  - [bzip2](http://www.bzip.org/) - a library for data compression.
+  - [snappy](https://code.google.com/p/snappy/) - a library for fast
+      data compression.
+  - [gflags](https://code.google.com/p/gflags/) - a library that handles
+      command line flags processing.
+
+RocksDB will successfully compile without the compression libraries included,
+but some things may fail. We do not support releases without the compression
+libraries. You are on your own.
+
+## Supported platforms
+
+* **Linux - Ubuntu**
+    * Upgrade your gcc to version at least 4.7 to get C++11 support.
+    * Install gflags. First, try: `sudo apt-get install libgflags-dev`
+      If this doesn't work and you're using Ubuntu, here's a nice tutorial:
+      (http://askubuntu.com/questions/312173/installing-gflags-12-04)
+    * Install snappy. This is usually as easy as:
+      `sudo apt-get install libsnappy-dev`.
+    * Install zlib. Try: `sudo apt-get install zlib1g-dev`.
+    * Install bzip2: `sudo apt-get install libbz2-dev`.
+* **Linux - CentOS**
+    * Upgrade your gcc to version at least 4.7 to get C++11 support:
+      `yum install gcc47-c++`
+    * Install gflags:
+
+              wget https://gflags.googlecode.com/files/gflags-2.0-no-svn-files.tar.gz
+              tar -xzvf gflags-2.0-no-svn-files.tar.gz
+              cd gflags-2.0
+              ./configure && make && sudo make install
+
+    * Install snappy:
+
+              wget https://snappy.googlecode.com/files/snappy-1.1.1.tar.gz
+              tar -xzvf snappy-1.1.1.tar.gz
+              cd snappy-1.1.1
+              ./configure && make && sudo make install
+
+    * Install zlib:
+
+              sudo yum install zlib
+              sudo yum install zlib-devel
+
+    * Install bzip2:
+
+              sudo yum install bzip2
+              sudo yum install bzip2-devel
+
+* **OS X**:
+    * Install latest C++ compiler that supports C++ 11:
+        * Update XCode:  run `xcode-select --install` (or install it from XCode App's settting).
+        * Install via [homebrew](http://brew.sh/).
+            * If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line.
+            * run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher).
+    * Install zlib, bzip2 and snappy libraries for compression.
+    * Install gflags. We have included a script
+    `build_tools/mac-install-gflags.sh`, which should automatically install it.
+    If you installed gflags by other means (for example, `brew install gflags`),
+    please set `LIBRARY_PATH` and `CPATH` accordingly.
+    * Please note that some of the optimizations/features are disabled in OSX.
+    We did not run any production workloads on it.
+
+* **iOS**:
+  * Run: `TARGET_OS=IOS make static_lib`
+
+## Compilation
+`make clean; make` will compile librocksdb.a (RocksDB static library) and all
+the unit tests. You can run all unit tests with `make check`.
+
+For shared library builds, exec `make shared_lib` instead.
+
+If you followed the above steps and your compile or unit tests fail,
+please submit an issue: (https://github.com/facebook/rocksdb/issues)
diff --git a/LICENSE b/LICENSE
new file mode 100644 (file)
index 0000000..b132901
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,35 @@
+BSD License
+
+For rocksdb software
+
+Copyright (c) 2014, Facebook, Inc.
+All rights reserved.
+---------------------------------------------------------------------
+
+Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..12bdbad
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,526 @@
+# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+# Inherit some settings from environment variables, if available
+INSTALL_PATH ?= $(CURDIR)
+
+#-----------------------------------------------
+
+ifneq ($(MAKECMDGOALS),dbg)
+OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer
+else
+# intentionally left blank
+endif
+
+ifeq ($(MAKECMDGOALS),shared_lib)
+PLATFORM_SHARED_LDFLAGS=-fPIC
+OPT += -DNDEBUG
+endif
+
+ifeq ($(MAKECMDGOALS),static_lib)
+PLATFORM_SHARED_LDFLAGS=-fPIC
+OPT += -DNDEBUG
+endif
+
+#-----------------------------------------------
+
+# detect what platform we're building on
+$(shell (export ROCKSDB_ROOT=$(CURDIR); $(CURDIR)/build_tools/build_detect_platform $(CURDIR)/build_config.mk))
+# this file is generated by the previous line to set build flags and sources
+include build_config.mk
+
+ifneq ($(PLATFORM), IOS)
+CFLAGS += -g
+CXXFLAGS += -g
+else
+# no debug info for IOS, that will make our library big
+OPT += -DNDEBUG
+endif
+
+# ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc.
+ifdef COMPILE_WITH_ASAN
+       # ASAN compile flags
+       EXEC_LDFLAGS += -fsanitize=address
+       PLATFORM_CCFLAGS += -fsanitize=address
+       PLATFORM_CXXFLAGS += -fsanitize=address
+else
+       # if we're not compiling with ASAN, use jemalloc
+       EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS)
+       PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
+       PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
+endif
+
+WARNING_FLAGS = -Wall -Werror
+CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
+CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual
+
+LDFLAGS += $(PLATFORM_LDFLAGS)
+
+LIBOBJECTS = $(SOURCES:.cc=.o)
+LIBOBJECTS += $(SOURCESCPP:.cpp=.o)
+MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o)
+
+TESTUTIL = ./util/testutil.o
+TESTHARNESS = ./util/testharness.o $(TESTUTIL)
+BENCHHARNESS = ./util/benchharness.o
+VALGRIND_ERROR = 2
+VALGRIND_DIR = build_tools/VALGRIND_LOGS
+VALGRIND_VER := $(join $(VALGRIND_VER),valgrind)
+VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full
+
+TESTS = \
+       db_test \
+       block_hash_index_test \
+       autovector_test \
+       column_family_test \
+       table_properties_collector_test \
+       arena_test \
+       auto_roll_logger_test \
+       benchharness_test \
+       block_test \
+       bloom_test \
+       dynamic_bloom_test \
+       c_test \
+       cache_test \
+       coding_test \
+       corruption_test \
+       crc32c_test \
+       dbformat_test \
+       env_test \
+       blob_store_test \
+       filelock_test \
+       filename_test \
+       filter_block_test \
+       histogram_test \
+       log_test \
+       manual_compaction_test \
+       memenv_test \
+       merge_test \
+       redis_test \
+       reduce_levels_test \
+       plain_table_db_test \
+       prefix_test \
+       simple_table_db_test \
+       skiplist_test \
+       stringappend_test \
+       ttl_test \
+       backupable_db_test \
+       version_edit_test \
+       version_set_test \
+       file_indexer_test \
+       write_batch_test\
+       deletefile_test \
+       table_test \
+       thread_local_test \
+        geodb_test
+
+TOOLS = \
+        sst_dump \
+       db_sanity_test \
+        db_stress \
+        ldb \
+       db_repl_stress \
+       blob_store_bench
+
+PROGRAMS = db_bench signal_test table_reader_bench $(TOOLS)
+
+# The library name is configurable since we are maintaining libraries of both
+# debug/release mode.
+ifeq ($(LIBNAME),)
+        LIBNAME=librocksdb
+endif
+LIBRARY = ${LIBNAME}.a
+MEMENVLIBRARY = libmemenv.a
+
+default: all
+
+#-----------------------------------------------
+# Create platform independent shared libraries.
+#-----------------------------------------------
+ifneq ($(PLATFORM_SHARED_EXT),)
+
+ifneq ($(PLATFORM_SHARED_VERSIONED),true)
+SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
+SHARED2 = $(SHARED1)
+SHARED3 = $(SHARED1)
+SHARED = $(SHARED1)
+else
+# Update db.h if you change these.
+SHARED_MAJOR = 3
+SHARED_MINOR = 0
+SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
+SHARED2 = $(SHARED1).$(SHARED_MAJOR)
+SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
+SHARED = $(SHARED1) $(SHARED2) $(SHARED3)
+$(SHARED1): $(SHARED3)
+       ln -fs $(SHARED3) $(SHARED1)
+$(SHARED2): $(SHARED3)
+       ln -fs $(SHARED3) $(SHARED2)
+endif
+
+$(SHARED3):
+       $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) $(LDFLAGS) -o $@
+
+endif  # PLATFORM_SHARED_EXT
+
+.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \
+       release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \
+       dbg
+
+all: $(LIBRARY) $(PROGRAMS) $(TESTS)
+
+static_lib: $(LIBRARY)
+
+shared_lib: $(SHARED)
+
+dbg: $(LIBRARY) $(PROGRAMS) $(TESTS)
+
+# creates static library and programs
+release:
+       $(MAKE) clean
+       OPT="-DNDEBUG -O2" $(MAKE) static_lib $(PROGRAMS) -j32
+
+release_shared_lib:
+       $(MAKE) clean
+       OPT="-DNDEBUG -O2" $(MAKE) shared_lib -j32
+
+coverage:
+       $(MAKE) clean
+       COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check -j32
+       (cd coverage; ./coverage_test.sh)
+       # Delete intermediate files
+       find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
+
+check: $(PROGRAMS) $(TESTS) $(TOOLS)
+       for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
+       python tools/ldb_test.py
+
+ldb_tests: all $(PROGRAMS) $(TESTS) $(TOOLS)
+       python tools/ldb_test.py
+
+crash_test: whitebox_crash_test blackbox_crash_test
+
+blackbox_crash_test: db_stress
+       python -u tools/db_crashtest.py
+
+whitebox_crash_test: db_stress
+       python -u tools/db_crashtest2.py
+
+asan_check:
+       $(MAKE) clean
+       COMPILE_WITH_ASAN=1 $(MAKE) check -j32
+       $(MAKE) clean
+
+asan_crash_test:
+       $(MAKE) clean
+       COMPILE_WITH_ASAN=1 $(MAKE) crash_test
+       $(MAKE) clean
+
+valgrind_check: all $(PROGRAMS) $(TESTS)
+       mkdir -p $(VALGRIND_DIR)
+       echo TESTS THAT HAVE VALGRIND ERRORS > $(VALGRIND_DIR)/valgrind_failed_tests; \
+       echo TIMES in seconds TAKEN BY TESTS ON VALGRIND > $(VALGRIND_DIR)/valgrind_tests_times; \
+       for t in $(filter-out skiplist_test,$(TESTS)); do \
+               stime=`date '+%s'`; \
+               $(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \
+               if [ $$? -eq $(VALGRIND_ERROR) ] ; then \
+                       echo $$t >> $(VALGRIND_DIR)/valgrind_failed_tests; \
+               fi; \
+               etime=`date '+%s'`; \
+               echo $$t $$((etime - stime)) >> $(VALGRIND_DIR)/valgrind_tests_times; \
+       done
+
+clean:
+       -rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) build_config.mk
+       -rm -rf ios-x86/* ios-arm/*
+       -find . -name "*.[od]" -exec rm {} \;
+       -find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
+tags:
+       ctags * -R
+       cscope -b `find . -name '*.cc'` `find . -name '*.h'`
+
+format:
+       build_tools/format-diff.sh
+
+# ---------------------------------------------------------------------------
+#      Unit tests and tools
+# ---------------------------------------------------------------------------
+$(LIBRARY): $(LIBOBJECTS)
+       rm -f $@
+       $(AR) -rs $@ $(LIBOBJECTS)
+
+db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
+       $(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS)
+        $(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL)
+       $(CXX) tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL)
+       $(CXX) tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL)
+       $(CXX) tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+blob_store_bench: tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL)
+       $(CXX) tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+signal_test: util/signal_test.o $(LIBOBJECTS)
+       $(CXX) util/signal_test.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+column_family_test: db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+dynamic_bloom_test: util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+blob_store_test: util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(TESTUTIL)
+       $(CXX) util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+redis_test: utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+benchharness_test: util/benchharness_test.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS)
+       $(CXX) util/benchharness_test.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+histogram_test: util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+thread_local_test: util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
+
+plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
+
+perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+file_indexer_test : db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+geodb_test: utilities/geodb/geodb_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) utilities/geodb/geodb_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
+       rm -f $@
+       $(AR) -rs $@ $(MEMENVOBJECTS)
+
+memenv_test : helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+rocksdb_shell: tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o tools/shell/ShellContext.h tools/shell/ShellState.h tools/shell/DBClientProxy.h $(LIBOBJECTS)
+       $(CXX) tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+DBClientProxy_test: tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY)
+       $(CXX) tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY) $(EXEC_LDFLAGS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS)
+       $(CXX) util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+sst_dump: tools/sst_dump.o $(LIBOBJECTS)
+       $(CXX) tools/sst_dump.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+ldb: tools/ldb.o $(LIBOBJECTS)
+       $(CXX) tools/ldb.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+# ---------------------------------------------------------------------------
+# Jni stuff
+# ---------------------------------------------------------------------------
+
+JNI_NATIVE_SOURCES = ./java/rocksjni/*.cc
+JAVA_INCLUDE = -I/usr/lib/jvm/java-openjdk/include/ -I/usr/lib/jvm/java-openjdk/include/linux
+ROCKSDBJNILIB = ./java/librocksdbjni.so
+
+ifeq ($(PLATFORM), OS_MACOSX)
+ROCKSDBJNILIB = ./java/librocksdbjni.jnilib
+JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/
+endif
+
+rocksdbjava: clean
+       OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32
+       cd java;$(MAKE) java;
+       rm -f $(ROCKSDBJNILIB)
+       $(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o $(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(LDFLAGS) $(COVERAGEFLAGS)
+
+jclean:
+       cd java;$(MAKE) clean;
+       rm -f $(ROCKSDBJNILIB)
+
+jtest:
+       cd java;$(MAKE) sample;$(MAKE) test;
+
+jdb_bench:
+       cd java;$(MAKE) db_bench;
+
+# ---------------------------------------------------------------------------
+#      Platform-specific compilation
+# ---------------------------------------------------------------------------
+
+ifeq ($(PLATFORM), IOS)
+# For iOS, create universal object files to be used on both the simulator and
+# a device.
+PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
+SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
+DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
+IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)
+
+.cc.o:
+       mkdir -p ios-x86/$(dir $@)
+       $(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
+       mkdir -p ios-arm/$(dir $@)
+       xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
+       lipo ios-x86/$@ ios-arm/$@ -create -output $@
+
+.c.o:
+       mkdir -p ios-x86/$(dir $@)
+       $(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
+       mkdir -p ios-arm/$(dir $@)
+       xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
+       lipo ios-x86/$@ ios-arm/$@ -create -output $@
+
+else
+.cc.o:
+       $(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS)
+
+.c.o:
+       $(CC) $(CFLAGS) -c $< -o $@
+endif
+
+# ---------------------------------------------------------------------------
+#      Source files dependencies detection
+# ---------------------------------------------------------------------------
+
+# Add proper dependency support so changing a .h file forces a .cc file to
+# rebuild.
+
+# The .d file indicates .cc file's dependencies on .h files. We generate such
+# dependency by g++'s -MM option, whose output is a make dependency rule.
+# The sed command makes sure the "target" file in the generated .d file has
+# the correct path prefix.
+%.d: %.cc
+       $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM $< -o $@
+ifeq ($(PLATFORM), OS_MACOSX)
+       @sed -i '' -e 's,.*:,$*.o:,' $@
+else
+       @sed -i -e 's,.*:,$*.o:,' $@
+endif
+
+DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d))
+
+depend: $(DEPFILES)
+
+# if the make goal is either "clean" or "format", we shouldn't
+# try to import the *.d files.
+# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly
+# working solution.
+ifneq ($(MAKECMDGOALS),clean)
+ifneq ($(MAKECMDGOALS),format)
+ifneq ($(MAKECMDGOALS),jclean)
+ifneq ($(MAKECMDGOALS),jtest)
+-include $(DEPFILES)
+endif
+endif
+endif
+endif
diff --git a/PATENTS b/PATENTS
new file mode 100644 (file)
index 0000000..8a6fca4
--- /dev/null
+++ b/PATENTS
@@ -0,0 +1,23 @@
+Additional Grant of Patent Rights
+
+“Software” means the rocksdb software distributed by Facebook, Inc.
+
+Facebook hereby grants you a perpetual, worldwide, royalty-free,
+non-exclusive, irrevocable (subject to the termination provision below)
+license under any rights in any patent claims owned by Facebook, to make,
+have made, use, sell, offer to sell, import, and otherwise transfer the
+Software. For avoidance of doubt, no license is granted under Facebook’s
+rights in any patent claims that are infringed by (i) modifications to the
+Software made by you or a third party, or (ii) the Software in combination
+with any software or other technology provided by you or a third party.
+
+The license granted hereunder will terminate, automatically and without
+notice, for anyone that makes any claim (including by filing any lawsuit,
+assertion or other action) alleging (a) direct, indirect, or contributory
+infringement or inducement to infringe any patent: (i) by Facebook or any
+of its subsidiaries or affiliates, whether or not such claim is related
+to the Software, (ii) by any party if such claim arises in whole or in
+part from any software, product or service of Facebook or any of its
+subsidiaries or affiliates, whether or not such claim is related to the
+Software, or (iii) by any party relating to the Software; or (b) that
+any right in any patent claim of Facebook is invalid or unenforceable.
diff --git a/README b/README
new file mode 100644 (file)
index 0000000..473e414
--- /dev/null
+++ b/README
@@ -0,0 +1,82 @@
+rocksdb: A persistent key-value store for flash storage
+Authors: * The Facebook Database Engineering Team
+         * Built on earlier work on leveldb by Sanjay Ghemawat
+           (sanjay@google.com) and Jeff Dean (jeff@google.com)
+
+This code is a library that forms the core building block for a fast
+key value server, especially suited for storing data on flash drives.
+It has a Log-Structured-Merge-Database (LSM) design with flexible tradeoffs
+between Write-Amplification-Factor(WAF), Read-Amplification-Factor (RAF)
+and Space-Amplification-Factor(SAF). It has multi-threaded compactions,
+making it especially suitable for storing multiple terabytes of data in a
+single database.
+
+The core of this code has been derived from open-source leveldb.
+
+The code under this directory implements a system for maintaining a
+persistent key/value store.
+
+See doc/index.html and github wiki (https://github.com/facebook/rocksdb/wiki)
+for more explanation.
+
+The public interface is in include/*.  Callers should not include or
+rely on the details of any other header files in this package.  Those
+internal APIs may be changed without warning.
+
+Guide to header files:
+
+include/rocksdb/db.h
+    Main interface to the DB: Start here
+
+include/rocksdb/options.h
+    Control over the behavior of an entire database, and also
+    control over the behavior of individual reads and writes.
+
+include/rocksdb/comparator.h
+    Abstraction for user-specified comparison function.  If you want
+    just bytewise comparison of keys, you can use the default comparator,
+    but clients can write their own comparator implementations if they
+    want custom ordering (e.g. to handle different character
+    encodings, etc.)
+
+include/rocksdb/iterator.h
+    Interface for iterating over data. You can get an iterator
+    from a DB object.
+
+include/rocksdb/write_batch.h
+    Interface for atomically applying multiple updates to a database.
+
+include/rocksdb/slice.h
+    A simple module for maintaining a pointer and a length into some
+    other byte array.
+
+include/rocksdb/status.h
+    Status is returned from many of the public interfaces and is used
+    to report success and various kinds of errors.
+
+include/rocksdb/env.h
+    Abstraction of the OS environment.  A posix implementation of
+    this interface is in util/env_posix.cc
+
+include/rocksdb/table_builder.h
+    Lower-level modules that most clients probably won't use directly
+
+include/rocksdb/cache.h
+    An API for the block cache.
+
+include/rocksdb/compaction_filter.h
+    An API for an application filter invoked on every compaction.
+
+include/rocksdb/filter_policy.h
+    An API for configuring a bloom filter.
+
+include/rocksdb/memtablerep.h
+    An API for implementing a memtable.
+
+include/rocksdb/statistics.h
+    An API to retrieve various database statistics.
+
+include/rocksdb/transaction_log.h
+    An API to retrieve transaction logs from a database.
+
+Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/
diff --git a/ROCKSDB_LITE.md b/ROCKSDB_LITE.md
new file mode 100644 (file)
index 0000000..e7e3752
--- /dev/null
@@ -0,0 +1,20 @@
+# RocksDBLite
+
+RocksDBLite is a project focused on mobile use cases, which don't need a lot of the fancy features we've built for server workloads and are very sensitive to binary size. For that reason, we added a compile flag ROCKSDB_LITE that comments out a lot of the nonessential code and keeps the binary lean.
+
+Some examples of the features disabled by ROCKSDB_LITE:
+* compiled-in support for LDB tool
+* No backupable DB
+* No support for replication (which we provide in the form of TransactionalIterator)
+* No advanced monitoring tools
+* No special-purpose memtables that are highly optimized for specific use cases
+
+When adding a new big feature to RocksDB, please add ROCKSDB_LITE compile guard if:
+* Nobody from mobile really needs your feature,
+* Your feature is adding a lot of weight to the binary.
+
+Don't add ROCKSDB_LITE compile guard if:
+* It would introduce a lot of code complexity. Compile guards make code harder to read. It's a trade-off.
+* Your feature is not adding a lot of weight.
+
+If unsure, ask. :)
diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform
new file mode 100755 (executable)
index 0000000..99a212b
--- /dev/null
@@ -0,0 +1,313 @@
+#!/bin/sh
+#
+# Detects OS we're compiling on and outputs a file specified by the first
+# argument, which in turn gets read while processing Makefile.
+#
+# The output will set the following variables:
+#   CC                          C Compiler path
+#   CXX                         C++ Compiler path
+#   PLATFORM_LDFLAGS            Linker flags
+#   PLATFORM_SHARED_EXT         Extension for shared libraries
+#   PLATFORM_SHARED_LDFLAGS     Flags for building shared library
+#   PLATFORM_SHARED_CFLAGS      Flags for compiling objects for shared library
+#   PLATFORM_CCFLAGS            C compiler flags
+#   PLATFORM_CXXFLAGS           C++ compiler flags.  Will contain:
+#   PLATFORM_SHARED_VERSIONED   Set to 'true' if platform supports versioned
+#                               shared libraries, empty otherwise.
+#
+# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
+#
+#       -DLEVELDB_PLATFORM_POSIX if cstdatomic is present
+#       -DLEVELDB_PLATFORM_NOATOMIC if it is not
+#       -DSNAPPY                    if the Snappy library is present
+#       -DLZ4                       if the LZ4 library is present
+#
+# Using gflags in rocksdb:
+# Our project depends on gflags, which requires users to take some extra steps
+# before they can compile the whole repository:
+#   1. Install gflags. You may download it from here:
+#      https://code.google.com/p/gflags/
+#   2. Once installed, add the include path/lib path for gflags to CPATH and
+#      LIBRARY_PATH respectively. If installed with default mode, the
+#      lib and include path will be /usr/local/lib and /usr/local/include
+# Mac users can do this by running build_tools/mac-install-gflags.sh
+
+OUTPUT=$1
+if test -z "$OUTPUT"; then
+  echo "usage: $0 <output-filename>" >&2
+  exit 1
+fi
+
+# we depend on C++11
+PLATFORM_CXXFLAGS="-std=c++11"
+# we currently depend on POSIX platform
+COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX"
+
+# Default to fbcode gcc on internal fb machines
+if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
+    FBCODE_BUILD="true"
+    if [ -z "$USE_CLANG" ]; then
+        CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \
+          $(rpm -q --whatprovides redhat-release)`
+        if [ "$CENTOS_VERSION" = "6" ]; then
+          source $PWD/build_tools/fbcode.gcc481.sh
+        else
+          source $PWD/build_tools/fbcode.gcc471.sh
+        fi
+    else
+        source $PWD/build_tools/fbcode.clang31.sh
+    fi
+fi
+
+# Delete existing output, if it exists
+rm -f $OUTPUT
+touch $OUTPUT
+
+if test -z "$CC"; then
+   CC=cc
+fi
+
+if test -z "$CXX"; then
+    CXX=g++
+fi
+
+# Detect OS
+if test -z "$TARGET_OS"; then
+    TARGET_OS=`uname -s`
+fi
+
+COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}"
+CROSS_COMPILE=
+PLATFORM_CCFLAGS=
+PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}"
+PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS"
+PLATFORM_SHARED_EXT="so"
+PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
+PLATFORM_SHARED_CFLAGS="-fPIC"
+PLATFORM_SHARED_VERSIONED=false
+
+# generic port files (working on all platform by #ifdef) go directly in /port
+GENERIC_PORT_FILES=`cd $ROCKSDB_ROOT; find port -name '*.cc' | tr "\n" " "`
+
+# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
+case "$TARGET_OS" in
+    Darwin)
+        PLATFORM=OS_MACOSX
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX"
+        PLATFORM_SHARED_EXT=dylib
+        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
+        # PORT_FILES=port/darwin/darwin_specific.cc
+        ;;
+    IOS)
+        PLATFORM=IOS
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE -DROCKSDB_LITE"
+        PLATFORM_SHARED_EXT=dylib
+        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
+        CROSS_COMPILE=true
+        ;;
+    Linux)
+        PLATFORM=OS_LINUX
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX"
+        if [ -z "$USE_CLANG" ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
+        fi
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
+        # PORT_FILES=port/linux/linux_specific.cc
+        ;;
+    SunOS)
+        PLATFORM=OS_SOLARIS
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
+        # PORT_FILES=port/sunos/sunos_specific.cc
+        ;;
+    FreeBSD)
+        PLATFORM=OS_FREEBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
+        # PORT_FILES=port/freebsd/freebsd_specific.cc
+        ;;
+    NetBSD)
+        PLATFORM=OS_NETBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lgcc_s"
+        # PORT_FILES=port/netbsd/netbsd_specific.cc
+        ;;
+    OpenBSD)
+        PLATFORM=OS_OPENBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread"
+        # PORT_FILES=port/openbsd/openbsd_specific.cc
+        ;;
+    DragonFly)
+        PLATFORM=OS_DRAGONFLYBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
+        # PORT_FILES=port/dragonfly/dragonfly_specific.cc
+        ;;
+    OS_ANDROID_CROSSCOMPILE)
+        PLATFORM=OS_ANDROID
+       COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
+       PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS "  # All pthread features are in the Android C library
+        # PORT_FILES=port/android/android.cc
+        CROSS_COMPILE=true
+        ;;
+    *)
+        echo "Unknown platform!" >&2
+        exit 1
+esac
+
+if test -z "$DO_NOT_RUN_BUILD_DETECT_VERSION"; then
+  $PWD/build_tools/build_detect_version
+fi
+
+# We want to make a list of all cc files within util, db, table, and helpers
+# except for the test and benchmark files. By default, find will output a list
+# of all files matching either rule, so we need to append -print to make the
+# prune take effect.
+DIRS="util db table utilities"
+
+set -f # temporarily disable globbing so that our patterns aren't expanded
+PRUNE_TEST="-name *test*.cc -prune"
+PRUNE_BENCH="-name *bench*.cc -prune"
+PORTABLE_FILES=`cd $ROCKSDB_ROOT; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "`
+PORTABLE_CPP=`cd $ROCKSDB_ROOT; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cpp' -print | sort | tr "\n" " "`
+set +f # re-enable globbing
+
+# The sources consist of the portable files, plus the platform-specific port
+# file.
+echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> $OUTPUT
+echo "SOURCESCPP=$PORTABLE_CPP" >> $OUTPUT
+echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
+
+if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then
+    # Cross-compiling; do not try any compilation tests.
+    # Also don't need any compilation tests if compiling on fbcode
+    true
+else
+    # do fPIC on 64 bit in non-fbcode environment
+    case "$TARGET_OS" in
+        x86_64)
+            PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS -fPIC"
+    esac
+
+    # If -std=c++0x works, use <atomic>.  Otherwise use port_posix.h.
+    $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <atomic>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT"
+    fi
+
+    # Test whether fallocate is available
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <fcntl.h>
+      int main() {
+       int fd = open("/dev/null", 0);
+       fallocate(fd, 0, 0, 1024);
+      }
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT"
+    fi
+
+    # Test whether Snappy library is installed
+    # http://code.google.com/p/snappy/
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <snappy.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy"
+    fi
+
+
+    # Test whether gflags library is installed
+    # http://code.google.com/p/gflags/
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <gflags/gflags.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
+    fi
+
+    # Test whether zlib library is installed
+    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <zlib.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DZLIB"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz"
+    fi
+
+    # Test whether bzip library is installed
+    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <bzlib.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DBZIP2"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2"
+    fi
+
+    # Test whether lz4 library is installed
+    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <lz4.h>
+      #include <lz4hc.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DLZ4"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -llz4"
+    fi
+
+    # Test whether tcmalloc is available
+    $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null  <<EOF
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
+    fi
+fi
+
+# shall we use HDFS?
+
+if test "$USE_HDFS"; then
+  if test -z "$JAVA_HOME"; then
+    echo "JAVA_HOME has to be set for HDFS usage."
+    exit 1
+  fi
+  HDFS_CCFLAGS="$HDFS_CCFLAGS -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -DUSE_HDFS"
+  HDFS_LDFLAGS="$HDFS_LDFLAGS -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64"
+  HDFS_LDFLAGS="$HDFS_LDFLAGS -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib"
+  HDFS_LDFLAGS="$HDFS_LDFLAGS -ldl -lverify -ljava -ljvm"
+  COMMON_FLAGS="$COMMON_FLAGS $HDFS_CCFLAGS"
+  PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS $HDFS_LDFLAGS"
+fi
+
+# if Intel SSE instruction set is supported, set USE_SSE=" -msse -msse4.2 "
+COMMON_FLAGS="$COMMON_FLAGS $USE_SSE"
+
+PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
+PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
+
+VALGRIND_VER="$VALGRIND_VER"
+
+echo "CC=$CC" >> $OUTPUT
+echo "CXX=$CXX" >> $OUTPUT
+echo "PLATFORM=$PLATFORM" >> $OUTPUT
+echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
+echo "VALGRIND_VER=$VALGRIND_VER" >> $OUTPUT
+echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
+echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
+echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
+echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
+echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
+echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT
+echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> $OUTPUT
+echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> $OUTPUT
+echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> $OUTPUT
diff --git a/build_tools/build_detect_version b/build_tools/build_detect_version
new file mode 100755 (executable)
index 0000000..f7d711f
--- /dev/null
@@ -0,0 +1,22 @@
+#!/bin/sh
+#
+# Record the version of the source that we are compiling.
+# We keep a record of the git revision in util/version.cc. This source file
+# is then built as a regular source file as part of the compilation process.
+# One can run "strings executable_filename | grep _build_" to find the version of
+# the source that we used to build the executable file.
+
+OUTFILE="$PWD/util/build_version.cc"
+
+GIT_SHA=""
+if command -v git >/dev/null 2>&1; then
+    GIT_SHA=$(git rev-parse HEAD 2>/dev/null)
+fi
+
+cat > "${OUTFILE}" <<EOF
+#include "build_version.h"
+const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:${GIT_SHA}";
+const char* rocksdb_build_git_datetime = "rocksdb_build_git_datetime:$(date)";
+const char* rocksdb_build_compile_date = __DATE__;
+const char* rocksdb_build_compile_time = __TIME__;
+EOF
diff --git a/build_tools/fbcode.clang31.sh b/build_tools/fbcode.clang31.sh
new file mode 100644 (file)
index 0000000..25a2ca7
--- /dev/null
@@ -0,0 +1,74 @@
+#!/bin/sh
+#
+# Set environment variables so that we can compile rocksdb using
+# fbcode settings.  It uses the clang compiler and also
+# uses jemalloc
+
+TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
+TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
+TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
+TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
+GLIBC_RUNTIME_PATH=/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1
+
+# location of libgcc
+LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include"
+LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs"
+
+# location of glibc
+GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include"
+GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
+
+# location of snappy headers and libraries
+SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
+SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
+
+# location of zlib headers and libraries
+ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
+ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
+GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
+
+# location of bzip headers and libraries
+BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include"
+BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a"
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
+GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
+
+# use Intel SSE support for checksum calculations
+export USE_SSE=" -msse -msse4.2 "
+
+CC="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang $CLANG_INCLUDES"
+CXX="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang++ $CLANG_INCLUDES $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
+AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
+RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
+
+CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin -nostdlib "
+CFLAGS+=" -nostdinc -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1 "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/x86_64-facebook-linux "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/backward "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/clang/clang-3.2/0b7c69d/lib/clang/3.2/include "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include/linux "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include "
+CFLAGS+=" -Wall -Wno-sign-compare -Wno-unused-variable -Winvalid-pch -Wno-deprecated -Woverloaded-virtual"
+CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
+CXXFLAGS="$CFLAGS -nostdinc++"
+
+CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
+
+EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
+EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
+EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
+EXEC_LDFLAGS+=" -Wl,--dynamic-linker,$GLIBC_RUNTIME_PATH/lib/ld-linux-x86-64.so.2"
+EXEC_LDFLAGS+=" -B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $GFLAGS_LIBS"
+
+export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED 
diff --git a/build_tools/fbcode.gcc471.sh b/build_tools/fbcode.gcc471.sh
new file mode 100644 (file)
index 0000000..9294057
--- /dev/null
@@ -0,0 +1,70 @@
+#!/bin/sh
+#
+# Set environment variables so that we can compile rocksdb using
+# fbcode settings.  It uses the latest g++ compiler and also
+# uses jemalloc
+
+TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
+TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
+TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
+TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
+
+# location of libhdfs libraries
+if test "$USE_HDFS"; then
+  JAVA_HOME="/usr/local/jdk-6u22-64"
+  JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
+  GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1"
+  HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
+  HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
+  HDFSLIB+=" -ldl -lverify -ljava -ljvm "
+fi
+
+# location of libgcc
+LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include"
+LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs"
+
+# location of glibc
+GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include"
+GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
+
+# location of snappy headers and libraries
+SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
+SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
+
+# location of zlib headers and libraries
+ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
+ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
+
+# location of bzip headers and libraries
+BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include"
+BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a"
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
+GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
+
+# use Intel SSE support for checksum calculations
+export USE_SSE=" -msse -msse4.2 "
+
+CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/gcc"
+CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
+AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
+RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
+
+CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
+CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
+CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
+CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2"
+
+EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
+EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
+EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
+
+VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/91ddd43/bin/"
+
+export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER
diff --git a/build_tools/fbcode.gcc481.sh b/build_tools/fbcode.gcc481.sh
new file mode 100644 (file)
index 0000000..556d15a
--- /dev/null
@@ -0,0 +1,81 @@
+#!/bin/sh
+#
+# Set environment variables so that we can compile rocksdb using
+# fbcode settings.  It uses the latest g++ compiler and also
+# uses jemalloc
+
+TOOLCHAIN_REV=53dc1fe83f84e9145b9ffb81b81aa7f6a49c87cc
+CENTOS_VERSION=`rpm -q --qf "%{VERSION}" $(rpm -q --whatprovides redhat-release)`
+if [ "$CENTOS_VERSION" = "6" ]; then
+  TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos6-native"
+else
+  TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
+fi
+TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.8.1-glibc-2.17"
+
+# location of libhdfs libraries
+if test "$USE_HDFS"; then
+  JAVA_HOME="/usr/local/jdk-6u22-64"
+  JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
+  GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.8.1-glibc-2.17"
+  HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
+  HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
+  HDFSLIB+=" -ldl -lverify -ljava -ljvm "
+fi
+
+# location of libgcc
+LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/include"
+LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/libs"
+
+# location of glibc
+GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/include"
+GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/lib"
+
+# location of snappy headers and libraries
+SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/include"
+SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/lib/libsnappy.a"
+
+# location of zlib headers and libraries
+ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/include"
+ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/lib/libz.a"
+
+# location of bzip headers and libraries
+BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/include"
+BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/lib/libbz2.a"
+
+LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b
+LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r117/gcc-4.8.1-glibc-2.17/c3f970a/include"
+LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/$LZ4_REV/r117/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a"
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/include"
+GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/lib/libgflags.a"
+
+# location of jemalloc
+JEMALLOC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/include/"
+JEMALLOC_LIB=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/lib/libjemalloc.a"
+
+# use Intel SSE support for checksum calculations
+export USE_SSE=" -msse -msse4.2 "
+
+CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/gcc"
+CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE"
+AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
+RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
+
+CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
+CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
+CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2 -DLZ4"
+
+EXEC_LDFLAGS="-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so"
+EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/675d945/lib/libunwind.a"
+EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
+
+VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/c3f970a/bin/"
+
+export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE
diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh
new file mode 100755 (executable)
index 0000000..2d60620
--- /dev/null
@@ -0,0 +1,107 @@
+#!/bin/bash
+# If the clang_format_diff.py command is not specified, we assume we are able
+# to access it directly without any path.
+if [ -z $CLANG_FORMAT_DIFF ]
+then
+CLANG_FORMAT_DIFF="clang-format-diff.py"
+fi
+
+# Check clang-format-diff.py
+if ! which $CLANG_FORMAT_DIFF &> /dev/null
+then
+  echo "You didn't have clang-format-diff.py available in your computer!"
+  echo "You can download it by running: "
+  echo "    curl http://goo.gl/iUW1u2"
+  exit 128
+fi
+
+# Check argparse, a library that clang-format-diff.py requires.
+python 2>/dev/null << EOF
+import argparse
+EOF
+
+if [ "$?" != 0 ]
+then
+  echo "To run clang-format-diff.py, we'll need the library "argparse" to be"
+  echo "installed. You can try either of the follow ways to install it:"
+  echo "  1. Manually download argparse: https://pypi.python.org/pypi/argparse"
+  echo "  2. easy_install argparse (if you have easy_install)"
+  echo "  3. pip install argparse (if you have pip)"
+  exit 129
+fi
+
+# TODO(kailiu) following work is not complete since we still need to figure
+# out how to add the modified files done pre-commit hook to git's commit index.
+#
+# Check if this script has already been added to pre-commit hook.
+# Will suggest user to add this script to pre-commit hook if their pre-commit
+# is empty.
+# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit"
+# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null
+# then
+#   echo "Would you like to add this script to pre-commit hook, which will do "
+#   echo -n "the format check for all the affected lines before you check in (y/n):"
+#   read add_to_hook
+#   if [ "$add_to_hook" == "y" ]
+#   then
+#     ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH
+#   fi
+# fi
+set -e
+
+uncommitted_code=`git diff HEAD`
+
+# If there are no uncommitted changes, we assume the user is doing a post-commit
+# format check, in which case we'll check the modified lines from the latest commit.
+# Otherwise, we'll check the format of the uncommitted code only.
+if [ -z "$uncommitted_code" ]
+then
+  # Check the format of last commit
+  diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1)
+else
+  # Check the format of uncommitted lines,
+  diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1)
+fi
+
+if [ -z "$diffs" ]
+then
+  echo "Nothing needs to be reformatted!"
+  exit 0
+fi
+
+# Highlight the insertion/deletion from the clang-format-diff.py's output
+COLOR_END="\033[0m"
+COLOR_RED="\033[0;31m" 
+COLOR_GREEN="\033[0;32m" 
+
+echo -e "Detect lines that doesn't follow the format rules:\r"
+# Add the color to the diff. lines added will be green; lines removed will be red.
+echo "$diffs" | 
+  sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" |
+  sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/"
+echo -e "Would you like to fix the format automatically (y/n): \c"
+
+# Make sure under any mode, we can read user input.
+exec < /dev/tty
+read to_fix
+
+if [ "$to_fix" != "y" ]
+then
+  exit 1
+fi
+
+# Do in-place format adjustment.
+git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1
+echo "Files reformatted!"
+
+# Amend to the last commit if the user did the post-commit format check
+if [ -z "$uncommitted_code" ]; then
+  echo -e "Would you like to amend the changes to last commit (`git log HEAD --oneline | head -1`)? (y/n): \c"
+  read to_amend
+
+  if [ "$to_amend" == "y" ]
+  then
+    git commit -a --amend --reuse-message HEAD
+    echo "Amended to last commit"
+  fi
+fi
diff --git a/build_tools/mac-install-gflags.sh b/build_tools/mac-install-gflags.sh
new file mode 100755 (executable)
index 0000000..ef0339c
--- /dev/null
@@ -0,0 +1,25 @@
+#!/bin/sh
+# Install gflags for mac developers.
+
+set -e
+
+DIR=`mktemp -d /tmp/rocksdb_gflags_XXXX`
+
+cd $DIR
+wget https://gflags.googlecode.com/files/gflags-2.0.tar.gz
+tar xvfz gflags-2.0.tar.gz
+cd gflags-2.0
+
+./configure
+make
+make install
+
+# Add include/lib path for g++
+echo 'export LIBRARY_PATH+=":/usr/local/lib"' >> ~/.bash_profile
+echo 'export CPATH+=":/usr/local/include"' >> ~/.bash_profile
+
+echo ""
+echo "-----------------------------------------------------------------------------"
+echo "|                         Installation Completed                            |"
+echo "-----------------------------------------------------------------------------"
+echo "Please run `. ~/bash_profile` to be able to compile with gflags"
diff --git a/build_tools/make_new_version.sh b/build_tools/make_new_version.sh
new file mode 100755 (executable)
index 0000000..a8d524f
--- /dev/null
@@ -0,0 +1,46 @@
+#!/bin/bash
+#  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under the BSD-style license found in the
+#  LICENSE file in the root directory of this source tree. An additional grant
+#  of patent rights can be found in the PATENTS file in the same directory.
+
+set -e
+if [ -z "$GIT" ]
+then
+  GIT="git"
+fi
+
+# Print out colored progress info so that it can be easily
+# distinguished by users.
+function title() {
+  echo -e "\033[1;32m$*\033[0m"
+}
+
+usage="Create new RocksDB version and prepare it for the release process\n"
+usage+="USAGE: ./make_new_version.sh <version>"
+
+# -- Pre-check
+if [[ $# < 1 ]]; then
+  echo -e $usage
+  exit 1
+fi
+
+ROCKSDB_VERSION=$1
+
+GIT_BRANCH=`git rev-parse --abbrev-ref HEAD`
+echo $GIT_BRANCH
+
+if [ $GIT_BRANCH != "master" ]; then
+  echo "Error: Current branch is '$GIT_BRANCH', Please switch to master branch."
+  exit 1
+fi
+
+title "Adding new tag for this release ..."
+BRANCH="$ROCKSDB_VERSION.fb"
+$GIT co -b $BRANCH
+
+# Setting up the proxy for remote repo access
+title "Pushing new branch to remote repo ..."
+git push origin --set-upstream $BRANCH
+
+title "Branch $BRANCH is pushed to github;"
diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh
new file mode 100755 (executable)
index 0000000..58766f5
--- /dev/null
@@ -0,0 +1,330 @@
#!/bin/bash

set -e

NUM=10000000

# Optional positional arguments: data directory and stat-file prefix.
if [ $# -eq 1 ]; then
  DATA_DIR=$1
elif [ $# -eq 2 ]; then
  DATA_DIR=$1
  STAT_FILE=$2
fi

# On the production build servers, set data and stat
# files/directories not in /tmp or else the tempdir cleaning
# scripts will make you very unhappy.
DATA_DIR=${DATA_DIR:-$(mktemp -t -d rocksdb_XXXX)}
STAT_FILE=${STAT_FILE:-$(mktemp -t -u rocksdb_test_stats_XXXX)}

# Remove benchmark artifacts on any exit (success or failure).
function cleanup {
  rm -rf "$DATA_DIR"
  rm -f "$STAT_FILE".fillseq
  rm -f "$STAT_FILE".readrandom
  rm -f "$STAT_FILE".overwrite
  rm -f "$STAT_FILE".memtablefillreadrandom
}

trap cleanup EXIT

# Quote the expansions: with an unset $GIT_BRANCH the unquoted
# '[ -z $GIT_BRANCH ]' collapses to '[ -z ]' and only works by accident,
# and any value containing whitespace would break the test outright.
if [ -z "$GIT_BRANCH" ]; then
  git_br=`git rev-parse --abbrev-ref HEAD`
else
  git_br=$(basename "$GIT_BRANCH")
fi

if [ "$git_br" == "master" ]; then
  git_br=""
else
  git_br="."$git_br
fi

make release
+
+# measure fillseq + fill up the DB for overwrite benchmark
+./db_bench \
+    --benchmarks=fillseq \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$NUM \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0  > ${STAT_FILE}.fillseq
+
+# measure overwrite performance
+./db_bench \
+    --benchmarks=overwrite \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$((NUM / 10)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6  \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=8 > ${STAT_FILE}.overwrite
+
+# fill up the db for readrandom benchmark (1GB total size)
+./db_bench \
+    --benchmarks=fillseq \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$NUM \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=1 > /dev/null
+
+# measure readrandom with 6GB block cache
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandom
+
+# measure readrandom with 6GB block cache and tailing iterator
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --use_tailing_iterator=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandomtailing
+
+# measure readrandom with 100MB block cache
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --cache_size=104857600 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandomsmallblockcache
+
+# measure readrandom with 8k data in memtable
+./db_bench \
+    --benchmarks=overwrite,readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --writes=512 \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --write_buffer_size=1000000000 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandom_mem_sst
+
+
+# fill up the db for readrandom benchmark with filluniquerandom (1GB total size)
+./db_bench \
+    --benchmarks=filluniquerandom \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --bloom_bits=10 \
+    --num=$((NUM / 4)) \
+    --writes=$((NUM / 4)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=1 > /dev/null
+
+# dummy test just to compact the data
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$((NUM / 1000)) \
+    --reads=$((NUM / 1000)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > /dev/null
+
+# measure readrandom after load with filluniquerandom with 6GB block cache
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$((NUM / 4)) \
+    --reads=$((NUM / 4)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --disable_auto_compactions=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandom_filluniquerandom
+
+# measure readwhilewriting after load with filluniquerandom with 6GB block cache
+./db_bench \
+    --benchmarks=readwhilewriting \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$((NUM / 4)) \
+    --reads=$((NUM / 4)) \
+    --writes_per_second=1000 \
+    --write_buffer_size=100000000 \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readwhilewriting
+
+# measure memtable performance -- none of the data gets flushed to disk
+./db_bench \
+    --benchmarks=fillrandom,readrandom, \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --num=$((NUM / 10)) \
+    --reads=$NUM \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --write_buffer_size=1000000000 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --value_size=10 \
+    --threads=16 > ${STAT_FILE}.memtablefillreadrandom
+
# send data to ods
# Report one (key, value) data point to ODS, or just echo it when running
# outside of Jenkins. A missing value is reported to stderr and skipped.
function send_to_ods {
  key="$1"
  value="$2"

  # Quoted so the test is well-formed even when JENKINS_HOME is unset.
  if [ -z "$JENKINS_HOME" ]; then
    # running on devbox, just print out the values
    echo "$key" "$value"
    return
  fi

  if [ -z "$value" ]; then
    echo >&2 "ERROR: Key $key doesn't have a value."
    return
  fi
  curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_br&key=$key&value=$value" \
    --connect-timeout 60
}
+
# Parse the db_bench log in $3 for benchmark $1 and push its QPS and latency
# percentiles to ODS under the key prefix given in $2.
function send_benchmark_to_ods {
  bench="$1"
  bench_key="$2"
  file="$3"

  # db_bench prints the throughput on the benchmark's summary line; the
  # percentile breakdown sits four lines below it, so grab that line once
  # and pick the relevant columns out of it.
  QPS=$(grep $bench $file | awk '{print $5}')
  percentile_line=$(grep $bench $file -A 4 | tail -n1)
  P50_MICROS=$(echo "$percentile_line" | awk '{print $3}')
  P75_MICROS=$(echo "$percentile_line" | awk '{print $5}')
  P99_MICROS=$(echo "$percentile_line" | awk '{print $7}')

  send_to_ods rocksdb.build.$bench_key.qps $QPS
  send_to_ods rocksdb.build.$bench_key.p50_micros $P50_MICROS
  send_to_ods rocksdb.build.$bench_key.p75_micros $P75_MICROS
  send_to_ods rocksdb.build.$bench_key.p99_micros $P99_MICROS
}
+
# Publish every collected benchmark result to ODS. Note that the memtable
# run's log ($STAT_FILE.memtablefillreadrandom) is parsed twice: once for
# its fillrandom phase and once for its readrandom phase.
send_benchmark_to_ods overwrite overwrite $STAT_FILE.overwrite
send_benchmark_to_ods fillseq fillseq $STAT_FILE.fillseq
send_benchmark_to_ods readrandom readrandom $STAT_FILE.readrandom
send_benchmark_to_ods readrandom readrandom_tailing $STAT_FILE.readrandomtailing
send_benchmark_to_ods readrandom readrandom_smallblockcache $STAT_FILE.readrandomsmallblockcache
send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_mem_sst
send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom
send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom
send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom
send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting
diff --git a/build_tools/valgrind_test.sh b/build_tools/valgrind_test.sh
new file mode 100755 (executable)
index 0000000..8c7e521
--- /dev/null
@@ -0,0 +1,15 @@
#!/bin/bash
# A shell script for Jenkins to run valgrind on rocksdb tests
# Returns 0 on success when there are no failed tests

VALGRIND_DIR=build_tools/VALGRIND_LOGS
make clean
make -j$(nproc) valgrind_check
# Read from stdin so wc prints only the count (no filename to strip with
# awk). The file contains a header line, hence the minus one.
NUM_FAILED_TESTS=$((`wc -l < $VALGRIND_DIR/valgrind_failed_tests` - 1))
if [ $NUM_FAILED_TESTS -lt 1 ]; then
  echo No tests have valgrind errors
  exit 0
else
  cat $VALGRIND_DIR/valgrind_failed_tests
  exit 1
fi
diff --git a/coverage/coverage_test.sh b/coverage/coverage_test.sh
new file mode 100755 (executable)
index 0000000..08dbd05
--- /dev/null
@@ -0,0 +1,78 @@
#!/bin/bash

# Exit on error.
set -e

if [ -n "$USE_CLANG" ]; then
  echo "Error: Coverage test is supported only for gcc."
  exit 1
fi

ROOT=".."
# Fetch right version of gcov
if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
  source $ROOT/build_tools/fbcode.gcc471.sh
  GCOV=$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1/cc6c9dc/bin/gcov
else
  GCOV=$(which gcov)
fi

COVERAGE_DIR="$PWD/COVERAGE_REPORT"
mkdir -p $COVERAGE_DIR

# Find all gcno files to generate the coverage report

GCNO_FILES=`find $ROOT -name "*.gcno"`
$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
  # Parse the raw gcov report to more human readable form.
  python $ROOT/coverage/parse_gcov_output.py |
  # Write the output to both stdout and report file.
  tee $COVERAGE_DIR/coverage_report_all.txt &&
echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n"

# TODO: we also need to get the files of the latest commits.
# Get the most recently committed files.
LATEST_FILES=`
  git show --pretty="format:" --name-only HEAD |
  grep -v "^$" |
  paste -s -d,`
RECENT_REPORT=$COVERAGE_DIR/coverage_report_recent.txt

echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT
# Use the double-dash long option: optparse parses the single-dash
# '-interested-files' as the short option '-i' with the value
# 'nterested-files', silently filtering on the wrong file names.
$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
  python $ROOT/coverage/parse_gcov_output.py --interested-files $LATEST_FILES |
  tee -a $RECENT_REPORT &&
echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n"

# Unless otherwise specified, we'll not generate html report by default
if [ -z "$HTML" ]; then
  exit 0
fi

# Generate the html report. If we cannot find lcov in this machine, we'll simply
# skip this step.
echo "Generating the html coverage report..."

# Redirect stderr of 'which' itself: previously the redirection applied to
# 'true', which never writes anything.
LCOV=$(which lcov 2>/dev/null || true)
if [ -z "$LCOV" ]
then
  echo "Skip: Cannot find lcov to generate the html report."
  exit 0
fi

# Quote and use -z: the old unquoted '[ $LCOV_VERSION ]' blew up with
# "too many arguments" on the multi-word 'lcov -v' output, and its sense
# was inverted relative to the message — we want to bail when the version
# string does NOT match the expected 1.1.
LCOV_VERSION=$(lcov -v | grep 1.1 || true)
if [ -z "$LCOV_VERSION" ]
then
  echo "Not supported lcov version. Expect lcov 1.1."
  exit 0
fi

(cd $ROOT; lcov --no-external \
     --capture  \
     --directory $PWD \
     --gcov-tool $GCOV \
     --output-file $COVERAGE_DIR/coverage.info)

genhtml $COVERAGE_DIR/coverage.info -o $COVERAGE_DIR

echo "HTML Coverage report is generated in $COVERAGE_DIR"
diff --git a/coverage/parse_gcov_output.py b/coverage/parse_gcov_output.py
new file mode 100644 (file)
index 0000000..72e8b07
--- /dev/null
@@ -0,0 +1,118 @@
+import optparse
+import re
+import sys
+
+from optparse import OptionParser
+
# the gcov report follows certain pattern. Each file will have two lines
# of report, from which we can extract the file name, total lines and coverage
# percentage.
def parse_gcov_report(gcov_input):
    """Parse raw gcov output into per-file and total coverage numbers.

    Args:
        gcov_input: iterable of gcov report lines (e.g. a file object).

    Returns:
        (per_file_coverage, total_coverage) where per_file_coverage maps
        file name -> (coverage percentage, number of lines), and
        total_coverage is the summary (coverage, lines) tuple, or None if
        the report contained no summary line.
    """
    per_file_coverage = {}
    total_coverage = None
    # File name from the most recent "File '...'" line; None between records.
    # Initializing it here also prevents a NameError if the report starts
    # with a "Lines executed" line.
    current_file = None

    # Fix: iterate over the passed-in input instead of always reading
    # sys.stdin — the parameter was previously ignored.
    for line in gcov_input:
        line = line.strip()

        # --First line of the coverage report (with file name in it)?
        match_obj = re.match("^File '(.*)'$", line)
        if match_obj:
            # fetch the file name from the first line of the report.
            current_file = match_obj.group(1)
            continue

        # -- Second line of the file report (with coverage percentage)
        match_obj = re.match("^Lines executed:(.*)% of (.*)", line)

        if match_obj:
            coverage = float(match_obj.group(1))
            lines = int(match_obj.group(2))

            if current_file is not None:
                per_file_coverage[current_file] = (coverage, lines)
                current_file = None
            else:
                # If current_file is not set, we reach the last line of report,
                # which contains the summarized coverage percentage.
                total_coverage = (coverage, lines)
            continue

        # If the line's pattern doesn't fall into the above categories. We
        # can simply ignore them since they're either empty line or doesn't
        # find executable lines of the given file.
        current_file = None

    return per_file_coverage, total_coverage
+
def get_option_parser():
    """Create the OptionParser for this script.

    The single option, --interested-files/-i, accepts a comma separated
    list of source file names; when supplied, the report is restricted to
    those files.
    """
    usage = ("Parse the gcov output and generate more human-readable code "
             "coverage report.")
    parser = OptionParser(usage)

    parser.add_option(
        "--interested-files", "-i",
        dest="filenames",
        help="Comma separated files names. if specified, we will display "
             "the coverage report only for interested source files. "
             "Otherwise we will display the coverage report for all "
             "source files."
    )
    return parser
+
def display_file_coverage(per_file_coverage, total_coverage):
    """Print a fixed-width table of per-file coverage plus an optional total.

    Args:
        per_file_coverage: dict mapping file name -> (coverage %, lines).
            Must be non-empty — the column width is derived from its keys.
        total_coverage: optional (coverage %, lines) tuple appended as a
            final "Total" row when truthy.
    """
    # To print out auto-adjustable column, we need to know the longest
    # length of file names.
    max_file_name_length = max(
        len(fname) for fname in per_file_coverage.keys()
    )

    # -- Print header
    # size of separator is determined by 3 column sizes:
    # file name, coverage percentage and lines.
    header_template = \
        "%" + str(max_file_name_length) + "s\t%s\t%s"
    separator = "-" * (max_file_name_length + 10 + 20)
    # Each print emits a single expression, so the parenthesized call form
    # produces identical output under Python 2 while also being valid
    # Python 3 (the bare print-statement form is Python 2 only).
    print(header_template % ("Filename", "Coverage", "Lines"))
    print(separator)

    # -- Print body
    # template for printing coverage report for each file.
    record_template = "%" + str(max_file_name_length) + "s\t%5.2f%%\t%10d"

    for fname, coverage_info in per_file_coverage.items():
        coverage, lines = coverage_info
        print(record_template % (fname, coverage, lines))

    # -- Print footer
    if total_coverage:
        print(separator)
        print(record_template % ("Total", total_coverage[0], total_coverage[1]))
+
def report_coverage():
    """Read a gcov report from stdin and print a formatted coverage table.

    Honors --interested-files to restrict the report to a subset of source
    files; in that case the "Total" row is suppressed since it would not
    correspond to the filtered set.
    """
    parser = get_option_parser()
    (options, args) = parser.parse_args()

    interested_files = set()
    if options.filenames is not None:
        interested_files = set(f.strip() for f in options.filenames.split(','))

    # To make things simple, right now we only read gcov report from the input
    per_file_coverage, total_coverage = parse_gcov_report(sys.stdin)

    # Check if we need to display coverage info for interested files.
    if len(interested_files):
        per_file_coverage = dict(
            (fname, per_file_coverage[fname]) for fname in interested_files
            if fname in per_file_coverage
        )
        # If we only interested in several files, it makes no sense to report
        # the total_coverage
        total_coverage = None

    if not len(per_file_coverage):
        # sys.stderr.write works under both Python 2 and 3, unlike the
        # Python 2-only 'print >> sys.stderr' statement form it replaces.
        sys.stderr.write("Cannot find coverage info for the given files.\n")
        return
    display_file_coverage(per_file_coverage, total_coverage)
+
+if __name__ == "__main__":
+    report_coverage()
diff --git a/db/builder.cc b/db/builder.cc
new file mode 100644 (file)
index 0000000..ce85ae5
--- /dev/null
@@ -0,0 +1,224 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/merge_helper.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based_table_builder.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+class TableFactory;
+
// Thin convenience wrapper: delegates table construction to the table
// factory configured in `options`, so callers never touch
// options.table_factory directly. The caller owns the returned builder
// (BuildTable below deletes it); `file` is borrowed and must outlive it.
TableBuilder* NewTableBuilder(const Options& options,
                              const InternalKeyComparator& internal_comparator,
                              WritableFile* file,
                              CompressionType compression_type) {
  return options.table_factory->NewTableBuilder(options, internal_comparator,
                                                file, compression_type);
}
+
// Writes the contents of *iter (typically a memtable iterator, which yields
// internal keys in sorted order) into a new table file and records the
// file's key range, seqno range and size in *meta. If the iterator is empty,
// meta->file_size stays 0 and no file is left behind. See builder.h for the
// full contract.
Status BuildTable(const std::string& dbname, Env* env, const Options& options,
                  const EnvOptions& soptions, TableCache* table_cache,
                  Iterator* iter, FileMetaData* meta,
                  const InternalKeyComparator& internal_comparator,
                  const SequenceNumber newest_snapshot,
                  const SequenceNumber earliest_seqno_in_memtable,
                  const CompressionType compression) {
  Status s;
  meta->file_size = 0;
  meta->smallest_seqno = meta->largest_seqno = 0;
  iter->SeekToFirst();

  // If the sequence number of the smallest entry in the memtable is
  // smaller than the most recent snapshot, then we do not trigger
  // removal of duplicate/deleted keys as part of this builder.
  bool purge = options.purge_redundant_kvs_while_flush;
  if (earliest_seqno_in_memtable <= newest_snapshot) {
    purge = false;
  }

  std::string fname = TableFileName(dbname, meta->number);
  // Only create the output file if there is at least one entry to write.
  if (iter->Valid()) {
    unique_ptr<WritableFile> file;
    s = env->NewWritableFile(fname, &file, soptions);
    if (!s.ok()) {
      return s;
    }

    TableBuilder* builder =
        NewTableBuilder(options, internal_comparator, file.get(), compression);

    // the first key is the smallest key
    Slice key = iter->key();
    meta->smallest.DecodeFrom(key);
    meta->smallest_seqno = GetInternalKeySeqno(key);
    meta->largest_seqno = meta->smallest_seqno;

    MergeHelper merge(internal_comparator.user_comparator(),
                      options.merge_operator.get(), options.info_log.get(),
                      options.min_partial_merge_operands,
                      true /* internal key corruption is not ok */);

    if (purge) {
      // Purging path: keep only the newest version of each user key,
      // collapsing merge operands where possible.

      // Ugly walkaround to avoid compiler error for release build
      bool ok __attribute__((unused)) = true;

      // Will write to builder if current key != prev key
      ParsedInternalKey prev_ikey;
      std::string prev_key;
      bool is_first_key = true;    // Also write if this is the very first key

      while (iter->Valid()) {
        // Set when MergeUntil() has already advanced the iterator, so the
        // Next() at the bottom of this loop must be skipped.
        bool iterator_at_next = false;

        // Get current key
        ParsedInternalKey this_ikey;
        Slice key = iter->key();
        Slice value = iter->value();

        // In-memory key corruption is not ok;
        // TODO: find a clean way to treat in memory key corruption
        ok = ParseInternalKey(key, &this_ikey);
        assert(ok);
        assert(this_ikey.sequence >= earliest_seqno_in_memtable);

        // If the key is the same as the previous key (and it is not the
        // first key), then we skip it, since it is an older version.
        // Otherwise we output the key and mark it as the "new" previous key.
        if (!is_first_key && !internal_comparator.user_comparator()->Compare(
                                  prev_ikey.user_key, this_ikey.user_key)) {
          // seqno within the same key are in decreasing order
          assert(this_ikey.sequence < prev_ikey.sequence);
        } else {
          is_first_key = false;

          if (this_ikey.type == kTypeMerge) {
            // Handle merge-type keys using the MergeHelper
            // TODO: pass statistics to MergeUntil
            merge.MergeUntil(iter, 0 /* don't worry about snapshot */);
            iterator_at_next = true;
            if (merge.IsSuccess()) {
              // Merge completed correctly.
              // Add the resulting merge key/value and continue to next
              builder->Add(merge.key(), merge.value());
              prev_key.assign(merge.key().data(), merge.key().size());
              ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
              assert(ok);
            } else {
              // Merge did not find a Put/Delete.
              // Can not compact these merges into a kValueType.
              // Write them out one-by-one. (Proceed back() to front())
              const std::deque<std::string>& keys = merge.keys();
              const std::deque<std::string>& values = merge.values();
              assert(keys.size() == values.size() && keys.size() >= 1);
              std::deque<std::string>::const_reverse_iterator key_iter;
              std::deque<std::string>::const_reverse_iterator value_iter;
              for (key_iter=keys.rbegin(), value_iter = values.rbegin();
                   key_iter != keys.rend() && value_iter != values.rend();
                   ++key_iter, ++value_iter) {

                builder->Add(Slice(*key_iter), Slice(*value_iter));
              }

              // Sanity check. Both iterators should end at the same time
              assert(key_iter == keys.rend() && value_iter == values.rend());

              // keys are in reverse order, so front() is the last one
              // written out and therefore the new "previous" key.
              prev_key.assign(keys.front());
              ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
              assert(ok);
            }
          } else {
            // Handle Put/Delete-type keys by simply writing them
            builder->Add(key, value);
            prev_key.assign(key.data(), key.size());
            ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
            assert(ok);
          }
        }

        if (!iterator_at_next) iter->Next();
      }

      // The last key is the largest key
      meta->largest.DecodeFrom(Slice(prev_key));
      SequenceNumber seqno = GetInternalKeySeqno(Slice(prev_key));
      meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
      meta->largest_seqno = std::max(meta->largest_seqno, seqno);

    } else {
      // Non-purging path: write every entry as-is, tracking the key and
      // seqno ranges as we go.
      for (; iter->Valid(); iter->Next()) {
        Slice key = iter->key();
        meta->largest.DecodeFrom(key);
        builder->Add(key, iter->value());
        SequenceNumber seqno = GetInternalKeySeqno(key);
        meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
        meta->largest_seqno = std::max(meta->largest_seqno, seqno);
      }
    }

    // Finish and check for builder errors
    if (s.ok()) {
      s = builder->Finish();
      if (s.ok()) {
        meta->file_size = builder->FileSize();
        assert(meta->file_size > 0);
      }
    } else {
      builder->Abandon();
    }
    delete builder;

    // Finish and check for file errors
    if (s.ok() && !options.disableDataSync) {
      if (options.use_fsync) {
        StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
        s = file->Fsync();
      } else {
        StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
        s = file->Sync();
      }
    }
    if (s.ok()) {
      s = file->Close();
    }

    if (s.ok()) {
      // Verify that the table is usable
      Iterator* it = table_cache->NewIterator(ReadOptions(), soptions,
                                              internal_comparator, *meta);
      s = it->status();
      delete it;
    }
  }

  // Check for input iterator errors
  if (!iter->status().ok()) {
    s = iter->status();
  }

  if (s.ok() && meta->file_size > 0) {
    // Keep it
  } else {
    // Remove any partially written file; also covers the empty-iterator
    // case, where deleting a nonexistent file is a harmless no-op.
    env->DeleteFile(fname);
  }
  return s;
}
+
+}  // namespace rocksdb
diff --git a/db/builder.h b/db/builder.h
new file mode 100644 (file)
index 0000000..6301629
--- /dev/null
@@ -0,0 +1,45 @@
//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "rocksdb/comparator.h"
#include "rocksdb/status.h"
#include "rocksdb/types.h"
#include "rocksdb/options.h"

namespace rocksdb {

// Forward declarations: this header only needs the names, which keeps its
// include footprint small.
struct Options;
struct FileMetaData;

class Env;
struct EnvOptions;
class Iterator;
class TableCache;
class VersionEdit;
class TableBuilder;
class WritableFile;

// Create a TableBuilder via the table factory configured in `options`.
// The caller owns the returned builder; `file` is borrowed and must
// outlive it.
extern TableBuilder* NewTableBuilder(
    const Options& options, const InternalKeyComparator& internal_comparator,
    WritableFile* file, CompressionType compression_type);

// Build a Table file from the contents of *iter.  The generated file
// will be named according to meta->number.  On success, the rest of
// *meta will be filled with metadata about the generated table.
// If no data is present in *iter, meta->file_size will be set to
// zero, and no Table file will be produced.
extern Status BuildTable(const std::string& dbname, Env* env,
                         const Options& options, const EnvOptions& soptions,
                         TableCache* table_cache, Iterator* iter,
                         FileMetaData* meta,
                         const InternalKeyComparator& internal_comparator,
                         const SequenceNumber newest_snapshot,
                         const SequenceNumber earliest_seqno_in_memtable,
                         const CompressionType compression);

}  // namespace rocksdb
diff --git a/db/c.cc b/db/c.cc
new file mode 100644 (file)
index 0000000..b50e59e
--- /dev/null
+++ b/db/c.cc
@@ -0,0 +1,1476 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/c.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <vector>
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/universal_compaction.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+
+using rocksdb::Cache;
+using rocksdb::Comparator;
+using rocksdb::CompressionType;
+using rocksdb::DB;
+using rocksdb::Env;
+using rocksdb::InfoLogLevel;
+using rocksdb::FileLock;
+using rocksdb::FilterPolicy;
+using rocksdb::FlushOptions;
+using rocksdb::Iterator;
+using rocksdb::Logger;
+using rocksdb::MergeOperator;
+using rocksdb::NewBloomFilterPolicy;
+using rocksdb::NewLRUCache;
+using rocksdb::Options;
+using rocksdb::RandomAccessFile;
+using rocksdb::Range;
+using rocksdb::ReadOptions;
+using rocksdb::SequentialFile;
+using rocksdb::Slice;
+using rocksdb::SliceTransform;
+using rocksdb::Snapshot;
+using rocksdb::Status;
+using rocksdb::WritableFile;
+using rocksdb::WriteBatch;
+using rocksdb::WriteOptions;
+using rocksdb::LiveFileMetaData;
+
+using std::shared_ptr;
+
+extern "C" {
+
+// Opaque C handle types.  Each wrapper holds exactly one member, `rep`,
+// which is the underlying C++ object (by value, raw pointer, or
+// shared_ptr as appropriate).  The corresponding *_destroy functions
+// are responsible for releasing pointer-held reps.
+struct rocksdb_t                 { DB*               rep; };
+struct rocksdb_iterator_t        { Iterator*         rep; };
+struct rocksdb_writebatch_t      { WriteBatch        rep; };
+struct rocksdb_snapshot_t        { const Snapshot*   rep; };
+struct rocksdb_flushoptions_t    { FlushOptions      rep; };
+struct rocksdb_readoptions_t     { ReadOptions       rep; };
+struct rocksdb_writeoptions_t    { WriteOptions      rep; };
+struct rocksdb_options_t         { Options           rep; };
+struct rocksdb_seqfile_t         { SequentialFile*   rep; };
+struct rocksdb_randomfile_t      { RandomAccessFile* rep; };
+struct rocksdb_writablefile_t    { WritableFile*     rep; };
+struct rocksdb_filelock_t        { FileLock*         rep; };
+struct rocksdb_logger_t          { shared_ptr<Logger>  rep; };
+struct rocksdb_cache_t           { shared_ptr<Cache>   rep; };
+struct rocksdb_livefiles_t       { std::vector<LiveFileMetaData> rep; };
+
+// Comparator implemented in terms of C callbacks.  All virtuals forward
+// to the stored function pointers with the opaque state_ as the first
+// argument; destructor_ releases whatever state_ points to.
+struct rocksdb_comparator_t : public Comparator {
+  void* state_;
+  void (*destructor_)(void*);
+  int (*compare_)(
+      void*,
+      const char* a, size_t alen,
+      const char* b, size_t blen);
+  const char* (*name_)(void*);
+
+  virtual ~rocksdb_comparator_t() {
+    (*destructor_)(state_);
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
+  }
+
+  virtual const char* Name() const {
+    return (*name_)(state_);
+  }
+
+  // No-ops since the C binding does not support key shortening methods.
+  virtual void FindShortestSeparator(std::string*, const Slice&) const { }
+  virtual void FindShortSuccessor(std::string* key) const { }
+};
+
+// FilterPolicy implemented in terms of C callbacks.  create_ returns a
+// heap buffer holding the filter bytes; if the client supplied
+// delete_filter_ it is used to release that buffer, otherwise free()
+// is assumed (i.e. the callback allocated with malloc).
+struct rocksdb_filterpolicy_t : public FilterPolicy {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+  char* (*create_)(
+      void*,
+      const char* const* key_array, const size_t* key_length_array,
+      int num_keys,
+      size_t* filter_length);
+  unsigned char (*key_match_)(
+      void*,
+      const char* key, size_t length,
+      const char* filter, size_t filter_length);
+  void (*delete_filter_)(
+      void*,
+      const char* filter, size_t filter_length);
+
+  virtual ~rocksdb_filterpolicy_t() {
+    (*destructor_)(state_);
+  }
+
+  virtual const char* Name() const {
+    return (*name_)(state_);
+  }
+
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+    // Flatten the Slice array into parallel pointer/length arrays for C.
+    std::vector<const char*> key_pointers(n);
+    std::vector<size_t> key_sizes(n);
+    for (int i = 0; i < n; i++) {
+      key_pointers[i] = keys[i].data();
+      key_sizes[i] = keys[i].size();
+    }
+    size_t len;
+    char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len);
+    dst->append(filter, len);
+
+    if (delete_filter_ != nullptr) {
+      (*delete_filter_)(state_, filter, len);
+    } else {
+      free(filter);
+    }
+  }
+
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+    return (*key_match_)(state_, key.data(), key.size(),
+                         filter.data(), filter.size());
+  }
+};
+
+// MergeOperator implemented in terms of C callbacks.  Both merge paths
+// copy the callback's result buffer into *new_value and then release it:
+// via delete_value_ if the client provided one, otherwise free().
+struct rocksdb_mergeoperator_t : public MergeOperator {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+  char* (*full_merge_)(
+      void*,
+      const char* key, size_t key_length,
+      const char* existing_value, size_t existing_value_length,
+      const char* const* operands_list, const size_t* operands_list_length,
+      int num_operands,
+      unsigned char* success, size_t* new_value_length);
+  char* (*partial_merge_)(void*, const char* key, size_t key_length,
+                          const char* const* operands_list,
+                          const size_t* operands_list_length, int num_operands,
+                          unsigned char* success, size_t* new_value_length);
+  void (*delete_value_)(
+      void*,
+      const char* value, size_t value_length);
+
+  virtual ~rocksdb_mergeoperator_t() {
+    (*destructor_)(state_);
+  }
+
+  virtual const char* Name() const {
+    return (*name_)(state_);
+  }
+
+  virtual bool FullMerge(
+      const Slice& key,
+      const Slice* existing_value,
+      const std::deque<std::string>& operand_list,
+      std::string* new_value,
+      Logger* logger) const {
+
+    // Flatten the operand deque into parallel pointer/length arrays.
+    size_t n = operand_list.size();
+    std::vector<const char*> operand_pointers(n);
+    std::vector<size_t> operand_sizes(n);
+    for (size_t i = 0; i < n; i++) {
+      Slice operand(operand_list[i]);
+      operand_pointers[i] = operand.data();
+      operand_sizes[i] = operand.size();
+    }
+
+    // A null existing_value (no prior entry for key) is passed through
+    // to the callback as (nullptr, 0).
+    const char* existing_value_data = nullptr;
+    size_t existing_value_len = 0;
+    if (existing_value != nullptr) {
+      existing_value_data = existing_value->data();
+      existing_value_len = existing_value->size();
+    }
+
+    unsigned char success;
+    size_t new_value_len;
+    char* tmp_new_value = (*full_merge_)(
+        state_,
+        key.data(), key.size(),
+        existing_value_data, existing_value_len,
+        &operand_pointers[0], &operand_sizes[0], n,
+        &success, &new_value_len);
+    new_value->assign(tmp_new_value, new_value_len);
+
+    if (delete_value_ != nullptr) {
+      (*delete_value_)(state_, tmp_new_value, new_value_len);
+    } else {
+      free(tmp_new_value);
+    }
+
+    return success;
+  }
+
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value, Logger* logger) const {
+    size_t operand_count = operand_list.size();
+    std::vector<const char*> operand_pointers(operand_count);
+    std::vector<size_t> operand_sizes(operand_count);
+    for (size_t i = 0; i < operand_count; ++i) {
+      Slice operand(operand_list[i]);
+      operand_pointers[i] = operand.data();
+      operand_sizes[i] = operand.size();
+    }
+
+    unsigned char success;
+    size_t new_value_len;
+    char* tmp_new_value = (*partial_merge_)(
+        state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0],
+        operand_count, &success, &new_value_len);
+    new_value->assign(tmp_new_value, new_value_len);
+
+    if (delete_value_ != nullptr) {
+      (*delete_value_)(state_, tmp_new_value, new_value_len);
+    } else {
+      free(tmp_new_value);
+    }
+
+    return success;
+  }
+};
+
+// Env handle.  is_default presumably marks the process-wide default Env
+// so the destroy path knows not to delete it — confirm against
+// rocksdb_env_destroy (not visible in this chunk).
+struct rocksdb_env_t {
+  Env* rep;
+  bool is_default;
+};
+
+// SliceTransform (prefix extractor) implemented in terms of C callbacks.
+// NOTE(review): the buffer returned by transform_ is wrapped in a Slice
+// and never freed here — presumably the callback returns memory whose
+// lifetime the client manages; confirm the documented C API contract.
+struct rocksdb_slicetransform_t : public SliceTransform {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+  char* (*transform_)(
+      void*,
+      const char* key, size_t length,
+      size_t* dst_length);
+  unsigned char (*in_domain_)(
+      void*,
+      const char* key, size_t length);
+  unsigned char (*in_range_)(
+      void*,
+      const char* key, size_t length);
+
+  virtual ~rocksdb_slicetransform_t() {
+    (*destructor_)(state_);
+  }
+
+  virtual const char* Name() const {
+    return (*name_)(state_);
+  }
+
+  virtual Slice Transform(const Slice& src) const {
+    size_t len;
+    char* dst = (*transform_)(state_, src.data(), src.size(), &len);
+    return Slice(dst, len);
+  }
+
+  virtual bool InDomain(const Slice& src) const {
+    return (*in_domain_)(state_, src.data(), src.size());
+  }
+
+  virtual bool InRange(const Slice& src) const {
+    return (*in_range_)(state_, src.data(), src.size());
+  }
+};
+
+// Unlike the other option wrappers, this one holds its rep by pointer.
+struct rocksdb_universal_compaction_options_t {
+  rocksdb::CompactionOptionsUniversal *rep;
+};
+
+// Records a non-OK status into *errptr as a malloc'd C string (freeing
+// any previous message) and returns true; returns false when s.ok().
+// *errptr must point to either nullptr or a previously malloc'd string.
+static bool SaveError(char** errptr, const Status& s) {
+  assert(errptr != nullptr);
+  if (s.ok()) {
+    return false;
+  } else if (*errptr == nullptr) {
+    *errptr = strdup(s.ToString().c_str());
+  } else {
+    // TODO(sanjay): Merge with existing error?
+    free(*errptr);
+    *errptr = strdup(s.ToString().c_str());
+  }
+  return true;
+}
+
+// Returns a malloc'd, NON-nul-terminated copy of str's bytes; callers
+// report the length separately and release the buffer with free().
+// Guard the memcpy: the original passed an unchecked malloc() result
+// straight to memcpy, which is undefined behavior (and a crash) when
+// allocation fails or when malloc(0) legitimately returns nullptr.
+static char* CopyString(const std::string& str) {
+  char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
+  if (result != nullptr) {
+    memcpy(result, str.data(), sizeof(char) * str.size());
+  }
+  return result;
+}
+
+// Opens the database at `name`.  On failure, stores a malloc'd error
+// message in *errptr and returns nullptr; on success the caller owns
+// the returned handle and releases it with rocksdb_close().
+rocksdb_t* rocksdb_open(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr) {
+  DB* db;
+  if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
+    return nullptr;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+// Read-only variant of rocksdb_open; same ownership/error contract.
+rocksdb_t* rocksdb_open_for_read_only(
+    const rocksdb_options_t* options,
+    const char* name,
+    unsigned char error_if_log_file_exist,
+    char** errptr) {
+  DB* db;
+  if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name), &db, error_if_log_file_exist))) {
+    return nullptr;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+// Destroys both the underlying DB and the wrapper handle.
+void rocksdb_close(rocksdb_t* db) {
+  delete db->rep;
+  delete db;
+}
+
+// Write-path forwarders: each wraps the raw key/value bytes in Slices,
+// forwards to the DB, and reports any failure through *errptr.
+void rocksdb_put(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr) {
+  SaveError(errptr,
+            db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_delete(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    char** errptr) {
+  SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
+}
+
+void rocksdb_merge(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr) {
+  SaveError(errptr,
+            db->rep->Merge(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+// Applies a whole batch atomically (batch is not consumed).
+void rocksdb_write(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_t* batch,
+    char** errptr) {
+  SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+// Looks up key.  Returns a malloc'd value buffer (length in *vallen) on
+// hit.  On NotFound: returns nullptr with *vallen == 0 and does NOT set
+// *errptr — only genuine errors are reported.
+char* rocksdb_get(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+// Caller owns the returned iterator; release with rocksdb_iter_destroy.
+rocksdb_iterator_t* rocksdb_create_iterator(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = db->rep->NewIterator(options->rep);
+  return result;
+}
+
+// Caller must pair with rocksdb_release_snapshot on the same db.
+const rocksdb_snapshot_t* rocksdb_create_snapshot(
+    rocksdb_t* db) {
+  rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+  result->rep = db->rep->GetSnapshot();
+  return result;
+}
+
+// Releases the DB-side snapshot, then frees the wrapper itself.
+void rocksdb_release_snapshot(
+    rocksdb_t* db,
+    const rocksdb_snapshot_t* snapshot) {
+  db->rep->ReleaseSnapshot(snapshot->rep);
+  delete snapshot;
+}
+
+// Returns a malloc'd (strdup) property string, or nullptr if the
+// property name is unknown.
+char* rocksdb_property_value(
+    rocksdb_t* db,
+    const char* propname) {
+  std::string tmp;
+  if (db->rep->GetProperty(Slice(propname), &tmp)) {
+    // We use strdup() since we expect human readable output.
+    return strdup(tmp.c_str());
+  } else {
+    return nullptr;
+  }
+}
+
+void rocksdb_approximate_sizes(
+    rocksdb_t* db,
+    int num_ranges,
+    const char* const* range_start_key, const size_t* range_start_key_len,
+    const char* const* range_limit_key, const size_t* range_limit_key_len,
+    uint64_t* sizes) {
+  Range* ranges = new Range[num_ranges];
+  for (int i = 0; i < num_ranges; i++) {
+    ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+    ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+  }
+  db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
+  delete[] ranges;
+}
+
+// Deletes the named SST file.  NOTE(review): the Status returned by
+// DeleteFile is discarded — the C signature offers no errptr here.
+void rocksdb_delete_file(
+    rocksdb_t* db,
+    const char* name) {
+  db->rep->DeleteFile(name);
+}
+
+// Returns a malloc'd-new snapshot of live-file metadata; caller frees
+// it with the corresponding livefiles destroy function.
+const rocksdb_livefiles_t* rocksdb_livefiles(
+    rocksdb_t* db) {
+  rocksdb_livefiles_t* result = new rocksdb_livefiles_t;
+  db->rep->GetLiveFilesMetaData(&result->rep);
+  return result;
+}
+
+// Compacts [start_key, limit_key); a null start or limit pointer means
+// "unbounded" on that side.
+void rocksdb_compact_range(
+    rocksdb_t* db,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      // Pass nullptr Slice if corresponding "const char*" is nullptr
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+// Maintenance forwarders: each delegates to the DB (or the free
+// functions DestroyDB/RepairDB) and reports failure via *errptr.
+void rocksdb_flush(
+    rocksdb_t* db,
+    const rocksdb_flushoptions_t* options,
+    char** errptr) {
+  SaveError(errptr, db->rep->Flush(options->rep));
+}
+
+void rocksdb_disable_file_deletions(
+    rocksdb_t* db,
+    char** errptr) {
+  SaveError(errptr, db->rep->DisableFileDeletions());
+}
+
+void rocksdb_enable_file_deletions(
+    rocksdb_t* db,
+    unsigned char force,
+    char** errptr) {
+  SaveError(errptr, db->rep->EnableFileDeletions(force));
+}
+
+void rocksdb_destroy_db(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr) {
+  SaveError(errptr, DestroyDB(name, options->rep));
+}
+
+void rocksdb_repair_db(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr) {
+  SaveError(errptr, RepairDB(name, options->rep));
+}
+
+// Thin iterator forwarders.  rocksdb_iter_key/value return pointers
+// into the iterator's current entry, not copies — presumably valid only
+// until the iterator is moved or destroyed (standard Iterator contract;
+// confirm against rocksdb/iterator.h).
+void rocksdb_iter_destroy(rocksdb_iterator_t* iter) {
+  delete iter->rep;
+  delete iter;
+}
+
+unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) {
+  return iter->rep->Valid();
+}
+
+void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) {
+  iter->rep->SeekToFirst();
+}
+
+void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) {
+  iter->rep->SeekToLast();
+}
+
+void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) {
+  iter->rep->Seek(Slice(k, klen));
+}
+
+void rocksdb_iter_next(rocksdb_iterator_t* iter) {
+  iter->rep->Next();
+}
+
+void rocksdb_iter_prev(rocksdb_iterator_t* iter) {
+  iter->rep->Prev();
+}
+
+const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) {
+  Slice s = iter->rep->key();
+  *klen = s.size();
+  return s.data();
+}
+
+const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) {
+  Slice s = iter->rep->value();
+  *vlen = s.size();
+  return s.data();
+}
+
+void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) {
+  SaveError(errptr, iter->rep->status());
+}
+
+// WriteBatch forwarders.  The batch owns its WriteBatch by value, so
+// destroy is a plain delete.
+rocksdb_writebatch_t* rocksdb_writebatch_create() {
+  return new rocksdb_writebatch_t;
+}
+
+void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) {
+  delete b;
+}
+
+void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) {
+  b->rep.Clear();
+}
+
+int rocksdb_writebatch_count(rocksdb_writebatch_t* b) {
+  return b->rep.Count();
+}
+
+void rocksdb_writebatch_put(
+    rocksdb_writebatch_t* b,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep.Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_merge(
+    rocksdb_writebatch_t* b,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep.Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_delete(
+    rocksdb_writebatch_t* b,
+    const char* key, size_t klen) {
+  b->rep.Delete(Slice(key, klen));
+}
+
+// Replays the batch through a local WriteBatch::Handler that forwards
+// each Put/Delete record to the supplied C callbacks with `state`.
+// NOTE(review): the Status returned by Iterate is discarded.
+void rocksdb_writebatch_iterate(
+    rocksdb_writebatch_t* b,
+    void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen)) {
+  class H : public WriteBatch::Handler {
+   public:
+    void* state_;
+    void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+    void (*deleted_)(void*, const char* k, size_t klen);
+    virtual void Put(const Slice& key, const Slice& value) {
+      (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+    }
+    virtual void Delete(const Slice& key) {
+      (*deleted_)(state_, key.data(), key.size());
+    }
+  };
+  H handler;
+  handler.state_ = state;
+  handler.put_ = put;
+  handler.deleted_ = deleted;
+  b->rep.Iterate(&handler);
+}
+
+// Exposes the batch's serialized representation.  The returned pointer
+// aliases the batch's internal buffer — do not free it, and do not use
+// it after the batch is modified or destroyed.
+const char* rocksdb_writebatch_data(rocksdb_writebatch_t* b, size_t* size) {
+  *size = b->rep.GetDataSize();
+  return b->rep.Data().c_str();
+}
+
+// ---- Options lifecycle and pointer-valued option setters ----
+// NOTE(review): set_merge_operator wraps the raw pointer in a
+// shared_ptr, i.e. the Options object takes ownership of the merge
+// operator; comparator and filter_policy are stored as raw pointers
+// and remain owned by the caller.
+rocksdb_options_t* rocksdb_options_create() {
+  return new rocksdb_options_t;
+}
+
+void rocksdb_options_destroy(rocksdb_options_t* options) {
+  delete options;
+}
+
+void rocksdb_options_set_comparator(
+    rocksdb_options_t* opt,
+    rocksdb_comparator_t* cmp) {
+  opt->rep.comparator = cmp;
+}
+
+void rocksdb_options_set_merge_operator(
+    rocksdb_options_t* opt,
+    rocksdb_mergeoperator_t* merge_operator) {
+  opt->rep.merge_operator = std::shared_ptr<MergeOperator>(merge_operator);
+}
+
+void rocksdb_options_set_filter_policy(
+    rocksdb_options_t* opt,
+    rocksdb_filterpolicy_t* policy) {
+  opt->rep.filter_policy = policy;
+}
+
+void rocksdb_options_set_create_if_missing(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.create_if_missing = v;
+}
+
+void rocksdb_options_set_error_if_exists(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.error_if_exists = v;
+}
+
+void rocksdb_options_set_paranoid_checks(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.paranoid_checks = v;
+}
+
+void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) {
+  opt->rep.env = (env ? env->rep : nullptr);
+}
+
+void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) {
+  if (l) {
+    opt->rep.info_log = l->rep;
+  }
+}
+
+void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) {
+  if (l) {
+    opt->rep.info_log = l->rep;
+  }
+}
+
+void rocksdb_options_set_info_log_level(
+    rocksdb_options_t* opt, int v) {
+  opt->rep.info_log_level = static_cast<InfoLogLevel>(v);
+}
+
+void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
+  opt->rep.write_buffer_size = s;
+}
+
+void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
+  opt->rep.max_open_files = n;
+}
+
+void rocksdb_options_set_cache(rocksdb_options_t* opt, rocksdb_cache_t* c) {
+  if (c) {
+    opt->rep.block_cache = c->rep;
+  }
+}
+
+void rocksdb_options_set_cache_compressed(rocksdb_options_t* opt, rocksdb_cache_t* c) {
+  if (c) {
+    opt->rep.block_cache_compressed = c->rep;
+  }
+}
+
+void rocksdb_options_set_block_size(rocksdb_options_t* opt, size_t s) {
+  opt->rep.block_size = s;
+}
+
+void rocksdb_options_set_block_restart_interval(rocksdb_options_t* opt, int n) {
+  opt->rep.block_restart_interval = n;
+}
+
+void rocksdb_options_set_target_file_size_base(
+    rocksdb_options_t* opt, uint64_t n) {
+  opt->rep.target_file_size_base = n;
+}
+
+void rocksdb_options_set_target_file_size_multiplier(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.target_file_size_multiplier = n;
+}
+
+void rocksdb_options_set_max_bytes_for_level_base(
+    rocksdb_options_t* opt, uint64_t n) {
+  opt->rep.max_bytes_for_level_base = n;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.max_bytes_for_level_multiplier = n;
+}
+
+void rocksdb_options_set_expanded_compaction_factor(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.expanded_compaction_factor = n;
+}
+
+void rocksdb_options_set_max_grandparent_overlap_factor(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.max_grandparent_overlap_factor = n;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
+    rocksdb_options_t* opt, int* level_values, size_t num_levels) {
+  opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels);
+  for (size_t i = 0; i < num_levels; ++i) {
+    opt->rep.max_bytes_for_level_multiplier_additional[i] = level_values[i];
+  }
+}
+
+void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
+  opt->rep.statistics = rocksdb::CreateDBStatistics();
+}
+
+void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
+  opt->rep.num_levels = n;
+}
+
+void rocksdb_options_set_level0_file_num_compaction_trigger(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.level0_file_num_compaction_trigger = n;
+}
+
+void rocksdb_options_set_level0_slowdown_writes_trigger(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.level0_slowdown_writes_trigger = n;
+}
+
+void rocksdb_options_set_level0_stop_writes_trigger(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.level0_stop_writes_trigger = n;
+}
+
+void rocksdb_options_set_max_mem_compaction_level(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.max_mem_compaction_level = n;
+}
+
+void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) {
+  opt->rep.compression = static_cast<CompressionType>(t);
+}
+
+void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt,
+                                               int* level_values,
+                                               size_t num_levels) {
+  opt->rep.compression_per_level.resize(num_levels);
+  for (size_t i = 0; i < num_levels; ++i) {
+    opt->rep.compression_per_level[i] =
+      static_cast<CompressionType>(level_values[i]);
+  }
+}
+
+void rocksdb_options_set_compression_options(
+    rocksdb_options_t* opt, int w_bits, int level, int strategy) {
+  opt->rep.compression_opts.window_bits = w_bits;
+  opt->rep.compression_opts.level = level;
+  opt->rep.compression_opts.strategy = strategy;
+}
+
+// ---- File/WAL/IO behavior setters ----
+// NOTE(review): set_prefix_extractor transfers ownership of the
+// slice-transform to the Options object (shared_ptr::reset).
+void rocksdb_options_set_prefix_extractor(
+    rocksdb_options_t* opt, rocksdb_slicetransform_t* prefix_extractor) {
+  opt->rep.prefix_extractor.reset(prefix_extractor);
+}
+
+void rocksdb_options_set_whole_key_filtering(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.whole_key_filtering = v;
+}
+
+void rocksdb_options_set_disable_data_sync(
+    rocksdb_options_t* opt, int disable_data_sync) {
+  opt->rep.disableDataSync = disable_data_sync;
+}
+
+void rocksdb_options_set_use_fsync(
+    rocksdb_options_t* opt, int use_fsync) {
+  opt->rep.use_fsync = use_fsync;
+}
+
+void rocksdb_options_set_db_stats_log_interval(
+    rocksdb_options_t* opt, int db_stats_log_interval) {
+  opt->rep.db_stats_log_interval = db_stats_log_interval;
+}
+
+void rocksdb_options_set_db_log_dir(
+    rocksdb_options_t* opt, const char* db_log_dir) {
+  opt->rep.db_log_dir = db_log_dir;
+}
+
+void rocksdb_options_set_wal_dir(
+    rocksdb_options_t* opt, const char* v) {
+  opt->rep.wal_dir = v;
+}
+
+void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) {
+  opt->rep.WAL_ttl_seconds = ttl;
+}
+
+void rocksdb_options_set_WAL_size_limit_MB(
+    rocksdb_options_t* opt, uint64_t limit) {
+  opt->rep.WAL_size_limit_MB = limit;
+}
+
+void rocksdb_options_set_manifest_preallocation_size(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.manifest_preallocation_size = v;
+}
+
+void rocksdb_options_set_purge_redundant_kvs_while_flush(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.purge_redundant_kvs_while_flush = v;
+}
+
+void rocksdb_options_set_allow_os_buffer(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.allow_os_buffer = v;
+}
+
+void rocksdb_options_set_allow_mmap_reads(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.allow_mmap_reads = v;
+}
+
+void rocksdb_options_set_allow_mmap_writes(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.allow_mmap_writes = v;
+}
+
+void rocksdb_options_set_is_fd_close_on_exec(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.is_fd_close_on_exec = v;
+}
+
+void rocksdb_options_set_skip_log_error_on_recovery(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.skip_log_error_on_recovery = v;
+}
+
+void rocksdb_options_set_stats_dump_period_sec(
+    rocksdb_options_t* opt, unsigned int v) {
+  opt->rep.stats_dump_period_sec = v;
+}
+
+void rocksdb_options_set_block_size_deviation(
+    rocksdb_options_t* opt, int v) {
+  opt->rep.block_size_deviation = v;
+}
+
+void rocksdb_options_set_advise_random_on_open(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.advise_random_on_open = v;
+}
+
+// Maps the C int (0-3) onto the AccessHint enum; out-of-range values
+// are silently ignored, leaving the previous hint in place.
+void rocksdb_options_set_access_hint_on_compaction_start(
+    rocksdb_options_t* opt, int v) {
+  switch(v) {
+    case 0:
+      opt->rep.access_hint_on_compaction_start = rocksdb::Options::NONE;
+      break;
+    case 1:
+      opt->rep.access_hint_on_compaction_start = rocksdb::Options::NORMAL;
+      break;
+    case 2:
+      opt->rep.access_hint_on_compaction_start = rocksdb::Options::SEQUENTIAL;
+      break;
+    case 3:
+      opt->rep.access_hint_on_compaction_start = rocksdb::Options::WILLNEED;
+      break;
+  }
+}
+
+// ---- Tuning / limit setters ----
+void rocksdb_options_set_use_adaptive_mutex(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.use_adaptive_mutex = v;
+}
+
+void rocksdb_options_set_bytes_per_sync(
+    rocksdb_options_t* opt, uint64_t v) {
+  opt->rep.bytes_per_sync = v;
+}
+
+void rocksdb_options_set_verify_checksums_in_compaction(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.verify_checksums_in_compaction = v;
+}
+
+void rocksdb_options_set_filter_deletes(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.filter_deletes = v;
+}
+
+void rocksdb_options_set_max_sequential_skip_in_iterations(
+    rocksdb_options_t* opt, uint64_t v) {
+  opt->rep.max_sequential_skip_in_iterations = v;
+}
+
+void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, int n) {
+  opt->rep.max_write_buffer_number = n;
+}
+
+void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t* opt, int n) {
+  opt->rep.min_write_buffer_number_to_merge = n;
+}
+
+void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) {
+  opt->rep.max_background_compactions = n;
+}
+
+void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) {
+  opt->rep.max_background_flushes = n;
+}
+
+void rocksdb_options_set_max_log_file_size(rocksdb_options_t* opt, size_t v) {
+  opt->rep.max_log_file_size = v;
+}
+
+void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t* opt, size_t v) {
+  opt->rep.log_file_time_to_roll = v;
+}
+
+void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) {
+  opt->rep.keep_log_file_num = v;
+}
+
+void rocksdb_options_set_soft_rate_limit(rocksdb_options_t* opt, double v) {
+  opt->rep.soft_rate_limit = v;
+}
+
+void rocksdb_options_set_hard_rate_limit(rocksdb_options_t* opt, double v) {
+  opt->rep.hard_rate_limit = v;
+}
+
+void rocksdb_options_set_rate_limit_delay_max_milliseconds(
+    rocksdb_options_t* opt, unsigned int v) {
+  opt->rep.rate_limit_delay_max_milliseconds = v;
+}
+
+void rocksdb_options_set_max_manifest_file_size(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.max_manifest_file_size = v;
+}
+
+void rocksdb_options_set_no_block_cache(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.no_block_cache = v;
+}
+
+void rocksdb_options_set_table_cache_numshardbits(
+    rocksdb_options_t* opt, int v) {
+  opt->rep.table_cache_numshardbits = v;
+}
+
+void rocksdb_options_set_table_cache_remove_scan_count_limit(
+    rocksdb_options_t* opt, int v) {
+  opt->rep.table_cache_remove_scan_count_limit = v;
+}
+
+void rocksdb_options_set_arena_block_size(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.arena_block_size = v;
+}
+
+// ---- Compaction toggles ----
+void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int disable) {
+  opt->rep.disable_auto_compactions = disable;
+}
+
+void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t* opt, int disable) {
+  opt->rep.disable_seek_compaction = disable;
+}
+
+void rocksdb_options_set_delete_obsolete_files_period_micros(
+    rocksdb_options_t* opt, uint64_t v) {
+  opt->rep.delete_obsolete_files_period_micros = v;
+}
+
+// Sets Options::source_compaction_factor.
+// Bug fix: this setter previously assigned expanded_compaction_factor,
+// silently duplicating rocksdb_options_set_expanded_compaction_factor
+// and leaving source_compaction_factor untouched.
+void rocksdb_options_set_source_compaction_factor(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.source_compaction_factor = n;
+}
+
+// Delegates to Options::PrepareForBulkLoad (bulk-ingest preset).
+void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) {
+  opt->rep.PrepareForBulkLoad();
+}
+
+// Installs a vector-based memtable factory.
+// Bug fix: the previous code cached a function-local static raw pointer
+// and handed the SAME object to each options' shared_ptr via reset().
+// The first rocksdb_options_t destroyed would delete the factory, and
+// every later call installed a dangling pointer (use-after-free /
+// double-free).  Allocate a fresh factory per call instead.
+void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t *opt) {
+  opt->rep.memtable_factory.reset(new rocksdb::VectorRepFactory);
+}
+
+// ---- Memtable prefix-bloom setters ----
+void rocksdb_options_set_memtable_prefix_bloom_bits(
+    rocksdb_options_t* opt, uint32_t v) {
+  opt->rep.memtable_prefix_bloom_bits = v;
+}
+
+void rocksdb_options_set_memtable_prefix_bloom_probes(
+    rocksdb_options_t* opt, uint32_t v) {
+  opt->rep.memtable_prefix_bloom_probes = v;
+}
+
+// Installs a hash-skiplist memtable factory built from the arguments.
+// Bug fix: the previous function-local static cache (a) silently
+// ignored the arguments on every call after the first, and (b) let
+// multiple shared_ptrs own one object, causing use-after-free /
+// double-free once any owning options object was destroyed.
+void rocksdb_options_set_hash_skip_list_rep(
+    rocksdb_options_t *opt, size_t bucket_count,
+    int32_t skiplist_height, int32_t skiplist_branching_factor) {
+  rocksdb::MemTableRepFactory* factory = rocksdb::NewHashSkipListRepFactory(
+      bucket_count, skiplist_height, skiplist_branching_factor);
+  opt->rep.memtable_factory.reset(factory);
+}
+
+// Installs a hash-linklist memtable factory built from bucket_count.
+// Bug fix: same defect as the skiplist setter — the old static cache
+// ignored bucket_count after the first call and shared one factory
+// object across independently-owning shared_ptrs (double-free).
+void rocksdb_options_set_hash_link_list_rep(
+    rocksdb_options_t *opt, size_t bucket_count) {
+  opt->rep.memtable_factory.reset(
+      rocksdb::NewHashLinkListRepFactory(bucket_count));
+}
+
+// Installs a PlainTable table factory built from the arguments.
+// Bug fix: the old function-local static cache ignored all arguments
+// after the first call and handed the same raw pointer to multiple
+// owning shared_ptrs, leading to use-after-free / double-free.
+void rocksdb_options_set_plain_table_factory(
+    rocksdb_options_t *opt, uint32_t user_key_len, int bloom_bits_per_key,
+    double hash_table_ratio, size_t index_sparseness) {
+  rocksdb::TableFactory* factory = rocksdb::NewPlainTableFactory(
+      user_key_len, bloom_bits_per_key,
+      hash_table_ratio, index_sparseness);
+  opt->rep.table_factory.reset(factory);
+}
+
+void rocksdb_options_set_max_successive_merges(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.max_successive_merges = v;
+}
+
+void rocksdb_options_set_min_partial_merge_operands(
+    rocksdb_options_t* opt, uint32_t v) {
+  opt->rep.min_partial_merge_operands = v;
+}
+
+void rocksdb_options_set_bloom_locality(
+    rocksdb_options_t* opt, uint32_t v) {
+  opt->rep.bloom_locality = v;
+}
+
+// Sets Options::allow_thread_local (boolean passed as unsigned char).
+void rocksdb_options_set_allow_thread_local(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.allow_thread_local = v;
+}
+
+// Sets Options::inplace_update_support (boolean passed as unsigned char).
+void rocksdb_options_set_inplace_update_support(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.inplace_update_support = v;
+}
+
+// Sets Options::inplace_update_num_locks.
+void rocksdb_options_set_inplace_update_num_locks(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.inplace_update_num_locks = v;
+}
+
+// Sets the compaction style; `style` is cast to rocksdb::CompactionStyle, so
+// it must be one of the rocksdb_*_compaction enum values from the C header.
+void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) {
+  opt->rep.compaction_style = static_cast<rocksdb::CompactionStyle>(style);
+}
+
+// Copies the universal-compaction options into the Options struct; `uco`
+// remains owned by the caller.
+void rocksdb_options_set_universal_compaction_options(rocksdb_options_t *opt, rocksdb_universal_compaction_options_t *uco) {
+  opt->rep.compaction_options_universal = *(uco->rep);
+}
+
+/*
+TODO:
+DB::OpenForReadOnly
+DB::MultiGet
+DB::KeyMayExist
+DB::GetOptions
+DB::GetSortedWalFiles
+DB::GetLatestSequenceNumber
+DB::GetUpdatesSince
+DB::GetDbIdentity
+DB::RunManualCompaction
+custom cache
+compaction_filter
+table_properties_collectors
+*/
+
+// Wraps user-supplied C callbacks in a rocksdb_comparator_t. `state` is
+// passed back to every callback; `destructor` frees it. The returned object
+// is owned by the caller (release with rocksdb_comparator_destroy).
+rocksdb_comparator_t* rocksdb_comparator_create(
+    void* state,
+    void (*destructor)(void*),
+    int (*compare)(
+        void*,
+        const char* a, size_t alen,
+        const char* b, size_t blen),
+    const char* (*name)(void*)) {
+  rocksdb_comparator_t* result = new rocksdb_comparator_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->compare_ = compare;
+  result->name_ = name;
+  return result;
+}
+
+// Frees a comparator created with rocksdb_comparator_create.
+void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) {
+  delete cmp;
+}
+
+// Wraps user-supplied C callbacks in a rocksdb_filterpolicy_t.
+// `create_filter` returns a malloc'd filter blob; `delete_filter` (may be
+// NULL) is invoked to release it. Caller owns the returned object
+// (release with rocksdb_filterpolicy_destroy).
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*create_filter)(
+        void*,
+        const char* const* key_array, const size_t* key_length_array,
+        int num_keys,
+        size_t* filter_length),
+    unsigned char (*key_may_match)(
+        void*,
+        const char* key, size_t length,
+        const char* filter, size_t filter_length),
+    void (*delete_filter)(
+        void*,
+        const char* filter, size_t filter_length),
+    const char* (*name)(void*)) {
+  rocksdb_filterpolicy_t* result = new rocksdb_filterpolicy_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->create_ = create_filter;
+  result->key_match_ = key_may_match;
+  result->delete_filter_ = delete_filter;
+  result->name_ = name;
+  return result;
+}
+
+// Frees a filter policy created by rocksdb_filterpolicy_create*.
+void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) {
+  delete filter;
+}
+
+// Exposes the built-in bloom filter policy through the C API.
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) {
+  // Make a rocksdb_filterpolicy_t, but override all of its methods so
+  // they delegate to a NewBloomFilterPolicy() instead of user
+  // supplied C functions.
+  struct Wrapper : public rocksdb_filterpolicy_t {
+    const FilterPolicy* rep_;
+    ~Wrapper() { delete rep_; }
+    const char* Name() const { return rep_->Name(); }
+    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+      return rep_->CreateFilter(keys, n, dst);
+    }
+    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+      return rep_->KeyMayMatch(key, filter);
+    }
+    static void DoNothing(void*) { }
+  };
+  Wrapper* wrapper = new Wrapper;
+  wrapper->rep_ = NewBloomFilterPolicy(bits_per_key);
+  wrapper->state_ = nullptr;
+  wrapper->delete_filter_ = nullptr;
+  // The virtual overrides above replace the C callback path, so the
+  // remaining function-pointer slots (create_, key_match_, name_) are
+  // unused; destructor_ must still be callable, hence DoNothing.
+  wrapper->destructor_ = &Wrapper::DoNothing;
+  return wrapper;
+}
+
+// Wraps user-supplied merge callbacks in a rocksdb_mergeoperator_t.
+// `full_merge`/`partial_merge` return malloc'd values released via
+// `delete_value`. Caller owns the result (rocksdb_mergeoperator_destroy).
+rocksdb_mergeoperator_t* rocksdb_mergeoperator_create(
+    void* state, void (*destructor)(void*),
+    char* (*full_merge)(void*, const char* key, size_t key_length,
+                        const char* existing_value,
+                        size_t existing_value_length,
+                        const char* const* operands_list,
+                        const size_t* operands_list_length, int num_operands,
+                        unsigned char* success, size_t* new_value_length),
+    char* (*partial_merge)(void*, const char* key, size_t key_length,
+                           const char* const* operands_list,
+                           const size_t* operands_list_length, int num_operands,
+                           unsigned char* success, size_t* new_value_length),
+    void (*delete_value)(void*, const char* value, size_t value_length),
+    const char* (*name)(void*)) {
+  rocksdb_mergeoperator_t* result = new rocksdb_mergeoperator_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->full_merge_ = full_merge;
+  result->partial_merge_ = partial_merge;
+  result->delete_value_ = delete_value;
+  result->name_ = name;
+  return result;
+}
+
+// Frees a merge operator created with rocksdb_mergeoperator_create.
+void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t* merge_operator) {
+  delete merge_operator;
+}
+
+// Allocates default-initialized read options; caller must destroy.
+rocksdb_readoptions_t* rocksdb_readoptions_create() {
+  return new rocksdb_readoptions_t;
+}
+
+// Frees read options created with rocksdb_readoptions_create.
+void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) {
+  delete opt;
+}
+
+// Sets ReadOptions::verify_checksums.
+void rocksdb_readoptions_set_verify_checksums(
+    rocksdb_readoptions_t* opt,
+    unsigned char v) {
+  opt->rep.verify_checksums = v;
+}
+
+// Sets ReadOptions::fill_cache.
+void rocksdb_readoptions_set_fill_cache(
+    rocksdb_readoptions_t* opt, unsigned char v) {
+  opt->rep.fill_cache = v;
+}
+
+// Sets ReadOptions::snapshot; NULL clears any previously set snapshot.
+void rocksdb_readoptions_set_snapshot(
+    rocksdb_readoptions_t* opt,
+    const rocksdb_snapshot_t* snap) {
+  opt->rep.snapshot = (snap ? snap->rep : nullptr);
+}
+
+// Sets ReadOptions::read_tier; `v` is cast to rocksdb::ReadTier.
+void rocksdb_readoptions_set_read_tier(
+    rocksdb_readoptions_t* opt, int v) {
+  opt->rep.read_tier = static_cast<rocksdb::ReadTier>(v);
+}
+
+// Sets ReadOptions::tailing (tailing iterator mode).
+void rocksdb_readoptions_set_tailing(
+    rocksdb_readoptions_t* opt, unsigned char v) {
+  opt->rep.tailing = v;
+}
+
+// Allocates default-initialized write options; caller must destroy.
+rocksdb_writeoptions_t* rocksdb_writeoptions_create() {
+  return new rocksdb_writeoptions_t;
+}
+
+// Frees write options created with rocksdb_writeoptions_create.
+void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) {
+  delete opt;
+}
+
+// Sets WriteOptions::sync (fsync before acknowledging the write).
+void rocksdb_writeoptions_set_sync(
+    rocksdb_writeoptions_t* opt, unsigned char v) {
+  opt->rep.sync = v;
+}
+
+// Sets WriteOptions::disableWAL (non-zero skips the write-ahead log).
+void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable) {
+  opt->rep.disableWAL = disable;
+}
+
+
+// Allocates default-initialized flush options; caller must destroy.
+rocksdb_flushoptions_t* rocksdb_flushoptions_create() {
+  return new rocksdb_flushoptions_t;
+}
+
+// Frees flush options created with rocksdb_flushoptions_create.
+void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t* opt) {
+  delete opt;
+}
+
+// Sets FlushOptions::wait (block until the flush completes).
+void rocksdb_flushoptions_set_wait(
+    rocksdb_flushoptions_t* opt, unsigned char v) {
+  opt->rep.wait = v;
+}
+
+// Creates an LRU block cache of `capacity` bytes; caller must destroy the
+// returned handle with rocksdb_cache_destroy.
+rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) {
+  rocksdb_cache_t* c = new rocksdb_cache_t;
+  c->rep = NewLRUCache(capacity);
+  return c;
+}
+
+// Frees the cache handle; the underlying cache is reference counted via
+// `rep`, so it survives while options/DBs still hold it.
+void rocksdb_cache_destroy(rocksdb_cache_t* cache) {
+  delete cache;
+}
+
+// Wraps the process-wide Env::Default(). `is_default` is set so that
+// rocksdb_env_destroy knows not to delete the shared singleton.
+rocksdb_env_t* rocksdb_create_default_env() {
+  rocksdb_env_t* result = new rocksdb_env_t;
+  result->rep = Env::Default();
+  result->is_default = true;
+  return result;
+}
+
+// Sets the size of the low-priority background thread pool.
+void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) {
+  env->rep->SetBackgroundThreads(n);
+}
+
+// Sets the size of the high-priority background thread pool.
+void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) {
+  env->rep->SetBackgroundThreads(n, Env::HIGH);
+}
+
+// Frees the env wrapper; the wrapped Env is deleted only if it is not the
+// shared Env::Default() singleton.
+void rocksdb_env_destroy(rocksdb_env_t* env) {
+  if (!env->is_default) delete env->rep;
+  delete env;
+}
+
+// Wraps user-supplied slice-transform callbacks in a rocksdb_slicetransform_t.
+// Caller owns the result (release with rocksdb_slicetransform_destroy).
+rocksdb_slicetransform_t* rocksdb_slicetransform_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*transform)(
+        void*,
+        const char* key, size_t length,
+        size_t* dst_length),
+    unsigned char (*in_domain)(
+        void*,
+        const char* key, size_t length),
+    unsigned char (*in_range)(
+        void*,
+        const char* key, size_t length),
+    const char* (*name)(void*)) {
+  rocksdb_slicetransform_t* result = new rocksdb_slicetransform_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->transform_ = transform;
+  result->in_domain_ = in_domain;
+  result->in_range_ = in_range;
+  result->name_ = name;
+  return result;
+}
+
+// Frees a slice transform created by rocksdb_slicetransform_create*.
+void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t* st) {
+  delete st;
+}
+
+// Exposes the built-in fixed-prefix transform through the C API. Mirrors
+// rocksdb_filterpolicy_create_bloom: virtual overrides delegate to the C++
+// implementation, so the C callback slots are unused.
+rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t prefixLen) {
+  struct Wrapper : public rocksdb_slicetransform_t {
+    const SliceTransform* rep_;
+    ~Wrapper() { delete rep_; }
+    const char* Name() const { return rep_->Name(); }
+    Slice Transform(const Slice& src) const {
+      return rep_->Transform(src);
+    }
+    bool InDomain(const Slice& src) const {
+      return rep_->InDomain(src);
+    }
+    bool InRange(const Slice& src) const {
+      return rep_->InRange(src);
+    }
+    static void DoNothing(void*) { }
+  };
+  Wrapper* wrapper = new Wrapper;
+  wrapper->rep_ = rocksdb::NewFixedPrefixTransform(prefixLen);
+  wrapper->state_ = nullptr;
+  wrapper->destructor_ = &Wrapper::DoNothing;
+  return wrapper;
+}
+
+// Allocates default universal-compaction options; caller must destroy with
+// rocksdb_universal_compaction_options_destroy (which also frees `rep`).
+rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() {
+  rocksdb_universal_compaction_options_t* result = new rocksdb_universal_compaction_options_t;
+  result->rep = new rocksdb::CompactionOptionsUniversal;
+  return result;
+}
+
+// Sets CompactionOptionsUniversal::size_ratio.
+void rocksdb_universal_compaction_options_set_size_ratio(
+  rocksdb_universal_compaction_options_t* uco, int ratio) {
+  uco->rep->size_ratio = ratio;
+}
+
+// Sets CompactionOptionsUniversal::min_merge_width.
+void rocksdb_universal_compaction_options_set_min_merge_width(
+  rocksdb_universal_compaction_options_t* uco, int w) {
+  uco->rep->min_merge_width = w;
+}
+
+// Sets CompactionOptionsUniversal::max_merge_width.
+void rocksdb_universal_compaction_options_set_max_merge_width(
+  rocksdb_universal_compaction_options_t* uco, int w) {
+  uco->rep->max_merge_width = w;
+}
+
+// Sets CompactionOptionsUniversal::max_size_amplification_percent.
+void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+  rocksdb_universal_compaction_options_t* uco, int p) {
+  uco->rep->max_size_amplification_percent = p;
+}
+
+// Sets CompactionOptionsUniversal::compression_size_percent.
+void rocksdb_universal_compaction_options_set_compression_size_percent(
+  rocksdb_universal_compaction_options_t* uco, int p) {
+  uco->rep->compression_size_percent = p;
+}
+
+// Sets the stop style; `style` is cast to rocksdb::CompactionStopStyle.
+void rocksdb_universal_compaction_options_set_stop_style(
+  rocksdb_universal_compaction_options_t* uco, int style) {
+  uco->rep->stop_style = static_cast<rocksdb::CompactionStopStyle>(style);
+}
+
+// Frees universal-compaction options and the wrapped C++ struct.
+void rocksdb_universal_compaction_options_destroy(
+  rocksdb_universal_compaction_options_t* uco) {
+  delete uco->rep;
+  delete uco;
+}
+
+// Leaves levels [0, level) uncompressed and applies the configured
+// compression type to levels [level, num_levels). A negative `level`
+// is a no-op. Call only after num_levels and compression are set.
+void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level) {
+  if (level >= 0) {
+    assert(level <= opt->rep.num_levels);
+    opt->rep.compression_per_level.resize(opt->rep.num_levels);
+    for (int i = 0; i < level; i++) {
+      opt->rep.compression_per_level[i] = rocksdb::kNoCompression;
+    }
+    for (int i = level; i < opt->rep.num_levels; i++) {
+      opt->rep.compression_per_level[i] = opt->rep.compression;
+    }
+  }
+}
+
+// Returns the number of live files described by `lf`.
+int rocksdb_livefiles_count(
+  const rocksdb_livefiles_t* lf) {
+  return lf->rep.size();
+}
+
+// Returns the file name at `index`; pointer is valid while `lf` lives.
+const char* rocksdb_livefiles_name(
+  const rocksdb_livefiles_t* lf,
+  int index) {
+  return lf->rep[index].name.c_str();
+}
+
+// Returns the LSM level of the file at `index`.
+int rocksdb_livefiles_level(
+  const rocksdb_livefiles_t* lf,
+  int index) {
+  return lf->rep[index].level;
+}
+
+// Returns the size in bytes of the file at `index`.
+size_t rocksdb_livefiles_size(
+  const rocksdb_livefiles_t* lf,
+  int index) {
+  return lf->rep[index].size;
+}
+
+// Returns the smallest key of the file at `index` (not NUL-terminated);
+// its length is stored in *size. Pointer is valid while `lf` lives.
+const char* rocksdb_livefiles_smallestkey(
+  const rocksdb_livefiles_t* lf,
+  int index,
+  size_t* size) {
+  *size = lf->rep[index].smallestkey.size();
+  return lf->rep[index].smallestkey.data();
+}
+
+// Returns the largest key of the file at `index` (not NUL-terminated);
+// its length is stored in *size. Pointer is valid while `lf` lives.
+const char* rocksdb_livefiles_largestkey(
+  const rocksdb_livefiles_t* lf,
+  int index,
+  size_t* size) {
+  *size = lf->rep[index].largestkey.size();
+  return lf->rep[index].largestkey.data();
+}
+
+// Frees a live-files list returned by rocksdb_livefiles().
+extern void rocksdb_livefiles_destroy(
+  const rocksdb_livefiles_t* lf) {
+  delete lf;
+}
+
+}  // end extern "C"
+
+#endif  // ROCKSDB_LITE
diff --git a/db/c_test.c b/db/c_test.c
new file mode 100644 (file)
index 0000000..8ebce90
--- /dev/null
@@ -0,0 +1,494 @@
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+   Use of this source code is governed by a BSD-style license that can be
+   found in the LICENSE file. See the AUTHORS file for names of contributors. */
+
+#include "rocksdb/c.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+const char* phase = "";
+static char dbname[200];
+
+/* Records the current test phase (used in failure messages) and logs it. */
+static void StartPhase(const char* name) {
+  fprintf(stderr, "=== Test %s\n", name);
+  phase = name;
+}
+
+/* Returns TEST_TMPDIR if set and non-empty, otherwise "/tmp". */
+static const char* GetTempDir(void) {
+    const char* ret = getenv("TEST_TMPDIR");
+    if (ret == NULL || ret[0] == '\0')
+        ret = "/tmp";
+    return ret;
+}
+
+/* Aborts (with file/line/phase context) if `err` is a non-NULL error string. */
+#define CheckNoError(err)                                               \
+  if ((err) != NULL) {                                                  \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
+    abort();                                                            \
+  }
+
+/* Aborts (with file/line/phase context) if `cond` is false. */
+#define CheckCondition(cond)                                            \
+  if (!(cond)) {                                                        \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
+    abort();                                                            \
+  }
+
+/* Aborts unless v[0..n) equals the NUL-terminated `expected` string; both
+   being NULL also counts as equal. On mismatch, prints both values (tagged
+   with the current phase) and aborts. */
+static void CheckEqual(const char* expected, const char* v, size_t n) {
+  if (expected == NULL && v == NULL) {
+    // ok
+  } else if (expected != NULL && v != NULL && n == strlen(expected) &&
+             memcmp(expected, v, n) == 0) {
+    // ok
+    return;
+  } else {
+    /* Fix: the NULL fallback for `v` printed "(null" (missing paren). */
+    fprintf(stderr, "%s: expected '%s', got '%s'\n",
+            phase,
+            (expected ? expected : "(null)"),
+            (v ? v : "(null)"));
+    abort();
+  }
+}
+
+/* free()s *ptr (if non-NULL) and NULLs it so it is safe to reuse. */
+static void Free(char** ptr) {
+  if (*ptr) {
+    free(*ptr);
+    *ptr = NULL;
+  }
+}
+
+/* Reads `key` from `db` and aborts unless the value equals `expected`
+   (NULL expected means the key must be absent). Frees the fetched value. */
+static void CheckGet(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    const char* key,
+    const char* expected) {
+  char* err = NULL;
+  size_t val_len;
+  char* val;
+  val = rocksdb_get(db, options, key, strlen(key), &val_len, &err);
+  CheckNoError(err);
+  CheckEqual(expected, val, val_len);
+  Free(&val);
+}
+
+/* Aborts unless the iterator is positioned at exactly (key, val). */
+static void CheckIter(rocksdb_iterator_t* iter,
+                      const char* key, const char* val) {
+  size_t len;
+  const char* str;
+  str = rocksdb_iter_key(iter, &len);
+  CheckEqual(key, str, len);
+  str = rocksdb_iter_value(iter, &len);
+  CheckEqual(val, str, len);
+}
+
+// Callback from rocksdb_writebatch_iterate()
+/* `ptr` is an int* step counter; verifies the two puts are replayed in
+   insertion order ("bar"/"b" then "box"/"c") and advances the counter. */
+static void CheckPut(void* ptr,
+                     const char* k, size_t klen,
+                     const char* v, size_t vlen) {
+  int* state = (int*) ptr;
+  CheckCondition(*state < 2);
+  switch (*state) {
+    case 0:
+      CheckEqual("bar", k, klen);
+      CheckEqual("b", v, vlen);
+      break;
+    case 1:
+      CheckEqual("box", k, klen);
+      CheckEqual("c", v, vlen);
+      break;
+  }
+  (*state)++;
+}
+
+// Callback from rocksdb_writebatch_iterate()
+/* Verifies the delete of "bar" is replayed third (after the two puts). */
+static void CheckDel(void* ptr, const char* k, size_t klen) {
+  int* state = (int*) ptr;
+  CheckCondition(*state == 2);
+  CheckEqual("bar", k, klen);
+  (*state)++;
+}
+
+static void CmpDestroy(void* arg) { }
+
+/* Comparator callback: plain byte-wise comparison (memcmp order), shorter
+   string sorts first on a common-prefix tie. */
+static int CmpCompare(void* arg, const char* a, size_t alen,
+                      const char* b, size_t blen) {
+  int n = (alen < blen) ? alen : blen;
+  int r = memcmp(a, b, n);
+  if (r == 0) {
+    if (alen < blen) r = -1;
+    else if (alen > blen) r = +1;
+  }
+  return r;
+}
+
+/* Comparator name callback. */
+static const char* CmpName(void* arg) {
+  return "foo";
+}
+
+// Custom filter policy
+/* When 0, FilterKeyMatch reports "definitely absent" for every key, letting
+   the test verify the filter is actually consulted on reads. */
+static unsigned char fake_filter_result = 1;
+static void FilterDestroy(void* arg) { }
+static const char* FilterName(void* arg) {
+  return "TestFilter";
+}
+/* Filter-create callback: ignores the keys and emits the fixed 4-byte
+   blob "fake" (FilterKeyMatch verifies it round-trips). malloc'd; freed
+   by the library via the delete callback. */
+static char* FilterCreate(
+    void* arg,
+    const char* const* key_array, const size_t* key_length_array,
+    int num_keys,
+    size_t* filter_length) {
+  *filter_length = 4;
+  char* result = malloc(4);
+  memcpy(result, "fake", 4);
+  return result;
+}
+/* Filter-match callback: checks the stored filter is the expected "fake"
+   blob, then returns the globally controlled fake_filter_result. */
+static unsigned char FilterKeyMatch(
+    void* arg,
+    const char* key, size_t length,
+    const char* filter, size_t filter_length) {
+  CheckCondition(filter_length == 4);
+  CheckCondition(memcmp(filter, "fake", 4) == 0);
+  return fake_filter_result;
+}
+
+// Custom merge operator
+/* Destructor and name callbacks for the test merge operator. */
+static void MergeOperatorDestroy(void* arg) { }
+static const char* MergeOperatorName(void* arg) {
+  return "TestMergeOperator";
+}
+/* Full-merge callback: ignores the operands and always succeeds with the
+   4-byte value "fake" (malloc'd; freed by the library). */
+static char* MergeOperatorFullMerge(
+    void* arg,
+    const char* key, size_t key_length,
+    const char* existing_value, size_t existing_value_length,
+    const char* const* operands_list, const size_t* operands_list_length,
+    int num_operands,
+    unsigned char* success, size_t* new_value_length) {
+  *new_value_length = 4;
+  *success = 1;
+  char* result = malloc(4);
+  memcpy(result, "fake", 4);
+  return result;
+}
+/* Partial-merge callback: same fixed "fake" result as the full merge. */
+static char* MergeOperatorPartialMerge(
+    void* arg,
+    const char* key, size_t key_length,
+    const char* const* operands_list, const size_t* operands_list_length,
+    int num_operands,
+    unsigned char* success, size_t* new_value_length) {
+  *new_value_length = 4;
+  *success = 1;
+  char* result = malloc(4);
+  memcpy(result, "fake", 4);
+  return result;
+}
+
+/* Exercises the RocksDB C API end to end: open/put/get, write batches,
+   iterators, approximate sizes, properties, snapshots, repair, filter
+   policies (custom and bloom), merge operators and prefix seek. */
+int main(int argc, char** argv) {
+  rocksdb_t* db;
+  rocksdb_comparator_t* cmp;
+  rocksdb_cache_t* cache;
+  rocksdb_env_t* env;
+  rocksdb_options_t* options;
+  rocksdb_readoptions_t* roptions;
+  rocksdb_writeoptions_t* woptions;
+  char* err = NULL;
+  int run = -1;
+
+  snprintf(dbname, sizeof(dbname),
+           "%s/rocksdb_c_test-%d",
+           GetTempDir(),
+           ((int) geteuid()));
+
+  StartPhase("create_objects");
+  cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
+  env = rocksdb_create_default_env();
+  cache = rocksdb_cache_create_lru(100000);
+
+  options = rocksdb_options_create();
+  rocksdb_options_set_comparator(options, cmp);
+  rocksdb_options_set_error_if_exists(options, 1);
+  rocksdb_options_set_cache(options, cache);
+  rocksdb_options_set_env(options, env);
+  rocksdb_options_set_info_log(options, NULL);
+  rocksdb_options_set_write_buffer_size(options, 100000);
+  rocksdb_options_set_paranoid_checks(options, 1);
+  rocksdb_options_set_max_open_files(options, 10);
+  rocksdb_options_set_block_size(options, 1024);
+  rocksdb_options_set_block_restart_interval(options, 8);
+  rocksdb_options_set_compression(options, rocksdb_no_compression);
+  rocksdb_options_set_compression_options(options, -14, -1, 0);
+  int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression,
+                              rocksdb_no_compression, rocksdb_no_compression};
+  rocksdb_options_set_compression_per_level(options, compression_levels, 4);
+
+  roptions = rocksdb_readoptions_create();
+  rocksdb_readoptions_set_verify_checksums(roptions, 1);
+  rocksdb_readoptions_set_fill_cache(roptions, 0);
+
+  woptions = rocksdb_writeoptions_create();
+  rocksdb_writeoptions_set_sync(woptions, 1);
+
+  StartPhase("destroy");
+  rocksdb_destroy_db(options, dbname, &err);
+  Free(&err);
+
+  StartPhase("open_error");
+  db = rocksdb_open(options, dbname, &err);
+  CheckCondition(err != NULL);
+  Free(&err);
+
+  StartPhase("open");
+  rocksdb_options_set_create_if_missing(options, 1);
+  db = rocksdb_open(options, dbname, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "foo", NULL);
+
+  StartPhase("put");
+  rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("compactall");
+  rocksdb_compact_range(db, NULL, 0, NULL, 0);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("compactrange");
+  rocksdb_compact_range(db, "a", 1, "z", 1);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("writebatch");
+  {
+    rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+    rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
+    rocksdb_writebatch_clear(wb);
+    rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+    rocksdb_writebatch_put(wb, "box", 3, "c", 1);
+    rocksdb_writebatch_delete(wb, "bar", 3);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", "hello");
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "box", "c");
+    int pos = 0;
+    rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
+    CheckCondition(pos == 3);
+    rocksdb_writebatch_destroy(wb);
+  }
+
+  StartPhase("iter");
+  {
+    rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_next(iter);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_prev(iter);
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_prev(iter);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_last(iter);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_seek(iter, "b", 1);
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+  }
+
+  StartPhase("approximate_sizes");
+  {
+    int i;
+    int n = 20000;
+    char keybuf[100];
+    char valbuf[100];
+    uint64_t sizes[2];
+    const char* start[2] = { "a", "k00000000000000010000" };
+    size_t start_len[2] = { 1, 21 };
+    const char* limit[2] = { "k00000000000000010000", "z" };
+    size_t limit_len[2] = { 21, 1 };
+    rocksdb_writeoptions_set_sync(woptions, 0);
+    for (i = 0; i < n; i++) {
+      snprintf(keybuf, sizeof(keybuf), "k%020d", i);
+      snprintf(valbuf, sizeof(valbuf), "v%020d", i);
+      rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
+                  &err);
+      CheckNoError(err);
+    }
+    rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
+    CheckCondition(sizes[0] > 0);
+    CheckCondition(sizes[1] > 0);
+  }
+
+  StartPhase("property");
+  {
+    char* prop = rocksdb_property_value(db, "nosuchprop");
+    CheckCondition(prop == NULL);
+    prop = rocksdb_property_value(db, "rocksdb.stats");
+    CheckCondition(prop != NULL);
+    Free(&prop);
+  }
+
+  StartPhase("snapshot");
+  {
+    const rocksdb_snapshot_t* snap;
+    snap = rocksdb_create_snapshot(db);
+    rocksdb_delete(db, woptions, "foo", 3, &err);
+    CheckNoError(err);
+    rocksdb_readoptions_set_snapshot(roptions, snap);
+    CheckGet(db, roptions, "foo", "hello");
+    rocksdb_readoptions_set_snapshot(roptions, NULL);
+    CheckGet(db, roptions, "foo", NULL);
+    rocksdb_release_snapshot(db, snap);
+  }
+
+  StartPhase("repair");
+  {
+    // If we do not compact here, then the lazy deletion of
+    // files (https://reviews.facebook.net/D6123) would leave
+    // around deleted files and the repair process will find
+    // those files and put them back into the database.
+    rocksdb_compact_range(db, NULL, 0, NULL, 0);
+    rocksdb_close(db);
+    rocksdb_options_set_create_if_missing(options, 0);
+    rocksdb_options_set_error_if_exists(options, 0);
+    rocksdb_repair_db(options, dbname, &err);
+    CheckNoError(err);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", NULL);
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "box", "c");
+    rocksdb_options_set_create_if_missing(options, 1);
+    rocksdb_options_set_error_if_exists(options, 1);
+  }
+
+  StartPhase("filter");
+  for (run = 0; run < 2; run++) {
+    // First run uses custom filter, second run uses bloom filter
+    CheckNoError(err);
+    rocksdb_filterpolicy_t* policy;
+    if (run == 0) {
+      policy = rocksdb_filterpolicy_create(
+          NULL, FilterDestroy, FilterCreate, FilterKeyMatch, NULL, FilterName);
+    } else {
+      policy = rocksdb_filterpolicy_create_bloom(10);
+    }
+
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    rocksdb_options_set_filter_policy(options, policy);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+    rocksdb_compact_range(db, NULL, 0, NULL, 0);
+
+    fake_filter_result = 1;
+    CheckGet(db, roptions, "foo", "foovalue");
+    CheckGet(db, roptions, "bar", "barvalue");
+    /* Fix: this condition was `phase == 0`, comparing a const char* (always
+       non-NULL after StartPhase) against 0 -- never true, so the negative
+       custom-filter checks below never executed. The custom filter is run 0. */
+    if (run == 0) {
+      // Must not find value when custom filter returns false
+      fake_filter_result = 0;
+      CheckGet(db, roptions, "foo", NULL);
+      CheckGet(db, roptions, "bar", NULL);
+      fake_filter_result = 1;
+
+      CheckGet(db, roptions, "foo", "foovalue");
+      CheckGet(db, roptions, "bar", "barvalue");
+    }
+    rocksdb_options_set_filter_policy(options, NULL);
+    rocksdb_filterpolicy_destroy(policy);
+  }
+
+  StartPhase("merge_operator");
+  {
+    rocksdb_mergeoperator_t* merge_operator;
+    merge_operator = rocksdb_mergeoperator_create(
+        NULL, MergeOperatorDestroy, MergeOperatorFullMerge,
+        MergeOperatorPartialMerge, NULL, MergeOperatorName);
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    rocksdb_options_set_merge_operator(options, merge_operator);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", "foovalue");
+    rocksdb_merge(db, woptions, "foo", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", "fake");
+
+    // Merge of a non-existing value
+    rocksdb_merge(db, woptions, "bar", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "bar", "fake");
+
+  }
+
+  StartPhase("prefix");
+  {
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+
+    rocksdb_filterpolicy_t* policy = rocksdb_filterpolicy_create_bloom(10);
+    rocksdb_options_set_filter_policy(options, policy);
+    rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3));
+    rocksdb_options_set_hash_skip_list_rep(options, 50000, 4, 4);
+    rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
+
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_put(db, woptions, "foo1", 4, "foo", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo2", 4, "foo", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo3", 4, "foo", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar1", 4, "bar", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar2", 4, "bar", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar3", 4, "bar", 3, &err);
+    CheckNoError(err);
+
+    rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+    CheckCondition(!rocksdb_iter_valid(iter));
+
+    rocksdb_iter_seek(iter, "bar", 3);
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    CheckCondition(rocksdb_iter_valid(iter));
+
+    CheckIter(iter, "bar1", "bar");
+    rocksdb_iter_next(iter);
+    CheckIter(iter, "bar2", "bar");
+    rocksdb_iter_next(iter);
+    CheckIter(iter, "bar3", "bar");
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+    rocksdb_filterpolicy_destroy(policy);
+  }
+
+  StartPhase("cleanup");
+  rocksdb_close(db);
+  rocksdb_options_destroy(options);
+  rocksdb_readoptions_destroy(roptions);
+  rocksdb_writeoptions_destroy(woptions);
+  rocksdb_cache_destroy(cache);
+  rocksdb_comparator_destroy(cmp);
+  rocksdb_env_destroy(env);
+
+  fprintf(stderr, "PASS\n");
+  return 0;
+}
diff --git a/db/column_family.cc b/db/column_family.cc
new file mode 100644 (file)
index 0000000..2fd68e3
--- /dev/null
@@ -0,0 +1,583 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/column_family.h"
+
+#include <vector>
+#include <string>
+#include <algorithm>
+
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "db/internal_stats.h"
+#include "db/compaction_picker.h"
+#include "db/table_properties_collector.h"
+#include "util/autovector.h"
+#include "util/hash_skiplist_rep.h"
+
+namespace rocksdb {
+
+// Takes a reference on the ColumnFamilyData; per the header, the handle is
+// created while holding the DB mutex.
+ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
+                                               DBImpl* db, port::Mutex* mutex)
+    : cfd_(cfd), db_(db), mutex_(mutex) {
+  if (cfd_ != nullptr) {
+    cfd_->Ref();
+  }
+}
+
+// Drops the reference taken in the constructor. If this was the last
+// reference, the ColumnFamilyData is deleted under the mutex; any files that
+// became obsolete are collected under the mutex and purged outside of it.
+ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
+  if (cfd_ != nullptr) {
+    DBImpl::DeletionState deletion_state;
+    mutex_->Lock();
+    if (cfd_->Unref()) {
+      delete cfd_;
+    }
+    db_->FindObsoleteFiles(deletion_state, false, true);
+    mutex_->Unlock();
+    if (deletion_state.HaveSomethingToDelete()) {
+      db_->PurgeObsoleteFiles(deletion_state);
+    }
+  }
+}
+
+uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
+
+namespace {
+// Clamp *ptr into [minvalue, maxvalue]; used below to fix user-supplied
+// options to be reasonable.
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
+}  // anonymous namespace
+
+// Returns a sanitized copy of the user-supplied column family options:
+// substitutes the internal-key comparator/filter policy, clamps or fixes up
+// out-of-range values, and wraps user table-properties collectors.
+ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
+                                    const InternalFilterPolicy* ipolicy,
+                                    const ColumnFamilyOptions& src) {
+  ColumnFamilyOptions result = src;
+  result.comparator = icmp;
+  result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr;
+#ifdef OS_MACOSX
+  // TODO(icanadi) make write_buffer_size uint64_t instead of size_t
+  ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, ((size_t)1) << 30);
+#else
+  ClipToRange(&result.write_buffer_size,
+              ((size_t)64) << 10, ((size_t)64) << 30);
+#endif
+  // if user sets arena_block_size, we trust user to use this value. Otherwise,
+  // calculate a proper value from write_buffer_size;
+  if (result.arena_block_size <= 0) {
+    result.arena_block_size = result.write_buffer_size / 10;
+  }
+  result.min_write_buffer_number_to_merge =
+      std::min(result.min_write_buffer_number_to_merge,
+               result.max_write_buffer_number - 1);
+  if (result.block_cache == nullptr && !result.no_block_cache) {
+    result.block_cache = NewLRUCache(8 << 20);
+  }
+  result.compression_per_level = src.compression_per_level;
+  if (result.block_size_deviation < 0 || result.block_size_deviation > 100) {
+    result.block_size_deviation = 0;
+  }
+  if (result.max_mem_compaction_level >= result.num_levels) {
+    result.max_mem_compaction_level = result.num_levels - 1;
+  }
+  if (result.soft_rate_limit > result.hard_rate_limit) {
+    result.soft_rate_limit = result.hard_rate_limit;
+  }
+  // Hash-based memtable factories require a prefix extractor; without one,
+  // fall back to the plain skip-list memtable.
+  if (!result.prefix_extractor) {
+    assert(result.memtable_factory);
+    Slice name = result.memtable_factory->Name();
+    if (name.compare("HashSkipListRepFactory") == 0 ||
+        name.compare("HashLinkListRepFactory") == 0) {
+      result.memtable_factory = std::make_shared<SkipListFactory>();
+    }
+  }
+
+  // -- Sanitize the table properties collector
+  // All user defined properties collectors will be wrapped by
+  // UserKeyTablePropertiesCollector since for them they only have the
+  // knowledge of the user keys; internal keys are invisible to them.
+  auto& collectors = result.table_properties_collectors;
+  for (size_t i = 0; i < result.table_properties_collectors.size(); ++i) {
+    assert(collectors[i]);
+    collectors[i] =
+        std::make_shared<UserKeyTablePropertiesCollector>(collectors[i]);
+  }
+  // Add collector to collect internal key statistics
+  collectors.push_back(std::make_shared<InternalKeyPropertiesCollector>());
+
+  return result;
+}
+
+// kSVInUse is the address of `dummy`, guaranteed distinct from any real
+// SuperVersion object's address (see comment in column_family.h).
+int SuperVersion::dummy = 0;
+void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
+void* const SuperVersion::kSVObsolete = nullptr;
+
+// Deletes the memtables collected by Cleanup(); runs outside the DB mutex.
+SuperVersion::~SuperVersion() {
+  for (auto td : to_delete) {
+    delete td;
+  }
+}
+
+SuperVersion* SuperVersion::Ref() {
+  refs.fetch_add(1, std::memory_order_relaxed);
+  return this;
+}
+
+// Returns true when the caller dropped the last reference; the caller is then
+// responsible for calling Cleanup() (with the DB mutex held) and deleting.
+bool SuperVersion::Unref() {
+  // fetch_sub returns the previous value of ref
+  uint32_t previous_refs = refs.fetch_sub(1, std::memory_order_relaxed);
+  assert(previous_refs > 0);
+  return previous_refs == 1;
+}
+
+// REQUIRES: DB mutex held and refs == 0. Unrefs mem/imm/current; memtables
+// that must be freed are stashed in to_delete and deleted by the destructor,
+// outside the mutex.
+void SuperVersion::Cleanup() {
+  assert(refs.load(std::memory_order_relaxed) == 0);
+  imm->Unref(&to_delete);
+  MemTable* m = mem->Unref();
+  if (m != nullptr) {
+    to_delete.push_back(m);
+  }
+  current->Unref();
+}
+
+// REQUIRES: DB mutex held. References each component and initializes this
+// SuperVersion's own refcount to 1.
+void SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm,
+                        Version* new_current) {
+  mem = new_mem;
+  imm = new_imm;
+  current = new_current;
+  mem->Ref();
+  imm->Ref();
+  current->Ref();
+  refs.store(1, std::memory_order_relaxed);
+}
+
+namespace {
+void SuperVersionUnrefHandle(void* ptr) {
+  // UnrefHandle is called when a thread exits or a ThreadLocalPtr gets
+  // destroyed. When the former happens, the thread shouldn't see kSVInUse.
+  // When the latter happens, we are in ~ColumnFamilyData(), no get should
+  // happen as well.
+  SuperVersion* sv = static_cast<SuperVersion*>(ptr);
+  if (sv->Unref()) {
+    // Last reference: cleanup must run under the DB mutex.
+    sv->db_mutex->Lock();
+    sv->Cleanup();
+    sv->db_mutex->Unlock();
+    delete sv;
+  }
+}
+}  // anonymous namespace
+
+ColumnFamilyData::ColumnFamilyData(const std::string& dbname, uint32_t id,
+                                   const std::string& name,
+                                   Version* dummy_versions, Cache* table_cache,
+                                   const ColumnFamilyOptions& options,
+                                   const DBOptions* db_options,
+                                   const EnvOptions& storage_options,
+                                   ColumnFamilySet* column_family_set)
+    : id_(id),
+      name_(name),
+      dummy_versions_(dummy_versions),
+      current_(nullptr),
+      refs_(0),
+      dropped_(false),
+      internal_comparator_(options.comparator),
+      internal_filter_policy_(options.filter_policy),
+      options_(*db_options, SanitizeOptions(&internal_comparator_,
+                                            &internal_filter_policy_, options)),
+      mem_(nullptr),
+      imm_(options.min_write_buffer_number_to_merge),
+      super_version_(nullptr),
+      super_version_number_(0),
+      local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
+      next_(nullptr),
+      prev_(nullptr),
+      log_number_(0),
+      need_slowdown_for_num_level0_files_(false),
+      column_family_set_(column_family_set) {
+  // The creator holds the initial reference.
+  Ref();
+
+  // if dummy_versions is nullptr, then this is a dummy column family.
+  if (dummy_versions != nullptr) {
+    internal_stats_.reset(new InternalStats(options.num_levels, db_options->env,
+                                            db_options->statistics.get()));
+    table_cache_.reset(
+        new TableCache(dbname, &options_, storage_options, table_cache));
+    if (options_.compaction_style == kCompactionStyleUniversal) {
+      compaction_picker_.reset(
+          new UniversalCompactionPicker(&options_, &internal_comparator_));
+    } else {
+      compaction_picker_.reset(
+          new LevelCompactionPicker(&options_, &internal_comparator_));
+    }
+
+    Log(options_.info_log, "Options for column family \"%s\":\n",
+        name.c_str());
+    // Dump via a ColumnFamilyOptions* so that only the column-family portion
+    // of the combined Options object is logged.
+    const ColumnFamilyOptions* cf_options = &options_;
+    cf_options->Dump(options_.info_log.get());
+  }
+}
+
+// DB mutex held
+ColumnFamilyData::~ColumnFamilyData() {
+  assert(refs_ == 0);
+  // remove from the circular linked list of column families
+  auto prev = prev_;
+  auto next = next_;
+  prev->next_ = next;
+  next->prev_ = prev;
+
+  // it's nullptr for dummy CFD
+  if (column_family_set_ != nullptr) {
+    // remove from column_family_set
+    column_family_set_->RemoveColumnFamily(this);
+  }
+
+  if (current_ != nullptr) {
+    current_->Unref();
+  }
+
+  if (super_version_ != nullptr) {
+    // Release SuperVersion references kept in ThreadLocalPtr.
+    // This must be done outside of mutex_ since the unref handler
+    // (SuperVersionUnrefHandle) can lock the mutex itself.
+    super_version_->db_mutex->Unlock();
+    local_sv_.reset();
+    super_version_->db_mutex->Lock();
+
+    bool is_last_reference __attribute__((unused));
+    is_last_reference = super_version_->Unref();
+    assert(is_last_reference);
+    super_version_->Cleanup();
+    delete super_version_;
+    super_version_ = nullptr;
+  }
+
+  if (dummy_versions_ != nullptr) {
+    // Version list must be empty by now
+    assert(dummy_versions_->next_ == dummy_versions_);
+    delete dummy_versions_;
+  }
+
+  if (mem_ != nullptr) {
+    // Unref() returns the memtable pointer when the last reference is dropped
+    delete mem_->Unref();
+  }
+  autovector<MemTable*> to_delete;
+  imm_.current()->Unref(&to_delete);
+  for (MemTable* m : to_delete) {
+    delete m;
+  }
+}
+
+const EnvOptions* ColumnFamilyData::soptions() const {
+  return &(column_family_set_->storage_options_);
+}
+
+// Sets the new current version and recomputes whether writes should slow
+// down based on the number of level-0 files.
+void ColumnFamilyData::SetCurrent(Version* current) {
+  current_ = current;
+  need_slowdown_for_num_level0_files_ =
+      (options_.level0_slowdown_writes_trigger >= 0 &&
+       current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger);
+}
+
+// Replaces mem_ with a freshly-allocated memtable; the previous one, if any,
+// is unreffed (and deleted if that was the last reference).
+void ColumnFamilyData::CreateNewMemtable() {
+  assert(current_ != nullptr);
+  if (mem_ != nullptr) {
+    delete mem_->Unref();
+  }
+  mem_ = new MemTable(internal_comparator_, options_);
+  mem_->Ref();
+}
+
+// See documentation in compaction_picker.h
+Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) {
+  return compaction_picker_->PickCompaction(current_, log_buffer);
+}
+
+Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level,
+                                           const InternalKey* begin,
+                                           const InternalKey* end,
+                                           InternalKey** compaction_end) {
+  return compaction_picker_->CompactRange(current_, input_level, output_level,
+                                          begin, end, compaction_end);
+}
+
+// Returns an already-referenced SuperVersion. Uses the thread-local cache
+// when allowed by options; otherwise takes the DB mutex and refs the current
+// SuperVersion.
+SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(
+    port::Mutex* db_mutex) {
+  SuperVersion* sv = nullptr;
+  if (LIKELY(column_family_set_->db_options_->allow_thread_local)) {
+    sv = GetThreadLocalSuperVersion(db_mutex);
+    // Take the caller's reference first, then try to hand the thread-local
+    // reference back; if the slot was scraped meanwhile, drop the extra ref.
+    sv->Ref();
+    if (!ReturnThreadLocalSuperVersion(sv)) {
+      sv->Unref();
+    }
+  } else {
+    db_mutex->Lock();
+    sv = super_version_->Ref();
+    db_mutex->Unlock();
+  }
+  return sv;
+}
+
+SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(
+    port::Mutex* db_mutex) {
+  SuperVersion* sv = nullptr;
+  // The SuperVersion is cached in thread local storage to avoid acquiring
+  // mutex when SuperVersion does not change since the last use. When a new
+  // SuperVersion is installed, the compaction or flush thread cleans up
+  // cached SuperVersion in all existing thread local storage. To avoid
+  // acquiring mutex for this operation, we use atomic Swap() on the thread
+  // local pointer to guarantee exclusive access. If the thread local pointer
+  // is being used while a new SuperVersion is installed, the cached
+  // SuperVersion can become stale. In that case, the background thread would
+  // have swapped in kSVObsolete. We re-check the value when returning
+  // SuperVersion back to thread local, with an atomic compare and swap.
+  // The superversion will need to be released if detected to be stale.
+  void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
+  // Invariant:
+  // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
+  // (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
+  // should only keep kSVInUse before ReturnThreadLocalSuperVersion call
+  // (if no Scrape happens).
+  assert(ptr != SuperVersion::kSVInUse);
+  sv = static_cast<SuperVersion*>(ptr);
+  if (sv == SuperVersion::kSVObsolete ||
+      sv->version_number != super_version_number_.load()) {
+    RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_ACQUIRES);
+    SuperVersion* sv_to_delete = nullptr;
+
+    if (sv && sv->Unref()) {
+      RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS);
+      db_mutex->Lock();
+      // NOTE: underlying resources held by superversion (sst files) might
+      // not be released until the next background job.
+      sv->Cleanup();
+      sv_to_delete = sv;
+    } else {
+      db_mutex->Lock();
+    }
+    // Refresh from the current SuperVersion under the DB mutex.
+    sv = super_version_->Ref();
+    db_mutex->Unlock();
+
+    // deletion happens outside the mutex
+    delete sv_to_delete;
+  }
+  assert(sv != nullptr);
+  return sv;
+}
+
+// Try to hand sv back to thread-local storage. Succeeds only if the slot
+// still contains the kSVInUse marker; otherwise a Scrape invalidated the
+// slot and the caller keeps responsibility for sv's reference.
+bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
+  assert(sv != nullptr);
+  // Put the SuperVersion back
+  void* expected = SuperVersion::kSVInUse;
+  if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
+    // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
+    // storage has not been altered and no Scrape has happened. The
+    // SuperVersion is still current.
+    return true;
+  } else {
+    // ThreadLocal scrape happened in the process of this GetImpl call (after
+    // thread local Swap() at the beginning and before CompareAndSwap()).
+    // This means the SuperVersion it holds is obsolete.
+    assert(expected == SuperVersion::kSVObsolete);
+  }
+  return false;
+}
+
+// Installs new_superversion as the current SuperVersion. Returns the
+// previous SuperVersion if its reference count dropped to zero (the caller
+// deletes it outside of the mutex), or nullptr otherwise.
+SuperVersion* ColumnFamilyData::InstallSuperVersion(
+    SuperVersion* new_superversion, port::Mutex* db_mutex) {
+  new_superversion->db_mutex = db_mutex;
+  new_superversion->Init(mem_, imm_.current(), current_);
+  SuperVersion* old_superversion = super_version_;
+  super_version_ = new_superversion;
+  ++super_version_number_;
+  super_version_->version_number = super_version_number_;
+  // Reset SuperVersions cached in thread local storage
+  if (column_family_set_->db_options_->allow_thread_local) {
+    ResetThreadLocalSuperVersions();
+  }
+  if (old_superversion != nullptr && old_superversion->Unref()) {
+    old_superversion->Cleanup();
+    return old_superversion;  // will let caller delete outside of mutex
+  }
+  return nullptr;
+}
+
+// Invalidates every thread's cached SuperVersion by swapping in kSVObsolete.
+// Slots currently marked kSVInUse are left for the owning thread to notice
+// on its next ReturnThreadLocalSuperVersion().
+void ColumnFamilyData::ResetThreadLocalSuperVersions() {
+  autovector<void*> sv_ptrs;
+  local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
+  for (auto ptr : sv_ptrs) {
+    assert(ptr);
+    if (ptr == SuperVersion::kSVInUse) {
+      continue;
+    }
+    auto sv = static_cast<SuperVersion*>(ptr);
+    if (sv->Unref()) {
+      // NOTE(review): Cleanup() requires the DB mutex; presumably the caller
+      // (InstallSuperVersion) already holds it -- confirm.
+      sv->Cleanup();
+      delete sv;
+    }
+  }
+}
+
+ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
+                                 const DBOptions* db_options,
+                                 const EnvOptions& storage_options,
+                                 Cache* table_cache)
+    : max_column_family_(0),
+      dummy_cfd_(new ColumnFamilyData(dbname, 0, "", nullptr, nullptr,
+                                      ColumnFamilyOptions(), db_options,
+                                      storage_options_, nullptr)),
+      default_cfd_cache_(nullptr),
+      db_name_(dbname),
+      db_options_(db_options),
+      storage_options_(storage_options),
+      table_cache_(table_cache),
+      spin_lock_(ATOMIC_FLAG_INIT) {
+  // initialize linked list
+  dummy_cfd_->prev_ = dummy_cfd_;
+  dummy_cfd_->next_ = dummy_cfd_;
+}
+
+// Drops the set's reference on every remaining column family and deletes it,
+// then does the same for the dummy list head.
+ColumnFamilySet::~ColumnFamilySet() {
+  while (column_family_data_.size() > 0) {
+    // cfd destructor will delete itself from column_family_data_
+    auto cfd = column_family_data_.begin()->second;
+    cfd->Unref();
+    delete cfd;
+  }
+  dummy_cfd_->Unref();
+  delete dummy_cfd_;
+}
+
+ColumnFamilySet::ColumnFamilyData* may be looked up by ID or name below.
+ColumnFamilyData* ColumnFamilySet::GetDefault() const {
+  assert(default_cfd_cache_ != nullptr);
+  return default_cfd_cache_;
+}
+
+// Returns nullptr if no column family with this ID exists.
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
+  auto cfd_iter = column_family_data_.find(id);
+  if (cfd_iter != column_family_data_.end()) {
+    return cfd_iter->second;
+  } else {
+    return nullptr;
+  }
+}
+
+// Returns nullptr if no column family with this name exists.
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(const std::string& name)
+    const {
+  auto cfd_iter = column_families_.find(name);
+  if (cfd_iter != column_families_.end()) {
+    auto cfd = GetColumnFamily(cfd_iter->second);
+    assert(cfd != nullptr);
+    return cfd;
+  } else {
+    return nullptr;
+  }
+}
+
+uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
+  return ++max_column_family_;
+}
+
+uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
+
+void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
+  max_column_family_ = std::max(new_max_column_family, max_column_family_);
+}
+
+// under a DB mutex
+ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
+    const std::string& name, uint32_t id, Version* dummy_versions,
+    const ColumnFamilyOptions& options) {
+  assert(column_families_.find(name) == column_families_.end());
+  ColumnFamilyData* new_cfd =
+      new ColumnFamilyData(db_name_, id, name, dummy_versions, table_cache_,
+                           options, db_options_, storage_options_, this);
+  // The spin lock guards the maps for readers not holding the DB mutex.
+  Lock();
+  column_families_.insert({name, id});
+  column_family_data_.insert({id, new_cfd});
+  Unlock();
+  max_column_family_ = std::max(max_column_family_, id);
+  // add to linked list, just before the dummy head
+  new_cfd->next_ = dummy_cfd_;
+  auto prev = dummy_cfd_->prev_;
+  new_cfd->prev_ = prev;
+  prev->next_ = new_cfd;
+  dummy_cfd_->prev_ = new_cfd;
+  if (id == 0) {
+    default_cfd_cache_ = new_cfd;
+  }
+  return new_cfd;
+}
+
+// Acquire the spin lock that protects column_families_ and
+// column_family_data_ for readers not holding the DB mutex.
+void ColumnFamilySet::Lock() {
+  // spin lock
+  while (spin_lock_.test_and_set(std::memory_order_acquire)) {
+  }
+}
+
+void ColumnFamilySet::Unlock() { spin_lock_.clear(std::memory_order_release); }
+
+// REQUIRES: DB mutex held
+// Don't call while iterating over ColumnFamilySet
+void ColumnFamilySet::FreeDeadColumnFamilies() {
+  autovector<ColumnFamilyData*> to_delete;
+  // Collect first, then delete: ~ColumnFamilyData unlinks the node from the
+  // very list we are walking.
+  for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
+    if (cfd->refs_ == 0) {
+      to_delete.push_back(cfd);
+    }
+  }
+  for (auto cfd : to_delete) {
+    // this is very rare, so it's not a problem that we do it under a mutex
+    delete cfd;
+  }
+}
+
+// under a DB mutex; called from the ColumnFamilyData destructor
+void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
+  auto cfd_iter = column_family_data_.find(cfd->GetID());
+  assert(cfd_iter != column_family_data_.end());
+  Lock();
+  column_family_data_.erase(cfd_iter);
+  column_families_.erase(cfd->GetName());
+  Unlock();
+}
+
+// Positions this object at the column family with the given ID; the
+// accessors below then operate on that CF. Returns false (current_ left
+// nullptr) if no such column family exists.
+bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
+  if (column_family_id == 0) {
+    // optimization for common case
+    current_ = column_family_set_->GetDefault();
+  } else {
+    // maybe outside of db mutex, should lock
+    column_family_set_->Lock();
+    current_ = column_family_set_->GetColumnFamily(column_family_id);
+    column_family_set_->Unlock();
+  }
+  handle_.SetCFD(current_);
+  return current_ != nullptr;
+}
+
+uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
+  assert(current_ != nullptr);
+  return current_->GetLogNumber();
+}
+
+MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
+  assert(current_ != nullptr);
+  return current_->mem();
+}
+
+const Options* ColumnFamilyMemTablesImpl::GetOptions() const {
+  assert(current_ != nullptr);
+  return current_->options();
+}
+
+ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
+  assert(current_ != nullptr);
+  return &handle_;
+}
+
+}  // namespace rocksdb
diff --git a/db/column_family.h b/db/column_family.h
new file mode 100644 (file)
index 0000000..d306f4e
--- /dev/null
@@ -0,0 +1,418 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <unordered_map>
+#include <string>
+#include <vector>
+#include <atomic>
+
+#include "rocksdb/options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "db/memtable_list.h"
+#include "db/write_batch_internal.h"
+#include "db/table_cache.h"
+#include "util/thread_local.h"
+
+namespace rocksdb {
+
+class Version;
+class VersionSet;
+class MemTable;
+class MemTableListVersion;
+class CompactionPicker;
+class Compaction;
+class InternalKey;
+class InternalStats;
+class ColumnFamilyData;
+class DBImpl;
+class LogBuffer;
+
+// ColumnFamilyHandleImpl is the class that clients use to access different
+// column families. It has a non-trivial destructor, which gets called when
+// the client is done using the column family
+class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
+ public:
+  // create while holding the mutex
+  ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex);
+  // destroy without mutex
+  virtual ~ColumnFamilyHandleImpl();
+  virtual ColumnFamilyData* cfd() const { return cfd_; }
+
+  virtual uint32_t GetID() const;
+
+ private:
+  ColumnFamilyData* cfd_;
+  DBImpl* db_;
+  port::Mutex* mutex_;
+};
+
+// Does not ref-count ColumnFamilyData
+// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
+// calls DBImpl methods. When this happens, MemTableInserter needs access to
+// ColumnFamilyHandle (same as the client would need). In that case, we feed
+// MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl
+// methods
+class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
+ public:
+  ColumnFamilyHandleInternal()
+      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr) {}
+
+  void SetCFD(ColumnFamilyData* cfd) { internal_cfd_ = cfd; }
+  virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
+
+ private:
+  ColumnFamilyData* internal_cfd_;
+};
+
+// holds references to memtable, all immutable memtables and version
+struct SuperVersion {
+  MemTable* mem;
+  MemTableListVersion* imm;
+  Version* current;
+  std::atomic<uint32_t> refs;
+  // We need to_delete because during Cleanup(), imm->Unref() returns
+  // all memtables that we need to free through this vector. We then
+  // delete all those memtables outside of mutex, during destruction
+  autovector<MemTable*> to_delete;
+  // Version number of the current SuperVersion
+  uint64_t version_number;
+  port::Mutex* db_mutex;
+
+  // should be called outside the mutex
+  SuperVersion() = default;
+  ~SuperVersion();
+  SuperVersion* Ref();
+
+  // Returns true if this was the last reference; the caller is then
+  // responsible for calling Cleanup() and deleting the object.
+  bool Unref();
+
+  // call these two methods with db mutex held
+  // Cleanup unrefs mem, imm and current. Also, it stores all memtables
+  // that needs to be deleted in to_delete vector. Unrefing those
+  // objects needs to be done in the mutex
+  void Cleanup();
+  void Init(MemTable* new_mem, MemTableListVersion* new_imm,
+            Version* new_current);
+
+  // The value of dummy is not actually used. kSVInUse takes its address as a
+  // mark in the thread local storage to indicate the SuperVersion is in use
+  // by thread. This way, the value of kSVInUse is guaranteed to have no
+  // conflict with SuperVersion object address and portable on different
+  // platform.
+  static int dummy;
+  static void* const kSVInUse;
+  static void* const kSVObsolete;
+};
+
+extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
+                                           const InternalFilterPolicy* ipolicy,
+                                           const ColumnFamilyOptions& src);
+
+class ColumnFamilySet;
+
+// This class keeps all the data that a column family needs. It's mostly dumb
+// and used just to provide access to metadata.
+// Most methods require DB mutex held, unless otherwise noted
+class ColumnFamilyData {
+ public:
+  ~ColumnFamilyData();
+
+  // thread-safe
+  uint32_t GetID() const { return id_; }
+  // thread-safe
+  const std::string& GetName() const { return name_; }
+
+  void Ref() { ++refs_; }
+  // will just decrease reference count to 0, but will not delete it. returns
+  // true if the ref count was decreased to zero. in that case, it can be
+  // deleted by the caller immediately, or later, by calling
+  // FreeDeadColumnFamilies()
+  bool Unref() {
+    assert(refs_ > 0);
+    return --refs_ == 0;
+  }
+
+  // This can only be called from single-threaded VersionSet::LogAndApply()
+  // After dropping column family no other operation on that column family
+  // will be executed. All the files and memory will be, however, kept around
+  // until client drops the column family handle. That way, client can still
+  // access data from dropped column family.
+  // Column family can be dropped and still alive. In that state:
+  // *) Column family is not included in the iteration.
+  // *) Compaction and flush is not executed on the dropped column family.
+  // *) Client can continue writing and reading from column family. However, all
+  // writes stay in the current memtable.
+  // When the dropped column family is unreferenced, then we:
+  // *) delete all memory associated with that column family
+  // *) delete all the files associated with that column family
+  void SetDropped() {
+    // can't drop default CF
+    assert(id_ != 0);
+    dropped_ = true;
+  }
+  bool IsDropped() const { return dropped_; }
+
+  // thread-safe
+  int NumberLevels() const { return options_.num_levels; }
+
+  void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
+  uint64_t GetLogNumber() const { return log_number_; }
+
+  // thread-safe
+  const Options* options() const { return &options_; }
+  const EnvOptions* soptions() const;
+
+  InternalStats* internal_stats() { return internal_stats_.get(); }
+
+  MemTableList* imm() { return &imm_; }
+  MemTable* mem() { return mem_; }
+  Version* current() { return current_; }
+  Version* dummy_versions() { return dummy_versions_; }
+  void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
+  void SetCurrent(Version* current);
+  void CreateNewMemtable();
+
+  TableCache* table_cache() { return table_cache_.get(); }
+
+  // See documentation in compaction_picker.h
+  Compaction* PickCompaction(LogBuffer* log_buffer);
+  Compaction* CompactRange(int input_level, int output_level,
+                           const InternalKey* begin, const InternalKey* end,
+                           InternalKey** compaction_end);
+
+  CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
+  // thread-safe
+  const Comparator* user_comparator() const {
+    return internal_comparator_.user_comparator();
+  }
+  // thread-safe
+  const InternalKeyComparator& internal_comparator() const {
+    return internal_comparator_;
+  }
+
+  SuperVersion* GetSuperVersion() { return super_version_; }
+  // thread-safe
+  // Return an already referenced SuperVersion to be used safely.
+  SuperVersion* GetReferencedSuperVersion(port::Mutex* db_mutex);
+  // thread-safe
+  // Get SuperVersion stored in thread local storage. If it does not exist,
+  // get a reference from a current SuperVersion.
+  SuperVersion* GetThreadLocalSuperVersion(port::Mutex* db_mutex);
+  // Try to return SuperVersion back to thread local storage. Return true on
+  // success and false on failure. It fails when the thread local storage
+  // contains anything other than SuperVersion::kSVInUse flag.
+  bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
+  // thread-safe
+  uint64_t GetSuperVersionNumber() const {
+    return super_version_number_.load();
+  }
+  // Installs new_superversion and returns the previous SuperVersion* if its
+  // reference count dropped to zero (it then needs deletion by the caller),
+  // or nullptr otherwise.
+  // As argument takes a pointer to allocated SuperVersion to enable
+  // the clients to allocate SuperVersion outside of mutex.
+  SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
+                                    port::Mutex* db_mutex);
+
+  void ResetThreadLocalSuperVersions();
+
+  // A flag indicating whether writes need to slow down because there are
+  // too many level-0 files.
+  bool NeedSlowdownForNumLevel0Files() const {
+    return need_slowdown_for_num_level0_files_;
+  }
+
+ private:
+  friend class ColumnFamilySet;
+  ColumnFamilyData(const std::string& dbname, uint32_t id,
+                   const std::string& name, Version* dummy_versions,
+                   Cache* table_cache, const ColumnFamilyOptions& options,
+                   const DBOptions* db_options,
+                   const EnvOptions& storage_options,
+                   ColumnFamilySet* column_family_set);
+
+  uint32_t id_;
+  const std::string name_;
+  Version* dummy_versions_;  // Head of circular doubly-linked list of versions.
+  Version* current_;         // == dummy_versions->prev_
+
+  int refs_;                   // outstanding references to ColumnFamilyData
+  bool dropped_;               // true if client dropped it
+
+  const InternalKeyComparator internal_comparator_;
+  const InternalFilterPolicy internal_filter_policy_;
+
+  Options const options_;
+
+  std::unique_ptr<TableCache> table_cache_;
+
+  std::unique_ptr<InternalStats> internal_stats_;
+
+  MemTable* mem_;
+  MemTableList imm_;
+  SuperVersion* super_version_;
+
+  // An ordinal representing the current SuperVersion. Updated by
+  // InstallSuperVersion(), i.e. incremented every time super_version_
+  // changes.
+  std::atomic<uint64_t> super_version_number_;
+
+  // Thread's local copy of SuperVersion pointer
+  // This needs to be destructed before mutex_
+  std::unique_ptr<ThreadLocalPtr> local_sv_;
+
+  // pointers for a circular linked list. we use it to support iterations
+  // that can be concurrent with writes
+  ColumnFamilyData* next_;
+  ColumnFamilyData* prev_;
+
+  // This is the earliest log file number that contains data from this
+  // Column Family. All earlier log files must be ignored and not
+  // recovered from
+  uint64_t log_number_;
+
+  // A flag indicating whether we should delay writes because
+  // we have too many level 0 files
+  bool need_slowdown_for_num_level0_files_;
+
+  // An object that keeps all the compaction stats
+  // and picks the next compaction
+  std::unique_ptr<CompactionPicker> compaction_picker_;
+
+  ColumnFamilySet* column_family_set_;
+};
+
+// ColumnFamilySet has interesting thread-safety requirements
+// * CreateColumnFamily() or RemoveColumnFamily() -- need to protect by DB
+// mutex. Inside, column_family_data_ and column_families_ will be protected
+// by Lock() and Unlock(). CreateColumnFamily() should ONLY be called from
+// VersionSet::LogAndApply() in the normal runtime. It is also called
+// during Recovery and in DumpManifest(). RemoveColumnFamily() is called
+// from ColumnFamilyData destructor
+// * Iteration -- hold DB mutex, but you can release it in the body of
+// iteration. If you release DB mutex in body, reference the column
+// family before the mutex and unreference after you unlock, since the column
+// family might get dropped when the DB mutex is released
+// * GetDefault() -- thread safe
+// * GetColumnFamily() -- either inside of DB mutex or call Lock() <-> Unlock()
+// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily() --
+// inside of DB mutex
+class ColumnFamilySet {
+ public:
+  // ColumnFamilySet supports iteration
+  // The iterator walks the circular linked list of ColumnFamilyData and
+  // skips over entries that are dead (refcount zero) or dropped.
+  class iterator {
+   public:
+    explicit iterator(ColumnFamilyData* cfd)
+        : current_(cfd) {}
+    iterator& operator++() {
+      // dummy is never dead or dropped, so this will never be infinite
+      do {
+        current_ = current_->next_;
+      } while (current_->refs_ == 0 || current_->IsDropped());
+      return *this;
+    }
+    // Compares only the underlying ColumnFamilyData pointer
+    bool operator!=(const iterator& other) {
+      return this->current_ != other.current_;
+    }
+    ColumnFamilyData* operator*() { return current_; }
+
+   private:
+    ColumnFamilyData* current_;
+  };
+
+  ColumnFamilySet(const std::string& dbname, const DBOptions* db_options,
+                  const EnvOptions& storage_options, Cache* table_cache);
+  ~ColumnFamilySet();
+
+  ColumnFamilyData* GetDefault() const;
+  // GetColumnFamily() calls return nullptr if column family is not found
+  ColumnFamilyData* GetColumnFamily(uint32_t id) const;
+  ColumnFamilyData* GetColumnFamily(const std::string& name) const;
+  // this call will return the next available column family ID. it guarantees
+  // that there is no column family with id greater than or equal to the
+  // returned value in the current running instance or anytime in RocksDB
+  // instance history.
+  uint32_t GetNextColumnFamilyID();
+  uint32_t GetMaxColumnFamily();
+  void UpdateMaxColumnFamily(uint32_t new_max_column_family);
+
+  // Creates and registers a new column family
+  // REQUIRES: DB mutex held (see thread-safety comment on the class above)
+  ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
+                                       Version* dummy_version,
+                                       const ColumnFamilyOptions& options);
+
+  // begin() starts right after the dummy node and end() is the dummy node
+  // itself, so a full iteration visits every live column family once
+  iterator begin() { return iterator(dummy_cfd_->next_); }
+  iterator end() { return iterator(dummy_cfd_); }
+
+  // Acquire/release spin_lock_; see the comment on column_families_ below
+  // for the lock-ordering rules with respect to the DB mutex
+  void Lock();
+  void Unlock();
+
+  // REQUIRES: DB mutex held
+  // Don't call while iterating over ColumnFamilySet
+  void FreeDeadColumnFamilies();
+
+ private:
+  friend class ColumnFamilyData;
+  // helper function that gets called from cfd destructor
+  // REQUIRES: DB mutex held
+  void RemoveColumnFamily(ColumnFamilyData* cfd);
+
+  // column_families_ and column_family_data_ need to be protected:
+  // * when mutating: 1. DB mutex locked first, 2. spinlock locked second
+  // * when reading, either: 1. lock DB mutex, or 2. lock spinlock
+  //  (if both, respect the ordering to avoid deadlock!)
+  std::unordered_map<std::string, uint32_t> column_families_;  // name -> ID
+  std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;
+
+  // highest column family ID handed out; see GetNextColumnFamilyID()
+  uint32_t max_column_family_;
+  // dummy head node of the circular linked list of column families
+  ColumnFamilyData* dummy_cfd_;
+  // We don't hold the refcount here, since default column family always exists
+  // We are also not responsible for cleaning up default_cfd_cache_. This is
+  // just a cache that makes common case (accessing default column family)
+  // faster
+  ColumnFamilyData* default_cfd_cache_;
+
+  const std::string db_name_;
+  const DBOptions* const db_options_;
+  const EnvOptions storage_options_;
+  Cache* table_cache_;
+  // spin lock backing Lock()/Unlock()
+  std::atomic_flag spin_lock_;
+};
+
+// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
+// memtables of different column families (specified by ID in the write batch)
+class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
+ public:
+  explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
+      : column_family_set_(column_family_set), current_(nullptr) {}
+
+  // sets current_ to ColumnFamilyData with column_family_id
+  // returns false if column family doesn't exist
+  bool Seek(uint32_t column_family_id) override;
+
+  // Returns log number of the selected column family
+  uint64_t GetLogNumber() const override;
+
+  // REQUIRES: Seek() called first
+  virtual MemTable* GetMemTable() const override;
+
+  // Returns options for selected column family
+  // REQUIRES: Seek() called first
+  virtual const Options* GetOptions() const override;
+
+  // Returns column family handle for the selected column family
+  virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
+
+ private:
+  ColumnFamilySet* column_family_set_;
+  ColumnFamilyData* current_;
+  ColumnFamilyHandleInternal handle_;
+};
+
+}  // namespace rocksdb
diff --git a/db/column_family_test.cc b/db/column_family_test.cc
new file mode 100644 (file)
index 0000000..5f7ff48
--- /dev/null
@@ -0,0 +1,977 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <vector>
+#include <string>
+
+#include "db/db_impl.h"
+#include "rocksdb/env.h"
+#include "rocksdb/db.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+namespace rocksdb {
+
+namespace {
+// Builds a string of `len` random bytes drawn from `rnd`.
+std::string RandomString(Random* rnd, int len) {
+  std::string result;
+  test::RandomString(rnd, len, &result);
+  return result;
+}
+}  // anonymous namespace
+
+// counts how many operations were performed
+// Wraps a real Env and counts calls to NewWritableFile(); tests use the
+// count to verify how many new files (SSTs, logs, ...) were created.
+class EnvCounter : public EnvWrapper {
+ public:
+  explicit EnvCounter(Env* base)
+      : EnvWrapper(base), num_new_writable_file_(0) {}
+  int GetNumberOfNewWritableFileCalls() {
+    return num_new_writable_file_;
+  }
+  // Bumps the counter, then forwards to the wrapped Env
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& soptions) {
+    ++num_new_writable_file_;
+    return EnvWrapper::NewWritableFile(f, r, soptions);
+  }
+
+ private:
+  // NOTE(review): plain int, not atomic -- the increment can race if the DB
+  // creates files from multiple background threads; confirm tests tolerate
+  // an approximate count or make this atomic
+  int num_new_writable_file_;
+};
+
+// Test harness: owns a DB plus a parallel pair of vectors -- handles_[i] is
+// the handle for the column family whose name is names_[i] (names_[i] becomes
+// "" once that column family is dropped).
+class ColumnFamilyTest {
+ public:
+  ColumnFamilyTest() : rnd_(139) {
+    env_ = new EnvCounter(Env::Default());
+    dbname_ = test::TmpDir() + "/column_family_test";
+    db_options_.create_if_missing = true;
+    db_options_.env = env_;
+    // every test starts from an empty DB
+    DestroyDB(dbname_, Options(db_options_, column_family_options_));
+  }
+
+  ~ColumnFamilyTest() {
+    delete env_;
+  }
+
+  // Releases all handles and closes the DB; on-disk state is left intact
+  // (use Destroy() for a full wipe)
+  void Close() {
+    for (auto h : handles_) {
+      delete h;
+    }
+    handles_.clear();
+    names_.clear();
+    delete db_;
+    db_ = nullptr;
+  }
+
+  // Opens the DB with the given column families. Empty `options` means
+  // column_family_options_ is used for every family; otherwise `options`
+  // must be parallel to `cf`.
+  Status TryOpen(std::vector<std::string> cf,
+                 std::vector<ColumnFamilyOptions> options = {}) {
+    std::vector<ColumnFamilyDescriptor> column_families;
+    names_.clear();
+    for (size_t i = 0; i < cf.size(); ++i) {
+      column_families.push_back(ColumnFamilyDescriptor(
+          cf[i], options.size() == 0 ? column_family_options_ : options[i]));
+      names_.push_back(cf[i]);
+    }
+    return DB::Open(db_options_, dbname_, column_families, &handles_, &db_);
+  }
+
+  // Same as TryOpen(), but opens in read-only mode
+  Status OpenReadOnly(std::vector<std::string> cf,
+                         std::vector<ColumnFamilyOptions> options = {}) {
+    std::vector<ColumnFamilyDescriptor> column_families;
+    names_.clear();
+    for (size_t i = 0; i < cf.size(); ++i) {
+      column_families.push_back(ColumnFamilyDescriptor(
+          cf[i], options.size() == 0 ? column_family_options_ : options[i]));
+      names_.push_back(cf[i]);
+    }
+    return DB::OpenForReadOnly(db_options_, dbname_, column_families, &handles_,
+                               &db_);
+  }
+
+  void AssertOpenReadOnly(std::vector<std::string> cf,
+                    std::vector<ColumnFamilyOptions> options = {}) {
+    ASSERT_OK(OpenReadOnly(cf, options));
+  }
+
+
+  void Open(std::vector<std::string> cf,
+            std::vector<ColumnFamilyOptions> options = {}) {
+    ASSERT_OK(TryOpen(cf, options));
+  }
+
+  // Opens the DB with only the default column family
+  void Open() {
+    Open({"default"});
+  }
+
+  DBImpl* dbfull() { return reinterpret_cast<DBImpl*>(db_); }
+
+  // Returns the value of the named DB property for column family `cf`,
+  // parsed as an int
+  int GetProperty(int cf, std::string property) {
+    std::string value;
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value));
+    return std::stoi(value);
+  }
+
+  // Close() plus removal of all on-disk state
+  void Destroy() {
+    for (auto h : handles_) {
+      delete h;
+    }
+    handles_.clear();
+    names_.clear();
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_)));
+  }
+
+  // Creates the named column families in the open DB and appends their
+  // handles/names to handles_/names_
+  void CreateColumnFamilies(
+      const std::vector<std::string>& cfs,
+      const std::vector<ColumnFamilyOptions> options = {}) {
+    int cfi = handles_.size();
+    handles_.resize(cfi + cfs.size());
+    names_.resize(cfi + cfs.size());
+    for (size_t i = 0; i < cfs.size(); ++i) {
+      ASSERT_OK(db_->CreateColumnFamily(
+          options.size() == 0 ? column_family_options_ : options[i], cfs[i],
+          &handles_[cfi]));
+      names_[cfi] = cfs[i];
+      cfi++;
+    }
+  }
+
+  // Closes and reopens the DB with every column family that has not been
+  // dropped (i.e. whose names_ entry is still non-empty)
+  void Reopen(const std::vector<ColumnFamilyOptions> options = {}) {
+    std::vector<std::string> names;
+    for (auto name : names_) {
+      if (name != "") {
+        names.push_back(name);
+      }
+    }
+    Close();
+    assert(options.size() == 0 || names.size() == options.size());
+    Open(names, options);
+  }
+
+  void CreateColumnFamiliesAndReopen(const std::vector<std::string>& cfs) {
+    CreateColumnFamilies(cfs);
+    Reopen();
+  }
+
+  // Drops the column families at the given handle indices; marks their
+  // names_ slots with "" so Reopen() skips them
+  void DropColumnFamilies(const std::vector<int>& cfs) {
+    for (auto cf : cfs) {
+      ASSERT_OK(db_->DropColumnFamily(handles_[cf]));
+      delete handles_[cf];
+      handles_[cf] = nullptr;
+      names_[cf] = "";
+    }
+  }
+
+  // Writes `num` random entries of `key_value_size` total bytes each into
+  // column family `cf`
+  void PutRandomData(int cf, int num, int key_value_size) {
+    for (int i = 0; i < num; ++i) {
+      // 10 bytes for key, rest is value
+      ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 10),
+                    RandomString(&rnd_, key_value_size - 10)));
+    }
+  }
+
+  void WaitForFlush(int cf) {
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+  }
+
+  void WaitForCompaction() { ASSERT_OK(dbfull()->TEST_WaitForCompact()); }
+
+  Status Put(int cf, const std::string& key, const std::string& value) {
+    return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
+  }
+  Status Merge(int cf, const std::string& key, const std::string& value) {
+    return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value));
+  }
+  Status Flush(int cf) {
+    return db_->Flush(FlushOptions(), handles_[cf]);
+  }
+
+  // Reads `key` from column family `cf`; returns "NOT_FOUND" for a missing
+  // key and the Status string for any other error
+  std::string Get(int cf, const std::string& key) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    std::string result;
+    Status s = db_->Get(options, handles_[cf], Slice(key), &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  void CompactAll(int cf) {
+    ASSERT_OK(db_->CompactRange(handles_[cf], nullptr, nullptr));
+  }
+
+  void Compact(int cf, const Slice& start, const Slice& limit) {
+    ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit));
+  }
+
+  int NumTableFilesAtLevel(int level, int cf) {
+    return GetProperty(cf,
+                       "rocksdb.num-files-at-level" + std::to_string(level));
+  }
+
+  // Return spread of files per level
+  // e.g. "1,2,4" = one file in L0, two in L1, four in L2; trailing
+  // zero-file levels are trimmed off
+  std::string FilesPerLevel(int cf) {
+    std::string result;
+    int last_non_zero_offset = 0;
+    for (int level = 0; level < dbfull()->NumberLevels(handles_[cf]); level++) {
+      int f = NumTableFilesAtLevel(level, cf);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+
+  // Number of live SST files across all column families
+  int CountLiveFiles() {
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    return static_cast<int>(metadata.size());
+  }
+
+  // Do n memtable flushes, each of which produces an sstable
+  // covering the range [small,large].
+  void MakeTables(int cf, int n, const std::string& small,
+                  const std::string& large) {
+    for (int i = 0; i < n; i++) {
+      ASSERT_OK(Put(cf, small, "begin"));
+      ASSERT_OK(Put(cf, large, "end"));
+      ASSERT_OK(db_->Flush(FlushOptions(), handles_[cf]));
+    }
+  }
+
+  // Counts WAL files with type kAliveLogFile, sleeping first so background
+  // deletion of obsolete logs has a chance to run
+  int CountLiveLogFiles() {
+    int micros_wait_for_log_deletion = 20000;
+    env_->SleepForMicroseconds(micros_wait_for_log_deletion);
+    int ret = 0;
+    VectorLogPtr wal_files;
+    Status s;
+    // GetSortedWalFiles is a flakey function -- it gets all the wal_dir
+    // children files and then later checks for their existance. if some of the
+    // log files doesn't exist anymore, it reports an error. it does all of this
+    // without DB mutex held, so if a background process deletes the log file
+    // while the function is being executed, it returns an error. We retry the
+    // function 10 times to avoid the error failing the test
+    for (int retries = 0; retries < 10; ++retries) {
+      wal_files.clear();
+      s = db_->GetSortedWalFiles(wal_files);
+      if (s.ok()) {
+        break;
+      }
+    }
+    ASSERT_OK(s);
+    for (const auto& wal : wal_files) {
+      if (wal->Type() == kAliveLogFile) {
+        ++ret;
+      }
+    }
+    return ret;
+  }
+
+  // Asserts rocksdb.num-immutable-mem-table for every open column family
+  void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
+    assert(num_per_cf.size() == handles_.size());
+
+    for (size_t i = 0; i < num_per_cf.size(); ++i) {
+      ASSERT_EQ(num_per_cf[i],
+                GetProperty(i, "rocksdb.num-immutable-mem-table"));
+    }
+  }
+
+  // Byte-for-byte copies `source` to `destination`; size == 0 (the default)
+  // means copy the entire file
+  void CopyFile(const std::string& source, const std::string& destination,
+                uint64_t size = 0) {
+    const EnvOptions soptions;
+    unique_ptr<SequentialFile> srcfile;
+    ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+    unique_ptr<WritableFile> destfile;
+    ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+    if (size == 0) {
+      // default argument means copy everything
+      ASSERT_OK(env_->GetFileSize(source, &size));
+    }
+
+    char buffer[4096];
+    Slice slice;
+    while (size > 0) {
+      uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+      ASSERT_OK(srcfile->Read(one, &slice, buffer));
+      ASSERT_OK(destfile->Append(slice));
+      size -= slice.size();
+    }
+    ASSERT_OK(destfile->Close());
+  }
+
+  std::vector<ColumnFamilyHandle*> handles_;
+  std::vector<std::string> names_;  // "" marks a dropped column family
+  ColumnFamilyOptions column_family_options_;
+  DBOptions db_options_;
+  std::string dbname_;
+  DB* db_ = nullptr;
+  EnvCounter* env_;  // owned; counts NewWritableFile calls
+  Random rnd_;
+};
+
+// Column family IDs are never reused: after dropping the family with ID 3,
+// the next created family must get ID 4, even across reopens.
+TEST(ColumnFamilyTest, DontReuseColumnFamilyID) {
+  for (int iter = 0; iter < 3; ++iter) {
+    Open();
+    CreateColumnFamilies({"one", "two", "three"});
+    // default gets ID 0; "one"/"two"/"three" get 1/2/3 in creation order
+    for (size_t i = 0; i < handles_.size(); ++i) {
+      auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+      ASSERT_EQ(i, cfh->GetID());
+    }
+    if (iter == 1) {
+      Reopen();
+    }
+    DropColumnFamilies({3});
+    Reopen();
+    if (iter == 2) {
+      // this tests if max_column_family is correctly persisted with
+      // WriteSnapshot()
+      Reopen();
+    }
+    CreateColumnFamilies({"three2"});
+    // ID 3 that was used for dropped column family "three" should not be reused
+    auto cfh3 = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[3]);
+    ASSERT_EQ(4U, cfh3->GetID());
+    Close();
+    Destroy();
+  }
+}
+
+
+// Basic create/drop lifecycle: data in dropped families is gone, handle
+// indices shift as families come and go, and ListColumnFamilies reflects
+// exactly the surviving families.
+TEST(ColumnFamilyTest, AddDrop) {
+  Open();
+  CreateColumnFamilies({"one", "two", "three"});
+  ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+  ASSERT_EQ("NOT_FOUND", Get(2, "fodor"));
+  DropColumnFamilies({2});
+  ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+  CreateColumnFamilies({"four"});
+  ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  ASSERT_EQ("mirko", Get(1, "fodor"));
+  ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
+  Close();
+  // opening with only "default" fails while other families still exist
+  ASSERT_TRUE(TryOpen({"default"}).IsInvalidArgument());
+  Open({"default", "one", "three", "four"});
+  DropColumnFamilies({1});
+  Reopen();
+  Close();
+
+  std::vector<std::string> families;
+  ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
+  sort(families.begin(), families.end());
+  ASSERT_TRUE(families ==
+              std::vector<std::string>({"default", "four", "three"}));
+}
+
+// Dropping a column family must delete all of its live files.
+TEST(ColumnFamilyTest, DropTest) {
+  // first iteration - dont reopen DB before dropping
+  // second iteration - reopen DB before dropping
+  for (int iter = 0; iter < 2; ++iter) {
+    Open({"default"});
+    CreateColumnFamiliesAndReopen({"pikachu"});
+    for (int i = 0; i < 100; ++i) {
+      ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i)));
+    }
+    ASSERT_OK(Flush(1));
+
+    if (iter == 1) {
+      Reopen();
+    }
+    ASSERT_EQ("bar1", Get(1, "1"));
+
+    ASSERT_EQ(CountLiveFiles(), 1);
+    DropColumnFamilies({1});
+    // make sure that all files are deleted when we drop the column family
+    ASSERT_EQ(CountLiveFiles(), 0);
+    Destroy();
+  }
+}
+
+// A WriteBatch that references a dropped column family must fail the
+// Write() with InvalidArgument.
+TEST(ColumnFamilyTest, WriteBatchFailure) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+  WriteBatch batch;
+  batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"));
+  // succeeds while the column family is alive...
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+  DropColumnFamilies({1});
+  // ...and fails once it has been dropped
+  Status s = db_->Write(WriteOptions(), &batch);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  Close();
+}
+
+// Reads and writes to multiple column families stay isolated per family
+// and survive reopens.
+TEST(ColumnFamilyTest, ReadWrite) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+  ASSERT_OK(Put(0, "foo", "v1"));
+  ASSERT_OK(Put(0, "bar", "v2"));
+  ASSERT_OK(Put(1, "mirko", "v3"));
+  ASSERT_OK(Put(0, "foo", "v2"));
+  ASSERT_OK(Put(2, "fodor", "v5"));
+
+  for (int iter = 0; iter <= 3; ++iter) {
+    ASSERT_EQ("v2", Get(0, "foo"));
+    ASSERT_EQ("v2", Get(0, "bar"));
+    ASSERT_EQ("v3", Get(1, "mirko"));
+    ASSERT_EQ("v5", Get(2, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+    if (iter <= 1) {
+      Reopen();
+    }
+  }
+  Close();
+}
+
+// Recovered WAL entries must be applied exactly once: if the same logs are
+// replayed again on a later recovery, the merge operands would be applied
+// twice and the values would be wrong.
+TEST(ColumnFamilyTest, IgnoreRecoveredLog) {
+  std::string backup_logs = dbname_ + "/backup_logs";
+
+  // delete old files in backup_logs directory
+  ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+  ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+  std::vector<std::string> old_files;
+  env_->GetChildren(backup_logs, &old_files);
+  for (auto& file : old_files) {
+    if (file != "." && file != "..") {
+      env_->DeleteFile(backup_logs + "/" + file);
+    }
+  }
+
+  // uint64-add merge operator makes double-application detectable
+  column_family_options_.merge_operator =
+      MergeOperators::CreateUInt64AddOperator();
+  db_options_.wal_dir = dbname_ + "/logs";
+  Destroy();
+  Open();
+  CreateColumnFamilies({"cf1", "cf2"});
+
+  // fill up the DB
+  std::string one, two, three;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+  ASSERT_OK(Merge(0, "foo", one));
+  ASSERT_OK(Merge(1, "mirko", one));
+  ASSERT_OK(Merge(0, "foo", one));
+  ASSERT_OK(Merge(2, "bla", one));
+  ASSERT_OK(Merge(2, "fodor", one));
+  ASSERT_OK(Merge(0, "bar", one));
+  ASSERT_OK(Merge(2, "bla", one));
+  ASSERT_OK(Merge(1, "mirko", two));
+  ASSERT_OK(Merge(1, "franjo", one));
+
+  // copy the logs to backup
+  std::vector<std::string> logs;
+  env_->GetChildren(db_options_.wal_dir, &logs);
+  for (auto& log : logs) {
+    if (log != ".." && log != ".") {
+      CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log);
+    }
+  }
+
+  // recover the DB
+  Close();
+
+  // 1. check consistency
+  // 2. copy the logs from backup back to WAL dir. if the recovery happens
+  // again on the same log files, this should lead to incorrect results
+  // due to applying merge operator twice
+  // 3. check consistency
+  for (int iter = 0; iter < 2; ++iter) {
+    // assert consistency
+    Open({"default", "cf1", "cf2"});
+    ASSERT_EQ(two, Get(0, "foo"));
+    ASSERT_EQ(one, Get(0, "bar"));
+    ASSERT_EQ(three, Get(1, "mirko"));
+    ASSERT_EQ(one, Get(1, "franjo"));
+    ASSERT_EQ(one, Get(2, "fodor"));
+    ASSERT_EQ(two, Get(2, "bla"));
+    Close();
+
+    if (iter == 0) {
+      // copy the logs from backup back to wal dir
+      for (auto& log : logs) {
+        if (log != ".." && log != ".") {
+          CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log);
+        }
+      }
+    }
+  }
+}
+
+// Same checks as ReadWrite, but after explicitly flushing every column
+// family, so reads are served from SST files rather than memtables.
+TEST(ColumnFamilyTest, FlushTest) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+  ASSERT_OK(Put(0, "foo", "v1"));
+  ASSERT_OK(Put(0, "bar", "v2"));
+  ASSERT_OK(Put(1, "mirko", "v3"));
+  ASSERT_OK(Put(0, "foo", "v2"));
+  ASSERT_OK(Put(2, "fodor", "v5"));
+  for (int i = 0; i < 3; ++i) {
+    Flush(i);
+  }
+  Reopen();
+
+  for (int iter = 0; iter <= 2; ++iter) {
+    ASSERT_EQ("v2", Get(0, "foo"));
+    ASSERT_EQ("v2", Get(0, "bar"));
+    ASSERT_EQ("v3", Get(1, "mirko"));
+    ASSERT_EQ("v5", Get(2, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+    if (iter <= 1) {
+      Reopen();
+    }
+  }
+  Close();
+}
+
+// Makes sure that obsolete log files get deleted
+// A log can only go away once every column family with data in it has
+// flushed that data (the inline traces below mark flushed entries with
+// parentheses).
+TEST(ColumnFamilyTest, LogDeletionTest) {
+  db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+  column_family_options_.write_buffer_size = 100000;  // 100KB
+  Open();
+  CreateColumnFamilies({"one", "two", "three", "four"});
+  // Each bracket is one log file. if number is in (), it means
+  // we don't need it anymore (it's been flushed)
+  // []
+  ASSERT_EQ(CountLiveLogFiles(), 0);
+  PutRandomData(0, 1, 100);
+  // [0]
+  PutRandomData(1, 1, 100);
+  // [0, 1]
+  PutRandomData(1, 1000, 100);
+  WaitForFlush(1);
+  // [0, (1)] [1]
+  ASSERT_EQ(CountLiveLogFiles(), 2);
+  PutRandomData(0, 1, 100);
+  // [0, (1)] [0, 1]
+  ASSERT_EQ(CountLiveLogFiles(), 2);
+  PutRandomData(2, 1, 100);
+  // [0, (1)] [0, 1, 2]
+  PutRandomData(2, 1000, 100);
+  WaitForFlush(2);
+  // [0, (1)] [0, 1, (2)] [2]
+  ASSERT_EQ(CountLiveLogFiles(), 3);
+  PutRandomData(2, 1000, 100);
+  WaitForFlush(2);
+  // [0, (1)] [0, 1, (2)] [(2)] [2]
+  ASSERT_EQ(CountLiveLogFiles(), 4);
+  PutRandomData(3, 1, 100);
+  // [0, (1)] [0, 1, (2)] [(2)] [2, 3]
+  PutRandomData(1, 1, 100);
+  // [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3]
+  ASSERT_EQ(CountLiveLogFiles(), 4);
+  PutRandomData(1, 1000, 100);
+  WaitForFlush(1);
+  // [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1]
+  ASSERT_EQ(CountLiveLogFiles(), 5);
+  PutRandomData(0, 1000, 100);
+  WaitForFlush(0);
+  // [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0]
+  // delete obsolete logs -->
+  // [(1), 2, 3] [1, (0)] [0]
+  ASSERT_EQ(CountLiveLogFiles(), 3);
+  PutRandomData(0, 1000, 100);
+  WaitForFlush(0);
+  // [(1), 2, 3] [1, (0)], [(0)] [0]
+  ASSERT_EQ(CountLiveLogFiles(), 4);
+  PutRandomData(1, 1000, 100);
+  WaitForFlush(1);
+  // [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1]
+  ASSERT_EQ(CountLiveLogFiles(), 5);
+  PutRandomData(2, 1000, 100);
+  WaitForFlush(2);
+  // [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2]
+  ASSERT_EQ(CountLiveLogFiles(), 6);
+  PutRandomData(3, 1000, 100);
+  WaitForFlush(3);
+  // [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3]
+  // delete obsolete logs -->
+  // [0, (1)] [1, (2)], [2, (3)] [3]
+  ASSERT_EQ(CountLiveLogFiles(), 4);
+  Close();
+}
+
+// Makes sure column families with different write buffer sizes and flush
+// triggers flush independently of each other.
+TEST(ColumnFamilyTest, DifferentWriteBufferSizes) {
+  // disable flushing stale column families
+  db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+  Open();
+  CreateColumnFamilies({"one", "two", "three"});
+  ColumnFamilyOptions default_cf, one, two, three;
+  // setup options. all column families have max_write_buffer_number setup to 10
+  // "default" -> 100KB memtable, start flushing immediatelly
+  // "one" -> 200KB memtable, start flushing with two immutable memtables
+  // "two" -> 1MB memtable, start flushing with three immutable memtables
+  // "three" -> 90KB memtable, start flushing with four immutable memtables
+  default_cf.write_buffer_size = 100000;
+  default_cf.max_write_buffer_number = 10;
+  default_cf.min_write_buffer_number_to_merge = 1;
+  one.write_buffer_size = 200000;
+  one.max_write_buffer_number = 10;
+  one.min_write_buffer_number_to_merge = 2;
+  two.write_buffer_size = 1000000;
+  two.max_write_buffer_number = 10;
+  two.min_write_buffer_number_to_merge = 3;
+  three.write_buffer_size = 90000;
+  three.max_write_buffer_number = 10;
+  three.min_write_buffer_number_to_merge = 4;
+
+  Reopen({default_cf, one, two, three});
+
+  // after each burst of writes, check how many memtables became immutable
+  // (per column family) and how many WAL files are still alive
+  int micros_wait_for_flush = 10000;
+  PutRandomData(0, 100, 1000);
+  WaitForFlush(0);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 1);
+  PutRandomData(1, 200, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 0, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 2);
+  PutRandomData(2, 1000, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 1, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 3);
+  PutRandomData(2, 1000, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 4);
+  PutRandomData(3, 90, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 1});
+  ASSERT_EQ(CountLiveLogFiles(), 5);
+  PutRandomData(3, 90, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 2});
+  ASSERT_EQ(CountLiveLogFiles(), 6);
+  PutRandomData(3, 90, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 3});
+  ASSERT_EQ(CountLiveLogFiles(), 7);
+  PutRandomData(0, 100, 1000);
+  WaitForFlush(0);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 3});
+  ASSERT_EQ(CountLiveLogFiles(), 8);
+  PutRandomData(2, 100, 10000);
+  WaitForFlush(2);
+  AssertNumberOfImmutableMemtables({0, 1, 0, 3});
+  ASSERT_EQ(CountLiveLogFiles(), 9);
+  PutRandomData(3, 90, 1000);
+  WaitForFlush(3);
+  AssertNumberOfImmutableMemtables({0, 1, 0, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 10);
+  PutRandomData(3, 90, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 0, 1});
+  ASSERT_EQ(CountLiveLogFiles(), 11);
+  PutRandomData(1, 200, 1000);
+  WaitForFlush(1);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 1});
+  ASSERT_EQ(CountLiveLogFiles(), 5);
+  PutRandomData(3, 90*6, 1000);
+  WaitForFlush(3);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 12);
+  PutRandomData(0, 100, 1000);
+  WaitForFlush(0);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 12);
+  PutRandomData(2, 3*100, 10000);
+  WaitForFlush(2);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 12);
+  PutRandomData(1, 2*200, 1000);
+  WaitForFlush(1);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 7);
+  Close();
+}
+
+// Each column family can have its own merge operator (or none): merging
+// without an operator returns NotSupported; uint64-add and string-append
+// operators produce different results for the same operations.
+TEST(ColumnFamilyTest, DifferentMergeOperators) {
+  Open();
+  CreateColumnFamilies({"first", "second"});
+  ColumnFamilyOptions default_cf, first, second;
+  first.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  second.merge_operator = MergeOperators::CreateStringAppendOperator();
+  Reopen({default_cf, first, second});
+
+  std::string one, two, three;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+
+  // default has no merge operator -- Merge() is not supported
+  ASSERT_OK(Put(0, "foo", two));
+  ASSERT_OK(Put(0, "foo", one));
+  ASSERT_TRUE(Merge(0, "foo", two).IsNotSupported());
+  ASSERT_EQ(Get(0, "foo"), one);
+
+  // "first": 1 + 2 == 3 with the uint64-add operator
+  ASSERT_OK(Put(1, "foo", two));
+  ASSERT_OK(Put(1, "foo", one));
+  ASSERT_OK(Merge(1, "foo", two));
+  ASSERT_EQ(Get(1, "foo"), three);
+
+  // "second": string-append operator concatenates with ','
+  ASSERT_OK(Put(2, "foo", two));
+  ASSERT_OK(Put(2, "foo", one));
+  ASSERT_OK(Merge(2, "foo", two));
+  ASSERT_EQ(Get(2, "foo"), one + "," + two);
+  Close();
+}
+
+// Each column family can run its own compaction configuration: "default"
+// uses level style with read (seek) compaction enabled, "one" uses universal
+// style, and "two" uses level style with 4 levels.
+TEST(ColumnFamilyTest, DifferentCompactionStyles) {
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  ColumnFamilyOptions default_cf, one, two;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.disableDataSync = true;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.filter_policy = nullptr;
+  default_cf.no_block_cache = true;
+  default_cf.source_compaction_factor = 100;
+  default_cf.disable_seek_compaction = false;
+
+  one.compaction_style = kCompactionStyleUniversal;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 100000;
+
+  two.compaction_style = kCompactionStyleLevel;
+  two.num_levels = 4;
+  two.max_mem_compaction_level = 0;
+  two.level0_file_num_compaction_trigger = 3;
+  two.write_buffer_size = 100000;
+
+  Reopen({default_cf, one, two});
+
+  // SETUP column family "default" - test read compaction
+  ASSERT_EQ("", FilesPerLevel(0));
+  PutRandomData(0, 1, 4096);
+  ASSERT_OK(Flush(0));
+  ASSERT_EQ("0,0,1", FilesPerLevel(0));
+  // write 8MB
+  PutRandomData(0, 2000, 4096);
+  ASSERT_OK(Flush(0));
+  // clear levels 0 and 1
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[0]);
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[0]);
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);
+  // write some new keys into level 0 and 1
+  PutRandomData(0, 1024, 512);
+  ASSERT_OK(Flush(0));
+  WaitForCompaction();
+  PutRandomData(0, 10, 512);
+  ASSERT_OK(Flush(0));
+  // remember number of files in each level
+  int l1 = NumTableFilesAtLevel(0, 0);
+  int l2 = NumTableFilesAtLevel(1, 0);
+  int l3 = NumTableFilesAtLevel(2, 0);
+  ASSERT_NE(l1, 0);
+  ASSERT_NE(l2, 0);
+  ASSERT_NE(l3, 0);
+
+  // SETUP column family "one" -- universal style
+  // stop one file short of the compaction trigger
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
+    PutRandomData(1, 11, 10000);
+    WaitForFlush(1);
+    ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(1));
+  }
+
+  // SETUP column family "two" -- level style with 4 levels
+  // stop one file short of the compaction trigger
+  for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) {
+    PutRandomData(2, 15, 10000);
+    WaitForFlush(2);
+    ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(2));
+  }
+
+  // TRIGGER compaction "default"
+  // read a bunch of times, trigger read compaction
+  for (int i = 0; i < 200000; ++i) {
+    Get(0, std::to_string(i));
+  }
+
+  // TRIGGER compaction "one"
+  PutRandomData(1, 12, 10000);
+
+  // TRIGGER compaction "two"
+  PutRandomData(2, 10, 10000);
+
+  // WAIT for compactions
+  WaitForCompaction();
+
+  // VERIFY compaction "default"
+  // verify that the number of files have decreased
+  // in some level, indicating that there was a compaction
+  ASSERT_TRUE(NumTableFilesAtLevel(0, 0) < l1 ||
+              NumTableFilesAtLevel(1, 0) < l2 ||
+              NumTableFilesAtLevel(2, 0) < l3);
+
+  // VERIFY compaction "one"
+  ASSERT_EQ("1", FilesPerLevel(1));
+
+  // VERIFY compaction "two"
+  ASSERT_EQ("0,1", FilesPerLevel(2));
+  CompactAll(2);
+  ASSERT_EQ("0,1", FilesPerLevel(2));
+
+  Close();
+}
+
+namespace {
+// Renders an iterator's current position as "key->value", or "(invalid)"
+// when the iterator is not positioned on an entry.
+std::string IterStatus(Iterator* iter) {
+  if (!iter->Valid()) {
+    return "(invalid)";
+  }
+  return iter->key().ToString() + "->" + iter->value().ToString();
+}
+}  // anonymous namespace
+
+// NewIterators() returns one iterator per handle; only tailing iterators
+// observe writes made after the iterators were created.
+TEST(ColumnFamilyTest, NewIteratorsTest) {
+  // iter == 0 -- no tailing
+  // iter == 1 -- tailing
+  for (int iter = 0; iter < 2; ++iter) {
+    Open();
+    CreateColumnFamiliesAndReopen({"one", "two"});
+    ASSERT_OK(Put(0, "a", "b"));
+    ASSERT_OK(Put(1, "b", "a"));
+    ASSERT_OK(Put(2, "c", "m"));
+    ASSERT_OK(Put(2, "v", "t"));
+    std::vector<Iterator*> iterators;
+    ReadOptions options;
+    options.tailing = (iter == 1);
+    ASSERT_OK(db_->NewIterators(options, handles_, &iterators));
+
+    for (auto it : iterators) {
+      it->SeekToFirst();
+    }
+    ASSERT_EQ(IterStatus(iterators[0]), "a->b");
+    ASSERT_EQ(IterStatus(iterators[1]), "b->a");
+    ASSERT_EQ(IterStatus(iterators[2]), "c->m");
+
+    // written after the iterators were created
+    ASSERT_OK(Put(1, "x", "x"));
+
+    for (auto it : iterators) {
+      it->Next();
+    }
+
+    ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
+    if (iter == 0) {
+      // no tailing
+      ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
+    } else {
+      // tailing
+      ASSERT_EQ(IterStatus(iterators[1]), "x->x");
+    }
+    ASSERT_EQ(IterStatus(iterators[2]), "v->t");
+
+    for (auto it : iterators) {
+      delete it;
+    }
+    Destroy();
+  }
+}
+
+// A read-only DB may open just a subset of the existing column families,
+// but must include "default" and cannot include dropped families.
+TEST(ColumnFamilyTest, ReadOnlyDBTest) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+  ASSERT_OK(Put(1, "foo", "bla"));
+  ASSERT_OK(Put(2, "foo", "blabla"));
+  ASSERT_OK(Put(3, "foo", "blablabla"));
+  ASSERT_OK(Put(4, "foo", "blablablabla"));
+
+  DropColumnFamilies({2});
+  Close();
+  // open only a subset of column families
+  AssertOpenReadOnly({"default", "one", "four"});
+  ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+  ASSERT_EQ("bla", Get(1, "foo"));
+  // handle index 2 now refers to "four"
+  ASSERT_EQ("blablablabla", Get(2, "foo"));
+
+  Close();
+  // can't open dropped column family
+  Status s = OpenReadOnly({"default", "one", "two"});
+  ASSERT_TRUE(!s.ok());
+
+  // Can't open without specifying default column family
+  s = OpenReadOnly({"one", "four"});
+  ASSERT_TRUE(!s.ok());
+}
+
+// Flushing all column families at once should create exactly one new
+// writable file per column family plus one more (presumably the single new
+// WAL shared by all families -- i.e. no log is rolled per empty flush).
+TEST(ColumnFamilyTest, DontRollEmptyLogs) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+
+  for (size_t i = 0; i < handles_.size(); ++i) {
+    PutRandomData(i, 10, 100);
+  }
+  int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls();
+  // this will trigger the flushes
+  ASSERT_OK(db_->Write(WriteOptions(), nullptr));
+
+  for (int i = 0; i < 4; ++i) {
+    dbfull()->TEST_WaitForFlushMemTable(handles_[i]);
+  }
+  int total_new_writable_files =
+      env_->GetNumberOfNewWritableFileCalls() - num_writable_file_start;
+  ASSERT_EQ(static_cast<size_t>(total_new_writable_files), handles_.size() + 1);
+  Close();
+}
+
+// When max_total_wal_size is exceeded, column families with old unflushed
+// data ("stale" families) get flushed so their logs can be released; empty
+// column families are not flushed.
+TEST(ColumnFamilyTest, FlushStaleColumnFamilies) {
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  ColumnFamilyOptions default_cf, one, two;
+  default_cf.write_buffer_size = 100000;  // small write buffer size
+  default_cf.disable_auto_compactions = true;
+  one.disable_auto_compactions = true;
+  two.disable_auto_compactions = true;
+  db_options_.max_total_wal_size = 210000;
+
+  Reopen({default_cf, one, two});
+
+  PutRandomData(2, 1, 10);  // 10 bytes
+  for (int i = 0; i < 2; ++i) {
+    PutRandomData(0, 100, 1000);  // flush
+    WaitForFlush(0);
+    ASSERT_EQ(i + 1, CountLiveFiles());
+  }
+  // third flush. now, CF [two] should be detected as stale and flushed
+  // column family 1 should not be flushed since it's empty
+  PutRandomData(0, 100, 1000);  // flush
+  WaitForFlush(0);
+  WaitForFlush(2);
+  // 3 files for default column families, 1 file for column family [two], zero
+  // files for column family [one], because it's empty
+  ASSERT_EQ(4, CountLiveFiles());
+  Close();
+}
+
+}  // namespace rocksdb
+
+// Runs every TEST() registered in this file via the test harness.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/compaction.cc b/db/compaction.cc
new file mode 100644 (file)
index 0000000..bafb5b4
--- /dev/null
@@ -0,0 +1,261 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction.h"
+#include "db/column_family.h"
+
+namespace rocksdb {
+
+// Sum the sizes of the given files, stopping at the first null entry
+// (a null marks the end of the meaningful portion of the list).
+static uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  uint64_t total = 0;
+  for (const auto* file : files) {
+    if (file == nullptr) {
+      break;
+    }
+    total += file->file_size;
+  }
+  return total;
+}
+
+// Construct a compaction of files from `level` into `out_level`.
+// Takes a reference on both the column family and the input version so they
+// stay alive for the lifetime of the compaction; the references are dropped
+// in the destructor or in ReleaseInputs().
+Compaction::Compaction(Version* input_version, int level, int out_level,
+                       uint64_t target_file_size,
+                       uint64_t max_grandparent_overlap_bytes,
+                       bool seek_compaction, bool enable_compression)
+    : level_(level),
+      out_level_(out_level),
+      max_output_file_size_(target_file_size),
+      max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes),
+      input_version_(input_version),
+      number_levels_(input_version_->NumberLevels()),
+      cfd_(input_version_->cfd_),
+      seek_compaction_(seek_compaction),
+      enable_compression_(enable_compression),
+      grandparent_index_(0),
+      seen_key_(false),
+      overlapped_bytes_(0),
+      base_index_(-1),
+      parent_index_(-1),
+      score_(0),
+      bottommost_level_(false),
+      is_full_compaction_(false),
+      is_manual_compaction_(false),
+      level_ptrs_(std::vector<size_t>(number_levels_)) {
+
+  cfd_->Ref();
+  input_version_->Ref();
+  // The edit that will describe this compaction's result; owned by this
+  // object and freed in the destructor.
+  edit_ = new VersionEdit();
+  edit_->SetColumnFamily(cfd_->GetID());
+  // Start every per-level file cursor (used by IsBaseLevelForKey) at zero.
+  for (int i = 0; i < number_levels_; i++) {
+    level_ptrs_[i] = 0;
+  }
+}
+
+// Free the owned VersionEdit and drop the references taken in the
+// constructor, unless ReleaseInputs() already dropped them (it nulls the
+// pointers, so the checks below make double-unref impossible).
+Compaction::~Compaction() {
+  delete edit_;
+  if (input_version_ != nullptr) {
+    input_version_->Unref();
+  }
+  if (cfd_ != nullptr) {
+    // Unref() returns true when this was the last reference.
+    if (cfd_->Unref()) {
+      delete cfd_;
+    }
+  }
+}
+
+// A compaction is a "trivial move" when a single input file can simply be
+// reassigned to the output level without merging.
+bool Compaction::IsTrivialMove() const {
+  // Avoid a move if there is lots of overlapping grandparent data.
+  // Otherwise, the move could create a parent file that will require
+  // a very expensive merge later on.
+  // If level_== out_level_, the purpose is to force compaction filter to be
+  // applied to that level, and thus cannot be a trivia move.
+  return (level_ != out_level_ &&
+          num_input_files(0) == 1 &&
+          num_input_files(1) == 0 &&
+          TotalFileSize(grandparents_) <= max_grandparent_overlap_bytes_);
+}
+
+// Record the deletion of every input file of this compaction in *edit so
+// that applying the edit removes them from the version.
+void Compaction::AddInputDeletions(VersionEdit* edit) {
+  // Input set 0 lives at level_, input set 1 at level_ + 1.
+  for (int which = 0; which < 2; which++) {
+    for (const auto* file : inputs_[which]) {
+      edit->DeleteFile(level_ + which, file->number);
+    }
+  }
+}
+
+// Returns true if no file in any level below the output level can contain
+// user_key, i.e. the compaction output is the "base" level for that key.
+// NOTE(review): level_ptrs_ cursors only ever advance, so this appears to
+// assume callers probe keys in increasing order -- confirm with the caller.
+bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
+  if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
+    return bottommost_level_;
+  }
+  // Maybe use binary search to find right entry instead of linear search?
+  const Comparator* user_cmp = cfd_->user_comparator();
+  for (int lvl = level_ + 2; lvl < number_levels_; lvl++) {
+    const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
+    // Advance this level's cursor past files that end before user_key.
+    for (; level_ptrs_[lvl] < files.size(); ) {
+      FileMetaData* f = files[level_ptrs_[lvl]];
+      if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
+        // We've advanced far enough
+        if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
+          // Key falls in this file's range, so definitely not base level
+          return false;
+        }
+        break;
+      }
+      level_ptrs_[lvl]++;
+    }
+  }
+  return true;
+}
+
+// Decide whether the current output file should be closed before writing
+// `internal_key`, based on how many bytes of grandparent files the output
+// has overlapped so far.
+bool Compaction::ShouldStopBefore(const Slice& internal_key) {
+  // Scan to find earliest grandparent file that contains key.
+  const InternalKeyComparator* icmp = &cfd_->internal_comparator();
+  while (grandparent_index_ < grandparents_.size() &&
+      icmp->Compare(internal_key,
+                    grandparents_[grandparent_index_]->largest.Encode()) > 0) {
+    // Only count bytes of files we skipped after the first output key;
+    // files skipped before any key was emitted never overlap the output.
+    if (seen_key_) {
+      overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
+    }
+    // Grandparent files must be sorted and non-overlapping.
+    assert(grandparent_index_ + 1 >= grandparents_.size() ||
+           icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(),
+                         grandparents_[grandparent_index_+1]->smallest.Encode())
+                         < 0);
+    grandparent_index_++;
+  }
+  seen_key_ = true;
+
+  if (overlapped_bytes_ > max_grandparent_overlap_bytes_) {
+    // Too much overlap for current output; start new output
+    overlapped_bytes_ = 0;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Mark (or clear) each file that is being compacted
+void Compaction::MarkFilesBeingCompacted(bool value) {
+  for (int i = 0; i < 2; i++) {
+    std::vector<FileMetaData*> v = inputs_[i];
+    for (unsigned int j = 0; j < inputs_[i].size(); j++) {
+      assert(value ? !inputs_[i][j]->being_compacted :
+                      inputs_[i][j]->being_compacted);
+      inputs_[i][j]->being_compacted = value;
+    }
+  }
+}
+
+// Is this compaction producing files at the bottommost level?
+void Compaction::SetupBottomMostLevel(bool isManual) {
+  if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
+    // If universal compaction style is used and manual
+    // compaction is occuring, then we are guaranteed that
+    // all files will be picked in a single compaction
+    // run. We can safely set bottommost_level_ = true.
+    // If it is not manual compaction, then bottommost_level_
+    // is already set when the Compaction was created.
+    if (isManual) {
+      bottommost_level_ = true;
+    }
+    return;
+  }
+  // Level style: bottommost iff every level below the output level is empty.
+  bottommost_level_ = true;
+  for (int i = output_level() + 1; i < number_levels_; i++) {
+    if (input_version_->NumLevelFiles(i) > 0) {
+      bottommost_level_ = false;
+      break;
+    }
+  }
+}
+
+// Drop the references on the input version and the column family taken in
+// the constructor.  Pointers are nulled so the destructor (which performs
+// the same checks) will not unref a second time.
+void Compaction::ReleaseInputs() {
+  if (input_version_ != nullptr) {
+    input_version_->Unref();
+    input_version_ = nullptr;
+  }
+  if (cfd_ != nullptr) {
+    // Unref() returns true when this was the last reference.
+    if (cfd_->Unref()) {
+      delete cfd_;
+    }
+    cfd_ = nullptr;
+  }
+}
+
+// Delegate to the column family's compaction picker, which clears the input
+// files' being_compacted flags and removes this compaction from its set of
+// running compactions.
+void Compaction::ReleaseCompactionFiles(Status status) {
+  cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
+}
+
+// After a failed compaction, reset the version's next-file-to-compact
+// cursor for this level so its files can be considered again.
+void Compaction::ResetNextCompactionIndex() {
+  input_version_->ResetNextCompactionIndex(level_);
+}
+
+/*
+Write a human-readable size string into `output` (at most `len` bytes,
+NUL-terminated by snprintf):
+for sizes >=10TB, print "XXTB"
+for sizes >=10GB, print "XXGB"
+etc., falling back to plain bytes ("XXB") below 10KB.
+*/
+static void FileSizeSummary(unsigned long long sz, char* output, int len) {
+  const unsigned long long ull10 = 10;
+  if (sz >= ull10<<40) {
+    snprintf(output, len, "%lluTB", sz>>40);
+  } else if (sz >= ull10<<30) {
+    snprintf(output, len, "%lluGB", sz>>30);
+  } else if (sz >= ull10<<20) {
+    snprintf(output, len, "%lluMB", sz>>20);
+  } else if (sz >= ull10<<10) {
+    snprintf(output, len, "%lluKB", sz>>10);
+  } else {
+    snprintf(output, len, "%lluB", sz);
+  }
+}
+
+static int InputSummary(std::vector<FileMetaData*>& files, char* output,
+                         int len) {
+  *output = '\0';
+  int write = 0;
+  for (unsigned int i = 0; i < files.size(); i++) {
+    int sz = len - write;
+    int ret;
+    char sztxt[16];
+    FileSizeSummary((unsigned long long)files.at(i)->file_size, sztxt, 16);
+    ret = snprintf(output + write, sz, "%lu(%s) ",
+                   (unsigned long)files.at(i)->number,
+                   sztxt);
+    if (ret < 0 || ret >= sz)
+      break;
+    write += ret;
+  }
+  return write;
+}
+
+// Render a one-line human-readable summary of this compaction (version,
+// level, trigger, and both input file lists) into `output`.  Each snprintf
+// return value is checked so appending stops as soon as `len` is exhausted.
+void Compaction::Summary(char* output, int len) {
+  int write = snprintf(output, len,
+      "Base version %lu Base level %d, seek compaction:%d, inputs: [",
+      (unsigned long)input_version_->GetVersionNumber(),
+      level_,
+      seek_compaction_);
+  if (write < 0 || write >= len) {
+    return;
+  }
+
+  write += InputSummary(inputs_[0], output+write, len-write);
+  if (write < 0 || write >= len) {
+    return;
+  }
+
+  write += snprintf(output+write, len-write, "],[");
+  if (write < 0 || write >= len) {
+    return;
+  }
+
+  write += InputSummary(inputs_[1], output+write, len-write);
+  if (write < 0 || write >= len) {
+    return;
+  }
+
+  snprintf(output+write, len-write, "]");
+}
+
+}  // namespace rocksdb
diff --git a/db/compaction.h b/db/compaction.h
new file mode 100644 (file)
index 0000000..8fd95f9
--- /dev/null
@@ -0,0 +1,151 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "db/version_set.h"
+
+namespace rocksdb {
+
+class Version;
+class ColumnFamilyData;
+
+// A Compaction encapsulates information about a compaction.
+// Instances are created only by the compaction pickers (see the friend
+// declarations and the private constructor below).
+class Compaction {
+ public:
+  ~Compaction();
+
+  // Return the level that is being compacted.  Inputs from "level"
+  // will be merged.
+  int level() const { return level_; }
+
+  // Outputs will go to this level
+  int output_level() const { return out_level_; }
+
+  // Return the object that holds the edits to the descriptor done
+  // by this compaction.
+  VersionEdit* edit() { return edit_; }
+
+  // "which" must be either 0 or 1
+  int num_input_files(int which) const { return inputs_[which].size(); }
+
+  // Returns input version of the compaction
+  Version* input_version() const { return input_version_; }
+
+  // Returns the column family this compaction belongs to.
+  ColumnFamilyData* column_family_data() const { return cfd_; }
+
+  // Return the ith input file at "level()+which" ("which" must be 0 or 1).
+  FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
+
+  // Returns a mutable pointer to the whole input file list at
+  // "level()+which" ("which" must be 0 or 1).
+  std::vector<FileMetaData*>* inputs(int which) { return &inputs_[which]; }
+
+  // Maximum size of files to build during this compaction.
+  uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
+
+  // Whether compression will be enabled for compaction outputs
+  bool enable_compression() const { return enable_compression_; }
+
+  // Is this a trivial compaction that can be implemented by just
+  // moving a single input file to the next level (no merging or splitting)
+  bool IsTrivialMove() const;
+
+  // Add all inputs to this compaction as delete operations to *edit.
+  void AddInputDeletions(VersionEdit* edit);
+
+  // Returns true if the information we have available guarantees that
+  // the compaction is producing data in "level+1" for which no data exists
+  // in levels greater than "level+1".
+  bool IsBaseLevelForKey(const Slice& user_key);
+
+  // Returns true iff we should stop building the current output
+  // before processing "internal_key".
+  bool ShouldStopBefore(const Slice& internal_key);
+
+  // Release the input version for the compaction, once the compaction
+  // is successful.
+  void ReleaseInputs();
+
+  // Clear all files to indicate that they are not being compacted
+  // Delete this compaction from the list of running compactions.
+  void ReleaseCompactionFiles(Status status);
+
+  // Write a human-readable one-line description into output (capacity len).
+  void Summary(char* output, int len);
+
+  // Return the score that was used to pick this compaction run.
+  double score() const { return score_; }
+
+  // Is this compaction creating a file in the bottom most level?
+  bool BottomMostLevel() { return bottommost_level_; }
+
+  // Does this compaction include all sst files?
+  bool IsFullCompaction() { return is_full_compaction_; }
+
+  // Was this compaction triggered manually by the client?
+  bool IsManualCompaction() { return is_manual_compaction_; }
+
+ private:
+  // Only the pickers may construct Compaction objects.
+  friend class CompactionPicker;
+  friend class UniversalCompactionPicker;
+  friend class LevelCompactionPicker;
+
+  Compaction(Version* input_version, int level, int out_level,
+             uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes,
+             bool seek_compaction = false, bool enable_compression = true);
+
+  int level_;
+  int out_level_; // levels to which output files are stored
+  uint64_t max_output_file_size_;
+  uint64_t max_grandparent_overlap_bytes_;
+  Version* input_version_;
+  // Owned; describes the file additions/deletions of this compaction.
+  VersionEdit* edit_;
+  int number_levels_;
+  ColumnFamilyData* cfd_;
+
+  bool seek_compaction_;
+  bool enable_compression_;
+
+  // Each compaction reads inputs from "level_" and "level_+1"
+  std::vector<FileMetaData*> inputs_[2];      // The two sets of inputs
+
+  // State used to check for number of of overlapping grandparent files
+  // (parent == level_ + 1, grandparent == level_ + 2)
+  std::vector<FileMetaData*> grandparents_;
+  size_t grandparent_index_;  // Index in grandparent_starts_
+  bool seen_key_;             // Some output key has been seen
+  uint64_t overlapped_bytes_;  // Bytes of overlap between current output
+                              // and grandparent files
+  int base_index_;   // index of the file in files_[level_]
+  int parent_index_; // index of some file with same range in files_[level_+1]
+  double score_;     // score that was used to pick this compaction.
+
+  // Is this compaction creating a file in the bottom most level?
+  bool bottommost_level_;
+  // Does this compaction include all sst files?
+  bool is_full_compaction_;
+
+  // Is this compaction requested by the client?
+  bool is_manual_compaction_;
+
+  // level_ptrs_ holds indices into input_version_->levels_: our state
+  // is that we are positioned at one of the file ranges for each
+  // higher level than the ones involved in this compaction (i.e. for
+  // all L >= level_ + 2).
+  std::vector<size_t> level_ptrs_;
+
+  // mark (or clear) all files that are being compacted
+  void MarkFilesBeingCompacted(bool);
+
+  // Initialize whether compaction producing files at the bottommost level
+  void SetupBottomMostLevel(bool isManual);
+
+  // In case of compaction error, reset the nextIndex that is used
+  // to pick up the next file to be compacted from files_by_size_
+  void ResetNextCompactionIndex();
+};
+
+}  // namespace rocksdb
diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc
new file mode 100644 (file)
index 0000000..a8700bb
--- /dev/null
@@ -0,0 +1,889 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction_picker.h"
+
+#include <limits>
+#include "util/log_buffer.h"
+#include "util/statistics.h"
+
+namespace rocksdb {
+
+namespace {
+
+// Sum the sizes of the given files, stopping at the first null entry.
+uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  uint64_t sum = 0;
+  for (size_t i = 0; i < files.size() && files[i]; i++) {
+    sum += files[i]->file_size;
+  }
+  return sum;
+}
+
+// Multiply two operands. If they would overflow (or op2 is non-positive),
+// return op1 unchanged.
+uint64_t MultiplyCheckOverflow(uint64_t op1, int op2) {
+  if (op1 == 0) {
+    return 0;
+  }
+  if (op2 <= 0) {
+    return op1;
+  }
+  uint64_t casted_op2 = (uint64_t) op2;
+  // Overflow check: op1 * casted_op2 > UINT64_MAX  <=>
+  // UINT64_MAX / op1 < casted_op2.
+  if (std::numeric_limits<uint64_t>::max() / op1 < casted_op2) {
+    return op1;
+  }
+  return op1 * casted_op2;
+}
+
+}  // anonymous namespace
+
+// Precompute per-level limits for output file size and total level size.
+// Level 0 under universal compaction gets an unlimited file size; levels 0
+// and 1 otherwise use the base values from the options; every level >= 2 is
+// derived from the previous level via the configured multipliers, clamping
+// on overflow (see MultiplyCheckOverflow).
+CompactionPicker::CompactionPicker(const Options* options,
+                                   const InternalKeyComparator* icmp)
+    : compactions_in_progress_(options->num_levels),
+      options_(options),
+      num_levels_(options->num_levels),
+      icmp_(icmp) {
+
+  max_file_size_.reset(new uint64_t[NumberLevels()]);
+  level_max_bytes_.reset(new uint64_t[NumberLevels()]);
+  int target_file_size_multiplier = options_->target_file_size_multiplier;
+  int max_bytes_multiplier = options_->max_bytes_for_level_multiplier;
+  for (int i = 0; i < NumberLevels(); i++) {
+    if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) {
+      // Universal compaction keeps all files in level 0.
+      max_file_size_[i] = ULLONG_MAX;
+      level_max_bytes_[i] = options_->max_bytes_for_level_base;
+    } else if (i > 1) {
+      max_file_size_[i] = MultiplyCheckOverflow(max_file_size_[i - 1],
+                                                target_file_size_multiplier);
+      level_max_bytes_[i] = MultiplyCheckOverflow(
+          MultiplyCheckOverflow(level_max_bytes_[i - 1], max_bytes_multiplier),
+          options_->max_bytes_for_level_multiplier_additional[i - 1]);
+    } else {
+      // Levels 0 and 1 share the base values.
+      max_file_size_[i] = options_->target_file_size_base;
+      level_max_bytes_[i] = options_->max_bytes_for_level_base;
+    }
+  }
+}
+
+// Nothing to do: the per-level arrays are owned by smart-pointer members
+// (see the reset() calls in the constructor).
+CompactionPicker::~CompactionPicker() {}
+
+void CompactionPicker::SizeBeingCompacted(std::vector<uint64_t>& sizes) {
+  for (int level = 0; level < NumberLevels() - 1; level++) {
+    uint64_t total = 0;
+    for (auto c : compactions_in_progress_[level]) {
+      assert(c->level() == level);
+      for (int i = 0; i < c->num_input_files(0); i++) {
+        total += c->input(0,i)->file_size;
+      }
+    }
+    sizes[level] = total;
+  }
+}
+
+// Clear all files to indicate that they are not being compacted
+// Delete this compaction from the list of running compactions.
+// On failure (!status.ok()), also rewind the version's next-compaction
+// cursor so the same files can be retried.
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
+  c->MarkFilesBeingCompacted(false);
+  compactions_in_progress_[c->level()].erase(c);
+  if (!status.ok()) {
+    c->ResetNextCompactionIndex();
+  }
+}
+
+// Return the precomputed target output-file size for the given level
+// (filled in by the constructor).
+uint64_t CompactionPicker::MaxFileSizeForLevel(int level) const {
+  assert(level >= 0);
+  assert(level < NumberLevels());
+  return max_file_size_[level];
+}
+
+// Maximum number of grandparent-file bytes a single output file may
+// overlap: the level's target file size scaled by the configured factor.
+uint64_t CompactionPicker::MaxGrandParentOverlapBytes(int level) {
+  uint64_t result = MaxFileSizeForLevel(level);
+  result *= options_->max_grandparent_overlap_factor;
+  return result;
+}
+
+// Return the precomputed total-bytes limit for the given level.
+// NOTE(review): the return type is double while level_max_bytes_ stores
+// uint64_t; values above 2^53 would lose precision -- confirm intended.
+double CompactionPicker::MaxBytesForLevel(int level) {
+  // Note: the result for level zero is not really used since we set
+  // the level-0 compaction threshold based on number of files.
+  assert(level >= 0);
+  assert(level < NumberLevels());
+  return level_max_bytes_[level];
+}
+
+void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs,
+                                InternalKey* smallest, InternalKey* largest) {
+  assert(!inputs.empty());
+  smallest->Clear();
+  largest->Clear();
+  for (size_t i = 0; i < inputs.size(); i++) {
+    FileMetaData* f = inputs[i];
+    if (i == 0) {
+      *smallest = f->smallest;
+      *largest = f->largest;
+    } else {
+      if (icmp_->Compare(f->smallest, *smallest) < 0) {
+        *smallest = f->smallest;
+      }
+      if (icmp_->Compare(f->largest, *largest) > 0) {
+        *largest = f->largest;
+      }
+    }
+  }
+}
+
+void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs1,
+                                const std::vector<FileMetaData*>& inputs2,
+                                InternalKey* smallest, InternalKey* largest) {
+  std::vector<FileMetaData*> all = inputs1;
+  all.insert(all.end(), inputs2.begin(), inputs2.end());
+  GetRange(all, smallest, largest);
+}
+
+// Grow c->inputs_[0] until it forms a "clean cut" against neighboring files
+// at the same level, so no user key is split between compacted and
+// uncompacted files.  Returns false (and clears the inputs) if the expanded
+// set cannot be compacted right now.
+bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) {
+  // If inputs are empty then there is nothing to expand.
+  if (!c || c->inputs_[0].empty()) {
+    return true;
+  }
+
+  // GetOverlappingInputs will always do the right thing for level-0.
+  // So we don't need to do any expansion if level == 0.
+  if (c->level() == 0) {
+    return true;
+  }
+
+  const int level = c->level();
+  InternalKey smallest, largest;
+
+  // Keep expanding c->inputs_[0] until we are sure that there is a
+  // "clean cut" boundary between the files in input and the surrounding files.
+  // This will ensure that no parts of a key are lost during compaction.
+  int hint_index = -1;
+  size_t old_size;
+  // Fixed-point iteration: re-query overlaps until the set stops growing.
+  do {
+    old_size = c->inputs_[0].size();
+    GetRange(c->inputs_[0], &smallest, &largest);
+    c->inputs_[0].clear();
+    c->input_version_->GetOverlappingInputs(
+        level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index);
+  } while(c->inputs_[0].size() > old_size);
+
+  // Get the new range
+  GetRange(c->inputs_[0], &smallest, &largest);
+
+  // If, after the expansion, there are files that are already under
+  // compaction, then we must drop/cancel this compaction.
+  int parent_index = -1;
+  if (c->inputs_[0].empty()) {
+    Log(options_->info_log,
+        "[%s] ExpandWhileOverlapping() failure because zero input files",
+        c->column_family_data()->GetName().c_str());
+  }
+  // Bail out when the set became empty, any input is already being
+  // compacted, or (for a real level change) the parent range is busy.
+  if (c->inputs_[0].empty() || FilesInCompaction(c->inputs_[0]) ||
+      (c->level() != c->output_level() &&
+       ParentRangeInCompaction(c->input_version_, &smallest, &largest, level,
+                               &parent_index))) {
+    c->inputs_[0].clear();
+    c->inputs_[1].clear();
+    return false;
+  }
+  return true;
+}
+
+// Upper bound on the total bytes allowed when opportunistically expanding
+// a compaction's input set: target file size times the configured factor.
+uint64_t CompactionPicker::ExpandedCompactionByteSizeLimit(int level) {
+  uint64_t result = MaxFileSizeForLevel(level);
+  result *= options_->expanded_compaction_factor;
+  return result;
+}
+
+// Return true iff at least one of the given files is currently part of a
+// running compaction.
+bool CompactionPicker::FilesInCompaction(std::vector<FileMetaData*>& files) {
+  for (const auto* file : files) {
+    if (file->being_compacted) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns true if any one of the parent files are being compacted
+// *parent_index is used both as a search hint on entry and is updated to
+// the position found by GetOverlappingInputs.
+bool CompactionPicker::ParentRangeInCompaction(Version* version,
+                                               const InternalKey* smallest,
+                                               const InternalKey* largest,
+                                               int level, int* parent_index) {
+  std::vector<FileMetaData*> inputs;
+  assert(level + 1 < NumberLevels());
+
+  version->GetOverlappingInputs(level + 1, smallest, largest, &inputs,
+                                *parent_index, parent_index);
+  return FilesInCompaction(inputs);
+}
+
+// Populates the set of inputs from "level+1" that overlap with "level".
+// Will also attempt to expand "level" if that doesn't expand "level+1"
+// or cause "level" to include a file for compaction that has an overlapping
+// user-key with another file.
+void CompactionPicker::SetupOtherInputs(Compaction* c) {
+  // If inputs are empty, then there is nothing to expand.
+  // If both input and output levels are the same, no need to consider
+  // files at level "level+1"
+  if (c->inputs_[0].empty() || c->level() == c->output_level()) {
+    return;
+  }
+
+  const int level = c->level();
+  InternalKey smallest, largest;
+
+  // Get the range one last time.
+  GetRange(c->inputs_[0], &smallest, &largest);
+
+  // Populate the set of next-level files (inputs_[1]) to include in compaction
+  c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest,
+                                          &c->inputs_[1], c->parent_index_,
+                                          &c->parent_index_);
+
+  // Get entire range covered by compaction
+  InternalKey all_start, all_limit;
+  GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+
+  // See if we can further grow the number of inputs in "level" without
+  // changing the number of "level+1" files we pick up. We also choose NOT
+  // to expand if this would cause "level" to include some entries for some
+  // user key, while excluding other entries for the same user key. This
+  // can happen when one user key spans multiple files.
+  if (!c->inputs_[1].empty()) {
+    std::vector<FileMetaData*> expanded0;
+    c->input_version_->GetOverlappingInputs(
+        level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr);
+    const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]);
+    const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]);
+    const uint64_t expanded0_size = TotalFileSize(expanded0);
+    uint64_t limit = ExpandedCompactionByteSizeLimit(level);
+    // Expansion is accepted only when it actually adds files, stays under
+    // the byte-size limit, touches no file already being compacted, and
+    // does not split a user key across picked/unpicked files.
+    if (expanded0.size() > c->inputs_[0].size() &&
+        inputs1_size + expanded0_size < limit &&
+        !FilesInCompaction(expanded0) &&
+        !c->input_version_->HasOverlappingUserKey(&expanded0, level)) {
+      InternalKey new_start, new_limit;
+      GetRange(expanded0, &new_start, &new_limit);
+      std::vector<FileMetaData*> expanded1;
+      c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit,
+                                              &expanded1, c->parent_index_,
+                                              &c->parent_index_);
+      // Commit the expansion only if the set of level+1 files is unchanged.
+      if (expanded1.size() == c->inputs_[1].size() &&
+          !FilesInCompaction(expanded1)) {
+        Log(options_->info_log,
+            "[%s] Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu "
+            "bytes)\n",
+            c->column_family_data()->GetName().c_str(), (unsigned long)level,
+            (unsigned long)(c->inputs_[0].size()),
+            (unsigned long)(c->inputs_[1].size()), (unsigned long)inputs0_size,
+            (unsigned long)inputs1_size, (unsigned long)(expanded0.size()),
+            (unsigned long)(expanded1.size()), (unsigned long)expanded0_size,
+            (unsigned long)inputs1_size);
+        smallest = new_start;
+        largest = new_limit;
+        c->inputs_[0] = expanded0;
+        c->inputs_[1] = expanded1;
+        GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+      }
+    }
+  }
+
+  // Compute the set of grandparent files that overlap this compaction
+  // (parent == level+1; grandparent == level+2)
+  if (level + 2 < NumberLevels()) {
+    c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
+                                            &c->grandparents_);
+  }
+}
+
+
+// Build a manual compaction over [begin, end] at input_level, producing
+// files at output_level.  On return, *compaction_end is set to nullptr when
+// the whole range was covered, or to the first key left uncompacted when
+// the range had to be truncated.  Returns nullptr if nothing overlaps.
+// NOTE(review): compaction_end is dereferenced twice (**compaction_end)
+// without a null check -- callers must pass a valid pointer-to-pointer;
+// confirm at the call sites.
+Compaction* CompactionPicker::CompactRange(Version* version, int input_level,
+                                           int output_level,
+                                           const InternalKey* begin,
+                                           const InternalKey* end,
+                                           InternalKey** compaction_end) {
+  std::vector<FileMetaData*> inputs;
+  bool covering_the_whole_range = true;
+
+  // All files are 'overlapping' in universal style compaction.
+  // We have to compact the entire range in one shot.
+  if (options_->compaction_style == kCompactionStyleUniversal) {
+    begin = nullptr;
+    end = nullptr;
+  }
+  version->GetOverlappingInputs(input_level, begin, end, &inputs);
+  if (inputs.empty()) {
+    return nullptr;
+  }
+
+  // Avoid compacting too much in one shot in case the range is large.
+  // But we cannot do this for level-0 since level-0 files can overlap
+  // and we must not pick one file and drop another older file if the
+  // two files overlap.
+  if (input_level > 0) {
+    const uint64_t limit =
+        MaxFileSizeForLevel(input_level) * options_->source_compaction_factor;
+    uint64_t total = 0;
+    for (size_t i = 0; i + 1 < inputs.size(); ++i) {
+      uint64_t s = inputs[i]->file_size;
+      total += s;
+      if (total >= limit) {
+        // Truncate: the remainder of the range starts at the next file.
+        **compaction_end = inputs[i + 1]->smallest;
+        covering_the_whole_range = false;
+        inputs.resize(i + 1);
+        break;
+      }
+    }
+  }
+  Compaction* c = new Compaction(version, input_level, output_level,
+                                 MaxFileSizeForLevel(output_level),
+                                 MaxGrandParentOverlapBytes(input_level));
+
+  c->inputs_[0] = inputs;
+  if (ExpandWhileOverlapping(c) == false) {
+    delete c;
+    Log(options_->info_log,
+        "[%s] Could not compact due to expansion failure.\n",
+        version->cfd_->GetName().c_str());
+    return nullptr;
+  }
+
+  SetupOtherInputs(c);
+
+  if (covering_the_whole_range) {
+    *compaction_end = nullptr;
+  }
+
+  // These files that are to be manaully compacted do not trample
+  // upon other files because manual compactions are processed when
+  // the system has a max of 1 background compaction thread.
+  c->MarkFilesBeingCompacted(true);
+
+  // Is this compaction creating a file at the bottommost level
+  c->SetupBottomMostLevel(true);
+
+  c->is_manual_compaction_ = true;
+
+  return c;
+}
+
+Compaction* LevelCompactionPicker::PickCompaction(Version* version,
+                                                  LogBuffer* log_buffer) {
+  Compaction* c = nullptr;
+  int level = -1;
+
+  // Compute the compactions needed. It is better to do it here
+  // and also in LogAndApply(), otherwise the values could be stale.
+  std::vector<uint64_t> size_being_compacted(NumberLevels() - 1);
+  SizeBeingCompacted(size_being_compacted);
+  version->ComputeCompactionScore(size_being_compacted);
+
+  // We prefer compactions triggered by too much data in a level over
+  // the compactions triggered by seeks.
+  //
+  // Find the compactions by size on all levels.
+  for (int i = 0; i < NumberLevels() - 1; i++) {
+    assert(i == 0 ||
+           version->compaction_score_[i] <= version->compaction_score_[i - 1]);
+    level = version->compaction_level_[i];
+    if ((version->compaction_score_[i] >= 1)) {
+      c = PickCompactionBySize(version, level, version->compaction_score_[i]);
+      if (ExpandWhileOverlapping(c) == false) {
+        delete c;
+        c = nullptr;
+      } else {
+        break;
+      }
+    }
+  }
+
+  // Find compactions needed by seeks
+  FileMetaData* f = version->file_to_compact_;
+  if (c == nullptr && f != nullptr && !f->being_compacted) {
+
+    level = version->file_to_compact_level_;
+    int parent_index = -1;
+
+    // Only allow one level 0 compaction at a time.
+    // Do not pick this file if its parents at level+1 are being compacted.
+    if (level != 0 || compactions_in_progress_[0].empty()) {
+      if (!ParentRangeInCompaction(version, &f->smallest, &f->largest, level,
+                                   &parent_index)) {
+        c = new Compaction(version, level, level + 1,
+                           MaxFileSizeForLevel(level + 1),
+                           MaxGrandParentOverlapBytes(level), true);
+        c->inputs_[0].push_back(f);
+        c->parent_index_ = parent_index;
+        c->input_version_->file_to_compact_ = nullptr;
+        if (ExpandWhileOverlapping(c) == false) {
+          return nullptr;
+        }
+      }
+    }
+  }
+
+  if (c == nullptr) {
+    return nullptr;
+  }
+
+  // Two level 0 compaction won't run at the same time, so don't need to worry
+  // about files on level 0 being compacted.
+  if (level == 0) {
+    assert(compactions_in_progress_[0].empty());
+    InternalKey smallest, largest;
+    GetRange(c->inputs_[0], &smallest, &largest);
+    // Note that the next call will discard the file we placed in
+    // c->inputs_[0] earlier and replace it with an overlapping set
+    // which will include the picked file.
+    c->inputs_[0].clear();
+    c->input_version_->GetOverlappingInputs(0, &smallest, &largest,
+                                            &c->inputs_[0]);
+
+    // If we include more L0 files in the same compaction run it can
+    // cause the 'smallest' and 'largest' key to get extended to a
+    // larger range. So, re-invoke GetRange to get the new key range
+    GetRange(c->inputs_[0], &smallest, &largest);
+    if (ParentRangeInCompaction(c->input_version_, &smallest, &largest, level,
+                                &c->parent_index_)) {
+      delete c;
+      return nullptr;
+    }
+    assert(!c->inputs_[0].empty());
+  }
+
+  // Setup "level+1" files (inputs_[1])
+  SetupOtherInputs(c);
+
+  // mark all the files that are being compacted
+  c->MarkFilesBeingCompacted(true);
+
+  // Is this compaction creating a file at the bottommost level
+  c->SetupBottomMostLevel(false);
+
+  // remember this currently undergoing compaction
+  compactions_in_progress_[level].insert(c);
+
+  return c;
+}
+
+// Pick a single seed file on `level` to compact into level+1, scanning files
+// in the version's files_by_size_ order (largest first — asserted below).
+// Returns nullptr if no eligible file exists, or if level is 0 and another
+// level-0 compaction is already in progress. `score` is recorded on the
+// returned compaction. Caller owns the returned object.
+Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version,
+                                                        int level,
+                                                        double score) {
+  Compaction* c = nullptr;
+
+  // level 0 files are overlapping. So we cannot pick more
+  // than one concurrent compactions at this level. This
+  // could be made better by looking at key-ranges that are
+  // being compacted at level 0.
+  if (level == 0 && compactions_in_progress_[level].size() == 1) {
+    return nullptr;
+  }
+
+  assert(level >= 0);
+  assert(level + 1 < NumberLevels());
+  c = new Compaction(version, level, level + 1, MaxFileSizeForLevel(level + 1),
+                     MaxGrandParentOverlapBytes(level));
+  c->score_ = score;
+
+  // Pick the largest file in this level that is not already
+  // being compacted
+  std::vector<int>& file_size = c->input_version_->files_by_size_[level];
+
+  // record the first file that is not yet compacted
+  int nextIndex = -1;
+
+  // Resume scanning where the previous call to PickCompaction left off, so
+  // repeated calls do not re-examine files that were already rejected.
+  for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level];
+       i < file_size.size(); i++) {
+    int index = file_size[i];
+    FileMetaData* f = c->input_version_->files_[level][index];
+
+    // check to verify files are arranged in descending size
+    assert((i == file_size.size() - 1) ||
+           (i >= Version::number_of_files_to_sort_ - 1) ||
+           (f->file_size >=
+            c->input_version_->files_[level][file_size[i + 1]]->file_size));
+
+    // do not pick a file to compact if it is being compacted
+    // from n-1 level.
+    if (f->being_compacted) {
+      continue;
+    }
+
+    // remember the startIndex for the next call to PickCompaction
+    if (nextIndex == -1) {
+      nextIndex = i;
+    }
+
+    // Do not pick this file if its parents at level+1 are being compacted.
+    // Maybe we can avoid redoing this work in SetupOtherInputs
+    int parent_index = -1;
+    if (ParentRangeInCompaction(c->input_version_, &f->smallest, &f->largest,
+                                level, &parent_index)) {
+      continue;
+    }
+    // Found the seed file: record it together with its position in both the
+    // per-level file list (base_index_) and the parent-overlap scan.
+    c->inputs_[0].push_back(f);
+    c->base_index_ = index;
+    c->parent_index_ = parent_index;
+    break;
+  }
+
+  // No file qualified; give up and release the tentative compaction.
+  if (c->inputs_[0].empty()) {
+    delete c;
+    c = nullptr;
+  }
+
+  // store where to start the iteration in the next call to PickCompaction
+  version->next_file_to_compact_by_size_[level] = nextIndex;
+
+  return c;
+}
+
+// Universal style of compaction. Pick files that are contiguous in
+// time-range to compact. All universal compactions operate on level 0.
+//
+// Strategy order: (1) size-amplification compaction; if not needed,
+// (2) read-amplification compaction honoring the configured size ratio;
+// failing that, (3) forced read-amplification compaction that ignores size
+// ratios to bring the file count below level0_file_num_compaction_trigger.
+// Returns nullptr if nothing needs compacting; caller owns the result.
+Compaction* UniversalCompactionPicker::PickCompaction(Version* version,
+                                                      LogBuffer* log_buffer) {
+  int level = 0;
+  double score = version->compaction_score_[0];
+
+  // Not enough files yet to bother compacting.
+  if ((version->files_[level].size() <
+       (unsigned int)options_->level0_file_num_compaction_trigger)) {
+    LogToBuffer(log_buffer, "[%s] Universal: nothing to do\n",
+                version->cfd_->GetName().c_str());
+    return nullptr;
+  }
+  Version::FileSummaryStorage tmp;
+  LogToBuffer(log_buffer, "[%s] Universal: candidate files(%zu): %s\n",
+              version->cfd_->GetName().c_str(), version->files_[level].size(),
+              version->LevelFileSummary(&tmp, 0));
+
+  // Check for size amplification first.
+  Compaction* c;
+  if ((c = PickCompactionUniversalSizeAmp(version, score, log_buffer)) !=
+      nullptr) {
+    LogToBuffer(log_buffer, "[%s] Universal: compacting for size amp\n",
+                version->cfd_->GetName().c_str());
+  } else {
+    // Size amplification is within limits. Try reducing read
+    // amplification while maintaining file size ratios.
+    unsigned int ratio = options_->compaction_options_universal.size_ratio;
+
+    if ((c = PickCompactionUniversalReadAmp(version, score, ratio, UINT_MAX,
+                                            log_buffer)) != nullptr) {
+      LogToBuffer(log_buffer, "[%s] Universal: compacting for size ratio\n",
+                  version->cfd_->GetName().c_str());
+    } else {
+      // Size amplification and file size ratios are within configured limits.
+      // If max read amplification is exceeding configured limits, then force
+      // compaction without looking at filesize ratios and try to reduce
+      // the number of files to fewer than level0_file_num_compaction_trigger.
+      // (The subtraction is safe: the early return above guarantees
+      // files_[level].size() >= level0_file_num_compaction_trigger.)
+      unsigned int num_files = version->files_[level].size() -
+                               options_->level0_file_num_compaction_trigger;
+      if ((c = PickCompactionUniversalReadAmp(
+               version, score, UINT_MAX, num_files, log_buffer)) != nullptr) {
+        LogToBuffer(log_buffer, "[%s] Universal: compacting for file num\n",
+                    version->cfd_->GetName().c_str());
+      }
+    }
+  }
+  if (c == nullptr) {
+    return nullptr;
+  }
+  assert(c->inputs_[0].size() > 1);
+
+  // validate that all the chosen files are non overlapping in time
+  FileMetaData* newerfile __attribute__((unused)) = nullptr;
+  for (unsigned int i = 0; i < c->inputs_[0].size(); i++) {
+    FileMetaData* f = c->inputs_[0][i];
+    assert (f->smallest_seqno <= f->largest_seqno);
+    assert(newerfile == nullptr ||
+           newerfile->smallest_seqno > f->largest_seqno);
+    newerfile = f;
+  }
+
+  // The files are sorted from newest first to oldest last.
+  std::vector<int>& file_by_time = c->input_version_->files_by_size_[level];
+
+  // Is the earliest file part of this compaction?
+  int last_index = file_by_time[file_by_time.size()-1];
+  FileMetaData* last_file = c->input_version_->files_[level][last_index];
+  if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) {
+    c->bottommost_level_ = true;
+  }
+
+  // update statistics
+  MeasureTime(options_->statistics.get(), NUM_FILES_IN_SINGLE_COMPACTION,
+              c->inputs_[0].size());
+
+  // mark all the files that are being compacted
+  c->MarkFilesBeingCompacted(true);
+
+  // remember this currently undergoing compaction
+  compactions_in_progress_[level].insert(c);
+
+  // Record whether this compaction includes all sst files.
+  // For now, it is only relevant in universal compaction mode.
+  c->is_full_compaction_ =
+      (c->inputs_[0].size() == c->input_version_->files_[0].size());
+
+  return c;
+}
+
+//
+// Consider compaction files based on their size differences with
+// the next file in time order.
+//
+// Scans files from newest to oldest looking for a run of at least
+// min_merge_width consecutive files (none of which is being compacted)
+// whose sizes satisfy the configured stop-style ratio test. At most
+// min(max_merge_width, max_number_of_files_to_compact) files are picked.
+// Returns nullptr if no qualifying run exists; caller owns the result.
+Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp(
+    Version* version, double score, unsigned int ratio,
+    unsigned int max_number_of_files_to_compact, LogBuffer* log_buffer) {
+  int level = 0;
+
+  unsigned int min_merge_width =
+    options_->compaction_options_universal.min_merge_width;
+  unsigned int max_merge_width =
+    options_->compaction_options_universal.max_merge_width;
+
+  // The files are sorted from newest first to oldest last.
+  std::vector<int>& file_by_time = version->files_by_size_[level];
+  FileMetaData* f = nullptr;
+  bool done = false;
+  int start_index = 0;
+  unsigned int candidate_count = 0;
+  assert(file_by_time.size() == version->files_[level].size());
+
+  unsigned int max_files_to_compact = std::min(max_merge_width,
+                                       max_number_of_files_to_compact);
+  // A merge of fewer than two files is pointless.
+  min_merge_width = std::max(min_merge_width, 2U);
+
+  // Considers a candidate file only if it is smaller than the
+  // total size accumulated so far.
+  for (unsigned int loop = 0; loop < file_by_time.size(); loop++) {
+
+    candidate_count = 0;
+
+    // Skip files that are already being compacted
+    for (f = nullptr; loop < file_by_time.size(); loop++) {
+      int index = file_by_time[loop];
+      f = version->files_[level][index];
+
+      if (!f->being_compacted) {
+        candidate_count = 1;
+        break;
+      }
+      LogToBuffer(
+          log_buffer, "[%s] Universal: file %lu[%d] being compacted, skipping",
+          version->cfd_->GetName().c_str(), (unsigned long)f->number, loop);
+      f = nullptr;
+    }
+
+    // This file is not being compacted. Consider it as the
+    // first candidate to be compacted.
+    uint64_t candidate_size =  f != nullptr? f->file_size : 0;
+    if (f != nullptr) {
+      LogToBuffer(
+          log_buffer, "[%s] Universal: Possible candidate file %lu[%d].",
+          version->cfd_->GetName().c_str(), (unsigned long)f->number, loop);
+    }
+
+    // Check if the succeeding files need compaction.
+    for (unsigned int i = loop+1;
+         candidate_count < max_files_to_compact && i < file_by_time.size();
+         i++) {
+      int index = file_by_time[i];
+      FileMetaData* f = version->files_[level][index];
+      if (f->being_compacted) {
+        break;
+      }
+      // Pick files if the total/last candidate file size (increased by the
+      // specified ratio) is still larger than the next candidate file.
+      // candidate_size is the total size of files picked so far with the
+      // default kCompactionStopStyleTotalSize; with
+      // kCompactionStopStyleSimilarSize, it's simply the size of the last
+      // picked file.
+      uint64_t sz = (candidate_size * (100L + ratio)) /100;
+      if (sz < f->file_size) {
+        break;
+      }
+      if (options_->compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) {
+        // Similar-size stopping rule: also check the last picked file isn't
+        // far larger than the next candidate file.
+        sz = (f->file_size * (100L + ratio)) / 100;
+        if (sz < candidate_size) {
+          // If the small file we've encountered begins a run of similar-size
+          // files, we'll pick them up on a future iteration of the outer
+          // loop. If it's some lonely straggler, it'll eventually get picked
+          // by the last-resort read amp strategy which disregards size ratios.
+          break;
+        }
+        candidate_size = f->file_size;
+      } else { // default kCompactionStopStyleTotalSize
+        candidate_size += f->file_size;
+      }
+      candidate_count++;
+    }
+
+    // Found a series of consecutive files that need compaction.
+    if (candidate_count >= (unsigned int)min_merge_width) {
+      start_index = loop;
+      done = true;
+      break;
+    } else {
+      // Run was too short; log each rejected file and resume the scan
+      // after it.
+      for (unsigned int i = loop;
+           i < loop + candidate_count && i < file_by_time.size(); i++) {
+       int index = file_by_time[i];
+       FileMetaData* f = version->files_[level][index];
+       LogToBuffer(log_buffer,
+                   "[%s] Universal: Skipping file %lu[%d] with size %lu %d\n",
+                   version->cfd_->GetName().c_str(), (unsigned long)f->number,
+                   i, (unsigned long)f->file_size, f->being_compacted);
+      }
+    }
+  }
+  if (!done || candidate_count <= 1) {
+    return nullptr;
+  }
+  unsigned int first_index_after = start_index + candidate_count;
+  // Compression is enabled if files compacted earlier already reached
+  // size ratio of compression.
+  bool enable_compression = true;
+  int ratio_to_compress =
+      options_->compaction_options_universal.compression_size_percent;
+  if (ratio_to_compress >= 0) {
+    uint64_t total_size = version->NumLevelBytes(level);
+    uint64_t older_file_size = 0;
+    // Walk the files older than the picked run, from oldest towards the run;
+    // if they already hold at least ratio_to_compress percent of the level,
+    // skip compressing this compaction's output.
+    for (unsigned int i = file_by_time.size() - 1; i >= first_index_after;
+        i--) {
+      older_file_size += version->files_[level][file_by_time[i]]->file_size;
+      if (older_file_size * 100L >= total_size * (long) ratio_to_compress) {
+        enable_compression = false;
+        break;
+      }
+    }
+  }
+  // Universal compaction writes back into the same level (0 -> 0).
+  Compaction* c =
+      new Compaction(version, level, level, MaxFileSizeForLevel(level),
+                     LLONG_MAX, false, enable_compression);
+  c->score_ = score;
+
+  for (unsigned int i = start_index; i < first_index_after; i++) {
+    int index = file_by_time[i];
+    FileMetaData* f = c->input_version_->files_[level][index];
+    c->inputs_[0].push_back(f);
+    LogToBuffer(log_buffer,
+                "[%s] Universal: Picking file %lu[%d] with size %lu\n",
+                version->cfd_->GetName().c_str(), (unsigned long)f->number, i,
+                (unsigned long)f->file_size);
+  }
+  return c;
+}
+
+// Look at overall size amplification. If size amplification
+// exceeds the configured value, then do a compaction
+// of the candidate files all the way up to the earliest
+// base file (overrides configured values of file-size ratios,
+// min_merge_width and max_merge_width).
+//
+// Size amplification here is the total size of all newer files expressed as
+// a percentage of the earliest (oldest) file's size; the compaction is
+// triggered when that percentage reaches max_size_amplification_percent.
+// Returns nullptr when not needed or when a candidate is already being
+// compacted; caller owns the result.
+Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp(
+    Version* version, double score, LogBuffer* log_buffer) {
+  int level = 0;
+
+  // percentage flexibility while reducing size amplification
+  uint64_t ratio = options_->compaction_options_universal.
+                     max_size_amplification_percent;
+
+  // The files are sorted from newest first to oldest last.
+  std::vector<int>& file_by_time = version->files_by_size_[level];
+  assert(file_by_time.size() == version->files_[level].size());
+
+  unsigned int candidate_count = 0;
+  uint64_t candidate_size = 0;
+  unsigned int start_index = 0;
+  FileMetaData* f = nullptr;
+
+  // Skip files that are already being compacted
+  // (the earliest file, at the back, is intentionally excluded here).
+  for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) {
+    int index = file_by_time[loop];
+    f = version->files_[level][index];
+    if (!f->being_compacted) {
+      start_index = loop;         // Consider this as the first candidate.
+      break;
+    }
+    LogToBuffer(log_buffer,
+                "[%s] Universal: skipping file %lu[%d] compacted %s",
+                version->cfd_->GetName().c_str(), (unsigned long)f->number,
+                loop, " cannot be a candidate to reduce size amp.\n");
+    f = nullptr;
+  }
+  if (f == nullptr) {
+    return nullptr;             // no candidate files
+  }
+
+  LogToBuffer(log_buffer, "[%s] Universal: First candidate file %lu[%d] %s",
+              version->cfd_->GetName().c_str(), (unsigned long)f->number,
+              start_index, " to reduce size amp.\n");
+
+  // keep adding up all the remaining files
+  // (a size-amp compaction must be contiguous, so any file already being
+  // compacted in the middle makes it impossible).
+  for (unsigned int loop = start_index; loop < file_by_time.size() - 1;
+       loop++) {
+    int index = file_by_time[loop];
+    f = version->files_[level][index];
+    if (f->being_compacted) {
+      LogToBuffer(
+          log_buffer, "[%s] Universal: Possible candidate file %lu[%d] %s.",
+          version->cfd_->GetName().c_str(), (unsigned long)f->number, loop,
+          " is already being compacted. No size amp reduction possible.\n");
+      return nullptr;
+    }
+    candidate_size += f->file_size;
+    candidate_count++;
+  }
+  if (candidate_count == 0) {
+    return nullptr;
+  }
+
+  // size of earliest file
+  int index = file_by_time[file_by_time.size() - 1];
+  uint64_t earliest_file_size = version->files_[level][index]->file_size;
+
+  // size amplification = percentage of additional size
+  if (candidate_size * 100 < ratio * earliest_file_size) {
+    LogToBuffer(
+        log_buffer,
+        "[%s] Universal: size amp not needed. newer-files-total-size %lu "
+        "earliest-file-size %lu",
+        version->cfd_->GetName().c_str(), (unsigned long)candidate_size,
+        (unsigned long)earliest_file_size);
+    return nullptr;
+  } else {
+    LogToBuffer(log_buffer,
+                "[%s] Universal: size amp needed. newer-files-total-size %lu "
+                "earliest-file-size %lu",
+                version->cfd_->GetName().c_str(), (unsigned long)candidate_size,
+                (unsigned long)earliest_file_size);
+  }
+  assert(start_index >= 0 && start_index < file_by_time.size() - 1);
+
+  // create a compaction request
+  // We always compact all the files, so always compress.
+  Compaction* c =
+      new Compaction(version, level, level, MaxFileSizeForLevel(level),
+                     LLONG_MAX, false, true);
+  c->score_ = score;
+  for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) {
+    int index = file_by_time[loop];
+    f = c->input_version_->files_[level][index];
+    c->inputs_[0].push_back(f);
+    LogToBuffer(log_buffer,
+                "[%s] Universal: size amp picking file %lu[%d] with size %lu",
+                version->cfd_->GetName().c_str(), (unsigned long)f->number,
+                index, (unsigned long)f->file_size);
+  }
+  return c;
+}
+
+}  // namespace rocksdb
diff --git a/db/compaction_picker.h b/db/compaction_picker.h
new file mode 100644 (file)
index 0000000..6527ef9
--- /dev/null
@@ -0,0 +1,165 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "db/version_set.h"
+#include "db/compaction.h"
+#include "rocksdb/status.h"
+#include "rocksdb/options.h"
+#include "rocksdb/env.h"
+
+#include <vector>
+#include <memory>
+#include <set>
+
+namespace rocksdb {
+
+class LogBuffer;
+class Compaction;
+class Version;
+
+// Base class that encapsulates the policy for choosing which files to
+// compact. Concrete pickers (level-style, universal-style) implement
+// PickCompaction(); the shared machinery here tracks in-flight compactions,
+// computes per-level size limits, and expands input sets so that key
+// visibility invariants are preserved.
+class CompactionPicker {
+ public:
+  CompactionPicker(const Options* options, const InternalKeyComparator* icmp);
+  virtual ~CompactionPicker();
+
+  // Pick level and inputs for a new compaction.
+  // Returns nullptr if there is no compaction to be done.
+  // Otherwise returns a pointer to a heap-allocated object that
+  // describes the compaction.  Caller should delete the result.
+  virtual Compaction* PickCompaction(Version* version,
+                                     LogBuffer* log_buffer) = 0;
+
+  // Return a compaction object for compacting the range [begin,end] in
+  // the specified level.  Returns nullptr if there is nothing in that
+  // level that overlaps the specified range.  Caller should delete
+  // the result.
+  //
+  // The returned Compaction might not include the whole requested range.
+  // In that case, compaction_end will be set to the next key that needs
+  // compacting. In case the compaction will compact the whole range,
+  // compaction_end will be set to nullptr.
+  // Client is responsible for compaction_end storage -- when called,
+  // *compaction_end should point to valid InternalKey!
+  Compaction* CompactRange(Version* version, int input_level, int output_level,
+                           const InternalKey* begin, const InternalKey* end,
+                           InternalKey** compaction_end);
+
+  // Free up the files that participated in a compaction
+  void ReleaseCompactionFiles(Compaction* c, Status status);
+
+  // Return the total amount of data that is undergoing
+  // compactions per level
+  void SizeBeingCompacted(std::vector<uint64_t>& sizes);
+
+  // Returns maximum total overlap bytes with grandparent
+  // level (i.e., level+2) before we stop building a single
+  // file in level->level+1 compaction.
+  uint64_t MaxGrandParentOverlapBytes(int level);
+
+  // Returns maximum total bytes of data on a given level.
+  double MaxBytesForLevel(int level);
+
+  // Get the max file size in a given level.
+  uint64_t MaxFileSizeForLevel(int level) const;
+
+ protected:
+  int NumberLevels() const { return num_levels_; }
+
+  // Stores the minimal range that covers all entries in inputs in
+  // *smallest, *largest.
+  // REQUIRES: inputs is not empty
+  void GetRange(const std::vector<FileMetaData*>& inputs, InternalKey* smallest,
+                InternalKey* largest);
+
+  // Stores the minimal range that covers all entries in inputs1 and inputs2
+  // in *smallest, *largest.
+  // REQUIRES: inputs is not empty
+  void GetRange(const std::vector<FileMetaData*>& inputs1,
+                const std::vector<FileMetaData*>& inputs2,
+                InternalKey* smallest, InternalKey* largest);
+
+  // Add more files to the inputs on "level" to make sure that
+  // no newer version of a key is compacted to "level+1" while leaving an older
+  // version in a "level". Otherwise, any Get() will search "level" first,
+  // and will likely return an old/stale value for the key, since it always
+  // searches in increasing order of level to find the value. This could
+  // also scramble the order of merge operands. This function should be
+  // called any time a new Compaction is created, and its inputs_[0] are
+  // populated.
+  //
+  // Will return false if it is impossible to apply this compaction.
+  bool ExpandWhileOverlapping(Compaction* c);
+
+  // Upper bound on the total byte size of an expanded compaction's inputs.
+  uint64_t ExpandedCompactionByteSizeLimit(int level);
+
+  // Returns true if any one of the specified files are being compacted
+  bool FilesInCompaction(std::vector<FileMetaData*>& files);
+
+  // Returns true if any one of the parent files are being compacted
+  bool ParentRangeInCompaction(Version* version, const InternalKey* smallest,
+                               const InternalKey* largest, int level,
+                               int* index);
+
+  // Fill in the "level+1" inputs (inputs_[1]) for an already-seeded
+  // compaction.
+  void SetupOtherInputs(Compaction* c);
+
+  // record all the ongoing compactions for all levels
+  std::vector<std::set<Compaction*>> compactions_in_progress_;
+
+  // Per-level target file size.
+  std::unique_ptr<uint64_t[]> max_file_size_;
+
+  // Per-level max bytes
+  std::unique_ptr<uint64_t[]> level_max_bytes_;
+
+  // DB options; not owned.
+  const Options* const options_;
+
+ private:
+  // Number of levels this picker operates over (cached from options).
+  int num_levels_;
+
+  // Comparator for internal keys; not owned.
+  const InternalKeyComparator* const icmp_;
+};
+
+// Picker for universal-style compaction: all files live on level 0 and are
+// compacted in runs that are contiguous in time order, driven first by size
+// amplification and then by read amplification.
+class UniversalCompactionPicker : public CompactionPicker {
+ public:
+  UniversalCompactionPicker(const Options* options,
+                            const InternalKeyComparator* icmp)
+      : CompactionPicker(options, icmp) {}
+  virtual Compaction* PickCompaction(Version* version,
+                                     LogBuffer* log_buffer) override;
+
+ private:
+  // Pick Universal compaction to limit read amplification
+  Compaction* PickCompactionUniversalReadAmp(Version* version, double score,
+                                             unsigned int ratio,
+                                             unsigned int num_files,
+                                             LogBuffer* log_buffer);
+
+  // Pick Universal compaction to limit space amplification.
+  Compaction* PickCompactionUniversalSizeAmp(Version* version, double score,
+                                             LogBuffer* log_buffer);
+};
+
+// Picker for classic leveled compaction: files are picked per level (by
+// size) and compacted into the next level.
+class LevelCompactionPicker : public CompactionPicker {
+ public:
+  LevelCompactionPicker(const Options* options,
+                        const InternalKeyComparator* icmp)
+      : CompactionPicker(options, icmp) {}
+  virtual Compaction* PickCompaction(Version* version,
+                                     LogBuffer* log_buffer) override;
+
+ private:
+  // For the specified level, pick a compaction.
+  // Returns nullptr if there is no compaction to be done.
+  // If level is 0 and there is already a compaction on that level, this
+  // function will return nullptr.
+  Compaction* PickCompactionBySize(Version* version, int level, double score);
+};
+
+}  // namespace rocksdb
diff --git a/db/corruption_test.cc b/db/corruption_test.cc
new file mode 100644 (file)
index 0000000..4726e92
--- /dev/null
@@ -0,0 +1,440 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/db.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/write_batch.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+// Size in bytes of every value written by the test harness (see Value()).
+static const int kValueSize = 1000;
+
+// Test fixture that opens a DB in a temp directory, provides helpers to
+// populate/verify keys, and deliberately corrupts on-disk files to exercise
+// recovery and repair paths.
+class CorruptionTest {
+ public:
+  test::ErrorEnv env_;          // env that can inject writable-file errors
+  std::string dbname_;          // temp directory holding the test DB
+  shared_ptr<Cache> tiny_cache_;  // deliberately tiny block cache
+  Options options_;
+  DB* db_;
+
+  // Creates a fresh DB under the test tmp dir, destroying any leftovers.
+  CorruptionTest() {
+    tiny_cache_ = NewLRUCache(100);
+    options_.env = &env_;
+    dbname_ = test::TmpDir() + "/corruption_test";
+    DestroyDB(dbname_, options_);
+
+    db_ = nullptr;
+    options_.create_if_missing = true;
+    options_.block_size_deviation = 0; // make unit test pass for now
+    Reopen();
+    options_.create_if_missing = false;
+  }
+
+  ~CorruptionTest() {
+     delete db_;
+     DestroyDB(dbname_, Options());
+  }
+
+  // Closes and reopens the DB; returns the open status. If `options` is
+  // null, the fixture's options_ are used. env/cache/block settings are
+  // always forced so corruption is reliably observed.
+  Status TryReopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opt = (options ? *options : options_);
+    opt.env = &env_;
+    opt.block_cache = tiny_cache_;
+    opt.block_size_deviation = 0;
+    opt.arena_block_size = 4096;
+    return DB::Open(opt, dbname_, &db_);
+  }
+
+  // Like TryReopen() but asserts that the open succeeds.
+  void Reopen(Options* options = nullptr) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  // Closes the DB and runs RepairDB over its directory.
+  void RepairDB() {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(::rocksdb::RepairDB(dbname_, options_));
+  }
+
+  // Writes keys 0..n-1 (see Key()/Value()), one WriteBatch per key.
+  void Build(int n) {
+    std::string key_space, value_space;
+    WriteBatch batch;
+    for (int i = 0; i < n; i++) {
+      //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
+      Slice key = Key(i, &key_space);
+      batch.Clear();
+      batch.Put(key, Value(i, &value_space));
+      ASSERT_OK(db_->Write(WriteOptions(), &batch));
+    }
+  }
+
+  // Scans the whole DB and asserts that between min_expected and
+  // max_expected key/value pairs (inclusive) survived intact.
+  void Check(int min_expected, int max_expected) {
+    unsigned int next_expected = 0;
+    int missed = 0;
+    int bad_keys = 0;
+    int bad_values = 0;
+    int correct = 0;
+    std::string value_space;
+    // Do not verify checksums. If we verify checksums then the
+    // db itself will raise errors because data is corrupted.
+    // Instead, we want the reads to be successful and this test
+    // will detect whether the appropriate corruptions have
+    // occurred.
+    Iterator* iter = db_->NewIterator(ReadOptions(false, true));
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      uint64_t key;
+      Slice in(iter->key());
+      if (!ConsumeDecimalNumber(&in, &key) ||
+          !in.empty() ||
+          key < next_expected) {
+        bad_keys++;
+        continue;
+      }
+      missed += (key - next_expected);
+      next_expected = key + 1;
+      if (iter->value() != Value(key, &value_space)) {
+        bad_values++;
+      } else {
+        correct++;
+      }
+    }
+    delete iter;
+
+    fprintf(stderr,
+            "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
+            min_expected, max_expected, correct, bad_keys, bad_values, missed);
+    ASSERT_LE(min_expected, correct);
+    ASSERT_GE(max_expected, correct);
+  }
+
+  // XORs `bytes_to_corrupt` bytes of `fname` starting at `offset` with 0x80.
+  // A negative offset is interpreted relative to the end of the file; the
+  // range is clamped to the file size.
+  void CorruptFile(const std::string fname, int offset, int bytes_to_corrupt) {
+    struct stat sbuf;
+    if (stat(fname.c_str(), &sbuf) != 0) {
+      const char* msg = strerror(errno);
+      ASSERT_TRUE(false) << fname << ": " << msg;
+    }
+
+    if (offset < 0) {
+      // Relative to end of file; make it absolute
+      if (-offset > sbuf.st_size) {
+        offset = 0;
+      } else {
+        offset = sbuf.st_size + offset;
+      }
+    }
+    if (offset > sbuf.st_size) {
+      offset = sbuf.st_size;
+    }
+    if (offset + bytes_to_corrupt > sbuf.st_size) {
+      bytes_to_corrupt = sbuf.st_size - offset;
+    }
+
+    // Do it
+    std::string contents;
+    Status s = ReadFileToString(Env::Default(), fname, &contents);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    for (int i = 0; i < bytes_to_corrupt; i++) {
+      contents[i + offset] ^= 0x80;
+    }
+    s = WriteStringToFile(Env::Default(), contents, fname);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+  }
+
+  // Corrupts the newest (highest-numbered) file of the given type in the
+  // DB directory. Asserts if no such file exists.
+  void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+    // Pick file to corrupt
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_.GetChildren(dbname_, &filenames));
+    uint64_t number;
+    FileType type;
+    std::string fname;
+    int picked_number = -1;
+    for (unsigned int i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type) &&
+          type == filetype &&
+          static_cast<int>(number) > picked_number) {  // Pick latest file
+        fname = dbname_ + "/" + filenames[i];
+        picked_number = number;
+      }
+    }
+    ASSERT_TRUE(!fname.empty()) << filetype;
+
+    CorruptFile(fname, offset, bytes_to_corrupt);
+  }
+
+  // corrupts exactly one file at level `level`. if no file found at level,
+  // asserts
+  void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) {
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    for (const auto& m : metadata) {
+      if (m.level == level) {
+        CorruptFile(dbname_ + "/" + m.name, offset, bytes_to_corrupt);
+        return;
+      }
+    }
+    ASSERT_TRUE(false) << "no file found at level";
+  }
+
+
+  // Reads the named DB property and parses it as an int; -1 on failure.
+  int Property(const std::string& name) {
+    std::string property;
+    int result;
+    if (db_->GetProperty(name, &property) &&
+        sscanf(property.c_str(), "%d", &result) == 1) {
+      return result;
+    } else {
+      return -1;
+    }
+  }
+
+  // Return the ith key
+  Slice Key(int i, std::string* storage) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%016d", i);
+    storage->assign(buf, strlen(buf));
+    return Slice(*storage);
+  }
+
+  // Return the value to associate with the specified key
+  // (deterministic: seeded by the key, so Check() can re-derive it).
+  Slice Value(int k, std::string* storage) {
+    Random r(k);
+    return test::RandomString(&r, kValueSize, storage);
+  }
+};
+
+// Corrupt two spots in the WAL and verify that recovery drops the affected
+// log blocks while the remaining records stay readable.
+TEST(CorruptionTest, Recovery) {
+  Build(100);
+  Check(100, 100);
+  Corrupt(kLogFile, 19, 1);      // WriteBatch tag for first record
+  Corrupt(kLogFile, log::kBlockSize + 1000, 1);  // Somewhere in second block
+  Reopen();
+
+  // The 64 records in the first two log blocks are completely lost.
+  Check(36, 36);
+}
+
+// Opening the DB must fail cleanly when the env cannot create writable files.
+TEST(CorruptionTest, RecoverWriteError) {
+  env_.writable_file_error_ = true;
+  Status s = TryReopen();
+  ASSERT_TRUE(!s.ok());
+}
+
+// Inject writable-file errors while writing enough data to force a flush;
+// once a write fails, all subsequent writes must keep failing, and the DB
+// must reopen cleanly after the error condition is cleared.
+TEST(CorruptionTest, NewFileErrorDuringWrite) {
+  // Do enough writing to force minor compaction
+  env_.writable_file_error_ = true;
+  const int num = 3 + (Options().write_buffer_size / kValueSize);
+  std::string value_storage;
+  Status s;
+  bool failed = false;
+  for (int i = 0; i < num; i++) {
+    WriteBatch batch;
+    batch.Put("a", Value(100, &value_storage));
+    s = db_->Write(WriteOptions(), &batch);
+    if (!s.ok()) {
+      failed = true;
+    }
+    // Once a failure has been seen, no later write may succeed.
+    ASSERT_TRUE(!failed || !s.ok());
+  }
+  ASSERT_TRUE(!s.ok());
+  ASSERT_GE(env_.num_writable_file_errors_, 1);
+  env_.writable_file_error_ = false;
+  Reopen();
+}
+
+// Flip one byte in an SST file and verify that (with checksum verification
+// disabled in Check) at most one key/value pair is affected.
+TEST(CorruptionTest, TableFile) {
+  Build(100);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+  dbi->TEST_CompactRange(0, nullptr, nullptr);
+  dbi->TEST_CompactRange(1, nullptr, nullptr);
+
+  Corrupt(kTableFile, 100, 1);
+  Check(99, 99);
+}
+
+// Corrupt the tail of an SST file (where index data lives) and verify the
+// DB still reopens and serves at least half of the keys.
+TEST(CorruptionTest, TableFileIndexData) {
+  Build(10000);  // Enough to build multiple Tables
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+
+  Corrupt(kTableFile, -2000, 500);  // negative offset = relative to EOF
+  Reopen();
+  Check(5000, 9999);
+}
+
+TEST(CorruptionTest, MissingDescriptor) {
+  Build(1000);
+  RepairDB();
+  Reopen();
+  Check(1000, 1000);
+}
+
+TEST(CorruptionTest, SequenceNumberRecovery) {
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
+  RepairDB();
+  Reopen();
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v5", v);
+  // Write something.  If sequence number was not recovered properly,
+  // it will be hidden by an earlier write.
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v6", v);
+  Reopen();
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v6", v);
+}
+
+TEST(CorruptionTest, CorruptedDescriptor) {
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+  dbi->TEST_CompactRange(0, nullptr, nullptr);
+
+  Corrupt(kDescriptorFile, 0, 1000);
+  Status s = TryReopen();
+  ASSERT_TRUE(!s.ok());
+
+  RepairDB();
+  Reopen();
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("hello", v);
+}
+
+TEST(CorruptionTest, CompactionInputError) {
+  Build(10);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+  const int last = dbi->MaxMemCompactionLevel();
+  ASSERT_EQ(1, Property("rocksdb.num-files-at-level" + NumberToString(last)));
+
+  Corrupt(kTableFile, 100, 1);
+  Check(9, 9);
+
+  // Force compactions by writing lots of values
+  Build(10000);
+  Check(10000, 10000);
+}
+
+TEST(CorruptionTest, CompactionInputErrorParanoid) {
+  Options options;
+  options.paranoid_checks = true;
+  options.write_buffer_size = 131072;
+  options.max_write_buffer_number = 2;
+  Reopen(&options);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+
+  // Fill levels >= 1 so memtable flush outputs to level 0
+  for (int level = 1; level < dbi->NumberLevels(); level++) {
+    dbi->Put(WriteOptions(), "", "begin");
+    dbi->Put(WriteOptions(), "~", "end");
+    dbi->TEST_FlushMemTable();
+  }
+
+  options.max_mem_compaction_level = 0;
+  Reopen(&options);
+
+  dbi = reinterpret_cast<DBImpl*>(db_);
+  Build(10);
+  dbi->TEST_FlushMemTable();
+  dbi->TEST_WaitForCompact();
+  ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
+
+  CorruptTableFileAtLevel(0, 100, 1);
+  Check(9, 9);
+
+  // Write must eventually fail because of corrupted table
+  Status s;
+  std::string tmp1, tmp2;
+  bool failed = false;
+  for (int i = 0; i < 10000; i++) {
+    s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
+    if (!s.ok()) {
+      failed = true;
+    }
+    // if one write failed, every subsequent write must fail, too
+    ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
+  }
+  ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
+}
+
+TEST(CorruptionTest, UnrelatedKeys) {
+  Build(10);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+  Corrupt(kTableFile, 100, 1);
+
+  std::string tmp1, tmp2;
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+  dbi->TEST_FlushMemTable();
+  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+}
+
+TEST(CorruptionTest, FileSystemStateCorrupted) {
+  for (int iter = 0; iter < 2; ++iter) {
+    Options options;
+    options.paranoid_checks = true;
+    options.create_if_missing = true;
+    Reopen(&options);
+    Build(10);
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+    std::vector<LiveFileMetaData> metadata;
+    dbi->GetLiveFilesMetaData(&metadata);
+    ASSERT_GT(metadata.size(), size_t(0));
+    std::string filename = dbname_ + metadata[0].name;
+
+    delete db_;
+    db_ = nullptr;
+
+    if (iter == 0) {  // corrupt file size
+      unique_ptr<WritableFile> file;
+      env_.NewWritableFile(filename, &file, EnvOptions());
+      file->Append(Slice("corrupted sst"));
+      file.reset();
+    } else {  // delete the file
+      env_.DeleteFile(filename);
+    }
+
+    Status x = TryReopen(&options);
+    ASSERT_TRUE(x.IsCorruption());
+    DestroyDB(dbname_, options_);
+    Reopen(&options);
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/db_bench.cc b/db/db_bench.cc
new file mode 100644 (file)
index 0000000..2e8da9e
--- /dev/null
@@ -0,0 +1,2616 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <cstddef>
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <gflags/gflags.h>
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/perf_context.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "util/crc32c.h"
+#include "util/histogram.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "util/statistics.h"
+#include "util/testutil.h"
+#include "util/xxhash.h"
+#include "hdfs/env_hdfs.h"
+#include "utilities/merge_operators.h"
+
+
+DEFINE_string(benchmarks,
+              "fillseq,"
+              "fillsync,"
+              "fillrandom,"
+              "overwrite,"
+              "readrandom,"
+              "newiterator,"
+              "newiteratorwhilewriting,"
+              "seekrandom,"
+              "seekrandomwhilewriting,"
+              "readseq,"
+              "readreverse,"
+              "compact,"
+              "readrandom,"
+              "multireadrandom,"
+              "readseq,"
+              "readtocache,"
+              "readreverse,"
+              "readwhilewriting,"
+              "readrandomwriterandom,"
+              "updaterandom,"
+              "randomwithverify,"
+              "fill100K,"
+              "crc32c,"
+              "xxhash,"
+              "compress,"
+              "uncompress,"
+              "acquireload,",
+
+              "Comma-separated list of operations to run in the specified order. "
+              "Actual benchmarks:\n"
+              "\tfillseq       -- write N values in sequential key"
+              " order in async mode\n"
+              "\tfillrandom    -- write N values in random key order in async"
+              " mode\n"
+              "\toverwrite     -- overwrite N values in random key order in"
+              " async mode\n"
+              "\tfillsync      -- write N/100 values in random key order in "
+              "sync mode\n"
+              "\tfill100K      -- write N/1000 100K values in random order in"
+              " async mode\n"
+              "\tdeleteseq     -- delete N keys in sequential order\n"
+              "\tdeleterandom  -- delete N keys in random order\n"
+              "\treadseq       -- read N times sequentially\n"
+              "\treadtocache   -- 1 thread reading database sequentially\n"
+              "\treadreverse   -- read N times in reverse order\n"
+              "\treadrandom    -- read N times in random order\n"
+              "\treadmissing   -- read N missing keys in random order\n"
+              "\treadhot       -- read N times in random order from 1% section "
+              "of DB\n"
+              "\treadwhilewriting      -- 1 writer, N threads doing random "
+              "reads\n"
+              "\treadrandomwriterandom -- N threads doing random-read, "
+              "random-write\n"
+              "\tprefixscanrandom      -- prefix scan N times in random order\n"
+              "\tupdaterandom  -- N threads doing read-modify-write for random "
+              "keys\n"
+              "\tappendrandom  -- N threads doing read-modify-write with "
+              "growing values\n"
+              "\tmergerandom   -- same as updaterandom/appendrandom using merge"
+              " operator. "
+              "Must be used with merge_operator\n"
+              "\treadrandommergerandom -- perform N random read-or-merge "
+              "operations. Must be used with merge_operator\n"
+              "\tnewiterator   -- repeated iterator creation\n"
+              "\tseekrandom    -- N random seeks\n"
+              "\tseekrandomwhilewriting -- 1 writer, N threads doing random seeks\n"
+              "\tcrc32c        -- repeated crc32c of 4K of data\n"
+              "\txxhash        -- repeated xxHash of 4K of data\n"
+              "\tacquireload   -- load N*1000 times\n"
+              "Meta operations:\n"
+              "\tcompact     -- Compact the entire DB\n"
+              "\tstats       -- Print DB stats\n"
+              "\tlevelstats  -- Print the number of files and bytes per level\n"
+              "\tsstables    -- Print sstable info\n"
+              "\theapprofile -- Dump a heap profile (if supported by this"
+              " port)\n");
+
+DEFINE_int64(num, 1000000, "Number of key/values to place in database");
+
+DEFINE_int64(numdistinct, 1000,
+             "Number of distinct keys to use. Used in RandomWithVerify to "
+             "read/write on fewer keys so that gets are more likely to find the"
+             " key and puts are more likely to update the same key");
+
+DEFINE_int64(merge_keys, -1,
+             "Number of distinct keys to use for MergeRandom and "
+             "ReadRandomMergeRandom. "
+             "If negative, there will be FLAGS_num keys.");
+
+DEFINE_int64(reads, -1, "Number of read operations to do.  "
+             "If negative, do FLAGS_num reads.");
+
+DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");
+
+DEFINE_int64(seed, 0, "Seed base for random number generators. "
+             "When 0 it is deterministic.");
+
+DEFINE_int32(threads, 1, "Number of concurrent threads to run.");
+
+DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run."
+             " When 0 then num & reads determine the test duration");
+
+DEFINE_int32(value_size, 100, "Size of each value");
+
+
+// the maximum size of key in bytes
+static const int kMaxKeySize = 128;
+static bool ValidateKeySize(const char* flagname, int32_t value) {
+  if (value > kMaxKeySize) {
+    fprintf(stderr, "Invalid value for --%s: %d, must be <= %d\n",
+            flagname, value, kMaxKeySize);
+    return false;
+  }
+  return true;
+}
+
+DEFINE_int32(key_size, 16, "size of each key");
+
+DEFINE_int32(num_multi_db, 0,
+             "Number of DBs used in the benchmark. 0 means single DB.");
+
+DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink"
+              " to this fraction of their original size after compression");
+
+DEFINE_bool(histogram, false, "Print histogram of operation timings");
+
+DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size,
+             "Number of bytes to buffer in memtable before compacting");
+
+DEFINE_int32(max_write_buffer_number,
+             rocksdb::Options().max_write_buffer_number,
+             "The number of in-memory memtables. Each memtable is of size "
+             "write_buffer_size.");
+
+DEFINE_int32(min_write_buffer_number_to_merge,
+             rocksdb::Options().min_write_buffer_number_to_merge,
+             "The minimum number of write buffers that will be merged together "
+             "before writing to storage. This is cheap because it is an "
+             "in-memory merge. If this feature is not enabled, then all these "
+             "write buffers are flushed to L0 as separate files and this "
+             "increases read amplification because a get request has to check"
+             " in all of these files. Also, an in-memory merge may result in"
+             " writing less data to storage if there are duplicate records "
+             " in each of these individual write buffers.");
+
+DEFINE_int32(max_background_compactions,
+             rocksdb::Options().max_background_compactions,
+             "The maximum number of concurrent background compactions"
+             " that can occur in parallel.");
+
+DEFINE_int32(max_background_flushes,
+             rocksdb::Options().max_background_flushes,
+             "The maximum number of concurrent background flushes"
+             " that can occur in parallel.");
+
+static rocksdb::CompactionStyle FLAGS_compaction_style_e;
+DEFINE_int32(compaction_style, (int32_t) rocksdb::Options().compaction_style,
+             "style of compaction: level-based vs universal");
+
+DEFINE_int32(universal_size_ratio, 0,
+             "Percentage flexibility while comparing file size"
+             " (for universal compaction only).");
+
+DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files in a"
+             " single compaction run (for universal compaction only).");
+
+DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
+             " in universal style compaction");
+
+DEFINE_int32(universal_max_size_amplification_percent, 0,
+             "The max size amplification for universal style compaction");
+
+DEFINE_int32(universal_compression_size_percent, -1,
+             "The percentage of the database to compress for universal "
+             "compaction. -1 means compress everything.");
+
+DEFINE_int64(cache_size, -1, "Number of bytes to use as a cache of uncompressed"
+             " data. Negative means use default settings.");
+
+DEFINE_int32(block_size, rocksdb::Options().block_size,
+             "Number of bytes in a block.");
+
+DEFINE_int64(compressed_cache_size, -1,
+             "Number of bytes to use as a cache of compressed data.");
+
+DEFINE_int32(open_files, rocksdb::Options().max_open_files,
+             "Maximum number of files to keep open at the same time"
+             " (use default if == 0)");
+
+DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means"
+             " use default settings.");
+DEFINE_int32(memtable_bloom_bits, 0, "Bloom filter bits per key for memtable. "
+             "Negative means no bloom filter.");
+
+DEFINE_bool(use_existing_db, false, "If true, do not destroy the existing"
+            " database.  If you set this flag and also specify a benchmark that"
+            " wants a fresh database, that benchmark will fail.");
+
+DEFINE_string(db, "", "Use the db with the following name.");
+
+static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
+  if (value >= 20) {
+    fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(cache_numshardbits, -1, "Number of shards for the block cache"
+             " is 2 ** cache_numshardbits. Negative means use default settings."
+             " This is applied only if FLAGS_cache_size is non-negative.");
+
+DEFINE_int32(cache_remove_scan_count_limit, 32, "");
+
+DEFINE_bool(verify_checksum, false, "Verify checksum for every block read"
+            " from storage");
+
+DEFINE_bool(statistics, false, "Database statistics");
+static class std::shared_ptr<rocksdb::Statistics> dbstats;
+
+DEFINE_int64(writes, -1, "Number of write operations to do. If negative, do"
+             " --num writes.");
+
+DEFINE_int32(writes_per_second, 0, "Per-thread rate limit on writes per second."
+             " No limit when <= 0. Only for the readwhilewriting test.");
+
+DEFINE_bool(sync, false, "Sync all writes to disk");
+
+DEFINE_bool(disable_data_sync, false, "If true, do not wait until data is"
+            " synced to disk.");
+
+DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
+
+DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
+
+DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");
+
+DEFINE_int32(num_levels, 7, "The total number of levels");
+
+DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1");
+
+DEFINE_int32(target_file_size_multiplier, 1,
+             "A multiplier to compute target level-N file size (N >= 2)");
+
+DEFINE_uint64(max_bytes_for_level_base,  10 * 1048576, "Max bytes for level-1");
+
+DEFINE_int32(max_bytes_for_level_multiplier, 10,
+             "A multiplier to compute max bytes for level-N (N >= 2)");
+
+static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v;
+DEFINE_string(max_bytes_for_level_multiplier_additional, "",
+              "A vector that specifies additional fanout per level");
+
+DEFINE_int32(level0_stop_writes_trigger, 12, "Number of files in level-0"
+             " that will trigger put stop.");
+
+DEFINE_int32(level0_slowdown_writes_trigger, 8, "Number of files in level-0"
+             " that will slow down writes.");
+
+DEFINE_int32(level0_file_num_compaction_trigger, 4, "Number of files in level-0"
+             " when compactions start");
+
+static bool ValidateInt32Percent(const char* flagname, int32_t value) {
+  if (value <= 0 || value>=100) {
+    fprintf(stderr, "Invalid value for --%s: %d, 0< pct <100 \n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(readwritepercent, 90, "Ratio of reads to reads/writes (expressed"
+             " as percentage) for the ReadRandomWriteRandom workload. The "
+             "default value 90 means 90% operations out of all reads and writes"
+             " operations are reads. In other words, 9 gets for every 1 put.");
+
+DEFINE_int32(mergereadpercent, 70, "Ratio of merges to merges&reads (expressed"
+             " as percentage) for the ReadRandomMergeRandom workload. The"
+             " default value 70 means 70% out of all read and merge operations"
+             " are merges. In other words, 7 merges for every 3 gets.");
+
+DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/"
+             "deletes (used in RandomWithVerify only). RandomWithVerify "
+             "calculates writepercent as (100 - FLAGS_readwritepercent - "
+             "deletepercent), so deletepercent must be smaller than (100 - "
+             "FLAGS_readwritepercent)");
+
+DEFINE_int32(disable_seek_compaction, false, "Option to disable compaction"
+             " triggered by read.");
+
+DEFINE_uint64(delete_obsolete_files_period_micros, 0, "Option to delete "
+              "obsolete files periodically. 0 means that obsolete files are"
+              " deleted after every compaction run.");
+
+namespace {
+enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
+  assert(ctype);
+
+  if (!strcasecmp(ctype, "none"))
+    return rocksdb::kNoCompression;
+  else if (!strcasecmp(ctype, "snappy"))
+    return rocksdb::kSnappyCompression;
+  else if (!strcasecmp(ctype, "zlib"))
+    return rocksdb::kZlibCompression;
+  else if (!strcasecmp(ctype, "bzip2"))
+    return rocksdb::kBZip2Compression;
+  else if (!strcasecmp(ctype, "lz4"))
+    return rocksdb::kLZ4Compression;
+  else if (!strcasecmp(ctype, "lz4hc"))
+    return rocksdb::kLZ4HCCompression;
+
+  fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
+  return rocksdb::kSnappyCompression; //default value
+}
+}  // namespace
+
+DEFINE_string(compression_type, "snappy",
+              "Algorithm to use to compress the database");
+static enum rocksdb::CompressionType FLAGS_compression_type_e =
+    rocksdb::kSnappyCompression;
+
+DEFINE_int32(compression_level, -1,
+             "Compression level. For zlib this should be -1 for the "
+             "default level, or between 0 and 9.");
+
+static bool ValidateCompressionLevel(const char* flagname, int32_t value) {
+  if (value < -1 || value > 9) {
+    fprintf(stderr, "Invalid value for --%s: %d, must be between -1 and 9\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+
+static const bool FLAGS_compression_level_dummy __attribute__((unused)) =
+    google::RegisterFlagValidator(&FLAGS_compression_level,
+                                  &ValidateCompressionLevel);
+
+DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts"
+             " from this level. Levels with number < min_level_to_compress are"
+             " not compressed. Otherwise, apply compression_type to "
+             "all levels.");
+
+static bool ValidateTableCacheNumshardbits(const char* flagname,
+                                           int32_t value) {
+  if (0 >= value || value > 20) {
+    fprintf(stderr, "Invalid value for --%s: %d, must be  0 < val <= 20\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(table_cache_numshardbits, 4, "");
+
+DEFINE_string(hdfs, "", "Name of hdfs environment");
+// posix or hdfs environment
+static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();
+
+DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when "
+             "this is greater than zero. When 0 the interval grows over time.");
+
+DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when"
+             " this is greater than 0.");
+
+DEFINE_int32(perf_level, 0, "Level of perf collection");
+
+static bool ValidateRateLimit(const char* flagname, double value) {
+  static constexpr double EPSILON = 1e-10;
+  if ( value < -EPSILON ) {
+    fprintf(stderr, "Invalid value for --%s: %12.6f, must be >= 0.0\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_double(soft_rate_limit, 0.0, "");
+
+DEFINE_double(hard_rate_limit, 0.0, "When not equal to 0 this make threads "
+              "sleep at each stats reporting interval until the compaction"
+              " score for all levels is less than or equal to this value.");
+
+DEFINE_int32(rate_limit_delay_max_milliseconds, 1000,
+             "When hard_rate_limit is set then this is the max time a put will"
+             " be stalled.");
+
+DEFINE_int32(max_grandparent_overlap_factor, 10, "Control maximum bytes of "
+             "overlaps in grandparent (i.e., level+2) before we stop building a"
+             " single file in a level->level+1 compaction.");
+
+DEFINE_bool(readonly, false, "Run read only benchmarks.");
+
+DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions");
+
+DEFINE_int32(source_compaction_factor, 1, "Cap the size of data in level-K for"
+             " a compaction run that compacts Level-K with Level-(K+1) (for"
+             " K >= 1)");
+
+DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds.");
+DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files"
+              " in MB.");
+
+DEFINE_bool(bufferedio, rocksdb::EnvOptions().use_os_buffer,
+            "Allow buffered io using OS buffers");
+
+DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads,
+            "Allow reads to occur via mmap-ing files");
+
+DEFINE_bool(mmap_write, rocksdb::EnvOptions().use_mmap_writes,
+            "Allow writes to occur via mmap-ing files");
+
+DEFINE_bool(advise_random_on_open, rocksdb::Options().advise_random_on_open,
+            "Advise random access on table file open");
+
+DEFINE_string(compaction_fadvice, "NORMAL",
+              "Access pattern advice when a file is compacted");
+static auto FLAGS_compaction_fadvice_e =
+  rocksdb::Options().access_hint_on_compaction_start;
+
+DEFINE_bool(use_tailing_iterator, false,
+            "Use tailing iterator to access a series of keys instead of get");
+
+DEFINE_bool(use_adaptive_mutex, rocksdb::Options().use_adaptive_mutex,
+            "Use adaptive mutex");
+
+DEFINE_uint64(bytes_per_sync,  rocksdb::Options().bytes_per_sync,
+              "Allows OS to incrementally sync files to disk while they are"
+              " being written, in the background. Issue one request for every"
+              " bytes_per_sync written. 0 turns it off.");
+DEFINE_bool(filter_deletes, false, " On true, deletes use bloom-filter and drop"
+            " the delete if key not present");
+
+DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge"
+             " operations on a key in the memtable");
+
+static bool ValidatePrefixSize(const char* flagname, int32_t value) {
+  if (value < 0 || value>=2000000000) {
+    fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(prefix_size, 0, "control the prefix size for HashSkipList and "
+             "plain table");
+DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated "
+             "per prefix, 0 means no special handling of the prefix, "
+             "i.e. use the prefix comes with the generated random number.");
+
+enum RepFactory {
+  kSkipList,
+  kPrefixHash,
+  kVectorRep,
+  kHashLinkedList,
+  kCuckoo
+};
+
+namespace {
+enum RepFactory StringToRepFactory(const char* ctype) {
+  assert(ctype);
+
+  if (!strcasecmp(ctype, "skip_list"))
+    return kSkipList;
+  else if (!strcasecmp(ctype, "prefix_hash"))
+    return kPrefixHash;
+  else if (!strcasecmp(ctype, "vector"))
+    return kVectorRep;
+  else if (!strcasecmp(ctype, "hash_linkedlist"))
+    return kHashLinkedList;
+  else if (!strcasecmp(ctype, "cuckoo"))
+    return kCuckoo;
+
+  fprintf(stdout, "Cannot parse memreptable %s\n", ctype);
+  return kSkipList;
+}
+}  // namespace
+
+static enum RepFactory FLAGS_rep_factory;
+DEFINE_string(memtablerep, "skip_list", "");
+DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count");
+DEFINE_bool(use_plain_table, false, "if use plain table "
+            "instead of block-based table format");
+
+DEFINE_string(merge_operator, "", "The merge operator to use with the database. "
+              "If a new merge operator is specified, be sure to use a fresh"
+              " database. The possible merge operators are defined in"
+              " utilities/merge_operators.h");
+
+static const bool FLAGS_soft_rate_limit_dummy __attribute__((unused)) =
+  google::RegisterFlagValidator(&FLAGS_soft_rate_limit,
+                                &ValidateRateLimit);
+
+static const bool FLAGS_hard_rate_limit_dummy __attribute__((unused)) =
+  google::RegisterFlagValidator(&FLAGS_hard_rate_limit, &ValidateRateLimit);
+
+static const bool FLAGS_prefix_size_dummy __attribute__((unused)) =
+  google::RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
+
+static const bool FLAGS_key_size_dummy __attribute__((unused)) =
+  google::RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);
+
+static const bool FLAGS_cache_numshardbits_dummy __attribute__((unused)) =
+  google::RegisterFlagValidator(&FLAGS_cache_numshardbits,
+                                &ValidateCacheNumshardbits);
+
+static const bool FLAGS_readwritepercent_dummy __attribute__((unused)) =
+  google::RegisterFlagValidator(&FLAGS_readwritepercent,
+                                &ValidateInt32Percent);
+
+static const bool FLAGS_deletepercent_dummy __attribute__((unused)) =
+  google::RegisterFlagValidator(&FLAGS_deletepercent,
+                                &ValidateInt32Percent);
+static const bool
+  FLAGS_table_cache_numshardbits_dummy __attribute__((unused)) =
+  google::RegisterFlagValidator(&FLAGS_table_cache_numshardbits,
+                                &ValidateTableCacheNumshardbits);
+
+namespace rocksdb {
+
+// Helper for quickly generating random data.
+class RandomGenerator {
+ private:
+  std::string data_;
+  unsigned int pos_;
+
+ public:
+  RandomGenerator() {
+    // We use a limited amount of data over and over again and ensure
+    // that it is larger than the compression window (32KB), and also
+    // large enough to serve all typical value sizes we want to write.
+    Random rnd(301);
+    std::string piece;
+    while (data_.size() < (unsigned)std::max(1048576, FLAGS_value_size)) {
+      // Add a short fragment that is as compressible as specified
+      // by FLAGS_compression_ratio.
+      test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
+      data_.append(piece);
+    }
+    pos_ = 0;
+  }
+
+  Slice Generate(unsigned int len) {
+    if (pos_ + len > data_.size()) {
+      pos_ = 0;
+      assert(len < data_.size());
+    }
+    pos_ += len;
+    return Slice(data_.data() + pos_ - len, len);
+  }
+};
+
+static void AppendWithSpace(std::string* str, Slice msg) {
+  if (msg.empty()) return;
+  if (!str->empty()) {
+    str->push_back(' ');
+  }
+  str->append(msg.data(), msg.size());
+}
+
+class Stats {
+ private:
+  int id_;
+  double start_;
+  double finish_;
+  double seconds_;
+  int64_t done_;
+  int64_t last_report_done_;
+  int64_t next_report_;
+  int64_t bytes_;
+  double last_op_finish_;
+  double last_report_finish_;
+  HistogramImpl hist_;
+  std::string message_;
+  bool exclude_from_merge_;
+
+ public:
+  Stats() { Start(-1); }
+
+  void Start(int id) {
+    id_ = id;
+    next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
+    last_op_finish_ = start_;
+    hist_.Clear();
+    done_ = 0;
+    last_report_done_ = 0;
+    bytes_ = 0;
+    seconds_ = 0;
+    start_ = FLAGS_env->NowMicros();
+    finish_ = start_;
+    last_report_finish_ = start_;
+    message_.clear();
+    // When set, stats from this thread won't be merged with others.
+    exclude_from_merge_ = false;
+  }
+
+  void Merge(const Stats& other) {
+    if (other.exclude_from_merge_)
+      return;
+
+    hist_.Merge(other.hist_);
+    done_ += other.done_;
+    bytes_ += other.bytes_;
+    seconds_ += other.seconds_;
+    if (other.start_ < start_) start_ = other.start_;
+    if (other.finish_ > finish_) finish_ = other.finish_;
+
+    // Just keep the messages from one thread
+    if (message_.empty()) message_ = other.message_;
+  }
+
+  void Stop() {
+    finish_ = FLAGS_env->NowMicros();
+    seconds_ = (finish_ - start_) * 1e-6;
+  }
+
+  void AddMessage(Slice msg) {
+    AppendWithSpace(&message_, msg);
+  }
+
+  void SetId(int id) { id_ = id; }
+  void SetExcludeFromMerge() { exclude_from_merge_ = true; }
+
  // Account one completed operation and emit progress output.
  // With --histogram, per-op latency is recorded and ops slower than 20ms
  // are flagged.  Without --stats_interval, a one-line progress counter is
  // printed at geometrically growing checkpoints; with it, a per-interval
  // throughput line (and optionally "rocksdb.stats" from `db`) is printed
  // every FLAGS_stats_interval ops.  `db` may be nullptr for non-DB
  // benchmarks (e.g. crc32c).
  void FinishedSingleOp(DB* db) {
    if (FLAGS_histogram) {
      double now = FLAGS_env->NowMicros();
      double micros = now - last_op_finish_;
      hist_.Add(micros);
      // Flag unusually slow ops unless interval reporting will show them.
      if (micros > 20000 && !FLAGS_stats_interval) {
        fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
        fflush(stderr);
      }
      last_op_finish_ = now;
    }

    done_++;
    if (done_ >= next_report_) {
      if (!FLAGS_stats_interval) {
        // Growing report spacing: frequent early, sparse later.
        if      (next_report_ < 1000)   next_report_ += 100;
        else if (next_report_ < 5000)   next_report_ += 500;
        else if (next_report_ < 10000)  next_report_ += 1000;
        else if (next_report_ < 50000)  next_report_ += 5000;
        else if (next_report_ < 100000) next_report_ += 10000;
        else if (next_report_ < 500000) next_report_ += 50000;
        else                            next_report_ += 100000;
        fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, "");
        fflush(stderr);
      } else {
        // Interval mode: report (interval, cumulative) op counts and rates.
        double now = FLAGS_env->NowMicros();
        fprintf(stderr,
                "%s ... thread %d: (%" PRIu64 ",%" PRIu64 ") ops and "
                "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
                FLAGS_env->TimeToString((uint64_t) now/1000000).c_str(),
                id_,
                done_ - last_report_done_, done_,
                (done_ - last_report_done_) /
                ((now - last_report_finish_) / 1000000.0),
                done_ / ((now - start_) / 1000000.0),
                (now - last_report_finish_) / 1000000.0,
                (now - start_) / 1000000.0);

        if (FLAGS_stats_per_interval) {
          std::string stats;
          if (db && db->GetProperty("rocksdb.stats", &stats))
            fprintf(stderr, "%s\n", stats.c_str());
        }

        fflush(stderr);
        next_report_ += FLAGS_stats_interval;
        last_report_finish_ = now;
        last_report_done_ = done_;
      }
    }
  }
+
  // Account n bytes of payload; feeds the MB/s figure printed by Report().
  void AddBytes(int64_t n) {
    bytes_ += n;
  }
+
+  void Report(const Slice& name) {
+    // Pretend at least one op was done in case we are running a benchmark
+    // that does not call FinishedSingleOp().
+    if (done_ < 1) done_ = 1;
+
+    std::string extra;
+    if (bytes_ > 0) {
+      // Rate is computed on actual elapsed time, not the sum of per-thread
+      // elapsed times.
+      double elapsed = (finish_ - start_) * 1e-6;
+      char rate[100];
+      snprintf(rate, sizeof(rate), "%6.1f MB/s",
+               (bytes_ / 1048576.0) / elapsed);
+      extra = rate;
+    }
+    AppendWithSpace(&extra, message_);
+    double elapsed = (finish_ - start_) * 1e-6;
+    double throughput = (double)done_/elapsed;
+
+    fprintf(stdout, "%-12s : %11.3f micros/op %ld ops/sec;%s%s\n",
+            name.ToString().c_str(),
+            elapsed * 1e6 / done_,
+            (long)throughput,
+            (extra.empty() ? "" : " "),
+            extra.c_str());
+    if (FLAGS_histogram) {
+      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
+    }
+    fflush(stdout);
+  }
+};
+
+// State shared by all concurrent executions of the same benchmark.
+struct SharedState {
+  port::Mutex mu;
+  port::CondVar cv;
+  int total;
+  int perf_level;
+
+  // Each thread goes through the following states:
+  //    (1) initializing
+  //    (2) waiting for others to be initialized
+  //    (3) running
+  //    (4) done
+
+  long num_initialized;
+  long num_done;
+  bool start;
+
+  SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { }
+};
+
+// Per-thread state for concurrent executions of the same benchmark.
+struct ThreadState {
+  int tid;             // 0..n-1 when running in n threads
+  Random64 rand;         // Has different seeds for different threads
+  Stats stats;
+  SharedState* shared;
+
+  /* implicit */ ThreadState(int index)
+      : tid(index),
+        rand((FLAGS_seed ? FLAGS_seed : 1000) + index) {
+  }
+};
+
+class Duration {
+ public:
+  Duration(int max_seconds, int64_t max_ops) {
+    max_seconds_ = max_seconds;
+    max_ops_= max_ops;
+    ops_ = 0;
+    start_at_ = FLAGS_env->NowMicros();
+  }
+
+  bool Done(int64_t increment) {
+    if (increment <= 0) increment = 1;    // avoid Done(0) and infinite loops
+    ops_ += increment;
+
+    if (max_seconds_) {
+      // Recheck every appx 1000 ops (exact iff increment is factor of 1000)
+      if ((ops_/1000) != ((ops_-increment)/1000)) {
+        double now = FLAGS_env->NowMicros();
+        return ((now - start_at_) / 1000000.0) >= max_seconds_;
+      } else {
+        return false;
+      }
+    } else {
+      return ops_ > max_ops_;
+    }
+  }
+
+ private:
+  int max_seconds_;
+  int64_t max_ops_;
+  int64_t ops_;
+  double start_at_;
+};
+
class Benchmark {
 private:
  shared_ptr<Cache> cache_;              // block cache shared by all DBs
  shared_ptr<Cache> compressed_cache_;   // cache for compressed blocks
  const FilterPolicy* filter_policy_;    // bloom filter policy (owned)
  const SliceTransform* prefix_extractor_;  // fixed-prefix extractor (owned)
  DB* db_;                               // single-DB mode handle (owned)
  std::vector<DB*> multi_dbs_;           // handles when running multiple DBs
  int64_t num_;            // entries per benchmark (re-sanitized each run)
  int value_size_;         // bytes per value
  int key_size_;           // bytes per key
  int prefix_size_;        // bytes of key used as prefix
  int64_t keys_per_prefix_;   // keys sharing each prefix; 0 = no prefix
  int64_t entries_per_batch_; // WriteBatch size (1 except batch benchmarks)
  WriteOptions write_options_;  // sync/WAL settings applied to writes
  int64_t reads_;          // read ops to perform (defaults to num_)
  int64_t writes_;         // write ops to perform (defaults to num_)
  int64_t readwrites_;     // ops for mixed read/write benchmarks
  int64_t merge_keys_;     // distinct keys for merge benchmarks
  // Print the run configuration banner: key/value sizes, entry counts,
  // estimated raw/compressed data size, compression type, memtable rep
  // and perf level, followed by build warnings.
  void PrintHeader() {
    PrintEnvironment();
    fprintf(stdout, "Keys:       %d bytes each\n", FLAGS_key_size);
    fprintf(stdout, "Values:     %d bytes each (%d bytes after compression)\n",
            FLAGS_value_size,
            static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5));
    fprintf(stdout, "Entries:    %" PRIu64 "\n", num_);
    fprintf(stdout, "Prefix:    %d bytes\n", FLAGS_prefix_size);
    fprintf(stdout, "Keys per prefix:    %" PRIu64 "\n", keys_per_prefix_);
    fprintf(stdout, "RawSize:    %.1f MB (estimated)\n",
            ((static_cast<int64_t>(FLAGS_key_size + FLAGS_value_size) * num_)
             / 1048576.0));
    // FileSize estimate assumes only values compress (by compression_ratio).
    fprintf(stdout, "FileSize:   %.1f MB (estimated)\n",
            (((FLAGS_key_size + FLAGS_value_size * FLAGS_compression_ratio)
              * num_)
             / 1048576.0));
    fprintf(stdout, "Write rate limit: %d\n", FLAGS_writes_per_second);
    switch (FLAGS_compression_type_e) {
      case rocksdb::kNoCompression:
        fprintf(stdout, "Compression: none\n");
        break;
      case rocksdb::kSnappyCompression:
        fprintf(stdout, "Compression: snappy\n");
        break;
      case rocksdb::kZlibCompression:
        fprintf(stdout, "Compression: zlib\n");
        break;
      case rocksdb::kBZip2Compression:
        fprintf(stdout, "Compression: bzip2\n");
        break;
      case rocksdb::kLZ4Compression:
        fprintf(stdout, "Compression: lz4\n");
        break;
      case rocksdb::kLZ4HCCompression:
        fprintf(stdout, "Compression: lz4hc\n");
        break;
    }

    switch (FLAGS_rep_factory) {
      case kPrefixHash:
        fprintf(stdout, "Memtablerep: prefix_hash\n");
        break;
      case kSkipList:
        fprintf(stdout, "Memtablerep: skip_list\n");
        break;
      case kVectorRep:
        fprintf(stdout, "Memtablerep: vector\n");
        break;
      case kHashLinkedList:
        fprintf(stdout, "Memtablerep: hash_linkedlist\n");
        break;
      case kCuckoo:
        fprintf(stdout, "Memtablerep: cuckoo\n");
        break;
    }
    fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);

    PrintWarnings();
    fprintf(stdout, "------------------------------------------------\n");
  }
+
+  void PrintWarnings() {
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+    fprintf(stdout,
+            "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
+            );
+#endif
+#ifndef NDEBUG
+    fprintf(stdout,
+            "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
+    if (FLAGS_compression_type_e != rocksdb::kNoCompression) {
+      // The test string should not be too small.
+      const int len = FLAGS_block_size;
+      char* text = (char*) malloc(len+1);
+      bool result = true;
+      const char* name = nullptr;
+      std::string compressed;
+
+      memset(text, (int) 'y', len);
+      text[len] = '\0';
+      switch (FLAGS_compression_type_e) {
+        case kSnappyCompression:
+          result = port::Snappy_Compress(Options().compression_opts, text,
+                                         strlen(text), &compressed);
+          name = "Snappy";
+          break;
+        case kZlibCompression:
+          result = port::Zlib_Compress(Options().compression_opts, text,
+                                       strlen(text), &compressed);
+          name = "Zlib";
+          break;
+        case kBZip2Compression:
+          result = port::BZip2_Compress(Options().compression_opts, text,
+                                        strlen(text), &compressed);
+          name = "BZip2";
+          break;
+        case kLZ4Compression:
+          result = port::LZ4_Compress(Options().compression_opts, text,
+                                      strlen(text), &compressed);
+          name = "LZ4";
+          break;
+        case kLZ4HCCompression:
+          result = port::LZ4HC_Compress(Options().compression_opts, text,
+                                        strlen(text), &compressed);
+          name = "LZ4HC";
+          break;
+        case kNoCompression:
+          assert(false); // cannot happen
+          break;
+      }
+
+      if (!result) {
+        fprintf(stdout, "WARNING: %s compression is not enabled\n", name);
+      } else if (name && compressed.size() >= strlen(text)) {
+        fprintf(stdout, "WARNING: %s compression is not effective\n", name);
+      }
+
+      free(text);
+    }
+  }
+
+// Current the following isn't equivalent to OS_LINUX.
+#if defined(__linux)
+  static Slice TrimSpace(Slice s) {
+    unsigned int start = 0;
+    while (start < s.size() && isspace(s[start])) {
+      start++;
+    }
+    unsigned int limit = s.size();
+    while (limit > start && isspace(s[limit-1])) {
+      limit--;
+    }
+    return Slice(s.data() + start, limit - start);
+  }
+#endif
+
  // Print version, date and (on Linux) CPU model/count and cache size
  // parsed from /proc/cpuinfo.  All output goes to stderr so it does not
  // mix with the benchmark results on stdout.
  void PrintEnvironment() {
    fprintf(stderr, "LevelDB:    version %d.%d\n",
            kMajorVersion, kMinorVersion);

#if defined(__linux)
    time_t now = time(nullptr);
    fprintf(stderr, "Date:       %s", ctime(&now));  // ctime() adds newline

    FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
    if (cpuinfo != nullptr) {
      char line[1000];
      int num_cpus = 0;
      std::string cpu_type;
      std::string cache_size;
      while (fgets(line, sizeof(line), cpuinfo) != nullptr) {
        const char* sep = strchr(line, ':');
        if (sep == nullptr) {
          continue;
        }
        // Key is the text before ':' minus its final character (typically
        // padding whitespace); TrimSpace removes the rest on both sides.
        Slice key = TrimSpace(Slice(line, sep - 1 - line));
        Slice val = TrimSpace(Slice(sep + 1));
        if (key == "model name") {
          // One "model name" line per logical CPU.
          ++num_cpus;
          cpu_type = val.ToString();
        } else if (key == "cache size") {
          cache_size = val.ToString();
        }
      }
      fclose(cpuinfo);
      fprintf(stderr, "CPU:        %d * %s\n", num_cpus, cpu_type.c_str());
      fprintf(stderr, "CPUCache:   %s\n", cache_size.c_str());
    }
#endif
  }
+
+ public:
  // Build caches, filter policy and prefix extractor from the command-line
  // flags, derive op counts (reads/writes default to --num), then clean
  // stale heap profiles and, unless --use_existing_db, destroy any
  // existing database at --db.
  Benchmark()
  : cache_(FLAGS_cache_size >= 0 ?
           (FLAGS_cache_numshardbits >= 1 ?
            NewLRUCache(FLAGS_cache_size, FLAGS_cache_numshardbits,
                        FLAGS_cache_remove_scan_count_limit) :
            NewLRUCache(FLAGS_cache_size)) : nullptr),
    compressed_cache_(FLAGS_compressed_cache_size >= 0 ?
           (FLAGS_cache_numshardbits >= 1 ?
            NewLRUCache(FLAGS_compressed_cache_size, FLAGS_cache_numshardbits) :
            NewLRUCache(FLAGS_compressed_cache_size)) : nullptr),
    filter_policy_(FLAGS_bloom_bits >= 0
                   ? NewBloomFilterPolicy(FLAGS_bloom_bits)
                   : nullptr),
    prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)),
    db_(nullptr),
    num_(FLAGS_num),
    value_size_(FLAGS_value_size),
    key_size_(FLAGS_key_size),
    prefix_size_(FLAGS_prefix_size),
    keys_per_prefix_(FLAGS_keys_per_prefix),
    entries_per_batch_(1),
    reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
    writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
    // Mixed benchmarks run max(reads, writes) ops unless both defaulted.
    readwrites_((FLAGS_writes < 0  && FLAGS_reads < 0)? FLAGS_num :
                ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)
               ),
    merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys) {
    if (FLAGS_prefix_size > FLAGS_key_size) {
      fprintf(stderr, "prefix size is larger than key size");
      exit(1);
    }

    // Remove leftover heap profiling output from previous runs.
    std::vector<std::string> files;
    FLAGS_env->GetChildren(FLAGS_db, &files);
    for (unsigned int i = 0; i < files.size(); i++) {
      if (Slice(files[i]).starts_with("heap-")) {
        FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
      }
    }
    if (!FLAGS_use_existing_db) {
      DestroyDB(FLAGS_db, Options());
    }
  }
+
+  ~Benchmark() {
+    delete db_;
+    delete filter_policy_;
+    delete prefix_extractor_;
+  }
+
  // Allocate a key_size_-byte scratch buffer wrapped in a Slice for
  // GenerateKeyFromInt() to fill.  NOTE: the buffer is heap-allocated and
  // not freed by this class; callers keep it for the benchmark's lifetime.
  Slice AllocateKey() {
    return Slice(new char[key_size_], key_size_);
  }
+
  // Generate key according to the given specification and random number.
  // The resulting key will have the following format (if keys_per_prefix_
  // is positive), extra trailing bytes are either cut off or paddd with '0'.
  // The prefix value is derived from key value.
  //   ----------------------------
  //   | prefix 00000 | key 00000 |
  //   ----------------------------
  // If keys_per_prefix_ is 0, the key is simply a binary representation of
  // random number followed by trailing '0's
  //   ----------------------------
  //   |        key 00000         |
  //   ----------------------------
  // `key` must point at a buffer of at least key_size_ bytes (see
  // AllocateKey()); it is overwritten in place.
  void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
    char* start = const_cast<char*>(key->data());
    char* pos = start;
    if (keys_per_prefix_ > 0) {
      int64_t num_prefix = num_keys / keys_per_prefix_;
      int64_t prefix = v % num_prefix;
      // At most 8 meaningful prefix bytes (the int64 payload).
      int bytes_to_fill = std::min(prefix_size_, 8);
      if (port::kLittleEndian) {
        // Write big-endian so lexicographic key order matches numeric order.
        for (int i = 0; i < bytes_to_fill; ++i) {
          pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
        }
      } else {
        // Big-endian host: the leading bytes of the int64 are already the
        // most significant.
        memcpy(pos, static_cast<void*>(&prefix), bytes_to_fill);
      }
      if (prefix_size_ > 8) {
        // fill the rest with 0s
        memset(pos + 8, '0', prefix_size_ - 8);
      }
      pos += prefix_size_;
    }

    // Same big-endian packing for the key payload after the prefix.
    int bytes_to_fill = std::min(key_size_ - static_cast<int>(pos - start), 8);
    if (port::kLittleEndian) {
      for (int i = 0; i < bytes_to_fill; ++i) {
        pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
      }
    } else {
      memcpy(pos, static_cast<void*>(&v), bytes_to_fill);
    }
    pos += bytes_to_fill;
    if (key_size_ > pos - start) {
      // Pad the remainder with ASCII '0'.
      memset(pos, '0', key_size_ - (pos - start));
    }
  }
+
+  std::string GetDbNameForMultiple(std::string base_name, size_t id) {
+    return base_name + std::to_string(id);
+  }
+
  // Main driver: open the DB, then execute each comma-separated entry of
  // --benchmarks in order.  Per-benchmark parameters are re-derived from
  // the flags each iteration, the benchmark name is mapped to a member
  // function pointer, "fresh" benchmarks recreate the DB first, and the
  // chosen method runs on FLAGS_threads (sometimes +1 writer) threads.
  void Run() {
    PrintHeader();
    Open();
    const char* benchmarks = FLAGS_benchmarks.c_str();
    // Walk the comma-separated benchmark list.
    while (benchmarks != nullptr) {
      const char* sep = strchr(benchmarks, ',');
      Slice name;
      if (sep == nullptr) {
        name = benchmarks;
        benchmarks = nullptr;
      } else {
        name = Slice(benchmarks, sep - benchmarks);
        benchmarks = sep + 1;
      }

      // Sanitize parameters
      // (reset each iteration since some benchmarks below modify them).
      num_ = FLAGS_num;
      reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
      writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes);
      value_size_ = FLAGS_value_size;
      key_size_ = FLAGS_key_size;
      entries_per_batch_ = 1;
      write_options_ = WriteOptions();
      if (FLAGS_sync) {
        write_options_.sync = true;
      }
      write_options_.disableWAL = FLAGS_disable_wal;

      // Dispatch: map the benchmark name to a method and whether it needs
      // a freshly-created database.
      void (Benchmark::*method)(ThreadState*) = nullptr;
      bool fresh_db = false;
      int num_threads = FLAGS_threads;

      if (name == Slice("fillseq")) {
        fresh_db = true;
        method = &Benchmark::WriteSeq;
      } else if (name == Slice("fillbatch")) {
        fresh_db = true;
        entries_per_batch_ = 1000;
        method = &Benchmark::WriteSeq;
      } else if (name == Slice("fillrandom")) {
        fresh_db = true;
        method = &Benchmark::WriteRandom;
      } else if (name == Slice("filluniquerandom")) {
        fresh_db = true;
        if (num_threads > 1) {
          fprintf(stderr, "filluniquerandom multithreaded not supported"
                           ", use 1 thread");
          num_threads = 1;
        }
        method = &Benchmark::WriteUniqueRandom;
      } else if (name == Slice("overwrite")) {
        fresh_db = false;
        method = &Benchmark::WriteRandom;
      } else if (name == Slice("fillsync")) {
        fresh_db = true;
        num_ /= 1000;  // synced writes are slow; shrink the workload
        write_options_.sync = true;
        method = &Benchmark::WriteRandom;
      } else if (name == Slice("fill100K")) {
        fresh_db = true;
        num_ /= 1000;
        value_size_ = 100 * 1000;
        method = &Benchmark::WriteRandom;
      } else if (name == Slice("readseq")) {
        method = &Benchmark::ReadSequential;
      } else if (name == Slice("readtocache")) {
        method = &Benchmark::ReadSequential;
        num_threads = 1;
        reads_ = num_;
      } else if (name == Slice("readreverse")) {
        method = &Benchmark::ReadReverse;
      } else if (name == Slice("readrandom")) {
        method = &Benchmark::ReadRandom;
      } else if (name == Slice("multireadrandom")) {
        method = &Benchmark::MultiReadRandom;
      } else if (name == Slice("readmissing")) {
        // Longer keys than were written guarantee misses.
        ++key_size_;
        method = &Benchmark::ReadRandom;
      } else if (name == Slice("newiterator")) {
        method = &Benchmark::IteratorCreation;
      } else if (name == Slice("newiteratorwhilewriting")) {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::IteratorCreationWhileWriting;
      } else if (name == Slice("seekrandom")) {
        method = &Benchmark::SeekRandom;
      } else if (name == Slice("seekrandomwhilewriting")) {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::SeekRandomWhileWriting;
      } else if (name == Slice("readrandomsmall")) {
        reads_ /= 1000;
        method = &Benchmark::ReadRandom;
      } else if (name == Slice("deleteseq")) {
        method = &Benchmark::DeleteSeq;
      } else if (name == Slice("deleterandom")) {
        method = &Benchmark::DeleteRandom;
      } else if (name == Slice("readwhilewriting")) {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::ReadWhileWriting;
      } else if (name == Slice("readrandomwriterandom")) {
        method = &Benchmark::ReadRandomWriteRandom;
      } else if (name == Slice("readrandommergerandom")) {
        if (FLAGS_merge_operator.empty()) {
          fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
                  name.ToString().c_str());
          exit(1);
        }
        method = &Benchmark::ReadRandomMergeRandom;
      } else if (name == Slice("updaterandom")) {
        method = &Benchmark::UpdateRandom;
      } else if (name == Slice("appendrandom")) {
        method = &Benchmark::AppendRandom;
      } else if (name == Slice("mergerandom")) {
        if (FLAGS_merge_operator.empty()) {
          fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
                  name.ToString().c_str());
          exit(1);
        }
        method = &Benchmark::MergeRandom;
      } else if (name == Slice("randomwithverify")) {
        method = &Benchmark::RandomWithVerify;
      } else if (name == Slice("compact")) {
        method = &Benchmark::Compact;
      } else if (name == Slice("crc32c")) {
        method = &Benchmark::Crc32c;
      } else if (name == Slice("xxhash")) {
        method = &Benchmark::xxHash;
      } else if (name == Slice("acquireload")) {
        method = &Benchmark::AcquireLoad;
      } else if (name == Slice("compress")) {
        method = &Benchmark::Compress;
      } else if (name == Slice("uncompress")) {
        method = &Benchmark::Uncompress;
      } else if (name == Slice("stats")) {
        // Stats pseudo-benchmarks print a DB property and run no method.
        PrintStats("rocksdb.stats");
      } else if (name == Slice("levelstats")) {
        PrintStats("rocksdb.levelstats");
      } else if (name == Slice("sstables")) {
        PrintStats("rocksdb.sstables");
      } else {
        if (name != Slice()) {  // No error message for empty name
          fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
          exit(1);
        }
      }

      // Recreate the database(s) for benchmarks that require starting
      // empty, unless the user insisted on keeping the existing DB.
      if (fresh_db) {
        if (FLAGS_use_existing_db) {
          fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
                  name.ToString().c_str());
          method = nullptr;
        } else {
          if (db_ != nullptr) {
            delete db_;
            db_ = nullptr;
            DestroyDB(FLAGS_db, Options());
          }
          for (size_t i = 0; i < multi_dbs_.size(); i++) {
            delete multi_dbs_[i];
            DestroyDB(GetDbNameForMultiple(FLAGS_db, i), Options());
          }
          multi_dbs_.clear();
        }
        Open();
      }

      if (method != nullptr) {
        fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
        RunBenchmark(num_threads, name, method);
      }
    }
    if (FLAGS_statistics) {
     fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
    }
  }
+
+ private:
  // Argument bundle handed to each worker thread via ThreadBody.
  struct ThreadArg {
    Benchmark* bm;       // owning benchmark instance
    SharedState* shared; // rendezvous state shared by all workers
    ThreadState* thread; // this worker's private state/stats
    void (Benchmark::*method)(ThreadState*);  // benchmark to execute
  };
+
  // Worker thread entry point.  Announces initialization, waits for the
  // coordinator to release all workers simultaneously, runs the benchmark
  // method with timing, then announces completion.
  static void ThreadBody(void* v) {
    ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
    SharedState* shared = arg->shared;
    ThreadState* thread = arg->thread;
    {
      MutexLock l(&shared->mu);
      shared->num_initialized++;
      if (shared->num_initialized >= shared->total) {
        // Last thread to initialize wakes the coordinator.
        shared->cv.SignalAll();
      }
      // Block until the coordinator sets start so all threads begin together.
      while (!shared->start) {
        shared->cv.Wait();
      }
    }

    SetPerfLevel(static_cast<PerfLevel> (shared->perf_level));
    thread->stats.Start(thread->tid);
    (arg->bm->*(arg->method))(thread);
    thread->stats.Stop();

    {
      MutexLock l(&shared->mu);
      shared->num_done++;
      if (shared->num_done >= shared->total) {
        // Last finisher wakes the coordinator waiting in RunBenchmark.
        shared->cv.SignalAll();
      }
    }
  }
+
  // Coordinator for one benchmark run: spawn n worker threads, release
  // them simultaneously once all are initialized, wait for completion,
  // then merge and report their stats.
  void RunBenchmark(int n, Slice name,
                    void (Benchmark::*method)(ThreadState*)) {
    SharedState shared;
    shared.total = n;
    shared.num_initialized = 0;
    shared.num_done = 0;
    shared.start = false;

    ThreadArg* arg = new ThreadArg[n];
    for (int i = 0; i < n; i++) {
      arg[i].bm = this;
      arg[i].method = method;
      arg[i].shared = &shared;
      arg[i].thread = new ThreadState(i);
      arg[i].thread->shared = &shared;
      FLAGS_env->StartThread(ThreadBody, &arg[i]);
    }

    // Wait for every worker to reach the starting line...
    shared.mu.Lock();
    while (shared.num_initialized < n) {
      shared.cv.Wait();
    }

    // ...then release them all at once and wait for all to finish.
    shared.start = true;
    shared.cv.SignalAll();
    while (shared.num_done < n) {
      shared.cv.Wait();
    }
    shared.mu.Unlock();

    // Stats for some threads can be excluded.
    Stats merge_stats;
    for (int i = 0; i < n; i++) {
      merge_stats.Merge(arg[i].thread->stats);
    }
    merge_stats.Report(name);

    for (int i = 0; i < n; i++) {
      delete arg[i].thread;
    }
    delete[] arg;
  }
+
+  void Crc32c(ThreadState* thread) {
+    // Checksum about 500MB of data total
+    const int size = 4096;
+    const char* label = "(4K per op)";
+    std::string data(size, 'x');
+    int64_t bytes = 0;
+    uint32_t crc = 0;
+    while (bytes < 500 * 1048576) {
+      crc = crc32c::Value(data.data(), size);
+      thread->stats.FinishedSingleOp(nullptr);
+      bytes += size;
+    }
+    // Print so result is not dead
+    fprintf(stderr, "... crc=0x%x\r", static_cast<unsigned int>(crc));
+
+    thread->stats.AddBytes(bytes);
+    thread->stats.AddMessage(label);
+  }
+
+  void xxHash(ThreadState* thread) {
+    // Checksum about 500MB of data total
+    const int size = 4096;
+    const char* label = "(4K per op)";
+    std::string data(size, 'x');
+    int64_t bytes = 0;
+    unsigned int xxh32 = 0;
+    while (bytes < 500 * 1048576) {
+      xxh32 = XXH32(data.data(), size, 0);
+      thread->stats.FinishedSingleOp(nullptr);
+      bytes += size;
+    }
+    // Print so result is not dead
+    fprintf(stderr, "... xxh32=0x%x\r", static_cast<unsigned int>(xxh32));
+
+    thread->stats.AddBytes(bytes);
+    thread->stats.AddMessage(label);
+  }
+
  // Benchmark the cost of an acquire-ordered atomic load: 100k ops of
  // 1000 Acquire_Load calls each on a dummy pointer.
  void AcquireLoad(ThreadState* thread) {
    int dummy;
    port::AtomicPointer ap(&dummy);
    int count = 0;
    void *ptr = nullptr;
    thread->stats.AddMessage("(each op is 1000 loads)");
    while (count < 100000) {
      for (int i = 0; i < 1000; i++) {
        ptr = ap.Acquire_Load();
      }
      count++;
      thread->stats.FinishedSingleOp(nullptr);
    }
    if (ptr == nullptr) exit(1); // Disable unused variable warning.
  }
+
  // Benchmark the configured compression codec: compress the same
  // block_size-sized pseudo-random input repeatedly until 1GB has been
  // consumed, then report the achieved compression ratio.
  void Compress(ThreadState *thread) {
    RandomGenerator gen;
    Slice input = gen.Generate(Options().block_size);
    int64_t bytes = 0;
    int64_t produced = 0;
    bool ok = true;
    std::string compressed;

    // Compress 1G
    while (ok && bytes < int64_t(1) << 30) {
      switch (FLAGS_compression_type_e) {
      case rocksdb::kSnappyCompression:
        ok = port::Snappy_Compress(Options().compression_opts, input.data(),
                                   input.size(), &compressed);
        break;
      case rocksdb::kZlibCompression:
        ok = port::Zlib_Compress(Options().compression_opts, input.data(),
                                 input.size(), &compressed);
        break;
      case rocksdb::kBZip2Compression:
        ok = port::BZip2_Compress(Options().compression_opts, input.data(),
                                  input.size(), &compressed);
        break;
      case rocksdb::kLZ4Compression:
        ok = port::LZ4_Compress(Options().compression_opts, input.data(),
                                input.size(), &compressed);
        break;
      case rocksdb::kLZ4HCCompression:
        ok = port::LZ4HC_Compress(Options().compression_opts, input.data(),
                                  input.size(), &compressed);
        break;
      default:
        // kNoCompression (or unknown): abort the loop via !ok.
        ok = false;
      }
      produced += compressed.size();
      bytes += input.size();
      thread->stats.FinishedSingleOp(nullptr);
    }

    if (!ok) {
      thread->stats.AddMessage("(compression failure)");
    } else {
      // Report output size as a percentage of input.
      char buf[100];
      snprintf(buf, sizeof(buf), "(output: %.1f%%)",
               (produced * 100.0) / bytes);
      thread->stats.AddMessage(buf);
      thread->stats.AddBytes(bytes);
    }
  }
+
  // Benchmark decompression: compress one block_size-sized input once,
  // then repeatedly decompress it until 1GB (of original data) has been
  // processed.  A fresh output buffer is allocated per op to keep the
  // comparison fair across codecs.
  void Uncompress(ThreadState *thread) {
    RandomGenerator gen;
    Slice input = gen.Generate(Options().block_size);
    std::string compressed;

    // One-time compression of the test block with the configured codec.
    bool ok;
    switch (FLAGS_compression_type_e) {
    case rocksdb::kSnappyCompression:
      ok = port::Snappy_Compress(Options().compression_opts, input.data(),
                                 input.size(), &compressed);
      break;
    case rocksdb::kZlibCompression:
      ok = port::Zlib_Compress(Options().compression_opts, input.data(),
                               input.size(), &compressed);
      break;
    case rocksdb::kBZip2Compression:
      ok = port::BZip2_Compress(Options().compression_opts, input.data(),
                                input.size(), &compressed);
      break;
    case rocksdb::kLZ4Compression:
      ok = port::LZ4_Compress(Options().compression_opts, input.data(),
                              input.size(), &compressed);
      break;
    case rocksdb::kLZ4HCCompression:
      ok = port::LZ4HC_Compress(Options().compression_opts, input.data(),
                                input.size(), &compressed);
      break;
    default:
      ok = false;
    }

    int64_t bytes = 0;
    int decompress_size;
    while (ok && bytes < 1024 * 1048576) {
      char *uncompressed = nullptr;
      switch (FLAGS_compression_type_e) {
      case rocksdb::kSnappyCompression:
        // allocate here to make comparison fair
        uncompressed = new char[input.size()];
        ok = port::Snappy_Uncompress(compressed.data(), compressed.size(),
                                     uncompressed);
        break;
      case rocksdb::kZlibCompression:
        uncompressed = port::Zlib_Uncompress(
            compressed.data(), compressed.size(), &decompress_size);
        ok = uncompressed != nullptr;
        break;
      case rocksdb::kBZip2Compression:
        uncompressed = port::BZip2_Uncompress(
            compressed.data(), compressed.size(), &decompress_size);
        ok = uncompressed != nullptr;
        break;
      case rocksdb::kLZ4Compression:
        uncompressed = port::LZ4_Uncompress(
            compressed.data(), compressed.size(), &decompress_size);
        ok = uncompressed != nullptr;
        break;
      case rocksdb::kLZ4HCCompression:
        // LZ4HC output decompresses with the regular LZ4 decompressor.
        uncompressed = port::LZ4_Uncompress(
            compressed.data(), compressed.size(), &decompress_size);
        ok = uncompressed != nullptr;
        break;
      default:
        ok = false;
      }
      delete[] uncompressed;
      bytes += input.size();
      thread->stats.FinishedSingleOp(nullptr);
    }

    if (!ok) {
      thread->stats.AddMessage("(compression failure)");
    } else {
      thread->stats.AddBytes(bytes);
    }
  }
+
+  void Open() {
+    assert(db_ == nullptr);
+    Options options;
+    options.create_if_missing = !FLAGS_use_existing_db;
+    options.block_cache = cache_;
+    options.block_cache_compressed = compressed_cache_;
+    if (cache_ == nullptr) {
+      options.no_block_cache = true;
+    }
+    options.write_buffer_size = FLAGS_write_buffer_size;
+    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+    options.min_write_buffer_number_to_merge =
+      FLAGS_min_write_buffer_number_to_merge;
+    options.max_background_compactions = FLAGS_max_background_compactions;
+    options.max_background_flushes = FLAGS_max_background_flushes;
+    options.compaction_style = FLAGS_compaction_style_e;
+    options.block_size = FLAGS_block_size;
+    options.filter_policy = filter_policy_;
+    if (FLAGS_use_plain_table) {
+      options.prefix_extractor.reset(
+          NewFixedPrefixTransform(FLAGS_prefix_size));
+    }
+    options.memtable_prefix_bloom_bits = FLAGS_memtable_bloom_bits;
+    options.bloom_locality = FLAGS_bloom_locality;
+    options.max_open_files = FLAGS_open_files;
+    options.statistics = dbstats;
+    options.env = FLAGS_env;
+    options.disableDataSync = FLAGS_disable_data_sync;
+    options.use_fsync = FLAGS_use_fsync;
+    options.wal_dir = FLAGS_wal_dir;
+    options.num_levels = FLAGS_num_levels;
+    options.target_file_size_base = FLAGS_target_file_size_base;
+    options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
+    options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
+    options.max_bytes_for_level_multiplier =
+        FLAGS_max_bytes_for_level_multiplier;
+    options.filter_deletes = FLAGS_filter_deletes;
+    if ((FLAGS_prefix_size == 0) && (FLAGS_rep_factory == kPrefixHash ||
+                                     FLAGS_rep_factory == kHashLinkedList)) {
+      fprintf(stderr, "prefix_size should be non-zero if PrefixHash or "
+                      "HashLinkedList memtablerep is used\n");
+      exit(1);
+    }
+    switch (FLAGS_rep_factory) {
+      case kPrefixHash:
+        options.memtable_factory.reset(NewHashSkipListRepFactory(
+            FLAGS_hash_bucket_count));
+        break;
+      case kSkipList:
+        // no need to do anything
+        break;
+      case kHashLinkedList:
+        options.memtable_factory.reset(NewHashLinkListRepFactory(
+            FLAGS_hash_bucket_count));
+        break;
+      case kVectorRep:
+        options.memtable_factory.reset(
+          new VectorRepFactory
+        );
+        break;
+      case kCuckoo:
+        options.memtable_factory.reset(NewHashCuckooRepFactory(
+            options.write_buffer_size, FLAGS_key_size + FLAGS_value_size));
+        break;
+    }
+    if (FLAGS_use_plain_table) {
+      if (FLAGS_rep_factory != kPrefixHash &&
+          FLAGS_rep_factory != kHashLinkedList) {
+        fprintf(stderr, "Waring: plain table is used with skipList\n");
+      }
+      if (!FLAGS_mmap_read && !FLAGS_mmap_write) {
+        fprintf(stderr, "plain table format requires mmap to operate\n");
+        exit(1);
+      }
+
+      int bloom_bits_per_key = FLAGS_bloom_bits;
+      if (bloom_bits_per_key < 0) {
+        bloom_bits_per_key = 0;
+      }
+      options.table_factory = std::shared_ptr<TableFactory>(
+          NewPlainTableFactory(FLAGS_key_size, bloom_bits_per_key, 0.75));
+    }
+    if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) {
+      if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() !=
+          (unsigned int)FLAGS_num_levels) {
+        fprintf(stderr, "Insufficient number of fanouts specified %d\n",
+                (int)FLAGS_max_bytes_for_level_multiplier_additional_v.size());
+        exit(1);
+      }
+      options.max_bytes_for_level_multiplier_additional =
+        FLAGS_max_bytes_for_level_multiplier_additional_v;
+    }
+    options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
+    options.level0_file_num_compaction_trigger =
+        FLAGS_level0_file_num_compaction_trigger;
+    options.level0_slowdown_writes_trigger =
+      FLAGS_level0_slowdown_writes_trigger;
+    options.compression = FLAGS_compression_type_e;
+    options.compression_opts.level = FLAGS_compression_level;
+    options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
+    options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
+    if (FLAGS_min_level_to_compress >= 0) {
+      assert(FLAGS_min_level_to_compress <= FLAGS_num_levels);
+      options.compression_per_level.resize(FLAGS_num_levels);
+      for (int i = 0; i < FLAGS_min_level_to_compress; i++) {
+        options.compression_per_level[i] = kNoCompression;
+      }
+      for (int i = FLAGS_min_level_to_compress;
+           i < FLAGS_num_levels; i++) {
+        options.compression_per_level[i] = FLAGS_compression_type_e;
+      }
+    }
+    options.disable_seek_compaction = FLAGS_disable_seek_compaction;
+    options.delete_obsolete_files_period_micros =
+      FLAGS_delete_obsolete_files_period_micros;
+    options.soft_rate_limit = FLAGS_soft_rate_limit;
+    options.hard_rate_limit = FLAGS_hard_rate_limit;
+    options.rate_limit_delay_max_milliseconds =
+      FLAGS_rate_limit_delay_max_milliseconds;
+    options.table_cache_numshardbits = FLAGS_table_cache_numshardbits;
+    options.max_grandparent_overlap_factor =
+      FLAGS_max_grandparent_overlap_factor;
+    options.disable_auto_compactions = FLAGS_disable_auto_compactions;
+    options.source_compaction_factor = FLAGS_source_compaction_factor;
+
+    // fill storage options
+    options.allow_os_buffer = FLAGS_bufferedio;
+    options.allow_mmap_reads = FLAGS_mmap_read;
+    options.allow_mmap_writes = FLAGS_mmap_write;
+    options.advise_random_on_open = FLAGS_advise_random_on_open;
+    options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
+    options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;
+    options.bytes_per_sync = FLAGS_bytes_per_sync;
+
+    // merge operator options
+    options.merge_operator = MergeOperators::CreateFromStringId(
+        FLAGS_merge_operator);
+    if (options.merge_operator == nullptr && !FLAGS_merge_operator.empty()) {
+      fprintf(stderr, "invalid merge operator: %s\n",
+              FLAGS_merge_operator.c_str());
+      exit(1);
+    }
+    options.max_successive_merges = FLAGS_max_successive_merges;
+
+    // set universal style compaction configurations, if applicable
+    if (FLAGS_universal_size_ratio != 0) {
+      options.compaction_options_universal.size_ratio =
+        FLAGS_universal_size_ratio;
+    }
+    if (FLAGS_universal_min_merge_width != 0) {
+      options.compaction_options_universal.min_merge_width =
+        FLAGS_universal_min_merge_width;
+    }
+    if (FLAGS_universal_max_merge_width != 0) {
+      options.compaction_options_universal.max_merge_width =
+        FLAGS_universal_max_merge_width;
+    }
+    if (FLAGS_universal_max_size_amplification_percent != 0) {
+      options.compaction_options_universal.max_size_amplification_percent =
+        FLAGS_universal_max_size_amplification_percent;
+    }
+    if (FLAGS_universal_compression_size_percent != -1) {
+      options.compaction_options_universal.compression_size_percent =
+        FLAGS_universal_compression_size_percent;
+    }
+
+    if (FLAGS_num_multi_db <= 1) {
+      OpenDb(options, FLAGS_db, &db_);
+    } else {
+      multi_dbs_.clear();
+      for (int i = 0; i < FLAGS_num_multi_db; i++) {
+        DB* db;
+        OpenDb(options, GetDbNameForMultiple(FLAGS_db, i), &db);
+        multi_dbs_.push_back(db);
+      }
+    }
+    if (FLAGS_min_level_to_compress >= 0) {
+      options.compression_per_level.clear();
+    }
+  }
+
+  void OpenDb(Options options, std::string db_name, DB** db) {
+    Status s;
+    if(FLAGS_readonly) {
+      s = DB::OpenForReadOnly(options, db_name, db);
+    } else {
+      s = DB::Open(options, db_name, db);
+    }
+    if (!s.ok()) {
+      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+  }
+
  // Key-ordering modes used by DoWrite() and KeyGenerator.
  enum WriteMode {
    RANDOM, SEQUENTIAL, UNIQUE_RANDOM
  };
+
  // Benchmark entry point: write keys in sequential order.
  void WriteSeq(ThreadState* thread) {
    DoWrite(thread, SEQUENTIAL);
  }
+
  // Benchmark entry point: write keys in uniformly random order
  // (keys may repeat).
  void WriteRandom(ThreadState* thread) {
    DoWrite(thread, RANDOM);
  }
+
  // Benchmark entry point: write every key exactly once, in shuffled order.
  void WriteUniqueRandom(ThreadState* thread) {
    DoWrite(thread, UNIQUE_RANDOM);
  }
+
+  class KeyGenerator {
+   public:
+    KeyGenerator(Random64* rand, WriteMode mode,
+        uint64_t num, uint64_t num_per_set = 64 * 1024)
+      : rand_(rand),
+        mode_(mode),
+        num_(num),
+        next_(0) {
+      if (mode_ == UNIQUE_RANDOM) {
+        // NOTE: if memory consumption of this approach becomes a concern,
+        // we can either break it into pieces and only random shuffle a section
+        // each time. Alternatively, use a bit map implementation
+        // (https://reviews.facebook.net/differential/diff/54627/)
+        values_.resize(num_);
+        for (uint64_t i = 0; i < num_; ++i) {
+          values_[i] = i;
+        }
+        std::shuffle(values_.begin(), values_.end(),
+            std::default_random_engine(FLAGS_seed));
+      }
+    }
+
+    uint64_t Next() {
+      switch (mode_) {
+        case SEQUENTIAL:
+          return next_++;
+        case RANDOM:
+          return rand_->Next() % num_;
+        case UNIQUE_RANDOM:
+          return values_[next_++];
+      }
+      assert(false);
+      return std::numeric_limits<uint64_t>::max();
+    }
+
+   private:
+    Random64* rand_;
+    WriteMode mode_;
+    const uint64_t num_;
+    uint64_t next_;
+    std::vector<uint64_t> values_;
+  };
+
+  DB* SelectDB(ThreadState* thread) {
+    if (db_ != nullptr) {
+      return db_;
+    } else {
+      return multi_dbs_[thread->rand.Next() % multi_dbs_.size()];
+    }
+  }
+
  // Core write benchmark: writes num_ops keys per DB (or until
  // FLAGS_duration expires, RANDOM mode only) in batches of
  // entries_per_batch_.  Key order is controlled by |write_mode|; in
  // multi-db mode each DB gets its own KeyGenerator so every DB sees the
  // full key sequence.  Exits the process on a write error.
  void DoWrite(ThreadState* thread, WriteMode write_mode) {
    // Only RANDOM writes are time-bounded; SEQUENTIAL and UNIQUE_RANDOM
    // must cover the whole key space, so their duration limit is 0 (off).
    const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
    const int64_t num_ops = writes_ == 0 ? num_ : writes_;

    size_t num_key_gens = 1;
    if (db_ == nullptr) {
      num_key_gens = multi_dbs_.size();
    }
    std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
    Duration duration(test_duration, num_ops * num_key_gens);
    for (size_t i = 0; i < num_key_gens; i++) {
      key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode, num_ops));
    }

    // Record a non-default op count in the reported stats.
    if (num_ != FLAGS_num) {
      char msg[100];
      snprintf(msg, sizeof(msg), "(%" PRIu64 " ops)", num_);
      thread->stats.AddMessage(msg);
    }

    RandomGenerator gen;
    WriteBatch batch;
    Status s;
    int64_t bytes = 0;

    Slice key = AllocateKey();
    std::unique_ptr<const char[]> key_guard(key.data());
    while (!duration.Done(entries_per_batch_)) {
      // Pick the target DB — and its matching key generator — per batch.
      size_t id = 0;
      DB* db_to_write = db_;
      if (db_to_write == nullptr) {
        id = thread->rand.Next() % num_key_gens;
        db_to_write = multi_dbs_[id];
      }
      batch.Clear();
      for (int64_t j = 0; j < entries_per_batch_; j++) {
        GenerateKeyFromInt(key_gens[id]->Next(), FLAGS_num, &key);
        batch.Put(key, gen.Generate(value_size_));
        bytes += value_size_ + key_size_;
        thread->stats.FinishedSingleOp(db_to_write);
      }
      s = db_to_write->Write(write_options_, &batch);
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
    }
    thread->stats.AddBytes(bytes);
  }
+
+  void ReadSequential(ThreadState* thread) {
+    if (db_ != nullptr) {
+      ReadSequential(thread, db_);
+    } else {
+      for (DB* db : multi_dbs_) {
+        ReadSequential(thread, db);
+      }
+    }
+  }
+
+  void ReadSequential(ThreadState* thread, DB* db) {
+    Iterator* iter = db->NewIterator(ReadOptions(FLAGS_verify_checksum, true));
+    int64_t i = 0;
+    int64_t bytes = 0;
+    for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
+      bytes += iter->key().size() + iter->value().size();
+      thread->stats.FinishedSingleOp(db);
+      ++i;
+    }
+    delete iter;
+    thread->stats.AddBytes(bytes);
+  }
+
+  void ReadReverse(ThreadState* thread) {
+    if (db_ != nullptr) {
+      ReadReverse(thread, db_);
+    } else {
+      for (DB* db : multi_dbs_) {
+        ReadReverse(thread, db);
+      }
+    }
+  }
+
+  void ReadReverse(ThreadState* thread, DB* db) {
+    Iterator* iter = db->NewIterator(ReadOptions(FLAGS_verify_checksum, true));
+    int64_t i = 0;
+    int64_t bytes = 0;
+    for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
+      bytes += iter->key().size() + iter->value().size();
+      thread->stats.FinishedSingleOp(db_);
+      ++i;
+    }
+    delete iter;
+    thread->stats.AddBytes(bytes);
+  }
+
+  void ReadRandom(ThreadState* thread) {
+    int64_t read = 0;
+    int64_t found = 0;
+    ReadOptions options(FLAGS_verify_checksum, true);
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+    std::string value;
+
+    Duration duration(FLAGS_duration, reads_);
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+      read++;
+      if (db->Get(options, key, &value).ok()) {
+        found++;
+      }
+      thread->stats.FinishedSingleOp(db_);
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
+             found, read);
+
+    thread->stats.AddMessage(msg);
+
+    if (FLAGS_perf_level > 0) {
+      thread->stats.AddMessage(perf_context.ToString());
+    }
+  }
+
+  // Calls MultiGet over a list of keys from a random distribution.
+  // Returns the total number of keys found.
+  void MultiReadRandom(ThreadState* thread) {
+    int64_t read = 0;
+    int64_t found = 0;
+    ReadOptions options(FLAGS_verify_checksum, true);
+    std::vector<Slice> keys;
+    std::vector<std::string> values(entries_per_batch_);
+    while (static_cast<int64_t>(keys.size()) < entries_per_batch_) {
+      keys.push_back(AllocateKey());
+    }
+
+    Duration duration(FLAGS_duration, reads_);
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      for (int64_t i = 0; i < entries_per_batch_; ++i) {
+        GenerateKeyFromInt(thread->rand.Next() % FLAGS_num,
+            FLAGS_num, &keys[i]);
+      }
+      std::vector<Status> statuses = db->MultiGet(options, keys, &values);
+      assert(static_cast<int64_t>(statuses.size()) == entries_per_batch_);
+
+      read += entries_per_batch_;
+      for (int64_t i = 0; i < entries_per_batch_; ++i) {
+        if (statuses[i].ok()) {
+          ++found;
+        }
+      }
+    }
+    for (auto& k : keys) {
+      delete k.data();
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
+             found, read);
+    thread->stats.AddMessage(msg);
+  }
+
+  void IteratorCreation(ThreadState* thread) {
+    Duration duration(FLAGS_duration, reads_);
+    ReadOptions options(FLAGS_verify_checksum, true);
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      Iterator* iter = db->NewIterator(options);
+      delete iter;
+      thread->stats.FinishedSingleOp(db);
+    }
+  }
+
  // Thread 0 becomes the background writer; all other threads create
  // iterators.
  void IteratorCreationWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      IteratorCreation(thread);
    } else {
      BGWriter(thread);
    }
  }
+
+  void SeekRandom(ThreadState* thread) {
+    int64_t read = 0;
+    int64_t found = 0;
+    ReadOptions options(FLAGS_verify_checksum, true);
+    options.tailing = FLAGS_use_tailing_iterator;
+
+    Iterator* single_iter = nullptr;
+    std::vector<Iterator*> multi_iters;
+    if (db_ != nullptr) {
+      single_iter = db_->NewIterator(options);
+    } else {
+      for (DB* db : multi_dbs_) {
+        multi_iters.push_back(db->NewIterator(options));
+      }
+    }
+
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+
+    Duration duration(FLAGS_duration, reads_);
+    while (!duration.Done(1)) {
+      // Pick a Iterator to use
+      Iterator* iter_to_use = single_iter;
+      if (single_iter == nullptr) {
+        iter_to_use = multi_iters[thread->rand.Next() % multi_iters.size()];
+      }
+
+      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+      iter_to_use->Seek(key);
+      read++;
+      if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
+        found++;
+      }
+      thread->stats.FinishedSingleOp(db_);
+    }
+    delete single_iter;
+    for (auto iter : multi_iters) {
+      delete iter;
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
+             found, read);
+    thread->stats.AddMessage(msg);
+  }
+
  // Thread 0 becomes the background writer; all other threads run
  // SeekRandom.
  void SeekRandomWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      SeekRandom(thread);
    } else {
      BGWriter(thread);
    }
  }
+
+  void DoDelete(ThreadState* thread, bool seq) {
+    WriteBatch batch;
+    Duration duration(seq ? 0 : FLAGS_duration, num_);
+    int64_t i = 0;
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+
+    while (!duration.Done(entries_per_batch_)) {
+      DB* db = SelectDB(thread);
+      batch.Clear();
+      for (int64_t j = 0; j < entries_per_batch_; ++j) {
+        const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
+        GenerateKeyFromInt(k, FLAGS_num, &key);
+        batch.Delete(key);
+        thread->stats.FinishedSingleOp(db);
+      }
+      auto s = db->Write(write_options_, &batch);
+      if (!s.ok()) {
+        fprintf(stderr, "del error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      i += entries_per_batch_;
+    }
+  }
+
  // Benchmark entry point: delete keys in sequential order.
  void DeleteSeq(ThreadState* thread) {
    DoDelete(thread, true);
  }
+
  // Benchmark entry point: delete keys in random order.
  void DeleteRandom(ThreadState* thread) {
    DoDelete(thread, false);
  }
+
  // Thread 0 becomes the background writer; all other threads run
  // ReadRandom.
  void ReadWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      ReadRandom(thread);
    } else {
      BGWriter(thread);
    }
  }
+
+  void BGWriter(ThreadState* thread) {
+    // Special thread that keeps writing until other threads are done.
+    RandomGenerator gen;
+    double last = FLAGS_env->NowMicros();
+    int writes_per_second_by_10 = 0;
+    int num_writes = 0;
+
+    // --writes_per_second rate limit is enforced per 100 milliseconds
+    // intervals to avoid a burst of writes at the start of each second.
+
+    if (FLAGS_writes_per_second > 0)
+      writes_per_second_by_10 = FLAGS_writes_per_second / 10;
+
+    // Don't merge stats from this thread with the readers.
+    thread->stats.SetExcludeFromMerge();
+
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+
+    while (true) {
+      DB* db = SelectDB(thread);
+      {
+        MutexLock l(&thread->shared->mu);
+        if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
+          // Other threads have finished
+          break;
+        }
+      }
+
+      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+      Status s = db->Put(write_options_, key, gen.Generate(value_size_));
+      if (!s.ok()) {
+        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      thread->stats.FinishedSingleOp(db_);
+
+      ++num_writes;
+      if (writes_per_second_by_10 && num_writes >= writes_per_second_by_10) {
+        double now = FLAGS_env->NowMicros();
+        double usecs_since_last = now - last;
+
+        num_writes = 0;
+        last = now;
+
+        if (usecs_since_last < 100000.0) {
+          FLAGS_env->SleepForMicroseconds(100000.0 - usecs_since_last);
+          last = FLAGS_env->NowMicros();
+        }
+      }
+    }
+  }
+
+  // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
+  // in DB atomically i.e in a single batch. Also refer GetMany.
+  Status PutMany(DB* db, const WriteOptions& writeoptions, const Slice& key,
+                 const Slice& value) {
+    std::string suffixes[3] = {"2", "1", "0"};
+    std::string keys[3];
+
+    WriteBatch batch;
+    Status s;
+    for (int i = 0; i < 3; i++) {
+      keys[i] = key.ToString() + suffixes[i];
+      batch.Put(keys[i], value);
+    }
+
+    s = db->Write(writeoptions, &batch);
+    return s;
+  }
+
+
+  // Given a key K, this deletes (K+"0", V), (K+"1", V), (K+"2", V)
+  // in DB atomically i.e in a single batch. Also refer GetMany.
+  Status DeleteMany(DB* db, const WriteOptions& writeoptions,
+                    const Slice& key) {
+    std::string suffixes[3] = {"1", "2", "0"};
+    std::string keys[3];
+
+    WriteBatch batch;
+    Status s;
+    for (int i = 0; i < 3; i++) {
+      keys[i] = key.ToString() + suffixes[i];
+      batch.Delete(keys[i]);
+    }
+
+    s = db->Write(writeoptions, &batch);
+    return s;
+  }
+
  // Given a key K and value V, this gets values for K+"0", K+"1" and K+"2"
  // in the same snapshot, and verifies that all the values are identical.
  // ASSUMES that PutMany was used to put (K, V) into the DB.
  // Returns the status of the last Get; inconsistencies and read errors are
  // reported to stderr but do not abort the run.
  Status GetMany(DB* db, const ReadOptions& readoptions, const Slice& key,
                 std::string* value) {
    std::string suffixes[3] = {"0", "1", "2"};
    std::string keys[3];
    Slice key_slices[3];
    std::string values[3];
    ReadOptions readoptionscopy = readoptions;
    // Pin a snapshot so the three reads see a consistent view.
    readoptionscopy.snapshot = db->GetSnapshot();
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      key_slices[i] = keys[i];
      s = db->Get(readoptionscopy, key_slices[i], value);
      if (!s.ok() && !s.IsNotFound()) {
        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
        values[i] = "";
        // we continue after error rather than exiting so that we can
        // find more errors if any
      } else if (s.IsNotFound()) {
        values[i] = "";
      } else {
        values[i] = *value;
      }
    }
    db->ReleaseSnapshot(readoptionscopy.snapshot);

    if ((values[0] != values[1]) || (values[1] != values[2])) {
      fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n",
              key.ToString().c_str(), values[0].c_str(), values[1].c_str(),
              values[2].c_str());
      // we continue after error rather than exiting so that we can
      // find more errors if any
    }

    return s;
  }
+
+  // Differs from readrandomwriterandom in the following ways:
+  // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs.
+  // (b) Does deletes as well (per FLAGS_deletepercent)
+  // (c) In order to achieve high % of 'found' during lookups, and to do
+  //     multiple writes (including puts and deletes) it uses upto
+  //     FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys.
+  // (d) Does not have a MultiGet option.
+  void RandomWithVerify(ThreadState* thread) {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    RandomGenerator gen;
+    std::string value;
+    int64_t found = 0;
+    int get_weight = 0;
+    int put_weight = 0;
+    int delete_weight = 0;
+    int64_t gets_done = 0;
+    int64_t puts_done = 0;
+    int64_t deletes_done = 0;
+
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+
+    // the number of iterations is the larger of read_ or write_
+    for (int64_t i = 0; i < readwrites_; i++) {
+      DB* db = SelectDB(thread);
+      if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
+        // one batch completed, reinitialize for next batch
+        get_weight = FLAGS_readwritepercent;
+        delete_weight = FLAGS_deletepercent;
+        put_weight = 100 - get_weight - delete_weight;
+      }
+      GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
+          FLAGS_numdistinct, &key);
+      if (get_weight > 0) {
+        // do all the gets first
+        Status s = GetMany(db, options, key, &value);
+        if (!s.ok() && !s.IsNotFound()) {
+          fprintf(stderr, "getmany error: %s\n", s.ToString().c_str());
+          // we continue after error rather than exiting so that we can
+          // find more errors if any
+        } else if (!s.IsNotFound()) {
+          found++;
+        }
+        get_weight--;
+        gets_done++;
+      } else if (put_weight > 0) {
+        // then do all the corresponding number of puts
+        // for all the gets we have done earlier
+        Status s = PutMany(db, write_options_, key, gen.Generate(value_size_));
+        if (!s.ok()) {
+          fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+        put_weight--;
+        puts_done++;
+      } else if (delete_weight > 0) {
+        Status s = DeleteMany(db, write_options_, key);
+        if (!s.ok()) {
+          fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+        delete_weight--;
+        deletes_done++;
+      }
+
+      thread->stats.FinishedSingleOp(db_);
+    }
+    char msg[100];
+    snprintf(msg, sizeof(msg),
+             "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" \
+             PRIu64 " found:%" PRIu64 ")",
+             gets_done, puts_done, deletes_done, readwrites_, found);
+    thread->stats.AddMessage(msg);
+  }
+
  // Mixed read/write workload over random keys, interleaved within one
  // thread (unlike ReadWhileWriting, which dedicates a writer thread).
  // FLAGS_readwritepercent of each 100-op batch are Gets; the rest Puts.
  void ReadRandomWriteRandom(ThreadState* thread) {
    ReadOptions options(FLAGS_verify_checksum, true);
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int get_weight = 0;
    int put_weight = 0;
    int64_t reads_done = 0;
    int64_t writes_done = 0;
    Duration duration(FLAGS_duration, readwrites_);

    Slice key = AllocateKey();
    std::unique_ptr<const char[]> key_guard(key.data());

    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      if (get_weight == 0 && put_weight == 0) {
        // one batch completed, reinitialize for next batch
        get_weight = FLAGS_readwritepercent;
        put_weight = 100 - get_weight;
      }
      if (get_weight > 0) {
        // do all the gets first
        Status s = db->Get(options, key, &value);
        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          found++;
        }
        get_weight--;
        reads_done++;
      } else  if (put_weight > 0) {
        // then do all the corresponding number of puts
        // for all the gets we have done earlier
        Status s = db->Put(write_options_, key, gen.Generate(value_size_));
        if (!s.ok()) {
          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
          exit(1);
        }
        put_weight--;
        writes_done++;
      }
      thread->stats.FinishedSingleOp(db);
    }
    char msg[100];
    snprintf(msg, sizeof(msg), "( reads:%" PRIu64 " writes:%" PRIu64 \
             " total:%" PRIu64 " found:%" PRIu64 ")",
             reads_done, writes_done, readwrites_, found);
    thread->stats.AddMessage(msg);
  }
+
+  //
+  // Read-modify-write for random keys
+  void UpdateRandom(ThreadState* thread) {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    RandomGenerator gen;
+    std::string value;
+    int64_t found = 0;
+    Duration duration(FLAGS_duration, readwrites_);
+
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+    // the number of iterations is the larger of read_ or write_
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+
+      if (db->Get(options, key, &value).ok()) {
+        found++;
+      }
+
+      Status s = db->Put(write_options_, key, gen.Generate(value_size_));
+      if (!s.ok()) {
+        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      thread->stats.FinishedSingleOp(db);
+    }
+    char msg[100];
+    snprintf(msg, sizeof(msg),
+             "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found);
+    thread->stats.AddMessage(msg);
+  }
+
+  // Read-modify-write for random keys.
+  // Each operation causes the key grow by value_size (simulating an append).
+  // Generally used for benchmarking against merges of similar type
+  void AppendRandom(ThreadState* thread) {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    RandomGenerator gen;
+    std::string value;
+    int64_t found = 0;
+
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+    // The number of iterations is the larger of read_ or write_
+    Duration duration(FLAGS_duration, readwrites_);
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+
+      // Get the existing value
+      if (db->Get(options, key, &value).ok()) {
+        found++;
+      } else {
+        // If not existing, then just assume an empty string of data
+        value.clear();
+      }
+
+      // Update the value (by appending data)
+      Slice operand = gen.Generate(value_size_);
+      if (value.size() > 0) {
+        // Use a delimeter to match the semantics for StringAppendOperator
+        value.append(1,',');
+      }
+      value.append(operand.data(), operand.size());
+
+      // Write back to the database
+      Status s = db->Put(write_options_, key, value);
+      if (!s.ok()) {
+        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      thread->stats.FinishedSingleOp(db_);
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
+            readwrites_, found);
+    thread->stats.AddMessage(msg);
+  }
+
+  // Read-modify-write for random keys (using MergeOperator)
+  // The merge operator to use should be defined by FLAGS_merge_operator
+  // Adjust FLAGS_value_size so that the keys are reasonable for this operator
+  // Assumes that the merge operator is non-null (i.e.: is well-defined)
+  //
+  // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8
+  // to simulate random additions over 64-bit integers using merge.
+  //
+  // The number of merges on the same key can be controlled by adjusting
+  // FLAGS_merge_keys.
+  void MergeRandom(ThreadState* thread) {
+    RandomGenerator gen;
+
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+    // The number of iterations is the larger of read_ or write_
+    Duration duration(FLAGS_duration, readwrites_);
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
+
+      Status s = db->Merge(write_options_, key, gen.Generate(value_size_));
+
+      if (!s.ok()) {
+        fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      thread->stats.FinishedSingleOp(db_);
+    }
+
+    // Print some statistics
+    char msg[100];
+    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_);
+    thread->stats.AddMessage(msg);
+  }
+
+  // Read and merge random keys. The amount of reads and merges are controlled
+  // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct
+  // keys (and thus also the number of reads and merges on the same key) can be
+  // adjusted with FLAGS_merge_keys.
+  //
+  // As with MergeRandom, the merge operator to use should be defined by
+  // FLAGS_merge_operator.
+  void ReadRandomMergeRandom(ThreadState* thread) {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    RandomGenerator gen;
+    std::string value;
+    int64_t num_hits = 0;
+    int64_t num_gets = 0;
+    int64_t num_merges = 0;
+    size_t max_length = 0;
+
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+    // the number of iterations is the larger of read_ or write_
+    Duration duration(FLAGS_duration, readwrites_);
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
+
+      bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;
+
+      if (do_merge) {
+        Status s = db->Merge(write_options_, key, gen.Generate(value_size_));
+        if (!s.ok()) {
+          fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+
+        num_merges++;
+
+      } else {
+        Status s = db->Get(options, key, &value);
+        if (value.length() > max_length)
+          max_length = value.length();
+
+        if (!s.ok() && !s.IsNotFound()) {
+          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+          // we continue after error rather than exiting so that we can
+          // find more errors if any
+        } else if (!s.IsNotFound()) {
+          num_hits++;
+        }
+
+        num_gets++;
+
+      }
+
+      thread->stats.FinishedSingleOp(db_);
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg),
+             "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64 " hits:%" \
+             PRIu64 " maxlength:%zu)",
+             num_gets, num_merges, readwrites_, num_hits, max_length);
+    thread->stats.AddMessage(msg);
+  }
+
  // Manually compacts the full key range of the selected DB.
  void Compact(ThreadState* thread) {
    DB* db = SelectDB(thread);
    db->CompactRange(nullptr, nullptr);
  }
+
  // Prints the DB property |key| for every open database; a per-DB header
  // banner is printed only in multi-db mode.
  void PrintStats(const char* key) {
    if (db_ != nullptr) {
      PrintStats(db_, key, false);
    }
    for (DB* db : multi_dbs_) {
      PrintStats(db, key, true);
    }
  }
+
+  void PrintStats(DB* db, const char* key, bool print_header = false) {
+    if (print_header) {
+      fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
+    }
+    std::string stats;
+    if (!db->GetProperty(key, &stats)) {
+      stats = "(failed)";
+    }
+    fprintf(stdout, "\n%s\n", stats.c_str());
+  }
+};
+
+}  // namespace rocksdb
+
int main(int argc, char** argv) {
  rocksdb::port::InstallStackTraceHandler();
  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
                          " [OPTIONS]...");
  google::ParseCommandLineFlags(&argc, &argv, true);

  // Translate string/int flags into their typed counterparts before the
  // benchmark reads them.
  FLAGS_compaction_style_e = (rocksdb::CompactionStyle) FLAGS_compaction_style;
  if (FLAGS_statistics) {
    dbstats = rocksdb::CreateDBStatistics();
  }

  // Parse the comma-separated per-level fanout list.
  // NOTE(review): std::stoi throws on non-numeric input, which would
  // terminate with an unhandled exception rather than a usage error —
  // confirm whether that is acceptable here.
  std::vector<std::string> fanout =
    rocksdb::stringSplit(FLAGS_max_bytes_for_level_multiplier_additional, ',');
  for (unsigned int j= 0; j < fanout.size(); j++) {
    FLAGS_max_bytes_for_level_multiplier_additional_v.push_back(
      std::stoi(fanout[j]));
  }

  FLAGS_compression_type_e =
    StringToCompressionType(FLAGS_compression_type.c_str());

  if (!FLAGS_hdfs.empty()) {
    FLAGS_env  = new rocksdb::HdfsEnv(FLAGS_hdfs);
  }

  // Map the fadvice flag string onto the Options enum (case-insensitive).
  // NOTE(review): an unknown value only prints a message (to stdout) and
  // keeps the previous FLAGS_compaction_fadvice_e — confirm that silently
  // continuing is intended.
  if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE"))
    FLAGS_compaction_fadvice_e = rocksdb::Options::NONE;
  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL"))
    FLAGS_compaction_fadvice_e = rocksdb::Options::NORMAL;
  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL"))
    FLAGS_compaction_fadvice_e = rocksdb::Options::SEQUENTIAL;
  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED"))
    FLAGS_compaction_fadvice_e = rocksdb::Options::WILLNEED;
  else {
    fprintf(stdout, "Unknown compaction fadvice:%s\n",
            FLAGS_compaction_fadvice.c_str());
  }

  FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());

  // The number of background threads should be at least as much the
  // max number of concurrent compactions.
  FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions);
  // Choose a location for the test database if none given with --db=<path>
  if (FLAGS_db.empty()) {
    std::string default_db_path;
    rocksdb::Env::Default()->GetTestDirectory(&default_db_path);
    default_db_path += "/dbbench";
    FLAGS_db = default_db_path;
  }

  rocksdb::Benchmark benchmark;
  benchmark.Run();
  return 0;
}
diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc
new file mode 100644 (file)
index 0000000..1e1ec97
--- /dev/null
@@ -0,0 +1,172 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef ROCKSDB_LITE
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <algorithm>
+#include <string>
+#include <stdint.h>
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+#include "util/sync_point.h"
+
+namespace rocksdb {
+
+// Suspend purging of obsolete files. Calls nest: each call bumps
+// disable_delete_obsolete_files_ under the DB mutex, and FindObsoleteFiles()
+// is a no-op while the counter is non-zero. Re-enable with
+// EnableFileDeletions().
+Status DBImpl::DisableFileDeletions() {
+  MutexLock l(&mutex_);
+  ++disable_delete_obsolete_files_;
+  if (disable_delete_obsolete_files_ == 1) {
+    // if not, it has already been disabled, so don't log anything
+    Log(options_.info_log, "File Deletions Disabled");
+  }
+  return Status::OK();
+}
+
+// Re-enable purging of obsolete files. With force==true the disable counter
+// is reset to zero regardless of how many DisableFileDeletions() calls are
+// outstanding; otherwise the counter is merely decremented. Once the counter
+// reaches zero, obsolete files are located under the mutex and purged after
+// the mutex is released.
+Status DBImpl::EnableFileDeletions(bool force) {
+  DeletionState deletion_state;
+  bool should_purge_files = false;
+  {
+    MutexLock l(&mutex_);
+    if (force) {
+      // if force, we need to enable file deletions right away
+      disable_delete_obsolete_files_ = 0;
+    } else if (disable_delete_obsolete_files_ > 0) {
+      --disable_delete_obsolete_files_;
+    }
+    if (disable_delete_obsolete_files_ == 0)  {
+      Log(options_.info_log, "File Deletions Enabled");
+      should_purge_files = true;
+      FindObsoleteFiles(deletion_state, true);
+    }
+  }
+  // Purge outside the mutex; PurgeObsoleteFiles does not require it.
+  if (should_purge_files)  {
+    PurgeObsoleteFiles(deletion_state);
+  }
+  LogFlush(options_.info_log);
+  return Status::OK();
+}
+
+// Fill 'ret' with the filenames needed for a consistent snapshot of the DB:
+// every live *.sst file plus CURRENT and the MANIFEST, as paths relative to
+// the db directory. Also reports the current manifest size so a backup can
+// copy only its valid prefix. If flush_memtable is true, all column
+// families are flushed first so the SST files capture all data.
+Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
+                            uint64_t* manifest_file_size,
+                            bool flush_memtable) {
+
+  *manifest_file_size = 0;
+
+  mutex_.Lock();
+
+  if (flush_memtable) {
+    // flush all dirty data to disk.
+    Status status;
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      // Ref() keeps the column family alive while the mutex is dropped
+      // around the (blocking) flush.
+      cfd->Ref();
+      mutex_.Unlock();
+      status = FlushMemTable(cfd, FlushOptions());
+      mutex_.Lock();
+      cfd->Unref();
+      if (!status.ok()) {
+        break;
+      }
+    }
+    versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+
+    if (!status.ok()) {
+      mutex_.Unlock();
+      Log(options_.info_log, "Cannot Flush data %s\n",
+          status.ToString().c_str());
+      return status;
+    }
+  }
+
+  // Make a set of all of the live *.sst files
+  std::set<uint64_t> live;
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    cfd->current()->AddLiveFiles(&live);
+  }
+
+  ret.clear();
+  ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST
+
+  // create names of the live files. The names are not absolute
+  // paths, instead they are relative to dbname_;
+  for (auto live_file : live) {
+    ret.push_back(TableFileName("", live_file));
+  }
+
+  ret.push_back(CurrentFileName(""));
+  ret.push_back(DescriptorFileName("", versions_->ManifestFileNumber()));
+
+  // find length of manifest file while holding the mutex lock
+  *manifest_file_size = versions_->ManifestFileSize();
+
+  mutex_.Unlock();
+  return Status::OK();
+}
+
+// Collect all WAL files into 'files': logs already in the archive dir,
+// followed by still-alive logs from the db dir whose log number exceeds the
+// newest archived one (duplicates caused by a concurrent archive move are
+// resolved in favor of the archived copy).
+Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
+  // First get sorted files in db dir, then get sorted files from archived
+  // dir, to avoid a race condition where a log file is moved to archived
+  // dir in between.
+  Status s;
+  // list wal files in main db dir.
+  VectorLogPtr logs;
+  s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Reproduce the race condition where a log file is moved
+  // to archived dir, between these two sync points, used in
+  // (DBTest,TransactionLogIteratorRace)
+  TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:1");
+  TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:2");
+
+  files.clear();
+  // list wal files in archive dir.
+  std::string archivedir = ArchivalDirectory(options_.wal_dir);
+  if (env_->FileExists(archivedir)) {
+    s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  uint64_t latest_archived_log_number = 0;
+  if (!files.empty()) {
+    latest_archived_log_number = files.back()->LogNumber();
+    Log(options_.info_log, "Latest Archived log: %" PRIu64,
+        latest_archived_log_number);
+  }
+
+  files.reserve(files.size() + logs.size());
+  for (auto& log : logs) {
+    if (log->LogNumber() > latest_archived_log_number) {
+      files.push_back(std::move(log));
+    } else {
+      // When the race condition happens, we could see the
+      // same log in both db dir and archived dir. Simply
+      // ignore the one in db dir. Note that, if we read
+      // archived dir first, we would have missed the log file.
+      Log(options_.info_log, "%s already moved to archive",
+          log->PathName().c_str());
+    }
+  }
+
+  return s;
+}
+
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/db/db_impl.cc b/db/db_impl.cc
new file mode 100644 (file)
index 0000000..25d8a07
--- /dev/null
@@ -0,0 +1,4587 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <algorithm>
+#include <climits>
+#include <cstdio>
+#include <set>
+#include <stdexcept>
+#include <stdint.h>
+#include <string>
+#include <unordered_set>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/tailing_iter.h"
+#include "db/transaction_log_impl.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "port/likely.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/block.h"
+#include "table/block_based_table_factory.h"
+#include "table/merger.h"
+#include "table/table_builder.h"
+#include "table/two_level_iterator.h"
+#include "util/auto_roll_logger.h"
+#include "util/autovector.h"
+#include "util/build_version.h"
+#include "util/coding.h"
+#include "util/hash_skiplist_rep.h"
+#include "util/hash_linklist_rep.h"
+#include "util/logging.h"
+#include "util/log_buffer.h"
+#include "util/mutexlock.h"
+#include "util/perf_context_imp.h"
+#include "util/stop_watch.h"
+#include "util/sync_point.h"
+
+namespace rocksdb {
+
+const std::string kDefaultColumnFamilyName("default");
+
+void DumpLeveldbBuildVersion(Logger * log);
+
+// Information kept for every waiting writer
+struct DBImpl::Writer {
+  Status status;      // result of this writer's write
+  WriteBatch* batch;  // the batch this writer wants applied
+  bool sync;          // whether the write requested a WAL sync
+  bool disableWAL;    // whether the write bypasses the WAL
+  bool done;          // true once the write has been performed
+  port::CondVar cv;   // waited on while queued; tied to the mutex passed in
+
+  explicit Writer(port::Mutex* mu) : cv(mu) { }
+};
+
+struct DBImpl::CompactionState {
+  Compaction* const compaction;
+
+  // If there were two snapshots with seq numbers s1 and
+  // s2 and s1 < s2, and if we find two instances of a key k1 then lies
+  // entirely within s1 and s2, then the earlier version of k1 can be safely
+  // deleted because that version is not visible in any snapshot.
+  std::vector<SequenceNumber> existing_snapshots;
+
+  // Files produced by compaction
+  struct Output {
+    uint64_t number;
+    uint64_t file_size;
+    InternalKey smallest, largest;
+    SequenceNumber smallest_seqno, largest_seqno;
+  };
+  std::vector<Output> outputs;
+  std::list<uint64_t> allocated_file_numbers;
+
+  // State kept for output being generated
+  unique_ptr<WritableFile> outfile;
+  unique_ptr<TableBuilder> builder;
+
+  uint64_t total_bytes;
+
+  Output* current_output() { return &outputs[outputs.size()-1]; }
+
+  explicit CompactionState(Compaction* c)
+      : compaction(c),
+        total_bytes(0) {
+  }
+
+  // Create a client visible context of this compaction
+  CompactionFilter::Context GetFilterContextV1() {
+    CompactionFilter::Context context;
+    context.is_full_compaction = compaction->IsFullCompaction();
+    context.is_manual_compaction = compaction->IsManualCompaction();
+    return context;
+  }
+
+  // Create a client visible context of this compaction
+  CompactionFilterContext GetFilterContext() {
+    CompactionFilterContext context;
+    context.is_full_compaction = compaction->IsFullCompaction();
+    context.is_manual_compaction = compaction->IsManualCompaction();
+    return context;
+  }
+
+  std::vector<Slice> key_buf_;
+  std::vector<Slice> existing_value_buf_;
+  std::vector<std::string> key_str_buf_;
+  std::vector<std::string> existing_value_str_buf_;
+  // new_value_buf_ will only be appended if a value changes
+  std::vector<std::string> new_value_buf_;
+  // if values_changed_buf_[i] is true
+  // new_value_buf_ will add a new entry with the changed value
+  std::vector<bool> value_changed_buf_;
+  // to_delete_buf_[i] is true iff key_buf_[i] is deleted
+  std::vector<bool> to_delete_buf_;
+  // buffer for the parsed internal keys, the string buffer is backed
+  // by key_str_buf_
+  std::vector<ParsedInternalKey> ikey_buf_;
+
+  std::vector<Slice> other_key_buf_;
+  std::vector<Slice> other_value_buf_;
+  std::vector<std::string> other_key_str_buf_;
+  std::vector<std::string> other_value_str_buf_;
+
+  std::vector<Slice> combined_key_buf_;
+  std::vector<Slice> combined_value_buf_;
+
+  std::string cur_prefix_;
+
+  // Buffers the kv-pair that will be run through compaction filter V2
+  // in the future.
+  void BufferKeyValueSlices(const Slice& key, const Slice& value) {
+    key_str_buf_.emplace_back(key.ToString());
+    existing_value_str_buf_.emplace_back(value.ToString());
+    key_buf_.emplace_back(Slice(key_str_buf_.back()));
+    existing_value_buf_.emplace_back(Slice(existing_value_str_buf_.back()));
+
+    ParsedInternalKey ikey;
+    ParseInternalKey(key_buf_.back(), &ikey);
+    ikey_buf_.emplace_back(ikey);
+  }
+
+  // Buffers the kv-pair that will not be run through compaction filter V2
+  // in the future.
+  void BufferOtherKeyValueSlices(const Slice& key, const Slice& value) {
+    other_key_str_buf_.emplace_back(key.ToString());
+    other_value_str_buf_.emplace_back(value.ToString());
+    other_key_buf_.emplace_back(Slice(other_key_str_buf_.back()));
+    other_value_buf_.emplace_back(Slice(other_value_str_buf_.back()));
+  }
+
+  // Add a kv-pair to the combined buffer
+  void AddToCombinedKeyValueSlices(const Slice& key, const Slice& value) {
+    // The real strings are stored in the batch buffers
+    combined_key_buf_.emplace_back(key);
+    combined_value_buf_.emplace_back(value);
+  }
+
+  // Merging the two buffers
+  void MergeKeyValueSliceBuffer(const InternalKeyComparator* comparator) {
+    size_t i = 0;
+    size_t j = 0;
+    size_t total_size = key_buf_.size() + other_key_buf_.size();
+    combined_key_buf_.reserve(total_size);
+    combined_value_buf_.reserve(total_size);
+
+    while (i + j < total_size) {
+      int comp_res = 0;
+      if (i < key_buf_.size() && j < other_key_buf_.size()) {
+        comp_res = comparator->Compare(key_buf_[i], other_key_buf_[j]);
+      } else if (i >= key_buf_.size() && j < other_key_buf_.size()) {
+        comp_res = 1;
+      } else if (j >= other_key_buf_.size() && i < key_buf_.size()) {
+        comp_res = -1;
+      }
+      if (comp_res > 0) {
+        AddToCombinedKeyValueSlices(other_key_buf_[j], other_value_buf_[j]);
+        j++;
+      } else if (comp_res < 0) {
+        AddToCombinedKeyValueSlices(key_buf_[i], existing_value_buf_[i]);
+        i++;
+      }
+    }
+  }
+
+  void CleanupBatchBuffer() {
+    to_delete_buf_.clear();
+    key_buf_.clear();
+    existing_value_buf_.clear();
+    key_str_buf_.clear();
+    existing_value_str_buf_.clear();
+    new_value_buf_.clear();
+    value_changed_buf_.clear();
+    ikey_buf_.clear();
+
+    to_delete_buf_.shrink_to_fit();
+    key_buf_.shrink_to_fit();
+    existing_value_buf_.shrink_to_fit();
+    key_str_buf_.shrink_to_fit();
+    existing_value_str_buf_.shrink_to_fit();
+    new_value_buf_.shrink_to_fit();
+    value_changed_buf_.shrink_to_fit();
+    ikey_buf_.shrink_to_fit();
+
+    other_key_buf_.clear();
+    other_value_buf_.clear();
+    other_key_str_buf_.clear();
+    other_value_str_buf_.clear();
+    other_key_buf_.shrink_to_fit();
+    other_value_buf_.shrink_to_fit();
+    other_key_str_buf_.shrink_to_fit();
+    other_value_str_buf_.shrink_to_fit();
+  }
+
+  void CleanupMergedBuffer() {
+    combined_key_buf_.clear();
+    combined_value_buf_.clear();
+    combined_key_buf_.shrink_to_fit();
+    combined_value_buf_.shrink_to_fit();
+  }
+};
+
+namespace {
+// Fix user-supplied options to be reasonable
+// Clamp *ptr into [minvalue, maxvalue], comparing in V's domain.
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
+}  // anonymous namespace
+
+// Sanitize a combined Options object by sanitizing its DB-level and
+// column-family-level halves separately and recombining them.
+Options SanitizeOptions(const std::string& dbname,
+                        const InternalKeyComparator* icmp,
+                        const InternalFilterPolicy* ipolicy,
+                        const Options& src) {
+  auto db_options = SanitizeOptions(dbname, DBOptions(src));
+  auto cf_options = SanitizeOptions(icmp, ipolicy, ColumnFamilyOptions(src));
+  return Options(db_options, cf_options);
+}
+
+// Sanitize DB-level options: clamp max_open_files, create an info logger if
+// the caller did not supply one, and normalize wal_dir.
+DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
+  DBOptions result = src;
+  // max_open_files == -1 means an "infinite" number of open files; any other
+  // value is clamped to a sane range.
+  if (result.max_open_files != -1) {
+    ClipToRange(&result.max_open_files, 20, 1000000);
+  }
+
+  if (result.info_log == nullptr) {
+    Status s = CreateLoggerFromOptions(dbname, result.db_log_dir, src.env,
+                                       result, &result.info_log);
+    if (!s.ok()) {
+      // No place suitable for logging
+      result.info_log = nullptr;
+    }
+  }
+
+  if (result.wal_dir.empty()) {
+    // Use dbname as default
+    result.wal_dir = dbname;
+  }
+  // Strip a trailing slash so later path concatenation is uniform.
+  if (result.wal_dir.back() == '/') {
+    result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1);
+  }
+
+  return result;
+}
+
+// Pick the compression type for a file at the given level, honoring the
+// per-level override list when present. enable_compression==false forces
+// kNoCompression.
+CompressionType GetCompressionType(const Options& options, int level,
+                                   const bool enable_compression) {
+  if (!enable_compression) {
+    // disable compression
+    return kNoCompression;
+  }
+  // If the user has specified a different compression level for each level,
+  // then pick the compression for that level.
+  if (!options.compression_per_level.empty()) {
+    const int n = options.compression_per_level.size() - 1;
+    // It is possible for level_ to be -1; in that case, we use level
+    // 0's compression.  This occurs mostly in backwards compatibility
+    // situations when the builder doesn't know what level the file
+    // belongs to.  Likewise, if level_ is beyond the end of the
+    // specified compression levels, use the last value.
+    return options.compression_per_level[std::max(0, std::min(level, n))];
+  } else {
+    return options.compression;
+  }
+}
+
+// Decide whether memtable flushes (level-0 output) should be compressed,
+// based on the compaction style and compression settings.
+CompressionType GetCompressionFlush(const Options& options) {
+  // Compressing memtable flushes might not help unless the sequential load
+  // optimization is used for leveled compaction. Otherwise the CPU and
+  // latency overhead is not offset by saving much space.
+
+  bool can_compress;
+
+  if  (options.compaction_style == kCompactionStyleUniversal) {
+    can_compress =
+        (options.compaction_options_universal.compression_size_percent < 0);
+  } else {
+    // For leveled compress when min_level_to_compress == 0.
+    can_compress = (GetCompressionType(options, 0, true) != kNoCompression);
+  }
+
+  if (can_compress) {
+    return options.compression;
+  } else {
+    return kNoCompression;
+  }
+}
+
+// Construct the DBImpl: sanitize options, size the table cache from
+// max_open_files, create the VersionSet and column-family memtable wrapper,
+// and dump build/option information to the info log. No recovery happens
+// here.
+DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
+    : env_(options.env),
+      dbname_(dbname),
+      options_(SanitizeOptions(dbname, options)),
+      db_lock_(nullptr),
+      mutex_(options.use_adaptive_mutex),
+      shutting_down_(nullptr),
+      bg_cv_(&mutex_),
+      logfile_number_(0),
+      log_empty_(true),
+      default_cf_handle_(nullptr),
+      total_log_size_(0),
+      max_total_in_memory_state_(0),
+      tmp_batch_(),
+      bg_schedule_needed_(false),
+      bg_compaction_scheduled_(0),
+      bg_manual_only_(0),
+      bg_flush_scheduled_(0),
+      bg_logstats_scheduled_(false),
+      manual_compaction_(nullptr),
+      logger_(nullptr),
+      disable_delete_obsolete_files_(0),
+      delete_obsolete_files_last_run_(options.env->NowMicros()),
+      purge_wal_files_last_run_(0),
+      last_stats_dump_time_microsec_(0),
+      default_interval_to_delete_obsolete_WAL_(600),
+      flush_on_destroy_(false),
+      delayed_writes_(0),
+      storage_options_(options),
+      bg_work_gate_closed_(false),
+      refitting_level_(false),
+      opened_successfully_(false) {
+  env_->GetAbsolutePath(dbname, &db_absolute_path_);
+
+  // Reserve ten files or so for other uses and give the rest to TableCache.
+  // Give a large number for setting of "infinite" open files.
+  const int table_cache_size =
+      (options_.max_open_files == -1) ? 4194304 : options_.max_open_files - 10;
+  table_cache_ =
+      NewLRUCache(table_cache_size, options_.table_cache_numshardbits,
+                  options_.table_cache_remove_scan_count_limit);
+
+  versions_.reset(
+      new VersionSet(dbname_, &options_, storage_options_, table_cache_.get()));
+  column_family_memtables_.reset(
+      new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+
+  DumpLeveldbBuildVersion(options_.info_log.get());
+  options_.Dump(options_.info_log.get());
+
+  // Record the host name (used in logging); fall back to "localhost".
+  char name[100];
+  Status s = env_->GetHostName(name, 100L);
+  if (s.ok()) {
+    host_name_ = name;
+  } else {
+    Log(options_.info_log, "Can't get hostname, use localhost as host name.");
+    host_name_ = "localhost";
+  }
+  last_log_ts = 0;
+
+  LogFlush(options_.info_log);
+}
+
+// Destructor: optionally flush non-empty memtables, wait for all background
+// work to drain, release the default column-family handle, purge obsolete
+// files when the DB had opened successfully, and release the file lock.
+DBImpl::~DBImpl() {
+  mutex_.Lock();
+  if (flush_on_destroy_) {
+    // Flush every column family whose memtable holds data (a non-zero first
+    // sequence number), dropping the mutex around each blocking flush.
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->mem()->GetFirstSequenceNumber() != 0) {
+        cfd->Ref();
+        mutex_.Unlock();
+        FlushMemTable(cfd, FlushOptions());
+        mutex_.Lock();
+        cfd->Unref();
+      }
+    }
+    versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+  }
+
+  // Wait for background work to finish
+  shutting_down_.Release_Store(this);  // Any non-nullptr value is ok
+  while (bg_compaction_scheduled_ ||
+         bg_flush_scheduled_ ||
+         bg_logstats_scheduled_) {
+    bg_cv_.Wait();
+  }
+
+  if (default_cf_handle_ != nullptr) {
+    // we need to delete handle outside of lock because it does its own locking
+    mutex_.Unlock();
+    delete default_cf_handle_;
+    mutex_.Lock();
+  }
+
+  if (options_.allow_thread_local) {
+    // Clean up obsolete files due to SuperVersion release.
+    // (1) Need to delete to obsolete files before closing because RepairDB()
+    // scans all existing files in the file system and builds manifest file.
+    // Keeping obsolete files confuses the repair process.
+    // (2) Need to check if we Open()/Recover() the DB successfully before
+    // deleting because if VersionSet recover fails (may be due to corrupted
+    // manifest file), it is not able to identify live files correctly. As a
+    // result, all "live" files can get deleted by accident. However, corrupted
+    // manifest is recoverable by RepairDB().
+    if (opened_successfully_) {
+      DeletionState deletion_state;
+      FindObsoleteFiles(deletion_state, true);
+      // manifest number starting from 2
+      deletion_state.manifest_file_number = 1;
+      if (deletion_state.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(deletion_state);
+      }
+    }
+  }
+
+  // versions need to be destroyed before table_cache since it can hold
+  // references to table_cache.
+  versions_.reset();
+  mutex_.Unlock();
+  if (db_lock_ != nullptr) {
+    env_->UnlockFile(db_lock_);
+  }
+
+  LogFlush(options_.info_log);
+}
+
+// Create a brand-new database: write a fresh MANIFEST (descriptor file 1)
+// describing an empty DB, then point CURRENT at it. On failure the
+// partially written manifest is deleted.
+Status DBImpl::NewDB() {
+  VersionEdit new_db;
+  new_db.SetLogNumber(0);
+  new_db.SetNextFile(2);
+  new_db.SetLastSequence(0);
+
+  const std::string manifest = DescriptorFileName(dbname_, 1);
+  unique_ptr<WritableFile> file;
+  Status s = env_->NewWritableFile(
+      manifest, &file, env_->OptimizeForManifestWrite(storage_options_));
+  if (!s.ok()) {
+    return s;
+  }
+  file->SetPreallocationBlockSize(options_.manifest_preallocation_size);
+  {
+    // Scope ensures the log::Writer is destroyed before CURRENT is updated.
+    log::Writer log(std::move(file));
+    std::string record;
+    new_db.EncodeTo(&record);
+    s = log.AddRecord(record);
+  }
+  if (s.ok()) {
+    // Make "CURRENT" file that points to the new manifest file.
+    s = SetCurrentFile(env_, dbname_, 1);
+  } else {
+    env_->DeleteFile(manifest);
+  }
+  return s;
+}
+
+// Downgrade *s to OK (after logging it) unless paranoid_checks is set; used
+// for errors that are tolerable when strict checking was not requested.
+void DBImpl::MaybeIgnoreError(Status* s) const {
+  if (s->ok() || options_.paranoid_checks) {
+    // No change needed
+  } else {
+    Log(options_.info_log, "Ignoring error %s", s->ToString().c_str());
+    *s = Status::OK();
+  }
+}
+
+// Create the WAL archive directory if WAL archiving is enabled (via TTL or
+// size limit); otherwise this is a no-op.
+const Status DBImpl::CreateArchivalDirectory() {
+  if (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0) {
+    std::string archivalPath = ArchivalDirectory(options_.wal_dir);
+    return env_->CreateDirIfMissing(archivalPath);
+  }
+  return Status::OK();
+}
+
+// Dump the accumulated statistics, if a Statistics object was configured,
+// to the info log.
+void DBImpl::PrintStatistics() {
+  auto dbstats = options_.statistics.get();
+  if (dbstats) {
+    // Fixed misspelled "STATISTCS" in the log header.
+    Log(options_.info_log,
+        "STATISTICS:\n %s",
+        dbstats->ToString().c_str());
+  }
+}
+
+// Dump "rocksdb.stats" (and the statistics object, if any) to the info log
+// when at least stats_dump_period_sec has elapsed since the last dump.
+void DBImpl::MaybeDumpStats() {
+  if (options_.stats_dump_period_sec == 0) return;
+
+  const uint64_t now_micros = env_->NowMicros();
+
+  if (last_stats_dump_time_microsec_ +
+      options_.stats_dump_period_sec * 1000000
+      <= now_micros) {
+    // Multiple threads could race in here simultaneously.
+    // However, the last one will update last_stats_dump_time_microsec_
+    // atomically. We could see more than one dump during one dump
+    // period in rare cases.
+    last_stats_dump_time_microsec_ = now_micros;
+    std::string stats;
+    GetProperty("rocksdb.stats", &stats);
+    Log(options_.info_log, "%s", stats.c_str());
+    PrintStatistics();
+  }
+}
+
+// Returns the list of live files in 'sst_live' and the list
+// of all files in the filesystem in 'candidate_files'.
+// no_full_scan = true -- never do the full scan using GetChildren()
+// force = false -- don't force the full scan, except every
+//  options_.delete_obsolete_files_period_micros
+// force = true -- force the full scan
+void DBImpl::FindObsoleteFiles(DeletionState& deletion_state,
+                               bool force,
+                               bool no_full_scan) {
+  mutex_.AssertHeld();
+
+  // if deletion is disabled, do nothing
+  if (disable_delete_obsolete_files_ > 0) {
+    return;
+  }
+
+  bool doing_the_full_scan = false;
+
+  // logic for figuring out if we're doing the full scan
+  if (no_full_scan) {
+    doing_the_full_scan = false;
+  } else if (force || options_.delete_obsolete_files_period_micros == 0) {
+    doing_the_full_scan = true;
+  } else {
+    // Rate-limit full scans to once per delete_obsolete_files_period_micros.
+    const uint64_t now_micros = env_->NowMicros();
+    if (delete_obsolete_files_last_run_ +
+        options_.delete_obsolete_files_period_micros < now_micros) {
+      doing_the_full_scan = true;
+      delete_obsolete_files_last_run_ = now_micros;
+    }
+  }
+
+  // get obsolete files
+  versions_->GetObsoleteFiles(&deletion_state.sst_delete_files);
+
+  // store the current filenum, lognum, etc
+  deletion_state.manifest_file_number = versions_->ManifestFileNumber();
+  deletion_state.pending_manifest_file_number =
+      versions_->PendingManifestFileNumber();
+  deletion_state.log_number = versions_->MinLogNumber();
+  deletion_state.prev_log_number = versions_->PrevLogNumber();
+
+  if (!doing_the_full_scan && !deletion_state.HaveSomethingToDelete()) {
+    // avoid filling up sst_live if we're sure that we
+    // are not going to do the full scan and that we don't have
+    // anything to delete at the moment
+    return;
+  }
+
+  // don't delete live files
+  deletion_state.sst_live.assign(pending_outputs_.begin(),
+                                 pending_outputs_.end());
+  versions_->AddLiveFiles(&deletion_state.sst_live);
+
+  if (doing_the_full_scan) {
+    // set of all files in the directory. We'll exclude files that are still
+    // alive in the subsequent processings.
+    env_->GetChildren(
+        dbname_, &deletion_state.candidate_files
+    ); // Ignore errors
+
+    // Add log files in wal_dir
+    if (options_.wal_dir != dbname_) {
+      std::vector<std::string> log_files;
+      env_->GetChildren(options_.wal_dir, &log_files); // Ignore errors
+      deletion_state.candidate_files.insert(
+        deletion_state.candidate_files.end(),
+        log_files.begin(),
+        log_files.end()
+      );
+    }
+  }
+}
+
+// Diffs the files listed in filenames and those that do not
+// belong to live files are posibly removed. Also, removes all the
+// files in sst_delete_files and log_delete_files.
+// It is not necessary to hold the mutex when invoking this method.
+void DBImpl::PurgeObsoleteFiles(DeletionState& state) {
+  // we'd better have sth to delete
+  assert(state.HaveSomethingToDelete());
+
+  // this checks if FindObsoleteFiles() was run before. If not, don't do
+  // PurgeObsoleteFiles(). If FindObsoleteFiles() was run, we need to also
+  // run PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
+  if (state.manifest_file_number == 0) {
+    return;
+  }
+
+  // Now, convert live list to an unordered set, WITHOUT mutex held;
+  // set is slow.
+  std::unordered_set<uint64_t> sst_live(state.sst_live.begin(),
+                                        state.sst_live.end());
+
+  auto& candidate_files = state.candidate_files;
+  candidate_files.reserve(
+      candidate_files.size() +
+      state.sst_delete_files.size() +
+      state.log_delete_files.size());
+  // We may ignore the dbname when generating the file names.
+  const char* kDumbDbName = "";
+  for (auto file : state.sst_delete_files) {
+    candidate_files.push_back(
+        TableFileName(kDumbDbName, file->number).substr(1)
+    );
+    delete file;
+  }
+
+  for (auto file_num : state.log_delete_files) {
+    if (file_num > 0) {
+      candidate_files.push_back(LogFileName(kDumbDbName, file_num).substr(1));
+    }
+  }
+
+  // dedup state.candidate_files so we don't try to delete the same
+  // file twice
+  sort(candidate_files.begin(), candidate_files.end());
+  candidate_files.erase(unique(candidate_files.begin(), candidate_files.end()),
+                        candidate_files.end());
+
+  std::vector<std::string> old_info_log_files;
+
+  for (const auto& to_delete : candidate_files) {
+    uint64_t number;
+    FileType type;
+    // Ignore file if we cannot recognize it.
+    if (!ParseFileName(to_delete, &number, &type)) {
+      continue;
+    }
+
+    bool keep = true;
+    switch (type) {
+      case kLogFile:
+        keep = ((number >= state.log_number) ||
+                (number == state.prev_log_number));
+        break;
+      case kDescriptorFile:
+        // Keep my manifest file, and any newer incarnations'
+        // (can happen during manifest roll)
+        keep = (number >= state.manifest_file_number);
+        break;
+      case kTableFile:
+        keep = (sst_live.find(number) != sst_live.end());
+        break;
+      case kTempFile:
+        // Any temp files that are currently being written to must
+        // be recorded in pending_outputs_, which is inserted into "live".
+        // Also, SetCurrentFile creates a temp file when writing out new
+        // manifest, which is equal to state.pending_manifest_file_number. We
+        // should not delete that file
+        keep = (sst_live.find(number) != sst_live.end()) ||
+               (number == state.pending_manifest_file_number);
+        break;
+      case kInfoLogFile:
+        keep = true;
+        if (number != 0) {
+          old_info_log_files.push_back(to_delete);
+        }
+        break;
+      case kCurrentFile:
+      case kDBLockFile:
+      case kIdentityFile:
+      case kMetaDatabase:
+        keep = true;
+        break;
+    }
+
+    if (keep) {
+      continue;
+    }
+
+    if (type == kTableFile) {
+      // evict from cache
+      TableCache::Evict(table_cache_.get(), number);
+    }
+
+    std::string fname = ((type == kLogFile) ? options_.wal_dir : dbname_) +
+        "/" + to_delete;
+    if (type == kLogFile &&
+        (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0)) {
+      auto archived_log_name = ArchivedLogFileName(options_.wal_dir, number);
+      // The sync point below is used in (DBTest,TransactionLogIteratorRace)
+      TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:1");
+      Status s = env_->RenameFile(fname, archived_log_name);
+      // The sync point below is used in (DBTest,TransactionLogIteratorRace)
+      TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:2");
+      Log(options_.info_log,
+          "Move log file %s to %s -- %s\n",
+          fname.c_str(), archived_log_name.c_str(), s.ToString().c_str());
+    } else {
+      Status s = env_->DeleteFile(fname);
+      Log(options_.info_log, "Delete %s type=%d #%lu -- %s\n",
+          fname.c_str(), type, (unsigned long)number,
+          s.ToString().c_str());
+    }
+  }
+
+  // Delete old info log files.
+  size_t old_info_log_file_count = old_info_log_files.size();
+  // NOTE: Currently we only support log purge when options_.db_log_dir is
+  // located in `dbname` directory.
+  if (old_info_log_file_count >= options_.keep_log_file_num &&
+      options_.db_log_dir.empty()) {
+    std::sort(old_info_log_files.begin(), old_info_log_files.end());
+    size_t end = old_info_log_file_count - options_.keep_log_file_num;
+    for (unsigned int i = 0; i <= end; i++) {
+      std::string& to_delete = old_info_log_files.at(i);
+      Log(options_.info_log, "Delete info log file %s\n", to_delete.c_str());
+      Status s = env_->DeleteFile(dbname_ + "/" + to_delete);
+      if (!s.ok()) {
+        Log(options_.info_log, "Delete info log file %s FAILED -- %s\n",
+            to_delete.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+  PurgeObsoleteWALFiles();
+  LogFlush(options_.info_log);
+}
+
+// Convenience wrapper: find obsolete files (forcing a full scan) and purge
+// them in one shot. REQUIRES: DB mutex held.
+void DBImpl::DeleteObsoleteFiles() {
+  mutex_.AssertHeld();
+  DeletionState deletion_state;
+  FindObsoleteFiles(deletion_state, true);
+  if (deletion_state.HaveSomethingToDelete()) {
+    PurgeObsoleteFiles(deletion_state);
+  }
+}
+
+#ifndef ROCKSDB_LITE
+// 1. Go through all archived files and
+//    a. if ttl is enabled, delete outdated files
+//    b. if archive size limit is enabled, delete empty files,
+//        compute file number and size.
+// 2. If size limit is enabled:
+//    a. compute how many files should be deleted
+//    b. get sorted non-empty archived logs
+//    c. delete what should be deleted
+void DBImpl::PurgeObsoleteWALFiles() {
+  bool const ttl_enabled = options_.WAL_ttl_seconds > 0;
+  bool const size_limit_enabled =  options_.WAL_size_limit_MB > 0;
+  if (!ttl_enabled && !size_limit_enabled) {
+    return;
+  }
+
+  int64_t current_time;
+  Status s = env_->GetCurrentTime(&current_time);
+  if (!s.ok()) {
+    Log(options_.info_log, "Can't get current time: %s", s.ToString().c_str());
+    assert(false);
+    return;
+  }
+  uint64_t const now_seconds = static_cast<uint64_t>(current_time);
+  // Throttle the scan: with only TTL enabled, run twice per TTL window;
+  // otherwise fall back to the default purge interval.
+  uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) ?
+    options_.WAL_ttl_seconds / 2 : default_interval_to_delete_obsolete_WAL_;
+
+  if (purge_wal_files_last_run_ + time_to_check > now_seconds) {
+    return;
+  }
+
+  purge_wal_files_last_run_ = now_seconds;
+
+  std::string archival_dir = ArchivalDirectory(options_.wal_dir);
+  std::vector<std::string> files;
+  s = env_->GetChildren(archival_dir, &files);
+  if (!s.ok()) {
+    Log(options_.info_log, "Can't get archive files: %s", s.ToString().c_str());
+    assert(false);
+    return;
+  }
+
+  // log_files_num counts non-empty archived logs; log_file_size records the
+  // LARGEST such file, later used as a per-file size estimate.
+  size_t log_files_num = 0;
+  uint64_t log_file_size = 0;
+
+  for (auto& f : files) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(f, &number, &type) && type == kLogFile) {
+      std::string const file_path = archival_dir + "/" + f;
+      if (ttl_enabled) {
+        uint64_t file_m_time;
+        Status const s = env_->GetFileModificationTime(file_path,
+          &file_m_time);
+        if (!s.ok()) {
+          Log(options_.info_log, "Can't get file mod time: %s: %s",
+              file_path.c_str(), s.ToString().c_str());
+          continue;
+        }
+        if (now_seconds - file_m_time > options_.WAL_ttl_seconds) {
+          Status const s = env_->DeleteFile(file_path);
+          if (!s.ok()) {
+            Log(options_.info_log, "Can't delete file: %s: %s",
+                file_path.c_str(), s.ToString().c_str());
+            continue;
+          } else {
+            // File is gone, so its first-record cache entry is stale.
+            MutexLock l(&read_first_record_cache_mutex_);
+            read_first_record_cache_.erase(number);
+          }
+          continue;
+        }
+      }
+
+      if (size_limit_enabled) {
+        uint64_t file_size;
+        Status const s = env_->GetFileSize(file_path, &file_size);
+        if (!s.ok()) {
+          Log(options_.info_log, "Can't get file size: %s: %s",
+              file_path.c_str(), s.ToString().c_str());
+          // NOTE(review): a size-probe failure aborts the whole purge pass,
+          // unlike the TTL path above which just skips the file -- confirm
+          // the asymmetry is intentional.
+          return;
+        } else {
+          if (file_size > 0) {
+            log_file_size = std::max(log_file_size, file_size);
+            ++log_files_num;
+          } else {
+            // Empty archived logs are deleted immediately.
+            Status s = env_->DeleteFile(file_path);
+            if (!s.ok()) {
+              Log(options_.info_log, "Can't delete file: %s: %s",
+                  file_path.c_str(), s.ToString().c_str());
+              continue;
+            } else {
+              MutexLock l(&read_first_record_cache_mutex_);
+              read_first_record_cache_.erase(number);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  if (0 == log_files_num || !size_limit_enabled) {
+    return;
+  }
+
+  // Conservative estimate: assume every archived log is as large as the
+  // largest one seen, and keep only as many files as fit under the limit.
+  size_t const files_keep_num = options_.WAL_size_limit_MB *
+    1024 * 1024 / log_file_size;
+  if (log_files_num <= files_keep_num) {
+    return;
+  }
+
+  size_t files_del_num = log_files_num - files_keep_num;
+  VectorLogPtr archived_logs;
+  GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile);
+
+  if (files_del_num > archived_logs.size()) {
+    Log(options_.info_log, "Trying to delete more archived log files than "
+        "exist. Deleting all");
+    files_del_num = archived_logs.size();
+  }
+
+  // archived_logs is sorted by start sequence, so the oldest go first.
+  for (size_t i = 0; i < files_del_num; ++i) {
+    std::string const file_path = archived_logs[i]->PathName();
+    Status const s = DeleteFile(file_path);
+    if (!s.ok()) {
+      Log(options_.info_log, "Can't delete file: %s: %s",
+          file_path.c_str(), s.ToString().c_str());
+      continue;
+    } else {
+      MutexLock l(&read_first_record_cache_mutex_);
+      read_first_record_cache_.erase(archived_logs[i]->LogNumber());
+    }
+  }
+}
+
+namespace {
+// Strict-weak ordering for LogFile unique_ptrs, delegating to the
+// operator< defined on the concrete LogFileImpl objects.
+struct CompareLogByPointer {
+  bool operator()(const unique_ptr<LogFile>& lhs,
+                  const unique_ptr<LogFile>& rhs) {
+    LogFileImpl* left = dynamic_cast<LogFileImpl*>(lhs.get());
+    LogFileImpl* right = dynamic_cast<LogFileImpl*>(rhs.get());
+    return *left < *right;
+  }
+};
+}  // namespace
+
+Status DBImpl::GetSortedWalsOfType(const std::string& path,
+                                   VectorLogPtr& log_files,
+                                   WalFileType log_type) {
+  std::vector<std::string> all_files;
+  const Status status = env_->GetChildren(path, &all_files);
+  if (!status.ok()) {
+    return status;
+  }
+  log_files.reserve(all_files.size());
+  for (const auto& f : all_files) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(f, &number, &type) && type == kLogFile) {
+      SequenceNumber sequence;
+      Status s = ReadFirstRecord(log_type, number, &sequence);
+      if (!s.ok()) {
+        return s;
+      }
+      if (sequence == 0) {
+        // empty file
+        continue;
+      }
+
+      uint64_t size_bytes;
+      s = env_->GetFileSize(LogFileName(path, number), &size_bytes);
+      if (!s.ok()) {
+        return s;
+      }
+
+      log_files.push_back(std::move(unique_ptr<LogFile>(
+          new LogFileImpl(number, log_type, sequence, size_bytes))));
+    }
+  }
+  CompareLogByPointer compare_log_files;
+  std::sort(log_files.begin(), log_files.end(), compare_log_files);
+  return status;
+}
+
+// Trims the front of all_logs (sorted by StartSequence) so that it begins
+// at the last file whose start sequence is <= target; that file may still
+// contain `target`, so it must be retained. The last wal file is always
+// kept even when target is beyond every start sequence.
+Status DBImpl::RetainProbableWalFiles(VectorLogPtr& all_logs,
+                                      const SequenceNumber target) {
+  int64_t start = 0;  // signed to avoid overflow when target is < first file.
+  int64_t end = static_cast<int64_t>(all_logs.size()) - 1;
+  // Binary Search. avoid opening all files.
+  while (end >= start) {
+    int64_t mid = start + (end - start) / 2;  // Avoid overflow.
+    SequenceNumber current_seq_num = all_logs.at(mid)->StartSequence();
+    if (current_seq_num == target) {
+      end = mid;
+      break;
+    } else if (current_seq_num < target) {
+      start = mid + 1;
+    } else {
+      end = mid - 1;
+    }
+  }
+  // end could be -ve.
+  // After the loop `end` indexes the last file that can contain sequence
+  // numbers <= target; clamp to 0 so we never erase past the beginning.
+  size_t start_index = std::max(static_cast<int64_t>(0), end);
+  // The last wal file is always included
+  all_logs.erase(all_logs.begin(), all_logs.begin() + start_index);
+  return Status::OK();
+}
+
+// Returns in *sequence the sequence number of the first record of log file
+// `number`. Results are memoized in read_first_record_cache_. For a
+// kAliveLogFile the live WAL dir is tried first and the archive dir is used
+// as a fallback, since the file may have been archived concurrently.
+Status DBImpl::ReadFirstRecord(const WalFileType type, const uint64_t number,
+                               SequenceNumber* sequence) {
+  if (type != kAliveLogFile && type != kArchivedLogFile) {
+    return Status::NotSupported("File Type Not Known " + std::to_string(type));
+  }
+  {
+    // Fast path: answer from the cache under its own mutex.
+    MutexLock l(&read_first_record_cache_mutex_);
+    auto itr = read_first_record_cache_.find(number);
+    if (itr != read_first_record_cache_.end()) {
+      *sequence = itr->second;
+      return Status::OK();
+    }
+  }
+  Status s;
+  if (type == kAliveLogFile) {
+    std::string fname = LogFileName(options_.wal_dir, number);
+    s = ReadFirstLine(fname, sequence);
+    if (env_->FileExists(fname) && !s.ok()) {
+      // return any error that is not caused by non-existing file
+      return s;
+    }
+  }
+
+  if (type == kArchivedLogFile || !s.ok()) {
+    //  check if the file got moved to archive.
+    std::string archived_file = ArchivedLogFileName(options_.wal_dir, number);
+    s = ReadFirstLine(archived_file, sequence);
+  }
+
+  if (s.ok() && *sequence != 0) {
+    // Cache only definite answers: an empty file (sequence == 0) may gain
+    // records later, so it is intentionally not cached.
+    MutexLock l(&read_first_record_cache_mutex_);
+    read_first_record_cache_.insert({number, *sequence});
+  }
+  return s;
+}
+
+// the function returns status.ok() and sequence == 0 if the file exists, but is
+// empty
+Status DBImpl::ReadFirstLine(const std::string& fname,
+                             SequenceNumber* sequence) {
+  // Local reporter: logs every corruption, but records only the FIRST error
+  // into *status (and only as an error when paranoid_checks is on).
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    const char* fname;
+
+    Status* status;
+    bool ignore_error;  // true if options_.paranoid_checks==false
+    virtual void Corruption(size_t bytes, const Status& s) {
+      Log(info_log, "%s%s: dropping %d bytes; %s",
+          (this->ignore_error ? "(ignoring error) " : ""), fname,
+          static_cast<int>(bytes), s.ToString().c_str());
+      if (this->status->ok()) {
+        // only keep the first error
+        *this->status = s;
+      }
+    }
+  };
+
+  unique_ptr<SequentialFile> file;
+  Status status = env_->NewSequentialFile(fname, &file, storage_options_);
+
+  if (!status.ok()) {
+    return status;
+  }
+
+  LogReporter reporter;
+  reporter.env = env_;
+  reporter.info_log = options_.info_log.get();
+  reporter.fname = fname.c_str();
+  reporter.status = &status;
+  reporter.ignore_error = !options_.paranoid_checks;
+  log::Reader reader(std::move(file), &reporter, true /*checksum*/,
+                     0 /*initial_offset*/);
+  std::string scratch;
+  Slice record;
+
+  if (reader.ReadRecord(&record, &scratch) &&
+      (status.ok() || !options_.paranoid_checks)) {
+    if (record.size() < 12) {
+      // A serialized WriteBatch carries a 12-byte header (8-byte sequence +
+      // 4-byte count); anything shorter cannot be parsed.
+      reporter.Corruption(record.size(),
+                          Status::Corruption("log record too small"));
+      // TODO read record's till the first no corrupt entry?
+    } else {
+      WriteBatch batch;
+      WriteBatchInternal::SetContents(&batch, record);
+      *sequence = WriteBatchInternal::Sequence(&batch);
+      return Status::OK();
+    }
+  }
+
+  // ReadRecord returns false on EOF, which means that the log file is empty. we
+  // return status.ok() in that case and set sequence number to 0
+  *sequence = 0;
+  return status;
+}
+
+#endif  // ROCKSDB_LITE
+
+// Recovers DB state from the MANIFEST plus any WAL files newer than the
+// ones it records. In read-write mode this also creates the DB directory,
+// acquires the file lock, and creates a brand-new DB / IDENTITY file when
+// needed. Requires mutex_ to be held by the caller.
+Status DBImpl::Recover(
+    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+    bool error_if_log_file_exist) {
+  mutex_.AssertHeld();
+
+  bool is_new_db = false;
+  assert(db_lock_ == nullptr);
+  if (!read_only) {
+    // We call CreateDirIfMissing() as the directory may already exist (if we
+    // are reopening a DB), when this happens we don't want creating the
+    // directory to cause an error. However, we need to check if creating the
+    // directory fails or else we may get an obscure message about the lock
+    // file not existing. One real-world example of this occurring is if
+    // env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
+    // when dbname_ is "dir/db" but when "dir" doesn't exist.
+    Status s = env_->CreateDirIfMissing(dbname_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    s = env_->NewDirectory(dbname_, &db_directory_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    // Absence of the CURRENT file is how we detect a brand-new DB.
+    if (!env_->FileExists(CurrentFileName(dbname_))) {
+      if (options_.create_if_missing) {
+        // TODO: add merge_operator name check
+        s = NewDB();
+        is_new_db = true;
+        if (!s.ok()) {
+          return s;
+        }
+      } else {
+        return Status::InvalidArgument(
+            dbname_, "does not exist (create_if_missing is false)");
+      }
+    } else {
+      if (options_.error_if_exists) {
+        return Status::InvalidArgument(
+            dbname_, "exists (error_if_exists is true)");
+      }
+    }
+    // Check for the IDENTITY file and create it if not there
+    if (!env_->FileExists(IdentityFileName(dbname_))) {
+      s = SetIdentityFile(env_, dbname_);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  Status s = versions_->Recover(column_families, read_only);
+  if (options_.paranoid_checks && s.ok()) {
+    s = CheckConsistency();
+  }
+  if (s.ok()) {
+    SequenceNumber max_sequence(0);
+    default_cf_handle_ = new ColumnFamilyHandleImpl(
+        versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+
+    // Recover from all newer log files than the ones named in the
+    // descriptor (new log files may have been added by the previous
+    // incarnation without registering them in the descriptor).
+    //
+    // Note that PrevLogNumber() is no longer used, but we pay
+    // attention to it in case we are recovering a database
+    // produced by an older version of rocksdb.
+    const uint64_t min_log = versions_->MinLogNumber();
+    const uint64_t prev_log = versions_->PrevLogNumber();
+    std::vector<std::string> filenames;
+    s = env_->GetChildren(options_.wal_dir, &filenames);
+    if (!s.ok()) {
+      return s;
+    }
+
+    std::vector<uint64_t> logs;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(filenames[i], &number, &type) && type == kLogFile) {
+        if (is_new_db) {
+          return Status::Corruption(
+              "While creating a new Db, wal_dir contains "
+              "existing log file: ",
+              filenames[i]);
+        } else if ((number >= min_log) || (number == prev_log)) {
+          logs.push_back(number);
+        }
+      }
+    }
+
+    if (logs.size() > 0 && error_if_log_file_exist) {
+      return Status::Corruption(""
+          "The db was opened in readonly mode with error_if_log_file_exist"
+          "flag but a log file already exists");
+    }
+
+    // Recover in the order in which the logs were generated
+    std::sort(logs.begin(), logs.end());
+    for (const auto& log : logs) {
+      // The previous incarnation may not have written any MANIFEST
+      // records after allocating this log number.  So we manually
+      // update the file number allocation counter in VersionSet.
+      versions_->MarkFileNumberUsed(log);
+      // NOTE(review): `s` is overwritten each iteration, so a failure in one
+      // log does not stop replay of later logs -- confirm this is intended.
+      s = RecoverLogFile(log, &max_sequence, read_only);
+    }
+    SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER,
+                   versions_->LastSequence());
+  }
+
+  // Track the worst-case total memtable memory across all column families.
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    max_total_in_memory_state_ += cfd->options()->write_buffer_size *
+                                  cfd->options()->max_write_buffer_number;
+  }
+
+  return s;
+}
+
+// Replays a single WAL into the memtables. Memtables that fill up during
+// replay are flushed to level-0 files immediately (unless read_only). At
+// the end, remaining non-empty memtables are flushed and the MANIFEST is
+// updated so this log is ignored on the next recovery.
+Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
+                              bool read_only) {
+  // Reporter that drops corrupt fragments; errors propagate into `status`
+  // only when `status` is non-null (paranoid mode without skip-on-error).
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    const char* fname;
+    Status* status;  // nullptr if options_.paranoid_checks==false or
+                     //            options_.skip_log_error_on_recovery==true
+    virtual void Corruption(size_t bytes, const Status& s) {
+      Log(info_log, "%s%s: dropping %d bytes; %s",
+          (this->status == nullptr ? "(ignoring error) " : ""),
+          fname, static_cast<int>(bytes), s.ToString().c_str());
+      if (this->status != nullptr && this->status->ok()) *this->status = s;
+    }
+  };
+
+  mutex_.AssertHeld();
+
+  // One VersionEdit per column family, keyed by CF id, to accumulate the
+  // level-0 files produced while replaying this log.
+  std::unordered_map<int, VersionEdit> version_edits;
+  // no need to refcount because iteration is under mutex
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    VersionEdit edit;
+    edit.SetColumnFamily(cfd->GetID());
+    version_edits.insert({cfd->GetID(), edit});
+  }
+
+  // Open the log file
+  std::string fname = LogFileName(options_.wal_dir, log_number);
+  unique_ptr<SequentialFile> file;
+  Status status = env_->NewSequentialFile(fname, &file, storage_options_);
+  if (!status.ok()) {
+    MaybeIgnoreError(&status);
+    return status;
+  }
+
+  // Create the log reader.
+  LogReporter reporter;
+  reporter.env = env_;
+  reporter.info_log = options_.info_log.get();
+  reporter.fname = fname.c_str();
+  reporter.status = (options_.paranoid_checks &&
+                     !options_.skip_log_error_on_recovery ? &status : nullptr);
+  // We intentially make log::Reader do checksumming even if
+  // paranoid_checks==false so that corruptions cause entire commits
+  // to be skipped instead of propagating bad information (like overly
+  // large sequence numbers).
+  log::Reader reader(std::move(file), &reporter, true/*checksum*/,
+                     0/*initial_offset*/);
+  Log(options_.info_log, "Recovering log #%lu",
+      (unsigned long) log_number);
+
+  // Read all the records and add to a memtable
+  std::string scratch;
+  Slice record;
+  WriteBatch batch;
+  while (reader.ReadRecord(&record, &scratch)) {
+    if (record.size() < 12) {
+      // 12 bytes = WriteBatch header (8-byte sequence + 4-byte count).
+      reporter.Corruption(
+          record.size(), Status::Corruption("log record too small"));
+      continue;
+    }
+    WriteBatchInternal::SetContents(&batch, record);
+
+    // InsertInto filters out updates for column families that already
+    // flushed past this log (based on log_number).
+    status = WriteBatchInternal::InsertInto(
+        &batch, column_family_memtables_.get(), true, log_number);
+
+    MaybeIgnoreError(&status);
+    if (!status.ok()) {
+      return status;
+    }
+    const SequenceNumber last_seq =
+        WriteBatchInternal::Sequence(&batch) +
+        WriteBatchInternal::Count(&batch) - 1;
+    if (last_seq > *max_sequence) {
+      *max_sequence = last_seq;
+    }
+
+    if (!read_only) {
+      // no need to refcount since client still doesn't have access
+      // to the DB and can not drop column families while we iterate
+      for (auto cfd : *versions_->GetColumnFamilySet()) {
+        if (cfd->mem()->ShouldFlush()) {
+          // If this asserts, it means that InsertInto failed in
+          // filtering updates to already-flushed column families
+          assert(cfd->GetLogNumber() <= log_number);
+          auto iter = version_edits.find(cfd->GetID());
+          assert(iter != version_edits.end());
+          VersionEdit* edit = &iter->second;
+          status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit);
+          // we still want to clear the memtable, even if the recovery failed
+          cfd->CreateNewMemtable();
+          if (!status.ok()) {
+            // Reflect errors immediately so that conditions like full
+            // file-systems cause the DB::Open() to fail.
+            return status;
+          }
+        }
+      }
+    }
+  }
+
+  if (versions_->LastSequence() < *max_sequence) {
+    versions_->SetLastSequence(*max_sequence);
+  }
+
+  if (!read_only) {
+    // no need to refcount since client still doesn't have access
+    // to the DB and can not drop column families while we iterate
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      auto iter = version_edits.find(cfd->GetID());
+      assert(iter != version_edits.end());
+      VersionEdit* edit = &iter->second;
+
+      if (cfd->GetLogNumber() > log_number) {
+        // Column family cfd has already flushed the data
+        // from log_number. Memtable has to be empty because
+        // we filter the updates based on log_number
+        // (in WriteBatch::InsertInto)
+        assert(cfd->mem()->GetFirstSequenceNumber() == 0);
+        assert(edit->NumEntries() == 0);
+        continue;
+      }
+
+      // flush the final memtable (if non-empty)
+      if (cfd->mem()->GetFirstSequenceNumber() != 0) {
+        status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit);
+      }
+      // we still want to clear the memtable, even if the recovery failed
+      cfd->CreateNewMemtable();
+      if (!status.ok()) {
+        return status;
+      }
+
+      // write MANIFEST with update
+      // writing log number in the manifest means that any log file
+      // with number strongly less than (log_number + 1) is already
+      // recovered and should be ignored on next reincarnation.
+      // Since we already recovered log_number, we want all logs
+      // with numbers `<= log_number` (includes this one) to be ignored
+      edit->SetLogNumber(log_number + 1);
+      // we must mark the next log number as used, even though it's
+      // not actually used. that is because VersionSet assumes
+      // VersionSet::next_file_number_ always to be strictly greater than any
+      // log number
+      versions_->MarkFileNumberUsed(log_number + 1);
+      status = versions_->LogAndApply(cfd, edit, &mutex_);
+      if (!status.ok()) {
+        return status;
+      }
+    }
+  }
+
+  return status;
+}
+
+// Builds a level-0 table file from `mem` during WAL recovery and, if the
+// result is non-empty, records it in *edit and in compaction stats. The
+// mutex is released around the actual table build and re-acquired after.
+Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
+                                           VersionEdit* edit) {
+  mutex_.AssertHeld();
+  const uint64_t start_micros = env_->NowMicros();
+  FileMetaData meta;
+  meta.number = versions_->NewFileNumber();
+  // Protect the in-progress file from concurrent obsolete-file deletion.
+  pending_outputs_.insert(meta.number);
+  Iterator* iter = mem->NewIterator(ReadOptions(), true);
+  const SequenceNumber newest_snapshot = snapshots_.GetNewest();
+  const SequenceNumber earliest_seqno_in_memtable =
+    mem->GetFirstSequenceNumber();
+  Log(options_.info_log, "[%s] Level-0 table #%lu: started",
+      cfd->GetName().c_str(), (unsigned long)meta.number);
+
+  Status s;
+  {
+    // The build does file I/O; drop the DB mutex for its duration.
+    mutex_.Unlock();
+    s = BuildTable(dbname_, env_, *cfd->options(), storage_options_,
+                   cfd->table_cache(), iter, &meta, cfd->internal_comparator(),
+                   newest_snapshot, earliest_seqno_in_memtable,
+                   GetCompressionFlush(*cfd->options()));
+    LogFlush(options_.info_log);
+    mutex_.Lock();
+  }
+
+  Log(options_.info_log, "[%s] Level-0 table #%lu: %lu bytes %s",
+      cfd->GetName().c_str(), (unsigned long)meta.number,
+      (unsigned long)meta.file_size, s.ToString().c_str());
+  delete iter;
+
+  pending_outputs_.erase(meta.number);
+
+  // Note that if file_size is zero, the file has been deleted and
+  // should not be added to the manifest.
+  int level = 0;
+  if (s.ok() && meta.file_size > 0) {
+    edit->AddFile(level, meta.number, meta.file_size,
+                  meta.smallest, meta.largest,
+                  meta.smallest_seqno, meta.largest_seqno);
+  }
+
+  InternalStats::CompactionStats stats;
+  stats.micros = env_->NowMicros() - start_micros;
+  stats.bytes_written = meta.file_size;
+  // NOTE(review): the output is counted as a level-n+1 file even though it
+  // is written to level 0 -- confirm this stat is intended here.
+  stats.files_out_levelnp1 = 1;
+  cfd->internal_stats()->AddCompactionStats(level, stats);
+  RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size);
+  return s;
+}
+
+// Flushes the memtables in `mems` (merged together) into a single table
+// file, normally at level 0 but possibly higher when that is safe.
+// *filenumber receives the new file number; the mutex is released during
+// the build and re-acquired afterwards.
+Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd,
+                                autovector<MemTable*>& mems, VersionEdit* edit,
+                                uint64_t* filenumber, LogBuffer* log_buffer) {
+  mutex_.AssertHeld();
+  const uint64_t start_micros = env_->NowMicros();
+  FileMetaData meta;
+  meta.number = versions_->NewFileNumber();
+  *filenumber = meta.number;
+  // Protect the in-progress file from concurrent obsolete-file deletion.
+  pending_outputs_.insert(meta.number);
+
+  const SequenceNumber newest_snapshot = snapshots_.GetNewest();
+  const SequenceNumber earliest_seqno_in_memtable =
+    mems[0]->GetFirstSequenceNumber();
+  Version* base = cfd->current();
+  base->Ref();          // it is likely that we do not need this reference
+  Status s;
+  {
+    // Table build does file I/O; drop the DB mutex for its duration.
+    mutex_.Unlock();
+    log_buffer->FlushBufferToLog();
+    std::vector<Iterator*> memtables;
+    for (MemTable* m : mems) {
+      Log(options_.info_log, "[%s] Flushing memtable with next log file: %lu\n",
+          cfd->GetName().c_str(), (unsigned long)m->GetNextLogNumber());
+      memtables.push_back(m->NewIterator(ReadOptions(), true));
+    }
+    // Merge all picked memtables into one sorted stream for the build.
+    Iterator* iter = NewMergingIterator(&cfd->internal_comparator(),
+                                        &memtables[0], memtables.size());
+    Log(options_.info_log, "[%s] Level-0 flush table #%lu: started",
+        cfd->GetName().c_str(), (unsigned long)meta.number);
+
+    s = BuildTable(dbname_, env_, *cfd->options(), storage_options_,
+                   cfd->table_cache(), iter, &meta, cfd->internal_comparator(),
+                   newest_snapshot, earliest_seqno_in_memtable,
+                   GetCompressionFlush(*cfd->options()));
+    LogFlush(options_.info_log);
+    delete iter;
+    Log(options_.info_log, "[%s] Level-0 flush table #%lu: %lu bytes %s",
+        cfd->GetName().c_str(), (unsigned long)meta.number,
+        (unsigned long)meta.file_size, s.ToString().c_str());
+
+    if (!options_.disableDataSync) {
+      db_directory_->Fsync();
+    }
+    mutex_.Lock();
+  }
+  base->Unref();
+
+  // re-acquire the most current version
+  base = cfd->current();
+
+  // There could be multiple threads writing to its own level-0 file.
+  // The pending_outputs cannot be cleared here, otherwise this newly
+  // created file might not be considered as a live-file by another
+  // compaction thread that is concurrently deleting obsolete files.
+  // The pending_outputs can be cleared only after the new version is
+  // committed so that other threads can recognize this file as a
+  // valid one.
+  // pending_outputs_.erase(meta.number);
+
+  // Note that if file_size is zero, the file has been deleted and
+  // should not be added to the manifest.
+  int level = 0;
+  if (s.ok() && meta.file_size > 0) {
+    const Slice min_user_key = meta.smallest.user_key();
+    const Slice max_user_key = meta.largest.user_key();
+    // if we have more than 1 background thread, then we cannot
+    // insert files directly into higher levels because some other
+    // threads could be concurrently producing compacted files for
+    // that key range.
+    if (base != nullptr && options_.max_background_compactions <= 1 &&
+        cfd->options()->compaction_style == kCompactionStyleLevel) {
+      level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
+    }
+    edit->AddFile(level, meta.number, meta.file_size,
+                  meta.smallest, meta.largest,
+                  meta.smallest_seqno, meta.largest_seqno);
+  }
+
+  InternalStats::CompactionStats stats;
+  stats.micros = env_->NowMicros() - start_micros;
+  stats.bytes_written = meta.file_size;
+  cfd->internal_stats()->AddCompactionStats(level, stats);
+  RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size);
+  return s;
+}
+
+Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd,
+                                         bool* madeProgress,
+                                         DeletionState& deletion_state,
+                                         LogBuffer* log_buffer) {
+  mutex_.AssertHeld();
+  assert(cfd->imm()->size() != 0);
+  assert(cfd->imm()->IsFlushPending());
+
+  // Save the contents of the earliest memtable as a new Table
+  uint64_t file_number;
+  autovector<MemTable*> mems;
+  cfd->imm()->PickMemtablesToFlush(&mems);
+  if (mems.empty()) {
+    LogToBuffer(log_buffer, "[%s] Nothing in memtable to flush",
+                cfd->GetName().c_str());
+    return Status::OK();
+  }
+
+  // record the logfile_number_ before we release the mutex
+  // entries mems are (implicitly) sorted in ascending order by their created
+  // time. We will use the first memtable's `edit` to keep the meta info for
+  // this flush.
+  MemTable* m = mems[0];
+  VersionEdit* edit = m->GetEdits();
+  edit->SetPrevLogNumber(0);
+  // SetLogNumber(log_num) indicates logs with number smaller than log_num
+  // will no longer be picked up for recovery.
+  edit->SetLogNumber(mems.back()->GetNextLogNumber());
+  edit->SetColumnFamily(cfd->GetID());
+
+  // This will release and re-acquire the mutex.
+  Status s = WriteLevel0Table(cfd, mems, edit, &file_number, log_buffer);
+
+  if (s.ok() && shutting_down_.Acquire_Load() && cfd->IsDropped()) {
+    s = Status::ShutdownInProgress(
+        "Database shutdown or Column family drop during flush");
+  }
+
+  if (!s.ok()) {
+    cfd->imm()->RollbackMemtableFlush(mems, file_number, &pending_outputs_);
+  } else {
+    // Replace immutable memtable with the generated Table
+    s = cfd->imm()->InstallMemtableFlushResults(
+        cfd, mems, versions_.get(), &mutex_, options_.info_log.get(),
+        file_number, pending_outputs_, &deletion_state.memtables_to_free,
+        db_directory_.get(), log_buffer);
+  }
+
+  if (s.ok()) {
+    InstallSuperVersion(cfd, deletion_state);
+    if (madeProgress) {
+      *madeProgress = 1;
+    }
+    Version::LevelSummaryStorage tmp;
+    LogToBuffer(log_buffer, "[%s] Level summary: %s\n", cfd->GetName().c_str(),
+                cfd->current()->LevelSummary(&tmp));
+
+    MaybeScheduleLogDBDeployStats();
+
+    if (disable_delete_obsolete_files_ == 0) {
+      // add to deletion state
+      while (alive_log_files_.size() &&
+             alive_log_files_.begin()->number < versions_->MinLogNumber()) {
+        const auto& earliest = *alive_log_files_.begin();
+        deletion_state.log_delete_files.push_back(earliest.number);
+        total_log_size_ -= earliest.size;
+        alive_log_files_.pop_front();
+      }
+    }
+  }
+
+  if (!s.ok() && !s.IsShutdownInProgress() && options_.paranoid_checks &&
+      bg_error_.ok()) {
+    // if a bad error happened (not ShutdownInProgress) and paranoid_checks is
+    // true, mark DB read-only
+    bg_error_ = s;
+  }
+  return s;
+}
+
+// Flushes the memtable, then manually compacts the key range [begin, end]
+// level by level up to the deepest level containing overlapping files;
+// optionally refits the result into a lower level afterwards.
+Status DBImpl::CompactRange(ColumnFamilyHandle* column_family,
+                            const Slice* begin, const Slice* end,
+                            bool reduce_level, int target_level) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+
+  Status s = FlushMemTable(cfd, FlushOptions());
+  if (!s.ok()) {
+    LogFlush(options_.info_log);
+    return s;
+  }
+
+  // Find the deepest level whose files overlap the requested range.
+  int max_level_with_files = 1;
+  {
+    MutexLock l(&mutex_);
+    Version* base = cfd->current();
+    for (int level = 1; level < cfd->NumberLevels(); level++) {
+      if (base->OverlapInLevel(level, begin, end)) {
+        max_level_with_files = level;
+      }
+    }
+  }
+  for (int level = 0; level <= max_level_with_files; level++) {
+    // in case the compaction is universal or if we're compacting the
+    // bottom-most level, the output level will be the same as input one
+    if (cfd->options()->compaction_style == kCompactionStyleUniversal ||
+        level == max_level_with_files) {
+      s = RunManualCompaction(cfd, level, level, begin, end);
+    } else {
+      s = RunManualCompaction(cfd, level, level + 1, begin, end);
+    }
+    if (!s.ok()) {
+      LogFlush(options_.info_log);
+      return s;
+    }
+  }
+
+  if (reduce_level) {
+    s = ReFitLevel(cfd, max_level_with_files, target_level);
+  }
+  LogFlush(options_.info_log);
+
+  return s;
+}
+
+// return the same level if it cannot be moved
+int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level) {
+  mutex_.AssertHeld();
+  Version* current = cfd->current();
+  int minimum_level = level;
+  for (int i = level - 1; i > 0; --i) {
+    // stop if level i is not empty
+    if (current->NumLevelFiles(i) > 0) break;
+    // stop if level i is too small (cannot fit the level files)
+    if (cfd->compaction_picker()->MaxBytesForLevel(i) <
+        current->NumLevelBytes(level)) {
+      break;
+    }
+
+    minimum_level = i;
+  }
+  return minimum_level;
+}
+
+// Moves every file of `level` to `target_level` (or to the minimum fitting
+// empty level when target_level < 0) by rewriting the MANIFEST only -- no
+// data is rewritten. Only one refit may run at a time, and all background
+// compaction/flush work is gated off for its duration.
+Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
+  assert(level < cfd->NumberLevels());
+
+  // Pre-allocate the SuperVersion outside the mutex; freed (possibly unused)
+  // after the mutex is released at the end.
+  SuperVersion* superversion_to_free = nullptr;
+  SuperVersion* new_superversion = new SuperVersion();
+
+  mutex_.Lock();
+
+  // only allow one thread refitting
+  if (refitting_level_) {
+    mutex_.Unlock();
+    Log(options_.info_log, "ReFitLevel: another thread is refitting");
+    delete new_superversion;
+    return Status::NotSupported("another thread is refitting");
+  }
+  refitting_level_ = true;
+
+  // wait for all background threads to stop
+  bg_work_gate_closed_ = true;
+  while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_) {
+    Log(options_.info_log,
+        "RefitLevel: waiting for background threads to stop: %d %d",
+        bg_compaction_scheduled_, bg_flush_scheduled_);
+    bg_cv_.Wait();
+  }
+
+  // move to a smaller level
+  int to_level = target_level;
+  if (target_level < 0) {
+    to_level = FindMinimumEmptyLevelFitting(cfd, level);
+  }
+
+  assert(to_level <= level);
+
+  Status status;
+  if (to_level < level) {
+    Log(options_.info_log, "[%s] Before refitting:\n%s", cfd->GetName().c_str(),
+        cfd->current()->DebugString().data());
+
+    // Re-register every file of `level` at `to_level` in a single edit.
+    VersionEdit edit;
+    edit.SetColumnFamily(cfd->GetID());
+    for (const auto& f : cfd->current()->files_[level]) {
+      edit.DeleteFile(level, f->number);
+      edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest,
+                   f->smallest_seqno, f->largest_seqno);
+    }
+    Log(options_.info_log, "[%s] Apply version edit:\n%s",
+        cfd->GetName().c_str(), edit.DebugString().data());
+
+    status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get());
+    superversion_to_free = cfd->InstallSuperVersion(new_superversion, &mutex_);
+    new_superversion = nullptr;
+
+    Log(options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(),
+        status.ToString().data());
+
+    if (status.ok()) {
+      Log(options_.info_log, "[%s] After refitting:\n%s",
+          cfd->GetName().c_str(), cfd->current()->DebugString().data());
+    }
+  }
+
+  refitting_level_ = false;
+  bg_work_gate_closed_ = false;
+
+  mutex_.Unlock();
+  // Deletions happen outside the mutex; at most one of these is non-null.
+  delete superversion_to_free;
+  delete new_superversion;
+  return status;
+}
+
+// Number of LSM levels configured for the given column family.
+int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) {
+  auto* handle = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  ColumnFamilyData* cfd = handle->cfd();
+  return cfd->NumberLevels();
+}
+
+// The column family's configured max_mem_compaction_level option value.
+int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) {
+  auto* handle = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  return handle->cfd()->options()->max_mem_compaction_level;
+}
+
+// The column family's configured level0_stop_writes_trigger option value.
+int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
+  auto* handle = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  return handle->cfd()->options()->level0_stop_writes_trigger;
+}
+
+// Public Flush entry point: forward to FlushMemTable for the underlying
+// column family data, honoring the supplied FlushOptions.
+Status DBImpl::Flush(const FlushOptions& options,
+                     ColumnFamilyHandle* column_family) {
+  auto* handle = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  ColumnFamilyData* cfd = handle->cfd();
+  return FlushMemTable(cfd, options);
+}
+
+// The most recent sequence number assigned by the write path.
+SequenceNumber DBImpl::GetLatestSequenceNumber() const {
+  SequenceNumber last_sequence = versions_->LastSequence();
+  return last_sequence;
+}
+
+// Run a manual compaction for `cfd`, compacting the key range [begin, end]
+// from input_level into output_level. Blocks until the request is fully
+// processed (possibly across several BackgroundCompaction runs) and returns
+// the compaction's status. nullptr begin/end mean the range is unbounded on
+// that side; universal compaction always compacts all files.
+Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level,
+                                   int output_level, const Slice* begin,
+                                   const Slice* end) {
+  assert(input_level >= 0);
+
+  InternalKey begin_storage, end_storage;
+
+  ManualCompaction manual;
+  manual.cfd = cfd;
+  manual.input_level = input_level;
+  manual.output_level = output_level;
+  manual.done = false;
+  manual.in_progress = false;
+  // For universal compaction, we enforce every manual compaction to compact
+  // all files.
+  if (begin == nullptr ||
+      cfd->options()->compaction_style == kCompactionStyleUniversal) {
+    manual.begin = nullptr;
+  } else {
+    begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
+    manual.begin = &begin_storage;
+  }
+  if (end == nullptr ||
+      cfd->options()->compaction_style == kCompactionStyleUniversal) {
+    manual.end = nullptr;
+  } else {
+    // Sequence number 0 with the smallest value type sorts after all internal
+    // keys with the same user key, making the end bound inclusive.
+    end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
+    manual.end = &end_storage;
+  }
+
+  MutexLock l(&mutex_);
+
+  // When a manual compaction arrives, temporarily disable scheduling of
+  // non-manual compactions and wait until the number of scheduled compaction
+  // jobs drops to zero. This is needed to ensure that this manual compaction
+  // can compact any range of keys/files.
+  //
+  // bg_manual_only_ is non-zero when at least one thread is inside
+  // RunManualCompaction(), i.e. during that time no other compaction will
+  // get scheduled (see MaybeScheduleFlushOrCompaction).
+  //
+  // Note that the following loop doesn't stop more than one thread calling
+  // RunManualCompaction() from getting to the second while loop below.
+  // However, only one of them will actually schedule compaction, while
+  // others will wait on a condition variable until it completes.
+
+  ++bg_manual_only_;
+  while (bg_compaction_scheduled_ > 0) {
+    Log(options_.info_log,
+        "[%s] Manual compaction waiting for all other scheduled background "
+        "compactions to finish",
+        cfd->GetName().c_str());
+    bg_cv_.Wait();
+  }
+
+  Log(options_.info_log, "[%s] Manual compaction starting",
+      cfd->GetName().c_str());
+
+  // A single background run may compact only part of the range; loop until
+  // the whole request is done, the DB shuts down, or an error is recorded.
+  while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) {
+    assert(bg_manual_only_ > 0);
+    if (manual_compaction_ != nullptr) {
+      // Running either this or some other manual compaction
+      bg_cv_.Wait();
+    } else {
+      manual_compaction_ = &manual;
+      MaybeScheduleFlushOrCompaction();
+    }
+  }
+
+  assert(!manual.in_progress);
+  assert(bg_manual_only_ > 0);
+  --bg_manual_only_;
+  return manual.status;
+}
+
+// Trigger a flush of cfd's memtable. Writing a nullptr batch makes the write
+// path wait for earlier writes to be done; if options.wait is set we then
+// block until the resulting flush has completed.
+Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
+                             const FlushOptions& options) {
+  Status status = Write(WriteOptions(), nullptr);
+  if (!status.ok() || !options.wait) {
+    return status;
+  }
+  // Block until every immutable memtable of this column family is persisted.
+  return WaitForFlushMemTable(cfd);
+}
+
+// Block the caller until every immutable memtable of `cfd` has been flushed,
+// or until a background error is recorded. Returns the background error if
+// one occurred, OK otherwise.
+Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) {
+  Status s;
+  // Wait until the compaction completes
+  MutexLock l(&mutex_);
+  // bg_cv_ is signalled when background work finishes; loop to re-check the
+  // predicate on every wakeup (spurious or for an unrelated column family).
+  while (cfd->imm()->size() > 0 && bg_error_.ok()) {
+    bg_cv_.Wait();
+  }
+  if (!bg_error_.ok()) {
+    s = bg_error_;
+  }
+  return s;
+}
+
+// Schedule pending background flush and/or compaction work on the Env thread
+// pools. When work is pending but a max_background_* limit has been reached,
+// bg_schedule_needed_ is set so a finishing background thread retries.
+// REQUIRES: mutex_ held.
+void DBImpl::MaybeScheduleFlushOrCompaction() {
+  mutex_.AssertHeld();
+  bg_schedule_needed_ = false;
+  if (bg_work_gate_closed_) {
+    // gate closed for background work (see ReFitLevel)
+  } else if (shutting_down_.Acquire_Load()) {
+    // DB is being deleted; no more background compactions
+  } else {
+    bool is_flush_pending = false;
+    // no need to refcount since we're under a mutex
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->imm()->IsFlushPending()) {
+        is_flush_pending = true;
+      }
+    }
+    if (is_flush_pending) {
+      // memtable flush needed
+      if (bg_flush_scheduled_ < options_.max_background_flushes) {
+        bg_flush_scheduled_++;
+        env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH);
+      } else if (options_.max_background_flushes > 0) {
+        // HIGH pool is enabled but saturated; remember to reschedule later.
+        bg_schedule_needed_ = true;
+      }
+    }
+    bool is_compaction_needed = false;
+    // no need to refcount since we're under a mutex
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->current()->NeedsCompaction()) {
+        is_compaction_needed = true;
+        break;
+      }
+    }
+
+    // Schedule BGWorkCompaction if there's a compaction pending (or a memtable
+    // flush, but the HIGH pool is not enabled)
+    // Do it only if max_background_compactions hasn't been reached and, in case
+    // bg_manual_only_ > 0, if it's a manual compaction.
+    if ((manual_compaction_ || is_compaction_needed ||
+         (is_flush_pending && options_.max_background_flushes == 0)) &&
+        (!bg_manual_only_ || manual_compaction_)) {
+      if (bg_compaction_scheduled_ < options_.max_background_compactions) {
+        bg_compaction_scheduled_++;
+        env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW);
+      } else {
+        bg_schedule_needed_ = true;
+      }
+    }
+  }
+}
+
+// Entry point handed to Env::Schedule for the HIGH priority pool; `db` is the
+// DBImpl instance that scheduled the flush work.
+void DBImpl::BGWorkFlush(void* db) {
+  static_cast<DBImpl*>(db)->BackgroundCallFlush();
+}
+
+// Entry point handed to Env::Schedule for the LOW priority pool; `db` is the
+// DBImpl instance that scheduled the compaction work.
+void DBImpl::BGWorkCompaction(void* db) {
+  static_cast<DBImpl*>(db)->BackgroundCallCompaction();
+}
+
+// Flush every column family that has a pending immutable memtable. A failure
+// in one family does not stop the loop; the remaining families are still
+// flushed and the first failure status is returned. REQUIRES: mutex_ held.
+Status DBImpl::BackgroundFlush(bool* madeProgress,
+                               DeletionState& deletion_state,
+                               LogBuffer* log_buffer) {
+  mutex_.AssertHeld();
+  // call_status is failure if at least one flush was a failure. even if
+  // flushing one column family reports a failure, we will continue flushing
+  // other column families. however, call_status will be a failure in that case.
+  Status call_status;
+  // refcounting in iteration
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    cfd->Ref();
+    Status flush_status;
+    while (flush_status.ok() && cfd->imm()->IsFlushPending()) {
+      LogToBuffer(
+          log_buffer,
+          "BackgroundCallFlush doing FlushMemTableToOutputFile with column "
+          "family [%s], flush slots available %d",
+          cfd->GetName().c_str(),
+          options_.max_background_flushes - bg_flush_scheduled_);
+      flush_status = FlushMemTableToOutputFile(cfd, madeProgress,
+                                               deletion_state, log_buffer);
+    }
+    if (call_status.ok() && !flush_status.ok()) {
+      call_status = flush_status;
+    }
+    cfd->Unref();
+  }
+  versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+  return call_status;
+}
+
+// Thread-pool entry for a scheduled flush job (see BGWorkFlush). Runs one
+// BackgroundFlush pass, purges obsolete files, reschedules further work if
+// needed, and wakes up threads waiting on bg_cv_.
+void DBImpl::BackgroundCallFlush() {
+  bool madeProgress = false;
+  DeletionState deletion_state(true);
+  assert(bg_flush_scheduled_);
+
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get());
+  {
+    MutexLock l(&mutex_);
+
+    Status s;
+    if (!shutting_down_.Acquire_Load()) {
+      s = BackgroundFlush(&madeProgress, deletion_state, &log_buffer);
+      if (!s.ok()) {
+        // Wait a little bit before retrying background compaction in
+        // case this is an environmental problem and we do not want to
+        // chew up resources for failed compactions for the duration of
+        // the problem.
+        uint64_t error_cnt = default_cf_handle_->cfd()
+                                 ->internal_stats()
+                                 ->BumpAndGetBackgroundErrorCount();
+        bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+        mutex_.Unlock();
+        Log(options_.info_log,
+            "Waiting after background flush error: %s"
+            "Accumulated background error counts: %" PRIu64,
+            s.ToString().c_str(), error_cnt);
+        log_buffer.FlushBufferToLog();
+        LogFlush(options_.info_log);
+        env_->SleepForMicroseconds(1000000);
+        mutex_.Lock();
+      }
+    }
+
+    // If !s.ok(), this means that Flush failed. In that case, we want
+    // to delete all obsolete files and we force FindObsoleteFiles()
+    FindObsoleteFiles(deletion_state, !s.ok());
+    // delete unnecessary files if any, this is done outside the mutex
+    if (deletion_state.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+      mutex_.Unlock();
+      // Have to flush the info logs before bg_flush_scheduled_--
+      // because if bg_flush_scheduled_ becomes 0 and the lock is
+      // released, the destructor of DB can kick in and destroy all the
+      // states of DB so info_log might not be available after that point.
+      // It also applies to access other states that DB owns.
+      log_buffer.FlushBufferToLog();
+      if (deletion_state.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(deletion_state);
+      }
+      mutex_.Lock();
+    }
+
+    bg_flush_scheduled_--;
+    // Any time the mutex is released after finding the work to do, another
+    // thread might execute MaybeScheduleFlushOrCompaction(). It is possible
+    // that there is a pending job but it is not scheduled because of the
+    // max thread limit.
+    if (madeProgress || bg_schedule_needed_) {
+      MaybeScheduleFlushOrCompaction();
+    }
+    bg_cv_.SignalAll();
+    // IMPORTANT: there should be no code after calling SignalAll. This call may
+    // signal the DB destructor that it's OK to proceed with destruction. In
+    // that case, all DB variables will be deallocated and referencing them
+    // will cause trouble.
+  }
+}
+
+// Thread-pool entry for a scheduled compaction job (see BGWorkCompaction).
+// Runs one BackgroundCompaction pass, purges obsolete files, reschedules
+// further work if needed, and wakes up threads waiting on bg_cv_.
+void DBImpl::BackgroundCallCompaction() {
+  bool madeProgress = false;
+  DeletionState deletion_state(true);
+
+  MaybeDumpStats();
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get());
+  {
+    MutexLock l(&mutex_);
+    assert(bg_compaction_scheduled_);
+    Status s;
+    if (!shutting_down_.Acquire_Load()) {
+      s = BackgroundCompaction(&madeProgress, deletion_state, &log_buffer);
+      if (!s.ok()) {
+        // Wait a little bit before retrying background compaction in
+        // case this is an environmental problem and we do not want to
+        // chew up resources for failed compactions for the duration of
+        // the problem.
+        uint64_t error_cnt = default_cf_handle_->cfd()
+                                 ->internal_stats()
+                                 ->BumpAndGetBackgroundErrorCount();
+        bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+        mutex_.Unlock();
+        log_buffer.FlushBufferToLog();
+        Log(options_.info_log,
+            "Waiting after background compaction error: %s, "
+            "Accumulated background error counts: %" PRIu64,
+            s.ToString().c_str(), error_cnt);
+        LogFlush(options_.info_log);
+        env_->SleepForMicroseconds(1000000);
+        mutex_.Lock();
+      }
+    }
+
+    // If !s.ok(), this means that Compaction failed. In that case, we want
+    // to delete all obsolete files we might have created and we force
+    // FindObsoleteFiles(). This is because deletion_state does not catch
+    // all created files if compaction failed.
+    FindObsoleteFiles(deletion_state, !s.ok());
+
+    // delete unnecessary files if any, this is done outside the mutex
+    if (deletion_state.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+      mutex_.Unlock();
+      // Have to flush the info logs before bg_compaction_scheduled_--
+      // because if bg_compaction_scheduled_ becomes 0 and the lock is
+      // released, the destructor of DB can kick in and destroy all the
+      // states of DB so info_log might not be available after that point.
+      // It also applies to access other states that DB owns.
+      log_buffer.FlushBufferToLog();
+      if (deletion_state.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(deletion_state);
+      }
+      mutex_.Lock();
+    }
+
+    bg_compaction_scheduled_--;
+
+    MaybeScheduleLogDBDeployStats();
+
+    versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+
+    // Previous compaction may have produced too many files in a level,
+    // So reschedule another compaction if we made progress in the
+    // last compaction.
+    //
+    // Also, any time the mutex is released after finding the work to do,
+    // another thread might execute MaybeScheduleFlushOrCompaction(). It is
+    // possible that there is a pending job but it is not scheduled because of
+    // the max thread limit.
+    if (madeProgress || bg_schedule_needed_) {
+      MaybeScheduleFlushOrCompaction();
+    }
+    bg_cv_.SignalAll();
+    // IMPORTANT: there should be no code after calling SignalAll. This call may
+    // signal the DB destructor that it's OK to proceed with destruction. In
+    // that case, all DB variables will be deallocated and referencing them
+    // will cause trouble.
+  }
+}
+
+// Execute one unit of compaction work: flush any pending memtables first
+// (flushes preempt compactions), then run either the pending manual
+// compaction or an automatically picked one. Trivial moves are applied as a
+// pure metadata change. Sets *madeProgress when a compaction actually ran.
+// REQUIRES: mutex_ held.
+Status DBImpl::BackgroundCompaction(bool* madeProgress,
+                                    DeletionState& deletion_state,
+                                    LogBuffer* log_buffer) {
+  *madeProgress = false;
+  mutex_.AssertHeld();
+
+  bool is_manual = (manual_compaction_ != nullptr) &&
+                   (manual_compaction_->in_progress == false);
+
+  if (is_manual) {
+    // another thread cannot pick up the same work
+    manual_compaction_->in_progress = true;
+  }
+
+  // FLUSH preempts compaction
+  Status flush_stat;
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    while (cfd->imm()->IsFlushPending()) {
+      LogToBuffer(
+          log_buffer,
+          "BackgroundCompaction doing FlushMemTableToOutputFile, "
+          "compaction slots available %d",
+          options_.max_background_compactions - bg_compaction_scheduled_);
+      cfd->Ref();
+      flush_stat = FlushMemTableToOutputFile(cfd, madeProgress, deletion_state,
+                                             log_buffer);
+      cfd->Unref();
+      if (!flush_stat.ok()) {
+        // A failed flush aborts the whole pass; conclude the manual request
+        // so its waiter does not block forever.
+        if (is_manual) {
+          manual_compaction_->status = flush_stat;
+          manual_compaction_->done = true;
+          manual_compaction_->in_progress = false;
+          manual_compaction_ = nullptr;
+        }
+        return flush_stat;
+      }
+    }
+  }
+
+  unique_ptr<Compaction> c;
+  InternalKey manual_end_storage;
+  InternalKey* manual_end = &manual_end_storage;
+  if (is_manual) {
+    ManualCompaction* m = manual_compaction_;
+    assert(m->in_progress);
+    c.reset(m->cfd->CompactRange(m->input_level, m->output_level, m->begin,
+                                 m->end, &manual_end));
+    if (!c) {
+      // Nothing in the requested range left to compact.
+      m->done = true;
+    }
+    LogToBuffer(log_buffer,
+                "[%s] Manual compaction from level-%d to level-%d from %s .. "
+                "%s; will stop at %s\n",
+                m->cfd->GetName().c_str(), m->input_level, m->output_level,
+                (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+                (m->end ? m->end->DebugString().c_str() : "(end)"),
+                ((m->done || manual_end == nullptr)
+                     ? "(end)"
+                     : manual_end->DebugString().c_str()));
+  } else {
+    // no need to refcount in iteration since it's always under a mutex
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (!cfd->options()->disable_auto_compactions) {
+        c.reset(cfd->PickCompaction(log_buffer));
+        if (c != nullptr) {
+          // update statistics
+          MeasureTime(options_.statistics.get(), NUM_FILES_IN_SINGLE_COMPACTION,
+                      c->inputs(0)->size());
+          break;
+        }
+      }
+    }
+  }
+
+  Status status;
+  if (!c) {
+    // Nothing to do
+    LogToBuffer(log_buffer, "Compaction nothing to do");
+  } else if (!is_manual && c->IsTrivialMove()) {
+    // Move file to next level
+    assert(c->num_input_files(0) == 1);
+    FileMetaData* f = c->input(0, 0);
+    c->edit()->DeleteFile(c->level(), f->number);
+    c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
+                       f->smallest, f->largest,
+                       f->smallest_seqno, f->largest_seqno);
+    status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_,
+                                    db_directory_.get());
+    InstallSuperVersion(c->column_family_data(), deletion_state);
+
+    Version::LevelSummaryStorage tmp;
+    LogToBuffer(log_buffer, "[%s] Moved #%lld to level-%d %lld bytes %s: %s\n",
+                c->column_family_data()->GetName().c_str(),
+                static_cast<unsigned long long>(f->number), c->level() + 1,
+                static_cast<unsigned long long>(f->file_size),
+                status.ToString().c_str(),
+                c->input_version()->LevelSummary(&tmp));
+    c->ReleaseCompactionFiles(status);
+    *madeProgress = true;
+  } else {
+    MaybeScheduleFlushOrCompaction(); // do more compaction work in parallel.
+    CompactionState* compact = new CompactionState(c.get());
+    status = DoCompactionWork(compact, deletion_state, log_buffer);
+    CleanupCompaction(compact, status);
+    c->ReleaseCompactionFiles(status);
+    c->ReleaseInputs();
+    *madeProgress = true;
+  }
+  c.reset();
+
+  if (status.ok()) {
+    // Done
+  } else if (shutting_down_.Acquire_Load()) {
+    // Ignore compaction errors found during shutting down
+  } else {
+    Log(InfoLogLevel::WARN_LEVEL, options_.info_log, "Compaction error: %s",
+        status.ToString().c_str());
+    if (options_.paranoid_checks && bg_error_.ok()) {
+      bg_error_ = status;
+    }
+  }
+
+  if (is_manual) {
+    ManualCompaction* m = manual_compaction_;
+    if (!status.ok()) {
+      m->status = status;
+      m->done = true;
+    }
+    // For universal compaction:
+    //   Because universal compaction always happens at level 0, so one
+    //   compaction will pick up all overlapped files. No files will be
+    //   filtered out due to size limit and left for a successive compaction.
+    //   So we can safely conclude the current compaction.
+    //
+    //   Also note that, if we don't stop here, then the current compaction
+    //   writes a new file back to level 0, which will be used in successive
+    //   compaction. Hence the manual compaction will never finish.
+    //
+    // Stop the compaction if manual_end points to nullptr -- this means
+    // that we compacted the whole range. manual_end should always point
+    // to nullptr in case of universal compaction
+    if (manual_end == nullptr) {
+      m->done = true;
+    }
+    if (!m->done) {
+      // We only compacted part of the requested range.  Update *m
+      // to the range that is left to be compacted.
+      // Universal compaction should always compact the whole range
+      assert(m->cfd->options()->compaction_style != kCompactionStyleUniversal);
+      m->tmp_storage = *manual_end;
+      m->begin = &m->tmp_storage;
+    }
+    m->in_progress = false; // not being processed anymore
+    manual_compaction_ = nullptr;
+  }
+  return status;
+}
+
+// Release all state belonging to a finished or aborted compaction: abandon a
+// half-built output table (shutdown case), un-protect the output file numbers
+// in pending_outputs_, and evict uncommitted outputs from the table cache.
+// REQUIRES: mutex_ held.
+void DBImpl::CleanupCompaction(CompactionState* compact, Status status) {
+  mutex_.AssertHeld();
+  if (compact->builder != nullptr) {
+    // May happen if we get a shutdown call in the middle of compaction
+    compact->builder->Abandon();
+    compact->builder.reset();
+  } else {
+    assert(compact->outfile == nullptr);
+  }
+  for (size_t i = 0; i < compact->outputs.size(); i++) {
+    const CompactionState::Output& out = compact->outputs[i];
+    pending_outputs_.erase(out.number);
+
+    // If this file was inserted into the table cache then remove
+    // them here because this compaction was not committed.
+    if (!status.ok()) {
+      TableCache::Evict(table_cache_.get(), out.number);
+    }
+  }
+  delete compact;
+}
+
+// Allocate the file numbers for the output file. We allocate as
+// many output file numbers as there are files in level+1 (at least one)
+// Insert them into pending_outputs so that they do not get deleted.
+// REQUIRES: mutex_ held (asserted below).
+void DBImpl::AllocateCompactionOutputFileNumbers(CompactionState* compact) {
+  mutex_.AssertHeld();
+  assert(compact != nullptr);
+  assert(compact->builder == nullptr);
+  int filesNeeded = compact->compaction->num_input_files(1);
+  // Always allocate at least one number, even when level+1 has no files.
+  for (int i = 0; i < std::max(filesNeeded, 1); i++) {
+    uint64_t file_number = versions_->NewFileNumber();
+    pending_outputs_.insert(file_number);
+    compact->allocated_file_numbers.push_back(file_number);
+  }
+}
+
+// Drop every pre-allocated output file number the compaction did not use, so
+// the obsolete-file scanner no longer protects those numbers.
+// REQUIRES: mutex_ held.
+void DBImpl::ReleaseCompactionUnusedFileNumbers(CompactionState* compact) {
+  mutex_.AssertHeld();
+  for (const uint64_t number : compact->allocated_file_numbers) {
+    pending_outputs_.erase(number);
+  }
+}
+
+// Create the next output SST file for `compact` and attach a TableBuilder to
+// it. Uses a pre-allocated file number when available so the DB mutex does
+// not have to be re-acquired.
+Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
+  assert(compact != nullptr);
+  assert(compact->builder == nullptr);
+  uint64_t file_number;
+  // If we have not yet exhausted the pre-allocated file numbers,
+  // then use the one from the front. Otherwise, we have to acquire
+  // the heavyweight lock and allocate a new file number.
+  if (!compact->allocated_file_numbers.empty()) {
+    file_number = compact->allocated_file_numbers.front();
+    compact->allocated_file_numbers.pop_front();
+  } else {
+    mutex_.Lock();
+    file_number = versions_->NewFileNumber();
+    pending_outputs_.insert(file_number);
+    mutex_.Unlock();
+  }
+  // Record the new (still empty) output in the compaction state.
+  CompactionState::Output out;
+  out.number = file_number;
+  out.smallest.Clear();
+  out.largest.Clear();
+  out.smallest_seqno = out.largest_seqno = 0;
+  compact->outputs.push_back(out);
+
+  // Make the output file
+  std::string fname = TableFileName(dbname_, file_number);
+  Status s = env_->NewWritableFile(fname, &compact->outfile, storage_options_);
+
+  if (s.ok()) {
+    // Over-estimate slightly so we don't end up just barely crossing
+    // the threshold.
+    ColumnFamilyData* cfd = compact->compaction->column_family_data();
+    compact->outfile->SetPreallocationBlockSize(
+        1.1 * cfd->compaction_picker()->MaxFileSizeForLevel(
+                  compact->compaction->output_level()));
+
+    CompressionType compression_type =
+        GetCompressionType(*cfd->options(), compact->compaction->output_level(),
+                           compact->compaction->enable_compression());
+
+    compact->builder.reset(
+        NewTableBuilder(*cfd->options(), cfd->internal_comparator(),
+                        compact->outfile.get(), compression_type));
+  }
+  LogFlush(options_.info_log);
+  return s;
+}
+
+// Finalize the compaction's current output table: finish the builder (or
+// abandon it when the input iterator reported an error), sync and close the
+// file, record its size, and verify the new table is readable through the
+// table cache before declaring success.
+Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
+                                          Iterator* input) {
+  assert(compact != nullptr);
+  assert(compact->outfile);
+  assert(compact->builder != nullptr);
+
+  const uint64_t output_number = compact->current_output()->number;
+  assert(output_number != 0);
+
+  // Check for iterator errors
+  Status s = input->status();
+  const uint64_t current_entries = compact->builder->NumEntries();
+  if (s.ok()) {
+    s = compact->builder->Finish();
+  } else {
+    compact->builder->Abandon();
+  }
+  const uint64_t current_bytes = compact->builder->FileSize();
+  compact->current_output()->file_size = current_bytes;
+  compact->total_bytes += current_bytes;
+  compact->builder.reset();
+
+  // Finish and check for file errors
+  if (s.ok() && !options_.disableDataSync) {
+    if (options_.use_fsync) {
+      StopWatch sw(env_, options_.statistics.get(),
+                   COMPACTION_OUTFILE_SYNC_MICROS, false);
+      s = compact->outfile->Fsync();
+    } else {
+      StopWatch sw(env_, options_.statistics.get(),
+                   COMPACTION_OUTFILE_SYNC_MICROS, false);
+      s = compact->outfile->Sync();
+    }
+  }
+  if (s.ok()) {
+    s = compact->outfile->Close();
+  }
+  compact->outfile.reset();
+
+  if (s.ok() && current_entries > 0) {
+    // Verify that the table is usable
+    ColumnFamilyData* cfd = compact->compaction->column_family_data();
+    FileMetaData meta(output_number, current_bytes);
+    Iterator* iter = cfd->table_cache()->NewIterator(
+        ReadOptions(), storage_options_, cfd->internal_comparator(), meta);
+    s = iter->status();
+    delete iter;
+    if (s.ok()) {
+      Log(options_.info_log, "[%s] Generated table #%lu: %lu keys, %lu bytes",
+          cfd->GetName().c_str(), (unsigned long)output_number,
+          (unsigned long)current_entries, (unsigned long)current_bytes);
+    }
+  }
+  return s;
+}
+
+
+// Commit a finished compaction to the MANIFEST: delete all inputs and add all
+// outputs in one version edit. Fails with Corruption if a concurrent
+// compaction invalidated the input file set. REQUIRES: mutex_ held.
+Status DBImpl::InstallCompactionResults(CompactionState* compact,
+                                        LogBuffer* log_buffer) {
+  mutex_.AssertHeld();
+
+  // paranoia: verify that the files that we started with
+  // still exist in the current version and in the same original level.
+  // This ensures that a concurrent compaction did not erroneously
+  // pick the same files to compact.
+  if (!versions_->VerifyCompactionFileConsistency(compact->compaction)) {
+    Log(options_.info_log, "[%s] Compaction %d@%d + %d@%d files aborted",
+        compact->compaction->column_family_data()->GetName().c_str(),
+        compact->compaction->num_input_files(0), compact->compaction->level(),
+        compact->compaction->num_input_files(1),
+        compact->compaction->output_level());
+    return Status::Corruption("Compaction input files inconsistent");
+  }
+
+  LogToBuffer(log_buffer, "[%s] Compacted %d@%d + %d@%d files => %lld bytes",
+              compact->compaction->column_family_data()->GetName().c_str(),
+              compact->compaction->num_input_files(0),
+              compact->compaction->level(),
+              compact->compaction->num_input_files(1),
+              compact->compaction->output_level(),
+              static_cast<long long>(compact->total_bytes));
+
+  // Add compaction outputs
+  compact->compaction->AddInputDeletions(compact->compaction->edit());
+  for (size_t i = 0; i < compact->outputs.size(); i++) {
+    const CompactionState::Output& out = compact->outputs[i];
+    compact->compaction->edit()->AddFile(
+        compact->compaction->output_level(), out.number, out.file_size,
+        out.smallest, out.largest, out.smallest_seqno, out.largest_seqno);
+  }
+  return versions_->LogAndApply(compact->compaction->column_family_data(),
+                                compact->compaction->edit(), &mutex_,
+                                db_directory_.get());
+}
+
+// Given a sequence number, return the sequence number of the
+// earliest snapshot that this sequence number is visible in.
+// The snapshots themselves are arranged in ascending order of
+// sequence numbers.
+// Employ a sequential search because the total number of
+// snapshots are typically small.
+// Also stores in *prev_snapshot the snapshot immediately preceding the
+// returned one (0 when there is none); *prev_snapshot is untouched when no
+// snapshot >= `in` exists or snapshots are unsupported.
+inline SequenceNumber DBImpl::findEarliestVisibleSnapshot(
+  SequenceNumber in, std::vector<SequenceNumber>& snapshots,
+  SequenceNumber* prev_snapshot) {
+  if (!IsSnapshotSupported()) {
+    return 0;
+  }
+  SequenceNumber prev __attribute__((unused)) = 0;
+  for (const auto cur : snapshots) {
+    assert(prev <= cur);
+    if (cur >= in) {
+      *prev_snapshot = prev;
+      return cur;
+    }
+    prev = cur; // assignment
+    assert(prev);
+  }
+  // Reaching here means `in` is newer than every registered snapshot, which
+  // callers are expected to rule out; treat it as a logic error.
+  Log(options_.info_log,
+      "Looking for seqid %lu but maxseqid is %lu",
+      (unsigned long)in,
+      (unsigned long)snapshots[snapshots.size()-1]);
+  assert(0);
+  return 0;
+}
+
+// Give a pending memtable flush of `cfd` priority over the running
+// compaction: if a flush is needed, run it now and return the number of
+// microseconds spent so the caller can exclude that time from its compaction
+// accounting; returns 0 when no flush was needed.
+// REQUIRES: mutex_ NOT held (it is acquired here).
+uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd,
+                                           DeletionState& deletion_state,
+                                           LogBuffer* log_buffer) {
+  // Cheap lock-free peek first; re-check under the mutex before flushing.
+  if (cfd->imm()->imm_flush_needed.NoBarrier_Load() != nullptr) {
+    const uint64_t imm_start = env_->NowMicros();
+    mutex_.Lock();
+    if (cfd->imm()->IsFlushPending()) {
+      cfd->Ref();
+      FlushMemTableToOutputFile(cfd, nullptr, deletion_state, log_buffer);
+      cfd->Unref();
+      bg_cv_.SignalAll();  // Wakeup MakeRoomForWrite() if necessary
+    }
+    mutex_.Unlock();
+    log_buffer->FlushBufferToLog();
+    return env_->NowMicros() - imm_start;
+  }
+  return 0;
+}
+
+// Core key/value loop of a compaction run.
+//
+// Walks the compaction input and, for each entry, decides whether it can be
+// dropped (shadowed by a newer entry with the same snapshot visibility, or
+// an obsolete deletion marker at the base level for its key), applies the
+// V1 compaction filter and the merge operator where applicable, and writes
+// surviving entries to the compaction output files.
+//
+// When is_compaction_v2 is true, kv-pairs are taken from the prefix batch in
+// compact->combined_key_buf_ / combined_value_buf_ (already processed by
+// CompactionFilterV2 in the caller); otherwise they are read directly from
+// 'input'. imm_micros accumulates microseconds spent servicing memtable
+// flushes that preempt this compaction. Returns OK or the first error
+// encountered (iterator, filter, or output-file error).
+Status DBImpl::ProcessKeyValueCompaction(
+    SequenceNumber visible_at_tip,
+    SequenceNumber earliest_snapshot,
+    SequenceNumber latest_snapshot,
+    DeletionState& deletion_state,
+    bool bottommost_level,
+    int64_t& imm_micros,
+    Iterator* input,
+    CompactionState* compact,
+    bool is_compaction_v2,
+    LogBuffer* log_buffer) {
+  size_t combined_idx = 0;
+  Status status;
+  std::string compaction_filter_value;
+  ParsedInternalKey ikey;
+  IterKey current_user_key;
+  bool has_current_user_key = false;
+  IterKey delete_key;
+  SequenceNumber last_sequence_for_key __attribute__((unused)) =
+    kMaxSequenceNumber;
+  SequenceNumber visible_in_snapshot = kMaxSequenceNumber;
+  ColumnFamilyData* cfd = compact->compaction->column_family_data();
+  MergeHelper merge(
+      cfd->user_comparator(), cfd->options()->merge_operator.get(),
+      options_.info_log.get(), cfd->options()->min_partial_merge_operands,
+      false /* internal key corruption is expected */);
+  // Prefer an explicitly configured (V1) compaction filter; otherwise ask
+  // the factory for one scoped to this compaction run.
+  auto compaction_filter = cfd->options()->compaction_filter;
+  std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
+  if (!compaction_filter) {
+    auto context = compact->GetFilterContextV1();
+    compaction_filter_from_factory =
+        cfd->options()->compaction_filter_factory->CreateCompactionFilter(
+            context);
+    compaction_filter = compaction_filter_from_factory.get();
+  }
+
+  while (input->Valid() && !shutting_down_.Acquire_Load() &&
+         !cfd->IsDropped()) {
+    // FLUSH preempts compaction
+    // TODO(icanadi) this currently only checks if flush is necessary on
+    // compacting column family. we should also check if flush is necessary on
+    // other column families, too
+    imm_micros += CallFlushDuringCompaction(cfd, deletion_state, log_buffer);
+
+    Slice key;
+    Slice value;
+    // If is_compaction_v2 is on, kv-pairs are reset to the prefix batch.
+    // This prefix batch should contain results after calling
+    // compaction_filter_v2.
+    //
+    // If is_compaction_v2 is off, this function will go through all the
+    // kv-pairs in input.
+    if (!is_compaction_v2) {
+      key = input->key();
+      value = input->value();
+    } else {
+      if (combined_idx >= compact->combined_key_buf_.size()) {
+        break;
+      }
+      assert(combined_idx < compact->combined_key_buf_.size());
+      key = compact->combined_key_buf_[combined_idx];
+      value = compact->combined_value_buf_[combined_idx];
+
+      ++combined_idx;
+    }
+
+    // Cut the current output file early when the compaction asks for a
+    // split at this key.
+    if (compact->compaction->ShouldStopBefore(key) &&
+        compact->builder != nullptr) {
+      status = FinishCompactionOutputFile(compact, input);
+      if (!status.ok()) {
+        break;
+      }
+    }
+
+    // Handle key/value, add to state, etc.
+    bool drop = false;
+    bool current_entry_is_merging = false;
+    if (!ParseInternalKey(key, &ikey)) {
+      // Unparseable keys are passed through unchanged, never hidden.
+      // TODO: error key stays in db forever? Figure out the intention/rationale
+      // v10 error v8 : we cannot hide v8 even though it's pretty obvious.
+      current_user_key.Clear();
+      has_current_user_key = false;
+      last_sequence_for_key = kMaxSequenceNumber;
+      visible_in_snapshot = kMaxSequenceNumber;
+    } else {
+      if (!has_current_user_key ||
+          cfd->user_comparator()->Compare(ikey.user_key,
+                                          current_user_key.GetKey()) != 0) {
+        // First occurrence of this user key
+        current_user_key.SetUserKey(ikey.user_key);
+        has_current_user_key = true;
+        last_sequence_for_key = kMaxSequenceNumber;
+        visible_in_snapshot = kMaxSequenceNumber;
+        // apply the compaction filter to the first occurrence of the user key
+        if (compaction_filter && !is_compaction_v2 &&
+            ikey.type == kTypeValue &&
+            (visible_at_tip || ikey.sequence > latest_snapshot)) {
+          // If the user has specified a compaction filter and the sequence
+          // number is greater than any external snapshot, then invoke the
+          // filter.
+          // If the return value of the compaction filter is true, replace
+          // the entry with a delete marker.
+          bool value_changed = false;
+          compaction_filter_value.clear();
+          bool to_delete = compaction_filter->Filter(
+              compact->compaction->level(), ikey.user_key, value,
+              &compaction_filter_value, &value_changed);
+          if (to_delete) {
+            // make a copy of the original key and convert it to a delete
+            delete_key.SetInternalKey(ExtractUserKey(key), ikey.sequence,
+                                      kTypeDeletion);
+            // anchor the key again
+            key = delete_key.GetKey();
+            // needed because ikey is backed by key
+            ParseInternalKey(key, &ikey);
+            // no value associated with delete
+            value.clear();
+            RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_USER);
+          } else if (value_changed) {
+            value = compaction_filter_value;
+          }
+        }
+      }
+
+      // If there are no snapshots, then this kv affects visibility at tip.
+      // Otherwise, search through all existing snapshots to find
+      // the earliest snapshot that is affected by this kv.
+      SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot
+      SequenceNumber visible = visible_at_tip ?
+        visible_at_tip :
+        findEarliestVisibleSnapshot(ikey.sequence,
+            compact->existing_snapshots,
+            &prev_snapshot);
+
+      if (visible_in_snapshot == visible) {
+        // The earliest snapshot this key is visible in is the same as the
+        // one recorded for a previous (newer) instance of the same user
+        // key, so this entry is not visible in any snapshot: it is hidden
+        // by that newer entry for the same user key.
+        // TODO: why not > ?
+        assert(last_sequence_for_key >= ikey.sequence);
+        drop = true;    // (A)
+        RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_NEWER_ENTRY);
+      } else if (ikey.type == kTypeDeletion &&
+          ikey.sequence <= earliest_snapshot &&
+          compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
+        // For this user key:
+        // (1) there is no data in higher levels
+        // (2) data in lower levels will have larger sequence numbers
+        // (3) data in layers that are being compacted here and have
+        //     smaller sequence numbers will be dropped in the next
+        //     few iterations of this loop (by rule (A) above).
+        // Therefore this deletion marker is obsolete and can be dropped.
+        drop = true;
+        RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_OBSOLETE);
+      } else if (ikey.type == kTypeMerge) {
+        // We know the merge type entry is not hidden, otherwise we would
+        // have hit (A)
+        // We encapsulate the merge related state machine in a different
+        // object to minimize change to the existing flow. Turn out this
+        // logic could also be nicely re-used for memtable flush purge
+        // optimization in BuildTable.
+        int steps = 0;
+        merge.MergeUntil(input, prev_snapshot, bottommost_level,
+            options_.statistics.get(), &steps);
+        // Skip the merge operands consumed by MergeUntil. (combined_idx is
+        // only meaningful on the v2 buffered path; it is unused otherwise.)
+        combined_idx = combined_idx - 1 + steps;
+
+        current_entry_is_merging = true;
+        if (merge.IsSuccess()) {
+          // Successfully found Put/Delete/(end-of-key-range) while merging
+          // Get the merge result
+          key = merge.key();
+          ParseInternalKey(key, &ikey);
+          value = merge.value();
+        } else {
+          // Did not find a Put/Delete/(end-of-key-range) while merging
+          // We now have some stack of merge operands to write out.
+          // NOTE: key,value, and ikey are now referring to old entries.
+          //       These will be correctly set below.
+          assert(!merge.keys().empty());
+          assert(merge.keys().size() == merge.values().size());
+
+          // Hack to make sure last_sequence_for_key is correct
+          ParseInternalKey(merge.keys().front(), &ikey);
+        }
+      }
+
+      last_sequence_for_key = ikey.sequence;
+      visible_in_snapshot = visible;
+    }
+
+    if (!drop) {
+      // We may write a single key (e.g.: for Put/Delete or successful merge).
+      // Or we may instead have to write a sequence/list of keys.
+      // We have to write a sequence iff we have an unsuccessful merge
+      bool has_merge_list = current_entry_is_merging && !merge.IsSuccess();
+      const std::deque<std::string>* keys = nullptr;
+      const std::deque<std::string>* values = nullptr;
+      std::deque<std::string>::const_reverse_iterator key_iter;
+      std::deque<std::string>::const_reverse_iterator value_iter;
+      if (has_merge_list) {
+        keys = &merge.keys();
+        values = &merge.values();
+        key_iter = keys->rbegin();    // The back (*rbegin()) is the first key
+        value_iter = values->rbegin();
+
+        key = Slice(*key_iter);
+        value = Slice(*value_iter);
+      }
+
+      // If we have a list of keys to write, traverse the list.
+      // If we have a single key to write, simply write that key.
+      while (true) {
+        // Invariant: key,value,ikey will always be the next entry to write
+        char* kptr = (char*)key.data();
+        std::string kstr;
+
+        // Zeroing out the sequence number leads to better compression.
+        // If this is the bottommost level (no files in lower levels)
+        // and the earliest snapshot is larger than this seqno
+        // then we can squash the seqno to zero.
+        if (bottommost_level && ikey.sequence < earliest_snapshot &&
+            ikey.type != kTypeMerge) {
+          assert(ikey.type != kTypeDeletion);
+          // make a copy because updating in place would cause problems
+          // with the priority queue that is managing the input key iterator
+          kstr.assign(key.data(), key.size());
+          kptr = (char *)kstr.c_str();
+          UpdateInternalKey(kptr, key.size(), (uint64_t)0, ikey.type);
+        }
+
+        Slice newkey(kptr, key.size());
+        assert((key.clear(), 1)); // we do not need 'key' anymore
+
+        // Open output file if necessary
+        if (compact->builder == nullptr) {
+          status = OpenCompactionOutputFile(compact);
+          if (!status.ok()) {
+            break;
+          }
+        }
+
+        // Maintain the smallest/largest key and seqno of the current
+        // output file's metadata as entries are added.
+        SequenceNumber seqno = GetInternalKeySeqno(newkey);
+        if (compact->builder->NumEntries() == 0) {
+          compact->current_output()->smallest.DecodeFrom(newkey);
+          compact->current_output()->smallest_seqno = seqno;
+        } else {
+          compact->current_output()->smallest_seqno =
+            std::min(compact->current_output()->smallest_seqno, seqno);
+        }
+        compact->current_output()->largest.DecodeFrom(newkey);
+        compact->builder->Add(newkey, value);
+        compact->current_output()->largest_seqno =
+          std::max(compact->current_output()->largest_seqno, seqno);
+
+        // Close output file if it is big enough
+        if (compact->builder->FileSize() >=
+            compact->compaction->MaxOutputFileSize()) {
+          status = FinishCompactionOutputFile(compact, input);
+          if (!status.ok()) {
+            break;
+          }
+        }
+
+        // If we have a list of entries, move to next element
+        // If we only had one entry, then break the loop.
+        if (has_merge_list) {
+          ++key_iter;
+          ++value_iter;
+
+          // If at end of list
+          if (key_iter == keys->rend() || value_iter == values->rend()) {
+            // Sanity Check: if one ends, then both end
+            assert(key_iter == keys->rend() && value_iter == values->rend());
+            break;
+          }
+
+          // Otherwise not at end of list. Update key, value, and ikey.
+          key = Slice(*key_iter);
+          value = Slice(*value_iter);
+          ParseInternalKey(key, &ikey);
+
+        } else{
+          // Only had one item to begin with (Put/Delete)
+          break;
+        }
+      }
+    }
+
+    // MergeUntil has moved input to the next entry
+    if (!current_entry_is_merging) {
+      input->Next();
+    }
+  }
+
+  return status;
+}
+
+void DBImpl::CallCompactionFilterV2(CompactionState* compact,
+  CompactionFilterV2* compaction_filter_v2) {
+  if (compact == nullptr || compaction_filter_v2 == nullptr) {
+    return;
+  }
+
+  std::vector<Slice> user_key_buf;
+  for (const auto& key : compact->ikey_buf_) {
+    user_key_buf.emplace_back(key.user_key);
+  }
+
+  // If the user has specified a compaction filter and the sequence
+  // number is greater than any external snapshot, then invoke the
+  // filter.
+  // If the return value of the compaction filter is true, replace
+  // the entry with a delete marker.
+  compact->to_delete_buf_ = compaction_filter_v2->Filter(
+      compact->compaction->level(),
+      user_key_buf, compact->existing_value_buf_,
+      &compact->new_value_buf_,
+      &compact->value_changed_buf_);
+
+  // new_value_buf_.size() <= to_delete__buf_.size(). "=" iff all
+  // kv-pairs in this compaction run needs to be deleted.
+  assert(compact->to_delete_buf_.size() ==
+      compact->key_buf_.size());
+  assert(compact->to_delete_buf_.size() ==
+      compact->existing_value_buf_.size());
+  assert(compact->to_delete_buf_.size() ==
+      compact->value_changed_buf_.size());
+
+  int new_value_idx = 0;
+  for (unsigned int i = 0; i < compact->to_delete_buf_.size(); ++i) {
+    if (compact->to_delete_buf_[i]) {
+      // update the string buffer directly
+      // the Slice buffer points to the updated buffer
+      UpdateInternalKey(&compact->key_str_buf_[i][0],
+          compact->key_str_buf_[i].size(),
+          compact->ikey_buf_[i].sequence,
+          kTypeDeletion);
+
+      // no value associated with delete
+      compact->existing_value_buf_[i].clear();
+      RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_USER);
+    } else if (compact->value_changed_buf_[i]) {
+      compact->existing_value_buf_[i] =
+        Slice(compact->new_value_buf_[new_value_idx++]);
+    }
+  }  // for
+}
+
+Status DBImpl::DoCompactionWork(CompactionState* compact,
+                                DeletionState& deletion_state,
+                                LogBuffer* log_buffer) {
+  assert(compact);
+  compact->CleanupBatchBuffer();
+  compact->CleanupMergedBuffer();
+  bool prefix_initialized = false;
+
+  int64_t imm_micros = 0;  // Micros spent doing imm_ compactions
+  ColumnFamilyData* cfd = compact->compaction->column_family_data();
+  LogToBuffer(
+      log_buffer,
+      "[%s] Compacting %d@%d + %d@%d files, score %.2f slots available %d",
+      cfd->GetName().c_str(), compact->compaction->num_input_files(0),
+      compact->compaction->level(), compact->compaction->num_input_files(1),
+      compact->compaction->output_level(), compact->compaction->score(),
+      options_.max_background_compactions - bg_compaction_scheduled_);
+  char scratch[2345];
+  compact->compaction->Summary(scratch, sizeof(scratch));
+  LogToBuffer(log_buffer, "[%s] Compaction start summary: %s\n",
+              cfd->GetName().c_str(), scratch);
+
+  assert(cfd->current()->NumLevelFiles(compact->compaction->level()) > 0);
+  assert(compact->builder == nullptr);
+  assert(!compact->outfile);
+
+  SequenceNumber visible_at_tip = 0;
+  SequenceNumber earliest_snapshot;
+  SequenceNumber latest_snapshot = 0;
+  snapshots_.getAll(compact->existing_snapshots);
+  if (compact->existing_snapshots.size() == 0) {
+    // optimize for fast path if there are no snapshots
+    visible_at_tip = versions_->LastSequence();
+    earliest_snapshot = visible_at_tip;
+  } else {
+    latest_snapshot = compact->existing_snapshots.back();
+    // Add the current seqno as the 'latest' virtual
+    // snapshot to the end of this list.
+    compact->existing_snapshots.push_back(versions_->LastSequence());
+    earliest_snapshot = compact->existing_snapshots[0];
+  }
+
+  // Is this compaction producing files at the bottommost level?
+  bool bottommost_level = compact->compaction->BottomMostLevel();
+
+  // Allocate the output file numbers before we release the lock
+  AllocateCompactionOutputFileNumbers(compact);
+
+  // Release mutex while we're actually doing the compaction work
+  mutex_.Unlock();
+  log_buffer->FlushBufferToLog();
+
+  const uint64_t start_micros = env_->NowMicros();
+  unique_ptr<Iterator> input(versions_->MakeInputIterator(compact->compaction));
+  input->SeekToFirst();
+  shared_ptr<Iterator> backup_input(
+      versions_->MakeInputIterator(compact->compaction));
+  backup_input->SeekToFirst();
+
+  Status status;
+  ParsedInternalKey ikey;
+  std::unique_ptr<CompactionFilterV2> compaction_filter_from_factory_v2
+    = nullptr;
+  auto context = compact->GetFilterContext();
+  compaction_filter_from_factory_v2 =
+      cfd->options()->compaction_filter_factory_v2->CreateCompactionFilterV2(
+          context);
+  auto compaction_filter_v2 =
+    compaction_filter_from_factory_v2.get();
+
+  // temp_backup_input always point to the start of the current buffer
+  // temp_backup_input = backup_input;
+  // iterate through input,
+  // 1) buffer ineligible keys and value keys into 2 separate buffers;
+  // 2) send value_buffer to compaction filter and alternate the values;
+  // 3) merge value_buffer with ineligible_value_buffer;
+  // 4) run the modified "compaction" using the old for loop.
+  if (compaction_filter_v2) {
+    while (backup_input->Valid() && !shutting_down_.Acquire_Load() &&
+           !cfd->IsDropped()) {
+      // FLUSH preempts compaction
+      // TODO(icanadi) this currently only checks if flush is necessary on
+      // compacting column family. we should also check if flush is necessary on
+      // other column families, too
+      imm_micros += CallFlushDuringCompaction(cfd, deletion_state, log_buffer);
+
+      Slice key = backup_input->key();
+      Slice value = backup_input->value();
+
+      const SliceTransform* transformer =
+          cfd->options()->compaction_filter_factory_v2->GetPrefixExtractor();
+      const auto key_prefix = transformer->Transform(key);
+      if (!prefix_initialized) {
+        compact->cur_prefix_ = key_prefix.ToString();
+        prefix_initialized = true;
+      }
+      if (!ParseInternalKey(key, &ikey)) {
+        // log error
+        Log(options_.info_log, "[%s] Failed to parse key: %s",
+            cfd->GetName().c_str(), key.ToString().c_str());
+        continue;
+      } else {
+        // If the prefix remains the same, keep buffering
+        if (key_prefix.compare(Slice(compact->cur_prefix_)) == 0) {
+          // Apply the compaction filter V2 to all the kv pairs sharing
+          // the same prefix
+          if (ikey.type == kTypeValue &&
+              (visible_at_tip || ikey.sequence > latest_snapshot)) {
+            // Buffer all keys sharing the same prefix for CompactionFilterV2
+            // Iterate through keys to check prefix
+            compact->BufferKeyValueSlices(key, value);
+          } else {
+            // buffer ineligible keys
+            compact->BufferOtherKeyValueSlices(key, value);
+          }
+          backup_input->Next();
+          continue;
+          // finish changing values for eligible keys
+        } else {
+          // Now prefix changes, this batch is done.
+          // Call compaction filter on the buffered values to change the value
+          if (compact->key_buf_.size() > 0) {
+            CallCompactionFilterV2(compact, compaction_filter_v2);
+          }
+          compact->cur_prefix_ = key_prefix.ToString();
+        }
+      }
+
+      // Merge this batch of data (values + ineligible keys)
+      compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator());
+
+      // Done buffering for the current prefix. Spit it out to disk
+      // Now just iterate through all the kv-pairs
+      status = ProcessKeyValueCompaction(
+          visible_at_tip,
+          earliest_snapshot,
+          latest_snapshot,
+          deletion_state,
+          bottommost_level,
+          imm_micros,
+          input.get(),
+          compact,
+          true,
+          log_buffer);
+
+      if (!status.ok()) {
+        break;
+      }
+
+      // After writing the kv-pairs, we can safely remove the reference
+      // to the string buffer and clean them up
+      compact->CleanupBatchBuffer();
+      compact->CleanupMergedBuffer();
+      // Buffer the key that triggers the mismatch in prefix
+      if (ikey.type == kTypeValue &&
+        (visible_at_tip || ikey.sequence > latest_snapshot)) {
+        compact->BufferKeyValueSlices(key, value);
+      } else {
+        compact->BufferOtherKeyValueSlices(key, value);
+      }
+      backup_input->Next();
+      if (!backup_input->Valid()) {
+        // If this is the single last value, we need to merge it.
+        if (compact->key_buf_.size() > 0) {
+          CallCompactionFilterV2(compact, compaction_filter_v2);
+        }
+        compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator());
+
+        status = ProcessKeyValueCompaction(
+            visible_at_tip,
+            earliest_snapshot,
+            latest_snapshot,
+            deletion_state,
+            bottommost_level,
+            imm_micros,
+            input.get(),
+            compact,
+            true,
+            log_buffer);
+
+        compact->CleanupBatchBuffer();
+        compact->CleanupMergedBuffer();
+      }
+    }  // done processing all prefix batches
+    // finish the last batch
+    if (compact->key_buf_.size() > 0) {
+      CallCompactionFilterV2(compact, compaction_filter_v2);
+    }
+    compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator());
+    status = ProcessKeyValueCompaction(
+        visible_at_tip,
+        earliest_snapshot,
+        latest_snapshot,
+        deletion_state,
+        bottommost_level,
+        imm_micros,
+        input.get(),
+        compact,
+        true,
+        log_buffer);
+  }  // checking for compaction filter v2
+
+  if (!compaction_filter_v2) {
+    status = ProcessKeyValueCompaction(
+      visible_at_tip,
+      earliest_snapshot,
+      latest_snapshot,
+      deletion_state,
+      bottommost_level,
+      imm_micros,
+      input.get(),
+      compact,
+      false,
+      log_buffer);
+  }
+
+  if (status.ok() && (shutting_down_.Acquire_Load() || cfd->IsDropped())) {
+    status = Status::ShutdownInProgress(
+        "Database shutdown or Column family drop during compaction");
+  }
+  if (status.ok() && compact->builder != nullptr) {
+    status = FinishCompactionOutputFile(compact, input.get());
+  }
+  if (status.ok()) {
+    status = input->status();
+  }
+  input.reset();
+
+  if (!options_.disableDataSync) {
+    db_directory_->Fsync();
+  }
+
+  InternalStats::CompactionStats stats;
+  stats.micros = env_->NowMicros() - start_micros - imm_micros;
+  MeasureTime(options_.statistics.get(), COMPACTION_TIME, stats.micros);
+  stats.files_in_leveln = compact->compaction->num_input_files(0);
+  stats.files_in_levelnp1 = compact->compaction->num_input_files(1);
+
+  int num_output_files = compact->outputs.size();
+  if (compact->builder != nullptr) {
+    // An error occurred so ignore the last output.
+    assert(num_output_files > 0);
+    --num_output_files;
+  }
+  stats.files_out_levelnp1 = num_output_files;
+
+  for (int i = 0; i < compact->compaction->num_input_files(0); i++) {
+    stats.bytes_readn += compact->compaction->input(0, i)->file_size;
+    RecordTick(options_.statistics.get(), COMPACT_READ_BYTES,
+               compact->compaction->input(0, i)->file_size);
+  }
+
+  for (int i = 0; i < compact->compaction->num_input_files(1); i++) {
+    stats.bytes_readnp1 += compact->compaction->input(1, i)->file_size;
+    RecordTick(options_.statistics.get(), COMPACT_READ_BYTES,
+               compact->compaction->input(1, i)->file_size);
+  }
+
+  for (int i = 0; i < num_output_files; i++) {
+    stats.bytes_written += compact->outputs[i].file_size;
+    RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES,
+               compact->outputs[i].file_size);
+  }
+
+  LogFlush(options_.info_log);
+  mutex_.Lock();
+  cfd->internal_stats()->AddCompactionStats(compact->compaction->output_level(),
+                                            stats);
+
+  // if there were any unused file number (mostly in case of
+  // compaction error), free up the entry from pending_putputs
+  ReleaseCompactionUnusedFileNumbers(compact);
+
+  if (status.ok()) {
+    status = InstallCompactionResults(compact, log_buffer);
+    InstallSuperVersion(cfd, deletion_state);
+  }
+  Version::LevelSummaryStorage tmp;
+  LogToBuffer(
+      log_buffer,
+      "[%s] compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) "
+      "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) "
+      "write-amplify(%.1f) %s\n",
+      cfd->GetName().c_str(), cfd->current()->LevelSummary(&tmp),
+      (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) /
+          (double)stats.micros,
+      compact->compaction->output_level(), stats.files_in_leveln,
+      stats.files_in_levelnp1, stats.files_out_levelnp1,
+      stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0,
+      stats.bytes_written / 1048576.0,
+      (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) /
+          (double)stats.bytes_readn,
+      stats.bytes_written / (double)stats.bytes_readn,
+      status.ToString().c_str());
+
+  return status;
+}
+
+namespace {
+// Bundles everything an iterator's cleanup callback needs in order to
+// release the SuperVersion it pinned: the owning DB, the DB mutex, and the
+// pinned SuperVersion itself.
+struct IterState {
+  IterState(DBImpl* db, port::Mutex* mu, SuperVersion* super_version)
+      : db(db), mu(mu), super_version(super_version) {}
+
+  DBImpl* db;           // owning DB (non-owning pointer)
+  port::Mutex* mu;      // DB mutex, taken while mutating shared state
+  SuperVersion* super_version;  // reference pinned by the iterator
+};
+
+// Iterator cleanup hook: drops the SuperVersion reference taken in
+// NewInternalIterator. If that was the last reference, obsolete files are
+// identified under the mutex and purged outside of it (purging does I/O).
+static void CleanupIteratorState(void* arg1, void* arg2) {
+  IterState* state = reinterpret_cast<IterState*>(arg1);
+
+  if (state->super_version->Unref()) {
+    DBImpl::DeletionState deletion_state;
+
+    state->mu->Lock();
+    state->super_version->Cleanup();
+    state->db->FindObsoleteFiles(deletion_state, false, true);
+    state->mu->Unlock();
+
+    delete state->super_version;
+    // File deletion happens without the mutex held.
+    if (deletion_state.HaveSomethingToDelete()) {
+      state->db->PurgeObsoleteFiles(deletion_state);
+    }
+  }
+
+  delete state;
+}
+}  // namespace
+
+Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
+                                      ColumnFamilyData* cfd,
+                                      SuperVersion* super_version) {
+  std::vector<Iterator*> iterator_list;
+  // Collect iterator for mutable mem
+  iterator_list.push_back(super_version->mem->NewIterator(options));
+  // Collect all needed child iterators for immutable memtables
+  super_version->imm->AddIterators(options, &iterator_list);
+  // Collect iterators for files in L0 - Ln
+  super_version->current->AddIterators(options, storage_options_,
+                                       &iterator_list);
+  Iterator* internal_iter = NewMergingIterator(
+      &cfd->internal_comparator(), &iterator_list[0], iterator_list.size());
+
+  IterState* cleanup = new IterState(this, &mutex_, super_version);
+  internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
+
+  return internal_iter;
+}
+
+// Returns the cached handle of this DB's default column family.
+ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
+  return default_cf_handle_;
+}
+
+// Public point-lookup entry: thin wrapper that forwards to GetImpl
+// (presumably leaving GetImpl's extra value_found parameter at its
+// declared default — confirm against the declaration).
+Status DBImpl::Get(const ReadOptions& options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   std::string* value) {
+  return GetImpl(options, column_family, key, value);
+}
+
+// DeletionState gets created and destructed outside of the lock -- we
+// use this conveniently to:
+// * malloc one SuperVersion() outside of the lock -- new_superversion
+// * delete SuperVersion()s outside of the lock -- superversions_to_free
+//
+// However, if InstallSuperVersion() gets called twice with the same
+// deletion_state, we can't reuse the SuperVersion() that got malloced
+// because the first call already used it. In that rare case, we take a hit
+// and create a new SuperVersion() inside of the mutex. We do a similar
+// thing for superversion_to_free
+void DBImpl::InstallSuperVersion(ColumnFamilyData* cfd,
+                                 DeletionState& deletion_state) {
+  mutex_.AssertHeld();
+  // if new_superversion == nullptr, it means somebody already used it
+  SuperVersion* new_superversion =
+    (deletion_state.new_superversion != nullptr) ?
+    deletion_state.new_superversion : new SuperVersion();
+  SuperVersion* old_superversion =
+      cfd->InstallSuperVersion(new_superversion, &mutex_);
+  deletion_state.new_superversion = nullptr;
+  // The displaced SuperVersion is freed later, outside the mutex.
+  deletion_state.superversions_to_free.push_back(old_superversion);
+}
+
+// Implementation of Get(): reads 'key' at the requested (or latest)
+// snapshot by consulting, in order, the active memtable, the immutable
+// memtables, and the table files of the current version. value_found, when
+// non-null, is forwarded to Version::Get (not read here — presumably used
+// by KeyMayExist-style callers; confirm against Version::Get).
+Status DBImpl::GetImpl(const ReadOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       std::string* value, bool* value_found) {
+  StopWatch sw(env_, options_.statistics.get(), DB_GET, false);
+  PERF_TIMER_AUTO(get_snapshot_time);
+
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+
+  // Resolve the read snapshot: explicit user snapshot, or the current tip.
+  SequenceNumber snapshot;
+  if (options.snapshot != nullptr) {
+    snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
+  } else {
+    snapshot = versions_->LastSequence();
+  }
+
+  // Acquire SuperVersion
+  SuperVersion* sv = nullptr;
+  // TODO(ljin): consider using GetReferencedSuperVersion() directly
+  if (LIKELY(options_.allow_thread_local)) {
+    // Fast path: thread-local cached SuperVersion, avoiding the mutex on
+    // the hot read path.
+    sv = cfd->GetThreadLocalSuperVersion(&mutex_);
+  } else {
+    mutex_.Lock();
+    sv = cfd->GetSuperVersion()->Ref();
+    mutex_.Unlock();
+  }
+
+  bool have_stat_update = false;
+  Version::GetStats stats;
+
+  // Prepare to store a list of merge operations if merge occurs.
+  MergeContext merge_context;
+
+  Status s;
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+  LookupKey lkey(key, snapshot);
+  PERF_TIMER_STOP(get_snapshot_time);
+  if (sv->mem->Get(lkey, value, &s, merge_context, *cfd->options())) {
+    // Done
+    RecordTick(options_.statistics.get(), MEMTABLE_HIT);
+  } else if (sv->imm->Get(lkey, value, &s, merge_context, *cfd->options())) {
+    // Done
+    RecordTick(options_.statistics.get(), MEMTABLE_HIT);
+  } else {
+    // Not in any memtable: search the table files of the current version.
+    PERF_TIMER_START(get_from_output_files_time);
+
+    sv->current->Get(options, lkey, value, &s, &merge_context, &stats,
+                     value_found);
+    have_stat_update = true;
+    PERF_TIMER_STOP(get_from_output_files_time);
+    RecordTick(options_.statistics.get(), MEMTABLE_MISS);
+  }
+
+  PERF_TIMER_START(get_post_process_time);
+
+  // A read that reached the files may qualify a file for seek-triggered
+  // compaction; UpdateStats() tells us whether to schedule one.
+  if (!cfd->options()->disable_seek_compaction && have_stat_update) {
+    mutex_.Lock();
+    if (sv->current->UpdateStats(stats)) {
+      MaybeScheduleFlushOrCompaction();
+    }
+    mutex_.Unlock();
+  }
+
+  // Hand the SuperVersion back to the thread-local cache when possible;
+  // otherwise release our reference explicitly below.
+  bool unref_sv = true;
+  if (LIKELY(options_.allow_thread_local)) {
+    unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv);
+  }
+
+  if (unref_sv) {
+    // Release SuperVersion
+    if (sv->Unref()) {
+      // Last reference: clean up under the mutex, free outside of it.
+      mutex_.Lock();
+      sv->Cleanup();
+      mutex_.Unlock();
+      delete sv;
+      RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS);
+    }
+    RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_RELEASES);
+  }
+
+  RecordTick(options_.statistics.get(), NUMBER_KEYS_READ);
+  RecordTick(options_.statistics.get(), BYTES_READ, value->size());
+  PERF_TIMER_STOP(get_post_process_time);
+  return s;
+}
+
+// Batched point lookup: reads each keys[i] from column_family[i] at one
+// consistent sequence number and fills (*values)[i].  Returns one Status per
+// key, in input order.  options.snapshot, when set, pins the read sequence.
+std::vector<Status> DBImpl::MultiGet(
+    const ReadOptions& options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+
+  StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET, false);
+  PERF_TIMER_AUTO(get_snapshot_time);
+
+  SequenceNumber snapshot;
+
+  // Per-column-family state kept alive for the duration of the call.
+  struct MultiGetColumnFamilyData {
+    ColumnFamilyData* cfd;
+    SuperVersion* super_version;
+    Version::GetStats stats;
+    bool have_stat_update = false;
+  };
+  std::unordered_map<uint32_t, MultiGetColumnFamilyData*> multiget_cf_data;
+  // fill up and allocate outside of mutex
+  for (auto cf : column_family) {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(cf);
+    auto cfd = cfh->cfd();
+    if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) {
+      auto mgcfd = new MultiGetColumnFamilyData();
+      mgcfd->cfd = cfd;
+      multiget_cf_data.insert({cfd->GetID(), mgcfd});
+    }
+  }
+
+  // Under the DB mutex: choose the read snapshot and take a reference on each
+  // column family's current SuperVersion so lookups can run unlocked.
+  mutex_.Lock();
+  if (options.snapshot != nullptr) {
+    snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
+  } else {
+    snapshot = versions_->LastSequence();
+  }
+  for (auto mgd_iter : multiget_cf_data) {
+    mgd_iter.second->super_version =
+        mgd_iter.second->cfd->GetSuperVersion()->Ref();
+  }
+  mutex_.Unlock();
+
+  // Contain a list of merge operations if merge occurs.
+  MergeContext merge_context;
+
+  // Note: this always resizes the values array
+  size_t num_keys = keys.size();
+  std::vector<Status> stat_list(num_keys);
+  values->resize(num_keys);
+
+  // Keep track of bytes that we read for statistics-recording later
+  uint64_t bytes_read = 0;
+  PERF_TIMER_STOP(get_snapshot_time);
+
+  // For each of the given keys, apply the entire "get" process as follows:
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+  for (size_t i = 0; i < num_keys; ++i) {
+    merge_context.Clear();
+    Status& s = stat_list[i];
+    std::string* value = &(*values)[i];
+
+    LookupKey lkey(keys[i], snapshot);
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family[i]);
+    auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID());
+    assert(mgd_iter != multiget_cf_data.end());
+    auto mgd = mgd_iter->second;
+    auto super_version = mgd->super_version;
+    auto cfd = mgd->cfd;
+    if (super_version->mem->Get(lkey, value, &s, merge_context,
+                                *cfd->options())) {
+      // Done
+    } else if (super_version->imm->Get(lkey, value, &s, merge_context,
+                                       *cfd->options())) {
+      // Done
+    } else {
+      // Fall through to the table files; remember that stats were touched so
+      // a seek-triggered compaction can be considered below.
+      super_version->current->Get(options, lkey, value, &s, &merge_context,
+                                  &mgd->stats);
+      mgd->have_stat_update = true;
+    }
+
+    if (s.ok()) {
+      bytes_read += value->size();
+    }
+  }
+
+  // Post processing (decrement reference counts and record statistics)
+  PERF_TIMER_START(get_post_process_time);
+  autovector<SuperVersion*> superversions_to_delete;
+
+  bool schedule_flush_or_compaction = false;
+  mutex_.Lock();
+  for (auto mgd_iter : multiget_cf_data) {
+    auto mgd = mgd_iter.second;
+    auto cfd = mgd->cfd;
+    if (!cfd->options()->disable_seek_compaction && mgd->have_stat_update) {
+      if (mgd->super_version->current->UpdateStats(mgd->stats)) {
+        schedule_flush_or_compaction = true;
+      }
+    }
+    if (mgd->super_version->Unref()) {
+      // Last reference: tear down under the mutex, free memory after unlock.
+      mgd->super_version->Cleanup();
+      superversions_to_delete.push_back(mgd->super_version);
+    }
+  }
+  if (schedule_flush_or_compaction) {
+    MaybeScheduleFlushOrCompaction();
+  }
+  mutex_.Unlock();
+
+  for (auto td : superversions_to_delete) {
+    delete td;
+  }
+  for (auto mgd : multiget_cf_data) {
+    delete mgd.second;
+  }
+
+  RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS);
+  RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, num_keys);
+  RecordTick(options_.statistics.get(), NUMBER_MULTIGET_BYTES_READ, bytes_read);
+  PERF_TIMER_STOP(get_post_process_time);
+
+  return stat_list;
+}
+
+// Creates a new column family named `column_family_name` and returns a handle
+// to it in *handle.  Fails with InvalidArgument if the name already exists.
+Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options,
+                                  const std::string& column_family_name,
+                                  ColumnFamilyHandle** handle) {
+  *handle = nullptr;
+  MutexLock l(&mutex_);
+
+  if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) !=
+      nullptr) {
+    return Status::InvalidArgument("Column family already exists");
+  }
+  VersionEdit edit;
+  edit.AddColumnFamily(column_family_name);
+  uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
+  edit.SetColumnFamily(new_id);
+  edit.SetLogNumber(logfile_number_);
+  edit.SetComparatorName(options.comparator->Name());
+
+  // LogAndApply will both write the creation in MANIFEST and create
+  // ColumnFamilyData object
+  Status s = versions_->LogAndApply(nullptr, &edit, &mutex_,
+                                    db_directory_.get(), false, &options);
+  if (s.ok()) {
+    auto cfd =
+        versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+    assert(cfd != nullptr);
+    delete cfd->InstallSuperVersion(new SuperVersion(), &mutex_);
+    *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_);
+    Log(options_.info_log, "Created column family [%s] (ID %u)",
+        column_family_name.c_str(), (unsigned)cfd->GetID());
+    // Grow the in-memory write-buffer budget used for the WAL size limit.
+    max_total_in_memory_state_ += cfd->options()->write_buffer_size *
+                                  cfd->options()->max_write_buffer_number;
+  } else {
+    Log(options_.info_log, "Creating column family [%s] FAILED -- %s",
+        column_family_name.c_str(), s.ToString().c_str());
+  }
+  return s;
+}
+
+// Drops the given column family: records the drop in the MANIFEST, then
+// forces a flush so WAL files referencing it can become obsolete.  The
+// default column family (ID 0) cannot be dropped.
+Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+  if (cfd->GetID() == 0) {
+    return Status::InvalidArgument("Can't drop default column family");
+  }
+
+  VersionEdit edit;
+  edit.DropColumnFamily();
+  edit.SetColumnFamily(cfd->GetID());
+
+  Status s;
+  {
+    MutexLock l(&mutex_);
+    if (cfd->IsDropped()) {
+      s = Status::InvalidArgument("Column family already dropped!\n");
+    }
+    if (s.ok()) {
+      s = versions_->LogAndApply(cfd, &edit, &mutex_);
+    }
+  }
+
+  if (s.ok()) {
+    assert(cfd->IsDropped());
+    // Return the dropped family's write-buffer budget to the accounting.
+    max_total_in_memory_state_ -= cfd->options()->write_buffer_size *
+                                  cfd->options()->max_write_buffer_number;
+    Log(options_.info_log, "Dropped column family with id %u\n", cfd->GetID());
+    // Flush the memtables. This will make all WAL files referencing dropped
+    // column family to be obsolete. They will be deleted once user deletes
+    // column family handle
+    Write(WriteOptions(), nullptr);  // ignore error
+  } else {
+    Log(options_.info_log, "Dropping column family with id %u FAILED -- %s\n",
+        cfd->GetID(), s.ToString().c_str());
+  }
+
+  return s;
+}
+
+// Best-effort existence check: returns true if the key may exist, false only
+// if it definitely does not.  Performs a block-cache-only read; when the
+// value is available cheaply it is stored in *value with *value_found true.
+bool DBImpl::KeyMayExist(const ReadOptions& options,
+                         ColumnFamilyHandle* column_family, const Slice& key,
+                         std::string* value, bool* value_found) {
+  if (value_found != nullptr) {
+    // falsify later if key-may-exist but can't fetch value
+    *value_found = true;
+  }
+  ReadOptions roptions = options;
+  roptions.read_tier = kBlockCacheTier; // read from block cache only
+  auto s = GetImpl(roptions, column_family, key, value, value_found);
+
+  // If options.block_cache != nullptr and the index block of the table is
+  // not present in the block cache, the return value will be
+  // Status::Incomplete.  In this case, the key may still exist in the table.
+  return s.ok() || s.IsIncomplete();
+}
+
+// Returns a new iterator over `column_family`, reading at options.snapshot
+// or, if unset, the latest sequence number.  The caller owns the iterator.
+// With options.tailing a TailingIterator is returned instead (nullptr in
+// ROCKSDB_LITE builds, which do not support it).
+Iterator* DBImpl::NewIterator(const ReadOptions& options,
+                              ColumnFamilyHandle* column_family) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+
+  Iterator* iter;
+  if (options.tailing) {
+#ifdef ROCKSDB_LITE
+    // not supported in lite version
+    return nullptr;
+#else
+    iter = new TailingIterator(env_, this, options, cfd);
+#endif
+  } else {
+    SequenceNumber latest_snapshot = versions_->LastSequence();
+    SuperVersion* sv = nullptr;
+    // Take a reference on the current SuperVersion; it keeps the memtables
+    // and table files the iterator reads alive.
+    sv = cfd->GetReferencedSuperVersion(&mutex_);
+
+    iter = NewInternalIterator(options, cfd, sv);
+
+    auto snapshot =
+        options.snapshot != nullptr
+            ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
+            : latest_snapshot;
+    // Wrap the internal iterator so callers see user keys at `snapshot`.
+    iter = NewDBIterator(env_, *cfd->options(),
+                         cfd->user_comparator(), iter, snapshot);
+  }
+
+  return iter;
+}
+
+Status DBImpl::NewIterators(
+    const ReadOptions& options,
+    const std::vector<ColumnFamilyHandle*>& column_families,
+    std::vector<Iterator*>* iterators) {
+  iterators->clear();
+  iterators->reserve(column_families.size());
+  SequenceNumber latest_snapshot = 0;
+  std::vector<SuperVersion*> super_versions;
+  super_versions.reserve(column_families.size());
+
+  if (!options.tailing) {
+    mutex_.Lock();
+    latest_snapshot = versions_->LastSequence();
+    for (auto cfh : column_families) {
+      auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+      super_versions.push_back(cfd->GetSuperVersion()->Ref());
+    }
+    mutex_.Unlock();
+  }
+
+  if (options.tailing) {
+#ifdef ROCKSDB_LITE
+    return Status::InvalidArgument(
+        "Tailing interator not supported in RocksDB lite");
+#else
+    for (auto cfh : column_families) {
+      auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+      iterators->push_back(new TailingIterator(env_, this, options, cfd));
+    }
+#endif
+  } else {
+    for (size_t i = 0; i < column_families.size(); ++i) {
+      auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_families[i]);
+      auto cfd = cfh->cfd();
+
+      auto snapshot =
+          options.snapshot != nullptr
+              ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
+              : latest_snapshot;
+
+      auto iter = NewInternalIterator(options, cfd, super_versions[i]);
+      iter = NewDBIterator(env_, *cfd->options(),
+                           cfd->user_comparator(), iter, snapshot);
+      iterators->push_back(iter);
+    }
+  }
+
+  return Status::OK();
+}
+
+// A snapshot can be taken only when every column family's current memtable
+// representation supports snapshots.
+bool DBImpl::IsSnapshotSupported() const {
+  auto* column_family_set = versions_->GetColumnFamilySet();
+  for (auto cfd : *column_family_set) {
+    if (cfd->mem()->IsSnapshotSupported()) {
+      continue;
+    }
+    return false;
+  }
+  return true;
+}
+
+// Creates a snapshot pinned to the current last sequence number.  Hands back
+// nullptr when any memtable representation cannot support snapshots.
+const Snapshot* DBImpl::GetSnapshot() {
+  if (!IsSnapshotSupported()) {
+    return nullptr;
+  }
+  MutexLock l(&mutex_);
+  return snapshots_.New(versions_->LastSequence());
+}
+
+// Unregisters a snapshot previously obtained from GetSnapshot().
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+  MutexLock l(&mutex_);
+  snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s));
+}
+
+// Convenience methods
+// Put: forwards to the base-class DB::Put helper.
+Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
+                   const Slice& key, const Slice& val) {
+  return DB::Put(o, column_family, key, val);
+}
+
+// Merge is only available when the column family was opened with a
+// merge_operator; otherwise report NotSupported.
+Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family,
+                     const Slice& key, const Slice& val) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  if (cfh->cfd()->options()->merge_operator) {
+    return DB::Merge(o, column_family, key, val);
+  }
+  return Status::NotSupported("Provide a merge_operator when opening DB");
+}
+
+// Delete: forwards to the base-class DB::Delete helper.
+Status DBImpl::Delete(const WriteOptions& options,
+                      ColumnFamilyHandle* column_family, const Slice& key) {
+  return DB::Delete(options, column_family, key);
+}
+
+Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
+  PERF_TIMER_AUTO(write_pre_and_post_process_time);
+  Writer w(&mutex_);
+  w.batch = my_batch;
+  w.sync = options.sync;
+  w.disableWAL = options.disableWAL;
+  w.done = false;
+
+  StopWatch sw(env_, options_.statistics.get(), DB_WRITE, false);
+  mutex_.Lock();
+  writers_.push_back(&w);
+  while (!w.done && &w != writers_.front()) {
+    w.cv.Wait();
+  }
+
+  if (!options.disableWAL) {
+    RecordTick(options_.statistics.get(), WRITE_WITH_WAL, 1);
+  }
+
+  if (w.done) {
+    mutex_.Unlock();
+    RecordTick(options_.statistics.get(), WRITE_DONE_BY_OTHER, 1);
+    return w.status;
+  } else {
+    RecordTick(options_.statistics.get(), WRITE_DONE_BY_SELF, 1);
+  }
+
+  uint64_t flush_column_family_if_log_file = 0;
+  uint64_t max_total_wal_size = (options_.max_total_wal_size == 0)
+                                    ? 2 * max_total_in_memory_state_
+                                    : options_.max_total_wal_size;
+  if (alive_log_files_.begin()->getting_flushed == false &&
+      total_log_size_ > max_total_wal_size) {
+    flush_column_family_if_log_file = alive_log_files_.begin()->number;
+    alive_log_files_.begin()->getting_flushed = true;
+    Log(options_.info_log,
+        "Flushing all column families with data in WAL number %" PRIu64,
+        flush_column_family_if_log_file);
+  }
+
+  Status status;
+  // refcounting cfd in iteration
+  bool dead_cfd = false;
+  autovector<SuperVersion*> superversions_to_free;
+  autovector<log::Writer*> logs_to_free;
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    cfd->Ref();
+    bool force_flush = my_batch == nullptr ||
+                       (flush_column_family_if_log_file != 0 &&
+                        cfd->GetLogNumber() <= flush_column_family_if_log_file);
+    // May temporarily unlock and wait.
+    status = MakeRoomForWrite(cfd, force_flush, &superversions_to_free,
+                              &logs_to_free);
+    if (cfd->Unref()) {
+      dead_cfd = true;
+    }
+    if (!status.ok()) {
+      break;
+    }
+  }
+  if (dead_cfd) {
+    versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+  }
+
+  uint64_t last_sequence = versions_->LastSequence();
+  Writer* last_writer = &w;
+  if (status.ok() && my_batch != nullptr) {  // nullptr batch is for compactions
+    autovector<WriteBatch*> write_batch_group;
+    BuildBatchGroup(&last_writer, &write_batch_group);
+
+    // Add to log and apply to memtable.  We can release the lock
+    // during this phase since &w is currently responsible for logging
+    // and protects against concurrent loggers and concurrent writes
+    // into memtables
+    {
+      mutex_.Unlock();
+      WriteBatch* updates = nullptr;
+      if (write_batch_group.size() == 1) {
+        updates = write_batch_group[0];
+      } else {
+        updates = &tmp_batch_;
+        for (size_t i = 0; i < write_batch_group.size(); ++i) {
+          WriteBatchInternal::Append(updates, write_batch_group[i]);
+        }
+      }
+
+      const SequenceNumber current_sequence = last_sequence + 1;
+      WriteBatchInternal::SetSequence(updates, current_sequence);
+      int my_batch_count = WriteBatchInternal::Count(updates);
+      last_sequence += my_batch_count;
+      // Record statistics
+      RecordTick(options_.statistics.get(),
+                 NUMBER_KEYS_WRITTEN, my_batch_count);
+      RecordTick(options_.statistics.get(),
+                 BYTES_WRITTEN,
+                 WriteBatchInternal::ByteSize(updates));
+      if (options.disableWAL) {
+        flush_on_destroy_ = true;
+      }
+      PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+      if (!options.disableWAL) {
+        PERF_TIMER_START(write_wal_time);
+        Slice log_entry = WriteBatchInternal::Contents(updates);
+        status = log_->AddRecord(log_entry);
+        total_log_size_ += log_entry.size();
+        alive_log_files_.back().AddSize(log_entry.size());
+        log_empty_ = false;
+        RecordTick(options_.statistics.get(), WAL_FILE_SYNCED, 1);
+        RecordTick(options_.statistics.get(), WAL_FILE_BYTES, log_entry.size());
+        if (status.ok() && options.sync) {
+          if (options_.use_fsync) {
+            StopWatch(env_, options_.statistics.get(), WAL_FILE_SYNC_MICROS);
+            status = log_->file()->Fsync();
+          } else {
+            StopWatch(env_, options_.statistics.get(), WAL_FILE_SYNC_MICROS);
+            status = log_->file()->Sync();
+          }
+        }
+        PERF_TIMER_STOP(write_wal_time);
+      }
+      if (status.ok()) {
+        PERF_TIMER_START(write_memtable_time);
+        status = WriteBatchInternal::InsertInto(
+            updates, column_family_memtables_.get(), false, 0, this, false);
+        PERF_TIMER_STOP(write_memtable_time);
+
+        if (!status.ok()) {
+          // Iteration failed (either in-memory writebatch corruption (very
+          // bad), or the client specified invalid column family). Return
+          // failure.
+          // Note that existing logic was not sound. Any partial failure writing
+          // into the memtable would result in a state that some write ops might
+          // have succeeded in memtable but Status reports error for all writes.
+          return status;
+        }
+        SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER,
+                       last_sequence);
+      }
+      PERF_TIMER_START(write_pre_and_post_process_time);
+      if (updates == &tmp_batch_) tmp_batch_.Clear();
+      mutex_.Lock();
+      if (status.ok()) {
+        versions_->SetLastSequence(last_sequence);
+      }
+    }
+  }
+  if (options_.paranoid_checks && !status.ok() && bg_error_.ok()) {
+    bg_error_ = status; // stop compaction & fail any further writes
+  }
+
+  while (true) {
+    Writer* ready = writers_.front();
+    writers_.pop_front();
+    if (ready != &w) {
+      ready->status = status;
+      ready->done = true;
+      ready->cv.Signal();
+    }
+    if (ready == last_writer) break;
+  }
+
+  // Notify new head of write queue
+  if (!writers_.empty()) {
+    writers_.front()->cv.Signal();
+  }
+  mutex_.Unlock();
+
+  for (auto& sv : superversions_to_free) {
+    delete sv;
+  }
+  for (auto& log : logs_to_free) {
+    delete log;
+  }
+
+  PERF_TIMER_STOP(write_pre_and_post_process_time);
+  return status;
+}
+
+// Collects batches from consecutive waiting writers at the front of writers_
+// into *write_batch_group so they can be committed together.  Growth stops
+// when mixing would violate a writer's sync/WAL settings or exceed the size
+// cap.  *last_writer is set to the last writer whose batch was included.
+// REQUIRES: Writer list must be non-empty
+// REQUIRES: First writer must have a non-nullptr batch
+void DBImpl::BuildBatchGroup(Writer** last_writer,
+                             autovector<WriteBatch*>* write_batch_group) {
+  assert(!writers_.empty());
+  Writer* first = writers_.front();
+  assert(first->batch != nullptr);
+
+  size_t size = WriteBatchInternal::ByteSize(first->batch);
+  write_batch_group->push_back(first->batch);
+
+  // Allow the group to grow up to a maximum size, but if the
+  // original write is small, limit the growth so we do not slow
+  // down the small write too much.
+  size_t max_size = 1 << 20;
+  if (size <= (128<<10)) {
+    max_size = size + (128<<10);
+  }
+
+  *last_writer = first;
+  std::deque<Writer*>::iterator iter = writers_.begin();
+  ++iter;  // Advance past "first"
+  for (; iter != writers_.end(); ++iter) {
+    Writer* w = *iter;
+    if (w->sync && !first->sync) {
+      // Do not include a sync write into a batch handled by a non-sync write.
+      break;
+    }
+
+    if (!w->disableWAL && first->disableWAL) {
+      // Do not include a write that needs WAL into a batch that has
+      // WAL disabled.
+      break;
+    }
+
+    if (w->batch != nullptr) {
+      size += WriteBatchInternal::ByteSize(w->batch);
+      if (size > max_size) {
+        // Do not make batch too big
+        break;
+      }
+
+      write_batch_group->push_back(w->batch);
+    }
+    // Writers with a nullptr batch (flush requests) are still consumed as
+    // part of the group; only their batch is absent.
+    *last_writer = w;
+  }
+}
+
+// This function computes the amount of time in microseconds by which a write
+// should be delayed based on the number of level-0 files according to the
+// following formula:
+// if n < bottom, return 0;
+// if n >= top, return 1000;
+// otherwise, let r = (n - bottom) /
+//                    (top - bottom)
+//  and return r^2 * 1000.
+// The goal of this formula is to gradually increase the rate at which writes
+// are slowed. We also tried linear delay (r * 1000), but it seemed to do
+// slightly worse. There is no other particular reason for choosing quadratic.
+uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) {
+  uint64_t delay;
+  if (n >= top) {
+    delay = 1000;
+  }
+  else if (n < bottom) {
+    delay = 0;
+  }
+  else {
+    // If we are here, we know that:
+    //   level0_start_slowdown <= n < level0_slowdown
+    // since the previous two conditions are false.
+    double how_much =
+      (double) (n - bottom) /
+              (top - bottom);
+    delay = std::max(how_much * how_much * 1000, 100.0);
+  }
+  assert(delay <= 1000);
+  return delay;
+}
+
+// Ensures cfd's active memtable can accept the next write.  May delay the
+// caller (L0 slowdown, memtable-flush wait, L0 stop, hard/soft rate limits)
+// or switch to a fresh memtable and, usually, a fresh WAL.  `force` requests
+// a memtable switch even when the current one is not full.  SuperVersions
+// and log writers that become garbage are appended to the caller's vectors
+// for deletion outside the mutex.
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+Status DBImpl::MakeRoomForWrite(
+    ColumnFamilyData* cfd, bool force,
+    autovector<SuperVersion*>* superversions_to_free,
+    autovector<log::Writer*>* logs_to_free) {
+  mutex_.AssertHeld();
+  assert(!writers_.empty());
+  bool allow_delay = !force;
+  bool allow_hard_rate_limit_delay = !force;
+  bool allow_soft_rate_limit_delay = !force;
+  uint64_t rate_limit_delay_millis = 0;
+  Status s;
+  double score;
+
+  while (true) {
+    if (!bg_error_.ok()) {
+      // Yield previous error
+      s = bg_error_;
+      break;
+    } else if (allow_delay && cfd->NeedSlowdownForNumLevel0Files()) {
+      // We are getting close to hitting a hard limit on the number of
+      // L0 files.  Rather than delaying a single write by several
+      // seconds when we hit the hard limit, start delaying each
+      // individual write by 0-1ms to reduce latency variance.  Also,
+      // this delay hands over some CPU to the compaction thread in
+      // case it is sharing the same core as the writer.
+      uint64_t slowdown =
+          SlowdownAmount(cfd->current()->NumLevelFiles(0),
+                         cfd->options()->level0_slowdown_writes_trigger,
+                         cfd->options()->level0_stop_writes_trigger);
+      mutex_.Unlock();
+      uint64_t delayed;
+      {
+        StopWatch sw(env_, options_.statistics.get(), STALL_L0_SLOWDOWN_COUNT);
+        env_->SleepForMicroseconds(slowdown);
+        delayed = sw.ElapsedMicros();
+      }
+      RecordTick(options_.statistics.get(), STALL_L0_SLOWDOWN_MICROS, delayed);
+      cfd->internal_stats()->RecordWriteStall(InternalStats::LEVEL0_SLOWDOWN,
+                                              delayed);
+      allow_delay = false;  // Do not delay a single write more than once
+      mutex_.Lock();
+      delayed_writes_++;
+    } else if (!force && !cfd->mem()->ShouldFlush()) {
+      // There is room in current memtable
+      if (allow_delay) {
+        DelayLoggingAndReset();
+      }
+      break;
+    } else if (cfd->imm()->size() ==
+               cfd->options()->max_write_buffer_number - 1) {
+      // We have filled up the current memtable, but the previous
+      // ones are still being flushed, so we wait.
+      DelayLoggingAndReset();
+      Log(options_.info_log, "[%s] wait for memtable flush...\n",
+          cfd->GetName().c_str());
+      MaybeScheduleFlushOrCompaction();
+      uint64_t stall;
+      {
+        StopWatch sw(env_, options_.statistics.get(),
+                     STALL_MEMTABLE_COMPACTION_COUNT);
+        bg_cv_.Wait();
+        stall = sw.ElapsedMicros();
+      }
+      RecordTick(options_.statistics.get(),
+                 STALL_MEMTABLE_COMPACTION_MICROS, stall);
+      cfd->internal_stats()->RecordWriteStall(
+          InternalStats::MEMTABLE_COMPACTION, stall);
+    } else if (cfd->current()->NumLevelFiles(0) >=
+               cfd->options()->level0_stop_writes_trigger) {
+      // There are too many level-0 files.
+      DelayLoggingAndReset();
+      Log(options_.info_log, "[%s] wait for fewer level0 files...\n",
+          cfd->GetName().c_str());
+      uint64_t stall;
+      {
+        StopWatch sw(env_, options_.statistics.get(),
+                     STALL_L0_NUM_FILES_COUNT);
+        bg_cv_.Wait();
+        stall = sw.ElapsedMicros();
+      }
+      RecordTick(options_.statistics.get(), STALL_L0_NUM_FILES_MICROS, stall);
+      cfd->internal_stats()->RecordWriteStall(InternalStats::LEVEL0_NUM_FILES,
+                                              stall);
+    } else if (allow_hard_rate_limit_delay &&
+               cfd->options()->hard_rate_limit > 1.0 &&
+               (score = cfd->current()->MaxCompactionScore()) >
+                   cfd->options()->hard_rate_limit) {
+      // Delay a write when the compaction score for any level is too large.
+      int max_level = cfd->current()->MaxCompactionScoreLevel();
+      mutex_.Unlock();
+      uint64_t delayed;
+      {
+        StopWatch sw(env_, options_.statistics.get(),
+                     HARD_RATE_LIMIT_DELAY_COUNT);
+        env_->SleepForMicroseconds(1000);
+        delayed = sw.ElapsedMicros();
+      }
+      cfd->internal_stats()->RecordLevelNSlowdown(max_level, delayed);
+      // Make sure the following value doesn't round to zero.
+      uint64_t rate_limit = std::max((delayed / 1000), (uint64_t) 1);
+      rate_limit_delay_millis += rate_limit;
+      RecordTick(options_.statistics.get(),
+                 RATE_LIMIT_DELAY_MILLIS, rate_limit);
+      if (cfd->options()->rate_limit_delay_max_milliseconds > 0 &&
+          rate_limit_delay_millis >=
+              (unsigned)cfd->options()->rate_limit_delay_max_milliseconds) {
+        allow_hard_rate_limit_delay = false;
+      }
+      mutex_.Lock();
+    } else if (allow_soft_rate_limit_delay &&
+               cfd->options()->soft_rate_limit > 0.0 &&
+               (score = cfd->current()->MaxCompactionScore()) >
+                   cfd->options()->soft_rate_limit) {
+      // Delay a write when the compaction score for any level is too large.
+      // TODO: add statistics
+      mutex_.Unlock();
+      {
+        StopWatch sw(env_, options_.statistics.get(),
+                     SOFT_RATE_LIMIT_DELAY_COUNT);
+        env_->SleepForMicroseconds(
+            SlowdownAmount(score, cfd->options()->soft_rate_limit,
+                           cfd->options()->hard_rate_limit));
+        rate_limit_delay_millis += sw.ElapsedMicros();
+      }
+      allow_soft_rate_limit_delay = false;
+      mutex_.Lock();
+
+    } else {
+      unique_ptr<WritableFile> lfile;
+      log::Writer* new_log = nullptr;
+      MemTable* new_mem = nullptr;
+
+      // Attempt to switch to a new memtable and trigger flush of old.
+      // Do this without holding the dbmutex lock.
+      assert(versions_->PrevLogNumber() == 0);
+      bool creating_new_log = !log_empty_;
+      uint64_t new_log_number =
+          creating_new_log ? versions_->NewFileNumber() : logfile_number_;
+      SuperVersion* new_superversion = nullptr;
+      mutex_.Unlock();
+      {
+        DelayLoggingAndReset();
+        if (creating_new_log) {
+          s = env_->NewWritableFile(
+              LogFileName(options_.wal_dir, new_log_number), &lfile,
+              env_->OptimizeForLogWrite(storage_options_));
+          if (s.ok()) {
+            // Our final size should be less than write_buffer_size
+            // (compression, etc) but err on the side of caution.
+            lfile->SetPreallocationBlockSize(1.1 *
+                                             cfd->options()->write_buffer_size);
+            new_log = new log::Writer(std::move(lfile));
+          }
+        }
+
+        if (s.ok()) {
+          new_mem = new MemTable(cfd->internal_comparator(), *cfd->options());
+          new_superversion = new SuperVersion();
+        }
+      }
+      mutex_.Lock();
+      if (!s.ok()) {
+        // how do we fail if we're not creating new log?
+        assert(creating_new_log);
+        // Avoid chewing through file number space in a tight loop.
+        versions_->ReuseFileNumber(new_log_number);
+        assert(!new_mem);
+        assert(!new_log);
+        break;
+      }
+      if (creating_new_log) {
+        logfile_number_ = new_log_number;
+        assert(new_log != nullptr);
+        logs_to_free->push_back(log_.release());
+        log_.reset(new_log);
+        log_empty_ = true;
+        alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
+        for (auto cfd : *versions_->GetColumnFamilySet()) {
+          // all this is just optimization to delete logs that
+          // are no longer needed -- if CF is empty, that means it
+          // doesn't need that particular log to stay alive, so we just
+          // advance the log number. no need to persist this in the manifest
+          if (cfd->mem()->GetFirstSequenceNumber() == 0 &&
+              cfd->imm()->size() == 0) {
+            cfd->SetLogNumber(logfile_number_);
+          }
+        }
+      }
+      // Retire the full memtable into the immutable list and install the
+      // fresh one; the old SuperVersion goes back to the caller for deletion.
+      cfd->mem()->SetNextLogNumber(logfile_number_);
+      cfd->imm()->Add(cfd->mem());
+      if (force) {
+        cfd->imm()->FlushRequested();
+      }
+      new_mem->Ref();
+      cfd->SetMemtable(new_mem);
+      Log(options_.info_log, "[%s] New memtable created with log file: #%lu\n",
+          cfd->GetName().c_str(), (unsigned long)logfile_number_);
+      force = false;  // Do not force another compaction if have room
+      MaybeScheduleFlushOrCompaction();
+      superversions_to_free->push_back(
+          cfd->InstallSuperVersion(new_superversion, &mutex_));
+    }
+  }
+  return s;
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+                                        TablePropertiesCollection* props) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+
+  // Increment the ref count
+  mutex_.Lock();
+  auto version = cfd->current();
+  version->Ref();
+  mutex_.Unlock();
+
+  auto s = version->GetPropertiesOfAllTables(props);
+
+  // Decrement the ref count
+  mutex_.Lock();
+  version->Unref();
+  mutex_.Unlock();
+
+  return s;
+}
+#endif  // ROCKSDB_LITE
+
+// Returns the database path/name this instance was opened with.
+const std::string& DBImpl::GetName() const {
+  return dbname_;
+}
+
+// Returns the Env used by this DB instance.
+Env* DBImpl::GetEnv() const {
+  return env_;
+}
+
+// Returns the effective Options of the given column family.
+const Options& DBImpl::GetOptions(ColumnFamilyHandle* column_family) const {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  return *cfh->cfd()->options();
+}
+
+bool DBImpl::GetProperty(ColumnFamilyHandle* column_family,
+                         const Slice& property, std::string* value) {
+  value->clear();
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+  DBPropertyType property_type = GetPropertyType(property);
+  MutexLock l(&mutex_);
+  return cfd->internal_stats()->GetProperty(property_type, property, value,
+                                            cfd);
+}
+
+// Fills sizes[i] with an approximation of the bytes spanned by key range
+// [range[i].start, range[i].limit) in the given column family, based on
+// approximate offsets within the current version.
+void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                 const Range* range, int n, uint64_t* sizes) {
+  // TODO(opt): better implementation
+  Version* v;
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+  {
+    MutexLock l(&mutex_);
+    v = cfd->current();
+    v->Ref();  // pin the version for the unlocked queries below
+  }
+
+  for (int i = 0; i < n; i++) {
+    // Convert user_key into a corresponding internal key.
+    InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+    uint64_t start = versions_->ApproximateOffsetOf(v, k1);
+    uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
+    sizes[i] = (limit >= start ? limit - start : 0);
+  }
+
+  {
+    MutexLock l(&mutex_);
+    v->Unref();
+  }
+}
+
+// Logs how many writes were delayed since the last call, then resets the
+// counter.  No-op when nothing was delayed.
+inline void DBImpl::DelayLoggingAndReset() {
+  if (delayed_writes_ > 0) {
+    Log(options_.info_log, "delayed %d write...\n", delayed_writes_ );
+    delayed_writes_ = 0;
+  }
+}
+
+#ifndef ROCKSDB_LITE
+// Returns in *iter a TransactionLogIterator positioned over WAL updates with
+// sequence numbers starting at `seq`.  Fails with NotFound when `seq` is
+// beyond the last sequence number written to the DB.
+Status DBImpl::GetUpdatesSince(
+    SequenceNumber seq, unique_ptr<TransactionLogIterator>* iter,
+    const TransactionLogIterator::ReadOptions& read_options) {
+
+  RecordTick(options_.statistics.get(), GET_UPDATES_SINCE_CALLS);
+  if (seq > versions_->LastSequence()) {
+    return Status::NotFound("Requested sequence not yet written in the db");
+  }
+  //  Get all sorted Wal Files.
+  //  Do binary search and open files and find the seq number.
+
+  std::unique_ptr<VectorLogPtr> wal_files(new VectorLogPtr);
+  Status s = GetSortedWalFiles(*wal_files);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Drop WAL files that cannot contain sequence numbers >= seq.
+  s = RetainProbableWalFiles(*wal_files, seq);
+  if (!s.ok()) {
+    return s;
+  }
+  iter->reset(new TransactionLogIteratorImpl(options_.wal_dir, &options_,
+                                             read_options, storage_options_,
+                                             seq, std::move(wal_files), this));
+  return (*iter)->status();
+}
+
+// Deletes a single file by name: either an archived WAL file or a live
+// SST file. SST deletion is only permitted when every level below the
+// file's level is empty, so that deletion tombstones it may shadow are
+// never exposed.
+Status DBImpl::DeleteFile(std::string name) {
+  uint64_t number;
+  FileType type;
+  WalFileType log_type;
+  if (!ParseFileName(name, &number, &type, &log_type) ||
+      (type != kTableFile && type != kLogFile)) {
+    Log(options_.info_log, "DeleteFile %s failed.\n", name.c_str());
+    return Status::InvalidArgument("Invalid file name");
+  }
+
+  Status status;
+  if (type == kLogFile) {
+    // Only allow deleting archived log files
+    if (log_type != kArchivedLogFile) {
+      Log(options_.info_log, "DeleteFile %s failed - not archived log.\n",
+          name.c_str());
+      return Status::NotSupported("Delete only supported for archived logs");
+    }
+    status = env_->DeleteFile(options_.wal_dir + "/" + name.c_str());
+    if (!status.ok()) {
+      Log(options_.info_log, "DeleteFile %s failed -- %s.\n",
+          name.c_str(), status.ToString().c_str());
+    }
+    return status;
+  }
+
+  // SST file path: update the manifest under the mutex, then purge the
+  // physical file outside the mutex.
+  int level;
+  FileMetaData* metadata;
+  ColumnFamilyData* cfd;
+  VersionEdit edit;
+  DeletionState deletion_state(true);
+  {
+    MutexLock l(&mutex_);
+    status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd);
+    if (!status.ok()) {
+      Log(options_.info_log, "DeleteFile %s failed. File not found\n",
+                             name.c_str());
+      return Status::InvalidArgument("File not found");
+    }
+    // NOTE(review): level > 0 implies level-0 files are never deletable
+    // through this path — confirm that is the intended contract.
+    assert((level > 0) && (level < cfd->NumberLevels()));
+
+    // If the file is being compacted no need to delete.
+    if (metadata->being_compacted) {
+      Log(options_.info_log,
+          "DeleteFile %s Skipped. File about to be compacted\n", name.c_str());
+      return Status::OK();
+    }
+
+    // Only the files in the last level can be deleted externally.
+    // This is to make sure that any deletion tombstones are not
+    // lost. Check that the level passed is the last level.
+    for (int i = level + 1; i < cfd->NumberLevels(); i++) {
+      if (cfd->current()->NumLevelFiles(i) != 0) {
+        Log(options_.info_log,
+            "DeleteFile %s FAILED. File not in last level\n", name.c_str());
+        return Status::InvalidArgument("File not in last level");
+      }
+    }
+    edit.DeleteFile(level, number);
+    status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get());
+    if (status.ok()) {
+      InstallSuperVersion(cfd, deletion_state);
+    }
+    FindObsoleteFiles(deletion_state, false);
+  } // lock released here
+  LogFlush(options_.info_log);
+  // remove files outside the db-lock
+  if (deletion_state.HaveSomethingToDelete()) {
+    PurgeObsoleteFiles(deletion_state);
+  }
+  {
+    MutexLock l(&mutex_);
+    // schedule flush if file deletion means we freed the space for flushes to
+    // continue
+    MaybeScheduleFlushOrCompaction();
+  }
+  return status;
+}
+
+// Fills *metadata with metadata for all live SST files. Takes mutex_ for
+// the duration of the VersionSet query.
+void DBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+  MutexLock l(&mutex_);
+  versions_->GetLiveFilesMetaData(metadata);
+}
+#endif  // ROCKSDB_LITE
+
+// Verifies that every live SST file recorded in the manifest exists on
+// the file system and that its on-disk size matches the manifest's
+// recorded size. Accumulates all mismatches into a single Corruption
+// status rather than failing on the first one.
+// REQUIRES: mutex_ held (asserted below).
+Status DBImpl::CheckConsistency() {
+  mutex_.AssertHeld();
+  std::vector<LiveFileMetaData> metadata;
+  versions_->GetLiveFilesMetaData(&metadata);
+
+  std::string corruption_messages;
+  for (const auto& md : metadata) {
+    // md.name starts with "/" (see GetLiveFiles contract), so simple
+    // concatenation yields a full path.
+    std::string file_path = dbname_ + md.name;
+    uint64_t fsize = 0;
+    Status s = env_->GetFileSize(file_path, &fsize);
+    if (!s.ok()) {
+      corruption_messages +=
+          "Can't access " + md.name + ": " + s.ToString() + "\n";
+    } else if (fsize != md.size) {
+      corruption_messages += "Sst file size mismatch: " + md.name +
+                             ". Size recorded in manifest " +
+                             std::to_string(md.size) + ", actual size " +
+                             std::to_string(fsize) + "\n";
+    }
+  }
+  if (corruption_messages.size() == 0) {
+    return Status::OK();
+  } else {
+    return Status::Corruption(corruption_messages);
+  }
+}
+
+Status DBImpl::GetDbIdentity(std::string& identity) {
+  std::string idfilename = IdentityFileName(dbname_);
+  unique_ptr<SequentialFile> idfile;
+  const EnvOptions soptions;
+  Status s = env_->NewSequentialFile(idfilename, &idfile, soptions);
+  if (!s.ok()) {
+    return s;
+  }
+  uint64_t file_size;
+  s = env_->GetFileSize(idfilename, &file_size);
+  if (!s.ok()) {
+    return s;
+  }
+  char buffer[file_size];
+  Slice id;
+  s = idfile->Read(file_size, &id, buffer);
+  if (!s.ok()) {
+    return s;
+  }
+  identity.assign(id.ToString());
+  // If last character is '\n' remove it from identity
+  if (identity.size() > 0 && identity.back() == '\n') {
+    identity.pop_back();
+  }
+  return s;
+}
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish. Each wraps the single operation in a WriteBatch
+// and routes it through the subclass's Write().
+Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+               const Slice& key, const Slice& value) {
+  // Pre-allocate size of write batch conservatively.
+  // 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
+  // and we allocate 11 extra bytes for key length, as well as value length.
+  WriteBatch batch(key.size() + value.size() + 24);
+  batch.Put(column_family, key, value);
+  return Write(opt, &batch);
+}
+
+// Default Delete: wraps the deletion in a one-entry WriteBatch and
+// forwards to Write().
+Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+                  const Slice& key) {
+  WriteBatch batch;
+  batch.Delete(column_family, key);
+  return Write(opt, &batch);
+}
+
+// Default Merge: wraps the merge in a one-entry WriteBatch and forwards
+// to Write().
+Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+                 const Slice& key, const Slice& value) {
+  WriteBatch batch;
+  batch.Merge(column_family, key, value);
+  return Write(opt, &batch);
+}
+
+// Default implementation -- returns not supported status. Concrete DB
+// implementations (e.g. DBImpl) override this.
+Status DB::CreateColumnFamily(const ColumnFamilyOptions& options,
+                              const std::string& column_family_name,
+                              ColumnFamilyHandle** handle) {
+  return Status::NotSupported("");
+}
+// Default implementation -- returns not supported status; overridden by
+// concrete DB implementations.
+Status DB::DropColumnFamily(ColumnFamilyHandle* column_family) {
+  return Status::NotSupported("");
+}
+
+DB::~DB() { }
+
+// Opens a DB with only the default column family. Forwards to the
+// column-family-aware Open() and then releases the returned default
+// handle, which is safe because DBImpl holds its own reference to the
+// default column family.
+Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // i can delete the handle since DBImpl is always holding a reference to
+    // default column family
+    delete handles[0];
+  }
+  return s;
+}
+
+Status DB::Open(const DBOptions& db_options, const std::string& dbname,
+                const std::vector<ColumnFamilyDescriptor>& column_families,
+                std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+  *dbptr = nullptr;
+  handles->clear();
+
+  size_t max_write_buffer_size = 0;
+  for (auto cf : column_families) {
+    max_write_buffer_size =
+        std::max(max_write_buffer_size, cf.options.write_buffer_size);
+    if (cf.options.block_cache != nullptr && cf.options.no_block_cache) {
+      return Status::InvalidArgument(
+          "no_block_cache is true while block_cache is not nullptr");
+    }
+  }
+
+  DBImpl* impl = new DBImpl(db_options, dbname);
+  Status s = impl->env_->CreateDirIfMissing(impl->options_.wal_dir);
+  if (!s.ok()) {
+    delete impl;
+    return s;
+  }
+
+  s = impl->CreateArchivalDirectory();
+  if (!s.ok()) {
+    delete impl;
+    return s;
+  }
+  impl->mutex_.Lock();
+  // Handles create_if_missing, error_if_exists
+  s = impl->Recover(column_families);
+  if (s.ok()) {
+    uint64_t new_log_number = impl->versions_->NewFileNumber();
+    unique_ptr<WritableFile> lfile;
+    EnvOptions soptions(db_options);
+    s = impl->options_.env->NewWritableFile(
+        LogFileName(impl->options_.wal_dir, new_log_number), &lfile,
+        impl->options_.env->OptimizeForLogWrite(soptions));
+    if (s.ok()) {
+      lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size);
+      impl->logfile_number_ = new_log_number;
+      impl->log_.reset(new log::Writer(std::move(lfile)));
+
+      // set column family handles
+      for (auto cf : column_families) {
+        auto cfd =
+            impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+        if (cfd == nullptr) {
+          s = Status::InvalidArgument("Column family not found: ", cf.name);
+          break;
+        }
+        handles->push_back(
+            new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+      }
+    }
+    if (s.ok()) {
+      for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+        delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_);
+      }
+      impl->alive_log_files_.push_back(
+          DBImpl::LogFileNumberSize(impl->logfile_number_));
+      impl->DeleteObsoleteFiles();
+      impl->MaybeScheduleFlushOrCompaction();
+      impl->MaybeScheduleLogDBDeployStats();
+      s = impl->db_directory_->Fsync();
+    }
+  }
+
+  if (s.ok()) {
+    for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+      if (cfd->options()->compaction_style == kCompactionStyleUniversal) {
+        Version* current = cfd->current();
+        for (int i = 1; i < current->NumberLevels(); ++i) {
+          int num_files = current->NumLevelFiles(i);
+          if (num_files > 0) {
+            s = Status::InvalidArgument("Not all files are at level 0. Cannot "
+                "open with universal compaction style.");
+            break;
+          }
+        }
+      }
+      if (cfd->options()->merge_operator != nullptr &&
+          !cfd->mem()->IsMergeOperatorSupported()) {
+        s = Status::InvalidArgument(
+            "The memtable of column family %s does not support merge operator "
+            "its options.merge_operator is non-null", cfd->GetName().c_str());
+      }
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+
+  impl->mutex_.Unlock();
+
+  if (s.ok()) {
+    impl->opened_successfully_ = true;
+    *dbptr = impl;
+  } else {
+    for (auto h : *handles) {
+      delete h;
+    }
+    handles->clear();
+    delete impl;
+  }
+  return s;
+}
+
+// Lists the names of all column families in the DB at `name` without
+// opening it; delegates to VersionSet::ListColumnFamilies.
+Status DB::ListColumnFamilies(const DBOptions& db_options,
+                              const std::string& name,
+                              std::vector<std::string>* column_families) {
+  return VersionSet::ListColumnFamilies(column_families, name, db_options.env);
+}
+
+// Out-of-line destructor definition for the abstract Snapshot interface.
+Snapshot::~Snapshot() {
+}
+
+// Destroys all on-disk state of the named DB: data files, WAL files
+// (including those in a separate wal_dir), archived WAL files, the lock
+// file, and finally the directories themselves. Returns the first
+// deletion error encountered, but keeps deleting the remaining files.
+Status DestroyDB(const std::string& dbname, const Options& options) {
+  const InternalKeyComparator comparator(options.comparator);
+  const InternalFilterPolicy filter_policy(options.filter_policy);
+  const Options& soptions(SanitizeOptions(
+    dbname, &comparator, &filter_policy, options));
+  Env* env = soptions.env;
+  std::vector<std::string> filenames;
+  std::vector<std::string> archiveFiles;
+
+  std::string archivedir = ArchivalDirectory(dbname);
+  // Ignore error in case directory does not exist
+  env->GetChildren(dbname, &filenames);
+
+  // If the WAL directory is separate, gather its files too and point the
+  // archive directory at it instead.
+  if (dbname != soptions.wal_dir) {
+    std::vector<std::string> logfilenames;
+    env->GetChildren(soptions.wal_dir, &logfilenames);
+    filenames.insert(filenames.end(), logfilenames.begin(), logfilenames.end());
+    archivedir = ArchivalDirectory(soptions.wal_dir);
+  }
+
+  if (filenames.empty()) {
+    return Status::OK();
+  }
+
+  // Take the DB lock so we don't destroy a DB that is currently open.
+  FileLock* lock;
+  const std::string lockname = LockFileName(dbname);
+  Status result = env->LockFile(lockname, &lock);
+  if (result.ok()) {
+    uint64_t number;
+    FileType type;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type) &&
+          type != kDBLockFile) {  // Lock file will be deleted at end
+        Status del;
+        if (type == kMetaDatabase) {
+          // Recursively destroy nested meta-databases.
+          del = DestroyDB(dbname + "/" + filenames[i], options);
+        } else if (type == kLogFile) {
+          del = env->DeleteFile(soptions.wal_dir + "/" + filenames[i]);
+        } else {
+          del = env->DeleteFile(dbname + "/" + filenames[i]);
+        }
+        // Remember only the first failure.
+        if (result.ok() && !del.ok()) {
+          result = del;
+        }
+      }
+    }
+
+    env->GetChildren(archivedir, &archiveFiles);
+    // Delete archival files.
+    for (size_t i = 0; i < archiveFiles.size(); ++i) {
+      if (ParseFileName(archiveFiles[i], &number, &type) &&
+          type == kLogFile) {
+        Status del = env->DeleteFile(archivedir + "/" + archiveFiles[i]);
+        if (result.ok() && !del.ok()) {
+          result = del;
+        }
+      }
+    }
+    // ignore case where no archival directory is present.
+    env->DeleteDir(archivedir);
+
+    env->UnlockFile(lock);  // Ignore error since state is already gone
+    env->DeleteFile(lockname);
+    env->DeleteDir(dbname);  // Ignore error in case dir contains other files
+    env->DeleteDir(soptions.wal_dir);
+  }
+  return result;
+}
+
+// A global method that dumps the build version (git sha and compile
+// time) to the given logger.
+void DumpLeveldbBuildVersion(Logger * log) {
+#if !defined(IOS_CROSS_COMPILE)
+  // When compiling with Xcode we don't run build_detect_version, so
+  // util/build_version.cc (which defines these symbols) is not generated.
+  Log(log, "Git sha %s", rocksdb_build_git_sha);
+  Log(log, "Compile time %s %s",
+      rocksdb_build_compile_time, rocksdb_build_compile_date);
+#endif
+}
+
+}  // namespace rocksdb
diff --git a/db/db_impl.h b/db/db_impl.h
new file mode 100644 (file)
index 0000000..cc59cfd
--- /dev/null
@@ -0,0 +1,623 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <set>
+#include <utility>
+#include <vector>
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "db/column_family.h"
+#include "db/version_edit.h"
+#include "memtable_list.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "util/autovector.h"
+#include "util/stats_logger.h"
+#include "util/thread_local.h"
+#include "db/internal_stats.h"
+
+namespace rocksdb {
+
+class MemTable;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+class CompactionFilterV2;
+
+class DBImpl : public DB {
+ public:
+  DBImpl(const DBOptions& options, const std::string& dbname);
+  virtual ~DBImpl();
+
+  // Implementations of the DB interface
+  using DB::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value);
+  using DB::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value);
+  using DB::Delete;
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family, const Slice& key);
+  using DB::Write;
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates);
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     std::string* value);
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values);
+
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family,
+                                    ColumnFamilyHandle** handle);
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
+
+  // Returns false if key doesn't exist in the database and true if it may.
+  // If value_found is not passed in as null, then return the value if found in
+  // memory. On return, if value was found, then value_found will be set to true
+  // , otherwise false.
+  using DB::KeyMayExist;
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value, bool* value_found = nullptr);
+  using DB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family);
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators);
+  virtual const Snapshot* GetSnapshot();
+  virtual void ReleaseSnapshot(const Snapshot* snapshot);
+  using DB::GetProperty;
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property, std::string* value);
+  using DB::GetApproximateSizes;
+  virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                   const Range* range, int n, uint64_t* sizes);
+  using DB::CompactRange;
+  virtual Status CompactRange(ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end,
+                              bool reduce_level = false, int target_level = -1);
+
+  using DB::NumberLevels;
+  virtual int NumberLevels(ColumnFamilyHandle* column_family);
+  using DB::MaxMemCompactionLevel;
+  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family);
+  using DB::Level0StopWriteTrigger;
+  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family);
+  virtual const std::string& GetName() const;
+  virtual Env* GetEnv() const;
+  using DB::GetOptions;
+  virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const;
+  using DB::Flush;
+  virtual Status Flush(const FlushOptions& options,
+                       ColumnFamilyHandle* column_family);
+
+  virtual SequenceNumber GetLatestSequenceNumber() const;
+
+#ifndef ROCKSDB_LITE
+  virtual Status DisableFileDeletions();
+  virtual Status EnableFileDeletions(bool force);
+  // All the returned filenames start with "/"
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true);
+  virtual Status GetSortedWalFiles(VectorLogPtr& files);
+
+  virtual Status GetUpdatesSince(
+      SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions&
+          read_options = TransactionLogIterator::ReadOptions());
+  virtual Status DeleteFile(std::string name);
+
+  virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
+#endif  // ROCKSDB_LITE
+
+  // checks if all live files exist on file system and that their file sizes
+  // match to our in-memory records
+  virtual Status CheckConsistency();
+
+  virtual Status GetDbIdentity(std::string& identity);
+
+  Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
+                             int output_level, const Slice* begin,
+                             const Slice* end);
+
+#ifndef ROCKSDB_LITE
+  // Extra methods (for testing) that are not in the public DB interface
+  // Implemented in db_impl_debug.cc
+
+  // Compact any files in the named level that overlap [*begin, *end]
+  Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
+                           ColumnFamilyHandle* column_family = nullptr);
+
+  // Force current memtable contents to be flushed.
+  Status TEST_FlushMemTable(bool wait = true);
+
+  // Wait for memtable compaction
+  Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
+
+  // Wait for any compaction
+  Status TEST_WaitForCompact();
+
+  // Return an internal iterator over the current state of the database.
+  // The keys of this iterator are internal keys (see format.h).
+  // The returned iterator should be deleted when no longer needed.
+  Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family =
+                                         nullptr);
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  int64_t TEST_MaxNextLevelOverlappingBytes(ColumnFamilyHandle* column_family =
+                                                nullptr);
+
+  // Return the current manifest file no.
+  uint64_t TEST_Current_Manifest_FileNo();
+
+  // Triggers a purge of obsolete WAL files, for testing.
+  void TEST_PurgeObsoleteteWAL();
+
+  // get total level0 file size. Only for testing.
+  uint64_t TEST_GetLevel0TotalSize();
+
+  // Override the interval after which obsolete WALs are checked for
+  // deletion. Only for testing.
+  void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL)
+  {
+    default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
+  }
+
+  void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family,
+                             std::vector<std::vector<FileMetaData>>* metadata);
+
+  Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
+                              SequenceNumber* sequence);
+
+  Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
+#endif  // ROCKSDB_LITE
+
+  // Accumulator for state that must be released or deleted after the db
+  // mutex is dropped. Needed for CleanupIteratorState.
+  struct DeletionState {
+    // True if any file list is non-empty, i.e. PurgeObsoleteFiles has work.
+    inline bool HaveSomethingToDelete() const {
+      return  candidate_files.size() ||
+        sst_delete_files.size() ||
+        log_delete_files.size();
+    }
+
+    // a list of all files that we'll consider deleting
+    // (every once in a while this is filled up with all files
+    // in the DB directory)
+    std::vector<std::string> candidate_files;
+
+    // the list of all live sst files that cannot be deleted
+    std::vector<uint64_t> sst_live;
+
+    // a list of sst files that we need to delete
+    std::vector<FileMetaData*> sst_delete_files;
+
+    // a list of log files that we need to delete
+    std::vector<uint64_t> log_delete_files;
+
+    // a list of memtables to be freed once the mutex is released
+    autovector<MemTable*> memtables_to_free;
+
+    // superversions whose last reference was dropped; freed in the dtor
+    autovector<SuperVersion*> superversions_to_free;
+
+    SuperVersion* new_superversion;  // if nullptr no new superversion
+
+    // the current manifest_file_number, log_number and prev_log_number
+    // that corresponds to the set of files in 'live'.
+    uint64_t manifest_file_number, pending_manifest_file_number, log_number,
+        prev_log_number;
+
+    // If create_superversion is true, eagerly allocate a SuperVersion so
+    // it can be installed later without allocating under the mutex.
+    explicit DeletionState(bool create_superversion = false) {
+      manifest_file_number = 0;
+      pending_manifest_file_number = 0;
+      log_number = 0;
+      prev_log_number = 0;
+      new_superversion = create_superversion ? new SuperVersion() : nullptr;
+    }
+
+    ~DeletionState() {
+      // free pending memtables
+      for (auto m : memtables_to_free) {
+        delete m;
+      }
+      // free superversions
+      for (auto s : superversions_to_free) {
+        delete s;
+      }
+      // if new_superversion was not used, it will be non-nullptr and needs
+      // to be freed here
+      delete new_superversion;
+    }
+  };
+
+  // Returns the list of live files in 'live' and the list
+  // of all files in the filesystem in 'candidate_files'.
+  // If force == false and the last call was less than
+  // options_.delete_obsolete_files_period_micros microseconds ago,
+  // it will not fill up the deletion_state
+  void FindObsoleteFiles(DeletionState& deletion_state,
+                         bool force,
+                         bool no_full_scan = false);
+
+  // Diffs the files listed in candidate_files against the live files;
+  // those that do not belong to live files are possibly removed. Also
+  // removes all the files in sst_delete_files and log_delete_files.
+  // It is not necessary to hold the mutex when invoking this method.
+  void PurgeObsoleteFiles(DeletionState& deletion_state);
+
+  ColumnFamilyHandle* DefaultColumnFamily() const;
+
+ protected:
+  Env* const env_;
+  const std::string dbname_;
+  unique_ptr<VersionSet> versions_;
+  const DBOptions options_;
+
+  Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd,
+                                SuperVersion* super_version);
+
+ private:
+  friend class DB;
+  friend class InternalStats;
+#ifndef ROCKSDB_LITE
+  friend class TailingIterator;
+#endif
+  friend struct SuperVersion;
+  struct CompactionState;
+  struct Writer;
+
+  Status NewDB();
+
+  // Recover the descriptor from persistent storage.  May do a significant
+  // amount of work to recover recently logged updates.  Any changes to
+  // be made to the descriptor are added to *edit.
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                 bool read_only = false, bool error_if_log_file_exist = false);
+
+  void MaybeIgnoreError(Status* s) const;
+
+  const Status CreateArchivalDirectory();
+
+  // Delete any unneeded files and stale in-memory entries.
+  void DeleteObsoleteFiles();
+
+  // Flush the in-memory write buffer to storage.  Switches to a new
+  // log-file/memtable and writes a new descriptor iff successful.
+  Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress,
+                                   DeletionState& deletion_state,
+                                   LogBuffer* log_buffer);
+
+  Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
+                        bool read_only);
+
+  // The following two methods are used to flush a memtable to
+  // storage. The first one is used at database recovery time (when the
+  // database is opened) and is heavyweight because it holds the mutex
+  // for the entire period. The second method, WriteLevel0Table, supports
+  // flushing memtables to storage concurrently.
+  Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
+                                     VersionEdit* edit);
+  Status WriteLevel0Table(ColumnFamilyData* cfd, autovector<MemTable*>& mems,
+                          VersionEdit* edit, uint64_t* filenumber,
+                          LogBuffer* log_buffer);
+
+  uint64_t SlowdownAmount(int n, double bottom, double top);
+
+  // TODO(icanadi) free superversion_to_free and old_log outside of mutex
+  Status MakeRoomForWrite(ColumnFamilyData* cfd,
+                          bool force /* flush even if there is room? */,
+                          autovector<SuperVersion*>* superversions_to_free,
+                          autovector<log::Writer*>* logs_to_free);
+
+  void BuildBatchGroup(Writer** last_writer,
+                       autovector<WriteBatch*>* write_batch_group);
+
+  // Force current memtable contents to be flushed.
+  Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);
+
+  // Wait for memtable flushed
+  Status WaitForFlushMemTable(ColumnFamilyData* cfd);
+
+  void MaybeScheduleLogDBDeployStats();
+
+#ifndef ROCKSDB_LITE
+  static void BGLogDBDeployStats(void* db);
+  void LogDBDeployStats();
+#endif  // ROCKSDB_LITE
+
+  void MaybeScheduleFlushOrCompaction();
+  static void BGWorkCompaction(void* db);
+  static void BGWorkFlush(void* db);
+  void BackgroundCallCompaction();
+  void BackgroundCallFlush();
+  Status BackgroundCompaction(bool* madeProgress, DeletionState& deletion_state,
+                              LogBuffer* log_buffer);
+  Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state,
+                         LogBuffer* log_buffer);
+  void CleanupCompaction(CompactionState* compact, Status status);
+  Status DoCompactionWork(CompactionState* compact,
+                          DeletionState& deletion_state,
+                          LogBuffer* log_buffer);
+
+  // This function is called as part of compaction. It enables the flush
+  // process to preempt compaction, since flush has higher priority.
+  // Returns: micros spent executing
+  uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd,
+                                     DeletionState& deletion_state,
+                                     LogBuffer* log_buffer);
+
+  // Call compaction filter if is_compaction_v2 is not true. Then iterate
+  // through input and compact the kv-pairs
+  Status ProcessKeyValueCompaction(
+    SequenceNumber visible_at_tip,
+    SequenceNumber earliest_snapshot,
+    SequenceNumber latest_snapshot,
+    DeletionState& deletion_state,
+    bool bottommost_level,
+    int64_t& imm_micros,
+    Iterator* input,
+    CompactionState* compact,
+    bool is_compaction_v2,
+    LogBuffer* log_buffer);
+
+  // Call compaction_filter_v2->Filter() on kv-pairs in compact
+  void CallCompactionFilterV2(CompactionState* compact,
+    CompactionFilterV2* compaction_filter_v2);
+
+  Status OpenCompactionOutputFile(CompactionState* compact);
+  Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
+  Status InstallCompactionResults(CompactionState* compact,
+                                  LogBuffer* log_buffer);
+  void AllocateCompactionOutputFileNumbers(CompactionState* compact);
+  void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
+
+#ifdef ROCKSDB_LITE
+  void PurgeObsoleteWALFiles() {
+    // this function is used for archiving WAL files. we don't need this in
+    // ROCKSDB_LITE
+  }
+#else
+  void PurgeObsoleteWALFiles();
+
+  Status GetSortedWalsOfType(const std::string& path,
+                             VectorLogPtr& log_files,
+                             WalFileType type);
+
+  // Requires: all_logs should be sorted with earliest log file first
+  // Retains all log files in all_logs which contain updates with seq no.
+  // Greater Than or Equal to the requested SequenceNumber.
+  Status RetainProbableWalFiles(VectorLogPtr& all_logs,
+                                const SequenceNumber target);
+
+  Status ReadFirstRecord(const WalFileType type, const uint64_t number,
+                         SequenceNumber* sequence);
+
+  Status ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
+#endif  // ROCKSDB_LITE
+
+  void PrintStatistics();
+
+  // dump rocksdb.stats to LOG
+  void MaybeDumpStats();
+
+  // Return true if the current db supports snapshot.  If the current
+  // DB does not support snapshot, then calling GetSnapshot() will always
+  // return nullptr.
+  //
+  // @see GetSnapshot()
+  virtual bool IsSnapshotSupported() const;
+
+  // Return the minimum empty level that could hold the total data in the
+  // input level. Return the input level, if such level could not be found.
+  int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level);
+
+  // Move the files in the input level to the target level.
+  // If target_level < 0, automatically calculate the minimum level that could
+  // hold the data set.
+  Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
+
+  // table_cache_ provides its own synchronization
+  std::shared_ptr<Cache> table_cache_;
+
+  // Lock over the persistent DB state.  Non-nullptr iff successfully acquired.
+  FileLock* db_lock_;
+
+  // State below is protected by mutex_
+  port::Mutex mutex_;
+  port::AtomicPointer shutting_down_;
+  port::CondVar bg_cv_;          // Signalled when background work finishes
+  uint64_t logfile_number_;
+  unique_ptr<log::Writer> log_;
+  bool log_empty_;
+  ColumnFamilyHandleImpl* default_cf_handle_;
+  unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
+  // Bookkeeping entry for one live WAL file: its number, the bytes
+  // written to it so far, and whether a flush of its data is underway.
+  struct LogFileNumberSize {
+    explicit LogFileNumberSize(uint64_t _number)
+        : number(_number), size(0), getting_flushed(false) {}
+    void AddSize(uint64_t new_size) { size += new_size; }
+    uint64_t number;
+    uint64_t size;
+    bool getting_flushed;
+  };
+  std::deque<LogFileNumberSize> alive_log_files_;
+  uint64_t total_log_size_;
+  // only used for dynamically adjusting max_total_wal_size. it is a sum of
+  // [write_buffer_size * max_write_buffer_number] over all column families
+  uint64_t max_total_in_memory_state_;
+
+  std::string host_name_;
+
+  std::unique_ptr<Directory> db_directory_;
+
+  // Queue of writers.
+  std::deque<Writer*> writers_;
+  WriteBatch tmp_batch_;
+
+  SnapshotList snapshots_;
+
+  // cache for ReadFirstRecord() calls
+  std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
+  port::Mutex read_first_record_cache_mutex_;
+
+  // Set of table files to protect from deletion because they are
+  // part of ongoing compactions.
+  std::set<uint64_t> pending_outputs_;
+
+  // At least one compaction or flush job is pending but not yet scheduled
+  // because of the max background thread limit.
+  bool bg_schedule_needed_;
+
+  // count how many background compactions are running or have been scheduled
+  int bg_compaction_scheduled_;
+
+  // If non-zero, MaybeScheduleFlushOrCompaction() will only schedule manual
+  // compactions (if manual_compaction_ is not null). This mechanism enables
+  // manual compactions to wait until all other compactions are finished.
+  int bg_manual_only_;
+
+  // number of background memtable flush jobs, submitted to the HIGH pool
+  int bg_flush_scheduled_;
+
+  // Has a background stats log thread scheduled?
+  bool bg_logstats_scheduled_;
+
+  // Information for a manual compaction
+  struct ManualCompaction {
+    ColumnFamilyData* cfd;
+    int input_level;
+    int output_level;
+    bool done;
+    Status status;
+    bool in_progress;           // compaction request being processed?
+    const InternalKey* begin;   // nullptr means beginning of key range
+    const InternalKey* end;     // nullptr means end of key range
+    InternalKey tmp_storage;    // Used to keep track of compaction progress
+  };
+  ManualCompaction* manual_compaction_;
+
+  // Have we encountered a background error in paranoid mode?
+  Status bg_error_;
+
+  std::unique_ptr<StatsLogger> logger_;
+
+  int64_t volatile last_log_ts;
+
+  // shall we disable deletion of obsolete files
+  // if 0 the deletion is enabled.
+  // if non-zero, files will not be getting deleted
+  // This enables two different threads to call
+  // EnableFileDeletions() and DisableFileDeletions()
+  // without any synchronization
+  int disable_delete_obsolete_files_;
+
+  // last time when DeleteObsoleteFiles was invoked
+  uint64_t delete_obsolete_files_last_run_;
+
+  // last time when PurgeObsoleteWALFiles ran.
+  uint64_t purge_wal_files_last_run_;
+
+  // last time stats were dumped to LOG
+  std::atomic<uint64_t> last_stats_dump_time_microsec_;
+
+  // obsolete files will be deleted every this seconds if ttl deletion is
+  // enabled and archive size_limit is disabled.
+  uint64_t default_interval_to_delete_obsolete_WAL_;
+
+  bool flush_on_destroy_; // Used when disableWAL is true.
+
+  static const int KEEP_LOG_FILE_NUM = 1000;
+  std::string db_absolute_path_;
+
+  // count of the number of contiguous delaying writes
+  int delayed_writes_;
+
+  // The options to access storage files
+  const EnvOptions storage_options_;
+
+  // A value of true temporarily disables scheduling of background work
+  bool bg_work_gate_closed_;
+
+  // Guard against multiple concurrent refitting
+  bool refitting_level_;
+
+  // Indicate DB was opened successfully
+  bool opened_successfully_;
+
+  // No copying allowed
+  DBImpl(const DBImpl&);
+  void operator=(const DBImpl&);
+
+  // dump the delayed_writes_ to the log file and reset counter.
+  void DelayLoggingAndReset();
+
+  // Return the earliest snapshot where seqno is visible.
+  // Store the snapshot right before that, if any, in prev_snapshot
+  inline SequenceNumber findEarliestVisibleSnapshot(
+    SequenceNumber in,
+    std::vector<SequenceNumber>& snapshots,
+    SequenceNumber* prev_snapshot);
+
+  // Background threads call this function, which is just a wrapper around
+  // the cfd->InstallSuperVersion() function. Background threads carry
+  // deletion_state which can have new_superversion already allocated.
+  void InstallSuperVersion(ColumnFamilyData* cfd,
+                           DeletionState& deletion_state);
+
+#ifndef ROCKSDB_LITE
+  using DB::GetPropertiesOfAllTables;
+  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+                                          TablePropertiesCollection* props)
+      override;
+#endif  // ROCKSDB_LITE
+
+  // Function that Get and KeyMayExist call with no_io true or false
+  // Note: 'value_found' from KeyMayExist propagates here
+  Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
+                 const Slice& key, std::string* value,
+                 bool* value_found = nullptr);
+};
+
+// Sanitize db options.  The caller should delete result.info_log if
+// it is not equal to src.info_log.
+extern Options SanitizeOptions(const std::string& db,
+                               const InternalKeyComparator* icmp,
+                               const InternalFilterPolicy* ipolicy,
+                               const Options& src);
+// Overload that sanitizes only the DB-level options (no comparator or
+// filter-policy state is involved).
+extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);
+
+// Determine compression type, based on user options, level of the output
+// file and whether compression is disabled.
+// If enable_compression is false, then compression is always disabled no
+// matter what the values of the other two parameters are.
+// Otherwise, the compression type is determined based on options and level.
+CompressionType GetCompressionType(const Options& options, int level,
+                                   const bool enable_compression);
+
+// Determine compression type for L0 file written by memtable flush.
+CompressionType GetCompressionFlush(const Options& options);
+
+}  // namespace rocksdb
diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc
new file mode 100644 (file)
index 0000000..d6551b4
--- /dev/null
@@ -0,0 +1,132 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_impl.h"
+
+namespace rocksdb {
+
+// Test hook: synchronously purge obsolete WAL files.  (The extra "ete" in
+// the method name is a typo, but it is part of the TEST_ interface declared
+// in db_impl.h, so it is preserved here.)
+void DBImpl::TEST_PurgeObsoleteteWAL() { PurgeObsoleteWALFiles(); }
+
+// Test hook: total bytes stored in level-0 of the default column family,
+// read under mutex_.
+uint64_t DBImpl::TEST_GetLevel0TotalSize() {
+  MutexLock l(&mutex_);
+  return default_cf_handle_->cfd()->current()->NumLevelBytes(0);
+}
+
+// Test hook: builds a merged internal iterator (memtables + sstables) over
+// the given column family, or the default column family when column_family
+// is nullptr.
+Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+    cfd = cfh->cfd();
+  }
+
+  mutex_.Lock();
+  // Pin the current super version for the iterator's lifetime.
+  // NOTE(review): the reference is presumably released by cleanup registered
+  // inside NewInternalIterator -- confirm against that implementation.
+  SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+  mutex_.Unlock();
+  ReadOptions roptions;
+  return NewInternalIterator(roptions, cfd, super_version);
+}
+
+// Test hook: maximum number of bytes by which any file overlaps the files
+// of the next level, for the given (or default) column family.
+int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
+    ColumnFamilyHandle* column_family) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+    cfd = cfh->cfd();
+  }
+  MutexLock l(&mutex_);
+  return cfd->current()->MaxNextLevelOverlappingBytes();
+}
+
+// Test hook: copies the FileMetaData of every file in every level of the
+// given column family into *metadata (outer index = level).
+void DBImpl::TEST_GetFilesMetaData(
+    ColumnFamilyHandle* column_family,
+    std::vector<std::vector<FileMetaData>>* metadata) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+  MutexLock l(&mutex_);
+  metadata->resize(NumberLevels());
+  for (int level = 0; level < NumberLevels(); level++) {
+    const std::vector<FileMetaData*>& files = cfd->current()->files_[level];
+
+    (*metadata)[level].clear();
+    for (const auto& f : files) {
+      (*metadata)[level].push_back(*f);  // deep copy, caller owns the result
+    }
+  }
+}
+
+// Test hook: file number of the current MANIFEST.
+// NOTE(review): reads versions_ without holding mutex_ -- looks intentional
+// for test use, but confirm no concurrent manifest roll can race here.
+uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
+  return versions_->ManifestFileNumber();
+}
+
+// Test hook: manually compact [begin, end] at `level`.  Under universal
+// compaction the output stays in the same level; otherwise it goes to
+// level + 1.
+Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
+                                 const Slice* end,
+                                 ColumnFamilyHandle* column_family) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+    cfd = cfh->cfd();
+  }
+  int output_level =
+      (cfd->options()->compaction_style == kCompactionStyleUniversal)
+          ? level
+          : level + 1;
+  return RunManualCompaction(cfd, level, output_level, begin, end);
+}
+
+// Test hook: flush the default column family's memtable; `wait` controls
+// whether the call blocks until the flush finishes.
+Status DBImpl::TEST_FlushMemTable(bool wait) {
+  FlushOptions fo;
+  fo.wait = wait;
+  return FlushMemTable(default_cf_handle_->cfd(), fo);
+}
+
+// Test hook: block until any pending memtable flush for the given (or
+// default) column family completes.
+Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+    cfd = cfh->cfd();
+  }
+  return WaitForFlushMemTable(cfd);
+}
+
+// Test hook: block until no background compaction or flush remains
+// scheduled, or until a background error is recorded.  Returns the
+// background error status (OK when everything drained cleanly).
+Status DBImpl::TEST_WaitForCompact() {
+  // Wait until the compaction completes
+
+  // TODO: a bug here. This function actually does not necessarily
+  // wait for compact. It actually waits for scheduled compaction
+  // OR flush to finish.
+
+  MutexLock l(&mutex_);
+  while ((bg_compaction_scheduled_ || bg_flush_scheduled_) && bg_error_.ok()) {
+    bg_cv_.Wait();
+  }
+  return bg_error_;
+}
+
+// Test hook: expose ReadFirstRecord -- fetches the sequence number of the
+// first record in WAL file `number` of the given type.
+Status DBImpl::TEST_ReadFirstRecord(const WalFileType type,
+                                    const uint64_t number,
+                                    SequenceNumber* sequence) {
+  return ReadFirstRecord(type, number, sequence);
+}
+
+// Test hook: expose ReadFirstLine -- reads the first record of the log file
+// at `fname` and stores its sequence number in *sequence.
+Status DBImpl::TEST_ReadFirstLine(const std::string& fname,
+                                  SequenceNumber* sequence) {
+  return ReadFirstLine(fname, sequence);
+}
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc
new file mode 100644 (file)
index 0000000..4308374
--- /dev/null
@@ -0,0 +1,154 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "db/db_impl_readonly.h"
+#include "db/db_impl.h"
+
+#include <algorithm>
+#include <set>
+#include <string>
+#include <stdint.h>
+#include <stdio.h>
+#include <vector>
+#include <algorithm>
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/merge_operator.h"
+#include "port/port.h"
+#include "table/block.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/build_version.h"
+
+namespace rocksdb {
+
+// Construct the read-only DB wrapper.  All mutating operations are rejected
+// by the DBImplReadOnly overrides declared in db_impl_readonly.h.
+DBImplReadOnly::DBImplReadOnly(const DBOptions& options,
+                               const std::string& dbname)
+    : DBImpl(options, dbname) {
+  Log(options_.info_log, "Opening the db in read only mode");
+}
+
+// Nothing to tear down beyond what the base DBImpl destructor handles.
+DBImplReadOnly::~DBImplReadOnly() {
+}
+
+// Implementations of the DB interface
+// Read-only Get: look `key` up first in the memtable, then in the current
+// version.  Always reads at the latest sequence number; the snapshot field
+// of `options` is not consulted here.
+Status DBImplReadOnly::Get(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value) {
+  Status s;
+  SequenceNumber snapshot = versions_->LastSequence();
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+  // No Ref() taken -- presumably safe because a read-only DB never installs
+  // a new super version after open; confirm if this ever changes.
+  SuperVersion* super_version = cfd->GetSuperVersion();
+  MergeContext merge_context;
+  LookupKey lkey(key, snapshot);
+  if (super_version->mem->Get(lkey, value, &s, merge_context,
+                              *cfd->options())) {
+    // Resolved by the memtable (hit or tombstone); `s`/`value` already set.
+  } else {
+    Version::GetStats stats;
+    super_version->current->Get(options, lkey, value, &s, &merge_context,
+                                &stats);
+  }
+  return s;
+}
+
+// Read-only NewIterator: wraps a merged internal iterator in a DBIter.
+// Unlike Get() above, this honors options.snapshot when one is supplied;
+// otherwise it iterates at the latest sequence number.
+Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options,
+                                      ColumnFamilyHandle* column_family) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+  SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+  SequenceNumber latest_snapshot = versions_->LastSequence();
+  Iterator* internal_iter = NewInternalIterator(options, cfd, super_version);
+  return NewDBIterator(
+      env_, *cfd->options(), cfd->user_comparator(), internal_iter,
+      (options.snapshot != nullptr
+           ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
+           : latest_snapshot));
+}
+
+// Convenience overload: open in read-only mode with just the default column
+// family, delegating to the column-family-aware overload below.
+Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
+                           DB** dbptr, bool error_if_log_file_exist) {
+  *dbptr = nullptr;
+
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+
+  Status s =
+      DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // i can delete the handle since DBImpl is always holding a
+    // reference to default column family
+    delete handles[0];
+  }
+  return s;
+}
+
+// Open a read-only DB with an explicit list of column families.  On success,
+// *dbptr owns the DB and *handles receives one handle per descriptor (caller
+// deletes the handles).  On failure, everything allocated here is cleaned up.
+Status DB::OpenForReadOnly(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+    bool error_if_log_file_exist) {
+  *dbptr = nullptr;
+  handles->clear();
+
+  DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
+  impl->mutex_.Lock();
+  Status s = impl->Recover(column_families, true /* read only */,
+                           error_if_log_file_exist);
+  if (s.ok()) {
+    // set column family handles
+    for (auto cf : column_families) {
+      auto cfd =
+          impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+      if (cfd == nullptr) {
+        // Requested column family does not exist in the recovered manifest.
+        s = Status::InvalidArgument("Column family not found: ", cf.name);
+        break;
+      }
+      handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+    }
+  }
+  if (s.ok()) {
+    // Install an initial super version for every recovered column family;
+    // InstallSuperVersion returns the one to free (may be the fresh spare).
+    for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+      delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_);
+    }
+  }
+  impl->mutex_.Unlock();
+  if (s.ok()) {
+    *dbptr = impl;
+  } else {
+    // Roll back: free any handles created before the failure, then the impl.
+    for (auto h : *handles) {
+      delete h;
+    }
+    handles->clear();
+    delete impl;
+  }
+  return s;
+}
+
+
+}   // namespace rocksdb
diff --git a/db/db_impl_readonly.h b/db/db_impl_readonly.h
new file mode 100644 (file)
index 0000000..c4703ba
--- /dev/null
@@ -0,0 +1,103 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#pragma once
+#include "db/db_impl.h"
+
+#include <deque>
+#include <set>
+#include <vector>
+#include <string>
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/stats_logger.h"
+
+namespace rocksdb {
+
+// DBImpl variant used by DB::OpenForReadOnly().  Read operations reuse the
+// DBImpl machinery; every mutating DB operation is overridden below to
+// return Status::NotSupported.
+class DBImplReadOnly : public DBImpl {
+ public:
+  DBImplReadOnly(const DBOptions& options, const std::string& dbname);
+  virtual ~DBImplReadOnly();
+
+  // Implementations of the DB interface
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     std::string* value);
+
+  // TODO: Implement ReadOnly MultiGet?
+
+  using DBImpl::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions&,
+                                ColumnFamilyHandle* column_family);
+
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      std::vector<Iterator*>* iterators) {
+   // TODO
+    return Status::NotSupported("Not supported yet.");
+  }
+
+  // Every operation from here down would mutate the DB and is therefore
+  // rejected in read-only mode.
+  using DBImpl::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  using DBImpl::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  using DBImpl::Delete;
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family, const Slice& key) {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates) {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  using DBImpl::CompactRange;
+  virtual Status CompactRange(ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end,
+                              bool reduce_level = false,
+                              int target_level = -1) {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  virtual Status DisableFileDeletions() {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  virtual Status EnableFileDeletions(bool force) {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  using DBImpl::Flush;
+  virtual Status Flush(const FlushOptions& options,
+                       ColumnFamilyHandle* column_family) {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
+ private:
+  // DB::OpenForReadOnly() needs access to the private constructor path.
+  friend class DB;
+
+  // No copying allowed
+  DBImplReadOnly(const DBImplReadOnly&);
+  void operator=(const DBImplReadOnly&);
+};
+}
diff --git a/db/db_iter.cc b/db/db_iter.cc
new file mode 100644 (file)
index 0000000..a6d765d
--- /dev/null
@@ -0,0 +1,477 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_iter.h"
+#include <stdexcept>
+#include <deque>
+
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/perf_context_imp.h"
+
+namespace rocksdb {
+
+#if 0
+// Debug helper (compiled out): prints every internal key yielded by `iter`
+// to stderr, flagging entries whose internal key fails to parse.
+static void DumpInternalIter(Iterator* iter) {
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey k;
+    if (!ParseInternalKey(iter->key(), &k)) {
+      fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
+    } else {
+      fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
+    }
+  }
+}
+#endif
+
+namespace {
+
+// Memtables and sstables that make the DB representation contain
+// (userkey,seq,type) => uservalue entries.  DBIter
+// combines multiple entries for the same userkey found in the DB
+// representation into a single entry while accounting for sequence
+// numbers, deletion markers, overwrites, etc.
+class DBIter: public Iterator {
+ public:
+  // The following is grossly complicated. TODO: clean it up
+  // Which direction is the iterator currently moving?
+  // (1) When moving forward, the internal iterator is positioned at
+  //     the exact entry that yields this->key(), this->value()
+  // (2) When moving backwards, the internal iterator is positioned
+  //     just before all entries whose user key == this->key().
+  enum Direction {
+    kForward,
+    kReverse
+  };
+
+  // Takes ownership of `iter` (deleted in the destructor).  `s` is the
+  // sequence number bound: only entries with sequence <= s are visible.
+  DBIter(Env* env, const Options& options,
+         const Comparator* cmp, Iterator* iter, SequenceNumber s)
+      : env_(env),
+        logger_(options.info_log.get()),
+        user_comparator_(cmp),
+        user_merge_operator_(options.merge_operator.get()),
+        iter_(iter),
+        sequence_(s),
+        direction_(kForward),
+        valid_(false),
+        current_entry_is_merged_(false),
+        statistics_(options.statistics.get()) {
+    RecordTick(statistics_, NO_ITERATORS, 1);
+    max_skip_ = options.max_sequential_skip_in_iterations;
+  }
+  virtual ~DBIter() {
+    RecordTick(statistics_, NO_ITERATORS, -1);
+    delete iter_;  // DBIter owns the internal iterator
+  }
+  virtual bool Valid() const { return valid_; }
+  virtual Slice key() const {
+    assert(valid_);
+    return saved_key_.GetKey();
+  }
+  virtual Slice value() const {
+    assert(valid_);
+    // Forward, non-merged entries read straight from the internal iterator;
+    // reverse iteration and merge results are materialized in saved_value_.
+    return (direction_ == kForward && !current_entry_is_merged_) ?
+      iter_->value() : saved_value_;
+  }
+  virtual Status status() const {
+    // Our own status (e.g. corruption from ParseKey) takes precedence over
+    // the internal iterator's status.
+    if (status_.ok()) {
+      return iter_->status();
+    } else {
+      return status_;
+    }
+  }
+
+  virtual void Next();
+  virtual void Prev();
+  virtual void Seek(const Slice& target);
+  virtual void SeekToFirst();
+  virtual void SeekToLast();
+
+ private:
+  inline void FindNextUserEntry(bool skipping);
+  void FindNextUserEntryInternal(bool skipping);
+  void FindPrevUserEntry();
+  bool ParseKey(ParsedInternalKey* key);
+  void MergeValuesNewToOld();
+
+  // Reset saved_value_, releasing its heap buffer when it has grown past
+  // 1MB so a single huge value does not pin memory for the iterator's life.
+  inline void ClearSavedValue() {
+    if (saved_value_.capacity() > 1048576) {
+      std::string empty;
+      swap(empty, saved_value_);
+    } else {
+      saved_value_.clear();
+    }
+  }
+
+  Env* const env_;
+  Logger* logger_;
+  const Comparator* const user_comparator_;
+  const MergeOperator* const user_merge_operator_;
+  Iterator* const iter_;       // owned internal iterator
+  SequenceNumber const sequence_;  // visibility bound for this iterator
+
+  Status status_;
+  IterKey saved_key_;   // == current key when direction_==kReverse
+  std::string saved_value_;   // == current raw value when direction_==kReverse
+  std::string skip_key_;
+  Direction direction_;
+  bool valid_;
+  bool current_entry_is_merged_;
+  Statistics* statistics_;
+  uint64_t max_skip_;  // sequential skips allowed before falling back to Seek
+
+  // No copying allowed
+  DBIter(const DBIter&);
+  void operator=(const DBIter&);
+};
+
+// Decode iter_->key() into *ikey.  On failure, record a Corruption status,
+// log the raw key, and return false.
+inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
+  if (!ParseInternalKey(iter_->key(), ikey)) {
+    status_ = Status::Corruption("corrupted internal key in DBIter");
+    Log(logger_, "corrupted internal key in DBIter: %s",
+        iter_->key().ToString(true).c_str());
+    return false;
+  } else {
+    return true;
+  }
+}
+
+// Advance to the next distinct user key visible at sequence_.
+void DBIter::Next() {
+  assert(valid_);
+
+  if (direction_ == kReverse) {  // Switch directions?
+    direction_ = kForward;
+    // iter_ is pointing just before the entries for this->key(),
+    // so advance into the range of entries for this->key() and then
+    // use the normal skipping code below.
+    if (!iter_->Valid()) {
+      iter_->SeekToFirst();
+    } else {
+      iter_->Next();
+    }
+    if (!iter_->Valid()) {
+      valid_ = false;
+      saved_key_.Clear();
+      return;
+    }
+  }
+
+  // If the current value is merged, we might already hit end of iter_
+  if (!iter_->Valid()) {
+    valid_ = false;
+    return;
+  }
+  FindNextUserEntry(true /* skipping the current user key */);
+}
+
+
+// PRE: saved_key_ has the current user key if skipping
+// POST: saved_key_ should have the next user key if valid_,
+//       if the current entry is a result of merge
+//           current_entry_is_merged_ => true
+//           saved_value_             => the merged value
+//
+// NOTE: In between, saved_key_ can point to a user key that has
+//       a delete marker
+// Thin wrapper that times the real work in FindNextUserEntryInternal().
+inline void DBIter::FindNextUserEntry(bool skipping) {
+  PERF_TIMER_AUTO(find_next_user_entry_time);
+  FindNextUserEntryInternal(skipping);
+  PERF_TIMER_STOP(find_next_user_entry_time);
+}
+
+// Actual implementation of DBIter::FindNextUserEntry()
+void DBIter::FindNextUserEntryInternal(bool skipping) {
+  // Loop until we hit an acceptable entry to yield
+  assert(iter_->Valid());
+  assert(direction_ == kForward);
+  current_entry_is_merged_ = false;
+  uint64_t num_skipped = 0;
+  do {
+    ParsedInternalKey ikey;
+    // Entries newer than sequence_ are invisible and simply stepped over.
+    if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
+      if (skipping &&
+          user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
+        num_skipped++; // skip this entry
+        PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+      } else {
+        skipping = false;
+        switch (ikey.type) {
+          case kTypeDeletion:
+            // Arrange to skip all upcoming entries for this key since
+            // they are hidden by this deletion.
+            saved_key_.SetUserKey(ikey.user_key);
+            skipping = true;
+            num_skipped = 0;
+            PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+            break;
+          case kTypeValue:
+            valid_ = true;
+            saved_key_.SetUserKey(ikey.user_key);
+            return;
+          case kTypeMerge:
+            // By now, we are sure the current ikey is going to yield a value
+            saved_key_.SetUserKey(ikey.user_key);
+            current_entry_is_merged_ = true;
+            valid_ = true;
+            MergeValuesNewToOld();  // Go to a different state machine
+            return;
+          default:
+            assert(false);
+            break;
+        }
+      }
+    }
+    // If we have sequentially iterated via numerous keys and still not
+    // found the next user-key, then it is better to seek so that we can
+    // avoid too many key comparisons. We seek to the last occurrence of
+    // our current key by looking for sequence number 0.
+    if (skipping && num_skipped > max_skip_) {
+      num_skipped = 0;
+      std::string last_key;
+      AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(), 0,
+                                                     kValueTypeForSeek));
+      iter_->Seek(last_key);
+      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+    } else {
+      iter_->Next();
+    }
+  } while (iter_->Valid());
+  // Exhausted the internal iterator without finding a visible entry.
+  valid_ = false;
+}
+
+// Merge values of the same user key starting from the current iter_ position
+// Scan from the newer entries to older entries.
+// PRE: iter_->key() points to the first merge type entry
+//      saved_key_ stores the user key
+// POST: saved_value_ has the merged value for the user key
+//       iter_ points to the next entry (or invalid)
+void DBIter::MergeValuesNewToOld() {
+  if (!user_merge_operator_) {
+    // Merge-type entries cannot be resolved without a merge operator.
+    Log(logger_, "Options::merge_operator is null.");
+    throw std::logic_error("DBIter::MergeValuesNewToOld() with"
+                           " Options::merge_operator null");
+  }
+
+  // Start the merge process by pushing the first operand
+  std::deque<std::string> operands;
+  operands.push_front(iter_->value().ToString());
+
+  std::string merge_result;   // Temporary string to hold merge result later
+  ParsedInternalKey ikey;
+  for (iter_->Next(); iter_->Valid(); iter_->Next()) {
+    if (!ParseKey(&ikey)) {
+      // skip corrupted key
+      continue;
+    }
+
+    if (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) != 0) {
+      // hit the next user key, stop right here
+      break;
+    }
+
+    if (kTypeDeletion == ikey.type) {
+      // hit a delete with the same user key, stop right here
+      // iter_ is positioned after delete
+      iter_->Next();
+      break;
+    }
+
+    if (kTypeValue == ikey.type) {
+      // hit a put, merge the put value with operands and store the
+      // final result in saved_value_. We are done!
+      // ignore corruption if there is any.
+      const Slice value = iter_->value();
+      user_merge_operator_->FullMerge(ikey.user_key, &value, operands,
+                                      &saved_value_, logger_);
+      // iter_ is positioned after put
+      iter_->Next();
+      return;
+    }
+
+    if (kTypeMerge == ikey.type) {
+      // hit another merge: stack the value as a newer operand (operands are
+      // kept newest-first) and keep scanning older entries.
+      const Slice& value = iter_->value();
+      operands.push_front(value.ToString());
+    }
+  }
+
+  // we either exhausted all internal keys under this user key, or hit
+  // a deletion marker.
+  // feed null as the existing value to the merge operator, such that
+  // client can differentiate this scenario and do things accordingly.
+  user_merge_operator_->FullMerge(saved_key_.GetKey(), nullptr, operands,
+                                  &saved_value_, logger_);
+}
+
+// Step back to the previous distinct, non-deleted user key.
+void DBIter::Prev() {
+  assert(valid_);
+
+  // Throw an exception now if merge_operator is provided
+  // TODO: support backward iteration
+  if (user_merge_operator_) {
+    Log(logger_, "Prev not supported yet if merge_operator is provided");
+    throw std::logic_error("DBIter::Prev backward iteration not supported"
+                           " if merge_operator is provided");
+  }
+
+  if (direction_ == kForward) {  // Switch directions?
+    // iter_ is pointing at the current entry.  Scan backwards until
+    // the key changes so we can use the normal reverse scanning code.
+    assert(iter_->Valid());  // Otherwise valid_ would have been false
+    saved_key_.SetUserKey(ExtractUserKey(iter_->key()));
+    while (true) {
+      iter_->Prev();
+      if (!iter_->Valid()) {
+        // Ran off the front of the DB while rewinding; nothing before us.
+        valid_ = false;
+        saved_key_.Clear();
+        ClearSavedValue();
+        return;
+      }
+      if (user_comparator_->Compare(ExtractUserKey(iter_->key()),
+                                    saved_key_.GetKey()) < 0) {
+        break;
+      }
+    }
+    direction_ = kReverse;
+  }
+
+  FindPrevUserEntry();
+}
+
+// Scan backwards to the newest visible entry of the previous user key,
+// materializing its value into saved_value_.  Ends with value_type ==
+// kTypeDeletion when no live entry exists before the current position.
+void DBIter::FindPrevUserEntry() {
+  assert(direction_ == kReverse);
+  uint64_t num_skipped = 0;
+
+  ValueType value_type = kTypeDeletion;
+  bool saved_key_valid = true;
+  if (iter_->Valid()) {
+    do {
+      ParsedInternalKey ikey;
+      if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
+        if ((value_type != kTypeDeletion) &&
+            user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) < 0) {
+          // We encountered a non-deleted value in entries for previous keys,
+          break;
+        }
+        value_type = ikey.type;
+        if (value_type == kTypeDeletion) {
+          saved_key_.Clear();
+          ClearSavedValue();
+          saved_key_valid = false;
+        } else {
+          Slice raw_value = iter_->value();
+          // Shrink saved_value_'s buffer if it is >1MB larger than needed.
+          if (saved_value_.capacity() > raw_value.size() + 1048576) {
+            std::string empty;
+            swap(empty, saved_value_);
+          }
+          saved_key_.SetUserKey(ExtractUserKey(iter_->key()));
+          saved_value_.assign(raw_value.data(), raw_value.size());
+        }
+      } else {
+        // In the case of ikey.sequence > sequence_, we might have already
+        // iterated to a different user key.
+        saved_key_valid = false;
+      }
+      num_skipped++;
+      // If we have sequentially iterated via numerous keys and still not
+      // found the prev user-key, then it is better to seek so that we can
+      // avoid too many key comparisons. We seek to the first occurrence of
+      // our current key by looking for max sequence number.
+      if (saved_key_valid && num_skipped > max_skip_) {
+        num_skipped = 0;
+        std::string last_key;
+        AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(),
+                                                       kMaxSequenceNumber,
+                                                       kValueTypeForSeek));
+        iter_->Seek(last_key);
+        RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+      } else {
+        iter_->Prev();
+      }
+    } while (iter_->Valid());
+  }
+
+  if (value_type == kTypeDeletion) {
+    // End
+    valid_ = false;
+    saved_key_.Clear();
+    ClearSavedValue();
+    direction_ = kForward;
+  } else {
+    valid_ = true;
+  }
+}
+
+// Position the iterator at the first user key >= target that is visible at
+// sequence_.
+void DBIter::Seek(const Slice& target) {
+  saved_key_.Clear();
+  // now saved_key_ is used to store the internal key being sought.
+  saved_key_.SetInternalKey(target, sequence_);
+  PERF_TIMER_AUTO(seek_internal_seek_time);
+  iter_->Seek(saved_key_.GetKey());
+  PERF_TIMER_STOP(seek_internal_seek_time);
+  if (iter_->Valid()) {
+    direction_ = kForward;
+    ClearSavedValue();
+    FindNextUserEntry(false /*not skipping */);
+  } else {
+    valid_ = false;
+  }
+}
+
+// Position the iterator at the first visible user key in the DB.
+void DBIter::SeekToFirst() {
+  direction_ = kForward;
+  ClearSavedValue();
+  PERF_TIMER_AUTO(seek_internal_seek_time);
+  iter_->SeekToFirst();
+  PERF_TIMER_STOP(seek_internal_seek_time);
+  if (iter_->Valid()) {
+    FindNextUserEntry(false /* not skipping */);
+  } else {
+    valid_ = false;
+  }
+}
+
+// Position the iterator at the last visible user key in the DB.
+void DBIter::SeekToLast() {
+  // Throw an exception for now if merge_operator is provided
+  // TODO: support backward iteration
+  if (user_merge_operator_) {
+    Log(logger_, "SeekToLast not supported yet if merge_operator is provided");
+    throw std::logic_error("DBIter::SeekToLast: backward iteration not"
+                           " supported if merge_operator is provided");
+  }
+
+  direction_ = kReverse;
+  ClearSavedValue();
+  PERF_TIMER_AUTO(seek_internal_seek_time);
+  iter_->SeekToLast();
+  PERF_TIMER_STOP(seek_internal_seek_time);
+  FindPrevUserEntry();
+}
+
+}  // anonymous namespace
+
+// Factory for DBIter (the class itself lives in the anonymous namespace
+// above).  Takes ownership of internal_iter.
+Iterator* NewDBIterator(
+    Env* env,
+    const Options& options,
+    const Comparator *user_key_comparator,
+    Iterator* internal_iter,
+    const SequenceNumber& sequence) {
+  return new DBIter(env, options, user_key_comparator,
+                    internal_iter, sequence);
+}
+
+}  // namespace rocksdb
diff --git a/db/db_iter.h b/db/db_iter.h
new file mode 100644 (file)
index 0000000..d8a3bad
--- /dev/null
@@ -0,0 +1,27 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+#include "rocksdb/db.h"
+#include "db/dbformat.h"
+
+namespace rocksdb {
+
+// Return a new iterator that converts internal keys (yielded by
+// "*internal_iter") that were live at the specified "sequence" number
+// into appropriate user keys.
+extern Iterator* NewDBIterator(
+    Env* env,
+    const Options& options,
+    const Comparator *user_key_comparator,
+    Iterator* internal_iter,
+    const SequenceNumber& sequence);
+
+}  // namespace rocksdb
diff --git a/db/db_stats_logger.cc b/db/db_stats_logger.cc
new file mode 100644 (file)
index 0000000..288e1bf
--- /dev/null
@@ -0,0 +1,95 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+#include <string>
+#include <stdint.h>
+#include <stdio.h>
+#include "db/version_set.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+
+namespace rocksdb {
+
+void DBImpl::MaybeScheduleLogDBDeployStats() {
+// we did say maybe
+#ifndef ROCKSDB_LITE
+  // There is a lock in the actual logger.
+  if (!logger_ || options_.db_stats_log_interval < 0
+      || host_name_.empty()) {
+    return;
+  }
+
+  if(bg_logstats_scheduled_ || shutting_down_.Acquire_Load()) {
+    // Already scheduled
+  } else {
+    int64_t current_ts = 0;
+    Status st = env_->GetCurrentTime(&current_ts);
+    if (!st.ok()) {
+      return;
+    }
+    if ((current_ts - last_log_ts) < options_.db_stats_log_interval) {
+      return;
+    }
+    last_log_ts = current_ts;
+    bg_logstats_scheduled_ = true;
+    env_->Schedule(&DBImpl::BGLogDBDeployStats, this);
+  }
+}
+
+void DBImpl::BGLogDBDeployStats(void* db) {
+  DBImpl* db_inst = reinterpret_cast<DBImpl*>(db);
+  db_inst->LogDBDeployStats();
+}
+
+void DBImpl::LogDBDeployStats() {
+  mutex_.Lock();
+
+  if (shutting_down_.Acquire_Load()) {
+    bg_logstats_scheduled_ = false;
+    bg_cv_.SignalAll();
+    mutex_.Unlock();
+    return;
+  }
+
+  char tmp_ver[100];
+  sprintf(tmp_ver, "%d.%d", kMajorVersion, kMinorVersion);
+  std::string version_info(tmp_ver);
+
+  uint64_t file_total_size = 0;
+  uint32_t file_total_num = 0;
+  Version* current = default_cf_handle_->cfd()->current();
+  for (int i = 0; i < current->NumberLevels(); i++) {
+    file_total_num += current->NumLevelFiles(i);
+    file_total_size += current->NumLevelBytes(i);
+  }
+
+  Version::LevelSummaryStorage scratch;
+  const char* file_num_summary = current->LevelSummary(&scratch);
+  std::string file_num_per_level(file_num_summary);
+  std::string data_size_per_level(file_num_summary);
+
+  mutex_.Unlock();
+
+  int64_t unix_ts;
+  env_->GetCurrentTime(&unix_ts);
+
+  logger_->Log_Deploy_Stats(version_info, host_name_,
+      db_absolute_path_, file_total_size, file_total_num, file_num_per_level,
+      data_size_per_level, unix_ts);
+
+  mutex_.Lock();
+  bg_logstats_scheduled_ = false;
+  bg_cv_.SignalAll();
+  mutex_.Unlock();
+#endif
+}
+}
diff --git a/db/db_test.cc b/db/db_test.cc
new file mode 100644 (file)
index 0000000..5162cec
--- /dev/null
@@ -0,0 +1,6834 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <iostream>
+#include <set>
+#include <unistd.h>
+#include <unordered_set>
+
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based_table_factory.h"
+#include "table/plain_table_factory.h"
+#include "util/hash.h"
+#include "util/hash_linklist_rep.h"
+#include "utilities/merge_operators.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/statistics.h"
+#include "util/testharness.h"
+#include "util/sync_point.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+static bool SnappyCompressionSupported(const CompressionOptions& options) {
+  std::string out;
+  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+  return port::Snappy_Compress(options, in.data(), in.size(), &out);
+}
+
+static bool ZlibCompressionSupported(const CompressionOptions& options) {
+  std::string out;
+  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+  return port::Zlib_Compress(options, in.data(), in.size(), &out);
+}
+
+static bool BZip2CompressionSupported(const CompressionOptions& options) {
+  std::string out;
+  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+  return port::BZip2_Compress(options, in.data(), in.size(), &out);
+}
+
+static bool LZ4CompressionSupported(const CompressionOptions &options) {
+  std::string out;
+  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+  return port::LZ4_Compress(options, in.data(), in.size(), &out);
+}
+
+static bool LZ4HCCompressionSupported(const CompressionOptions &options) {
+  std::string out;
+  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+  return port::LZ4HC_Compress(options, in.data(), in.size(), &out);
+}
+
+static std::string RandomString(Random *rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+namespace anon {
+class AtomicCounter {
+ private:
+  port::Mutex mu_;
+  int count_;
+ public:
+  AtomicCounter() : count_(0) { }
+  void Increment() {
+    MutexLock l(&mu_);
+    count_++;
+  }
+  int Read() {
+    MutexLock l(&mu_);
+    return count_;
+  }
+  void Reset() {
+    MutexLock l(&mu_);
+    count_ = 0;
+  }
+};
+
+}
+
+// Special Env used to delay background operations
// Env wrapper used by the tests to inject failures and count operations.
// Each fault is toggled by storing a non-null value into the corresponding
// AtomicPointer; tests flip them at specific points to simulate disk
// errors without touching the real filesystem behavior otherwise.
class SpecialEnv : public EnvWrapper {
 public:
  // sstable Sync() calls are blocked while this pointer is non-nullptr.
  port::AtomicPointer delay_sstable_sync_;

  // Simulate no-space errors while this pointer is non-nullptr.
  port::AtomicPointer no_space_;

  // Simulate non-writable file system while this pointer is non-nullptr
  port::AtomicPointer non_writable_;

  // Force sync of manifest files to fail while this pointer is non-nullptr
  port::AtomicPointer manifest_sync_error_;

  // Force write to manifest files to fail while this pointer is non-nullptr
  port::AtomicPointer manifest_write_error_;

  // Force write to log files to fail while this pointer is non-nullptr
  port::AtomicPointer log_write_error_;

  // When true, NewRandomAccessFile wraps files to count Read() calls.
  bool count_random_reads_;
  anon::AtomicCounter random_read_counter_;

  // When true, NewSequentialFile wraps files to count Read() calls.
  bool count_sequential_reads_;
  anon::AtomicCounter sequential_read_counter_;

  // Counts SleepForMicroseconds() calls.
  anon::AtomicCounter sleep_counter_;

  // All fault toggles start cleared: no failures are injected by default.
  explicit SpecialEnv(Env* base) : EnvWrapper(base) {
    delay_sstable_sync_.Release_Store(nullptr);
    no_space_.Release_Store(nullptr);
    non_writable_.Release_Store(nullptr);
    count_random_reads_ = false;
    count_sequential_reads_ = false;
    manifest_sync_error_.Release_Store(nullptr);
    manifest_write_error_.Release_Store(nullptr);
    log_write_error_.Release_Store(nullptr);
   }

  // Wraps the real writable file in a fault-injecting proxy chosen by file
  // name: "*.sst" (no-space / delayed sync), "MANIFEST*" (write/sync
  // errors), or "*log*" (write errors).
  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
                         const EnvOptions& soptions) {
    // Proxy for sstable files: honors no_space_ and delay_sstable_sync_.
    class SSTableFile : public WritableFile {
     private:
      SpecialEnv* env_;
      unique_ptr<WritableFile> base_;

     public:
      SSTableFile(SpecialEnv* env, unique_ptr<WritableFile>&& base)
          : env_(env),
            base_(std::move(base)) {
      }
      Status Append(const Slice& data) {
        if (env_->no_space_.Acquire_Load() != nullptr) {
          // Drop writes on the floor
          return Status::OK();
        } else {
          return base_->Append(data);
        }
      }
      Status Close() { return base_->Close(); }
      Status Flush() { return base_->Flush(); }
      Status Sync() {
        // Spin (sleeping) until the test releases delay_sstable_sync_.
        while (env_->delay_sstable_sync_.Acquire_Load() != nullptr) {
          env_->SleepForMicroseconds(100000);
        }
        return base_->Sync();
      }
    };
    // Proxy for MANIFEST files: honors manifest_write_error_ and
    // manifest_sync_error_.
    class ManifestFile : public WritableFile {
     private:
      SpecialEnv* env_;
      unique_ptr<WritableFile> base_;
     public:
      ManifestFile(SpecialEnv* env, unique_ptr<WritableFile>&& b)
          : env_(env), base_(std::move(b)) { }
      Status Append(const Slice& data) {
        if (env_->manifest_write_error_.Acquire_Load() != nullptr) {
          return Status::IOError("simulated writer error");
        } else {
          return base_->Append(data);
        }
      }
      Status Close() { return base_->Close(); }
      Status Flush() { return base_->Flush(); }
      Status Sync() {
        if (env_->manifest_sync_error_.Acquire_Load() != nullptr) {
          return Status::IOError("simulated sync error");
        } else {
          return base_->Sync();
        }
      }
    };
    // Proxy for WAL files: honors log_write_error_.
    class LogFile : public WritableFile {
     private:
      SpecialEnv* env_;
      unique_ptr<WritableFile> base_;
     public:
      LogFile(SpecialEnv* env, unique_ptr<WritableFile>&& b)
          : env_(env), base_(std::move(b)) { }
      Status Append(const Slice& data) {
        if (env_->log_write_error_.Acquire_Load() != nullptr) {
          return Status::IOError("simulated writer error");
        } else {
          return base_->Append(data);
        }
      }
      Status Close() { return base_->Close(); }
      Status Flush() { return base_->Flush(); }
      Status Sync() { return base_->Sync(); }
    };

    if (non_writable_.Acquire_Load() != nullptr) {
      return Status::IOError("simulated write error");
    }

    Status s = target()->NewWritableFile(f, r, soptions);
    if (s.ok()) {
      // Substring match on the file name decides which proxy (if any)
      // wraps the real file.
      if (strstr(f.c_str(), ".sst") != nullptr) {
        r->reset(new SSTableFile(this, std::move(*r)));
      } else if (strstr(f.c_str(), "MANIFEST") != nullptr) {
        r->reset(new ManifestFile(this, std::move(*r)));
      } else if (strstr(f.c_str(), "log") != nullptr) {
        r->reset(new LogFile(this, std::move(*r)));
      }
    }
    return s;
  }

  // Optionally wraps random-access files so tests can count Read() calls
  // (enabled via count_random_reads_).
  Status NewRandomAccessFile(const std::string& f,
                             unique_ptr<RandomAccessFile>* r,
                             const EnvOptions& soptions) {
    class CountingFile : public RandomAccessFile {
     private:
      unique_ptr<RandomAccessFile> target_;
      anon::AtomicCounter* counter_;
     public:
      CountingFile(unique_ptr<RandomAccessFile>&& target,
                   anon::AtomicCounter* counter)
          : target_(std::move(target)), counter_(counter) {
      }
      virtual Status Read(uint64_t offset, size_t n, Slice* result,
                          char* scratch) const {
        counter_->Increment();
        return target_->Read(offset, n, result, scratch);
      }
    };

    Status s = target()->NewRandomAccessFile(f, r, soptions);
    if (s.ok() && count_random_reads_) {
      r->reset(new CountingFile(std::move(*r), &random_read_counter_));
    }
    return s;
  }

  // Optionally wraps sequential files so tests can count Read() calls
  // (enabled via count_sequential_reads_).
  Status NewSequentialFile(const std::string& f, unique_ptr<SequentialFile>* r,
                           const EnvOptions& soptions) {
    class CountingFile : public SequentialFile {
     private:
      unique_ptr<SequentialFile> target_;
      anon::AtomicCounter* counter_;

     public:
      CountingFile(unique_ptr<SequentialFile>&& target,
                   anon::AtomicCounter* counter)
          : target_(std::move(target)), counter_(counter) {}
      virtual Status Read(size_t n, Slice* result, char* scratch) {
        counter_->Increment();
        return target_->Read(n, result, scratch);
      }
      virtual Status Skip(uint64_t n) { return target_->Skip(n); }
    };

    Status s = target()->NewSequentialFile(f, r, soptions);
    if (s.ok() && count_sequential_reads_) {
      r->reset(new CountingFile(std::move(*r), &sequential_read_counter_));
    }
    return s;
  }

  // Counts every sleep before delegating to the real Env.
  virtual void SleepForMicroseconds(int micros) {
    sleep_counter_.Increment();
    target()->SleepForMicroseconds(micros);
  }
};
+
// Test fixture: owns a scratch database (re)opened under a sequence of
// option configurations so each test can run against many DB variants.
class DBTest {
 private:
  const FilterPolicy* filter_policy_;  // owned; released in ~DBTest()

 protected:
  // Sequence of option configurations to try
  enum OptionConfig {
    kBlockBasedTableWithWholeKeyHashIndex,
    kDefault,
    kBlockBasedTableWithPrefixHashIndex,
    kPlainTableFirstBytePrefix,
    kPlainTableAllBytesPrefix,
    kVectorRep,
    kHashLinkList,
    kHashCuckoo,
    kMergePut,
    kFilter,
    kUncompressed,
    kNumLevel_3,
    kDBLogDir,
    kWalDir,
    kManifestFileSize,
    kCompactOnFlush,
    kPerfOptions,
    kDeletesFilterFirst,
    kHashSkipList,
    kUniversalCompaction,
    kCompressedBlockCache,
    kInfiniteMaxOpenFiles,
    kxxHashChecksum,
    kEnd
  };
  // Current position in the OptionConfig sequence (see ChangeOptions()).
  int option_config_;

 public:
  std::string dbname_;                         // on-disk path of the test DB
  SpecialEnv* env_;                            // owned fault-injecting Env
  DB* db_;                                     // currently open DB (owned)
  std::vector<ColumnFamilyHandle*> handles_;   // owned CF handles

  // Options the DB was most recently opened with (used by Destroy/Reopen).
  Options last_options_;

  // Skip some options, as they may not be applicable to a specific test.
  // To add more skip constants, use values 4, 8, 16, etc.
  enum OptionSkip {
    kNoSkip = 0,
    kSkipDeletesFilterFirst = 1,
    kSkipUniversalCompaction = 2,
    kSkipMergePut = 4,
    kSkipPlainTable = 8,
    kSkipHashIndex = 16,
    kSkipNoSeekToLast = 32,
    kSkipHashCuckoo = 64
  };
+
  // Set up a fresh database under the test tmp dir, wrapped in SpecialEnv,
  // starting from the kDefault option configuration.
  DBTest() : option_config_(kDefault),
             env_(new SpecialEnv(Env::Default())) {
    filter_policy_ = NewBloomFilterPolicy(10);
    dbname_ = test::TmpDir() + "/db_test";
    // Wipe any residue from a previous run before the first open.
    ASSERT_OK(DestroyDB(dbname_, Options()));
    db_ = nullptr;
    Reopen();
  }
+
  // Tear down: close the DB, wipe it from disk, and release the env and
  // filter policy owned by the fixture.
  ~DBTest() {
    Close();
    ASSERT_OK(DestroyDB(dbname_, Options()));
    delete env_;
    delete filter_policy_;
  }
+
+  // Switch to a fresh database with the next option configuration to
+  // test.  Return false if there are no more configurations to test.
+  bool ChangeOptions(int skip_mask = kNoSkip) {
+    for(option_config_++; option_config_ < kEnd; option_config_++) {
+      if ((skip_mask & kSkipDeletesFilterFirst) &&
+          option_config_ == kDeletesFilterFirst) {
+        continue;
+      }
+      if ((skip_mask & kSkipUniversalCompaction) &&
+          option_config_ == kUniversalCompaction) {
+        continue;
+      }
+      if ((skip_mask & kSkipMergePut) && option_config_ == kMergePut) {
+        continue;
+      }
+      if ((skip_mask & kSkipNoSeekToLast) &&
+          (option_config_ == kHashLinkList ||
+           option_config_ == kHashSkipList)) {;
+        continue;
+      }
+      if ((skip_mask & kSkipPlainTable)
+          && (option_config_ == kPlainTableAllBytesPrefix
+              || option_config_ == kPlainTableFirstBytePrefix)) {
+        continue;
+      }
+      if ((skip_mask & kSkipPlainTable) &&
+          (option_config_ == kBlockBasedTableWithPrefixHashIndex ||
+           option_config_ == kBlockBasedTableWithWholeKeyHashIndex)) {
+        continue;
+      }
+      if ((skip_mask & kSkipHashCuckoo) && (option_config_ == kHashCuckoo)) {
+        continue;
+      }
+      break;
+    }
+
+    if (option_config_ >= kEnd) {
+      Destroy(&last_options_);
+      return false;
+    } else {
+      DestroyAndReopen();
+      return true;
+    }
+  }
+
+  // Switch between different compaction styles (we have only 2 now).
+  bool ChangeCompactOptions(Options* prev_options = nullptr) {
+    if (option_config_ == kDefault) {
+      option_config_ = kUniversalCompaction;
+      if (prev_options == nullptr) {
+        prev_options = &last_options_;
+      }
+      Destroy(prev_options);
+      TryReopen();
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  // Return the current option configuration.
+  Options CurrentOptions() {
+    Options options;
+    return CurrentOptions(options);
+  }
+
+  Options CurrentOptions(const Options& defaultOptions) {
+    // this redudant copy is to minimize code change w/o having lint error.
+    Options options = defaultOptions;
+    switch (option_config_) {
+      case kHashSkipList:
+        options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+        options.memtable_factory.reset(NewHashSkipListRepFactory());
+        break;
+      case kPlainTableFirstBytePrefix:
+        options.table_factory.reset(new PlainTableFactory());
+        options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+        options.allow_mmap_reads = true;
+        options.max_sequential_skip_in_iterations = 999999;
+        break;
+      case kPlainTableAllBytesPrefix:
+        options.table_factory.reset(new PlainTableFactory());
+        options.prefix_extractor.reset(NewNoopTransform());
+        options.allow_mmap_reads = true;
+        options.max_sequential_skip_in_iterations = 999999;
+        break;
+      case kMergePut:
+        options.merge_operator = MergeOperators::CreatePutOperator();
+        break;
+      case kFilter:
+        options.filter_policy = filter_policy_;
+        break;
+      case kUncompressed:
+        options.compression = kNoCompression;
+        break;
+      case kNumLevel_3:
+        options.num_levels = 3;
+        break;
+      case kDBLogDir:
+        options.db_log_dir = test::TmpDir();
+        break;
+      case kWalDir:
+        options.wal_dir = "/tmp/wal";
+        break;
+      case kManifestFileSize:
+        options.max_manifest_file_size = 50; // 50 bytes
+      case kCompactOnFlush:
+        options.purge_redundant_kvs_while_flush =
+          !options.purge_redundant_kvs_while_flush;
+        break;
+      case kPerfOptions:
+        options.hard_rate_limit = 2.0;
+        options.rate_limit_delay_max_milliseconds = 2;
+        // TODO -- test more options
+        break;
+      case kDeletesFilterFirst:
+        options.filter_deletes = true;
+        break;
+      case kVectorRep:
+        options.memtable_factory.reset(new VectorRepFactory(100));
+        break;
+      case kHashLinkList:
+        options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+        options.memtable_factory.reset(NewHashLinkListRepFactory(4));
+        break;
+      case kHashCuckoo:
+        options.memtable_factory.reset(
+            NewHashCuckooRepFactory(options.write_buffer_size));
+        break;
+      case kUniversalCompaction:
+        options.compaction_style = kCompactionStyleUniversal;
+        break;
+      case kCompressedBlockCache:
+        options.allow_mmap_writes = true;
+        options.block_cache_compressed = NewLRUCache(8*1024*1024);
+        break;
+      case kInfiniteMaxOpenFiles:
+        options.max_open_files = -1;
+        break;
+      case kxxHashChecksum: {
+        BlockBasedTableOptions table_options;
+        table_options.checksum = kxxHash;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      }
+      case kBlockBasedTableWithPrefixHashIndex: {
+        BlockBasedTableOptions table_options;
+        table_options.index_type = BlockBasedTableOptions::kHashSearch;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+        break;
+      }
+      case kBlockBasedTableWithWholeKeyHashIndex: {
+        BlockBasedTableOptions table_options;
+        table_options.index_type = BlockBasedTableOptions::kHashSearch;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        options.prefix_extractor.reset(NewNoopTransform());
+        break;
+      }
+      default:
+        break;
+    }
+    return options;
+  }
+
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  void CreateColumnFamilies(const std::vector<std::string>& cfs,
+                            const ColumnFamilyOptions* options = nullptr) {
+    ColumnFamilyOptions cf_opts;
+    if (options != nullptr) {
+      cf_opts = ColumnFamilyOptions(*options);
+    } else {
+      cf_opts = ColumnFamilyOptions(CurrentOptions());
+    }
+    int cfi = handles_.size();
+    handles_.resize(cfi + cfs.size());
+    for (auto cf : cfs) {
+      ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+    }
+  }
+
+  void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+                             const Options* options = nullptr) {
+    CreateColumnFamilies(cfs, options);
+    std::vector<std::string> cfs_plus_default = cfs;
+    cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+    ReopenWithColumnFamilies(cfs_plus_default, options);
+  }
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const std::vector<const Options*>& options) {
+    ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  }
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const Options* options = nullptr) {
+    ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  }
+
+  Status TryReopenWithColumnFamilies(
+      const std::vector<std::string>& cfs,
+      const std::vector<const Options*>& options) {
+    Close();
+    ASSERT_EQ(cfs.size(), options.size());
+    std::vector<ColumnFamilyDescriptor> column_families;
+    for (size_t i = 0; i < cfs.size(); ++i) {
+      column_families.push_back(ColumnFamilyDescriptor(cfs[i], *options[i]));
+    }
+    DBOptions db_opts = DBOptions(*options[0]);
+    return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+  }
+
+  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                     const Options* options = nullptr) {
+    Close();
+    Options opts = (options == nullptr) ? CurrentOptions() : *options;
+    std::vector<const Options*> v_opts(cfs.size(), &opts);
+    return TryReopenWithColumnFamilies(cfs, v_opts);
+  }
+
+  void Reopen(Options* options = nullptr) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Close() {
+    for (auto h : handles_) {
+      delete h;
+    }
+    handles_.clear();
+    delete db_;
+    db_ = nullptr;
+  }
+
+  void DestroyAndReopen(Options* options = nullptr) {
+    //Destroy using last options
+    Destroy(&last_options_);
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Destroy(Options* options) {
+    Close();
+    ASSERT_OK(DestroyDB(dbname_, *options));
+  }
+
+  Status ReadOnlyReopen(Options* options) {
+    return DB::OpenForReadOnly(*options, dbname_, &db_);
+  }
+
+  Status TryReopen(Options* options = nullptr) {
+    Close();
+    Options opts;
+    if (options != nullptr) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    last_options_ = opts;
+
+    return DB::Open(opts, dbname_, &db_);
+  }
+
+  Status Flush(int cf = 0) {
+    if (cf == 0) {
+      return db_->Flush(FlushOptions());
+    } else {
+      return db_->Flush(FlushOptions(), handles_[cf]);
+    }
+  }
+
+  Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) {
+    if (kMergePut == option_config_ ) {
+      return db_->Merge(wo, k, v);
+    } else {
+      return db_->Put(wo, k, v);
+    }
+  }
+
+  Status Put(int cf, const Slice& k, const Slice& v,
+             WriteOptions wo = WriteOptions()) {
+    if (kMergePut == option_config_) {
+      return db_->Merge(wo, handles_[cf], k, v);
+    } else {
+      return db_->Put(wo, handles_[cf], k, v);
+    }
+  }
+
+  Status Delete(const std::string& k) {
+    return db_->Delete(WriteOptions(), k);
+  }
+
+  Status Delete(int cf, const std::string& k) {
+    return db_->Delete(WriteOptions(), handles_[cf], k);
+  }
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  std::string Get(int cf, const std::string& k,
+                  const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, handles_[cf], k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  // Return a string that contains all key,value pairs in order,
+  // formatted like "(k1->v1)(k2->v2)".
+  std::string Contents(int cf = 0) {
+    std::vector<std::string> forward;
+    std::string result;
+    Iterator* iter = (cf == 0) ? db_->NewIterator(ReadOptions())
+                               : db_->NewIterator(ReadOptions(), handles_[cf]);
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      std::string s = IterStatus(iter);
+      result.push_back('(');
+      result.append(s);
+      result.push_back(')');
+      forward.push_back(s);
+    }
+
+    // Check reverse iteration results are the reverse of forward results
+    unsigned int matched = 0;
+    for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+      ASSERT_LT(matched, forward.size());
+      ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
+      matched++;
+    }
+    ASSERT_EQ(matched, forward.size());
+
+    delete iter;
+    return result;
+  }
+
  // Return every internal entry recorded for `user_key` (newest first, as
  // the internal iterator orders them), formatted like "[ v1, DEL, v2 ]".
  // Merge entries print their operand; deletions print "DEL".
  std::string AllEntriesFor(const Slice& user_key, int cf = 0) {
    Iterator* iter;
    if (cf == 0) {
      iter = dbfull()->TEST_NewInternalIterator();
    } else {
      iter = dbfull()->TEST_NewInternalIterator(handles_[cf]);
    }
    // Seek to the newest possible version of user_key.
    InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
    iter->Seek(target.Encode());
    std::string result;
    if (!iter->status().ok()) {
      result = iter->status().ToString();
    } else {
      result = "[ ";
      bool first = true;
      while (iter->Valid()) {
        ParsedInternalKey ikey(Slice(), 0, kTypeValue);
        if (!ParseInternalKey(iter->key(), &ikey)) {
          result += "CORRUPTED";
        } else {
          // Stop once we move past the requested user key.
          if (last_options_.comparator->Compare(ikey.user_key, user_key) != 0) {
            break;
          }
          if (!first) {
            result += ", ";
          }
          first = false;
          switch (ikey.type) {
            case kTypeValue:
              result += iter->value().ToString();
              break;
            case kTypeMerge:
              // keep it the same as kTypeValue for testing kMergePut
              result += iter->value().ToString();
              break;
            case kTypeDeletion:
              result += "DEL";
              break;
            default:
              assert(false);
              break;
          }
        }
        iter->Next();
      }
      if (!first) {
        result += " ";
      }
      result += "]";
    }
    delete iter;
    return result;
  }
+
+  int NumTableFilesAtLevel(int level, int cf = 0) {
+    std::string property;
+    if (cf == 0) {
+      // default cfd
+      ASSERT_TRUE(db_->GetProperty(
+          "rocksdb.num-files-at-level" + NumberToString(level), &property));
+    } else {
+      ASSERT_TRUE(db_->GetProperty(
+          handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level),
+          &property));
+    }
+    return atoi(property.c_str());
+  }
+
+  int TotalTableFiles(int cf = 0, int levels = -1) {
+    if (levels == -1) {
+      levels = CurrentOptions().num_levels;
+    }
+    int result = 0;
+    for (int level = 0; level < levels; level++) {
+      result += NumTableFilesAtLevel(level, cf);
+    }
+    return result;
+  }
+
+  // Return spread of files per level
+  std::string FilesPerLevel(int cf = 0) {
+    int num_levels =
+        (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+    std::string result;
+    int last_non_zero_offset = 0;
+    for (int level = 0; level < num_levels; level++) {
+      int f = NumTableFilesAtLevel(level, cf);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+
+  int CountFiles() {
+    std::vector<std::string> files;
+    env_->GetChildren(dbname_, &files);
+
+    std::vector<std::string> logfiles;
+    if (dbname_ != last_options_.wal_dir) {
+      env_->GetChildren(last_options_.wal_dir, &logfiles);
+    }
+
+    return static_cast<int>(files.size() + logfiles.size());
+  }
+
+  int CountLiveFiles() {
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    return metadata.size();
+  }
+
+  uint64_t Size(const Slice& start, const Slice& limit, int cf = 0) {
+    Range r(start, limit);
+    uint64_t size;
+    if (cf == 0) {
+      db_->GetApproximateSizes(&r, 1, &size);
+    } else {
+      db_->GetApproximateSizes(handles_[1], &r, 1, &size);
+    }
+    return size;
+  }
+
+  void Compact(int cf, const Slice& start, const Slice& limit) {
+    ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit));
+  }
+
+  void Compact(const Slice& start, const Slice& limit) {
+    ASSERT_OK(db_->CompactRange(&start, &limit));
+  }
+
+  // Do n memtable compactions, each of which produces an sstable
+  // covering the range [small,large].
+  void MakeTables(int n, const std::string& small, const std::string& large,
+                  int cf = 0) {
+    for (int i = 0; i < n; i++) {
+      ASSERT_OK(Put(cf, small, "begin"));
+      ASSERT_OK(Put(cf, large, "end"));
+      ASSERT_OK(Flush(cf));
+    }
+  }
+
+  // Prevent pushing of new sstables into deeper levels by adding
+  // tables that cover a specified range to all levels.
+  void FillLevels(const std::string& smallest, const std::string& largest,
+                  int cf) {
+    MakeTables(db_->NumberLevels(handles_[cf]), smallest, largest, cf);
+  }
+
+  void DumpFileCounts(const char* label) {
+    fprintf(stderr, "---\n%s:\n", label);
+    fprintf(stderr, "maxoverlap: %lld\n",
+            static_cast<long long>(
+                dbfull()->TEST_MaxNextLevelOverlappingBytes()));
+    for (int level = 0; level < db_->NumberLevels(); level++) {
+      int num = NumTableFilesAtLevel(level);
+      if (num > 0) {
+        fprintf(stderr, "  level %3d : %d files\n", level, num);
+      }
+    }
+  }
+
+  // Returns the DB's "rocksdb.sstables" property string (human-readable
+  // list of SST files per level).
+  std::string DumpSSTableList() {
+    std::string property;
+    db_->GetProperty("rocksdb.sstables", &property);
+    return property;
+  }
+
+  // Renders the iterator's current position as "key->value", or
+  // "(invalid)" when the iterator is not positioned on an entry.
+  std::string IterStatus(Iterator* iter) {
+    std::string result;
+    if (iter->Valid()) {
+      result = iter->key().ToString() + "->" + iter->value().ToString();
+    } else {
+      result = "(invalid)";
+    }
+    return result;
+  }
+
+  // Options used by the transaction-log-iterator tests: WAL files are
+  // retained (WAL_ttl_seconds) so GetUpdatesSince can read them back.
+  Options OptionsForLogIterTest() {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.WAL_ttl_seconds = 1000;
+    return options;
+  }
+
+  // Opens a WAL iterator starting at sequence number `seq` and asserts
+  // that it opened successfully and points at a valid batch.
+  std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
+      const SequenceNumber seq) {
+    unique_ptr<TransactionLogIterator> iter;
+    Status status = dbfull()->GetUpdatesSince(seq, &iter);
+    ASSERT_OK(status);
+    ASSERT_TRUE(iter->Valid());
+    return std::move(iter);
+  }
+
+  // Returns a filler string of `len` copies of character `c`.
+  std::string DummyString(size_t len, char c = 'a') {
+    return std::string(len, c);
+  }
+
+  // Asserts that SeekToLast() on a fresh iterator over column family `cf`
+  // lands on `expected_key` ("key->value" form, or "(invalid)").
+  void VerifyIterLast(std::string expected_key, int cf = 0) {
+    Iterator* iter;
+    ReadOptions ro;
+    if (cf == 0) {
+      iter = db_->NewIterator(ro);
+    } else {
+      iter = db_->NewIterator(ro, handles_[cf]);
+    }
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), expected_key);
+    delete iter;
+  }
+
+  // Used to test InplaceUpdate
+
+  // If previous value is nullptr or delta is > than previous value,
+  //   sets newValue with delta
+  // If previous value is not empty,
+  //   updates previous value with 'b' string of previous value size - 1.
+  // (UpdateStatus::UPDATED_INPLACE tells the memtable the existing slot
+  // was rewritten with a smaller payload.)
+  static UpdateStatus
+      updateInPlaceSmallerSize(char* prevValue, uint32_t* prevSize,
+                               Slice delta, std::string* newValue) {
+    if (prevValue == nullptr) {
+      *newValue = std::string(delta.size(), 'c');
+      return UpdateStatus::UPDATED;
+    } else {
+      *prevSize = *prevSize - 1;
+      std::string str_b = std::string(*prevSize, 'b');
+      memcpy(prevValue, str_b.c_str(), str_b.size());
+      return UpdateStatus::UPDATED_INPLACE;
+    }
+  }
+
+  // Same as updateInPlaceSmallerSize, but shrinks the existing value to a
+  // single byte, exercising a smaller varint length encoding in place.
+  static UpdateStatus
+      updateInPlaceSmallerVarintSize(char* prevValue, uint32_t* prevSize,
+                                     Slice delta, std::string* newValue) {
+    if (prevValue == nullptr) {
+      *newValue = std::string(delta.size(), 'c');
+      return UpdateStatus::UPDATED;
+    } else {
+      *prevSize = 1;
+      std::string str_b = std::string(*prevSize, 'b');
+      memcpy(prevValue, str_b.c_str(), str_b.size());
+      return UpdateStatus::UPDATED_INPLACE;
+    }
+  }
+
+  // Always produces a new (larger) value of delta.size() 'c' characters;
+  // never updates in place.
+  static UpdateStatus
+      updateInPlaceLargerSize(char* prevValue, uint32_t* prevSize,
+                              Slice delta, std::string* newValue) {
+    *newValue = std::string(delta.size(), 'c');
+    return UpdateStatus::UPDATED;
+  }
+
+  // Callback that rejects every update (UPDATE_FAILED), so the Put is
+  // dropped entirely.
+  static UpdateStatus
+      updateInPlaceNoAction(char* prevValue, uint32_t* prevSize,
+                            Slice delta, std::string* newValue) {
+    return UpdateStatus::UPDATE_FAILED;
+  }
+
+  // Utility method to test InplaceUpdate
+  // Walks the raw internal iterator and asserts that exactly `numValues`
+  // internal entries exist, with sequence numbers descending from
+  // numValues down to 1 (i.e. no duplicate versions were left behind).
+  void validateNumberOfEntries(int numValues, int cf = 0) {
+    Iterator* iter;
+    if (cf != 0) {
+      iter = dbfull()->TEST_NewInternalIterator(handles_[cf]);
+    } else {
+      iter = dbfull()->TEST_NewInternalIterator();
+    }
+    iter->SeekToFirst();
+    ASSERT_EQ(iter->status().ok(), true);
+    int seq = numValues;
+    while (iter->Valid()) {
+      ParsedInternalKey ikey;
+      // Poison value; overwritten by ParseInternalKey on success.
+      ikey.sequence = -1;
+      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+
+      // checks sequence number for updates
+      ASSERT_EQ(ikey.sequence, (unsigned)seq--);
+      iter->Next();
+    }
+    delete iter;
+    // seq reaches 0 only if we saw exactly numValues entries.
+    ASSERT_EQ(0, seq);
+  }
+
+  // Copies `size` bytes (default 0 = whole file) from `source` to
+  // `destination` through the Env abstraction, 4KB at a time.
+  // NOTE(review): if Read() ever returned an empty slice before `size`
+  // bytes were consumed this loop would not terminate — presumably fine
+  // for the test Envs used here; confirm if reused elsewhere.
+  void CopyFile(const std::string& source, const std::string& destination,
+                uint64_t size = 0) {
+    const EnvOptions soptions;
+    unique_ptr<SequentialFile> srcfile;
+    ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+    unique_ptr<WritableFile> destfile;
+    ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+    if (size == 0) {
+      // default argument means copy everything
+      ASSERT_OK(env_->GetFileSize(source, &size));
+    }
+
+    char buffer[4096];
+    Slice slice;
+    while (size > 0) {
+      uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+      ASSERT_OK(srcfile->Read(one, &slice, buffer));
+      ASSERT_OK(destfile->Append(slice));
+      size -= slice.size();
+    }
+    ASSERT_OK(destfile->Close());
+  }
+
+};
+
+// Formats i as a fixed-width key "key000000".."key999999" so that
+// lexicographic and numeric order agree.
+static std::string Key(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "key%06d", i);
+  return std::string(buf);
+}
+
+// Shorthand for reading a statistics ticker from the Options' statistics
+// object (assumes options.statistics is set).
+static long TestGetTickerCount(const Options& options, Tickers ticker_type) {
+  return options.statistics->getTickerCount(ticker_type);
+}
+
+// A helper function that ensures the table properties returned in
+// `GetPropertiesOfAllTablesTest` is correct.
+// This test assumes entries size is different for each of the tables.
+namespace {
+void VerifyTableProperties(DB* db, uint64_t expected_entries_size) {
+  TablePropertiesCollection props;
+  ASSERT_OK(db->GetPropertiesOfAllTables(&props));
+
+  // The caller created exactly 4 tables.
+  ASSERT_EQ(4U, props.size());
+  std::unordered_set<uint64_t> unique_entries;
+
+  // Indirect test
+  uint64_t sum = 0;
+  for (const auto& item : props) {
+    unique_entries.insert(item.second->num_entries);
+    sum += item.second->num_entries;
+  }
+
+  // All per-table entry counts are distinct, and their sum matches.
+  ASSERT_EQ(props.size(), unique_entries.size());
+  ASSERT_EQ(expected_entries_size, sum);
+}
+}  // namespace
+
+// Verifies the "rocksdb.num-entries-active-mem-table" property tracks the
+// active memtable's entry count across Puts and a flush triggered while
+// sstable syncs are blocked.
+TEST(DBTest, Empty) {
+  do {
+    Options options;
+    options.env = env_;
+    options.write_buffer_size = 100000;  // Small write buffer
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    std::string num;
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("0", num);
+
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("1", num);
+
+    env_->delay_sstable_sync_.Release_Store(env_);  // Block sync calls
+    Put(1, "k1", std::string(100000, 'x'));         // Fill memtable
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("2", num);
+
+    // The second big Put switches to a fresh memtable, so the count resets.
+    Put(1, "k2", std::string(100000, 'y'));         // Trigger compaction
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("1", num);
+
+    ASSERT_EQ("v1", Get(1, "foo"));
+    env_->delay_sstable_sync_.Release_Store(nullptr);   // Release sync calls
+  } while (ChangeOptions());
+}
+
+// Writes a few keys, reopens the DB in read-only mode, and verifies that
+// point reads and iteration see the latest versions (2 distinct keys).
+TEST(DBTest, ReadOnlyDB) {
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("bar", "v2"));
+  ASSERT_OK(Put("foo", "v3"));
+  Close();
+
+  Options options;
+  ASSERT_OK(ReadOnlyReopen(&options));
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    ++count;
+  }
+  ASSERT_EQ(count, 2);
+  delete iter;
+}
+
+// Make sure that when options.block_cache is set, after a new table is
+// created its index/filter blocks are added to block cache.
+TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
+  Options options = CurrentOptions();
+  std::unique_ptr<const FilterPolicy> filter_policy(NewBloomFilterPolicy(20));
+  options.filter_policy = filter_policy.get();
+  options.create_if_missing = true;
+  options.statistics = rocksdb::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  ASSERT_OK(Put(1, "key", "val"));
+  // Create a new table.
+  ASSERT_OK(Flush(1));
+
+  // index/filter blocks added to block cache right after table creation.
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(2, /* only index/filter were added */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+
+  // Make sure filter block is in cache.
+  std::string value;
+  ReadOptions ropt;
+  db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+
+  // Miss count should remain the same.
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+  db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+  // Make sure index block is in cache.
+  // NOTE(review): this section asserts on BLOCK_CACHE_FILTER_HIT even
+  // though the comments talk about the index block — Get() hits the filter
+  // too, so the counters still move, but BLOCK_CACHE_INDEX_HIT would be the
+  // direct check; confirm whether that ticker existed at this revision.
+  auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  value = Get(1, "key");
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(index_block_hit + 1,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+  value = Get(1, "key");
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(index_block_hit + 2,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+}
+
+// Creates 4 tables with distinct entry counts (10..13) and verifies
+// GetPropertiesOfAllTables returns correct data whether the tables are
+// read straight from disk, partially cached, or fully cached.
+TEST(DBTest, GetPropertiesOfAllTablesTest) {
+  Options options = CurrentOptions();
+  Reopen(&options);
+  // Create 4 tables
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      db_->Put(WriteOptions(), std::to_string(table * 100 + i), "val");
+    }
+    db_->Flush(FlushOptions());
+  }
+
+  // 1. Read table properties directly from file
+  Reopen(&options);
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+  // 2. Put two tables to table cache and
+  Reopen(&options);
+  // fetch key from 1st and 2nd table, which will internally place that table to
+  // the table cache.
+  for (int i = 0; i < 2; ++i) {
+    Get(std::to_string(i * 100 + 0));
+  }
+
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+  // 3. Put all tables to table cache
+  Reopen(&options);
+  // fetch key from 1st and 2nd table, which will internally place that table to
+  // the table cache.
+  for (int i = 0; i < 4; ++i) {
+    Get(std::to_string(i * 100 + 0));
+  }
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+}
+
+// Reopening a DB with fewer levels than data already occupies must fail
+// with InvalidArgument; reopening with more levels must succeed.
+TEST(DBTest, LevelLimitReopen) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  // Write 1MB values until at least one file lands in level 2.
+  const std::string value(1024 * 1024, ' ');
+  int i = 0;
+  while (NumTableFilesAtLevel(2, 1) == 0) {
+    ASSERT_OK(Put(1, Key(i++), value));
+  }
+
+  options.num_levels = 1;
+  options.max_bytes_for_level_multiplier_additional.resize(1, 1);
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &options);
+  ASSERT_EQ(s.IsInvalidArgument(), true);
+  ASSERT_EQ(s.ToString(),
+            "Invalid argument: db has more levels than options.num_levels");
+
+  options.num_levels = 10;
+  options.max_bytes_for_level_multiplier_additional.resize(10, 1);
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, &options));
+}
+
+// Checks WritableFile space preallocation bookkeeping: the allocated-block
+// high-water mark advances only as data is appended, in units of the
+// configured 1MB preallocation block size.
+TEST(DBTest, Preallocation) {
+  const std::string src = dbname_ + "/alloc_test";
+  unique_ptr<WritableFile> srcfile;
+  const EnvOptions soptions;
+  ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions));
+  srcfile->SetPreallocationBlockSize(1024 * 1024);
+
+  // No writes should mean no preallocation
+  size_t block_size, last_allocated_block;
+  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
+  ASSERT_EQ(last_allocated_block, 0UL);
+
+  // Small write should preallocate one block
+  srcfile->Append("test");
+  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
+  ASSERT_EQ(last_allocated_block, 1UL);
+
+  // Write an entire preallocation block, make sure we increased by two.
+  std::string buf(block_size, ' ');
+  srcfile->Append(buf);
+  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
+  ASSERT_EQ(last_allocated_block, 2UL);
+
+  // Write five more blocks at once, ensure we're where we need to be.
+  buf = std::string(block_size * 5, ' ');
+  srcfile->Append(buf);
+  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
+  ASSERT_EQ(last_allocated_block, 7UL);
+}
+
+// Basic put/overwrite/delete round trip in a non-default column family,
+// repeated across every option configuration.
+TEST(DBTest, PutDeleteGet) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ASSERT_EQ("v2", Get(1, "foo"));
+    ASSERT_OK(Delete(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+  } while (ChangeOptions());
+}
+
+
+// A key must remain readable while it lives in an immutable memtable
+// whose flush is blocked (sstable syncs stalled via the test Env).
+TEST(DBTest, GetFromImmutableLayer) {
+  do {
+    Options options;
+    options.env = env_;
+    options.write_buffer_size = 100000;  // Small write buffer
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+
+    env_->delay_sstable_sync_.Release_Store(env_);   // Block sync calls
+    Put(1, "k1", std::string(100000, 'x'));          // Fill memtable
+    Put(1, "k2", std::string(100000, 'y'));          // Trigger flush
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+    env_->delay_sstable_sync_.Release_Store(nullptr);   // Release sync calls
+  } while (ChangeOptions());
+}
+
+// A flushed key is readable from its SST file, and only in the column
+// family it was written to.
+TEST(DBTest, GetFromVersions) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+  } while (ChangeOptions());
+}
+
+// A snapshot must keep returning the old value after an overwrite, both
+// before and after a flush; exercised with a short and a long key.
+TEST(DBTest, GetSnapshot) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    // Try with both a short key and a long key
+    for (int i = 0; i < 2; i++) {
+      std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
+      ASSERT_OK(Put(1, key, "v1"));
+      const Snapshot* s1 = db_->GetSnapshot();
+      ASSERT_OK(Put(1, key, "v2"));
+      ASSERT_EQ("v2", Get(1, key));
+      ASSERT_EQ("v1", Get(1, key, s1));
+      ASSERT_OK(Flush(1));
+      ASSERT_EQ("v2", Get(1, key));
+      ASSERT_EQ("v1", Get(1, key, s1));
+      db_->ReleaseSnapshot(s1);
+    }
+    // skip as HashCuckooRep does not support snapshot
+  } while (ChangeOptions(kSkipHashCuckoo));
+}
+
+TEST(DBTest, GetLevel0Ordering) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    // Check that we process level-0 files in correct order.  The code
+    // below generates two level-0 files where the earlier one comes
+    // before the later one in the level-0 file list since the earlier
+    // one has a smaller "smallest" key.
+    ASSERT_OK(Put(1, "bar", "b"));
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ASSERT_OK(Flush(1));
+    // The newer file must win even though it sorts after the older one.
+    ASSERT_EQ("v2", Get(1, "foo"));
+  } while (ChangeOptions());
+}
+
+// A fresh memtable write must shadow an older version that a manual
+// compaction pushed to a deeper level, before and after flushing.
+TEST(DBTest, GetOrderedByLevels) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "foo", "v1"));
+    Compact(1, "a", "z");
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ASSERT_EQ("v2", Get(1, "foo"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ("v2", Get(1, "foo"));
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, GetPicksCorrectFile) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    // Arrange to have multiple files in a non-level-0 level.
+    ASSERT_OK(Put(1, "a", "va"));
+    Compact(1, "a", "b");
+    ASSERT_OK(Put(1, "x", "vx"));
+    Compact(1, "x", "y");
+    ASSERT_OK(Put(1, "f", "vf"));
+    Compact(1, "f", "g"));
+    ASSERT_EQ("va", Get(1, "a"));
+    ASSERT_EQ("vf", Get(1, "f"));
+    ASSERT_EQ("vx", Get(1, "x"));
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, GetEncountersEmptyLevel) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    // Arrange for the following to happen:
+    //   * sstable A in level 0
+    //   * nothing in level 1
+    //   * sstable B in level 2
+    // Then do enough Get() calls to arrange for an automatic compaction
+    // of sstable A.  A bug would cause the compaction to be marked as
+    // occurring at level 1 (instead of the correct level 0).
+
+    // Step 1: First place sstables in levels 0 and 2
+    int compaction_count = 0;
+    while (NumTableFilesAtLevel(0, 1) == 0 || NumTableFilesAtLevel(2, 1) == 0) {
+      ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2";
+      compaction_count++;
+      Put(1, "a", "begin");
+      Put(1, "z", "end");
+      ASSERT_OK(Flush(1));
+    }
+
+    // Step 2: clear level 1 if necessary.
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
+    ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+    ASSERT_EQ(NumTableFilesAtLevel(2, 1), 1);
+
+    // Step 3: read a bunch of times
+    for (int i = 0; i < 1000; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, "missing"));
+    }
+
+    // Step 4: Wait for compaction to finish
+    env_->SleepForMicroseconds(1000000);
+
+    // Seek-triggered compactions must not have moved the level-0 file.
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);  // XXX
+  } while (ChangeOptions(kSkipUniversalCompaction));
+}
+
+// KeyMayExist can lead to a few false positives, but not false negatives.
+// To make test deterministic, use a much larger number of bits per key-20 than
+// bits in the key, so that false positives are eliminated
+// Each phase also asserts KeyMayExist does no IO: no files opened, no new
+// blocks added to the block cache.
+TEST(DBTest, KeyMayExist) {
+  do {
+    ReadOptions ropts;
+    std::string value;
+    Options options = CurrentOptions();
+    options.filter_policy = NewBloomFilterPolicy(20);
+    options.statistics = rocksdb::CreateDBStatistics();
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+
+    ASSERT_OK(Put(1, "a", "b"));
+    bool value_found = false;
+    ASSERT_TRUE(
+        db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+    ASSERT_TRUE(value_found);
+    ASSERT_EQ("b", value);
+
+    ASSERT_OK(Flush(1));
+    value.clear();
+
+    long numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(
+        db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+    ASSERT_TRUE(!value_found);
+    // assert that no new files were opened and no new blocks were
+    // read into block cache.
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_OK(Delete(1, "a"));
+
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_OK(Flush(1));
+    db_->CompactRange(handles_[1], nullptr, nullptr);
+
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_OK(Delete(1, "c"));
+
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    // NOTE(review): filter_policy is raw-owned; an early ASSERT failure
+    // above would leak it. Fine for a test, but a unique_ptr (as in
+    // IndexAndFilterBlocksOfNewTableAddedToCache) would be safer.
+    delete options.filter_policy;
+
+    // KeyMayExist function only checks data in block caches, which is not used
+    // by plain table format.
+  } while (ChangeOptions(kSkipPlainTable | kSkipHashIndex));
+}
+
+// A kBlockCacheTier (non-blocking) iterator must see memtable data, must
+// return Incomplete without doing IO once data is flushed and uncached,
+// and must see the data again after a regular Get warms the block cache.
+TEST(DBTest, NonBlockingIteration) {
+  do {
+    ReadOptions non_blocking_opts, regular_opts;
+    Options options = CurrentOptions();
+    options.statistics = rocksdb::CreateDBStatistics();
+    non_blocking_opts.read_tier = kBlockCacheTier;
+    CreateAndReopenWithCF({"pikachu"}, &options);
+    // write one kv to the database.
+    ASSERT_OK(Put(1, "a", "b"));
+
+    // scan using non-blocking iterator. We should find it because
+    // it is in memtable.
+    Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]);
+    int count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      count++;
+    }
+    ASSERT_EQ(count, 1);
+    delete iter;
+
+    // flush memtable to storage. Now, the key should not be in the
+    // memtable neither in the block cache.
+    ASSERT_OK(Flush(1));
+
+    // verify that a non-blocking iterator does not find any
+    // kvs. Neither does it do any IOs to storage.
+    long numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    iter = db_->NewIterator(non_blocking_opts, handles_[1]);
+    count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      count++;
+    }
+    ASSERT_EQ(count, 0);
+    ASSERT_TRUE(iter->status().IsIncomplete());
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    delete iter;
+
+    // read in the specified block via a regular get
+    ASSERT_EQ(Get(1, "a"), "b");
+
+    // verify that we can find it via a non-blocking scan
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    iter = db_->NewIterator(non_blocking_opts, handles_[1]);
+    count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      count++;
+    }
+    ASSERT_EQ(count, 1);
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    delete iter;
+
+    // This test verifies block cache behaviors, which is not used by plain
+    // table format.
+    // Exclude kHashCuckoo as it does not support iteration currently
+  } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast |
+                         kSkipHashCuckoo));
+}
+
+// A delete is skipped for key if KeyMayExist(key) returns False
+// Tests Writebatch consistency and proper delete behaviour
+// (AllEntriesFor exposes whether a DEL tombstone was actually written.)
+TEST(DBTest, FilterDeletes) {
+  do {
+    Options options = CurrentOptions();
+    options.filter_policy = NewBloomFilterPolicy(20);
+    options.filter_deletes = true;
+    CreateAndReopenWithCF({"pikachu"}, &options);
+    WriteBatch batch;
+
+    batch.Delete(handles_[1], "a");
+    dbfull()->Write(WriteOptions(), &batch);
+    ASSERT_EQ(AllEntriesFor("a", 1), "[ ]");  // Delete skipped
+    batch.Clear();
+
+    batch.Put(handles_[1], "a", "b");
+    batch.Delete(handles_[1], "a");
+    dbfull()->Write(WriteOptions(), &batch);
+    ASSERT_EQ(Get(1, "a"), "NOT_FOUND");
+    ASSERT_EQ(AllEntriesFor("a", 1), "[ DEL, b ]");  // Delete issued
+    batch.Clear();
+
+    batch.Delete(handles_[1], "c");
+    batch.Put(handles_[1], "c", "d");
+    dbfull()->Write(WriteOptions(), &batch);
+    ASSERT_EQ(Get(1, "c"), "d");
+    ASSERT_EQ(AllEntriesFor("c", 1), "[ d ]");  // Delete skipped
+    batch.Clear();
+
+    ASSERT_OK(Flush(1));  // A stray Flush
+
+    batch.Delete(handles_[1], "c");
+    dbfull()->Write(WriteOptions(), &batch);
+    ASSERT_EQ(AllEntriesFor("c", 1), "[ DEL, d ]");  // Delete issued
+    batch.Clear();
+
+    delete options.filter_policy;
+  } while (ChangeCompactOptions());
+}
+
+
+// Regression check: Seek followed by Prev across flushed files must not
+// crash or misbehave (no value assertions — exercising the code path).
+TEST(DBTest, IterSeekBeforePrev) {
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Put("0", "f"));
+  ASSERT_OK(Put("1", "h"));
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Put("2", "j"));
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->Seek(Slice("c"));
+  iter->Prev();
+  iter->Seek(Slice("a"));
+  iter->Prev();
+  delete iter;
+}
+
+namespace {
+// Builds a key of `length` copies of byte `c` (used to exercise keys
+// longer than any inline-key optimization).
+std::string MakeLongKey(size_t length, char c) {
+  return std::string(length, c);
+}
+}  // namespace
+
+// Iterates over keys of various large lengths (20..127 bytes) spanning a
+// flushed file and the memtable; byte values 0..4 give the expected order.
+TEST(DBTest, IterLongKeys) {
+  ASSERT_OK(Put(MakeLongKey(20, 0), "0"));
+  ASSERT_OK(Put(MakeLongKey(32, 2), "2"));
+  ASSERT_OK(Put("a", "b"));
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Put(MakeLongKey(50, 1), "1"));
+  ASSERT_OK(Put(MakeLongKey(127, 3), "3"));
+  ASSERT_OK(Put(MakeLongKey(64, 4), "4"));
+  auto iter = db_->NewIterator(ReadOptions());
+
+  // NOTE(review): comment below looks copy-pasted from the *WithNewerSeq
+  // tests — no sequence-skipping happens here; this just walks the keys
+  // in order starting from the first long key.
+  // Create a key that needs to be skipped for Seq too new
+  iter->Seek(MakeLongKey(20, 0));
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4");
+  delete iter;
+
+  iter = db_->NewIterator(ReadOptions());
+  iter->Seek(MakeLongKey(50, 1));
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+  delete iter;
+}
+
+
+// Next() must silently skip a key ("b") whose only versions are newer than
+// the iterator's snapshot, even when the skip count exceeds
+// max_sequential_skip_in_iterations (forcing an internal reseek).
+TEST(DBTest, IterNextWithNewerSeq) {
+  ASSERT_OK(Put("0", "0"));
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  ASSERT_OK(Put("d", "e"));
+  auto iter = db_->NewIterator(ReadOptions());
+
+  // Create a key that needs to be skipped for Seq too new
+  for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+       i++) {
+    ASSERT_OK(Put("b", "f"));
+  }
+
+  iter->Seek(Slice("a"));
+  ASSERT_EQ(IterStatus(iter), "a->b");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "c->d");
+  delete iter;
+}
+
+// Same as IterNextWithNewerSeq but in the reverse direction: Prev() must
+// skip the too-new versions of "b" while walking d -> c -> a.
+TEST(DBTest, IterPrevWithNewerSeq) {
+  ASSERT_OK(Put("0", "0"));
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  ASSERT_OK(Put("d", "e"));
+  auto iter = db_->NewIterator(ReadOptions());
+
+  // Create a key that needs to be skipped for Seq too new
+  for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+       i++) {
+    ASSERT_OK(Put("b", "f"));
+  }
+
+  iter->Seek(Slice("d"));
+  ASSERT_EQ(IterStatus(iter), "d->e");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "c->d");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "a->b");
+
+  iter->Prev();
+  delete iter;
+}
+
+// Variant of IterPrevWithNewerSeq: the too-new versions of "b" are written
+// AFTER the iterator is already positioned on "c", then Prev() must still
+// skip them and land on "a".
+TEST(DBTest, IterPrevWithNewerSeq2) {
+  ASSERT_OK(Put("0", "0"));
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  ASSERT_OK(Put("d", "e"));
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->Seek(Slice("c"));
+  ASSERT_EQ(IterStatus(iter), "c->d");
+
+  // Create a key that needs to be skipped for Seq too new
+  for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+      i++) {
+    ASSERT_OK(Put("b", "f"));
+  }
+
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "a->b");
+
+  iter->Prev();
+  delete iter;
+}
+
+// On an empty column family, every positioning call leaves the iterator
+// invalid.
+TEST(DBTest, IterEmpty) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
+
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->Seek("foo");
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+// Exhaustive positioning checks on a one-entry column family: every
+// combination of SeekToFirst/SeekToLast/Seek with Next/Prev at the edges.
+TEST(DBTest, IterSingle) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "a", "va"));
+    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
+
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->Seek("");
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->Seek("a");
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    // Seek past the only key leaves the iterator invalid.
+    iter->Seek("b");
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+// Iterator behavior on a three-entry CF: forward/backward walks, seeks,
+// direction switches, and snapshot isolation (the open iterator must keep
+// seeing the original a/b/c values after later writes and a delete).
+TEST(DBTest, IterMulti) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "a", "va"));
+    ASSERT_OK(Put(1, "b", "vb"));
+    ASSERT_OK(Put(1, "c", "vc"));
+    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
+
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->Seek("");
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Seek("a");
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Seek("ax");
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+
+    iter->Seek("b");
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Seek("z");
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    // Switch from reverse to forward
+    iter->SeekToLast();
+    iter->Prev();
+    iter->Prev();
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+
+    // Switch from forward to reverse
+    iter->SeekToFirst();
+    iter->Next();
+    iter->Next();
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+
+    // Make sure iter stays at snapshot
+    ASSERT_OK(Put(1, "a", "va2"));
+    ASSERT_OK(Put(1, "a2", "va3"));
+    ASSERT_OK(Put(1, "b", "vb2"));
+    ASSERT_OK(Put(1, "c", "vc2"));
+    ASSERT_OK(Delete(1, "b"));
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+// Check that we can skip over a run of user keys
+// by using reseek rather than sequential scan
+TEST(DBTest, IterReseek) {
+  Options options = CurrentOptions();
+  // With the skip limit at 3, iterating past more than 3 hidden versions of
+  // one user key must trigger a reseek (tracked by the reseek ticker below).
+  options.max_sequential_skip_in_iterations = 3;
+  options.create_if_missing = true;
+  options.statistics = rocksdb::CreateDBStatistics();
+  DestroyAndReopen(&options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  // insert two keys with same userkey and verify that
+  // reseek is not invoked. For each of these test cases,
+  // verify that we can find the next key "b".
+  ASSERT_OK(Put(1, "a", "one"));
+  ASSERT_OK(Put(1, "a", "two"));
+  ASSERT_OK(Put(1, "b", "bone"));
+  Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToFirst();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "a->two");
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // insert a total of three keys with same userkey and verify
+  // that reseek is still not invoked.
+  ASSERT_OK(Put(1, "a", "three"));
+  iter = db_->NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->three");
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // insert a total of four keys with same userkey and verify
+  // that reseek is invoked.
+  ASSERT_OK(Put(1, "a", "four"));
+  iter = db_->NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  iter->Next();
+  // Four versions of "a" exceeds max_sequential_skip_in_iterations == 3.
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // Testing reverse iterator
+  // At this point, we have three versions of "a" and one version of "b".
+  // The reseek statistics is already at 1.
+  int num_reseeks =
+      (int)TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION);
+
+  // Insert another version of b and assert that reseek is not invoked
+  ASSERT_OK(Put(1, "b", "btwo"));
+  iter = db_->NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "b->btwo");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+            num_reseeks);
+  iter->Prev();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+            num_reseeks + 1);
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  delete iter;
+
+  // insert two more versions of b. This makes a total of 4 versions
+  // of b and 4 versions of a.
+  ASSERT_OK(Put(1, "b", "bthree"));
+  ASSERT_OK(Put(1, "b", "bfour"));
+  iter = db_->NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "b->bfour");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+            num_reseeks + 2);
+  iter->Prev();
+
+  // the previous Prev call should have invoked reseek
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+            num_reseeks + 3);
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  delete iter;
+}
+
+// Interleaves tiny values with ~100KB values and verifies a full forward
+// scan and a full backward scan both see every entry in order.
+TEST(DBTest, IterSmallAndLargeMix) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "a", "va"));
+    ASSERT_OK(Put(1, "b", std::string(100000, 'b')));
+    ASSERT_OK(Put(1, "c", "vc"));
+    ASSERT_OK(Put(1, "d", std::string(100000, 'd')));
+    ASSERT_OK(Put(1, "e", std::string(100000, 'e')));
+
+    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
+
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, IterMultiWithDelete) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "ka", "va"));
+    ASSERT_OK(Put(1, "kb", "vb"));
+    ASSERT_OK(Put(1, "kc", "vc"));
+    ASSERT_OK(Delete(1, "kb"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "kb"));
+
+    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
+    iter->Seek("kc");
+    ASSERT_EQ(IterStatus(iter), "kc->vc");
+    if (!CurrentOptions().merge_operator) {
+      // TODO: merge operator does not support backward iteration yet
+      if (kPlainTableAllBytesPrefix != option_config_&&
+          kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+          kHashLinkList != option_config_) {
+        iter->Prev();
+        ASSERT_EQ(IterStatus(iter), "ka->va");
+      }
+    }
+    delete iter;
+  } while (ChangeOptions());
+}
+
+// After deleting the current last key, SeekToLast()/Prev() must skip the
+// deleted tail (and its tombstones) and land on the new last live key.
+TEST(DBTest, IterPrevMaxSkip) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    // Write each key twice so every key has multiple versions to skip over.
+    for (int i = 0; i < 2; i++) {
+      ASSERT_OK(Put(1, "key1", "v1"));
+      ASSERT_OK(Put(1, "key2", "v2"));
+      ASSERT_OK(Put(1, "key3", "v3"));
+      ASSERT_OK(Put(1, "key4", "v4"));
+      ASSERT_OK(Put(1, "key5", "v5"));
+    }
+
+    VerifyIterLast("key5->v5", 1);
+
+    ASSERT_OK(Delete(1, "key5"));
+    VerifyIterLast("key4->v4", 1);
+
+    ASSERT_OK(Delete(1, "key4"));
+    VerifyIterLast("key3->v3", 1);
+
+    ASSERT_OK(Delete(1, "key3"));
+    VerifyIterLast("key2->v2", 1);
+
+    ASSERT_OK(Delete(1, "key2"));
+    VerifyIterLast("key1->v1", 1);
+
+    ASSERT_OK(Delete(1, "key1"));
+    VerifyIterLast("(invalid)", 1);
+  } while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast));
+}
+
+TEST(DBTest, IterWithSnapshot) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "key1", "val1"));
+    ASSERT_OK(Put(1, "key2", "val2"));
+    ASSERT_OK(Put(1, "key3", "val3"));
+    ASSERT_OK(Put(1, "key4", "val4"));
+    ASSERT_OK(Put(1, "key5", "val5"));
+
+    const Snapshot *snapshot = db_->GetSnapshot();
+    ReadOptions options;
+    options.snapshot = snapshot;
+    Iterator* iter = db_->NewIterator(options, handles_[1]);
+
+    // Put more values after the snapshot
+    ASSERT_OK(Put(1, "key100", "val100"));
+    ASSERT_OK(Put(1, "key101", "val101"));
+
+    iter->Seek("key5");
+    ASSERT_EQ(IterStatus(iter), "key5->val5");
+    if (!CurrentOptions().merge_operator) {
+      // TODO: merge operator does not support backward iteration yet
+      if (kPlainTableAllBytesPrefix != option_config_&&
+        kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+        kHashLinkList != option_config_) {
+        iter->Prev();
+        ASSERT_EQ(IterStatus(iter), "key4->val4");
+        iter->Prev();
+        ASSERT_EQ(IterStatus(iter), "key3->val3");
+
+        iter->Next();
+        ASSERT_EQ(IterStatus(iter), "key4->val4");
+        iter->Next();
+        ASSERT_EQ(IterStatus(iter), "key5->val5");
+      }
+      iter->Next();
+      ASSERT_TRUE(!iter->Valid());
+    }
+    db_->ReleaseSnapshot(snapshot);
+    delete iter;
+    // skip as HashCuckooRep does not support snapshot
+  } while (ChangeOptions(kSkipHashCuckoo));
+}
+
+// Basic WAL recovery: puts must survive repeated reopens of the column
+// families, and newer values must shadow older ones.
+TEST(DBTest, Recover) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "baz", "v5"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_EQ("v1", Get(1, "foo"));
+
+    // NOTE(review): "foo" is asserted twice after reopen; the duplicate
+    // looks benign but may have been intended to read a different key.
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v5", Get(1, "baz"));
+    ASSERT_OK(Put(1, "bar", "v2"));
+    ASSERT_OK(Put(1, "foo", "v3"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_EQ("v3", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "v4"));
+    ASSERT_EQ("v4", Get(1, "foo"));
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ("v5", Get(1, "baz"));
+  } while (ChangeOptions());
+}
+
+// After recovery, table files should have a cached table_reader_handle only
+// when the configuration keeps all files open (kInfiniteMaxOpenFiles).
+TEST(DBTest, RecoverWithTableHandle) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.write_buffer_size = 100;
+    options.disable_auto_compactions = true;
+    options = CurrentOptions(options);
+    DestroyAndReopen(&options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "bar", "v2"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put(1, "foo", "v3"));
+    ASSERT_OK(Put(1, "bar", "v4"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put(1, "big", std::string(100, 'a')));
+    ReopenWithColumnFamilies({"default", "pikachu"});
+
+    std::vector<std::vector<FileMetaData>> files;
+    dbfull()->TEST_GetFilesMetaData(handles_[1], &files);
+    // Two explicit flushes plus the memtable flushed on reopen => 3 files.
+    int total_files = 0;
+    for (const auto& level : files) {
+      total_files += level.size();
+    }
+    ASSERT_EQ(total_files, 3);
+    for (const auto& level : files) {
+      for (const auto& file : level) {
+        if (kInfiniteMaxOpenFiles == option_config_) {
+          ASSERT_TRUE(file.table_reader_handle != nullptr);
+        } else {
+          ASSERT_TRUE(file.table_reader_handle == nullptr);
+        }
+      }
+    }
+  } while (ChangeOptions());
+}
+
+// Replaying an already-recovered WAL must be a no-op: re-applying merges
+// would double their effect. Exercises copy-back of log files into wal_dir.
+TEST(DBTest, IgnoreRecoveredLog) {
+  std::string backup_logs = dbname_ + "/backup_logs";
+
+  // delete old files in backup_logs directory
+  env_->CreateDirIfMissing(backup_logs);
+  std::vector<std::string> old_files;
+  env_->GetChildren(backup_logs, &old_files);
+  for (auto& file : old_files) {
+    if (file != "." && file != "..") {
+      env_->DeleteFile(backup_logs + "/" + file);
+    }
+  }
+
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+    options.wal_dir = dbname_ + "/logs";
+    DestroyAndReopen(&options);
+
+    // fill up the DB
+    std::string one, two;
+    PutFixed64(&one, 1);
+    PutFixed64(&two, 2);
+    ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+    ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+    ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one)));
+
+    // copy the logs to backup
+    std::vector<std::string> logs;
+    env_->GetChildren(options.wal_dir, &logs);
+    for (auto& log : logs) {
+      if (log != ".." && log != ".") {
+        CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log);
+      }
+    }
+
+    // recover the DB
+    Reopen(&options);
+    ASSERT_EQ(two, Get("foo"));
+    ASSERT_EQ(one, Get("bar"));
+    Close();
+
+    // copy the logs from backup back to wal dir
+    for (auto& log : logs) {
+      if (log != ".." && log != ".") {
+        CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+      }
+    }
+    // this should ignore the log files, recovery should not happen again
+    // if the recovery happens, the same merge operator would be called twice,
+    // leading to incorrect results
+    Reopen(&options);
+    ASSERT_EQ(two, Get("foo"));
+    ASSERT_EQ(one, Get("bar"));
+    Close();
+    Destroy(&options);
+    Reopen(&options);
+    Close();
+
+    // copy the logs from backup back to wal dir
+    env_->CreateDirIfMissing(options.wal_dir);
+    for (auto& log : logs) {
+      if (log != ".." && log != ".") {
+        CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+      }
+    }
+    // assert that we successfully recovered only from logs, even though we
+    // destroyed the DB
+    Reopen(&options);
+    ASSERT_EQ(two, Get("foo"));
+    ASSERT_EQ(one, Get("bar"));
+
+    // Recovery will fail if DB directory doesn't exist.
+    Destroy(&options);
+    // copy the logs from backup back to wal dir
+    env_->CreateDirIfMissing(options.wal_dir);
+    for (auto& log : logs) {
+      if (log != ".." && log != ".") {
+        CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+        // we won't be needing this file anymore
+        env_->DeleteFile(backup_logs + "/" + log);
+      }
+    }
+    Status s = TryReopen(&options);
+    ASSERT_TRUE(!s.ok());
+  } while (ChangeOptions(kSkipHashCuckoo));
+}
+
+// Every reopen rolls the WAL; data written before the rolls (and between
+// them) must still be recoverable after many consecutive reopens.
+TEST(DBTest, RollLog) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "baz", "v5"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    for (int i = 0; i < 10; i++) {
+      ReopenWithColumnFamilies({"default", "pikachu"});
+    }
+    ASSERT_OK(Put(1, "foo", "v4"));
+    for (int i = 0; i < 10; i++) {
+      ReopenWithColumnFamilies({"default", "pikachu"});
+    }
+  } while (ChangeOptions());
+}
+
+// Mixes WAL-enabled and WAL-disabled writes across reopens. Values written
+// with disableWAL=true still survive here because each reopen flushes the
+// memtable; the WAL-enabled writes guarantee durability on their own.
+TEST(DBTest, WAL) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v1", Get(1, "bar"));
+
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    // Both value's should be present.
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ("v2", Get(1, "foo"));
+
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    // again both values should be present.
+    ASSERT_EQ("v3", Get(1, "foo"));
+    ASSERT_EQ("v3", Get(1, "bar"));
+  } while (ChangeCompactOptions());
+}
+
+// The DB LOCK file must prevent a second process/handle from opening the
+// same database while it is already open.
+TEST(DBTest, CheckLock) {
+  do {
+    DB* localdb;
+    Options options = CurrentOptions();
+    ASSERT_OK(TryReopen(&options));
+
+    // second open should fail
+    ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok());
+  } while (ChangeCompactOptions());
+}
+
+// With up to 4 write buffers and a merge threshold of 3, an explicit Flush
+// between writes must not lose data from either memtable.
+TEST(DBTest, FlushMultipleMemtable) {
+  do {
+    Options options = CurrentOptions();
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    options.max_write_buffer_number = 4;
+    options.min_write_buffer_number_to_merge = 3;
+    CreateAndReopenWithCF({"pikachu"}, &options);
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v1", Get(1, "bar"));
+    ASSERT_OK(Flush(1));
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, NumImmutableMemTable) {
+  do {
+    Options options = CurrentOptions();
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    options.max_write_buffer_number = 4;
+    options.min_write_buffer_number_to_merge = 3;
+    options.write_buffer_size = 1000000;
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    std::string big_value(1000000 * 2, 'x');
+    std::string num;
+    SetPerfLevel(kEnableTime);;
+
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "0");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    perf_context.Reset();
+    Get(1, "k1");
+    ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
+
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+    ASSERT_EQ(num, "1");
+
+    perf_context.Reset();
+    Get(1, "k1");
+    ASSERT_EQ(2, (int) perf_context.get_from_memtable_count);
+    perf_context.Reset();
+    Get(1, "k2");
+    ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
+
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.cur-size-active-mem-table", &num));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "2");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+    ASSERT_EQ(num, "2");
+    perf_context.Reset();
+    Get(1, "k2");
+    ASSERT_EQ(2, (int) perf_context.get_from_memtable_count);
+    perf_context.Reset();
+    Get(1, "k3");
+    ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
+    perf_context.Reset();
+    Get(1, "k1");
+    ASSERT_EQ(3, (int) perf_context.get_from_memtable_count);
+
+    ASSERT_OK(Flush(1));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "0");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.cur-size-active-mem-table", &num));
+    // "208" is the size of the metadata of an empty skiplist, this would
+    // break if we change the default skiplist implementation
+    ASSERT_EQ(num, "208");
+    SetPerfLevel(kDisable);
+  } while (ChangeCompactOptions());
+}
+
+// Helper that occupies a background thread until the test wakes it up,
+// letting tests pin the flush/compaction thread pools.
+class SleepingBackgroundTask {
+ public:
+  SleepingBackgroundTask()
+      : bg_cv_(&mutex_), should_sleep_(true), done_with_sleep_(false) {}
+  // Runs on the background thread: blocks until WakeUp() is called, then
+  // marks completion and notifies WaitUntilDone().
+  void DoSleep() {
+    MutexLock l(&mutex_);
+    while (should_sleep_) {
+      bg_cv_.Wait();
+    }
+    done_with_sleep_ = true;
+    bg_cv_.SignalAll();
+  }
+  // Called from the test thread to release the sleeping task.
+  void WakeUp() {
+    MutexLock l(&mutex_);
+    should_sleep_ = false;
+    bg_cv_.SignalAll();
+  }
+  // Blocks the test thread until DoSleep() has finished.
+  void WaitUntilDone() {
+    MutexLock l(&mutex_);
+    while (!done_with_sleep_) {
+      bg_cv_.Wait();
+    }
+  }
+
+  // Adapter matching Env::Schedule's void(*)(void*) callback signature.
+  static void DoSleepTask(void* arg) {
+    reinterpret_cast<SleepingBackgroundTask*>(arg)->DoSleep();
+  }
+
+ private:
+  port::Mutex mutex_;
+  port::CondVar bg_cv_;  // Signalled when background work finishes
+  bool should_sleep_;    // guarded by mutex_
+  bool done_with_sleep_; // guarded by mutex_
+};
+
+// Blocks both background thread pools with sleeping tasks so that flush and
+// compaction stay pending, then checks the flush-pending / compaction-pending
+// / num-immutable-mem-table properties at each stage.
+TEST(DBTest, GetProperty) {
+  // Set sizes to both background thread pool to be 1 and block them.
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  SleepingBackgroundTask sleeping_task_high;
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_high,
+                 Env::Priority::HIGH);
+
+  Options options = CurrentOptions();
+  WriteOptions writeOpt = WriteOptions();
+  writeOpt.disableWAL = true;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.level0_file_num_compaction_trigger = 1;
+  options.compaction_options_universal.size_ratio = 50;
+  options.max_background_compactions = 1;
+  options.max_background_flushes = 1;
+  options.max_write_buffer_number = 10;
+  options.min_write_buffer_number_to_merge = 1;
+  options.write_buffer_size = 1000000;
+  Reopen(&options);
+
+  // Each value is twice the write buffer size, sealing a memtable per Put.
+  std::string big_value(1000000 * 2, 'x');
+  std::string num;
+  SetPerfLevel(kEnableTime);
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+  ASSERT_EQ(num, "0");
+  perf_context.Reset();
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+  ASSERT_EQ(num, "1");
+  ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+  ASSERT_EQ(num, "2");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+  ASSERT_EQ(num, "1");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+  ASSERT_EQ(num, "0");
+
+  // Unblock the HIGH pool so the pending flushes can run.
+  sleeping_task_high.WakeUp();
+  sleeping_task_high.WaitUntilDone();
+  dbfull()->TEST_WaitForFlushMemTable();
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value));
+  ASSERT_OK(dbfull()->Put(writeOpt, "k5", big_value));
+  dbfull()->TEST_WaitForFlushMemTable();
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+  ASSERT_EQ(num, "1");
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
+
+TEST(DBTest, FLUSH) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    SetPerfLevel(kEnableTime);;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+    // this will now also flush the last 2 writes
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+    perf_context.Reset();
+    Get(1, "foo");
+    ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0);
+
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v1", Get(1, "bar"));
+
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+    ASSERT_OK(Flush(1));
+
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_EQ("v2", Get(1, "bar"));
+    perf_context.Reset();
+    ASSERT_EQ("v2", Get(1, "foo"));
+    ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0);
+
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+    ASSERT_OK(Flush(1));
+
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    // 'foo' should be there because its put
+    // has WAL enabled.
+    ASSERT_EQ("v3", Get(1, "foo"));
+    ASSERT_EQ("v3", Get(1, "bar"));
+
+    SetPerfLevel(kDisable);
+  } while (ChangeCompactOptions());
+}
+
+// Back-to-back reopens produce an empty WAL in between; recovery must
+// handle the empty log and keep the latest value.
+TEST(DBTest, RecoveryWithEmptyLog) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_OK(Put(1, "foo", "v3"));
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_EQ("v3", Get(1, "foo"));
+  } while (ChangeOptions());
+}
+
+// Check that writes done during a memtable compaction are recovered
+// if the database is shutdown during the memtable compaction.
+TEST(DBTest, RecoverDuringMemtableCompaction) {
+  do {
+    Options options;
+    options.env = env_;
+    options.write_buffer_size = 1000000;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Trigger a long memtable compaction and reopen the database during it
+    ASSERT_OK(Put(1, "foo", "v1"));  // Goes to 1st log file
+    ASSERT_OK(Put(1, "big1", std::string(10000000, 'x')));  // Fills memtable
+    ASSERT_OK(Put(1, "big2", std::string(1000, 'y')));  // Triggers compaction
+    ASSERT_OK(Put(1, "bar", "v2"));                     // Goes to new log file
+
+    // All four writes, from both log files, must survive the reopen.
+    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1"));
+    ASSERT_EQ(std::string(1000, 'y'), Get(1, "big2"));
+  } while (ChangeOptions());
+}
+
+// With a tiny write buffer, inserting 500 ~1KB values must produce new
+// table files (minor compactions), and all values stay readable before
+// and after reopen.
+TEST(DBTest, MinorCompactionsHappen) {
+  do {
+    Options options;
+    options.write_buffer_size = 10000;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    const int N = 500;
+
+    int starting_num_tables = TotalTableFiles(1);
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i) + std::string(1000, 'v')));
+    }
+    int ending_num_tables = TotalTableFiles(1);
+    ASSERT_GT(ending_num_tables, starting_num_tables);
+
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+    }
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+    }
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, ManifestRollOver) {
+  do {
+    Options options;
+    options.max_manifest_file_size = 10 ;  // 10 bytes
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+    {
+      ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1')));
+      ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2')));
+      ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3')));
+      uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo();
+      ASSERT_OK(Flush(1));  // This should trigger LogAndApply.
+      uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo();
+      ASSERT_GT(manifest_after_flush, manifest_before_flush);
+      ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+      ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush);
+      // check if a new manifest file got inserted or not.
+      ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1"));
+      ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2"));
+      ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3"));
+    }
+  } while (ChangeCompactOptions());
+}
+
+// The DB identity (IDENTITY file) must persist across reopens, and must be
+// regenerated (i.e. differ) after the IDENTITY file is deleted.
+TEST(DBTest, IdentityAcrossRestarts) {
+  do {
+    std::string id1;
+    ASSERT_OK(db_->GetDbIdentity(id1));
+
+    Options options = CurrentOptions();
+    Reopen(&options);
+    std::string id2;
+    ASSERT_OK(db_->GetDbIdentity(id2));
+    // id1 should match id2 because identity was not regenerated
+    ASSERT_EQ(id1.compare(id2), 0);
+
+    std::string idfilename = IdentityFileName(dbname_);
+    ASSERT_OK(env_->DeleteFile(idfilename));
+    Reopen(&options);
+    std::string id3;
+    ASSERT_OK(db_->GetDbIdentity(id3));
+    // id1 should NOT match id3 because identity was regenerated
+    ASSERT_NE(id1.compare(id3), 0);
+  } while (ChangeCompactOptions());
+}
+
+// Recovery of a large WAL with a small write buffer must flush multiple
+// level-0 files mid-recovery rather than buffering the whole log.
+TEST(DBTest, RecoverWithLargeLog) {
+  do {
+    {
+      Options options = CurrentOptions();
+      CreateAndReopenWithCF({"pikachu"}, &options);
+      ASSERT_OK(Put(1, "big1", std::string(200000, '1')));
+      ASSERT_OK(Put(1, "big2", std::string(200000, '2')));
+      ASSERT_OK(Put(1, "small3", std::string(10, '3')));
+      ASSERT_OK(Put(1, "small4", std::string(10, '4')));
+      ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    }
+
+    // Make sure that if we re-open with a small write buffer size that
+    // we flush table files in the middle of a large log file.
+    Options options;
+    options.write_buffer_size = 100000;
+    options = CurrentOptions(options);
+    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3);
+    ASSERT_EQ(std::string(200000, '1'), Get(1, "big1"));
+    ASSERT_EQ(std::string(200000, '2'), Get(1, "big2"));
+    ASSERT_EQ(std::string(10, '3'), Get(1, "small3"));
+    ASSERT_EQ(std::string(10, '4'), Get(1, "small4"));
+    // NOTE(review): redundant with the exact-count assertion above (== 3).
+    ASSERT_GT(NumTableFilesAtLevel(0, 1), 1);
+  } while (ChangeCompactOptions());
+}
+
+// Compacting 8MB of level-0 data must split the output into multiple
+// level-1 files, with all values intact afterwards.
+TEST(DBTest, CompactionsGenerateMultipleFiles) {
+  Options options;
+  options.write_buffer_size = 100000000;        // Large write buffer
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  Random rnd(301);
+
+  // Write 8MB (80 values, each 100K)
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  std::vector<std::string> values;
+  for (int i = 0; i < 80; i++) {
+    values.push_back(RandomString(&rnd, 100000));
+    ASSERT_OK(Put(1, Key(i), values[i]));
+  }
+
+  // Reopening moves updates to level-0
+  ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
+  for (int i = 0; i < 80; i++) {
+    ASSERT_EQ(Get(1, Key(i)), values[i]);
+  }
+}
+
+// Level-0 compaction must fire exactly when the file count reaches
+// level0_file_num_compaction_trigger (3), emptying L0 into one L1 file.
+TEST(DBTest, CompactionTrigger) {
+  Options options;
+  options.write_buffer_size = 100<<10; //100KB
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  options.level0_file_num_compaction_trigger = 3;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    std::vector<std::string> values;
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      values.push_back(RandomString(&rnd, 10000));
+      ASSERT_OK(Put(1, Key(i), values[i]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+  }
+
+  //generate one more file in level-0, and should trigger level-0 compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < 12; i++) {
+    values.push_back(RandomString(&rnd, 10000));
+    ASSERT_OK(Put(1, Key(i), values[i]));
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1);
+}
+
+// This is a static filter used for filtering
+// kvs during the compaction process.
+static int cfilter_count;  // incremented once per key seen by the filters below
+static std::string NEW_VALUE = "NewValue";  // replacement value used by ChangeFilter
+
+// Compaction filter that keeps every key (counts them via cfilter_count).
+class KeepFilter : public CompactionFilter {
+ public:
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value, bool* value_changed) const
+      override {
+    cfilter_count++;
+    return false;  // false => keep the entry
+  }
+
+  virtual const char* Name() const override { return "KeepFilter"; }
+};
+
+// Compaction filter that drops every key (counts them via cfilter_count).
+class DeleteFilter : public CompactionFilter {
+ public:
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value, bool* value_changed) const
+      override {
+    cfilter_count++;
+    return true;  // true => remove the entry
+  }
+
+  virtual const char* Name() const override { return "DeleteFilter"; }
+};
+
+// Compaction filter that keeps every key but rewrites its value to the
+// global NEW_VALUE constant.
+class ChangeFilter : public CompactionFilter {
+ public:
+  explicit ChangeFilter() {}
+
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value, bool* value_changed) const
+      override {
+    assert(new_value != nullptr);
+    *new_value = NEW_VALUE;
+    *value_changed = true;
+    return false;  // keep the entry, with the replaced value
+  }
+
+  virtual const char* Name() const override { return "ChangeFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit KeepFilterFactory(bool check_context = false)
+      : check_context_(check_context) {}
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (check_context_) {
+      ASSERT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+      ASSERT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+    }
+    return std::unique_ptr<CompactionFilter>(new KeepFilter());
+  }
+
+  virtual const char* Name() const override { return "KeepFilterFactory"; }
+  bool check_context_;
+  std::atomic_bool expect_full_compaction_;
+  std::atomic_bool expect_manual_compaction_;
+};
+
+// Factory that hands out a DeleteFilter only for manual compactions; for
+// automatic compactions it returns nullptr, i.e. no filtering is applied.
+class DeleteFilterFactory : public CompactionFilterFactory {
+ public:
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (context.is_manual_compaction) {
+      return std::unique_ptr<CompactionFilter>(new DeleteFilter());
+    } else {
+      return std::unique_ptr<CompactionFilter>(nullptr);
+    }
+  }
+
+  virtual const char* Name() const override { return "DeleteFilterFactory"; }
+};
+
+// Factory producing a fresh ChangeFilter for every compaction.
+class ChangeFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit ChangeFilterFactory() {}
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    return std::unique_ptr<CompactionFilter>(new ChangeFilter());
+  }
+
+  virtual const char* Name() const override { return "ChangeFilterFactory"; }
+};
+
+// TODO(kailiu) The tests on UniversalCompaction has some issues:
+//  1. A lot of magic numbers ("11" or "12").
+//  2. Made assumption on the memtable flush conditions, which may change from
+//     time to time.
+//
+// Walks universal compaction through five staged fill-and-compact cycles,
+// asserting the level-0 file count after each stage and (via
+// KeepFilterFactory's context checks) whether each compaction was expected
+// to be a full compaction.
+TEST(DBTest, UniversalCompactionTrigger) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100<<10; //100KB
+  // trigger compaction if there are >= 4 files
+  options.level0_file_num_compaction_trigger = 4;
+  KeepFilterFactory* filter = new KeepFilterFactory(true);
+  filter->expect_manual_compaction_.store(false);
+  options.compaction_filter_factory.reset(filter);
+
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  filter->expect_full_compaction_.store(true);
+  // Stage 1:
+  //   Generate a set of files at level 0, but don't trigger level-0
+  //   compaction.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 11; i++) {
+    ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Suppose each file flushed from mem table has size 1. Now we compact
+  // (level0_file_num_compaction_trigger)=4 files and should have a big
+  // file of size 4.
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
+  for (int i = 1; i < options.num_levels ; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+  }
+
+  // Stage 2:
+  //   Now we have one file at level 0, with size 4. We also have some data in
+  //   mem table. Let's continue generating new files at level 0, but don't
+  //   trigger level-0 compaction.
+  //   First, clean up memtable before inserting new data. This will generate
+  //   a level-0 file, with size around 0.4 (according to previously written
+  //   data amount).
+  filter->expect_full_compaction_.store(false);
+  ASSERT_OK(Flush(1));
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+       num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 11; i++) {
+    ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+  // After compaction, we should have 2 files, with size 4, 2.4.
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 2);
+  for (int i = 1; i < options.num_levels ; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+  }
+
+  // Stage 3:
+  //   Now we have 2 files at level 0, with size 4 and 2.4. Continue
+  //   generating new files at level 0.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+       num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 12; i++) {
+    ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1.
+  // After compaction, we should have 3 files, with size 4, 2.4, 2.
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3);
+  for (int i = 1; i < options.num_levels ; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+  }
+
+  // Stage 4:
+  //   Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a
+  //   new file of size 1.
+  for (int i = 0; i < 11; i++) {
+    ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Level-0 compaction is triggered, but no file will be picked up.
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 4);
+  for (int i = 1; i < options.num_levels ; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+  }
+
+  // Stage 5:
+  //   Now we have 4 files at level 0, with size 4, 2.4, 2, 1. Let's generate
+  //   a new file of size 1.
+  filter->expect_full_compaction_.store(true);
+  for (int i = 0; i < 11; i++) {
+    ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // All files at level 0 will be compacted into a single one.
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
+  for (int i = 1; i < options.num_levels ; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+  }
+}
+
+// Verifies that universal compaction fires on the size-amplification
+// criterion alone: two similar-size L0 files plus a small flushed remainder
+// exceed the 110% amplification limit and get compacted into one file.
+TEST(DBTest, UniversalCompactionSizeAmplification) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100<<10; //100KB
+  options.level0_file_num_compaction_trigger = 3;
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  options = CurrentOptions(options);
+  // Reopen so the amplification setting above takes effect.
+  ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  //   Generate two files in Level 0. Both files are approx the same size.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+  }
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 2);
+
+  // Flush whatever is remaining in memtable. This is typically
+  // small, which should not trigger size ratio based compaction
+  // but will instead trigger size amplification.
+  ASSERT_OK(Flush(1));
+
+  dbfull()->TEST_WaitForCompact();
+
+  // Verify that size amplification did occur
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
+}
+
+// Fills level 0 up to the compaction trigger with compression_size_percent
+// disabled (-1) and a single level configured, then checks that all L0 files
+// are compacted down to one.
+TEST(DBTest, UniversalCompactionOptions) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100<<10; //100KB
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = 1;
+  options.compaction_options_universal.compression_size_percent = -1;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+
+    // The last iteration hits the trigger, so only check the file count
+    // while compaction cannot yet have started.
+    if (num < options.level0_file_num_compaction_trigger - 1) {
+      ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+    }
+  }
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
+  for (int i = 1; i < options.num_levels ; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+  }
+}
+
+// Exercises kCompactionStopStyleSimilarSize: with a 10% size ratio, universal
+// compaction should only merge runs of similar-size files, leaving dissimilar
+// files (e.g. the big size-4 file) untouched. Uses the default column family.
+TEST(DBTest, UniversalCompactionStopStyleSimilarSize) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100<<10; //100KB
+  // trigger compaction if there are >= 4 files
+  options.level0_file_num_compaction_trigger = 4;
+  options.compaction_options_universal.size_ratio = 10;
+  options.compaction_options_universal.stop_style = kCompactionStopStyleSimilarSize;
+  options.num_levels=1;
+  Reopen(&options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // Stage 1:
+  //   Generate a set of files at level 0, but don't trigger level-0
+  //   compaction.
+  for (int num = 0;
+       num < options.level0_file_num_compaction_trigger-1;
+       num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 11; i++) {
+    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Suppose each file flushed from mem table has size 1. Now we compact
+  // (level0_file_num_compaction_trigger)=4 files and should have a big
+  // file of size 4.
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+
+  // Stage 2:
+  //   Now we have one file at level 0, with size 4. We also have some data in
+  //   mem table. Let's continue generating new files at level 0, but don't
+  //   trigger level-0 compaction.
+  //   First, clean up memtable before inserting new data. This will generate
+  //   a level-0 file, with size around 0.4 (according to previously written
+  //   data amount).
+  dbfull()->Flush(FlushOptions());
+  for (int num = 0;
+       num < options.level0_file_num_compaction_trigger-3;
+       num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumTableFilesAtLevel(0), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 11; i++) {
+    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+  // After compaction, we should have 3 files, with size 4, 0.4, 2.
+  // (Only the two similar-size files of size 1 were merged.)
+  ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+  // Stage 3:
+  //   Now we have 3 files at level 0, with size 4, 0.4, 2. Generate one
+  //   more file at level-0, which should trigger level-0 compaction.
+  for (int i = 0; i < 11; i++) {
+    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Level-0 compaction is triggered, but no file will be picked up.
+  ASSERT_EQ(NumTableFilesAtLevel(0), 4);
+}
+
+#if defined(SNAPPY)
+// Runs the same read/write workload under four cache configurations
+// (uncompressed only, compressed only, both, both + uncompressed DB) and
+// asserts, via block-cache tickers, that each configuration exercised the
+// expected cache code paths.
+TEST(DBTest, CompressedCache) {
+  int num_iter = 80;
+
+  // Run this test three iterations.
+  // Iteration 1: only a uncompressed block cache
+  // Iteration 2: only a compressed block cache
+  // Iteration 3: both block cache and compressed cache
+  // Iteration 4: both block cache and compressed cache, but DB is not
+  // compressed
+  for (int iter = 0; iter < 4; iter++) {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 64*1024;        // small write buffer
+    options.statistics = rocksdb::CreateDBStatistics();
+
+    switch (iter) {
+      case 0:
+        // only uncompressed block cache
+        options.block_cache = NewLRUCache(8*1024);
+        options.block_cache_compressed = nullptr;
+        break;
+      case 1:
+        // no block cache, only compressed cache
+        options.no_block_cache = true;
+        options.block_cache = nullptr;
+        options.block_cache_compressed = NewLRUCache(8*1024);
+        break;
+      case 2:
+        // both compressed and uncompressed block cache
+        options.block_cache = NewLRUCache(1024);
+        options.block_cache_compressed = NewLRUCache(8*1024);
+        break;
+      case 3:
+        // both block cache and compressed cache, but DB is not compressed
+        // also, make block cache sizes bigger, to trigger block cache hits
+        options.block_cache = NewLRUCache(1024 * 1024);
+        options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+        options.compression = kNoCompression;
+        break;
+      default:
+        ASSERT_TRUE(false);
+    }
+    CreateAndReopenWithCF({"pikachu"}, &options);
+    // default column family doesn't have block cache
+    Options no_block_cache_opts;
+    no_block_cache_opts.no_block_cache = true;
+    no_block_cache_opts.statistics = options.statistics;
+    options = CurrentOptions(options);
+    // Reopen with per-CF options: caches only apply to "pikachu".
+    ReopenWithColumnFamilies({"default", "pikachu"},
+                             {&no_block_cache_opts, &options});
+
+    Random rnd(301);
+
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    std::vector<std::string> values;
+    std::string str;
+    for (int i = 0; i < num_iter; i++) {
+      if (i % 4 == 0) {        // high compression ratio
+        str = RandomString(&rnd, 1000);
+      }
+      values.push_back(str);
+      ASSERT_OK(Put(1, Key(i), values[i]));
+    }
+
+    // flush all data from memtable so that reads are from block cache
+    ASSERT_OK(Flush(1));
+
+    for (int i = 0; i < num_iter; i++) {
+      ASSERT_EQ(Get(1, Key(i)), values[i]);
+    }
+
+    // check that we triggered the appropriate code paths in the cache
+    switch (iter) {
+      case 0:
+        // only uncompressed block cache
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 1:
+        // no block cache, only compressed cache
+        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 2:
+        // both compressed and uncompressed block cache
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 3:
+        // both compressed and uncompressed block cache
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_HIT), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        // compressed doesn't have any hits since blocks are not compressed on
+        // storage
+        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0);
+        break;
+      default:
+        ASSERT_TRUE(false);
+    }
+
+    options.create_if_missing = true;
+    // Start the next iteration from a fresh DB.
+    DestroyAndReopen(&options);
+  }
+}
+
+// Returns a random string of length len with a 0.8 compressible fraction,
+// suitable for exercising compression-dependent code paths.
+static std::string CompressibleString(Random* rnd, int len) {
+  std::string r;
+  test::CompressibleString(rnd, 0.8, len, &r);
+  return r;
+}
+
+// With compression_size_percent = 70, checks (via total L0 size bounds) that
+// early universal compactions produce compressed output, and that once the
+// newest data exceeds the 70% threshold its compaction output stays
+// uncompressed.
+TEST(DBTest, UniversalCompactionCompressRatio1) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100<<10; //100KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 1;
+  options.compaction_options_universal.compression_size_percent = 70;
+  options = CurrentOptions(options);
+  Reopen(&options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // The first compaction (2) is compressed.
+  for (int num = 0; num < 2; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+  // Total size noticeably below raw data size => output was compressed.
+  ASSERT_LT((int)dbfull()->TEST_GetLevel0TotalSize(), 110000 * 2 * 0.9);
+
+  // The second compaction (4) is compressed
+  for (int num = 0; num < 2; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_LT((int)dbfull()->TEST_GetLevel0TotalSize(), 110000 * 4 * 0.9);
+
+  // The third compaction (2 4) is compressed since this time it is
+  // (1 1 3.2) and 3.2/5.2 doesn't reach ratio.
+  for (int num = 0; num < 2; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_LT((int)dbfull()->TEST_GetLevel0TotalSize(), 110000 * 6 * 0.9);
+
+  // When we start for the compaction up to (2 4 8), the latest
+  // compressed is not compressed.
+  for (int num = 0; num < 8; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+  // Total size above the compressed bound => newest output uncompressed.
+  ASSERT_GT((int)dbfull()->TEST_GetLevel0TotalSize(),
+            110000 * 11 * 0.8 + 110000 * 2);
+}
+
+// Counterpart to CompressRatio1: with compression_size_percent = 95 the size
+// ratio is generous enough that even the latest compaction output remains
+// compressed, verified by an upper bound on total L0 size.
+TEST(DBTest, UniversalCompactionCompressRatio2) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100<<10; //100KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 1;
+  options.compaction_options_universal.compression_size_percent = 95;
+  options = CurrentOptions(options);
+  Reopen(&options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // When we start for the compaction up to (2 4 8), the latest
+  // compressed is compressed given the size ratio to compress.
+  for (int num = 0; num < 14; num++) {
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_LT((int)dbfull()->TEST_GetLevel0TotalSize(),
+            120000 * 12 * 0.8 + 120000 * 2);
+}
+#endif
+
+// End-to-end check of converting a level-style DB to universal compaction:
+// a direct reopen with universal style must fail while data sits on L1+,
+// compacting everything down to one L0 file makes the conversion legal, and
+// all keys written under both styles remain readable afterwards.
+TEST(DBTest, ConvertCompactionStyle) {
+  Random rnd(301);
+  int max_key_level_insert = 200;
+  int max_key_universal_insert = 600;
+
+  // Stage 1: generate a db with level compaction
+  Options options;
+  options.write_buffer_size = 100<<10; //100KB
+  options.num_levels = 4;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_bytes_for_level_base = 500<<10; // 500KB
+  options.max_bytes_for_level_multiplier = 1;
+  options.target_file_size_base = 200<<10; // 200KB
+  options.target_file_size_multiplier = 1;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  for (int i = 0; i <= max_key_level_insert; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+  }
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  // Level compaction must have pushed some files below level 0.
+  ASSERT_GT(TotalTableFiles(1, 4), 1);
+  int non_level0_num_files = 0;
+  for (int i = 1; i < options.num_levels; i++) {
+    non_level0_num_files += NumTableFilesAtLevel(i, 1);
+  }
+  ASSERT_GT(non_level0_num_files, 0);
+
+  // Stage 2: reopen with universal compaction - should fail
+  options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options = CurrentOptions(options);
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &options);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  // Stage 3: compact into a single file and move the file to level 0
+  options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = INT_MAX;
+  options.target_file_size_multiplier = 1;
+  options.max_bytes_for_level_base = INT_MAX;
+  options.max_bytes_for_level_multiplier = 1;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+
+  dbfull()->CompactRange(handles_[1], nullptr, nullptr, true /* reduce level */,
+                         0 /* reduce to level 0 */);
+
+  // Exactly one file, and it must live at level 0.
+  for (int i = 0; i < options.num_levels; i++) {
+    int num = NumTableFilesAtLevel(i, 1);
+    if (i == 0) {
+      ASSERT_EQ(num, 1);
+    } else {
+      ASSERT_EQ(num, 0);
+    }
+  }
+
+  // Stage 4: re-open in universal compaction style and do some db operations
+  options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100<<10; //100KB
+  options.level0_file_num_compaction_trigger = 3;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+
+  for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) {
+    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+  }
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  // Universal compaction keeps everything at level 0.
+  for (int i = 1; i < options.num_levels; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+  }
+
+  // verify keys inserted in both level compaction style and universal
+  // compaction style
+  std::string keys_in_db;
+  Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    keys_in_db.append(iter->key().ToString());
+    keys_in_db.push_back(',');
+  }
+  delete iter;
+
+  std::string expected_keys;
+  for (int i = 0; i <= max_key_universal_insert; i++) {
+    expected_keys.append(Key(i));
+    expected_keys.push_back(',');
+  }
+
+  ASSERT_EQ(keys_in_db, expected_keys);
+}
+
+namespace {
+// Shared driver for the MinLevelToCompress tests: fills level 0 up to one
+// file short of the compaction trigger, then writes one more file and checks
+// that a level-0 -> level-1 compaction ran.
+void MinLevelHelper(DBTest* self, Options& options) {
+  Random rnd(301);
+
+  for (int num = 0;
+    num < options.level0_file_num_compaction_trigger - 1;
+    num++)
+  {
+    std::vector<std::string> values;
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      values.push_back(RandomString(&rnd, 10000));
+      ASSERT_OK(self->Put(Key(i), values[i]));
+    }
+    self->dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1);
+  }
+
+  //generate one more file in level-0, and should trigger level-0 compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < 12; i++) {
+    values.push_back(RandomString(&rnd, 10000));
+    ASSERT_OK(self->Put(Key(i), values[i]));
+  }
+  self->dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(self->NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(self->NumTableFilesAtLevel(1), 1);
+}
+
+// Configures `options` for a 3-level DB that leaves L0 uncompressed and
+// compresses L1+ with the first supported compression algorithm (tried in
+// order: snappy, zlib, bzip2, lz4, lz4hc), storing the chosen algorithm in
+// `type`. Both out-params are mutated in place.
+// returns false if the calling-Test should be skipped
+bool MinLevelToCompress(CompressionType& type, Options& options, int wbits,
+                        int lev, int strategy) {
+  fprintf(stderr, "Test with compression options : window_bits = %d, level =  %d, strategy = %d}\n", wbits, lev, strategy);
+  options.write_buffer_size = 100<<10; //100KB
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  options.level0_file_num_compaction_trigger = 3;
+  options.create_if_missing = true;
+
+  if (SnappyCompressionSupported(CompressionOptions(wbits, lev, strategy))) {
+    type = kSnappyCompression;
+    fprintf(stderr, "using snappy\n");
+  } else if (ZlibCompressionSupported(
+               CompressionOptions(wbits, lev, strategy))) {
+    type = kZlibCompression;
+    fprintf(stderr, "using zlib\n");
+  } else if (BZip2CompressionSupported(
+               CompressionOptions(wbits, lev, strategy))) {
+    type = kBZip2Compression;
+    fprintf(stderr, "using bzip2\n");
+  } else if (LZ4CompressionSupported(
+                 CompressionOptions(wbits, lev, strategy))) {
+    type = kLZ4Compression;
+    fprintf(stderr, "using lz4\n");
+  } else if (LZ4HCCompressionSupported(
+                 CompressionOptions(wbits, lev, strategy))) {
+    type = kLZ4HCCompression;
+    fprintf(stderr, "using lz4hc\n");
+  } else {
+    fprintf(stderr, "skipping test, compression disabled\n");
+    return false;
+  }
+  options.compression_per_level.resize(options.num_levels);
+
+  // do not compress L0
+  for (int i = 0; i < 1; i++) {
+    options.compression_per_level[i] = kNoCompression;
+  }
+  for (int i = 1; i < options.num_levels; i++) {
+    options.compression_per_level[i] = type;
+  }
+  return true;
+}
+}  // namespace
+
+// Runs the MinLevelHelper workload with raw-deflate window bits (-14), first
+// compressing from L1 up and then, after a destroy/reopen, from L2 up.
+TEST(DBTest, MinLevelToCompress1) {
+  Options options = CurrentOptions();
+  CompressionType type;
+  if (!MinLevelToCompress(type, options, -14, -1, 0)) {
+    return;
+  }
+  Reopen(&options);
+  MinLevelHelper(this, options);
+
+  // do not compress L0 and L1
+  for (int i = 0; i < 2; i++) {
+    options.compression_per_level[i] = kNoCompression;
+  }
+  for (int i = 2; i < options.num_levels; i++) {
+    options.compression_per_level[i] = type;
+  }
+  DestroyAndReopen(&options);
+  MinLevelHelper(this, options);
+}
+
+// Same as MinLevelToCompress1 but with positive (zlib-style) window bits 15.
+TEST(DBTest, MinLevelToCompress2) {
+  Options options = CurrentOptions();
+  CompressionType type;
+  if (!MinLevelToCompress(type, options, 15, -1, 0)) {
+    return;
+  }
+  Reopen(&options);
+  MinLevelHelper(this, options);
+
+  // do not compress L0 and L1
+  for (int i = 0; i < 2; i++) {
+    options.compression_per_level[i] = kNoCompression;
+  }
+  for (int i = 2; i < options.num_levels; i++) {
+    options.compression_per_level[i] = type;
+  }
+  DestroyAndReopen(&options);
+  MinLevelHelper(this, options);
+}
+
+// Overwrites a single key many times with large values and asserts that the
+// total table-file count stays bounded, i.e. obsolete versions are compacted
+// away rather than accumulating.
+TEST(DBTest, RepeatedWritesToSameKey) {
+  do {
+    Options options;
+    options.env = env_;
+    options.write_buffer_size = 100000;  // Small write buffer
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // We must have at most one file per level except for level-0,
+    // which may have up to kL0_StopWritesTrigger files.
+    const int kMaxFiles =
+        options.num_levels + options.level0_stop_writes_trigger;
+
+    Random rnd(301);
+    // Value is 2x the write buffer so every Put forces a flush.
+    std::string value = RandomString(&rnd, 2 * options.write_buffer_size);
+    for (int i = 0; i < 5 * kMaxFiles; i++) {
+      ASSERT_OK(Put(1, "key", value));
+      ASSERT_LE(TotalTableFiles(1), kMaxFiles);
+    }
+  } while (ChangeCompactOptions());
+}
+
+// With inplace_update_support enabled, shrinking updates to the same key
+// should reuse the existing memtable slot: after 10 ever-smaller writes only
+// one entry remains in the memtable.
+TEST(DBTest, InPlaceUpdate) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Update key with values of smaller size
+    int numValues = 10;
+    for (int i = numValues; i > 0; i--) {
+      std::string value = DummyString(i, 'a');
+      ASSERT_OK(Put(1, "key", value));
+      ASSERT_EQ(value, Get(1, "key"));
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1, 1);
+
+  } while (ChangeCompactOptions());
+}
+
+// Growing updates cannot be done in place: after 10 ever-larger writes to the
+// same key, all 10 versions are still present in the memtable.
+TEST(DBTest, InPlaceUpdateLargeNewValue) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Update key with values of larger size
+    int numValues = 10;
+    for (int i = 0; i < numValues; i++) {
+      std::string value = DummyString(i, 'a');
+      ASSERT_OK(Put(1, "key", value));
+      ASSERT_EQ(value, Get(1, "key"));
+    }
+
+    // All 10 updates exist in the internal iterator
+    validateNumberOfEntries(numValues, 1);
+
+  } while (ChangeCompactOptions());
+}
+
+
+// Exercises the inplace_callback path where the callback rewrites the stored
+// value to a smaller one (updateInPlaceSmallerSize); reads observe the
+// callback's transformed value and only one memtable entry survives.
+TEST(DBTest, InPlaceUpdateCallbackSmallerSize) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+      rocksdb::DBTest::updateInPlaceSmallerSize;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Update key with values of smaller size
+    int numValues = 10;
+    ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
+    ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+
+    for (int i = numValues; i > 0; i--) {
+      ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+      ASSERT_EQ(DummyString(i - 1, 'b'), Get(1, "key"));
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1, 1);
+
+  } while (ChangeCompactOptions());
+}
+
+// Like the SmallerSize case, but the callback shrinks the value enough that
+// its varint length prefix also shrinks (265 -> 1 bytes), covering the
+// re-encoding path; still only one memtable entry remains.
+TEST(DBTest, InPlaceUpdateCallbackSmallerVarintSize) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+      rocksdb::DBTest::updateInPlaceSmallerVarintSize;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Update key with values of smaller varint size
+    int numValues = 265;
+    ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
+    ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+
+    for (int i = numValues; i > 0; i--) {
+      ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+      ASSERT_EQ(DummyString(1, 'b'), Get(1, "key"));
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1, 1);
+
+  } while (ChangeCompactOptions());
+}
+
+// Callback that produces larger values (updateInPlaceLargerSize) forces each
+// update to become a regular Put with a new sequence number, so all versions
+// accumulate in the memtable.
+TEST(DBTest, InPlaceUpdateCallbackLargeNewValue) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+      rocksdb::DBTest::updateInPlaceLargerSize;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Update key with values of larger size
+    int numValues = 10;
+    for (int i = 0; i < numValues; i++) {
+      ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+      ASSERT_EQ(DummyString(i, 'c'), Get(1, "key"));
+    }
+
+    // No inplace updates. All updates are puts with new seq number
+    // All 10 updates exist in the internal iterator
+    validateNumberOfEntries(numValues, 1);
+
+  } while (ChangeCompactOptions());
+}
+
+// Callback that requests no action (updateInPlaceNoAction) causes the Put to
+// be dropped entirely: a subsequent Get finds nothing.
+TEST(DBTest, InPlaceUpdateCallbackNoAction) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+      rocksdb::DBTest::updateInPlaceNoAction;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Callback function requests no actions from db
+    ASSERT_OK(Put(1, "key", DummyString(1, 'a')));
+    ASSERT_EQ(Get(1, "key"), "NOT_FOUND");
+
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, CompactionFilter) {
+  Options options = CurrentOptions();
+  options.max_open_files = -1;
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  // Write 100K keys, these are written to a few files in L0.
+  const std::string value(10, 'x');
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    Put(1, key, value);
+  }
+  ASSERT_OK(Flush(1));
+
+  // Push all files to the highest level L2. Verify that
+  // the compaction is each level invokes the filter for
+  // all the keys in that level.
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 100000);
+
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+  ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+  cfilter_count = 0;
+
+  // All the files are in the lowest level.
+  // Verify that all but the 100001st record
+  // has sequence number zero. The 100001st record
+  // is at the tip of this snapshot and cannot
+  // be zeroed out.
+  // TODO: figure out sequence number squashtoo
+  int count = 0;
+  int total = 0;
+  Iterator* iter = dbfull()->TEST_NewInternalIterator(handles_[1]);
+  iter->SeekToFirst();
+  ASSERT_OK(iter->status());
+  while (iter->Valid()) {
+    ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+    ikey.sequence = -1;
+    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+    total++;
+    if (ikey.sequence != 0) {
+      count++;
+    }
+    iter->Next();
+  }
+  ASSERT_EQ(total, 100000);
+  ASSERT_EQ(count, 1);
+  delete iter;
+
+  // overwrite all the 100K keys once again.
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    ASSERT_OK(Put(1, key, value));
+  }
+  ASSERT_OK(Flush(1));
+
+  // push all files to the highest level L2. This
+  // means that all keys should pass at least once
+  // via the compaction filter
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 100000);
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+  ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+
+  // create a new database with the compaction
+  // filter in such a way that it deletes all keys
+  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
+  options.create_if_missing = true;
+  DestroyAndReopen(&options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  // write all the keys once again.
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    ASSERT_OK(Put(1, key, value));
+  }
+  ASSERT_OK(Flush(1));
+  ASSERT_NE(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2, 1), 0);
+
+  // Push all files to the highest level L2. This
+  // triggers the compaction filter to delete all keys,
+  // verify that at the end of the compaction process,
+  // nothing is left.
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 0);
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+
+  // Scan the entire database to ensure that nothing is left
+  iter = db_->NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToFirst();
+  count = 0;
+  while (iter->Valid()) {
+    count++;
+    iter->Next();
+  }
+  ASSERT_EQ(count, 0);
+  delete iter;
+
+  // The sequence number of the remaining record
+  // is not zeroed out even though it is at the
+  // level Lmax because this record is at the tip
+  // TODO: remove the following or design a different
+  // test
+  count = 0;
+  iter = dbfull()->TEST_NewInternalIterator(handles_[1]);
+  iter->SeekToFirst();
+  ASSERT_OK(iter->status());
+  while (iter->Valid()) {
+    ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+    ASSERT_NE(ikey.sequence, (unsigned)0);
+    count++;
+    iter->Next();
+  }
+  ASSERT_EQ(count, 0);
+  delete iter;
+}
+
+TEST(DBTest, CompactionFilterWithValueChange) {
+  do {
+    Options options;
+    options.num_levels = 3;
+    options.max_mem_compaction_level = 0;
+    options.compaction_filter_factory =
+      std::make_shared<ChangeFilterFactory>();
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Write 100K+1 keys, these are written to a few files
+    // in L0. We do this so that the current snapshot points
+    // to the 100001 key.The compaction filter is  not invoked
+    // on keys that are visible via a snapshot because we
+    // anyways cannot delete it.
+    const std::string value(10, 'x');
+    for (int i = 0; i < 100001; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%010d", i);
+      Put(1, key, value);
+    }
+
+    // push all files to  lower levels
+    ASSERT_OK(Flush(1));
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+
+    // re-write all data again
+    for (int i = 0; i < 100001; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%010d", i);
+      Put(1, key, value);
+    }
+
+    // push all files to  lower levels. This should
+    // invoke the compaction filter for all 100000 keys.
+    ASSERT_OK(Flush(1));
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+
+    // verify that all keys now have the new value that
+    // was set by the compaction process.
+    for (int i = 0; i < 100001; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%010d", i);
+      std::string newvalue = Get(1, key);
+      ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+    }
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, CompactionFilterContextManual) {
+  KeepFilterFactory* filter = new KeepFilterFactory();
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_filter_factory.reset(filter);
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 8;
+  Reopen(&options);
+  int num_keys_per_file = 400;
+  for (int j = 0; j < 3; j++) {
+    // Write several keys.
+    const std::string value(10, 'x');
+    for (int i = 0; i < num_keys_per_file; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%08d%02d", i, j);
+      Put(key, value);
+    }
+    dbfull()->TEST_FlushMemTable();
+    // Make sure next file is much smaller so automatic compaction will not
+    // be triggered.
+    num_keys_per_file /= 2;
+  }
+
+  // Force a manual compaction
+  cfilter_count = 0;
+  filter->expect_manual_compaction_.store(true);
+  filter->expect_full_compaction_.store(false);  // Manual compaction always
+                                                 // set this flag.
+  dbfull()->CompactRange(nullptr, nullptr);
+  ASSERT_EQ(cfilter_count, 700);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+
+  // Verify total number of keys is correct after manual compaction.
+  int count = 0;
+  int total = 0;
+  Iterator* iter = dbfull()->TEST_NewInternalIterator();
+  iter->SeekToFirst();
+  ASSERT_OK(iter->status());
+  while (iter->Valid()) {
+    ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+    ikey.sequence = -1;
+    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+    total++;
+    if (ikey.sequence != 0) {
+      count++;
+    }
+    iter->Next();
+  }
+  ASSERT_EQ(total, 700);
+  ASSERT_EQ(count, 1);
+  delete iter;
+}
+
+class KeepFilterV2 : public CompactionFilterV2 {
+ public:
+  virtual std::vector<bool> Filter(int level,
+                                   const SliceVector& keys,
+                                   const SliceVector& existing_values,
+                                   std::vector<std::string>* new_values,
+                                   std::vector<bool>* values_changed)
+    const override {
+    cfilter_count++;
+    std::vector<bool> ret;
+    new_values->clear();
+    values_changed->clear();
+    for (unsigned int i = 0; i < keys.size(); ++i) {
+      values_changed->push_back(false);
+      ret.push_back(false);
+    }
+    return ret;
+  }
+
+  virtual const char* Name() const override {
+    return "KeepFilterV2";
+  }
+};
+
+class DeleteFilterV2 : public CompactionFilterV2 {
+ public:
+  virtual std::vector<bool> Filter(int level,
+                                   const SliceVector& keys,
+                                   const SliceVector& existing_values,
+                                   std::vector<std::string>* new_values,
+                                   std::vector<bool>* values_changed)
+    const override {
+    cfilter_count++;
+    new_values->clear();
+    values_changed->clear();
+    std::vector<bool> ret;
+    for (unsigned int i = 0; i < keys.size(); ++i) {
+      values_changed->push_back(false);
+      ret.push_back(true);
+    }
+    return ret;
+  }
+
+  virtual const char* Name() const override {
+    return "DeleteFilterV2";
+  }
+};
+
+class ChangeFilterV2 : public CompactionFilterV2 {
+ public:
+  virtual std::vector<bool> Filter(int level,
+                                   const SliceVector& keys,
+                                   const SliceVector& existing_values,
+                                   std::vector<std::string>* new_values,
+                                   std::vector<bool>* values_changed)
+    const override {
+    std::vector<bool> ret;
+    new_values->clear();
+    values_changed->clear();
+    for (unsigned int i = 0; i < keys.size(); ++i) {
+      values_changed->push_back(true);
+      new_values->push_back(NEW_VALUE);
+      ret.push_back(false);
+    }
+    return ret;
+  }
+
+  virtual const char* Name() const override {
+    return "ChangeFilterV2";
+  }
+};
+
+class KeepFilterFactoryV2 : public CompactionFilterFactoryV2 {
+ public:
+  // Factory producing KeepFilterV2 instances; the prefix extractor is
+  // forwarded to the base class.
+  explicit KeepFilterFactoryV2(const SliceTransform* prefix_extractor)
+    : CompactionFilterFactoryV2(prefix_extractor) { }
+
+  virtual std::unique_ptr<CompactionFilterV2>
+  CreateCompactionFilterV2(
+      const CompactionFilterContext& context) override {
+    std::unique_ptr<CompactionFilterV2> filter(new KeepFilterV2());
+    return filter;
+  }
+
+  virtual const char* Name() const override {
+    return "KeepFilterFactoryV2";
+  }
+};
+
+class DeleteFilterFactoryV2 : public CompactionFilterFactoryV2 {
+ public:
+  // Factory producing DeleteFilterV2 instances; the prefix extractor is
+  // forwarded to the base class.
+  explicit DeleteFilterFactoryV2(const SliceTransform* prefix_extractor)
+    : CompactionFilterFactoryV2(prefix_extractor) { }
+
+  virtual std::unique_ptr<CompactionFilterV2>
+  CreateCompactionFilterV2(
+      const CompactionFilterContext& context) override {
+    std::unique_ptr<CompactionFilterV2> filter(new DeleteFilterV2());
+    return filter;
+  }
+
+  virtual const char* Name() const override {
+    return "DeleteFilterFactoryV2";
+  }
+};
+
+class ChangeFilterFactoryV2 : public CompactionFilterFactoryV2 {
+ public:
+  // Factory producing ChangeFilterV2 instances; the prefix extractor is
+  // forwarded to the base class.
+  explicit ChangeFilterFactoryV2(const SliceTransform* prefix_extractor)
+    : CompactionFilterFactoryV2(prefix_extractor) { }
+
+  virtual std::unique_ptr<CompactionFilterV2>
+  CreateCompactionFilterV2(
+      const CompactionFilterContext& context) override {
+    std::unique_ptr<CompactionFilterV2> filter(new ChangeFilterV2());
+    return filter;
+  }
+
+  virtual const char* Name() const override {
+    return "ChangeFilterFactoryV2";
+  }
+};
+
+TEST(DBTest, CompactionFilterV2) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  // extract prefix
+  std::unique_ptr<const SliceTransform> prefix_extractor;
+  prefix_extractor.reset(NewFixedPrefixTransform(8));
+
+  options.compaction_filter_factory_v2
+    = std::make_shared<KeepFilterFactoryV2>(prefix_extractor.get());
+  // In a testing environment, we can only flush the application
+  // compaction filter buffer using universal compaction
+  option_config_ = kUniversalCompaction;
+  options.compaction_style = (rocksdb::CompactionStyle)1;
+  Reopen(&options);
+
+  // Write 100K keys, these are written to a few files in L0.
+  const std::string value(10, 'x');
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%08d%010d", i , i);
+    Put(key, value);
+  }
+
+  dbfull()->TEST_FlushMemTable();
+
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+
+  // All the files are in the lowest level.
+  int count = 0;
+  int total = 0;
+  Iterator* iter = dbfull()->TEST_NewInternalIterator();
+  iter->SeekToFirst();
+  ASSERT_OK(iter->status());
+  while (iter->Valid()) {
+    ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+    ikey.sequence = -1;
+    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+    total++;
+    if (ikey.sequence != 0) {
+      count++;
+    }
+    iter->Next();
+  }
+
+  ASSERT_EQ(total, 100000);
+  // 1 snapshot only. Since we are using universal compacton,
+  // the sequence no is cleared for better compression
+  ASSERT_EQ(count, 1);
+  delete iter;
+
+  // create a new database with the compaction
+  // filter in such a way that it deletes all keys
+  options.compaction_filter_factory_v2 =
+    std::make_shared<DeleteFilterFactoryV2>(prefix_extractor.get());
+  options.create_if_missing = true;
+  DestroyAndReopen(&options);
+
+  // write all the keys once again.
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%08d%010d", i, i);
+    Put(key, value);
+  }
+
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_NE(NumTableFilesAtLevel(0), 0);
+
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+  // Scan the entire database to ensure that nothing is left
+  iter = db_->NewIterator(ReadOptions());
+  iter->SeekToFirst();
+  count = 0;
+  while (iter->Valid()) {
+    count++;
+    iter->Next();
+  }
+
+  ASSERT_EQ(count, 0);
+  delete iter;
+}
+
+TEST(DBTest, CompactionFilterV2WithValueChange) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  std::unique_ptr<const SliceTransform> prefix_extractor;
+  prefix_extractor.reset(NewFixedPrefixTransform(8));
+  options.compaction_filter_factory_v2 =
+    std::make_shared<ChangeFilterFactoryV2>(prefix_extractor.get());
+  // In a testing environment, we can only flush the application
+  // compaction filter buffer using universal compaction
+  option_config_ = kUniversalCompaction;
+  options.compaction_style = (rocksdb::CompactionStyle)1;
+  options = CurrentOptions(options);
+  Reopen(&options);
+
+  // Write 100K+1 keys, these are written to a few files
+  // in L0. We do this so that the current snapshot points
+  // to the 100001 key.The compaction filter is  not invoked
+  // on keys that are visible via a snapshot because we
+  // anyways cannot delete it.
+  const std::string value(10, 'x');
+  for (int i = 0; i < 100001; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%08d%010d", i, i);
+    Put(key, value);
+  }
+
+  // push all files to lower levels
+  dbfull()->TEST_FlushMemTable();
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+
+  // verify that all keys now have the new value that
+  // was set by the compaction process.
+  for (int i = 0; i < 100001; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%08d%010d", i, i);
+    std::string newvalue = Get(key);
+    ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+  }
+}
+
+TEST(DBTest, CompactionFilterV2NULLPrefix) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  std::unique_ptr<const SliceTransform> prefix_extractor;
+  prefix_extractor.reset(NewFixedPrefixTransform(8));
+  options.compaction_filter_factory_v2 =
+    std::make_shared<ChangeFilterFactoryV2>(prefix_extractor.get());
+  // In a testing environment, we can only flush the application
+  // compaction filter buffer using universal compaction
+  option_config_ = kUniversalCompaction;
+  options.compaction_style = (rocksdb::CompactionStyle)1;
+  Reopen(&options);
+
+  // Write 100K+1 keys, these are written to a few files
+  // in L0. We do this so that the current snapshot points
+  // to the 100001 key.The compaction filter is  not invoked
+  // on keys that are visible via a snapshot because we
+  // anyways cannot delete it.
+  const std::string value(10, 'x');
+  char first_key[100];
+  snprintf(first_key, sizeof(first_key), "%s0000%010d", "NULL", 1);
+  Put(first_key, value);
+  for (int i = 1; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "%08d%010d", i, i);
+    Put(key, value);
+  }
+
+  char last_key[100];
+  snprintf(last_key, sizeof(last_key), "%s0000%010d", "NULL", 2);
+  Put(last_key, value);
+
+  // push all files to lower levels
+  dbfull()->TEST_FlushMemTable();
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+
+  // verify that all keys now have the new value that
+  // was set by the compaction process.
+  std::string newvalue = Get(first_key);
+  ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+  newvalue = Get(last_key);
+  ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+  for (int i = 1; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "%08d%010d", i, i);
+    std::string newvalue = Get(key);
+    ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+  }
+}
+
+TEST(DBTest, SparseMerge) {
+  // Verifies that compaction does not create a file that overlaps too
+  // much data at the next level, even after sparse updates.
+  do {
+    Options options = CurrentOptions();
+    options.compression = kNoCompression;
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    FillLevels("A", "Z", 1);
+
+    // Suppose there is:
+    //    small amount of data with prefix A
+    //    large amount of data with prefix B
+    //    small amount of data with prefix C
+    // and that recent updates have made small changes to all three prefixes.
+    // Check that we do not do a compaction that merges all of B in one shot.
+    const std::string value(1000, 'x');
+    Put(1, "A", "va");
+    // Write approximately 100MB of "B" values
+    for (int i = 0; i < 100000; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%010d", i);
+      Put(1, key, value);
+    }
+    Put(1, "C", "vc");
+    ASSERT_OK(Flush(1));
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+
+    // Make sparse update
+    Put(1, "A", "va2");
+    Put(1, "B100", "bvalue2");
+    Put(1, "C", "vc2");
+    ASSERT_OK(Flush(1));
+
+    // Compactions should not cause us to create a situation where
+    // a file overlaps too much data at the next level.
+    ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
+              20 * 1048576);
+    // NOTE(review): the next two compactions omit handles_[1] and so run
+    // on the default column family, while the overlap checks read the
+    // "pikachu" family -- confirm this is intentional.
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+    ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
+              20 * 1048576);
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+    ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
+              20 * 1048576);
+  } while (ChangeCompactOptions());
+}
+
+// Returns true iff low <= val <= high; on failure, logs the offending
+// value and bounds to stderr so the caller's assertion is diagnosable.
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+  if (val >= low && val <= high) {
+    return true;
+  }
+  fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+          static_cast<unsigned long long>(val),
+          static_cast<unsigned long long>(low),
+          static_cast<unsigned long long>(high));
+  return false;
+}
+
+TEST(DBTest, ApproximateSizes) {
+  // Checks GetApproximateSizes()-derived Size() estimates: zero for an
+  // empty range / memtable-only data, and within [S1*i, S2*i] once data
+  // reaches table files, across several reopens and partial compactions.
+  do {
+    Options options;
+    options.write_buffer_size = 100000000;        // Large write buffer
+    options.compression = kNoCompression;
+    options = CurrentOptions(options);
+    // NOTE(review): DestroyAndReopen() is called without &options --
+    // confirm the default-options reopen before CreateAndReopenWithCF
+    // is intentional.
+    DestroyAndReopen();
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0));
+    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0));
+
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    const int N = 80;
+    static const int S1 = 100000;
+    static const int S2 = 105000;  // Allow some expansion from metadata
+    Random rnd(301);
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(1, Key(i), RandomString(&rnd, S1)));
+    }
+
+    // 0 because GetApproximateSizes() does not account for memtable space
+    ASSERT_TRUE(Between(Size("", Key(50), 1), 0, 0));
+
+    // Check sizes across recovery by reopening a few times
+    for (int run = 0; run < 3; run++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+
+      for (int compact_start = 0; compact_start < N; compact_start += 10) {
+        for (int i = 0; i < N; i += 10) {
+          ASSERT_TRUE(Between(Size("", Key(i), 1), S1 * i, S2 * i));
+          ASSERT_TRUE(Between(Size("", Key(i) + ".suffix", 1), S1 * (i + 1),
+                              S2 * (i + 1)));
+          ASSERT_TRUE(Between(Size(Key(i), Key(i + 10), 1), S1 * 10, S2 * 10));
+        }
+        ASSERT_TRUE(Between(Size("", Key(50), 1), S1 * 50, S2 * 50));
+        ASSERT_TRUE(
+            Between(Size("", Key(50) + ".suffix", 1), S1 * 50, S2 * 50));
+
+        // Compact one decile of the key space per iteration.
+        std::string cstart_str = Key(compact_start);
+        std::string cend_str = Key(compact_start + 9);
+        Slice cstart = cstart_str;
+        Slice cend = cend_str;
+        dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]);
+      }
+
+      ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+      ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+    }
+    // ApproximateOffsetOf() is not yet implemented in plain table format.
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable));
+}
+
+TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
+  // Checks Size() estimates when small (~10K) and large (100K/300K)
+  // values are interleaved; expected windows are cumulative sizes of the
+  // keys preceding each range end.
+  do {
+    Options options = CurrentOptions();
+    options.compression = kNoCompression;
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    Random rnd(301);
+    std::string big1 = RandomString(&rnd, 100000);
+    ASSERT_OK(Put(1, Key(0), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(1, Key(1), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(1, Key(2), big1));
+    ASSERT_OK(Put(1, Key(3), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(1, Key(4), big1));
+    ASSERT_OK(Put(1, Key(5), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(1, Key(6), RandomString(&rnd, 300000)));
+    ASSERT_OK(Put(1, Key(7), RandomString(&rnd, 10000)));
+
+    // Check sizes across recovery by reopening a few times
+    for (int run = 0; run < 3; run++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+
+      ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0));
+      ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000));
+      ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000));
+      ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000));
+      ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000));
+      ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 231000));
+      ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 241000));
+      ASSERT_TRUE(Between(Size("", Key(7), 1), 540000, 541000));
+      ASSERT_TRUE(Between(Size("", Key(8), 1), 550000, 560000));
+
+      ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110000, 111000));
+
+      dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+    }
+    // ApproximateOffsetOf() is not yet implemented in plain table format.
+  } while (ChangeOptions(kSkipPlainTable));
+}
+
+TEST(DBTest, IteratorPinsRef) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    Put(1, "foo", "hello");
+
+    // Get iterator that will yield the current contents of the DB.
+    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
+
+    // Write to force compactions
+    Put(1, "foo", "newvalue1");
+    for (int i = 0; i < 100; i++) {
+      // 100K values
+      ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v')));
+    }
+    Put(1, "foo", "newvalue2");
+
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo", iter->key().ToString());
+    ASSERT_EQ("hello", iter->value().ToString());
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, Snapshot) {
+  // Takes three snapshots interleaved with writes to two column families
+  // and checks each snapshot keeps reading its own version, including
+  // after snapshots are released out of order (s3, then s1, then s2).
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    Put(0, "foo", "0v1");
+    Put(1, "foo", "1v1");
+    const Snapshot* s1 = db_->GetSnapshot();
+    Put(0, "foo", "0v2");
+    Put(1, "foo", "1v2");
+    const Snapshot* s2 = db_->GetSnapshot();
+    Put(0, "foo", "0v3");
+    Put(1, "foo", "1v3");
+    const Snapshot* s3 = db_->GetSnapshot();
+
+    Put(0, "foo", "0v4");
+    Put(1, "foo", "1v4");
+    ASSERT_EQ("0v1", Get(0, "foo", s1));
+    ASSERT_EQ("1v1", Get(1, "foo", s1));
+    ASSERT_EQ("0v2", Get(0, "foo", s2));
+    ASSERT_EQ("1v2", Get(1, "foo", s2));
+    ASSERT_EQ("0v3", Get(0, "foo", s3));
+    ASSERT_EQ("1v3", Get(1, "foo", s3));
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+
+    // Releasing a snapshot must not disturb the remaining ones.
+    db_->ReleaseSnapshot(s3);
+    ASSERT_EQ("0v1", Get(0, "foo", s1));
+    ASSERT_EQ("1v1", Get(1, "foo", s1));
+    ASSERT_EQ("0v2", Get(0, "foo", s2));
+    ASSERT_EQ("1v2", Get(1, "foo", s2));
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+
+    db_->ReleaseSnapshot(s1);
+    ASSERT_EQ("0v2", Get(0, "foo", s2));
+    ASSERT_EQ("1v2", Get(1, "foo", s2));
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+
+    db_->ReleaseSnapshot(s2);
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+  } while (ChangeOptions(kSkipHashCuckoo));
+}
+
+TEST(DBTest, HiddenValuesAreRemoved) {
+  // Checks that once the snapshot pinning an overwritten large value is
+  // released, compaction drops the hidden value and reclaims its space.
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    Random rnd(301);
+    FillLevels("a", "z", 1);
+
+    std::string big = RandomString(&rnd, 50000);
+    Put(1, "foo", big);
+    Put(1, "pastfoo", "v");
+    const Snapshot* snapshot = db_->GetSnapshot();
+    Put(1, "foo", "tiny");
+    Put(1, "pastfoo2", "v2");  // Advance sequence number one more
+
+    ASSERT_OK(Flush(1));
+    ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
+
+    ASSERT_EQ(big, Get(1, "foo", snapshot));
+    ASSERT_TRUE(Between(Size("", "pastfoo", 1), 50000, 60000));
+    db_->ReleaseSnapshot(snapshot);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]");
+    Slice x("x");
+    dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    ASSERT_GE(NumTableFilesAtLevel(1, 1), 1);
+    dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
+
+    // After compaction the range no longer carries the 50K hidden value.
+    ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000));
+    // ApproximateOffsetOf() is not yet implemented in plain table format,
+    // which is used by Size().
+    // skip HashCuckooRep as it does not support snapshot
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable |
+                         kSkipHashCuckoo));
+}
+
+TEST(DBTest, CompactBetweenSnapshots) {
+  do {
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"});
+    Random rnd(301);
+    FillLevels("a", "z", 1);
+
+    Put(1, "foo", "first");
+    const Snapshot* snapshot1 = db_->GetSnapshot();
+    Put(1, "foo", "second");
+    Put(1, "foo", "third");
+    Put(1, "foo", "fourth");
+    const Snapshot* snapshot2 = db_->GetSnapshot();
+    Put(1, "foo", "fifth");
+    Put(1, "foo", "sixth");
+
+    // All entries (including duplicates) exist
+    // before any compaction is triggered.
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ("sixth", Get(1, "foo"));
+    ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+    ASSERT_EQ("first", Get(1, "foo", snapshot1));
+    ASSERT_EQ(AllEntriesFor("foo", 1),
+              "[ sixth, fifth, fourth, third, second, first ]");
+
+    // After a compaction, "second", "third" and "fifth" should
+    // be removed
+    FillLevels("a", "z", 1);
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ("sixth", Get(1, "foo"));
+    ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+    ASSERT_EQ("first", Get(1, "foo", snapshot1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]");
+
+    // after we release the snapshot1, only two values left
+    db_->ReleaseSnapshot(snapshot1);
+    FillLevels("a", "z", 1);
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+
+    // We have only one valid snapshot snapshot2. Since snapshot1 is
+    // not valid anymore, "first" should be removed by a compaction.
+    ASSERT_EQ("sixth", Get(1, "foo"));
+    ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]");
+
+    // after we release the snapshot2, only one value should be left
+    db_->ReleaseSnapshot(snapshot2);
+    FillLevels("a", "z", 1);
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ("sixth", Get(1, "foo"));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]");
+    // skip HashCuckooRep as it does not support snapshot
+  } while (ChangeOptions(kSkipHashCuckoo));
+}
+
+TEST(DBTest, DeletionMarkers1) {
+  // Checks when a deletion marker can be dropped during compaction: the
+  // DEL may go once a newer value (v2) hides it, but v1 survives until
+  // its own level is compacted into the base level for "foo".
+  CreateAndReopenWithCF({"pikachu"});
+  Put(1, "foo", "v1");
+  ASSERT_OK(Flush(1));
+  const int last = CurrentOptions().max_mem_compaction_level;
+  // foo => v1 is now in last level
+  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+
+  // Place a table at level last-1 to prevent merging with preceding mutation
+  Put(1, "a", "begin");
+  Put(1, "z", "end");
+  Flush(1);
+  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
+
+  Delete(1, "foo");
+  Put(1, "foo", "v2");
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+  ASSERT_OK(Flush(1));  // Moves to level last-2
+  if (CurrentOptions().purge_redundant_kvs_while_flush) {
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+  } else {
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+  }
+  Slice z("z");
+  dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]);
+  // DEL eliminated, but v1 remains because we aren't compacting that level
+  // (DEL can be eliminated because v2 hides v1).
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+  dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]);
+  // Merging last-1 w/ last, so we are the base level for "foo", so
+  // DEL is removed.  (as is v1).
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
+}
+
+TEST(DBTest, DeletionMarkers2) {
+  // Like DeletionMarkers1, but with no newer value hiding the deletion:
+  // the DEL must be kept while an overlapping file exists at a deeper
+  // level, and both DEL and v1 vanish once compacted into the base level.
+  CreateAndReopenWithCF({"pikachu"});
+  Put(1, "foo", "v1");
+  ASSERT_OK(Flush(1));
+  const int last = CurrentOptions().max_mem_compaction_level;
+  // foo => v1 is now in last level
+  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+
+  // Place a table at level last-1 to prevent merging with preceding mutation
+  Put(1, "a", "begin");
+  Put(1, "z", "end");
+  Flush(1);
+  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
+
+  Delete(1, "foo");
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+  ASSERT_OK(Flush(1));  // Moves to level last-2
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+  dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]);
+  // DEL kept: "last" file overlaps
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+  dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]);
+  // Merging last-1 w/ last, so we are the base level for "foo", so
+  // DEL is removed.  (as is v1).
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+}
+
+TEST(DBTest, OverlapInLevel0) {
+  // Regression test: a memtable flush must detect overlap with existing
+  // L0 files so a deletion is not pushed below the data it deletes.
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    int tmp = CurrentOptions().max_mem_compaction_level;
+    ASSERT_EQ(tmp, 2) << "Fix test to match config";
+
+    //Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
+    ASSERT_OK(Put(1, "100", "v100"));
+    ASSERT_OK(Put(1, "999", "v999"));
+    Flush(1);
+    ASSERT_OK(Delete(1, "100"));
+    ASSERT_OK(Delete(1, "999"));
+    Flush(1);
+    ASSERT_EQ("0,1,1", FilesPerLevel(1));
+
+    // Make files spanning the following ranges in level-0:
+    //  files[0]  200 .. 900
+    //  files[1]  300 .. 500
+    // Note that files are sorted by smallest key.
+    ASSERT_OK(Put(1, "300", "v300"));
+    ASSERT_OK(Put(1, "500", "v500"));
+    Flush(1);
+    ASSERT_OK(Put(1, "200", "v200"));
+    ASSERT_OK(Put(1, "600", "v600"));
+    ASSERT_OK(Put(1, "900", "v900"));
+    Flush(1);
+    ASSERT_EQ("2,1,1", FilesPerLevel(1));
+
+    // Compact away the placeholder files we created initially
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]);
+    ASSERT_EQ("2", FilesPerLevel(1));
+
+    // Do a memtable compaction.  Before bug-fix, the compaction would
+    // not detect the overlap with level-0 files and would incorrectly place
+    // the deletion in a deeper level.
+    ASSERT_OK(Delete(1, "600"));
+    Flush(1);
+    ASSERT_EQ("3", FilesPerLevel(1));
+    ASSERT_EQ("NOT_FOUND", Get(1, "600"));
+  } while (ChangeOptions(kSkipUniversalCompaction));
+}
+
+// Regression test for LevelDB issue #44 (part a): interleaved deletes, puts
+// and reopens must not let an L0 compaction resurrect or lose a key.
+TEST(DBTest, L0_CompactionBug_Issue44_a) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "b", "v"));
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_OK(Delete(1, "b"));
+    ASSERT_OK(Delete(1, "a"));
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_OK(Delete(1, "a"));
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_OK(Put(1, "a", "v"));
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_EQ("(a->v)", Contents(1));
+    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+    ASSERT_EQ("(a->v)", Contents(1));
+  } while (ChangeCompactOptions());
+}
+
+// Regression test for LevelDB issue #44 (part b): same class of bug with the
+// empty key and more reopen/compaction interleavings.
+TEST(DBTest, L0_CompactionBug_Issue44_b) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    Put(1, "", "");
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    Delete(1, "e");
+    Put(1, "", "");
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    Put(1, "c", "cv");
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    Put(1, "", "");
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    Put(1, "", "");
+    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    Put(1, "d", "dv");
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    Put(1, "", "");
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    Delete(1, "d");
+    Delete(1, "b");
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_EQ("(->)(c->cv)", Contents(1));
+    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+    ASSERT_EQ("(->)(c->cv)", Contents(1));
+  } while (ChangeCompactOptions());
+}
+
+// Opening a column family with a comparator whose Name() differs from the one
+// the CF was created with must fail, and the error must mention "comparator".
+TEST(DBTest, ComparatorCheck) {
+  class NewComparator : public Comparator {
+   public:
+    virtual const char* Name() const { return "rocksdb.NewComparator"; }
+    virtual int Compare(const Slice& a, const Slice& b) const {
+      return BytewiseComparator()->Compare(a, b);
+    }
+    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+      BytewiseComparator()->FindShortestSeparator(s, l);
+    }
+    virtual void FindShortSuccessor(std::string* key) const {
+      BytewiseComparator()->FindShortSuccessor(key);
+    }
+  };
+  Options new_options, options;
+  NewComparator cmp;
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    options = CurrentOptions();
+    new_options = CurrentOptions();
+    new_options.comparator = &cmp;
+    // only the non-default column family has non-matching comparator
+    Status s = TryReopenWithColumnFamilies({"default", "pikachu"},
+                                           {&options, &new_options});
+    ASSERT_TRUE(!s.ok());
+    ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
+        << s.ToString();
+  } while (ChangeCompactOptions(&new_options));
+}
+
+// Exercises a user-defined comparator that orders keys numerically ("[10]"
+// and "[0xa]" compare equal), verifying reads, compactions and key-shortening
+// hooks all go through the custom comparator.
+TEST(DBTest, CustomComparator) {
+  class NumberComparator : public Comparator {
+   public:
+    virtual const char* Name() const { return "test.NumberComparator"; }
+    virtual int Compare(const Slice& a, const Slice& b) const {
+      return ToNumber(a) - ToNumber(b);
+    }
+    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+      ToNumber(*s);     // Check format
+      ToNumber(l);      // Check format
+    }
+    virtual void FindShortSuccessor(std::string* key) const {
+      ToNumber(*key);   // Check format
+    }
+   private:
+    static int ToNumber(const Slice& x) {
+      // Check that there are no extra characters.
+      ASSERT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size()-1] == ']')
+          << EscapeString(x);
+      int val;
+      char ignored;
+      // sscanf must match the number but NOT the trailing %c: a return of 1
+      // means nothing followed the closing ']'.
+      ASSERT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
+          << EscapeString(x);
+      return val;
+    }
+  };
+  Options new_options;
+  NumberComparator cmp;
+  do {
+    new_options = CurrentOptions();
+    new_options.create_if_missing = true;
+    new_options.comparator = &cmp;
+    new_options.filter_policy = nullptr;     // Cannot use bloom filters
+    new_options.write_buffer_size = 1000;  // Compact more often
+    new_options = CurrentOptions(new_options);
+    DestroyAndReopen(&new_options);
+    CreateAndReopenWithCF({"pikachu"}, &new_options);
+    ASSERT_OK(Put(1, "[10]", "ten"));
+    ASSERT_OK(Put(1, "[0x14]", "twenty"));
+    for (int i = 0; i < 2; i++) {
+      ASSERT_EQ("ten", Get(1, "[10]"));
+      ASSERT_EQ("ten", Get(1, "[0xa]"));
+      ASSERT_EQ("twenty", Get(1, "[20]"));
+      ASSERT_EQ("twenty", Get(1, "[0x14]"));
+      ASSERT_EQ("NOT_FOUND", Get(1, "[15]"));
+      ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]"));
+      Compact(1, "[0]", "[9999]");
+    }
+
+    for (int run = 0; run < 2; run++) {
+      for (int i = 0; i < 1000; i++) {
+        char buf[100];
+        snprintf(buf, sizeof(buf), "[%d]", i*10);
+        ASSERT_OK(Put(1, buf, buf));
+      }
+      Compact(1, "[0]", "[1000000]");
+    }
+  } while (ChangeCompactOptions(&new_options));
+}
+
+// Manual CompactRange behavior: ranges that miss all files are no-ops, ranges
+// that overlap move the data down, and a null-to-null range compacts all.
+TEST(DBTest, ManualCompaction) {
+  CreateAndReopenWithCF({"pikachu"});
+  ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2)
+      << "Need to update this test to match kMaxMemCompactLevel";
+
+  // iter - 0 with 7 levels
+  // iter - 1 with 3 levels
+  for (int iter = 0; iter < 2; ++iter) {
+    MakeTables(3, "p", "q", 1);
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range falls before files
+    Compact(1, "", "c");
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range falls after files
+    Compact(1, "r", "z");
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range overlaps files
+    Compact(1, "p1", "p9");
+    ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+    // Populate a different range
+    MakeTables(3, "c", "e", 1);
+    ASSERT_EQ("1,1,2", FilesPerLevel(1));
+
+    // Compact just the new range
+    Compact(1, "b", "f");
+    ASSERT_EQ("0,0,2", FilesPerLevel(1));
+
+    // Compact all
+    MakeTables(1, "a", "z", 1);
+    ASSERT_EQ("0,1,2", FilesPerLevel(1));
+    db_->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+    // Re-run the second iteration with a shallower (3-level) LSM shape.
+    if (iter == 0) {
+      Options options = CurrentOptions();
+      options.num_levels = 3;
+      options.create_if_missing = true;
+      DestroyAndReopen(&options);
+      CreateAndReopenWithCF({"pikachu"}, &options);
+    }
+  }
+
+}
+
+// DB::Open honors create_if_missing and error_if_exists in all four
+// combinations against a missing/existing database directory.
+TEST(DBTest, DBOpen_Options) {
+  std::string dbname = test::TmpDir() + "/db_options_test";
+  ASSERT_OK(DestroyDB(dbname, Options()));
+
+  // Does not exist, and create_if_missing == false: error
+  DB* db = nullptr;
+  Options opts;
+  opts.create_if_missing = false;
+  Status s = DB::Open(opts, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+  ASSERT_TRUE(db == nullptr);
+
+  // Does not exist, and create_if_missing == true: OK
+  opts.create_if_missing = true;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  delete db;
+  db = nullptr;
+
+  // Does exist, and error_if_exists == true: error
+  opts.create_if_missing = false;
+  opts.error_if_exists = true;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+  ASSERT_TRUE(db == nullptr);
+
+  // Does exist, and error_if_exists == false: OK
+  opts.create_if_missing = true;
+  opts.error_if_exists = false;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  delete db;
+  db = nullptr;
+}
+
+// Reopening an existing DB with a different num_levels must be rejected with
+// an Invalid argument error rather than silently reinterpreting the LSM.
+TEST(DBTest, DBOpen_Change_NumLevels) {
+  Options opts;
+  opts.create_if_missing = true;
+  DestroyAndReopen(&opts);
+  ASSERT_TRUE(db_ != nullptr);
+  CreateAndReopenWithCF({"pikachu"}, &opts);
+
+  ASSERT_OK(Put(1, "a", "123"));
+  ASSERT_OK(Put(1, "b", "234"));
+  db_->CompactRange(handles_[1], nullptr, nullptr);
+  Close();
+
+  opts.create_if_missing = false;
+  opts.num_levels = 2;
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &opts);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr);
+  ASSERT_TRUE(db_ == nullptr);
+}
+
+// DestroyDB on a database must also destroy its meta-database chain
+// (MetaDatabaseName(db, 0), and the meta-database's own meta-database).
+TEST(DBTest, DestroyDBMetaDatabase) {
+  std::string dbname = test::TmpDir() + "/db_meta";
+  std::string metadbname = MetaDatabaseName(dbname, 0);
+  std::string metametadbname = MetaDatabaseName(metadbname, 0);
+
+  // Destroy previous versions if they exist. Using the long way.
+  ASSERT_OK(DestroyDB(metametadbname, Options()));
+  ASSERT_OK(DestroyDB(metadbname, Options()));
+  ASSERT_OK(DestroyDB(dbname, Options()));
+
+  // Setup databases
+  Options opts;
+  opts.create_if_missing = true;
+  DB* db = nullptr;
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+  delete db;
+  db = nullptr;
+  ASSERT_OK(DB::Open(opts, metadbname, &db));
+  delete db;
+  db = nullptr;
+  ASSERT_OK(DB::Open(opts, metametadbname, &db));
+  delete db;
+  db = nullptr;
+
+  // Delete databases
+  ASSERT_OK(DestroyDB(dbname, Options()));
+
+  // Check if deletion worked.
+  opts.create_if_missing = false;
+  ASSERT_TRUE(!(DB::Open(opts, dbname, &db)).ok());
+  ASSERT_TRUE(!(DB::Open(opts, metadbname, &db)).ok());
+  ASSERT_TRUE(!(DB::Open(opts, metametadbname, &db)).ok());
+}
+
+// Check that number of files does not grow when we are out of space
+// (SpecialEnv::no_space_ simulates ENOSPC on writes), that the background
+// error counter is bumped, and that failed compactions back off via sleep.
+TEST(DBTest, NoSpace) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.paranoid_checks = false;
+    Reopen(&options);
+
+    ASSERT_OK(Put("foo", "v1"));
+    ASSERT_EQ("v1", Get("foo"));
+    Compact("a", "z");
+    const int num_files = CountFiles();
+    env_->no_space_.Release_Store(env_);   // Force out-of-space errors
+    env_->sleep_counter_.Reset();
+    for (int i = 0; i < 5; i++) {
+      for (int level = 0; level < dbfull()->NumberLevels()-1; level++) {
+        dbfull()->TEST_CompactRange(level, nullptr, nullptr);
+      }
+    }
+
+    std::string property_value;
+    ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+    ASSERT_EQ("5", property_value);
+
+    env_->no_space_.Release_Store(nullptr);
+    ASSERT_LT(CountFiles(), num_files + 3);
+
+    // Check that compaction attempts slept after errors
+    ASSERT_GE(env_->sleep_counter_.Read(), 5);
+  } while (ChangeCompactOptions());
+}
+
+// Check background error counter bumped on flush failures.
+TEST(DBTest, NoSpaceFlush) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.max_background_flushes = 1;
+    Reopen(&options);
+
+    ASSERT_OK(Put("foo", "v1"));
+    env_->no_space_.Release_Store(env_);  // Force out-of-space errors
+
+    std::string property_value;
+    // Background error count is 0 now.
+    ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+    ASSERT_EQ("0", property_value);
+
+    // Non-blocking flush: the failure is observed asynchronously below.
+    dbfull()->TEST_FlushMemTable(false);
+
+    // Wait 300 milliseconds or background-errors turned 1 from 0.
+    int time_to_sleep_limit = 300000;
+    while (time_to_sleep_limit > 0) {
+      int to_sleep = (time_to_sleep_limit > 1000) ? 1000 : time_to_sleep_limit;
+      time_to_sleep_limit -= to_sleep;
+      env_->SleepForMicroseconds(to_sleep);
+
+      ASSERT_TRUE(
+          db_->GetProperty("rocksdb.background-errors", &property_value));
+      if (property_value == "1") {
+        break;
+      }
+    }
+    ASSERT_EQ("1", property_value);
+
+    env_->no_space_.Release_Store(nullptr);
+  } while (ChangeCompactOptions());
+}
+
+// When the env refuses to create new files, writes must start failing once
+// the memtable needs to be flushed, and the DB must survive (no crash).
+TEST(DBTest, NonWritableFileSystem) {
+  do {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 1000;
+    options.env = env_;
+    Reopen(&options);
+    ASSERT_OK(Put("foo", "v1"));
+    env_->non_writable_.Release_Store(env_); // Force errors for new files
+    std::string big(100000, 'x');
+    int errors = 0;
+    for (int i = 0; i < 20; i++) {
+      if (!Put("foo", big).ok()) {
+        errors++;
+        env_->SleepForMicroseconds(100000);
+      }
+    }
+    ASSERT_GT(errors, 0);
+    env_->non_writable_.Release_Store(nullptr);
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, ManifestWriteError) {
+  // Test for the following problem:
+  // (a) Compaction produces file F
+  // (b) Log record containing F is written to MANIFEST file, but Sync() fails
+  // (c) GC deletes F
+  // (d) After reopening DB, reads fail since deleted F is named in log record
+
+  // We iterate twice.  In the second iteration, everything is the
+  // same except the log record never makes it to the MANIFEST file.
+  for (int iter = 0; iter < 2; iter++) {
+    // Select which failure to inject: MANIFEST sync (iter 0) vs write (iter 1).
+    port::AtomicPointer* error_type = (iter == 0)
+        ? &env_->manifest_sync_error_
+        : &env_->manifest_write_error_;
+
+    // Insert foo=>bar mapping
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.create_if_missing = true;
+    options.error_if_exists = false;
+    DestroyAndReopen(&options);
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_EQ("bar", Get("foo"));
+
+    // Memtable compaction (will succeed)
+    Flush();
+    ASSERT_EQ("bar", Get("foo"));
+    const int last = dbfull()->MaxMemCompactionLevel();
+    ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo=>bar is now in last level
+
+    // Merging compaction (will fail)
+    error_type->Release_Store(env_);
+    dbfull()->TEST_CompactRange(last, nullptr, nullptr);  // Should fail
+    ASSERT_EQ("bar", Get("foo"));
+
+    // Recovery: should not lose data
+    error_type->Release_Store(nullptr);
+    Reopen(&options);
+    ASSERT_EQ("bar", Get("foo"));
+  }
+}
+
+TEST(DBTest, PutFailsParanoid) {
+  // Test the following:
+  // (a) A random put fails in paranoid mode (simulate by sync fail)
+  // (b) All other puts have to fail, even if writes would succeed
+  // (c) All of that should happen ONLY if paranoid_checks = true
+
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  options.paranoid_checks = true;
+  DestroyAndReopen(&options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+  Status s;
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "foo1", "bar1"));
+  // simulate error
+  env_->log_write_error_.Release_Store(env_);
+  s = Put(1, "foo2", "bar2");
+  ASSERT_TRUE(!s.ok());
+  env_->log_write_error_.Release_Store(nullptr);
+  s = Put(1, "foo3", "bar3");
+  // the next put should fail, too, because paranoid mode has latched the
+  // earlier background error even though the injected fault is gone
+  ASSERT_TRUE(!s.ok());
+  // but we're still able to read
+  ASSERT_EQ("bar", Get(1, "foo"));
+
+  // do the same thing with paranoid checks off
+  options.paranoid_checks = false;
+  DestroyAndReopen(&options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "foo1", "bar1"));
+  // simulate error
+  env_->log_write_error_.Release_Store(env_);
+  s = Put(1, "foo2", "bar2");
+  ASSERT_TRUE(!s.ok());
+  env_->log_write_error_.Release_Store(nullptr);
+  s = Put(1, "foo3", "bar3");
+  // the next put should NOT fail
+  ASSERT_TRUE(s.ok());
+}
+
+// Obsolete SST files must be purged after each compaction, so repeatedly
+// overwriting one key and compacting keeps the live-file count constant.
+TEST(DBTest, FilesDeletedAfterCompaction) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "foo", "v2"));
+    Compact(1, "a", "z");
+    // Live-file count after the first overwrite+compact is the baseline.
+    const int baseline_live_files = CountLiveFiles();
+    for (int round = 0; round < 10; ++round) {
+      ASSERT_OK(Put(1, "foo", "v2"));
+      Compact(1, "a", "z");
+    }
+    ASSERT_EQ(CountLiveFiles(), baseline_live_files);
+  } while (ChangeCompactOptions());
+}
+
+// Bloom filters should cut random reads: present keys need ~1 read each,
+// missing keys should almost never touch an sstable.
+TEST(DBTest, BloomFilter) {
+  do {
+    env_->count_random_reads_ = true;
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.no_block_cache = true;
+    options.filter_policy = NewBloomFilterPolicy(10);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Populate multiple layers
+    const int N = 10000;
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    Compact(1, "a", "z");
+    for (int i = 0; i < N; i += 100) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    Flush(1);
+
+    // Prevent auto compactions triggered by seeks
+    env_->delay_sstable_sync_.Release_Store(env_);
+
+    // Lookup present keys.  Should rarely read from small sstable.
+    env_->random_read_counter_.Reset();
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i), Get(1, Key(i)));
+    }
+    int reads = env_->random_read_counter_.Read();
+    fprintf(stderr, "%d present => %d reads\n", N, reads);
+    ASSERT_GE(reads, N);
+    ASSERT_LE(reads, N + 2*N/100);
+
+    // Lookup missing keys.  Should rarely read from either sstable.
+    env_->random_read_counter_.Reset();
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing"));
+    }
+    reads = env_->random_read_counter_.Read();
+    fprintf(stderr, "%d missing => %d reads\n", N, reads);
+    ASSERT_LE(reads, 3*N/100);
+
+    env_->delay_sstable_sync_.Release_Store(nullptr);
+    Close();
+    delete options.filter_policy;
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, SnapshotFiles) {
+  do {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 100000000;        // Large write buffer
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    Random rnd(301);
+
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    std::vector<std::string> values;
+    for (int i = 0; i < 80; i++) {
+      values.push_back(RandomString(&rnd, 100000));
+      ASSERT_OK(Put((i < 40), Key(i), values[i]));
+    }
+
+    // assert that nothing makes it to disk yet.
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+
+    // get a file snapshot
+    uint64_t manifest_number = 0;
+    uint64_t manifest_size = 0;
+    std::vector<std::string> files;
+    dbfull()->DisableFileDeletions();
+    dbfull()->GetLiveFiles(files, &manifest_size);
+
+    // CURRENT, MANIFEST, *.sst files (one for each CF)
+    ASSERT_EQ(files.size(), 4U);
+
+    uint64_t number = 0;
+    FileType type;
+
+    // copy these files to a new snapshot directory
+    std::string snapdir = dbname_ + ".snapdir/";
+    std::string mkdir = "mkdir -p " + snapdir;
+    ASSERT_EQ(system(mkdir.c_str()), 0);
+
+    for (unsigned int i = 0; i < files.size(); i++) {
+      // our clients require that GetLiveFiles returns
+      // files with "/" as first character!
+      ASSERT_EQ(files[i][0], '/');
+      std::string src = dbname_ + files[i];
+      std::string dest = snapdir + files[i];
+
+      uint64_t size;
+      ASSERT_OK(env_->GetFileSize(src, &size));
+
+      // record the number and the size of the
+      // latest manifest file
+      if (ParseFileName(files[i].substr(1), &number, &type)) {
+        if (type == kDescriptorFile) {
+          if (number > manifest_number) {
+            manifest_number = number;
+            ASSERT_GE(size, manifest_size);
+            size = manifest_size; // copy only valid MANIFEST data
+          }
+        }
+      }
+      CopyFile(src, dest, size);
+    }
+
+    // release file snapshot
+    dbfull()->DisableFileDeletions();
+
+    // overwrite one key, this key should not appear in the snapshot
+    std::vector<std::string> extras;
+    for (unsigned int i = 0; i < 1; i++) {
+      extras.push_back(RandomString(&rnd, 100000));
+      ASSERT_OK(Put(0, Key(i), extras[i]));
+    }
+
+    // verify that data in the snapshot are correct
+    std::vector<ColumnFamilyDescriptor> column_families;
+    column_families.emplace_back("default", ColumnFamilyOptions());
+    column_families.emplace_back("pikachu", ColumnFamilyOptions());
+    std::vector<ColumnFamilyHandle*> cf_handles;
+    DB* snapdb;
+    DBOptions opts;
+    opts.create_if_missing = false;
+    Status stat =
+        DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb);
+    ASSERT_OK(stat);
+
+    ReadOptions roptions;
+    std::string val;
+    for (unsigned int i = 0; i < 80; i++) {
+      stat = snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val);
+      ASSERT_EQ(values[i].compare(val), 0);
+    }
+    for (auto cfh : cf_handles) {
+      delete cfh;
+    }
+    delete snapdb;
+
+    // look at the new live files after we added an 'extra' key
+    // and after we took the first snapshot.
+    uint64_t new_manifest_number = 0;
+    uint64_t new_manifest_size = 0;
+    std::vector<std::string> newfiles;
+    dbfull()->DisableFileDeletions();
+    dbfull()->GetLiveFiles(newfiles, &new_manifest_size);
+
+    // find the new manifest file. assert that this manifest file is
+    // the same one as in the previous snapshot. But its size should be
+    // larger because we added an extra key after taking the
+    // previous shapshot.
+    for (unsigned int i = 0; i < newfiles.size(); i++) {
+      std::string src = dbname_ + "/" + newfiles[i];
+      // record the lognumber and the size of the
+      // latest manifest file
+      if (ParseFileName(newfiles[i].substr(1), &number, &type)) {
+        if (type == kDescriptorFile) {
+          if (number > new_manifest_number) {
+            uint64_t size;
+            new_manifest_number = number;
+            ASSERT_OK(env_->GetFileSize(src, &size));
+            ASSERT_GE(size, new_manifest_size);
+          }
+        }
+      }
+    }
+    ASSERT_EQ(manifest_number, new_manifest_number);
+    ASSERT_GT(new_manifest_size, manifest_size);
+
+    // release file snapshot
+    dbfull()->DisableFileDeletions();
+  } while (ChangeCompactOptions());
+}
+
+// With purge_redundant_kvs_while_flush, a flush collapses redundant entries
+// for a key (subject to live snapshots); full compaction then drops the rest.
+TEST(DBTest, CompactOnFlush) {
+  do {
+    Options options = CurrentOptions();
+    options.purge_redundant_kvs_while_flush = true;
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    Put(1, "foo", "v1");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]");
+
+    // Write two new keys
+    Put(1, "a", "begin");
+    Put(1, "z", "end");
+    Flush(1);
+
+    // Case1: Delete followed by a put
+    Delete(1, "foo");
+    Put(1, "foo", "v2");
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+
+    // After the current memtable is flushed, the DEL should
+    // have been removed
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
+
+    // Case 2: Delete followed by another delete
+    Delete(1, "foo");
+    Delete(1, "foo");
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]");
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+    // Case 3: Put followed by a delete
+    Put(1, "foo", "v3");
+    Delete(1, "foo");
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]");
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+    // Case 4: Put followed by another Put
+    Put(1, "foo", "v4");
+    Put(1, "foo", "v5");
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+
+    // clear database
+    Delete(1, "foo");
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+    // Case 5: Put followed by snapshot followed by another Put
+    // Both puts should remain.
+    Put(1, "foo", "v6");
+    const Snapshot* snapshot = db_->GetSnapshot();
+    Put(1, "foo", "v7");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]");
+    db_->ReleaseSnapshot(snapshot);
+
+    // clear database
+    Delete(1, "foo");
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+    // Case 6: snapshot followed by a put followed by another Put
+    // Only the last put should remain.
+    const Snapshot* snapshot1 = db_->GetSnapshot();
+    Put(1, "foo", "v8");
+    Put(1, "foo", "v9");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]");
+    db_->ReleaseSnapshot(snapshot1);
+  } while (ChangeCompactOptions());
+}
+
+namespace {
+std::vector<std::uint64_t> ListLogFiles(Env* env, const std::string& path) {
+  std::vector<std::string> files;
+  std::vector<uint64_t> log_files;
+  env->GetChildren(path, &files);
+  uint64_t number;
+  FileType type;
+  for (size_t i = 0; i < files.size(); ++i) {
+    if (ParseFileName(files[i], &number, &type)) {
+      if (type == kLogFile) {
+        log_files.push_back(number);
+      }
+    }
+  }
+  return std::move(log_files);
+}
+}  // namespace
+
+// NOTE(review): this test relies on real wall-clock sleeps for the TTL
+// expiry, so it is timing-sensitive on slow machines — confirm before
+// tightening the assertions.
+TEST(DBTest, WALArchivalTtl) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.WAL_ttl_seconds = 1000;
+    DestroyAndReopen(&options);
+
+    //  TEST : Create DB with a ttl and no size limit.
+    //  Put some keys. Count the log files present in the DB just after insert.
+    //  Re-open db. Causes deletion/archival to take place.
+    //  Assert that the files moved under "/archive".
+    //  Reopen db with small ttl.
+    //  Assert that archive was removed.
+
+    std::string archiveDir = ArchivalDirectory(dbname_);
+
+    for (int i = 0; i < 10; ++i) {
+      for (int j = 0; j < 10; ++j) {
+        ASSERT_OK(Put(Key(10 * i + j), DummyString(1024)));
+      }
+
+      std::vector<uint64_t> log_files = ListLogFiles(env_, dbname_);
+
+      options.create_if_missing = false;
+      Reopen(&options);
+
+      std::vector<uint64_t> logs = ListLogFiles(env_, archiveDir);
+      std::set<uint64_t> archivedFiles(logs.begin(), logs.end());
+
+      // Every WAL that was live before the reopen must now be archived.
+      for (auto& log : log_files) {
+        ASSERT_TRUE(archivedFiles.find(log) != archivedFiles.end());
+      }
+    }
+
+    std::vector<uint64_t> log_files = ListLogFiles(env_, archiveDir);
+    ASSERT_TRUE(log_files.size() > 0);
+
+    options.WAL_ttl_seconds = 1;
+    env_->SleepForMicroseconds(2 * 1000 * 1000);
+    Reopen(&options);
+
+    log_files = ListLogFiles(env_, archiveDir);
+    ASSERT_TRUE(log_files.empty());
+  } while (ChangeCompactOptions());
+}
+
+namespace {
+uint64_t GetLogDirSize(std::string dir_path, SpecialEnv* env) {
+  uint64_t dir_size = 0;
+  std::vector<std::string> files;
+  env->GetChildren(dir_path, &files);
+  for (auto& f : files) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(f, &number, &type) && type == kLogFile) {
+      std::string const file_path = dir_path + "/" + f;
+      uint64_t file_size;
+      env->GetFileSize(file_path, &file_size);
+      dir_size += file_size;
+    }
+  }
+  return dir_size;
+}
+}  // namespace
+
+// Archived WALs are trimmed by total size when WAL_size_limit_MB is set, and
+// fully purged once a TTL kicks in. (TEST_PurgeObsoleteteWAL is the actual
+// — misspelled — name of the test hook in DBImpl.)
+TEST(DBTest, WALArchivalSizeLimit) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.WAL_ttl_seconds = 0;
+    options.WAL_size_limit_MB = 1000;
+
+    // TEST : Create DB with huge size limit and no ttl.
+    // Put some keys. Count the archived log files present in the DB
+    // just after insert. Assert that there are many enough.
+    // Change size limit. Re-open db.
+    // Assert that archive is not greater than WAL_size_limit_MB.
+    // Set ttl and time_to_check_ to small values. Re-open db.
+    // Assert that there are no archived logs left.
+
+    DestroyAndReopen(&options);
+    for (int i = 0; i < 128 * 128; ++i) {
+      ASSERT_OK(Put(Key(i), DummyString(1024)));
+    }
+    Reopen(&options);
+
+    std::string archive_dir = ArchivalDirectory(dbname_);
+    std::vector<std::uint64_t> log_files = ListLogFiles(env_, archive_dir);
+    ASSERT_TRUE(log_files.size() > 2);
+
+    options.WAL_size_limit_MB = 8;
+    Reopen(&options);
+    dbfull()->TEST_PurgeObsoleteteWAL();
+
+    uint64_t archive_size = GetLogDirSize(archive_dir, env_);
+    ASSERT_TRUE(archive_size <= options.WAL_size_limit_MB * 1024 * 1024);
+
+    options.WAL_ttl_seconds = 1;
+    dbfull()->TEST_SetDefaultTimeToCheck(1);
+    env_->SleepForMicroseconds(2 * 1000 * 1000);
+    Reopen(&options);
+    dbfull()->TEST_PurgeObsoleteteWAL();
+
+    log_files = ListLogFiles(env_, archive_dir);
+    ASSERT_TRUE(log_files.empty());
+  } while (ChangeCompactOptions());
+}
+
+namespace {
+// Drains `iter`, asserting that batch sequence numbers are strictly
+// increasing. Stores the number of batches read into `count` and returns the
+// sequence of the last batch. NOTE(review): if the iterator is invalid on
+// entry, the returned BatchResult::sequence is whatever a default-constructed
+// BatchResult holds — callers should not rely on it in that case.
+SequenceNumber ReadRecords(
+    std::unique_ptr<TransactionLogIterator>& iter,
+    int& count) {
+  count = 0;
+  SequenceNumber lastSequence = 0;
+  BatchResult res;
+  while (iter->Valid()) {
+    res = iter->GetBatch();
+    ASSERT_TRUE(res.sequence > lastSequence);
+    ++count;
+    lastSequence = res.sequence;
+    ASSERT_OK(iter->status());
+    iter->Next();
+  }
+  return res.sequence;
+}
+
+// Asserts that draining `iter` yields exactly `expected_no_records` batches.
+void ExpectRecords(
+    const int expected_no_records,
+    std::unique_ptr<TransactionLogIterator>& iter) {
+  int num_records;
+  ReadRecords(iter, num_records);
+  ASSERT_EQ(num_records, expected_no_records);
+}
+}  // namespace
+
+// The transaction-log iterator must replay every write batch from sequence 0,
+// across column families and across a reopen.
+TEST(DBTest, TransactionLogIterator) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+    Put(0, "key1", DummyString(1024));
+    Put(1, "key2", DummyString(1024));
+    Put(1, "key2", DummyString(1024));
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U);
+    {
+      auto iter = OpenTransactionLogIter(0);
+      ExpectRecords(3, iter);
+    }
+    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    env_->SleepForMicroseconds(2 * 1000 * 1000);
+    {
+      Put(0, "key4", DummyString(1024));
+      Put(1, "key5", DummyString(1024));
+      Put(0, "key6", DummyString(1024));
+    }
+    {
+      auto iter = OpenTransactionLogIter(0);
+      ExpectRecords(6, iter);
+    }
+  } while (ChangeCompactOptions());
+}
+
+#ifndef NDEBUG // sync point is not included with -DNDEBUG build
+TEST(DBTest, TransactionLogIteratorRace) {
+  // Setup sync point dependency to reproduce the race condition of
+  // a log file moved to archived dir, in the middle of GetSortedWalFiles
+  rocksdb::SyncPoint::GetInstance()->LoadDependency(
+    { { "DBImpl::GetSortedWalFiles:1", "DBImpl::PurgeObsoleteFiles:1" },
+      { "DBImpl::PurgeObsoleteFiles:2", "DBImpl::GetSortedWalFiles:2" },
+    });
+
+  do {
+    // Reset sync-point state so a previous loop iteration cannot leak into
+    // this one.
+    rocksdb::SyncPoint::GetInstance()->ClearTrace();
+    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    Put("key1", DummyString(1024));
+    dbfull()->Flush(FlushOptions());
+    Put("key2", DummyString(1024));
+    dbfull()->Flush(FlushOptions());
+    Put("key3", DummyString(1024));
+    dbfull()->Flush(FlushOptions());
+    Put("key4", DummyString(1024));
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U);
+
+    {
+      auto iter = OpenTransactionLogIter(0);
+      ExpectRecords(4, iter);
+    }
+
+    rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+    // trigger async flush, and log move. Well, log move will
+    // wait until the GetSortedWalFiles:1 to reproduce the race
+    // condition
+    FlushOptions flush_options;
+    flush_options.wait = false;
+    dbfull()->Flush(flush_options);
+
+    // "key5" would be written in a new memtable and log
+    Put("key5", DummyString(1024));
+    {
+      // this iter would miss "key4" if not fixed
+      auto iter = OpenTransactionLogIter(0);
+      ExpectRecords(5, iter);
+    }
+  } while (ChangeCompactOptions());
+}
+#endif
+
+// Verifies the iterator transparently skips a zero-record WAL file
+// (created by reopening twice without writes) and still returns both
+// real records.
+TEST(DBTest, TransactionLogIteratorMoveOverZeroFiles) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+    // Do a plain Reopen.
+    Put(1, "key1", DummyString(1024));
+    // Two reopens should create a zero record WAL file.
+    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+
+    Put(1, "key2", DummyString(1024));
+
+    auto iter = OpenTransactionLogIter(0);
+    ExpectRecords(2, iter);
+  } while (ChangeCompactOptions());
+}
+
+// After consuming the last record the iterator becomes !Valid() with
+// OK status; a subsequent write followed by Next() must make it valid
+// again (the iterator picks up newly appended WAL records).
+TEST(DBTest, TransactionLogIteratorStallAtLastRecord) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    Put("key1", DummyString(1024));
+    auto iter = OpenTransactionLogIter(0);
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_OK(iter->status());
+    Put("key2", DummyString(1024));
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+  } while (ChangeCompactOptions());
+}
+
+// On a freshly created DB with no writes, GetUpdatesSince(0) should
+// hand back an iterator that is immediately !Valid().
+TEST(DBTest, TransactionLogIteratorJustEmptyFile) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    unique_ptr<TransactionLogIterator> iter;
+    // NOTE(review): `status` is never checked here — presumably OK on
+    // an empty DB; confirm and ASSERT_OK it if so.
+    Status status = dbfull()->GetUpdatesSince(0, &iter);
+    // Check that an empty iterator is returned
+    ASSERT_TRUE(!iter->Valid());
+  } while (ChangeCompactOptions());
+}
+
+// Records written and flushed before a restart must still be
+// replayable from sequence 0 after the DB is reopened.
+TEST(DBTest, TransactionLogIteratorCheckAfterRestart) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    Put("key1", DummyString(1024));
+    Put("key2", DummyString(1023));
+    dbfull()->Flush(FlushOptions());
+    Reopen(&options);
+    auto iter = OpenTransactionLogIter(0);
+    ExpectRecords(2, iter);
+  } while (ChangeCompactOptions());
+}
+
+// Truncates the first WAL file in half to create a sequence-number
+// gap; reading from 0 must stop before the gap (fewer than 1025
+// records), and seeking past the gap must still find the last record.
+TEST(DBTest, TransactionLogIteratorCorruptedLog) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    for (int i = 0; i < 1024; i++) {
+      Put("key"+std::to_string(i), DummyString(10));
+    }
+    dbfull()->Flush(FlushOptions());
+    // Corrupt this log to create a gap
+    rocksdb::VectorLogPtr wal_files;
+    ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+    const auto logfilePath = dbname_ + "/" + wal_files.front()->PathName();
+    ASSERT_EQ(
+      0,
+      truncate(logfilePath.c_str(), wal_files.front()->SizeFileBytes() / 2));
+    // Insert a new entry to a new log file
+    Put("key1025", DummyString(10));
+    // Try to read from the beginning. Should stop before the gap and read less
+    // than 1025 entries
+    auto iter = OpenTransactionLogIter(0);
+    int count;
+    int last_sequence_read = ReadRecords(iter, count);
+    ASSERT_LT(last_sequence_read, 1025);
+    // Try to read past the gap, should be able to seek to key1025
+    auto iter2 = OpenTransactionLogIter(last_sequence_read + 1);
+    ExpectRecords(1, iter2);
+  } while (ChangeCompactOptions());
+}
+
+// A WriteBatch of four operations counts as one record; starting the
+// iterator at sequence 3 should therefore see two records: the tail
+// of the batch and the post-reopen Put.
+TEST(DBTest, TransactionLogIteratorBatchOperations) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+    WriteBatch batch;
+    batch.Put(handles_[1], "key1", DummyString(1024));
+    batch.Put(handles_[0], "key2", DummyString(1024));
+    batch.Put(handles_[1], "key3", DummyString(1024));
+    batch.Delete(handles_[0], "key2");
+    dbfull()->Write(WriteOptions(), &batch);
+    Flush(1);
+    Flush(0);
+    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    Put(1, "key4", DummyString(1024));
+    auto iter = OpenTransactionLogIter(3);
+    ExpectRecords(2, iter);
+  } while (ChangeCompactOptions());
+}
+
+// PutLogData blobs must round-trip through the WAL: the replayed
+// batch is walked with a Handler and the exact operation order,
+// including LogData entries, is compared against the expected trace.
+TEST(DBTest, TransactionLogIteratorBlobs) {
+  Options options = OptionsForLogIterTest();
+  DestroyAndReopen(&options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+  {
+    WriteBatch batch;
+    batch.Put(handles_[1], "key1", DummyString(1024));
+    batch.Put(handles_[0], "key2", DummyString(1024));
+    batch.PutLogData(Slice("blob1"));
+    batch.Put(handles_[1], "key3", DummyString(1024));
+    batch.PutLogData(Slice("blob2"));
+    batch.Delete(handles_[0], "key2");
+    dbfull()->Write(WriteOptions(), &batch);
+    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+  }
+
+  auto res = OpenTransactionLogIter(0)->GetBatch();
+  // Records every callback into `seen` so the full op sequence can be
+  // compared as one string.
+  struct Handler : public WriteBatch::Handler {
+    std::string seen;
+    virtual Status PutCF(uint32_t cf, const Slice& key, const Slice& value) {
+      seen += "Put(" + std::to_string(cf) + ", " + key.ToString() + ", " +
+              std::to_string(value.size()) + ")";
+      return Status::OK();
+    }
+    virtual Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) {
+      seen += "Merge(" + std::to_string(cf) + ", " + key.ToString() + ", " +
+              std::to_string(value.size()) + ")";
+      return Status::OK();
+    }
+    virtual void LogData(const Slice& blob) {
+      seen += "LogData(" + blob.ToString() + ")";
+    }
+    virtual Status DeleteCF(uint32_t cf, const Slice& key) {
+      seen += "Delete(" + std::to_string(cf) + ", " + key.ToString() + ")";
+      return Status::OK();
+    }
+  } handler;
+  res.writeBatchPtr->Iterate(&handler);
+  ASSERT_EQ(
+      "Put(1, key1, 1024)"
+      "Put(0, key2, 1024)"
+      "LogData(blob1)"
+      "Put(1, key3, 1024)"
+      "LogData(blob2)"
+      "Delete(0, key2)",
+      handler.seen);
+}
+
+// The first-record-per-WAL lookup must be cached: the second
+// TEST_ReadFirstRecord call for the same file may not issue another
+// sequential read (verified via the env read counter).
+TEST(DBTest, ReadFirstRecordCache) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  DestroyAndReopen(&options);
+
+  std::string path = dbname_ + "/000001.log";
+  unique_ptr<WritableFile> file;
+  ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions()));
+
+  // Empty log file: first sequence reported as 0.
+  SequenceNumber s;
+  ASSERT_OK(dbfull()->TEST_ReadFirstLine(path, &s));
+  ASSERT_EQ(s, 0U);
+
+  ASSERT_OK(dbfull()->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+  ASSERT_EQ(s, 0U);
+
+  log::Writer writer(std::move(file));
+  WriteBatch batch;
+  batch.Put("foo", "bar");
+  WriteBatchInternal::SetSequence(&batch, 10);
+  writer.AddRecord(WriteBatchInternal::Contents(&batch));
+
+  env_->count_sequential_reads_ = true;
+  // sequential_read_counter_ sanity test
+  ASSERT_EQ(env_->sequential_read_counter_.Read(), 0);
+
+  ASSERT_OK(dbfull()->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+  ASSERT_EQ(s, 10U);
+  // did a read
+  ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+
+  ASSERT_OK(dbfull()->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+  ASSERT_EQ(s, 10U);
+  // no new reads since the value is cached
+  ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+}
+
+// Fills levels 1+ with data, then issues many Gets and expects a
+// seek/read-triggered compaction to reduce the file count in at least
+// one of levels 0-2.
+TEST(DBTest, ReadCompaction) {
+  std::string value(4096, '4'); // a string of size 4K
+  {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.max_open_files = 20; // only 10 file in file-cache
+    options.target_file_size_base = 512;
+    options.write_buffer_size = 64 * 1024;
+    options.filter_policy = nullptr;
+    options.block_size = 4096;
+    options.no_block_cache = true;
+    options.disable_seek_compaction = false;
+
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Write 8MB (2000 values, each 4K)
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    std::vector<std::string> values;
+    for (int i = 0; i < 2000; i++) {
+      ASSERT_OK(Put(1, Key(i), value));
+    }
+
+    // clear level 0 and 1 if necessary.
+    Flush(1);
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+
+    // write some new keys into level 0
+    for (int i = 0; i < 2000; i = i + 16) {
+      ASSERT_OK(Put(1, Key(i), value));
+    }
+    Flush(1);
+
+    // Wait for any write compaction to finish
+    dbfull()->TEST_WaitForCompact();
+
+    // remember number of files in each level
+    // NOTE(review): names are off by one — l1 holds the level-0 count,
+    // l2 level-1, l3 level-2.
+    int l1 = NumTableFilesAtLevel(0, 1);
+    int l2 = NumTableFilesAtLevel(1, 1);
+    int l3 = NumTableFilesAtLevel(2, 1);
+    ASSERT_NE(NumTableFilesAtLevel(0, 1), 0);
+    ASSERT_NE(NumTableFilesAtLevel(1, 1), 0);
+    ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+
+    // read a bunch of times, trigger read compaction
+    for (int j = 0; j < 100; j++) {
+      for (int i = 0; i < 2000; i++) {
+        Get(1, Key(i));
+      }
+    }
+    // wait for read compaction to finish
+    env_->SleepForMicroseconds(1000000);
+
+    // verify that the number of files have decreased
+    // in some level, indicating that there was a compaction
+    ASSERT_TRUE(NumTableFilesAtLevel(0, 1) < l1 ||
+                NumTableFilesAtLevel(1, 1) < l2 ||
+                NumTableFilesAtLevel(2, 1) < l3);
+  }
+}
+
+// Multi-threaded test:
+namespace {
+
+// Parameters for the MultiThreaded test below.
+static const int kColumnFamilies = 10;
+static const int kNumThreads = 10;
+static const int kTestSeconds = 10;
+static const int kNumKeys = 1000;
+
+// State shared by all worker threads of the MultiThreaded test.
+struct MTState {
+  DBTest* test;
+  port::AtomicPointer stop;        // set non-null to ask workers to exit
+  port::AtomicPointer counter[kNumThreads];      // per-writer op counter
+  port::AtomicPointer thread_done[kNumThreads];  // set non-null on exit
+};
+
+// Per-thread argument: shared state plus this worker's id.
+struct MTThread {
+  MTState* state;
+  int id;
+};
+
+// Worker loop for the MultiThreaded test. Until `stop` is set, each
+// iteration either writes one batch (the same key/id/counter/unique_id
+// into every column family) or reads the key from all CFs and checks
+// that the cross-CF writes were applied atomically.
+static void MTThreadBody(void* arg) {
+  MTThread* t = reinterpret_cast<MTThread*>(arg);
+  int id = t->id;
+  DB* db = t->state->test->db_;
+  uintptr_t counter = 0;
+  fprintf(stderr, "... starting thread %d\n", id);
+  Random rnd(1000 + id);
+  char valbuf[1500];
+  while (t->state->stop.Acquire_Load() == nullptr) {
+    // Publish our progress so readers can bound the counters they see.
+    t->state->counter[id].Release_Store(reinterpret_cast<void*>(counter));
+
+    int key = rnd.Uniform(kNumKeys);
+    char keybuf[20];
+    snprintf(keybuf, sizeof(keybuf), "%016d", key);
+
+    if (rnd.OneIn(2)) {
+      // Write values of the form <key, my id, counter, cf, unique_id>.
+      // into each of the CFs
+      // We add some padding for force compactions.
+      int unique_id = rnd.Uniform(1000000);
+      WriteBatch batch;
+      for (int cf = 0; cf < kColumnFamilies; ++cf) {
+        snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+                 static_cast<int>(counter), cf, unique_id);
+        batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf));
+      }
+      ASSERT_OK(db->Write(WriteOptions(), &batch));
+    } else {
+      // Read a value and verify that it matches the pattern written above
+      // and that writes to all column families were atomic (unique_id is the
+      // same)
+      std::vector<Slice> keys(kColumnFamilies, Slice(keybuf));
+      std::vector<std::string> values;
+      std::vector<Status> statuses =
+          db->MultiGet(ReadOptions(), t->state->test->handles_, keys, &values);
+      Status s = statuses[0];
+      // all statuses have to be the same
+      for (size_t i = 1; i < statuses.size(); ++i) {
+        // they are either both ok or both not-found
+        ASSERT_TRUE((s.ok() && statuses[i].ok()) ||
+                    (s.IsNotFound() && statuses[i].IsNotFound()));
+      }
+      if (s.IsNotFound()) {
+        // Key has not yet been written
+      } else {
+        // Check that the writer thread counter is >= the counter in the value
+        ASSERT_OK(s);
+        int unique_id = -1;
+        for (int i = 0; i < kColumnFamilies; ++i) {
+          int k, w, c, cf, u;
+          ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w,
+                              &c, &cf, &u))
+              << values[i];
+          ASSERT_EQ(k, key);
+          ASSERT_GE(w, 0);
+          ASSERT_LT(w, kNumThreads);
+          ASSERT_LE((unsigned int)c, reinterpret_cast<uintptr_t>(
+                                         t->state->counter[w].Acquire_Load()));
+          ASSERT_EQ(cf, i);
+          if (i == 0) {
+            unique_id = u;
+          } else {
+            // this checks that updates across column families happened
+            // atomically -- all unique ids are the same
+            ASSERT_EQ(u, unique_id);
+          }
+        }
+      }
+    }
+    counter++;
+  }
+  t->state->thread_done[id].Release_Store(t);
+  fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter));
+}
+
+}  // namespace
+
+// Launches kNumThreads MTThreadBody workers against kColumnFamilies
+// column families, lets them run for kTestSeconds, then signals stop
+// and waits for every worker to report completion.
+TEST(DBTest, MultiThreaded) {
+  do {
+    std::vector<std::string> cfs;
+    for (int i = 1; i < kColumnFamilies; ++i) {
+      cfs.push_back(std::to_string(i));
+    }
+    CreateAndReopenWithCF(cfs);
+    // Initialize state
+    MTState mt;
+    mt.test = this;
+    mt.stop.Release_Store(0);
+    for (int id = 0; id < kNumThreads; id++) {
+      mt.counter[id].Release_Store(0);
+      mt.thread_done[id].Release_Store(0);
+    }
+
+    // Start threads
+    MTThread thread[kNumThreads];
+    for (int id = 0; id < kNumThreads; id++) {
+      thread[id].state = &mt;
+      thread[id].id = id;
+      env_->StartThread(MTThreadBody, &thread[id]);
+    }
+
+    // Let them run for a while
+    env_->SleepForMicroseconds(kTestSeconds * 1000000);
+
+    // Stop the threads and wait for them to finish
+    mt.stop.Release_Store(&mt);
+    for (int id = 0; id < kNumThreads; id++) {
+      while (mt.thread_done[id].Acquire_Load() == nullptr) {
+        env_->SleepForMicroseconds(100000);
+      }
+    }
+    // skip as HashCuckooRep does not support snapshot
+  } while (ChangeOptions(kSkipHashCuckoo));
+}
+
+// Group commit test:
+namespace {
+
+// Parameters for the GroupCommitTest below.
+static const int kGCNumThreads = 4;
+static const int kGCNumKeys = 1000;
+
+// Per-thread argument/state for the group-commit writers.
+struct GCThread {
+  DB* db;
+  int id;
+  std::atomic<bool> done;  // set by the worker when all Puts finished
+};
+
+// Group-commit writer: issues kGCNumKeys Puts in a disjoint key range
+// (offset by thread id), then flags completion via `done`.
+static void GCThreadBody(void* arg) {
+  GCThread* t = reinterpret_cast<GCThread*>(arg);
+  int id = t->id;
+  DB* db = t->db;
+  WriteOptions wo;
+
+  for (int i = 0; i < kGCNumKeys; ++i) {
+    // Key and value are both the decimal string of the global index.
+    std::string kv(std::to_string(i + id * kGCNumKeys));
+    ASSERT_OK(db->Put(wo, kv, kv));
+  }
+  t->done = true;
+}
+
+}  // namespace
+
+// Runs concurrent writers and checks that (a) group commit actually
+// happened (WRITE_DONE_BY_OTHER > 0) and (b) every key written by
+// every thread is present, in sorted order, with the expected value.
+TEST(DBTest, GroupCommitTest) {
+  do {
+    Options options = CurrentOptions();
+    options.statistics = rocksdb::CreateDBStatistics();
+    Reopen(&options);
+
+    // Start threads
+    GCThread thread[kGCNumThreads];
+    for (int id = 0; id < kGCNumThreads; id++) {
+      thread[id].id = id;
+      thread[id].db = db_;
+      thread[id].done = false;
+      env_->StartThread(GCThreadBody, &thread[id]);
+    }
+
+    // Poll until every writer has finished.
+    for (int id = 0; id < kGCNumThreads; id++) {
+      while (thread[id].done == false) {
+        env_->SleepForMicroseconds(100000);
+      }
+    }
+    ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0);
+
+    std::vector<std::string> expected_db;
+    for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) {
+      expected_db.push_back(std::to_string(i));
+    }
+    sort(expected_db.begin(), expected_db.end());
+
+    Iterator* itr = db_->NewIterator(ReadOptions());
+    itr->SeekToFirst();
+    for (auto x : expected_db) {
+      ASSERT_TRUE(itr->Valid());
+      ASSERT_EQ(itr->key().ToString(), x);
+      ASSERT_EQ(itr->value().ToString(), x);
+      itr->Next();
+    }
+    ASSERT_TRUE(!itr->Valid());
+    delete itr;
+
+  } while (ChangeOptions(kSkipNoSeekToLast));
+}
+
+namespace {
+// In-memory model of DB contents: ordered key -> value map.
+typedef std::map<std::string, std::string> KVMap;
+}
+
+class ModelDB: public DB {
+ public:
+  class ModelSnapshot : public Snapshot {
+   public:
+    KVMap map_;
+  };
+
+  explicit ModelDB(const Options& options) : options_(options) {}
+  using DB::Put;
+  virtual Status Put(const WriteOptions& o, ColumnFamilyHandle* cf,
+                     const Slice& k, const Slice& v) {
+    WriteBatch batch;
+    batch.Put(cf, k, v);
+    return Write(o, &batch);
+  }
+  using DB::Merge;
+  virtual Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf,
+                       const Slice& k, const Slice& v) {
+    WriteBatch batch;
+    batch.Merge(cf, k, v);
+    return Write(o, &batch);
+  }
+  using DB::Delete;
+  virtual Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf,
+                        const Slice& key) {
+    WriteBatch batch;
+    batch.Delete(cf, key);
+    return Write(o, &batch);
+  }
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* cf,
+                     const Slice& key, std::string* value) {
+    return Status::NotSupported(key);
+  }
+
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values) {
+    std::vector<Status> s(keys.size(),
+                          Status::NotSupported("Not implemented."));
+    return s;
+  }
+
+  using DB::GetPropertiesOfAllTables;
+  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+                                          TablePropertiesCollection* props) {
+    return Status();
+  }
+
+  using DB::KeyMayExist;
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value, bool* value_found = nullptr) {
+    if (value_found != nullptr) {
+      *value_found = false;
+    }
+    return true; // Not Supported directly
+  }
+  using DB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) {
+    if (options.snapshot == nullptr) {
+      KVMap* saved = new KVMap;
+      *saved = map_;
+      return new ModelIter(saved, true);
+    } else {
+      const KVMap* snapshot_state =
+          &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
+      return new ModelIter(snapshot_state, false);
+    }
+  }
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      std::vector<Iterator*>* iterators) {
+    return Status::NotSupported("Not supported yet");
+  }
+  virtual const Snapshot* GetSnapshot() {
+    ModelSnapshot* snapshot = new ModelSnapshot;
+    snapshot->map_ = map_;
+    return snapshot;
+  }
+
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) {
+    delete reinterpret_cast<const ModelSnapshot*>(snapshot);
+  }
+
+  virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
+    class Handler : public WriteBatch::Handler {
+     public:
+      KVMap* map_;
+      virtual void Put(const Slice& key, const Slice& value) {
+        (*map_)[key.ToString()] = value.ToString();
+      }
+      virtual void Merge(const Slice& key, const Slice& value) {
+        // ignore merge for now
+        //(*map_)[key.ToString()] = value.ToString();
+      }
+      virtual void Delete(const Slice& key) {
+        map_->erase(key.ToString());
+      }
+    };
+    Handler handler;
+    handler.map_ = &map_;
+    return batch->Iterate(&handler);
+  }
+
+  using DB::GetProperty;
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property, std::string* value) {
+    return false;
+  }
+  using DB::GetApproximateSizes;
+  virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                   const Range* range, int n, uint64_t* sizes) {
+    for (int i = 0; i < n; i++) {
+      sizes[i] = 0;
+    }
+  }
+  using DB::CompactRange;
+  virtual Status CompactRange(ColumnFamilyHandle* column_family,
+                              const Slice* start, const Slice* end,
+                              bool reduce_level, int target_level) {
+    return Status::NotSupported("Not supported operation.");
+  }
+
+  using DB::NumberLevels;
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) { return 1; }
+
+  using DB::MaxMemCompactionLevel;
+  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) {
+    return 1;
+  }
+
+  using DB::Level0StopWriteTrigger;
+  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
+    return -1;
+  }
+
+  virtual const std::string& GetName() const {
+    return name_;
+  }
+
+  virtual Env* GetEnv() const {
+    return nullptr;
+  }
+
+  using DB::GetOptions;
+  virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const {
+    return options_;
+  }
+
+  using DB::Flush;
+  virtual Status Flush(const rocksdb::FlushOptions& options,
+                       ColumnFamilyHandle* column_family) {
+    Status ret;
+    return ret;
+  }
+
+  virtual Status DisableFileDeletions() {
+    return Status::OK();
+  }
+  virtual Status EnableFileDeletions(bool force) {
+    return Status::OK();
+  }
+  virtual Status GetLiveFiles(std::vector<std::string>&, uint64_t* size,
+                              bool flush_memtable = true) {
+    return Status::OK();
+  }
+
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) {
+    return Status::OK();
+  }
+
+  virtual Status DeleteFile(std::string name) {
+    return Status::OK();
+  }
+
+  virtual Status GetDbIdentity(std::string& identity) {
+    return Status::OK();
+  }
+
+  virtual SequenceNumber GetLatestSequenceNumber() const {
+    return 0;
+  }
+  virtual Status GetUpdatesSince(
+      rocksdb::SequenceNumber, unique_ptr<rocksdb::TransactionLogIterator>*,
+      const TransactionLogIterator::ReadOptions&
+          read_options = TransactionLogIterator::ReadOptions()) {
+    return Status::NotSupported("Not supported in Model DB");
+  }
+
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const { return nullptr; }
+
+ private:
+  class ModelIter: public Iterator {
+   public:
+    ModelIter(const KVMap* map, bool owned)
+        : map_(map), owned_(owned), iter_(map_->end()) {
+    }
+    ~ModelIter() {
+      if (owned_) delete map_;
+    }
+    virtual bool Valid() const { return iter_ != map_->end(); }
+    virtual void SeekToFirst() { iter_ = map_->begin(); }
+    virtual void SeekToLast() {
+      if (map_->empty()) {
+        iter_ = map_->end();
+      } else {
+        iter_ = map_->find(map_->rbegin()->first);
+      }
+    }
+    virtual void Seek(const Slice& k) {
+      iter_ = map_->lower_bound(k.ToString());
+    }
+    virtual void Next() { ++iter_; }
+    virtual void Prev() { --iter_; }
+    virtual Slice key() const { return iter_->first; }
+    virtual Slice value() const { return iter_->second; }
+    virtual Status status() const { return Status::OK(); }
+   private:
+    const KVMap* const map_;
+    const bool owned_;  // Do we own map_
+    KVMap::const_iterator iter_;
+  };
+  const Options options_;
+  KVMap map_;
+  std::string name_ = "";
+};
+
+// Produces a random test key of length >= `minimum`, biased toward
+// short keys to encourage collisions, with an occasional skewed long
+// length for variety.
+static std::string RandomKey(Random* rnd, int minimum = 0) {
+  int len;
+  do {
+    len = (rnd->OneIn(3)
+           ? 1                // Short sometimes to encourage collisions
+           : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
+  } while (len < minimum);
+  return test::RandomKey(rnd, len);
+}
+
+static bool CompareIterators(int step,
+                             DB* model,
+                             DB* db,
+                             const Snapshot* model_snap,
+                             const Snapshot* db_snap) {
+  ReadOptions options;
+  options.snapshot = model_snap;
+  Iterator* miter = model->NewIterator(options);
+  options.snapshot = db_snap;
+  Iterator* dbiter = db->NewIterator(options);
+  bool ok = true;
+  int count = 0;
+  for (miter->SeekToFirst(), dbiter->SeekToFirst();
+       ok && miter->Valid() && dbiter->Valid();
+       miter->Next(), dbiter->Next()) {
+    count++;
+    if (miter->key().compare(dbiter->key()) != 0) {
+      fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n",
+              step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(dbiter->key()).c_str());
+      ok = false;
+      break;
+    }
+
+    if (miter->value().compare(dbiter->value()) != 0) {
+      fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
+              step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(miter->value()).c_str(),
+              EscapeString(miter->value()).c_str());
+      ok = false;
+    }
+  }
+
+  if (ok) {
+    if (miter->Valid() != dbiter->Valid()) {
+      fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
+              step, miter->Valid(), dbiter->Valid());
+      ok = false;
+    }
+  }
+  delete miter;
+  delete dbiter;
+  return ok;
+}
+
+// Randomized differential test: applies the same random stream of
+// Put/Delete/batch operations to a ModelDB and the real DB, and every
+// 100 steps compares their iterators (latest state, saved snapshots,
+// and latest state again after a Reopen).
+TEST(DBTest, Randomized) {
+  Random rnd(test::RandomSeed());
+  do {
+    ModelDB model(CurrentOptions());
+    const int N = 10000;
+    const Snapshot* model_snap = nullptr;
+    const Snapshot* db_snap = nullptr;
+    std::string k, v;
+    for (int step = 0; step < N; step++) {
+      // TODO(sanjay): Test Get() works
+      int p = rnd.Uniform(100);
+      int minimum = 0;
+      // Prefix/hash-based configs cannot handle zero-length keys.
+      if (option_config_ == kHashSkipList ||
+          option_config_ == kHashLinkList ||
+          option_config_ == kHashCuckoo ||
+          option_config_ == kPlainTableFirstBytePrefix ||
+          option_config_ == kBlockBasedTableWithWholeKeyHashIndex ||
+          option_config_ == kBlockBasedTableWithPrefixHashIndex) {
+        minimum = 1;
+      }
+      if (p < 45) {                               // Put
+        k = RandomKey(&rnd, minimum);
+        v = RandomString(&rnd,
+                         rnd.OneIn(20)
+                         ? 100 + rnd.Uniform(100)
+                         : rnd.Uniform(8));
+        ASSERT_OK(model.Put(WriteOptions(), k, v));
+        ASSERT_OK(db_->Put(WriteOptions(), k, v));
+
+      } else if (p < 90) {                        // Delete
+        k = RandomKey(&rnd, minimum);
+        ASSERT_OK(model.Delete(WriteOptions(), k));
+        ASSERT_OK(db_->Delete(WriteOptions(), k));
+
+
+      } else {                                    // Multi-element batch
+        WriteBatch b;
+        const int num = rnd.Uniform(8);
+        for (int i = 0; i < num; i++) {
+          if (i == 0 || !rnd.OneIn(10)) {
+            k = RandomKey(&rnd, minimum);
+          } else {
+            // Periodically re-use the same key from the previous iter, so
+            // we have multiple entries in the write batch for the same key
+          }
+          if (rnd.OneIn(2)) {
+            v = RandomString(&rnd, rnd.Uniform(10));
+            b.Put(k, v);
+          } else {
+            b.Delete(k);
+          }
+        }
+        ASSERT_OK(model.Write(WriteOptions(), &b));
+        ASSERT_OK(db_->Write(WriteOptions(), &b));
+      }
+
+      if ((step % 100) == 0) {
+        ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+        ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+
+        // Save a snapshot from each DB this time that we'll use next
+        // time we compare things, to make sure the current state is
+        // preserved with the snapshot
+        if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+        if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+
+        Reopen();
+        ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+
+        model_snap = model.GetSnapshot();
+        db_snap = db_->GetSnapshot();
+      }
+    }
+    if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+    if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+    // skip cuckoo hash as it does not support snapshot.
+  } while (ChangeOptions(kSkipDeletesFilterFirst |
+                         kSkipNoSeekToLast | kSkipHashCuckoo));
+}
+
+// MultiGet over one column family: present keys return their values,
+// deleted and never-written keys return NotFound.
+TEST(DBTest, MultiGetSimple) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "k1", "v1"));
+    ASSERT_OK(Put(1, "k2", "v2"));
+    ASSERT_OK(Put(1, "k3", "v3"));
+    ASSERT_OK(Put(1, "k4", "v4"));
+    ASSERT_OK(Delete(1, "k4"));
+    ASSERT_OK(Put(1, "k5", "v5"));
+    ASSERT_OK(Delete(1, "no_key"));
+
+    std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
+
+    // Deliberately oversized and pre-filled: MultiGet must resize and
+    // overwrite it.
+    std::vector<std::string> values(20, "Temporary data to be overwritten");
+    std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+
+    std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(values.size(), keys.size());
+    ASSERT_EQ(values[0], "v1");
+    ASSERT_EQ(values[1], "v2");
+    ASSERT_EQ(values[2], "v3");
+    ASSERT_EQ(values[4], "v5");
+
+    ASSERT_OK(s[0]);
+    ASSERT_OK(s[1]);
+    ASSERT_OK(s[2]);
+    ASSERT_TRUE(s[3].IsNotFound());
+    ASSERT_OK(s[4]);
+    ASSERT_TRUE(s[5].IsNotFound());
+  } while (ChangeCompactOptions());
+}
+
+// MultiGet edge cases: empty key set (on populated and fresh DBs)
+// yields zero statuses; lookups on an empty DB yield NotFound.
+TEST(DBTest, MultiGetEmpty) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    // Empty Key Set
+    std::vector<Slice> keys;
+    std::vector<std::string> values;
+    std::vector<ColumnFamilyHandle*> cfs;
+    std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(s.size(), 0U);
+
+    // Empty Database, Empty Key Set
+    DestroyAndReopen();
+    CreateAndReopenWithCF({"pikachu"});
+    s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(s.size(), 0U);
+
+    // Empty Database, Search for Keys
+    keys.resize(2);
+    keys[0] = "a";
+    keys[1] = "b";
+    cfs.push_back(handles_[0]);
+    cfs.push_back(handles_[1]);
+    s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ((int)s.size(), 2);
+    ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound());
+  } while (ChangeCompactOptions());
+}
+
+namespace {
+// Lays out 11 sst files with known prefix ranges (see diagram below)
+// so the PrefixScan test can count exactly how many random reads a
+// prefix-filtered seek performs.
+void PrefixScanInit(DBTest *dbtest) {
+  char buf[100];
+  std::string keystr;
+  const int small_range_sstfiles = 5;
+  const int big_range_sstfiles = 5;
+
+  // Generate 11 sst files with the following prefix ranges.
+  // GROUP 0: [0,10]                              (level 1)
+  // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6]  (level 0)
+  // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10]  (level 0)
+  //
+  // A seek with the previous API would do 11 random I/Os (to all the
+  // files).  With the new API and a prefix filter enabled, we should
+  // only do 2 random I/O, to the 2 files containing the key.
+
+  // GROUP 0
+  snprintf(buf, sizeof(buf), "%02d______:start", 0);
+  keystr = std::string(buf);
+  ASSERT_OK(dbtest->Put(keystr, keystr));
+  snprintf(buf, sizeof(buf), "%02d______:end", 10);
+  keystr = std::string(buf);
+  ASSERT_OK(dbtest->Put(keystr, keystr));
+  dbtest->Flush();
+  dbtest->dbfull()->CompactRange(nullptr, nullptr); // move to level 1
+
+  // GROUP 1
+  for (int i = 1; i <= small_range_sstfiles; i++) {
+    snprintf(buf, sizeof(buf), "%02d______:start", i);
+    keystr = std::string(buf);
+    ASSERT_OK(dbtest->Put(keystr, keystr));
+    snprintf(buf, sizeof(buf), "%02d______:end", i+1);
+    keystr = std::string(buf);
+    ASSERT_OK(dbtest->Put(keystr, keystr));
+    dbtest->Flush();
+  }
+
+  // GROUP 2
+  for (int i = 1; i <= big_range_sstfiles; i++) {
+    std::string keystr;
+    snprintf(buf, sizeof(buf), "%02d______:start", 0);
+    keystr = std::string(buf);
+    ASSERT_OK(dbtest->Put(keystr, keystr));
+    snprintf(buf, sizeof(buf), "%02d______:end",
+             small_range_sstfiles+i+1);
+    keystr = std::string(buf);
+    ASSERT_OK(dbtest->Put(keystr, keystr));
+    dbtest->Flush();
+  }
+}
+}  // namespace
+
+TEST(DBTest, PrefixScan) {
+  // Verifies that a prefix-filtered Seek touches only the sst files whose
+  // prefix range can contain the key (2 of the 11 files created by
+  // PrefixScanInit), measured via the counting Env's random-read counter.
+  int count;
+  Slice prefix;
+  Slice key;
+  char buf[100];
+  Iterator* iter;
+  snprintf(buf, sizeof(buf), "03______:");
+  prefix = Slice(buf, 8);  // 8-byte fixed prefix "03______"
+  key = Slice(buf, 9);     // prefix plus the ':' separator
+  // db configs
+  env_->count_random_reads_ = true;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.no_block_cache = true;  // every block read hits storage, keeping the counter exact
+  options.filter_policy = NewBloomFilterPolicy(10);
+  options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+  options.whole_key_filtering = false;  // build the bloom filter over prefixes only
+  options.disable_auto_compactions = true;
+  options.max_background_compactions = 2;
+  options.create_if_missing = true;
+  options.disable_seek_compaction = true;
+  options.memtable_factory.reset(NewHashSkipListRepFactory());
+
+  // 11 RAND I/Os
+  DestroyAndReopen(&options);
+  PrefixScanInit(this);
+  count = 0;
+  env_->random_read_counter_.Reset();
+  iter = db_->NewIterator(ReadOptions());
+  for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
+    if (! iter->key().starts_with(prefix)) {
+      break;
+    }
+    count++;
+  }
+  ASSERT_OK(iter->status());
+  delete iter;
+  ASSERT_EQ(count, 2);  // the two keys written with prefix "03______"
+  ASSERT_EQ(env_->random_read_counter_.Read(), 2);  // only 2 files were read
+  Close();
+  delete options.filter_policy;  // raw pointer owned by the test, not by Options
+}
+
+namespace {
+// Returns `num` formatted as a fixed-width, zero-padded 16-digit decimal
+// string so that lexicographic key order matches numeric order.
+std::string MakeKey(unsigned int num) {
+  char buf[30];
+  snprintf(buf, sizeof(buf), "%016u", num);
+  return std::string(buf);
+}
+
+// Micro-benchmark for VersionSet::LogAndApply: applies `iters` one-file
+// version edits on top of a version that already holds `num_base_files`
+// files in level 2, then prints the mean microseconds per edit to stderr.
+// Only invoked from main() when "--benchmark" is passed; not part of the
+// normal test run.
+void BM_LogAndApply(int iters, int num_base_files) {
+  std::string dbname = test::TmpDir() + "/rocksdb_test_benchmark";
+  ASSERT_OK(DestroyDB(dbname, Options()));
+
+  // Open and close the DB once so a MANIFEST exists for
+  // VersionSet::Recover() below.
+  DB* db = nullptr;
+  Options opts;
+  opts.create_if_missing = true;
+  Status s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  delete db;
+  db = nullptr;
+
+  Env* env = Env::Default();
+
+  // LogAndApply() expects the caller to hold the DB mutex; hold it for the
+  // whole benchmark.
+  port::Mutex mu;
+  MutexLock l(&mu);
+
+  Options options;
+  EnvOptions sopt;
+  VersionSet vset(dbname, &options, sopt, nullptr);
+  std::vector<ColumnFamilyDescriptor> dummy;
+  dummy.push_back(ColumnFamilyDescriptor());
+  ASSERT_OK(vset.Recover(dummy));
+  auto default_cfd = vset.GetColumnFamilySet()->GetDefault();
+  // Install the base version: num_base_files dummy files in level 2 with
+  // non-overlapping key ranges.
+  VersionEdit vbase;
+  uint64_t fnum = 1;
+  for (int i = 0; i < num_base_files; i++) {
+    InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
+    InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
+    vbase.AddFile(2, fnum++, 1 /* file size */, start, limit, 1, 1);
+  }
+  ASSERT_OK(vset.LogAndApply(default_cfd, &vbase, &mu));
+
+  uint64_t start_micros = env->NowMicros();
+
+  // Timed loop: each edit deletes one file and adds a fresh one.
+  for (int i = 0; i < iters; i++) {
+    VersionEdit vedit;
+    vedit.DeleteFile(2, fnum);
+    InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
+    InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
+    vedit.AddFile(2, fnum++, 1 /* file size */, start, limit, 1, 1);
+    vset.LogAndApply(default_cfd, &vedit, &mu);
+  }
+  uint64_t stop_micros = env->NowMicros();
+  unsigned int us = stop_micros - start_micros;
+  char buf[16];
+  snprintf(buf, sizeof(buf), "%d", num_base_files);
+  fprintf(stderr,
+          "BM_LogAndApply/%-6s   %8d iters : %9u us (%7.0f us / iter)\n",
+          buf, iters, us, ((float)us) / iters);
+}
+}  // namespace
+
+TEST(DBTest, TailingIteratorSingle) {
+  // A tailing iterator must see a record that is written after the
+  // iterator was created.
+  ReadOptions read_options;
+  read_options.tailing = true;
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  iter->SeekToFirst();
+  ASSERT_TRUE(!iter->Valid());  // DB is empty at this point
+
+  // add a record and check that iter can see it
+  ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor"));
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "mirko");
+
+  iter->Next();
+  ASSERT_TRUE(!iter->Valid());  // nothing beyond the single record
+}
+
+TEST(DBTest, TailingIteratorKeepAdding) {
+  // A single tailing iterator must keep seeing records that are written
+  // after its creation, without being recreated, across 10k inserts.
+  CreateAndReopenWithCF({"pikachu"});
+  ReadOptions read_options;
+  read_options.tailing = true;
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  std::string value(1024, 'a');
+
+  const int num_records = 10000;
+  for (int i = 0; i < num_records; ++i) {
+    char buf[32];
+    // 16-digit zero-padded keys: lexicographic order == insertion order.
+    snprintf(buf, sizeof(buf), "%016d", i);
+
+    Slice key(buf, 16);
+    ASSERT_OK(Put(1, key, value));
+
+    // The iterator created before any writes must find each new key.
+    iter->Seek(key);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+  }
+}
+
+TEST(DBTest, TailingIteratorDeletes) {
+  CreateAndReopenWithCF({"pikachu"});
+  ReadOptions read_options;
+  read_options.tailing = true;
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+
+  // write a single record, read it using the iterator, then delete it
+  ASSERT_OK(Put(1, "0test", "test"));
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "0test");
+  ASSERT_OK(Delete(1, "0test"));
+
+  // write many more records
+  const int num_records = 10000;
+  std::string value(1024, 'A');
+
+  for (int i = 0; i < num_records; ++i) {
+    char buf[32];
+    snprintf(buf, sizeof(buf), "1%015d", i);
+
+    Slice key(buf, 16);
+    ASSERT_OK(Put(1, key, value));
+  }
+
+  // force a flush to make sure that no records are read from memtable
+  ASSERT_OK(Flush(1));
+
+  // skip "0test"
+  iter->Next();
+
+  // make sure we can read all new records using the existing iterator
+  int count = 0;
+  for (; iter->Valid(); iter->Next(), ++count) ;
+
+  ASSERT_EQ(count, num_records);
+}
+
+TEST(DBTest, TailingIteratorPrefixSeek) {
+  // With a fixed 2-byte prefix extractor and a hash-skiplist memtable, a
+  // tailing iterator Seek must not surface keys from a different prefix,
+  // whether the data lives in an sst file or in the memtable.
+  ReadOptions read_options;
+  read_options.tailing = true;
+
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+  options.memtable_factory.reset(NewHashSkipListRepFactory());
+  DestroyAndReopen(&options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  // "0101" ends up in an sst file (flushed below); "0202" stays in the
+  // memtable.
+  ASSERT_OK(Put(1, "0101", "test"));
+
+  ASSERT_OK(Flush(1));
+
+  ASSERT_OK(Put(1, "0202", "test"));
+
+  // Seek(0102) shouldn't find any records since 0202 has a different prefix
+  iter->Seek("0102");
+  ASSERT_TRUE(!iter->Valid());
+
+  iter->Seek("0202");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "0202");
+
+  iter->Next();
+  ASSERT_TRUE(!iter->Valid());
+}
+
+TEST(DBTest, ChecksumTest) {
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+
+  table_options.checksum = kCRC32c;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(&options);
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  ASSERT_OK(Flush());  // table with crc checksum
+
+  table_options.checksum = kxxHash;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(&options);
+  ASSERT_OK(Put("e", "f"));
+  ASSERT_OK(Put("g", "h"));
+  ASSERT_OK(Flush());  // table with xxhash checksum
+
+  table_options.checksum = kCRC32c;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(&options);
+  ASSERT_EQ("b", Get("a"));
+  ASSERT_EQ("d", Get("c"));
+  ASSERT_EQ("f", Get("e"));
+  ASSERT_EQ("h", Get("g"));
+
+  table_options.checksum = kCRC32c;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(&options);
+  ASSERT_EQ("b", Get("a"));
+  ASSERT_EQ("d", Get("c"));
+  ASSERT_EQ("f", Get("e"));
+  ASSERT_EQ("h", Get("g"));
+}
+}  // namespace rocksdb
+
+// Entry point: with "--benchmark" as the first argument, runs the
+// LogAndApply micro-benchmark at several base-file counts instead of the
+// test suite.
+int main(int argc, char** argv) {
+  if (argc > 1 && std::string(argv[1]) == "--benchmark") {
+    rocksdb::BM_LogAndApply(1000, 1);
+    rocksdb::BM_LogAndApply(1000, 100);
+    rocksdb::BM_LogAndApply(1000, 10000);
+    rocksdb::BM_LogAndApply(100, 100000);
+    return 0;
+  }
+
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/dbformat.cc b/db/dbformat.cc
new file mode 100644 (file)
index 0000000..e53d16d
--- /dev/null
@@ -0,0 +1,169 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/dbformat.h"
+
+#include <stdio.h>
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/perf_context_imp.h"
+
+namespace rocksdb {
+
+// Packs a sequence number and a value type into the single 8-byte tag that
+// terminates every internal key: the type occupies the low byte.
+uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
+  assert(seq <= kMaxSequenceNumber);
+  assert(t <= kValueTypeForSeek);
+  const uint64_t packed = (seq << 8) | static_cast<uint64_t>(t);
+  return packed;
+}
+
+// Appends the serialized form of `key` -- user-key bytes followed by the
+// packed 8-byte sequence/type tag -- to *result.
+void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
+  result->append(key.user_key.data(), key.user_key.size());
+  PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
+}
+
+// Human-readable rendering: 'user_key' @ sequence : type.  `hex` controls
+// whether the user key is escaped as hex.
+std::string ParsedInternalKey::DebugString(bool hex) const {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "' @ %llu : %d",
+           (unsigned long long) sequence,
+           int(type));
+  std::string result = "'";
+  result += user_key.ToString(hex);
+  result += buf;
+  return result;
+}
+
+// Parses rep_ and delegates to ParsedInternalKey::DebugString; renders
+// "(bad)" plus the escaped raw bytes if rep_ is not a valid internal key.
+std::string InternalKey::DebugString(bool hex) const {
+  std::string result;
+  ParsedInternalKey parsed;
+  if (ParseInternalKey(rep_, &parsed)) {
+    result = parsed.DebugString(hex);
+  } else {
+    result = "(bad)";
+    result.append(EscapeString(rep_));
+  }
+  return result;
+}
+
+// Comparator name, precomputed in the constructor from the wrapped user
+// comparator's name.
+const char* InternalKeyComparator::Name() const {
+  return name_.c_str();
+}
+
+int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  //    decreasing type (though sequence# should be enough to disambiguate)
+  int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+  PERF_COUNTER_ADD(user_key_comparison_count, 1);
+  if (r == 0) {
+    // The trailing 8 bytes hold (sequence << 8) | type, so one integer
+    // comparison orders both fields at once; larger tags sort earlier
+    // (newer entries first).
+    const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
+    const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
+    if (anum > bnum) {
+      r = -1;
+    } else if (anum < bnum) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+// Same ordering as the Slice overload above, but for already-parsed keys,
+// so the 8-byte tag does not need to be re-decoded.
+int InternalKeyComparator::Compare(const ParsedInternalKey& a,
+                                   const ParsedInternalKey& b) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  //    decreasing type (though sequence# should be enough to disambiguate)
+  int r = user_comparator_->Compare(a.user_key, b.user_key);
+  PERF_COUNTER_ADD(user_key_comparison_count, 1);
+  if (r == 0) {
+    if (a.sequence > b.sequence) {
+      r = -1;
+    } else if (a.sequence < b.sequence) {
+      r = +1;
+    } else if (a.type > b.type) {
+      r = -1;
+    } else if (a.type < b.type) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+// Shortens *start to a separator in [*start, limit) when the user
+// comparator can produce a physically shorter user key.  The shortened key
+// is tagged with kMaxSequenceNumber/kValueTypeForSeek so that it sorts
+// before every real entry with the same user key.
+void InternalKeyComparator::FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+  // Attempt to shorten the user portion of the key
+  Slice user_start = ExtractUserKey(*start);
+  Slice user_limit = ExtractUserKey(limit);
+  std::string tmp(user_start.data(), user_start.size());
+  user_comparator_->FindShortestSeparator(&tmp, user_limit);
+  if (tmp.size() < user_start.size() &&
+      user_comparator_->Compare(user_start, tmp) < 0) {
+    // User key has become shorter physically, but larger logically.
+    // Tack on the earliest possible number to the shortened user key.
+    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
+    assert(this->Compare(*start, tmp) < 0);
+    assert(this->Compare(tmp, limit) < 0);
+    start->swap(tmp);
+  }
+}
+
+// Replaces *key with a short internal key >= *key, using the user
+// comparator's successor of the user-key portion; same tagging scheme as
+// FindShortestSeparator above.
+void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
+  Slice user_key = ExtractUserKey(*key);
+  std::string tmp(user_key.data(), user_key.size());
+  user_comparator_->FindShortSuccessor(&tmp);
+  if (tmp.size() < user_key.size() &&
+      user_comparator_->Compare(user_key, tmp) < 0) {
+    // User key has become shorter physically, but larger logically.
+    // Tack on the earliest possible number to the shortened user key.
+    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
+    assert(this->Compare(*key, tmp) < 0);
+    key->swap(tmp);
+  }
+}
+
+const char* InternalFilterPolicy::Name() const {
+  return user_policy_->Name();
+}
+
+// Strips the 8-byte tag from each internal key in place, then builds the
+// filter over the resulting user keys with the wrapped policy.
+void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
+                                        std::string* dst) const {
+  // We rely on the fact that the code in table.cc does not mind us
+  // adjusting keys[].
+  Slice* mkey = const_cast<Slice*>(keys);
+  for (int i = 0; i < n; i++) {
+    mkey[i] = ExtractUserKey(keys[i]);
+    // TODO(sanjay): Suppress dups?
+  }
+  user_policy_->CreateFilter(keys, n, dst);
+}
+
+// Matches against the filter using only the user-key portion of `key`.
+bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
+  return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
+}
+
+// Builds the memtable lookup key described in dbformat.h:
+//   varint32(user-key length + 8) | user-key bytes | fixed64 tag
+// Short keys use the inline space_ buffer; longer keys heap-allocate
+// (freed by ~LookupKey).
+LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
+  size_t usize = user_key.size();
+  size_t needed = usize + 13;  // A conservative estimate
+  char* dst;
+  if (needed <= sizeof(space_)) {
+    dst = space_;  // no allocation for short keys
+  } else {
+    dst = new char[needed];  // released in the destructor
+  }
+  start_ = dst;
+  dst = EncodeVarint32(dst, usize + 8);
+  kstart_ = dst;  // the internal key starts after the length prefix
+  memcpy(dst, user_key.data(), usize);
+  dst += usize;
+  EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
+  dst += 8;
+  end_ = dst;
+}
+
+}  // namespace rocksdb
diff --git a/db/dbformat.h b/db/dbformat.h
new file mode 100644 (file)
index 0000000..1c86b12
--- /dev/null
@@ -0,0 +1,338 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdio.h>
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/types.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+// The highest bit of the value type needs to be reserved to SST tables
+// for them to do more flexible encoding.
+enum ValueType : unsigned char {
+  kTypeDeletion = 0x0,
+  kTypeValue = 0x1,
+  kTypeMerge = 0x2,
+  kTypeLogData = 0x3,
+  kTypeColumnFamilyDeletion = 0x4,
+  kTypeColumnFamilyValue = 0x5,
+  kTypeColumnFamilyMerge = 0x6,
+  // Upper bound accepted by ParseInternalKey's sanity assert below.
+  kMaxValue = 0x7F
+};
+
+// kValueTypeForSeek defines the ValueType that should be passed when
+// constructing a ParsedInternalKey object for seeking to a particular
+// sequence number (since we sort sequence numbers in decreasing order
+// and the value type is embedded as the low 8 bits in the sequence
+// number in internal keys, we need to use the highest-numbered
+// ValueType, not the lowest).
+// NOTE(review): the column-family types above are numerically higher than
+// kTypeMerge but ParseInternalKey rejects tags above kValueTypeForSeek --
+// presumably they never appear in internal keys; confirm against the
+// WriteBatch code.
+static const ValueType kValueTypeForSeek = kTypeMerge;
+
+// We leave eight bits empty at the bottom so a type and sequence#
+// can be packed together into 64-bits.
+static const SequenceNumber kMaxSequenceNumber =
+    ((0x1ull << 56) - 1);
+
+// An internal key decomposed into its three components.
+struct ParsedInternalKey {
+  Slice user_key;
+  SequenceNumber sequence;
+  ValueType type;
+
+  ParsedInternalKey() { }  // Intentionally left uninitialized (for speed)
+  ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
+      : user_key(u), sequence(seq), type(t) { }
+  std::string DebugString(bool hex = false) const;
+};
+
+// Return the length of the encoding of "key".
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+  return key.user_key.size() + 8;
+}
+
+extern uint64_t PackSequenceAndType(uint64_t seq, ValueType t);
+
+// Append the serialization of "key" to *result.
+extern void AppendInternalKey(std::string* result,
+                              const ParsedInternalKey& key);
+
+// Attempt to parse an internal key from "internal_key".  On success,
+// stores the parsed data in "*result", and returns true.
+//
+// On error, returns false, leaves "*result" in an undefined state.
+extern bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result);
+
+// Returns the user key portion of an internal key.
+inline Slice ExtractUserKey(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  return Slice(internal_key.data(), internal_key.size() - 8);
+}
+
+// Returns the value type stored in the low byte of the 8-byte tag.
+inline ValueType ExtractValueType(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  const size_t n = internal_key.size();
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  unsigned char c = num & 0xff;
+  return static_cast<ValueType>(c);
+}
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator : public Comparator {
+ private:
+  const Comparator* user_comparator_;  // not owned
+  std::string name_;  // built once; returned by Name()
+ public:
+  explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c),
+    name_("rocksdb.InternalKeyComparator:" +
+          std::string(user_comparator_->Name())) {
+  }
+  virtual ~InternalKeyComparator() {}
+
+  virtual const char* Name() const;
+  virtual int Compare(const Slice& a, const Slice& b) const;
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const;
+  virtual void FindShortSuccessor(std::string* key) const;
+
+  const Comparator* user_comparator() const { return user_comparator_; }
+
+  // Non-virtual convenience overloads for already-wrapped keys.
+  int Compare(const InternalKey& a, const InternalKey& b) const;
+  int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
+};
+
+// Filter policy wrapper that converts from internal keys to user keys
+class InternalFilterPolicy : public FilterPolicy {
+ private:
+  const FilterPolicy* const user_policy_;  // not owned
+ public:
+  explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
+  virtual const char* Name() const;
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
+};
+
+// Modules in this directory should keep internal keys wrapped inside
+// the following class instead of plain strings so that we do not
+// incorrectly use string comparisons instead of an InternalKeyComparator.
+class InternalKey {
+ private:
+  std::string rep_;  // serialized internal key: user key + 8-byte tag
+ public:
+  InternalKey() { }   // Leave rep_ as empty to indicate it is invalid
+  InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
+    AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
+  }
+
+  void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
+  // Returns the serialized key; asserts the key has been set.
+  Slice Encode() const {
+    assert(!rep_.empty());
+    return rep_;
+  }
+
+  Slice user_key() const { return ExtractUserKey(rep_); }
+
+  void SetFrom(const ParsedInternalKey& p) {
+    rep_.clear();
+    AppendInternalKey(&rep_, p);
+  }
+
+  void Clear() { rep_.clear(); }
+
+  std::string DebugString(bool hex = false) const;
+};
+
+inline int InternalKeyComparator::Compare(
+    const InternalKey& a, const InternalKey& b) const {
+  return Compare(a.Encode(), b.Encode());
+}
+
+// Splits the trailing 8-byte tag into sequence and type.  Returns false
+// for keys that are too short or whose type byte exceeds
+// kValueTypeForSeek.
+inline bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result) {
+  const size_t n = internal_key.size();
+  if (n < 8) return false;
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  unsigned char c = num & 0xff;
+  result->sequence = num >> 8;
+  result->type = static_cast<ValueType>(c);
+  assert(result->type <= ValueType::kMaxValue);
+  result->user_key = Slice(internal_key.data(), n - 8);
+  return (c <= static_cast<unsigned char>(kValueTypeForSeek));
+}
+
+// Update the sequence number in the internal key
+// Overwrites the trailing 8-byte tag in place with (seq << 8) | t;
+// requires the buffer to hold a complete internal key.
+inline void UpdateInternalKey(char* internal_key,
+                              const size_t internal_key_size,
+                              uint64_t seq, ValueType t) {
+  assert(internal_key_size >= 8);
+  char* seqtype = internal_key + internal_key_size - 8;
+  uint64_t newval = (seq << 8) | t;
+  EncodeFixed64(seqtype, newval);
+}
+
+// Get the sequence number from the internal key
+inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
+  const size_t n = internal_key.size();
+  assert(n >= 8);
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  return num >> 8;  // discard the type byte
+}
+
+
+// A helper class useful for DBImpl::Get()
+class LookupKey {
+ public:
+  // Initialize *this for looking up user_key at a snapshot with
+  // the specified sequence number.
+  LookupKey(const Slice& user_key, SequenceNumber sequence);
+
+  ~LookupKey();
+
+  // Return a key suitable for lookup in a MemTable.
+  Slice memtable_key() const { return Slice(start_, end_ - start_); }
+
+  // Return an internal key (suitable for passing to an internal iterator)
+  Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
+
+  // Return the user key (the internal key minus its 8-byte tag)
+  Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
+
+ private:
+  // We construct a char array of the form:
+  //    klength  varint32               <-- start_
+  //    userkey  char[klength]          <-- kstart_
+  //    tag      uint64
+  //                                    <-- end_
+  // The array is a suitable MemTable key.
+  // The suffix starting with "userkey" can be used as an InternalKey.
+  const char* start_;
+  const char* kstart_;
+  const char* end_;
+  char space_[200];      // Avoid allocation for short keys
+
+  // No copying allowed
+  LookupKey(const LookupKey&);
+  void operator=(const LookupKey&);
+};
+
+// Frees the heap buffer only if the inline space_ buffer was too small.
+inline LookupKey::~LookupKey() {
+  if (start_ != space_) delete[] start_;
+}
+
+// Reusable key buffer for iterators: holds either a user key or a full
+// internal key, using an inline 32-byte buffer for short keys and a heap
+// buffer only when a larger key is set.
+class IterKey {
+ public:
+  IterKey() : key_(space_), buf_size_(sizeof(space_)), key_size_(0) {}
+
+  ~IterKey() { ResetBuffer(); }
+
+  Slice GetKey() const { return Slice(key_, key_size_); }
+
+  void Clear() { key_size_ = 0; }
+
+  // Copies `user_key` into the buffer, growing it if necessary.
+  void SetUserKey(const Slice& user_key) {
+    size_t size = user_key.size();
+    EnlargeBufferIfNeeded(size);
+    memcpy(key_, user_key.data(), size);
+    key_size_ = size;
+  }
+
+  // Builds a full internal key: user key followed by the packed
+  // sequence/type tag.
+  void SetInternalKey(const Slice& user_key, SequenceNumber s,
+                      ValueType value_type = kValueTypeForSeek) {
+    size_t usize = user_key.size();
+    EnlargeBufferIfNeeded(usize + sizeof(uint64_t));
+    memcpy(key_, user_key.data(), usize);
+    EncodeFixed64(key_ + usize, PackSequenceAndType(s, value_type));
+    key_size_ = usize + sizeof(uint64_t);
+  }
+
+  void SetInternalKey(const ParsedInternalKey& parsed_key) {
+    SetInternalKey(parsed_key.user_key, parsed_key.sequence, parsed_key.type);
+  }
+
+ private:
+  char* key_;         // points at space_ or a heap buffer
+  size_t buf_size_;   // capacity of key_
+  size_t key_size_;   // length of the currently-stored key
+  char space_[32];  // Avoid allocation for short keys
+
+  // Releases any heap buffer and falls back to the inline buffer.
+  void ResetBuffer() {
+    if (key_ != nullptr && key_ != space_) {
+      delete[] key_;
+    }
+    key_ = space_;
+    buf_size_ = sizeof(space_);
+    key_size_ = 0;
+  }
+
+  // Enlarge the buffer size if needed based on key_size.
+  // By default, static allocated buffer is used. Once there is a key
+  // larger than the static allocated buffer, another buffer is dynamically
+  // allocated, until a larger key buffer is requested. In that case, we
+  // reallocate buffer and delete the old one.
+  void EnlargeBufferIfNeeded(size_t key_size) {
+    // If size is smaller than buffer size, continue using current buffer,
+    // or the static allocated one, as default
+    if (key_size > buf_size_) {
+      // Need to enlarge the buffer.
+      // Old contents are discarded: callers always overwrite the whole key.
+      ResetBuffer();
+      key_ = new char[key_size];
+      buf_size_ = key_size;
+    }
+  }
+
+  // No copying allowed
+  IterKey(const IterKey&) = delete;
+  void operator=(const IterKey&) = delete;
+};
+
+class InternalKeySliceTransform : public SliceTransform {
+ public:
+  explicit InternalKeySliceTransform(const SliceTransform* transform)
+      : transform_(transform) {}
+
+  virtual const char* Name() const { return transform_->Name(); }
+
+  virtual Slice Transform(const Slice& src) const {
+    auto user_key = ExtractUserKey(src);
+    return transform_->Transform(user_key);
+  }
+
+  virtual bool InDomain(const Slice& src) const {
+    auto user_key = ExtractUserKey(src);
+    return transform_->InDomain(user_key);
+  }
+
+  virtual bool InRange(const Slice& dst) const {
+    auto user_key = ExtractUserKey(dst);
+    return transform_->InRange(user_key);
+  }
+
+  const SliceTransform* user_prefix_extractor() const { return transform_; }
+
+ private:
+  // Like comparator, InternalKeySliceTransform will not take care of the
+  // deletion of transform_
+  const SliceTransform* const transform_;
+};
+
+}  // namespace rocksdb
diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc
new file mode 100644 (file)
index 0000000..b520f3c
--- /dev/null
@@ -0,0 +1,117 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/dbformat.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+// Encodes (user_key, seq, vt) into a serialized internal key.
+static std::string IKey(const std::string& user_key,
+                        uint64_t seq,
+                        ValueType vt) {
+  std::string encoded;
+  AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
+  return encoded;
+}
+
+// Runs FindShortestSeparator(s, l) over the bytewise comparator and
+// returns the (possibly shortened) result.
+static std::string Shorten(const std::string& s, const std::string& l) {
+  std::string result = s;
+  InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
+  return result;
+}
+
+// Runs FindShortSuccessor over the bytewise comparator.
+static std::string ShortSuccessor(const std::string& s) {
+  std::string result = s;
+  InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
+  return result;
+}
+
+// Round-trips (key, seq, vt) through encode/parse and checks every field;
+// also verifies that an obviously malformed key fails to parse.
+static void TestKey(const std::string& key,
+                    uint64_t seq,
+                    ValueType vt) {
+  std::string encoded = IKey(key, seq, vt);
+
+  Slice in(encoded);
+  ParsedInternalKey decoded("", 0, kTypeValue);
+
+  ASSERT_TRUE(ParseInternalKey(in, &decoded));
+  ASSERT_EQ(key, decoded.user_key.ToString());
+  ASSERT_EQ(seq, decoded.sequence);
+  ASSERT_EQ(vt, decoded.type);
+
+  ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
+}
+
+// Fixture tag used by the TEST() macros below.
+class FormatTest { };
+
+TEST(FormatTest, InternalKey_EncodeDecode) {
+  // Round-trips every (key, seq) combination; sequence numbers straddle
+  // the 1-, 2-, and 4-byte boundaries to exercise the fixed64 encoding.
+  const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
+  const uint64_t seq[] = {
+    1, 2, 3,
+    (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
+    (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
+    (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
+  };
+  for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
+    for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
+      TestKey(keys[k], seq[s], kTypeValue);
+      // Also check a deletion-typed key (repeated each iteration; harmless).
+      TestKey("hello", 1, kTypeDeletion);
+    }
+  }
+}
+
+TEST(FormatTest, InternalKeyShortSeparator) {
+  // FindShortestSeparator must leave the key unchanged unless the user key
+  // can be strictly shortened between start and limit.
+  // When user keys are same
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 99, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 101, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 100, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 100, kTypeDeletion)));
+
+  // When user keys are misordered
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("bar", 99, kTypeValue)));
+
+  // When user keys are different, but correctly ordered
+  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("hello", 200, kTypeValue)));
+
+  // When start user key is prefix of limit user key
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foobar", 200, kTypeValue)));
+
+  // When limit user key is prefix of start user key
+  ASSERT_EQ(IKey("foobar", 100, kTypeValue),
+            Shorten(IKey("foobar", 100, kTypeValue),
+                    IKey("foo", 200, kTypeValue)));
+}
+
+TEST(FormatTest, InternalKeyShortestSuccessor) {
+  // "foo" shortens to "g"; an all-0xff key has no shorter successor and
+  // must be returned unchanged.
+  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+            ShortSuccessor(IKey("foo", 100, kTypeValue)));
+  ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
+            ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
+}
+
+}  // namespace rocksdb
+
+// Runs every FormatTest registered via the TEST() macro.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc
new file mode 100644 (file)
index 0000000..14f0324
--- /dev/null
@@ -0,0 +1,295 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/db.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include <vector>
+#include <stdlib.h>
+#include <map>
+#include <string>
+
+namespace rocksdb {
+
+class DeleteFileTest {
+ public:
+  std::string dbname_;
+  Options options_;
+  DB* db_;
+  Env* env_;
+  int numlevels_;
+
+  DeleteFileTest() {
+    db_ = nullptr;
+    env_ = Env::Default();
+    options_.write_buffer_size = 1024*1024*1000;
+    options_.target_file_size_base = 1024*1024*1000;
+    options_.max_bytes_for_level_base = 1024*1024*1000;
+    options_.WAL_ttl_seconds = 300; // Used to test log files
+    options_.WAL_size_limit_MB = 1024; // Used to test log files
+    dbname_ = test::TmpDir() + "/deletefile_test";
+    options_.wal_dir = dbname_ + "/wal_files";
+
+    // clean up all the files that might have been there before
+    std::vector<std::string> old_files;
+    env_->GetChildren(dbname_, &old_files);
+    for (auto file : old_files) {
+      env_->DeleteFile(dbname_ + "/" + file);
+    }
+    env_->GetChildren(options_.wal_dir, &old_files);
+    for (auto file : old_files) {
+      env_->DeleteFile(options_.wal_dir + "/" + file);
+    }
+
+    DestroyDB(dbname_, options_);
+    numlevels_ = 7;
+    ASSERT_OK(ReopenDB(true));
+  }
+
+  Status ReopenDB(bool create) {
+    delete db_;
+    if (create) {
+      DestroyDB(dbname_, options_);
+    }
+    db_ = nullptr;
+    options_.create_if_missing = create;
+    return DB::Open(options_, dbname_, &db_);
+  }
+
+  void CloseDB() {
+    delete db_;
+  }
+
+  void AddKeys(int numkeys, int startkey = 0) {
+    WriteOptions options;
+    options.sync = false;
+    ReadOptions roptions;
+    for (int i = startkey; i < (numkeys + startkey) ; i++) {
+      std::string temp = std::to_string(i);
+      Slice key(temp);
+      Slice value(temp);
+      ASSERT_OK(db_->Put(options, key, value));
+    }
+  }
+
+  int numKeysInLevels(
+    std::vector<LiveFileMetaData> &metadata,
+    std::vector<int> *keysperlevel = nullptr) {
+
+    if (keysperlevel != nullptr) {
+      keysperlevel->resize(numlevels_);
+    }
+
+    int numKeys = 0;
+    for (size_t i = 0; i < metadata.size(); i++) {
+      int startkey = atoi(metadata[i].smallestkey.c_str());
+      int endkey = atoi(metadata[i].largestkey.c_str());
+      int numkeysinfile = (endkey - startkey + 1);
+      numKeys += numkeysinfile;
+      if (keysperlevel != nullptr) {
+        (*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
+      }
+      fprintf(stderr, "level %d name %s smallest %s largest %s\n",
+              metadata[i].level, metadata[i].name.c_str(),
+              metadata[i].smallestkey.c_str(),
+              metadata[i].largestkey.c_str());
+    }
+    return numKeys;
+  }
+
+  void CreateTwoLevels() {
+    AddKeys(50000, 10000);
+    DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+    ASSERT_OK(dbi->TEST_FlushMemTable());
+    ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+
+    AddKeys(50000, 10000);
+    ASSERT_OK(dbi->TEST_FlushMemTable());
+    ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+  }
+
+  void CheckFileTypeCounts(std::string& dir,
+                            int required_log,
+                            int required_sst,
+                            int required_manifest) {
+    std::vector<std::string> filenames;
+    env_->GetChildren(dir, &filenames);
+
+    int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+    for (auto file : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(file, &number, &type)) {
+        log_cnt += (type == kLogFile);
+        sst_cnt += (type == kTableFile);
+        manifest_cnt += (type == kDescriptorFile);
+      }
+    }
+    ASSERT_EQ(required_log, log_cnt);
+    ASSERT_EQ(required_sst, sst_cnt);
+    ASSERT_EQ(required_manifest, manifest_cnt);
+  }
+
+};
+
TEST(DeleteFileTest, AddKeysAndQueryLevels) {
  CreateTwoLevels();
  std::vector<LiveFileMetaData> metadata;
  std::vector<int> keysinlevel;
  db_->GetLiveFilesMetaData(&metadata);

  std::string level1file = "";
  int level1keycount = 0;
  std::string level2file = "";
  int level2keycount = 0;
  int level1index = 0;
  int level2index = 1;

  // Expect exactly two live files; work out which metadata entry
  // belongs to which level (the order is not guaranteed).
  ASSERT_EQ((int)metadata.size(), 2);
  if (metadata[0].level == 2) {
    level1index = 1;
    level2index = 0;
  }

  // Recover each file's key count from its integer key range.
  level1file = metadata[level1index].name;
  int startkey = atoi(metadata[level1index].smallestkey.c_str());
  int endkey = atoi(metadata[level1index].largestkey.c_str());
  level1keycount = (endkey - startkey + 1);
  level2file = metadata[level2index].name;
  startkey = atoi(metadata[level2index].smallestkey.c_str());
  endkey = atoi(metadata[level2index].largestkey.c_str());
  level2keycount = (endkey - startkey + 1);

  // Controlled setup. Levels 1 and 2 should both have 50K keys.
  // This is a little fragile as it depends on the current
  // compaction heuristics.
  ASSERT_EQ(level1keycount, 50000);
  ASSERT_EQ(level2keycount, 50000);

  // A file name that does not belong to the DB is rejected.
  Status status = db_->DeleteFile("0.sst");
  ASSERT_TRUE(status.IsInvalidArgument());

  // intermediate level files cannot be deleted.
  status = db_->DeleteFile(level1file);
  ASSERT_TRUE(status.IsInvalidArgument());

  // Lowest level file deletion should succeed.
  ASSERT_OK(db_->DeleteFile(level2file));

  CloseDB();
}
+
TEST(DeleteFileTest, PurgeObsoleteFilesTest) {
  CreateTwoLevels();
  // there should be only one (empty) log file because CreateTwoLevels()
  // flushes the memtables to disk
  CheckFileTypeCounts(options_.wal_dir, 1, 0, 0);
  // 2 ssts, 1 manifest
  CheckFileTypeCounts(dbname_, 0, 2, 1);
  std::string first("0"), last("999999");
  Slice first_slice(first), last_slice(last);
  // Compact the full key range (trailing args presumably
  // reduce_level/target_level — see DB::CompactRange for their meaning).
  db_->CompactRange(&first_slice, &last_slice, true, 2);
  // 1 sst after compaction
  CheckFileTypeCounts(dbname_, 0, 1, 1);

  // this time, we keep an iterator alive: the pre-compaction files it
  // pins cannot be purged until the iterator is released
  ReopenDB(true);
  Iterator *itr = 0;
  CreateTwoLevels();
  itr = db_->NewIterator(ReadOptions());
  db_->CompactRange(&first_slice, &last_slice, true, 2);
  // 3 sst after compaction with live iterator
  CheckFileTypeCounts(dbname_, 0, 3, 1);
  delete itr;
  // 1 sst after iterator deletion
  CheckFileTypeCounts(dbname_, 0, 1, 1);

  CloseDB();
}
+
TEST(DeleteFileTest, DeleteFileWithIterator) {
  CreateTwoLevels();
  ReadOptions options;
  Iterator* it = db_->NewIterator(options);
  std::vector<LiveFileMetaData> metadata;
  db_->GetLiveFilesMetaData(&metadata);

  std::string level2file = "";

  // Two live files expected; pick out the one NOT on level 1.
  ASSERT_EQ((int)metadata.size(), 2);
  if (metadata[0].level == 1) {
    level2file = metadata[1].name;
  } else {
    level2file = metadata[0].name;
  }

  // Deleting the bottom-level file must succeed even while an iterator
  // is open on the DB.
  Status status = db_->DeleteFile(level2file);
  fprintf(stdout, "Deletion status %s: %s\n",
          level2file.c_str(), status.ToString().c_str());
  ASSERT_TRUE(status.ok());
  // The iterator was created before the deletion, so it must still see
  // all 50000 keys it had when it was opened.
  it->SeekToFirst();
  int numKeysIterated = 0;
  while(it->Valid()) {
    numKeysIterated++;
    it->Next();
  }
  ASSERT_EQ(numKeysIterated, 50000);
  delete it;
  CloseDB();
}
+
TEST(DeleteFileTest, DeleteLogFiles) {
  AddKeys(10, 0);
  VectorLogPtr logfiles;
  db_->GetSortedWalFiles(logfiles);
  ASSERT_GT(logfiles.size(), 0UL);
  // Take the last log file which is expected to be alive and try to delete it
  // Should not succeed because live logs are not allowed to be deleted
  std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
  ASSERT_EQ(alive_log->Type(), kAliveLogFile);
  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
  fprintf(stdout, "Deleting alive log file %s\n",
          alive_log->PathName().c_str());
  ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok());
  // The refused deletion must leave the file on disk.
  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
  logfiles.clear();

  // Call Flush to bring about a new working log file and add more keys
  // Call Flush again to flush out memtable and move alive log to archived log
  // and try to delete the archived log file
  FlushOptions fopts;
  db_->Flush(fopts);
  AddKeys(10, 0);
  db_->Flush(fopts);
  db_->GetSortedWalFiles(logfiles);
  ASSERT_GT(logfiles.size(), 0UL);
  std::unique_ptr<LogFile> archived_log = std::move(logfiles.front());
  ASSERT_EQ(archived_log->Type(), kArchivedLogFile);
  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" +
        archived_log->PathName()));
  fprintf(stdout, "Deleting archived log file %s\n",
          archived_log->PathName().c_str());
  // Archived logs, unlike alive ones, may be deleted by the user.
  ASSERT_OK(db_->DeleteFile(archived_log->PathName()));
  ASSERT_TRUE(!env_->FileExists(options_.wal_dir + "/" +
        archived_log->PathName()));
  CloseDB();
}
+
+} //namespace rocksdb
+
// Run every test registered via the TEST macro in this file.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
+
diff --git a/db/file_indexer.cc b/db/file_indexer.cc
new file mode 100644 (file)
index 0000000..2de7660
--- /dev/null
@@ -0,0 +1,202 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/file_indexer.h"
+#include <algorithm>
+#include "rocksdb/comparator.h"
+#include "db/version_edit.h"
+
+namespace rocksdb {
+
// num_levels: number of LSM levels this indexer covers.
// ucmp: user-key comparator used when building the index.
// level_rb_ entries start at -1, i.e. "no files indexed yet".
FileIndexer::FileIndexer(const uint32_t num_levels,
                         const Comparator* ucmp)
  : num_levels_(num_levels),
    ucmp_(ucmp),
    next_level_index_(num_levels),
    level_rb_(num_levels, -1) {
}
+
+
+uint32_t FileIndexer::NumLevelIndex() {
+  return next_level_index_.size();
+}
+
+uint32_t FileIndexer::LevelIndexSize(uint32_t level) {
+  return next_level_index_[level].size();
+}
+
// Translate the three-way results of comparing a search key against file
// `file_index` on `level` (cmp_smallest vs the file's smallest key,
// cmp_largest vs its largest; each <0, 0 or >0) into a
// [*left_bound, *right_bound] file-index range to search on level+1.
// An empty range is reported as left_bound = 0, right_bound = -1.
void FileIndexer::GetNextLevelIndex(
    const uint32_t level, const uint32_t file_index, const int cmp_smallest,
    const int cmp_largest, int32_t* left_bound, int32_t* right_bound) {
  assert(level > 0);

  // Last level, no hint
  if (level == num_levels_ - 1) {
    *left_bound = 0;
    *right_bound = -1;
    return;
  }

  assert(level < num_levels_ - 1);
  assert(static_cast<int32_t>(file_index) <= level_rb_[level]);

  const auto& index = next_level_index_[level][file_index];

  if (cmp_smallest < 0) {
    // Key is below this file's smallest: start at the previous file's
    // largest_lb (0 for the first file), end at this file's smallest_rb.
    *left_bound = (level > 0 && file_index > 0) ?
      next_level_index_[level][file_index - 1].largest_lb : 0;
    *right_bound = index.smallest_rb;
  } else if (cmp_smallest == 0) {
    *left_bound = index.smallest_lb;
    *right_bound = index.smallest_rb;
  } else if (cmp_smallest > 0 && cmp_largest < 0) {
    // Key falls strictly inside this file's key range.
    *left_bound = index.smallest_lb;
    *right_bound = index.largest_rb;
  } else if (cmp_largest == 0) {
    *left_bound = index.largest_lb;
    *right_bound = index.largest_rb;
  } else if (cmp_largest > 0) {
    // Key is above this file's largest: search up to the last file
    // indexed on the next level.
    *left_bound = index.largest_lb;
    *right_bound = level_rb_[level + 1];
  } else {
    assert(false);
  }

  // The produced range must be well-formed and within the next level.
  assert(*left_bound >= 0);
  assert(*left_bound <= *right_bound + 1);
  assert(*right_bound <= level_rb_[level + 1]);
}
+
+void FileIndexer::ClearIndex() {
+  for (uint32_t level = 1; level < num_levels_; ++level) {
+    next_level_index_[level].clear();
+  }
+}
+
// Rebuild the hint index from `files`, an array indexed by level where
// files[level] is that level's vector of FileMetaData*. Levels
// 1..num_levels_-2 get one IndexUnit per file; the last level only has
// its right bound recorded since there is no level below it.
void FileIndexer::UpdateIndex(std::vector<FileMetaData*>* const files) {
  if (files == nullptr) {
    return;
  }

  // L1 - Ln-1
  for (uint32_t level = 1; level < num_levels_ - 1; ++level) {
    const auto& upper_files = files[level];
    const int32_t upper_size = upper_files.size();
    const auto& lower_files = files[level + 1];
    level_rb_[level] = upper_files.size() - 1;
    if (upper_size == 0) {
      continue;
    }
    auto& index = next_level_index_[level];
    index.resize(upper_size);

    // smallest_lb: first lower file whose largest >= this file's smallest.
    CalculateLB(upper_files, lower_files, &index,
        [this](const FileMetaData* a, const FileMetaData* b) -> int {
          return ucmp_->Compare(a->smallest.user_key(), b->largest.user_key());
        },
        [](IndexUnit* index, int32_t f_idx) {
          index->smallest_lb = f_idx;
        });
    // largest_lb: first lower file whose largest >= this file's largest.
    CalculateLB(upper_files, lower_files, &index,
        [this](const FileMetaData* a, const FileMetaData* b) -> int {
          return ucmp_->Compare(a->largest.user_key(), b->largest.user_key());
        },
        [](IndexUnit* index, int32_t f_idx) {
          index->largest_lb = f_idx;
        });
    // smallest_rb: last lower file whose smallest <= this file's smallest.
    CalculateRB(upper_files, lower_files, &index,
        [this](const FileMetaData* a, const FileMetaData* b) -> int {
          return ucmp_->Compare(a->smallest.user_key(), b->smallest.user_key());
        },
        [](IndexUnit* index, int32_t f_idx) {
          index->smallest_rb = f_idx;
        });
    // largest_rb: last lower file whose smallest <= this file's largest.
    CalculateRB(upper_files, lower_files, &index,
        [this](const FileMetaData* a, const FileMetaData* b) -> int {
          return ucmp_->Compare(a->largest.user_key(), b->smallest.user_key());
        },
        [](IndexUnit* index, int32_t f_idx) {
          index->largest_rb = f_idx;
        });
  }
  level_rb_[num_levels_ - 1] = files[num_levels_ - 1].size() - 1;
}
+
// Two-pointer forward scan: for each upper-level file, record (via
// set_index) the index of the first lower-level file for which
// cmp_op(upper, lower) <= 0. Upper files for which every lower file
// compares smaller get lower_size as their bound.
void FileIndexer::CalculateLB(const std::vector<FileMetaData*>& upper_files,
    const std::vector<FileMetaData*>& lower_files,
    std::vector<IndexUnit>* index,
    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
    std::function<void(IndexUnit*, int32_t)> set_index) {
  const int32_t upper_size = upper_files.size();
  const int32_t lower_size = lower_files.size();
  int32_t upper_idx = 0;
  int32_t lower_idx = 0;
  while (upper_idx < upper_size && lower_idx < lower_size) {
    int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);

    if (cmp == 0) {
      set_index(&(*index)[upper_idx], lower_idx);
      ++upper_idx;
      ++lower_idx;
    } else if (cmp > 0) {
      // Lower level's file (largest) is smaller, a key won't hit in that
      // file. Move to next lower file
      ++lower_idx;
    } else {
      // Lower level's file becomes larger, update the index, and
      // move to the next upper file
      set_index(&(*index)[upper_idx], lower_idx);
      ++upper_idx;
    }
  }

  while (upper_idx < upper_size) {
    // Lower files are exhausted, that means the remaining upper files are
    // greater than any lower files. Set the index to be the lower level size.
    set_index(&(*index)[upper_idx], lower_size);
    ++upper_idx;
  }
}
+
// Mirror of CalculateLB scanning from the back: for each upper-level
// file, record the index of the last lower-level file for which
// cmp_op(upper, lower) >= 0; upper files below every lower file get -1.
void FileIndexer::CalculateRB(const std::vector<FileMetaData*>& upper_files,
    const std::vector<FileMetaData*>& lower_files,
    std::vector<IndexUnit>* index,
    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
    std::function<void(IndexUnit*, int32_t)> set_index) {
  const int32_t upper_size = upper_files.size();
  const int32_t lower_size = lower_files.size();
  int32_t upper_idx = upper_size - 1;
  int32_t lower_idx = lower_size - 1;
  while (upper_idx >= 0 && lower_idx >= 0) {
    int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);

    if (cmp == 0) {
      set_index(&(*index)[upper_idx], lower_idx);
      --upper_idx;
      --lower_idx;
    } else if (cmp < 0) {
      // Lower level's file (smallest) is larger, a key won't hit in that
      // file. Move to next lower file.
      --lower_idx;
    } else {
      // Lower level's file becomes smaller, update the index, and move to
      // the next the upper file
      set_index(&(*index)[upper_idx], lower_idx);
      --upper_idx;
    }
  }
  while (upper_idx >= 0) {
    // Lower files are exhausted, that means the remaining upper files are
    // smaller than any lower files. Set it to -1.
    set_index(&(*index)[upper_idx], -1);
    --upper_idx;
  }
}
+
+}  // namespace rocksdb
diff --git a/db/file_indexer.h b/db/file_indexer.h
new file mode 100644 (file)
index 0000000..5e405df
--- /dev/null
@@ -0,0 +1,129 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <vector>
+
+namespace rocksdb {
+
+class Comparator;
+struct FileMetaData;
+
// The file tree structure in Version is prebuilt and the range of each file
// is known. On Version::Get(), it uses binary search to find a potential file
// and then check if a target key can be found in the file by comparing the key
// to each file's smallest and largest key. The results of these comparisons
// can be reused beyond checking if a key falls into a file's range.
// With some pre-calculated knowledge, each key comparison that has been done
// can serve as a hint to narrow down further searches: if a key compared to
// be smaller than a file's smallest or largest, that comparison can be used
// to find out the right bound of next binary search. Similarly, if a key
// compared to be larger than a file's smallest or largest, it can be utilized
// to find out the left bound of next binary search.
// With these hints: it can greatly reduce the range of binary search,
// especially for bottom levels, given that one file most likely overlaps with
// only N files from level below (where N is max_bytes_for_level_multiplier).
// So on level L, we will only look at ~N files instead of N^L files on the
// naive approach.
class FileIndexer {
 public:
  FileIndexer(const uint32_t num_levels, const Comparator* ucmp);

  // Number of per-level index vectors held.
  uint32_t NumLevelIndex();

  // Number of index units currently stored for `level`.
  uint32_t LevelIndexSize(uint32_t level);

  // Return a file index range in the next level to search for a key based on
  // smallest and largest key comparison for the current file specified by
  // level and file_index. When *left_index < *right_index, both index should
  // be valid and fit in the vector size.
  void GetNextLevelIndex(
    const uint32_t level, const uint32_t file_index, const int cmp_smallest,
    const int cmp_largest, int32_t* left_bound, int32_t* right_bound);

  // Drop all built index entries.
  void ClearIndex();

  // Rebuild the index from `files`, an array (indexed by level) of
  // per-level FileMetaData* vectors.
  void UpdateIndex(std::vector<FileMetaData*>* const files);

  enum {
    kLevelMaxIndex = std::numeric_limits<int32_t>::max()
  };

 private:
  const uint32_t num_levels_;
  const Comparator* ucmp_;

  struct IndexUnit {
    IndexUnit()
      : smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {}
    // During file search, a key is compared against smallest and largest
    // from a FileMetaData. It can have 3 possible outcomes:
    // (1) key is smaller than smallest, implying it is also smaller than
    //     largest. Precalculated index based on "smallest < smallest" can
    //     be used to provide right bound.
    // (2) key is in between smallest and largest.
    //     Precalculated index based on "smallest > largest" can be used to
    //     provide left bound.
    //     Precalculated index based on "largest < smallest" can be used to
    //     provide right bound.
    // (3) key is larger than largest, implying it is also larger than smallest.
    //     Precalculated index based on "largest > largest" can be used to
    //     provide left bound.
    //
    // As a result, we will need to do:
    // Compare smallest (<=) and largest keys from upper level file with
    // smallest key from lower level to get a right bound.
    // Compare smallest (>=) and largest keys from upper level file with
    // largest key from lower level to get a left bound.
    //
    // Example:
    //    level 1:              [50 - 60]
    //    level 2:        [1 - 40], [45 - 55], [58 - 80]
    // A key 35, compared to be less than 50, 3rd file on level 2 can be
    // skipped according to rule (1). LB = 0, RB = 1.
    // A key 53, sits in the middle 50 and 60. 1st file on level 2 can be
    // skipped according to rule (2)-a, but the 3rd file cannot be skipped
    // because 60 is greater than 58. LB = 1, RB = 2.
    // A key 70, compared to be larger than 60. 1st and 2nd file can be skipped
    // according to rule (3). LB = 2, RB = 2.
    //
    // Point to a left most file in a lower level that may contain a key,
    // which compares greater than smallest of a FileMetaData (upper level)
    int32_t smallest_lb;
    // Point to a left most file in a lower level that may contain a key,
    // which compares greater than largest of a FileMetaData (upper level)
    int32_t largest_lb;
    // Point to a right most file in a lower level that may contain a key,
    // which compares smaller than smallest of a FileMetaData (upper level)
    int32_t smallest_rb;
    // Point to a right most file in a lower level that may contain a key,
    // which compares smaller than largest of a FileMetaData (upper level)
    int32_t largest_rb;
  };

  // Forward two-pointer scan setting left bounds (see file_indexer.cc).
  void CalculateLB(const std::vector<FileMetaData*>& upper_files,
    const std::vector<FileMetaData*>& lower_files,
    std::vector<IndexUnit>* index,
    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
    std::function<void(IndexUnit*, int32_t)> set_index);

  // Backward two-pointer scan setting right bounds (see file_indexer.cc).
  void CalculateRB(const std::vector<FileMetaData*>& upper_files,
    const std::vector<FileMetaData*>& lower_files,
    std::vector<IndexUnit>* index,
    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
    std::function<void(IndexUnit*, int32_t)> set_index);

  std::vector<std::vector<IndexUnit>> next_level_index_;
  std::vector<int32_t> level_rb_;
};
+
+}  // namespace rocksdb
diff --git a/db/file_indexer_test.cc b/db/file_indexer_test.cc
new file mode 100644 (file)
index 0000000..14d67f4
--- /dev/null
@@ -0,0 +1,330 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <string>
+#include "db/file_indexer.h"
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class IntComparator : public Comparator {
+ public:
+  int Compare(const Slice& a, const Slice& b) const {
+    assert(a.size() == 8);
+    assert(b.size() == 8);
+    return *reinterpret_cast<const int64_t*>(a.data()) -
+      *reinterpret_cast<const int64_t*>(b.data());
+  }
+
+  const char* Name() const {
+    return "IntComparator";
+  }
+
+  void FindShortestSeparator(std::string* start, const Slice& limit) const {}
+
+  void FindShortSuccessor(std::string* key) const {}
+};
+
+
// Fixture: builds per-level FileMetaData vectors with integer keys and
// feeds them to a FileIndexer spanning 4 levels.
struct FileIndexerTest {
 public:
  FileIndexerTest() :
    kNumLevels(4), indexer(kNumLevels, &ucmp),
    files(new std::vector<FileMetaData*>[kNumLevels]) {
  }

  ~FileIndexerTest() {
    Reset();
    delete[] files;
  }

  // Append a file spanning [smallest, largest] to `level`.
  void AddFile(int level, int64_t smallest, int64_t largest) {
    auto* f = new FileMetaData();
    f->smallest = IntKey(smallest);
    f->largest = IntKey(largest);
    files[level].push_back(f);
  }

  // Wrap an int64 in an InternalKey (sequence 0, value type); the key
  // bytes are copied, so taking the address of the local is fine.
  InternalKey IntKey(int64_t v) {
    return InternalKey(Slice(reinterpret_cast<char*>(&v), 8), 0, kTypeValue);
  }

  // Free all files and drop the built index so a new case can start.
  void Reset() {
    for (uint32_t i = 0; i < kNumLevels; ++i) {
      for (auto* f : files[i]) {
        delete f;
      }
      files[i].clear();
    }
    indexer.ClearIndex();
  }

  // Wrapper that pre-poisons the outputs with 100 so the tests would
  // notice if GetNextLevelIndex forgot to set either bound.
  void GetNextLevelIndex(const uint32_t level, const uint32_t file_index,
      const int cmp_smallest, const int cmp_largest, int32_t* left_index,
      int32_t* right_index) {
    *left_index = 100;
    *right_index = 100;
    indexer.GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest,
        left_index, right_index);
  }

  const uint32_t kNumLevels;
  IntComparator ucmp;
  FileIndexer indexer;

  // Array of kNumLevels per-level vectors, owned by this fixture.
  std::vector<FileMetaData*>* files;
};
+
// Exercises GetNextLevelIndex across four layouts: files entirely left of
// the next level, entirely right of it, an empty intermediate level, and
// a mixed overlapping layout with hand-computed expected bounds.
TEST(FileIndexerTest, next_level_hint) {
  for (uint32_t i = 0; i < kNumLevels; ++i) {
    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
  }

  // Case 1: no overlap, files are on the left of next level files
  // level 1
  AddFile(1, 100, 200);
  AddFile(1, 300, 400);
  AddFile(1, 500, 600);
  // level 2
  AddFile(2, 1500, 1600);
  AddFile(2, 1601, 1699);
  AddFile(2, 1700, 1800);
  // level 3
  AddFile(3, 2500, 2600);
  AddFile(3, 2601, 2699);
  AddFile(3, 2700, 2800);
  indexer.UpdateIndex(files);
  int32_t left = 100;
  int32_t right = 100;
  for (uint32_t level = 1; level < 3; ++level) {
    for (uint32_t f = 0; f < 3; ++f) {
      // All keys at/below this file sort before every next-level file,
      // so the range is empty (0, -1) unless the key is above largest.
      GetNextLevelIndex(level, f, -1, -1, &left, &right);
      ASSERT_EQ(0, left);
      ASSERT_EQ(-1, right);
      GetNextLevelIndex(level, f, 0, -1, &left, &right);
      ASSERT_EQ(0, left);
      ASSERT_EQ(-1, right);
      GetNextLevelIndex(level, f, 1, -1, &left, &right);
      ASSERT_EQ(0, left);
      ASSERT_EQ(-1, right);
      GetNextLevelIndex(level, f, 1, 0, &left, &right);
      ASSERT_EQ(0, left);
      ASSERT_EQ(-1, right);
      GetNextLevelIndex(level, f, 1, 1, &left, &right);
      ASSERT_EQ(0, left);
      ASSERT_EQ(2, right);
    }
  }

  // Case 2: no overlap, files are on the right of next level files
  Reset();
  for (uint32_t i = 1; i < kNumLevels; ++i) {
    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
  }
  // level 1
  AddFile(1, 2100, 2200);
  AddFile(1, 2300, 2400);
  AddFile(1, 2500, 2600);
  // level 2
  AddFile(2, 1500, 1600);
  AddFile(2, 1501, 1699);
  AddFile(2, 1700, 1800);
  // level 3
  AddFile(3, 500, 600);
  AddFile(3, 501, 699);
  AddFile(3, 700, 800);
  indexer.UpdateIndex(files);
  for (uint32_t level = 1; level < 3; ++level) {
    for (uint32_t f = 0; f < 3; ++f) {
      // Here only a key below the first file's smallest can still fall
      // into next-level files, and only for f == 0; otherwise (3, 2),
      // i.e. an empty range past the end.
      GetNextLevelIndex(level, f, -1, -1, &left, &right);
      ASSERT_EQ(f == 0 ? 0 : 3, left);
      ASSERT_EQ(2, right);
      GetNextLevelIndex(level, f, 0, -1, &left, &right);
      ASSERT_EQ(3, left);
      ASSERT_EQ(2, right);
      GetNextLevelIndex(level, f, 1, -1, &left, &right);
      ASSERT_EQ(3, left);
      ASSERT_EQ(2, right);
      // (duplicate of the previous check)
      GetNextLevelIndex(level, f, 1, -1, &left, &right);
      ASSERT_EQ(3, left);
      ASSERT_EQ(2, right);
      GetNextLevelIndex(level, f, 1, 0, &left, &right);
      ASSERT_EQ(3, left);
      ASSERT_EQ(2, right);
      GetNextLevelIndex(level, f, 1, 1, &left, &right);
      ASSERT_EQ(3, left);
      ASSERT_EQ(2, right);
    }
  }

  // Case 3: empty L2
  Reset();
  for (uint32_t i = 1; i < kNumLevels; ++i) {
    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
  }
  // level 1
  AddFile(1, 2100, 2200);
  AddFile(1, 2300, 2400);
  AddFile(1, 2500, 2600);
  // level 3
  AddFile(3, 500, 600);
  AddFile(3, 501, 699);
  AddFile(3, 700, 800);
  indexer.UpdateIndex(files);
  // With no files on level 2 every hint is the empty range (0, -1).
  for (uint32_t f = 0; f < 3; ++f) {
    GetNextLevelIndex(1, f, -1, -1, &left, &right);
    ASSERT_EQ(0, left);
    ASSERT_EQ(-1, right);
    GetNextLevelIndex(1, f, 0, -1, &left, &right);
    ASSERT_EQ(0, left);
    ASSERT_EQ(-1, right);
    GetNextLevelIndex(1, f, 1, -1, &left, &right);
    ASSERT_EQ(0, left);
    ASSERT_EQ(-1, right);
    GetNextLevelIndex(1, f, 1, -1, &left, &right);
    ASSERT_EQ(0, left);
    ASSERT_EQ(-1, right);
    GetNextLevelIndex(1, f, 1, 0, &left, &right);
    ASSERT_EQ(0, left);
    ASSERT_EQ(-1, right);
    GetNextLevelIndex(1, f, 1, 1, &left, &right);
    ASSERT_EQ(0, left);
    ASSERT_EQ(-1, right);
  }


  // Case 4: mixed
  Reset();
  for (uint32_t i = 1; i < kNumLevels; ++i) {
    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
  }
  // level 1
  AddFile(1, 100, 200);
  AddFile(1, 250, 400);
  AddFile(1, 450, 500);
  // level 2
  AddFile(2, 100, 150);  // 0
  AddFile(2, 200, 250);  // 1
  AddFile(2, 251, 300);  // 2
  AddFile(2, 301, 350);  // 3
  AddFile(2, 500, 600);  // 4
  // level 3
  AddFile(3, 0, 50);
  AddFile(3, 100, 200);
  AddFile(3, 201, 250);
  indexer.UpdateIndex(files);
  // level 1, 0
  GetNextLevelIndex(1, 0, -1, -1, &left, &right);
  ASSERT_EQ(0, left);
  ASSERT_EQ(0, right);
  GetNextLevelIndex(1, 0, 0, -1, &left, &right);
  ASSERT_EQ(0, left);
  ASSERT_EQ(0, right);
  GetNextLevelIndex(1, 0, 1, -1, &left, &right);
  ASSERT_EQ(0, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(1, 0, 1, 0, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(1, 0, 1, 1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(4, right);
  // level 1, 1
  GetNextLevelIndex(1, 1, -1, -1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(1, 1, 0, -1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(1, 1, 1, -1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(3, right);
  GetNextLevelIndex(1, 1, 1, 0, &left, &right);
  ASSERT_EQ(4, left);
  ASSERT_EQ(3, right);
  GetNextLevelIndex(1, 1, 1, 1, &left, &right);
  ASSERT_EQ(4, left);
  ASSERT_EQ(4, right);
  // level 1, 2
  GetNextLevelIndex(1, 2, -1, -1, &left, &right);
  ASSERT_EQ(4, left);
  ASSERT_EQ(3, right);
  GetNextLevelIndex(1, 2, 0, -1, &left, &right);
  ASSERT_EQ(4, left);
  ASSERT_EQ(3, right);
  GetNextLevelIndex(1, 2, 1, -1, &left, &right);
  ASSERT_EQ(4, left);
  ASSERT_EQ(4, right);
  GetNextLevelIndex(1, 2, 1, 0, &left, &right);
  ASSERT_EQ(4, left);
  ASSERT_EQ(4, right);
  GetNextLevelIndex(1, 2, 1, 1, &left, &right);
  ASSERT_EQ(4, left);
  ASSERT_EQ(4, right);
  // level 2, 0
  GetNextLevelIndex(2, 0, -1, -1, &left, &right);
  ASSERT_EQ(0, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(2, 0, 0, -1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(2, 0, 1, -1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(2, 0, 1, 0, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(2, 0, 1, 1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(2, right);
  // level 2, 1
  GetNextLevelIndex(2, 1, -1, -1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(2, 1, 0, -1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(2, 1, 1, -1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(2, right);
  GetNextLevelIndex(2, 1, 1, 0, &left, &right);
  ASSERT_EQ(2, left);
  ASSERT_EQ(2, right);
  GetNextLevelIndex(2, 1, 1, 1, &left, &right);
  ASSERT_EQ(2, left);
  ASSERT_EQ(2, right);
  // level 2, [2 - 4], no overlap
  for (uint32_t f = 2; f <= 4; ++f) {
    GetNextLevelIndex(2, f, -1, -1, &left, &right);
    ASSERT_EQ(f == 2 ? 2 : 3, left);
    ASSERT_EQ(2, right);
    GetNextLevelIndex(2, f, 0, -1, &left, &right);
    ASSERT_EQ(3, left);
    ASSERT_EQ(2, right);
    GetNextLevelIndex(2, f, 1, -1, &left, &right);
    ASSERT_EQ(3, left);
    ASSERT_EQ(2, right);
    GetNextLevelIndex(2, f, 1, 0, &left, &right);
    ASSERT_EQ(3, left);
    ASSERT_EQ(2, right);
    GetNextLevelIndex(2, f, 1, 1, &left, &right);
    ASSERT_EQ(3, left);
    ASSERT_EQ(2, right);
  }
}
+
+}  // namespace rocksdb
+
// Run every test registered via the TEST macro in this file.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
diff --git a/db/filename.cc b/db/filename.cc
new file mode 100644 (file)
index 0000000..4b3ac8e
--- /dev/null
@@ -0,0 +1,261 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/filename.h"
+
+#include <ctype.h>
+#include <stdio.h>
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+// Given a path, flatten the path name by replacing all chars not in
+// {[0-9,a-z,A-Z,-,_,.]} with _. And append '\0' at the end.
+// Return the number of chars stored in dest not including the trailing '\0'.
+static int FlattenPath(const std::string& path, char* dest, int len) {
+  int write_idx = 0;
+  int i = 0;
+  int src_len = path.size();
+
+  // Copy until the source is exhausted or dest is full (one slot is
+  // reserved for the trailing '\0'); excess input is silently truncated.
+  while (i < src_len && write_idx < len - 1) {
+    if ((path[i] >= 'a' && path[i] <= 'z') ||
+        (path[i] >= '0' && path[i] <= '9') ||
+        (path[i] >= 'A' && path[i] <= 'Z') ||
+        path[i] == '-' ||
+        path[i] == '.' ||
+        path[i] == '_'){
+      dest[write_idx++] = path[i];
+    } else {
+      // A disallowed char at index 0 (typically the path's leading '/')
+      // is dropped outright instead of being replaced with '_'.
+      if (i > 0)
+        dest[write_idx++] = '_';
+    }
+    i++;
+  }
+
+  dest[write_idx] = '\0';
+  return write_idx;
+}
+
+// Builds "<name>/NNNNNN.<suffix>" where the file number is zero-padded
+// to at least six digits.
+static std::string MakeFileName(const std::string& name, uint64_t number,
+                                const char* suffix) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/%06llu.%s",
+           static_cast<unsigned long long>(number),
+           suffix);
+  return name + buf;
+}
+
+std::string LogFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);  // 0 is never a valid WAL file number
+  return MakeFileName(name, number, "log");
+}
+
+// Sub-directory of "dir" that holds archived (obsolete) WAL files.
+std::string ArchivalDirectory(const std::string& dir) {
+  return dir + "/" + ARCHIVAL_DIR;
+}
+// Name of a WAL file after it has been moved into name's archive/ dir.
+std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log");
+}
+
+// "<name>/NNNNNN.sst" for the sstable with the given file number.
+std::string TableFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name, number, "sst");
+}
+
+// "<dbname>/MANIFEST-NNNNNN" with the number zero-padded to six digits.
+std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
+  assert(number > 0);
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
+           static_cast<unsigned long long>(number));
+  return dbname + buf;
+}
+
+// The CURRENT file holds the name of the active MANIFEST file.
+std::string CurrentFileName(const std::string& dbname) {
+  return dbname + "/CURRENT";
+}
+
+// The LOCK file guards the db directory against concurrent opens.
+std::string LockFileName(const std::string& dbname) {
+  return dbname + "/LOCK";
+}
+
+// "<dbname>/NNNNNN.dbtmp" — scratch file later renamed into place.
+std::string TempFileName(const std::string& dbname, uint64_t number) {
+  return MakeFileName(dbname, number, "dbtmp");
+}
+
+std::string InfoLogFileName(const std::string& dbname,
+    const std::string& db_path, const std::string& log_dir) {
+  if (log_dir.empty())
+    return dbname + "/LOG";
+
+  // With an external log_dir, embed the (flattened) db path in the file
+  // name so logs from different DBs can share one directory.
+  char flatten_db_path[256];
+  FlattenPath(db_path, flatten_db_path, 256);
+  return log_dir + "/" + flatten_db_path + "_LOG";
+}
+
+// Return the name of the old info log file for "dbname".
+std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+    const std::string& db_path, const std::string& log_dir) {
+  char buf[50];
+  // "ts" is the timestamp suffix appended after "LOG.old."; not padded.
+  snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts));
+
+  if (log_dir.empty())
+    return dbname + "/LOG.old." + buf;
+
+  // Same flattened-db-path scheme as InfoLogFileName.
+  char flatten_db_path[256];
+  FlattenPath(db_path, flatten_db_path, 256);
+  return log_dir + "/" + flatten_db_path + "_LOG.old." + buf;
+}
+
+// "<dbname>/METADB-N". Unlike MANIFEST-, the number here is NOT
+// zero-padded ("%llu" vs "%06llu").
+std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/METADB-%llu",
+           static_cast<unsigned long long>(number));
+  return dbname + buf;
+}
+
+// The IDENTITY file stores the db's unique id (see SetIdentityFile).
+std::string IdentityFileName(const std::string& dbname) {
+  return dbname + "/IDENTITY";
+}
+
+// Owned filenames have the form:
+//    dbname/IDENTITY
+//    dbname/CURRENT
+//    dbname/LOCK
+//    dbname/LOG
+//    dbname/LOG.old.[0-9]+
+//    dbname/MANIFEST-[0-9]+
+//    dbname/[0-9]+.(log|sst)
+//    dbname/METADB-[0-9]+
+//    Disregards / at the beginning
+// Classifies "fname" into a FileType and extracts its embedded number
+// (0 for number-less files such as CURRENT/LOCK/LOG). For WAL files,
+// *log_type (if non-null) reports alive vs. archived. Returns false for
+// names that match no known pattern.
+bool ParseFileName(const std::string& fname,
+                   uint64_t* number,
+                   FileType* type,
+                   WalFileType* log_type) {
+  Slice rest(fname);
+  if (fname.length() > 1 && fname[0] == '/') {
+    rest.remove_prefix(1);
+  }
+  if (rest == "IDENTITY") {
+    *number = 0;
+    *type = kIdentityFile;
+  } else if (rest == "CURRENT") {
+    *number = 0;
+    *type = kCurrentFile;
+  } else if (rest == "LOCK") {
+    *number = 0;
+    *type = kDBLockFile;
+  } else if (rest == "LOG" || rest == "LOG.old") {
+    *number = 0;
+    *type = kInfoLogFile;
+  } else if (rest.starts_with("LOG.old.")) {
+    uint64_t ts_suffix;
+    // sizeof also counts the trailing '\0'.
+    rest.remove_prefix(sizeof("LOG.old.") - 1);
+    if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
+      return false;
+    }
+    // NOTE(review): no rest.empty() check here, unlike the MANIFEST-/
+    // METADB- branches, so "LOG.old.123x" parses as a valid info log —
+    // confirm whether trailing garbage should be rejected.
+    *number = ts_suffix;
+    *type = kInfoLogFile;
+  } else if (rest.starts_with("MANIFEST-")) {
+    rest.remove_prefix(strlen("MANIFEST-"));
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    if (!rest.empty()) {
+      return false;
+    }
+    *type = kDescriptorFile;
+    *number = num;
+  } else if (rest.starts_with("METADB-")) {
+    rest.remove_prefix(strlen("METADB-"));
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    if (!rest.empty()) {
+      return false;
+    }
+    *type = kMetaDatabase;
+    *number = num;
+  } else {
+    // Avoid strtoull() to keep filename format independent of the
+    // current locale
+    bool archive_dir_found = false;
+    if (rest.starts_with(ARCHIVAL_DIR)) {
+      if (rest.size() <= ARCHIVAL_DIR.size()) {
+        return false;
+      }
+      rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also
+      if (log_type) {
+        *log_type = kArchivedLogFile;
+      }
+      archive_dir_found = true;
+    }
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    // Everything after the digits must be exactly one known extension.
+    Slice suffix = rest;
+    if (suffix == Slice(".log")) {
+      *type = kLogFile;
+      if (log_type && !archive_dir_found) {
+        *log_type = kAliveLogFile;
+      }
+    } else if (archive_dir_found) {
+      return false; // Archive dir can contain only log files
+    } else if (suffix == Slice(".sst")) {
+      *type = kTableFile;
+    } else if (suffix == Slice(".dbtmp")) {
+      *type = kTempFile;
+    } else {
+      return false;
+    }
+    *number = num;
+  }
+  return true;
+}
+
+// Atomically repoints CURRENT at MANIFEST-<descriptor_number>: the new
+// contents are written to a temp file first and then renamed over
+// CURRENT, so a crash never leaves a partially-written CURRENT file.
+Status SetCurrentFile(Env* env, const std::string& dbname,
+                      uint64_t descriptor_number) {
+  // Remove leading "dbname/" and add newline to manifest file name
+  std::string manifest = DescriptorFileName(dbname, descriptor_number);
+  Slice contents = manifest;
+  assert(contents.starts_with(dbname + "/"));
+  contents.remove_prefix(dbname.size() + 1);
+  std::string tmp = TempFileName(dbname, descriptor_number);
+  Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp, true);
+  if (s.ok()) {
+    s = env->RenameFile(tmp, CurrentFileName(dbname));
+  }
+  if (!s.ok()) {
+    // Best-effort cleanup of the temp file; the rename error is returned.
+    env->DeleteFile(tmp);
+  }
+  return s;
+}
+
+// Writes a freshly generated unique id into dbname/IDENTITY using the
+// same temp-file + rename pattern as SetCurrentFile for crash safety.
+Status SetIdentityFile(Env* env, const std::string& dbname) {
+  std::string id = env->GenerateUniqueId();
+  assert(!id.empty());
+  // Reserve the filename dbname/000000.dbtmp for the temporary identity file
+  std::string tmp = TempFileName(dbname, 0);
+  Status s = WriteStringToFile(env, id, tmp, true);
+  if (s.ok()) {
+    s = env->RenameFile(tmp, IdentityFileName(dbname));
+  }
+  if (!s.ok()) {
+    env->DeleteFile(tmp);
+  }
+  return s;
+}
+
+}  // namespace rocksdb
diff --git a/db/filename.h b/db/filename.h
new file mode 100644 (file)
index 0000000..8e55f11
--- /dev/null
@@ -0,0 +1,108 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// File names used by DB code
+
+#pragma once
+#include <stdint.h>
+#include <string>
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/transaction_log.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+class Env;
+
+enum FileType {
+  kLogFile,         // write-ahead log:  [0-9]+.log
+  kDBLockFile,      // LOCK
+  kTableFile,       // sstable:          [0-9]+.sst
+  kDescriptorFile,  // manifest:         MANIFEST-[0-9]+
+  kCurrentFile,     // CURRENT
+  kTempFile,        // [0-9]+.dbtmp
+  kInfoLogFile,     // Either the current one (LOG), or an old one (LOG.old.*)
+  kMetaDatabase,    // METADB-[0-9]+
+  kIdentityFile     // IDENTITY
+};
+
+// Return the name of the log file with the specified number
+// in the db named by "dbname".  The result will be prefixed with
+// "dbname".
+extern std::string LogFileName(const std::string& dbname, uint64_t number);
+
+// NOTE(review): a static const std::string at header scope gives every
+// translation unit its own dynamically-initialized copy; kept as-is.
+static const std::string ARCHIVAL_DIR = "archive";
+
+extern std::string ArchivalDirectory(const std::string& dbname);
+
+//  Return the name of the archived log file with the specified number
+//  in the db named by "dbname". The result will be prefixed with "dbname".
+extern std::string ArchivedLogFileName(const std::string& dbname,
+                                       uint64_t num);
+
+// Return the name of the sstable with the specified number
+// in the db named by "dbname".  The result will be prefixed with
+// "dbname".
+extern std::string TableFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the descriptor file for the db named by
+// "dbname" and the specified incarnation number.  The result will be
+// prefixed with "dbname".
+extern std::string DescriptorFileName(const std::string& dbname,
+                                      uint64_t number);
+
+// Return the name of the current file.  This file contains the name
+// of the current manifest file.  The result will be prefixed with
+// "dbname".
+extern std::string CurrentFileName(const std::string& dbname);
+
+// Return the name of the lock file for the db named by
+// "dbname".  The result will be prefixed with "dbname".
+extern std::string LockFileName(const std::string& dbname);
+
+// Return the name of a temporary file owned by the db named "dbname".
+// The result will be prefixed with "dbname".
+extern std::string TempFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the info log file for "dbname".
+extern std::string InfoLogFileName(const std::string& dbname,
+    const std::string& db_path="", const std::string& log_dir="");
+
+// Return the name of the old info log file for "dbname".
+extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+    const std::string& db_path="", const std::string& log_dir="");
+
+// Return the name to use for a metadatabase. The result will be prefixed with
+// "dbname".
+extern std::string MetaDatabaseName(const std::string& dbname,
+                                    uint64_t number);
+
+// Return the name of the Identity file which stores a unique number for the db
+// that will get regenerated if the db loses all its data and is recreated fresh
+// either from a backup-image or empty
+extern std::string IdentityFileName(const std::string& dbname);
+
+// If filename is a rocksdb file, store the type of the file in *type.
+// The number encoded in the filename is stored in *number.  If the
+// filename was successfully parsed, returns true.  Else return false.
+extern bool ParseFileName(const std::string& filename,
+                          uint64_t* number,
+                          FileType* type,
+                          WalFileType* log_type = nullptr);
+
+// Make the CURRENT file point to the descriptor file with the
+// specified number.
+extern Status SetCurrentFile(Env* env, const std::string& dbname,
+                             uint64_t descriptor_number);
+
+// Make the IDENTITY file for the db
+extern Status SetIdentityFile(Env* env, const std::string& dbname);
+
+}  // namespace rocksdb
diff --git a/db/filename_test.cc b/db/filename_test.cc
new file mode 100644 (file)
index 0000000..0baa7fd
--- /dev/null
@@ -0,0 +1,140 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/filename.h"
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class FileNameTest { };  // empty fixture tag required by the TEST macro
+
+// Exercises ParseFileName on both well-formed and malformed names.
+TEST(FileNameTest, Parse) {
+  Slice db;  // NOTE(review): unused in this test
+  FileType type;
+  uint64_t number;
+
+  // Successful parses
+  static struct {
+    const char* fname;
+    uint64_t number;
+    FileType type;
+  } cases[] = {
+    { "100.log",            100,   kLogFile },
+    { "0.log",              0,     kLogFile },
+    { "0.sst",              0,     kTableFile },
+    { "CURRENT",            0,     kCurrentFile },
+    { "LOCK",               0,     kDBLockFile },
+    { "MANIFEST-2",         2,     kDescriptorFile },
+    { "MANIFEST-7",         7,     kDescriptorFile },
+    { "METADB-2",           2,     kMetaDatabase },
+    { "METADB-7",           7,     kMetaDatabase },
+    { "LOG",                0,     kInfoLogFile },
+    { "LOG.old",            0,     kInfoLogFile },
+    { "18446744073709551615.log", 18446744073709551615ull, kLogFile },
+  };
+  for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+    std::string f = cases[i].fname;
+    ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
+    ASSERT_EQ(cases[i].type, type) << f;
+    ASSERT_EQ(cases[i].number, number) << f;
+  }
+
+  // Errors (note "" appears twice; harmless duplicate)
+  static const char* errors[] = {
+    "",
+    "foo",
+    "foo-dx-100.log",
+    ".log",
+    "",
+    "manifest",
+    "CURREN",
+    "CURRENTX",
+    "MANIFES",
+    "MANIFEST",
+    "MANIFEST-",
+    "XMANIFEST-3",
+    "MANIFEST-3x",
+    "META",
+    "METADB",
+    "METADB-",
+    "XMETADB-3",
+    "METADB-3x",
+    "LOC",
+    "LOCKx",
+    "LO",
+    "LOGx",
+    "18446744073709551616.log",
+    "184467440737095516150.log",
+    "100",
+    "100.",
+    "100.lop"
+  };
+  for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
+    std::string f = errors[i];
+    ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
+  };
+}
+
+// Round-trips each *FileName constructor through ParseFileName, after
+// stripping the 4-char "dir/" prefix.
+TEST(FileNameTest, Construction) {
+  uint64_t number;
+  FileType type;
+  std::string fname;
+
+  fname = CurrentFileName("foo");
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(0U, number);
+  ASSERT_EQ(kCurrentFile, type);
+
+  fname = LockFileName("foo");
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(0U, number);
+  ASSERT_EQ(kDBLockFile, type);
+
+  fname = LogFileName("foo", 192);
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(192U, number);
+  ASSERT_EQ(kLogFile, type);
+
+  fname = TableFileName("bar", 200);
+  ASSERT_EQ("bar/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(200U, number);
+  ASSERT_EQ(kTableFile, type);
+
+  fname = DescriptorFileName("bar", 100);
+  ASSERT_EQ("bar/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(100U, number);
+  ASSERT_EQ(kDescriptorFile, type);
+
+  fname = TempFileName("tmp", 999);
+  ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(999U, number);
+  ASSERT_EQ(kTempFile, type);
+
+  fname = MetaDatabaseName("met", 100);
+  ASSERT_EQ("met/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(100U, number);
+  ASSERT_EQ(kMetaDatabase, type);
+}
+
+}  // namespace rocksdb
+
+// Runs every TEST registered through the util/testharness.h macros.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/internal_stats.cc b/db/internal_stats.cc
new file mode 100644 (file)
index 0000000..e8b22a7
--- /dev/null
@@ -0,0 +1,369 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/internal_stats.h"
+#include "db/column_family.h"
+
+#include <vector>
+
+namespace rocksdb {
+
+// Maps a "rocksdb."-prefixed property string to its DBPropertyType;
+// anything unrecognized (including a missing prefix) yields kUnknown.
+DBPropertyType GetPropertyType(const Slice& property) {
+  Slice in = property;
+  Slice prefix("rocksdb.");
+  if (!in.starts_with(prefix)) return kUnknown;
+  in.remove_prefix(prefix.size());
+
+  // "num-files-at-level<N>" carries a level suffix, so prefix-match it;
+  // every other property is an exact match.
+  if (in.starts_with("num-files-at-level")) {
+    return kNumFilesAtLevel;
+  } else if (in == "levelstats") {
+    return kLevelStats;
+  } else if (in == "stats") {
+    return kStats;
+  } else if (in == "sstables") {
+    return kSsTables;
+  } else if (in == "num-immutable-mem-table") {
+    return kNumImmutableMemTable;
+  } else if (in == "mem-table-flush-pending") {
+    return kMemtableFlushPending;
+  } else if (in == "compaction-pending") {
+    return kCompactionPending;
+  } else if (in == "background-errors") {
+    return kBackgroundErrors;
+  } else if (in == "cur-size-active-mem-table") {
+    return kCurSizeActiveMemTable;
+  } else if (in == "num-entries-active-mem-table") {
+    return kNumEntriesInMutableMemtable;
+  } else if (in == "num-entries-imm-mem-tables") {
+    return kNumEntriesInImmutableMemtable;
+  }
+  return kUnknown;
+}
+
+// Renders the value of one DB property for column family "cfd" into
+// *value. Returns false for unknown properties or a malformed
+// num-files-at-level suffix. The kStats branch also updates last_stats_
+// so that the next call can report per-interval deltas.
+bool InternalStats::GetProperty(DBPropertyType property_type,
+                                const Slice& property, std::string* value,
+                                ColumnFamilyData* cfd) {
+  Version* current = cfd->current();
+  Slice in = property;
+
+  switch (property_type) {
+    case kNumFilesAtLevel: {
+      in.remove_prefix(strlen("rocksdb.num-files-at-level"));
+      uint64_t level;
+      bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
+      if (!ok || (int)level >= number_levels_) {
+        return false;
+      } else {
+        char buf[100];
+        snprintf(buf, sizeof(buf), "%d",
+                 current->NumLevelFiles(static_cast<int>(level)));
+        *value = buf;
+        return true;
+      }
+    }
+    case kLevelStats: {
+      char buf[1000];
+      snprintf(buf, sizeof(buf),
+               "Level Files Size(MB)\n"
+               "--------------------\n");
+      value->append(buf);
+
+      for (int level = 0; level < number_levels_; level++) {
+        snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level,
+                 current->NumLevelFiles(level),
+                 current->NumLevelBytes(level) / 1048576.0);
+        value->append(buf);
+      }
+      return true;
+    }
+    case kStats: {
+      char buf[1000];
+
+      uint64_t wal_bytes = 0;
+      uint64_t wal_synced = 0;
+      uint64_t user_bytes_written = 0;
+      uint64_t write_other = 0;
+      uint64_t write_self = 0;
+      uint64_t write_with_wal = 0;
+      uint64_t total_bytes_written = 0;
+      uint64_t total_bytes_read = 0;
+      uint64_t micros_up = env_->NowMicros() - started_at_;
+      // Add "+1" to make sure seconds_up is > 0 and avoid NaN later
+      double seconds_up = (micros_up + 1) / 1000000.0;
+      uint64_t total_slowdown = 0;
+      uint64_t total_slowdown_count = 0;
+      uint64_t interval_bytes_written = 0;
+      uint64_t interval_bytes_read = 0;
+      uint64_t interval_bytes_new = 0;
+      double interval_seconds_up = 0;
+
+      // Tickers are optional; without a statistics object the WAL and
+      // write counters below simply stay zero.
+      if (statistics_) {
+        wal_bytes = statistics_->getTickerCount(WAL_FILE_BYTES);
+        wal_synced = statistics_->getTickerCount(WAL_FILE_SYNCED);
+        user_bytes_written = statistics_->getTickerCount(BYTES_WRITTEN);
+        write_other = statistics_->getTickerCount(WRITE_DONE_BY_OTHER);
+        write_self = statistics_->getTickerCount(WRITE_DONE_BY_SELF);
+        write_with_wal = statistics_->getTickerCount(WRITE_WITH_WAL);
+      }
+
+      snprintf(
+          buf, sizeof(buf),
+          "                               Compactions\n"
+          "Level  Files Size(MB) Score Time(sec)  Read(MB) Write(MB)    Rn(MB) "
+          " "
+          "Rnp1(MB)  Wnew(MB) RW-Amplify Read(MB/s) Write(MB/s)      Rn     "
+          "Rnp1 "
+          "    Wnp1     NewW    Count   msComp   msStall  Ln-stall Stall-cnt\n"
+          "--------------------------------------------------------------------"
+          "--"
+          "--------------------------------------------------------------------"
+          "--"
+          "----------------------------------------------------------------\n");
+      value->append(buf);
+      for (int level = 0; level < number_levels_; level++) {
+        int files = current->NumLevelFiles(level);
+        if (compaction_stats_[level].micros > 0 || files > 0) {
+          int64_t bytes_read = compaction_stats_[level].bytes_readn +
+                               compaction_stats_[level].bytes_readnp1;
+          int64_t bytes_new = compaction_stats_[level].bytes_written -
+                              compaction_stats_[level].bytes_readnp1;
+          double amplify =
+              (compaction_stats_[level].bytes_readn == 0)
+                  ? 0.0
+                  : (compaction_stats_[level].bytes_written +
+                     compaction_stats_[level].bytes_readnp1 +
+                     compaction_stats_[level].bytes_readn) /
+                        (double)compaction_stats_[level].bytes_readn;
+
+          total_bytes_read += bytes_read;
+          total_bytes_written += compaction_stats_[level].bytes_written;
+
+          // Level-0 stalls are tracked under three dedicated counters;
+          // other levels use the per-level slowdown arrays.
+          uint64_t stalls = level == 0 ? (stall_counts_[LEVEL0_SLOWDOWN] +
+                                          stall_counts_[LEVEL0_NUM_FILES] +
+                                          stall_counts_[MEMTABLE_COMPACTION])
+                                       : stall_leveln_slowdown_count_[level];
+
+          double stall_us = level == 0 ? (stall_micros_[LEVEL0_SLOWDOWN] +
+                                          stall_micros_[LEVEL0_NUM_FILES] +
+                                          stall_micros_[MEMTABLE_COMPACTION])
+                                       : stall_leveln_slowdown_[level];
+
+          snprintf(buf, sizeof(buf),
+                   "%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f "
+                   "%10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %8d %9.1f %9.1f "
+                   "%9lu\n",
+                   level, files, current->NumLevelBytes(level) / 1048576.0,
+                   current->NumLevelBytes(level) /
+                       cfd->compaction_picker()->MaxBytesForLevel(level),
+                   compaction_stats_[level].micros / 1e6,
+                   bytes_read / 1048576.0,
+                   compaction_stats_[level].bytes_written / 1048576.0,
+                   compaction_stats_[level].bytes_readn / 1048576.0,
+                   compaction_stats_[level].bytes_readnp1 / 1048576.0,
+                   bytes_new / 1048576.0, amplify,
+                   // +1 to avoid division by 0
+                   (bytes_read / 1048576.0) /
+                       ((compaction_stats_[level].micros + 1) / 1000000.0),
+                   (compaction_stats_[level].bytes_written / 1048576.0) /
+                       ((compaction_stats_[level].micros + 1) / 1000000.0),
+                   compaction_stats_[level].files_in_leveln,
+                   compaction_stats_[level].files_in_levelnp1,
+                   compaction_stats_[level].files_out_levelnp1,
+                   compaction_stats_[level].files_out_levelnp1 -
+                       compaction_stats_[level].files_in_levelnp1,
+                   compaction_stats_[level].count,
+                   (int)((double)compaction_stats_[level].micros / 1000.0 /
+                         (compaction_stats_[level].count + 1)),
+                   (double)stall_us / 1000.0 / (stalls + 1),
+                   stall_us / 1000000.0, (unsigned long)stalls);
+          total_slowdown += stall_leveln_slowdown_[level];
+          total_slowdown_count += stall_leveln_slowdown_count_[level];
+          value->append(buf);
+        }
+      }
+
+      // Interval figures are deltas since the previous kStats call.
+      interval_bytes_new = user_bytes_written - last_stats_.ingest_bytes_;
+      interval_bytes_read =
+          total_bytes_read - last_stats_.compaction_bytes_read_;
+      interval_bytes_written =
+          total_bytes_written - last_stats_.compaction_bytes_written_;
+      interval_seconds_up = seconds_up - last_stats_.seconds_up_;
+
+      snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n",
+               seconds_up, interval_seconds_up);
+      value->append(buf);
+
+      snprintf(buf, sizeof(buf),
+               "Writes cumulative: %llu total, %llu batches, "
+               "%.1f per batch, %.2f ingest GB\n",
+               (unsigned long long)(write_other + write_self),
+               (unsigned long long)write_self,
+               (write_other + write_self) / (double)(write_self + 1),
+               user_bytes_written / (1048576.0 * 1024));
+      value->append(buf);
+
+      snprintf(buf, sizeof(buf),
+               "WAL cumulative: %llu WAL writes, %llu WAL syncs, "
+               "%.2f writes per sync, %.2f GB written\n",
+               (unsigned long long)write_with_wal,
+               (unsigned long long)wal_synced,
+               write_with_wal / (double)(wal_synced + 1),
+               wal_bytes / (1048576.0 * 1024));
+      value->append(buf);
+
+      snprintf(buf, sizeof(buf),
+               "Compaction IO cumulative (GB): "
+               "%.2f new, %.2f read, %.2f write, %.2f read+write\n",
+               user_bytes_written / (1048576.0 * 1024),
+               total_bytes_read / (1048576.0 * 1024),
+               total_bytes_written / (1048576.0 * 1024),
+               (total_bytes_read + total_bytes_written) / (1048576.0 * 1024));
+      value->append(buf);
+
+      snprintf(
+          buf, sizeof(buf),
+          "Compaction IO cumulative (MB/sec): "
+          "%.1f new, %.1f read, %.1f write, %.1f read+write\n",
+          user_bytes_written / 1048576.0 / seconds_up,
+          total_bytes_read / 1048576.0 / seconds_up,
+          total_bytes_written / 1048576.0 / seconds_up,
+          (total_bytes_read + total_bytes_written) / 1048576.0 / seconds_up);
+      value->append(buf);
+
+      // +1 to avoid divide by 0 and NaN
+      snprintf(
+          buf, sizeof(buf),
+          "Amplification cumulative: %.1f write, %.1f compaction\n",
+          (double)(total_bytes_written + wal_bytes) / (user_bytes_written + 1),
+          (double)(total_bytes_written + total_bytes_read + wal_bytes) /
+              (user_bytes_written + 1));
+      value->append(buf);
+
+      uint64_t interval_write_other = write_other - last_stats_.write_other_;
+      uint64_t interval_write_self = write_self - last_stats_.write_self_;
+
+      snprintf(buf, sizeof(buf),
+               "Writes interval: %llu total, %llu batches, "
+               "%.1f per batch, %.1f ingest MB\n",
+               (unsigned long long)(interval_write_other + interval_write_self),
+               (unsigned long long)interval_write_self,
+               (double)(interval_write_other + interval_write_self) /
+                   (interval_write_self + 1),
+               (user_bytes_written - last_stats_.ingest_bytes_) / 1048576.0);
+      value->append(buf);
+
+      uint64_t interval_write_with_wal =
+          write_with_wal - last_stats_.write_with_wal_;
+
+      uint64_t interval_wal_synced = wal_synced - last_stats_.wal_synced_;
+      uint64_t interval_wal_bytes = wal_bytes - last_stats_.wal_bytes_;
+
+      snprintf(buf, sizeof(buf),
+               "WAL interval: %llu WAL writes, %llu WAL syncs, "
+               "%.2f writes per sync, %.2f MB written\n",
+               (unsigned long long)interval_write_with_wal,
+               (unsigned long long)interval_wal_synced,
+               interval_write_with_wal / (double)(interval_wal_synced + 1),
+               interval_wal_bytes / (1048576.0 * 1024));
+      value->append(buf);
+
+      snprintf(buf, sizeof(buf),
+               "Compaction IO interval (MB): "
+               "%.2f new, %.2f read, %.2f write, %.2f read+write\n",
+               interval_bytes_new / 1048576.0, interval_bytes_read / 1048576.0,
+               interval_bytes_written / 1048576.0,
+               (interval_bytes_read + interval_bytes_written) / 1048576.0);
+      value->append(buf);
+
+      snprintf(buf, sizeof(buf),
+               "Compaction IO interval (MB/sec): "
+               "%.1f new, %.1f read, %.1f write, %.1f read+write\n",
+               interval_bytes_new / 1048576.0 / interval_seconds_up,
+               interval_bytes_read / 1048576.0 / interval_seconds_up,
+               interval_bytes_written / 1048576.0 / interval_seconds_up,
+               (interval_bytes_read + interval_bytes_written) / 1048576.0 /
+                   interval_seconds_up);
+      value->append(buf);
+
+      // +1 to avoid divide by 0 and NaN
+      snprintf(
+          buf, sizeof(buf),
+          "Amplification interval: %.1f write, %.1f compaction\n",
+          (double)(interval_bytes_written + wal_bytes) /
+              (interval_bytes_new + 1),
+          (double)(interval_bytes_written + interval_bytes_read + wal_bytes) /
+              (interval_bytes_new + 1));
+      value->append(buf);
+
+      snprintf(buf, sizeof(buf),
+               "Stalls(secs): %.3f level0_slowdown, %.3f level0_numfiles, "
+               "%.3f memtable_compaction, %.3f leveln_slowdown\n",
+               stall_micros_[LEVEL0_SLOWDOWN] / 1000000.0,
+               stall_micros_[LEVEL0_NUM_FILES] / 1000000.0,
+               stall_micros_[MEMTABLE_COMPACTION] / 1000000.0,
+               total_slowdown / 1000000.0);
+      value->append(buf);
+
+      snprintf(buf, sizeof(buf),
+               "Stalls(count): %lu level0_slowdown, %lu level0_numfiles, "
+               "%lu memtable_compaction, %lu leveln_slowdown\n",
+               (unsigned long)stall_counts_[LEVEL0_SLOWDOWN],
+               (unsigned long)stall_counts_[LEVEL0_NUM_FILES],
+               (unsigned long)stall_counts_[MEMTABLE_COMPACTION],
+               (unsigned long)total_slowdown_count);
+      value->append(buf);
+
+      // Snapshot cumulative totals as the baseline for the next interval.
+      last_stats_.compaction_bytes_read_ = total_bytes_read;
+      last_stats_.compaction_bytes_written_ = total_bytes_written;
+      last_stats_.ingest_bytes_ = user_bytes_written;
+      last_stats_.seconds_up_ = seconds_up;
+      last_stats_.wal_bytes_ = wal_bytes;
+      last_stats_.wal_synced_ = wal_synced;
+      last_stats_.write_with_wal_ = write_with_wal;
+      last_stats_.write_other_ = write_other;
+      last_stats_.write_self_ = write_self;
+
+      return true;
+    }
+    case kSsTables:
+      *value = current->DebugString();
+      return true;
+    case kNumImmutableMemTable:
+      *value = std::to_string(cfd->imm()->size());
+      return true;
+    case kMemtableFlushPending:
+      // Return number of mem tables that are ready to flush (made immutable)
+      *value = std::to_string(cfd->imm()->IsFlushPending() ? 1 : 0);
+      return true;
+    case kCompactionPending:
+      // 1 if the system already determines at least one compaction is needed.
+      // 0 otherwise.
+      *value = std::to_string(current->NeedsCompaction() ? 1 : 0);
+      return true;
+    case kBackgroundErrors:
+      // Accumulated number of errors in background flushes or compactions.
+      *value = std::to_string(GetBackgroundErrorCount());
+      return true;
+    case kCurSizeActiveMemTable:
+      // Current size of the active memtable
+      *value = std::to_string(cfd->mem()->ApproximateMemoryUsage());
+      return true;
+    case kNumEntriesInMutableMemtable:
+      // Number of entries in the active memtable
+      *value = std::to_string(cfd->mem()->GetNumEntries());
+      return true;
+    case kNumEntriesInImmutableMemtable:
+      // Total number of entries across all immutable memtables
+      *value = std::to_string(cfd->imm()->current()->GetTotalNumEntries());
+      return true;
+    default:
+      return false;
+  }
+}
+
+}  // namespace rocksdb
diff --git a/db/internal_stats.h b/db/internal_stats.h
new file mode 100644 (file)
index 0000000..2a74359
--- /dev/null
@@ -0,0 +1,187 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#pragma once
+#include "rocksdb/statistics.h"
+#include "util/statistics.h"
+#include "db/version_set.h"
+
+#include <vector>
+#include <string>
+
+class ColumnFamilyData;
+
+namespace rocksdb {
+
+class MemTableList;
+class DBImpl;
+
+// Internal property identifiers, decoded from the user-visible property
+// string by GetPropertyType() and dispatched in InternalStats::GetProperty().
+enum DBPropertyType {
+  kNumFilesAtLevel,  // Number of files at a specific level
+  kLevelStats,       // Return number of files and total sizes of each level
+  kStats,            // Return general statistics of DB
+  kSsTables,         // Return a human readable string of current SST files
+  kNumImmutableMemTable,   // Return number of immutable mem tables
+  kMemtableFlushPending,   // Return 1 if mem table flushing is pending,
+                           // otherwise 0.
+  kCompactionPending,      // Return 1 if a compaction is pending. Otherwise 0.
+  kBackgroundErrors,       // Return accumulated background errors encountered.
+  kCurSizeActiveMemTable,  // Return current size of the active memtable
+  kNumEntriesInMutableMemtable,    // Return number of entries in the mutable
+                                   // memtable.
+  kNumEntriesInImmutableMemtable,  // Return sum of number of entries in all
+                                   // the immutable mem tables.
+  kUnknown,  // Property string did not match any known property
+};
+
+extern DBPropertyType GetPropertyType(const Slice& property);
+
+class InternalStats {
+ public:
+  enum WriteStallType {
+    LEVEL0_SLOWDOWN,
+    MEMTABLE_COMPACTION,
+    LEVEL0_NUM_FILES,
+    WRITE_STALLS_ENUM_MAX,
+  };
+
+  InternalStats(int num_levels, Env* env, Statistics* statistics)
+      : compaction_stats_(num_levels),
+        stall_micros_(WRITE_STALLS_ENUM_MAX, 0),
+        stall_counts_(WRITE_STALLS_ENUM_MAX, 0),
+        stall_leveln_slowdown_(num_levels, 0),
+        stall_leveln_slowdown_count_(num_levels, 0),
+        bg_error_count_(0),
+        number_levels_(num_levels),
+        statistics_(statistics),
+        env_(env),
+        started_at_(env->NowMicros()) {}
+
+  // Per level compaction stats.  compaction_stats_[level] stores the stats for
+  // compactions that produced data for the specified "level".
+  struct CompactionStats {
+    uint64_t micros;
+
+    // Bytes read from level N during compaction between levels N and N+1
+    int64_t bytes_readn;
+
+    // Bytes read from level N+1 during compaction between levels N and N+1
+    int64_t bytes_readnp1;
+
+    // Total bytes written during compaction between levels N and N+1
+    int64_t bytes_written;
+
+    // Files read from level N during compaction between levels N and N+1
+    int files_in_leveln;
+
+    // Files read from level N+1 during compaction between levels N and N+1
+    int files_in_levelnp1;
+
+    // Files written during compaction between levels N and N+1
+    int files_out_levelnp1;
+
+    // Number of compactions done
+    int count;
+
+    CompactionStats()
+        : micros(0),
+          bytes_readn(0),
+          bytes_readnp1(0),
+          bytes_written(0),
+          files_in_leveln(0),
+          files_in_levelnp1(0),
+          files_out_levelnp1(0),
+          count(0) {}
+
+    void Add(const CompactionStats& c) {
+      this->micros += c.micros;
+      this->bytes_readn += c.bytes_readn;
+      this->bytes_readnp1 += c.bytes_readnp1;
+      this->bytes_written += c.bytes_written;
+      this->files_in_leveln += c.files_in_leveln;
+      this->files_in_levelnp1 += c.files_in_levelnp1;
+      this->files_out_levelnp1 += c.files_out_levelnp1;
+      this->count += 1;
+    }
+  };
+
+  void AddCompactionStats(int level, const CompactionStats& stats) {
+    compaction_stats_[level].Add(stats);
+  }
+
+  void RecordWriteStall(WriteStallType write_stall_type, uint64_t micros) {
+    stall_micros_[write_stall_type] += micros;
+    stall_counts_[write_stall_type]++;
+  }
+
+  void RecordLevelNSlowdown(int level, uint64_t micros) {
+    stall_leveln_slowdown_[level] += micros;
+    stall_leveln_slowdown_count_[level] += micros;
+  }
+
+  uint64_t GetBackgroundErrorCount() const { return bg_error_count_; }
+
+  uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
+
+  bool GetProperty(DBPropertyType property_type, const Slice& property,
+                   std::string* value, ColumnFamilyData* cfd);
+
+ private:
+  std::vector<CompactionStats> compaction_stats_;
+
+  // Used to compute per-interval statistics
+  struct StatsSnapshot {
+    uint64_t compaction_bytes_read_;     // Bytes read by compaction
+    uint64_t compaction_bytes_written_;  // Bytes written by compaction
+    uint64_t ingest_bytes_;              // Bytes written by user
+    uint64_t wal_bytes_;                 // Bytes written to WAL
+    uint64_t wal_synced_;                // Number of times WAL is synced
+    uint64_t write_with_wal_;            // Number of writes that request WAL
+    // These count the number of writes processed by the calling thread or
+    // another thread.
+    uint64_t write_other_;
+    uint64_t write_self_;
+    double seconds_up_;
+
+    StatsSnapshot()
+        : compaction_bytes_read_(0),
+          compaction_bytes_written_(0),
+          ingest_bytes_(0),
+          wal_bytes_(0),
+          wal_synced_(0),
+          write_with_wal_(0),
+          write_other_(0),
+          write_self_(0),
+          seconds_up_(0) {}
+  };
+
+  // Counters from the previous time per-interval stats were computed
+  StatsSnapshot last_stats_;
+
+  // These count the number of microseconds for which MakeRoomForWrite stalls.
+  std::vector<uint64_t> stall_micros_;
+  std::vector<uint64_t> stall_counts_;
+  std::vector<uint64_t> stall_leveln_slowdown_;
+  std::vector<uint64_t> stall_leveln_slowdown_count_;
+
+  // Total number of background errors encountered. Every time a flush task
+  // or compaction task fails, this counter is incremented. The failure can
+  // be caused by any possible reason, including file system errors, out of
+  // resources, or input file corruption. Failing when retrying the same flush
+  // or compaction will cause the counter to increase too.
+  uint64_t bg_error_count_;
+
+  int number_levels_;
+  Statistics* statistics_;
+  Env* env_;
+  uint64_t started_at_;
+};
+
+}  // namespace rocksdb
diff --git a/db/log_format.h b/db/log_format.h
new file mode 100644 (file)
index 0000000..919c087
--- /dev/null
@@ -0,0 +1,35 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#pragma once
+namespace rocksdb {
+namespace log {
+
+enum RecordType {
+  // Zero is reserved for preallocated files
+  kZeroType = 0,
+  // The record fits entirely within one fragment
+  kFullType = 1,
+
+  // For fragments: first / middle / last piece of a record that was split
+  // across block boundaries
+  kFirstType = 2,
+  kMiddleType = 3,
+  kLastType = 4
+};
+// Largest valid on-disk record type; the reader extends the range past this
+// value with internal sentinels (kEof, kBadRecord in log_reader.h).
+static const int kMaxRecordType = kLastType;
+
+// Logs are written in fixed-size blocks; records that do not fit in the
+// remainder of a block are split into fragments.
+static const unsigned int kBlockSize = 32768;
+
+// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
+static const int kHeaderSize = 4 + 1 + 2;
+
+}  // namespace log
+}  // namespace rocksdb
diff --git a/db/log_reader.cc b/db/log_reader.cc
new file mode 100644 (file)
index 0000000..be1fb8c
--- /dev/null
@@ -0,0 +1,339 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+
+#include <stdio.h>
+#include "rocksdb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace rocksdb {
+namespace log {
+
+// Out-of-line definition of the virtual destructor for the Reporter
+// interface.
+Reader::Reporter::~Reporter() {
+}
+
+// Takes ownership of "file".  Allocates one block (kBlockSize bytes) of
+// scratch space, which backs buffer_ for the lifetime of the reader.
+Reader::Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
+               bool checksum, uint64_t initial_offset)
+    : file_(std::move(file)),
+      reporter_(reporter),
+      checksum_(checksum),
+      backing_store_(new char[kBlockSize]),
+      buffer_(),
+      eof_(false),
+      read_error_(false),
+      eof_offset_(0),
+      last_record_offset_(0),
+      end_of_buffer_offset_(0),
+      initial_offset_(initial_offset) {
+}
+
+Reader::~Reader() {
+  delete[] backing_store_;
+}
+
+// Positions the file at the start of the first block that can contain a
+// record beginning at or after initial_offset_.  Returns false (after
+// reporting the drop) if the underlying Skip() fails.
+bool Reader::SkipToInitialBlock() {
+  size_t offset_in_block = initial_offset_ % kBlockSize;
+  uint64_t block_start_location = initial_offset_ - offset_in_block;
+
+  // Don't search a block if we'd be in the trailer
+  // NOTE(review): treats the last 6 bytes of a block as trailer although
+  // kHeaderSize is 7 -- presumably matches the writer's block-switch rule;
+  // confirm against log_writer.cc.
+  if (offset_in_block > kBlockSize - 6) {
+    offset_in_block = 0;
+    block_start_location += kBlockSize;
+  }
+
+  end_of_buffer_offset_ = block_start_location;
+
+  // Skip to start of first block that can contain the initial record
+  if (block_start_location > 0) {
+    Status skip_status = file_->Skip(block_start_location);
+    if (!skip_status.ok()) {
+      ReportDrop(block_start_location, skip_status);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Reassembles one logical record from one or more physical records
+// (fragments).  Returns true and points *record at the data on success;
+// *scratch is used as backing storage when the record spanned fragments.
+// Returns false at end of input.
+bool Reader::ReadRecord(Slice* record, std::string* scratch) {
+  // Lazily seek past blocks that precede initial_offset_ on the first call.
+  if (last_record_offset_ < initial_offset_) {
+    if (!SkipToInitialBlock()) {
+      return false;
+    }
+  }
+
+  scratch->clear();
+  record->clear();
+  bool in_fragmented_record = false;
+  // Record offset of the logical record that we're reading
+  // 0 is a dummy value to make compilers happy
+  uint64_t prospective_record_offset = 0;
+
+  Slice fragment;
+  while (true) {
+    // Offset of the physical record about to be returned by
+    // ReadPhysicalRecord (buffer_ holds the not-yet-consumed tail of the
+    // current block).
+    uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+    const unsigned int record_type = ReadPhysicalRecord(&fragment);
+    switch (record_type) {
+      case kFullType:
+        if (in_fragmented_record) {
+          // Handle bug in earlier versions of log::Writer where
+          // it could emit an empty kFirstType record at the tail end
+          // of a block followed by a kFullType or kFirstType record
+          // at the beginning of the next block.
+          if (scratch->empty()) {
+            in_fragmented_record = false;
+          } else {
+            ReportCorruption(scratch->size(), "partial record without end(1)");
+          }
+        }
+        prospective_record_offset = physical_record_offset;
+        scratch->clear();
+        *record = fragment;
+        last_record_offset_ = prospective_record_offset;
+        return true;
+
+      case kFirstType:
+        if (in_fragmented_record) {
+          // Handle bug in earlier versions of log::Writer where
+          // it could emit an empty kFirstType record at the tail end
+          // of a block followed by a kFullType or kFirstType record
+          // at the beginning of the next block.
+          if (scratch->empty()) {
+            in_fragmented_record = false;
+          } else {
+            ReportCorruption(scratch->size(), "partial record without end(2)");
+          }
+        }
+        prospective_record_offset = physical_record_offset;
+        scratch->assign(fragment.data(), fragment.size());
+        in_fragmented_record = true;
+        break;
+
+      case kMiddleType:
+        if (!in_fragmented_record) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(1)");
+        } else {
+          scratch->append(fragment.data(), fragment.size());
+        }
+        break;
+
+      case kLastType:
+        if (!in_fragmented_record) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(2)");
+        } else {
+          scratch->append(fragment.data(), fragment.size());
+          *record = Slice(*scratch);
+          last_record_offset_ = prospective_record_offset;
+          return true;
+        }
+        break;
+
+      case kEof:
+        if (in_fragmented_record) {
+          // This can be caused by the writer dying immediately after
+          //  writing a physical record but before completing the next; don't
+          //  treat it as a corruption, just ignore the entire logical record.
+          scratch->clear();
+        }
+        return false;
+
+      case kBadRecord:
+        if (in_fragmented_record) {
+          ReportCorruption(scratch->size(), "error in middle of record");
+          in_fragmented_record = false;
+          scratch->clear();
+        }
+        break;
+
+      default: {
+        char buf[40];
+        snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
+        ReportCorruption(
+            (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
+            buf);
+        in_fragmented_record = false;
+        scratch->clear();
+        break;
+      }
+    }
+  }
+  // Unreachable: the loop above only exits via return.
+  return false;
+}
+
+// Physical offset of the start of the last record returned by ReadRecord().
+uint64_t Reader::LastRecordOffset() {
+  return last_record_offset_;
+}
+
+// Re-arms the reader after it hit EOF, for callers that tail a file to
+// which more data may have been appended.  No-op after a read error.
+void Reader::UnmarkEOF() {
+  if (read_error_) {
+    return;
+  }
+
+  eof_ = false;
+
+  // EOF coincided with a block boundary; nothing to realign.
+  if (eof_offset_ == 0) {
+    return;
+  }
+
+  // If the EOF was in the middle of a block (a partial block was read) we have
+  // to read the rest of the block as ReadPhysicalRecord can only read full
+  // blocks and expects the file position indicator to be aligned to the start
+  // of a block.
+  //
+  //      consumed_bytes + buffer_size() + remaining == kBlockSize
+
+  size_t consumed_bytes = eof_offset_ - buffer_.size();
+  size_t remaining = kBlockSize - eof_offset_;
+
+  // backing_store_ is used to concatenate what is left in buffer_ and
+  // the remainder of the block. If buffer_ already uses backing_store_,
+  // we just append the new data.
+  if (buffer_.data() != backing_store_ + consumed_bytes) {
+    // Buffer_ does not use backing_store_ for storage.
+    // Copy what is left in buffer_ to backing_store.
+    memmove(backing_store_ + consumed_bytes, buffer_.data(), buffer_.size());
+  }
+
+  Slice read_buffer;
+  Status status = file_->Read(remaining, &read_buffer,
+    backing_store_ + eof_offset_);
+
+  size_t added = read_buffer.size();
+  end_of_buffer_offset_ += added;
+
+  if (!status.ok()) {
+    if (added > 0) {
+      ReportDrop(added, status);
+    }
+
+    read_error_ = true;
+    return;
+  }
+
+  if (read_buffer.data() != backing_store_ + eof_offset_) {
+    // Read did not write to backing_store_
+    memmove(backing_store_ + eof_offset_, read_buffer.data(),
+      read_buffer.size());
+  }
+
+  buffer_ = Slice(backing_store_ + consumed_bytes,
+    eof_offset_ + added - consumed_bytes);
+
+  if (added < remaining) {
+    // Still short of a full block: stay at EOF and remember the new
+    // position within the block.
+    eof_ = true;
+    eof_offset_ += added;
+  } else {
+    // Block completed; subsequent reads are block-aligned again.
+    eof_offset_ = 0;
+  }
+}
+
+// Convenience wrapper: report "bytes" dropped due to corruption.
+void Reader::ReportCorruption(size_t bytes, const char* reason) {
+  ReportDrop(bytes, Status::Corruption(reason));
+}
+
+// Notifies the reporter of "bytes" dropped for "reason", but only for data
+// at or past initial_offset_ (drops before the requested start offset are
+// expected and not reported).
+void Reader::ReportDrop(size_t bytes, const Status& reason) {
+  if (reporter_ != nullptr &&
+      end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
+    reporter_->Corruption(bytes, reason);
+  }
+}
+
+// Reads and validates the next physical record from buffer_, refilling
+// buffer_ one block at a time from the file as needed.  Returns the
+// on-disk record type, or one of the internal sentinels kEof / kBadRecord.
+unsigned int Reader::ReadPhysicalRecord(Slice* result) {
+  while (true) {
+    if (buffer_.size() < (size_t)kHeaderSize) {
+      if (!eof_ && !read_error_) {
+        // Last read was a full read, so this is a trailer to skip
+        buffer_.clear();
+        Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
+        end_of_buffer_offset_ += buffer_.size();
+        if (!status.ok()) {
+          buffer_.clear();
+          ReportDrop(kBlockSize, status);
+          read_error_ = true;
+          return kEof;
+        } else if (buffer_.size() < (size_t)kBlockSize) {
+          // Short read: remember we are at EOF and where within the block.
+          eof_ = true;
+          eof_offset_ = buffer_.size();
+        }
+        continue;
+      } else {
+        // Note that if buffer_ is non-empty, we have a truncated header at the
+        //  end of the file, which can be caused by the writer crashing in the
+        //  middle of writing the header. Instead of considering this an error,
+        //  just report EOF.
+        buffer_.clear();
+        return kEof;
+      }
+    }
+
+    // Parse the header: bytes 0-3 = masked crc, bytes 4-5 = little-endian
+    // payload length, byte 6 = record type (see log_format.h).
+    const char* header = buffer_.data();
+    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+    const unsigned int type = header[6];
+    const uint32_t length = a | (b << 8);
+    if (kHeaderSize + length > buffer_.size()) {
+      size_t drop_size = buffer_.size();
+      buffer_.clear();
+      if (!eof_) {
+        ReportCorruption(drop_size, "bad record length");
+        return kBadRecord;
+      }
+      // If the end of the file has been reached without reading |length| bytes
+      // of payload, assume the writer died in the middle of writing the record.
+      // Don't report a corruption.
+      return kEof;
+    }
+
+    if (type == kZeroType && length == 0) {
+      // Skip zero length record without reporting any drops since
+      // such records are produced by the mmap based writing code in
+      // env_posix.cc that preallocates file regions.
+      // NOTE: this should never happen in DB written by new RocksDB versions,
+      // since we turn off mmap writes to manifest and log files
+      buffer_.clear();
+      return kBadRecord;
+    }
+
+    // Check crc.  The stored crc covers the type byte plus the payload
+    // (header + 6 onward, 1 + length bytes).
+    if (checksum_) {
+      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+      uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
+      if (actual_crc != expected_crc) {
+        // Drop the rest of the buffer since "length" itself may have
+        // been corrupted and if we trust it, we could find some
+        // fragment of a real log record that just happens to look
+        // like a valid log record.
+        size_t drop_size = buffer_.size();
+        buffer_.clear();
+        ReportCorruption(drop_size, "checksum mismatch");
+        return kBadRecord;
+      }
+    }
+
+    buffer_.remove_prefix(kHeaderSize + length);
+
+    // Skip physical record that started before initial_offset_
+    if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
+        initial_offset_) {
+      result->clear();
+      return kBadRecord;
+    }
+
+    *result = Slice(header + kHeaderSize, length);
+    return type;
+  }
+}
+
+}  // namespace log
+}  // namespace rocksdb
diff --git a/db/log_reader.h b/db/log_reader.h
new file mode 100644 (file)
index 0000000..81d334d
--- /dev/null
@@ -0,0 +1,130 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+
+#include "db/log_format.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class SequentialFile;
+using std::unique_ptr;
+
+namespace log {
+
+// Sequentially decodes log records written by log::Writer from a
+// SequentialFile, verifying checksums and reporting dropped bytes.
+class Reader {
+ public:
+  // Interface for reporting errors.
+  class Reporter {
+   public:
+    virtual ~Reporter();
+
+    // Some corruption was detected.  "size" is the approximate number
+    // of bytes dropped due to the corruption.
+    virtual void Corruption(size_t bytes, const Status& status) = 0;
+  };
+
+  // Create a reader that will return log records from "*file".
+  // "*file" must remain live while this Reader is in use.
+  //
+  // If "reporter" is non-nullptr, it is notified whenever some data is
+  // dropped due to a detected corruption.  "*reporter" must remain
+  // live while this Reader is in use.
+  //
+  // If "checksum" is true, verify checksums if available.
+  //
+  // The Reader will start reading at the first record located at physical
+  // position >= initial_offset within the file.
+  Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
+         bool checksum, uint64_t initial_offset);
+
+  ~Reader();
+
+  // Read the next record into *record.  Returns true if read
+  // successfully, false if we hit end of the input.  May use
+  // "*scratch" as temporary storage.  The contents filled in *record
+  // will only be valid until the next mutating operation on this
+  // reader or the next mutation to *scratch.
+  bool ReadRecord(Slice* record, std::string* scratch);
+
+  // Returns the physical offset of the last record returned by ReadRecord.
+  //
+  // Undefined before the first call to ReadRecord.
+  uint64_t LastRecordOffset();
+
+  // returns true if the reader has encountered an eof condition.
+  bool IsEOF() {
+    return eof_;
+  }
+
+  // when we know more data has been written to the file. we can use this
+  // function to force the reader to look again in the file.
+  // Also aligns the file position indicator to the start of the next block
+  // by reading the rest of the data from the EOF position to the end of the
+  // block that was partially read.
+  void UnmarkEOF();
+
+  // Non-owning access to the underlying file.
+  SequentialFile* file() { return file_.get(); }
+
+ private:
+  const unique_ptr<SequentialFile> file_;
+  Reporter* const reporter_;
+  bool const checksum_;
+  // Heap block of kBlockSize bytes that backs buffer_.
+  char* const backing_store_;
+  // Unconsumed tail of the most recently read block (points into
+  // backing_store_).
+  Slice buffer_;
+  bool eof_;   // Last Read() indicated EOF by returning < kBlockSize
+  bool read_error_;   // Error occurred while reading from file
+
+  // Offset of the file position indicator within the last block when an
+  // EOF was detected.
+  size_t eof_offset_;
+
+  // Offset of the last record returned by ReadRecord.
+  uint64_t last_record_offset_;
+  // Offset of the first location past the end of buffer_.
+  uint64_t end_of_buffer_offset_;
+
+  // Offset at which to start looking for the first record to return
+  uint64_t const initial_offset_;
+
+  // Extend record types with the following special values
+  enum {
+    kEof = kMaxRecordType + 1,
+    // Returned whenever we find an invalid physical record.
+    // Currently there are three situations in which this happens:
+    // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
+    // * The record is a 0-length record (No drop is reported)
+    // * The record is below constructor's initial_offset (No drop is reported)
+    kBadRecord = kMaxRecordType + 2
+  };
+
+  // Skips all blocks that are completely before "initial_offset_".
+  //
+  // Returns true on success. Handles reporting.
+  bool SkipToInitialBlock();
+
+  // Return type, or one of the preceding special values
+  unsigned int ReadPhysicalRecord(Slice* result);
+
+  // Reports dropped bytes to the reporter.
+  // buffer_ must be updated to remove the dropped bytes prior to invocation.
+  void ReportCorruption(size_t bytes, const char* reason);
+  void ReportDrop(size_t bytes, const Status& reason);
+
+  // No copying allowed
+  Reader(const Reader&);
+  void operator=(const Reader&);
+};
+
+}  // namespace log
+}  // namespace rocksdb
diff --git a/db/log_test.cc b/db/log_test.cc
new file mode 100644 (file)
index 0000000..6577a6a
--- /dev/null
@@ -0,0 +1,689 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "rocksdb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+namespace log {
+
+// Construct a string of the specified length made out of the supplied
+// partial string.
+static std::string BigString(const std::string& partial_string, size_t n) {
+  std::string result;
+  while (result.size() < n) {
+    result.append(partial_string);
+  }
+  result.resize(n);  // trim the final repeat so the result is exactly n bytes
+  return result;
+}
+
+// Construct a string from a number, e.g. NumberString(7) == "7."
+static std::string NumberString(int n) {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "%d.", n);
+  return std::string(buf);
+}
+
+// Return a skewed potentially long string; the length comes from
+// rnd->Skewed(17) (presumably biased toward small sizes -- see
+// util/random.h).
+static std::string RandomSkewedString(int i, Random* rnd) {
+  return BigString(NumberString(i), rnd->Skewed(17));
+}
+
+class LogTest {
+ private:
+  class StringDest : public WritableFile {
+   public:
+    std::string contents_;
+
+    explicit StringDest(Slice& reader_contents) :
+      WritableFile(),
+      contents_(""),
+      reader_contents_(reader_contents),
+      last_flush_(0) {
+      reader_contents_ = Slice(contents_.data(), 0);
+    };
+
+    virtual Status Close() { return Status::OK(); }
+    virtual Status Flush() {
+      ASSERT_TRUE(reader_contents_.size() <= last_flush_);
+      size_t offset = last_flush_ - reader_contents_.size();
+      reader_contents_ = Slice(
+          contents_.data() + offset,
+          contents_.size() - offset);
+      last_flush_ = contents_.size();
+
+      return Status::OK();
+    }
+    virtual Status Sync() { return Status::OK(); }
+    virtual Status Append(const Slice& slice) {
+      contents_.append(slice.data(), slice.size());
+      return Status::OK();
+    }
+    void Drop(size_t bytes) {
+      contents_.resize(contents_.size() - bytes);
+      reader_contents_ = Slice(
+          reader_contents_.data(), reader_contents_.size() - bytes);
+      last_flush_ = contents_.size();
+    }
+
+   private:
+    Slice& reader_contents_;
+    size_t last_flush_;
+  };
+
+  class StringSource : public SequentialFile {
+   public:
+    Slice& contents_;
+    bool force_error_;
+    size_t force_error_position_;
+    bool force_eof_;
+    size_t force_eof_position_;
+    bool returned_partial_;
+    explicit StringSource(Slice& contents) :
+      contents_(contents),
+      force_error_(false),
+      force_error_position_(0),
+      force_eof_(false),
+      force_eof_position_(0),
+      returned_partial_(false) { }
+
+    virtual Status Read(size_t n, Slice* result, char* scratch) {
+      ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+
+      if (force_error_) {
+        if (force_error_position_ >= n) {
+          force_error_position_ -= n;
+        } else {
+          *result = Slice(contents_.data(), force_error_position_);
+          contents_.remove_prefix(force_error_position_);
+          force_error_ = false;
+          returned_partial_ = true;
+          return Status::Corruption("read error");
+        }
+      }
+
+      if (contents_.size() < n) {
+        n = contents_.size();
+        returned_partial_ = true;
+      }
+
+      if (force_eof_) {
+        if (force_eof_position_ >= n) {
+          force_eof_position_ -= n;
+        } else {
+          force_eof_ = false;
+          n = force_eof_position_;
+          returned_partial_ = true;
+        }
+      }
+
+      // By using scratch we ensure that caller has control over the
+      // lifetime of result.data()
+      memcpy(scratch, contents_.data(), n);
+      *result = Slice(scratch, n);
+
+      contents_.remove_prefix(n);
+      return Status::OK();
+    }
+
+    virtual Status Skip(uint64_t n) {
+      if (n > contents_.size()) {
+        contents_.clear();
+        return Status::NotFound("in-memory file skipepd past end");
+      }
+
+      contents_.remove_prefix(n);
+
+      return Status::OK();
+    }
+  };
+
+  class ReportCollector : public Reader::Reporter {
+   public:
+    size_t dropped_bytes_;
+    std::string message_;
+
+    ReportCollector() : dropped_bytes_(0) { }
+    virtual void Corruption(size_t bytes, const Status& status) {
+      dropped_bytes_ += bytes;
+      message_.append(status.ToString());
+    }
+  };
+
+  std::string& dest_contents() {
+    auto dest = dynamic_cast<StringDest*>(writer_.file());
+    assert(dest);
+    return dest->contents_;
+  }
+
+  const std::string& dest_contents() const {
+    auto dest = dynamic_cast<const StringDest*>(writer_.file());
+    assert(dest);
+    return dest->contents_;
+  }
+
+  void reset_source_contents() {
+    auto src = dynamic_cast<StringSource*>(reader_.file());
+    assert(src);
+    src->contents_ = dest_contents();
+  }
+
+  Slice reader_contents_;
+  unique_ptr<StringDest> dest_holder_;
+  unique_ptr<StringSource> source_holder_;
+  ReportCollector report_;
+  Writer writer_;
+  Reader reader_;
+
+  // Record metadata for testing initial offset functionality
+  static size_t initial_offset_record_sizes_[];
+  static uint64_t initial_offset_last_record_offsets_[];
+
+ public:
+  LogTest() : reader_contents_(),
+              dest_holder_(new StringDest(reader_contents_)),
+              source_holder_(new StringSource(reader_contents_)),
+              writer_(std::move(dest_holder_)),
+              reader_(std::move(source_holder_), &report_, true/*checksum*/,
+                      0/*initial_offset*/) {
+  }
+
+  void Write(const std::string& msg) {
+    writer_.AddRecord(Slice(msg));
+  }
+
+  size_t WrittenBytes() const {
+    return dest_contents().size();
+  }
+
+  std::string Read() {
+    std::string scratch;
+    Slice record;
+    if (reader_.ReadRecord(&record, &scratch)) {
+      return record.ToString();
+    } else {
+      return "EOF";
+    }
+  }
+
+  void IncrementByte(int offset, int delta) {
+    dest_contents()[offset] += delta;
+  }
+
+  void SetByte(int offset, char new_byte) {
+    dest_contents()[offset] = new_byte;
+  }
+
+  void ShrinkSize(int bytes) {
+    auto dest = dynamic_cast<StringDest*>(writer_.file());
+    assert(dest);
+    dest->Drop(bytes);
+  }
+
+  void FixChecksum(int header_offset, int len) {
+    // Compute crc of type/len/data
+    uint32_t crc = crc32c::Value(&dest_contents()[header_offset+6], 1 + len);
+    crc = crc32c::Mask(crc);
+    EncodeFixed32(&dest_contents()[header_offset], crc);
+  }
+
+  void ForceError(size_t position = 0) {
+    auto src = dynamic_cast<StringSource*>(reader_.file());
+    src->force_error_ = true;
+    src->force_error_position_ = position;
+  }
+
+  size_t DroppedBytes() const {
+    return report_.dropped_bytes_;
+  }
+
+  std::string ReportMessage() const {
+    return report_.message_;
+  }
+
+  void ForceEOF(size_t position = 0) {
+    auto src = dynamic_cast<StringSource*>(reader_.file());
+    src->force_eof_ = true;
+    src->force_eof_position_ = position;
+  }
+
+  void UnmarkEOF() {
+    auto src = dynamic_cast<StringSource*>(reader_.file());
+    src->returned_partial_ = false;
+    reader_.UnmarkEOF();
+  }
+
+  bool IsEOF() {
+    return reader_.IsEOF();
+  }
+
+  // Returns "OK" iff the recorded error message contains "msg";
+  // otherwise returns the recorded message itself (for test diagnostics).
+  std::string MatchError(const std::string& msg) const {
+    if (report_.message_.find(msg) == std::string::npos) {
+      return report_.message_;
+    } else {
+      return "OK";
+    }
+  }
+
+  // Writes four records whose sizes come from initial_offset_record_sizes_,
+  // filled with 'a', 'b', 'c' and 'd' respectively, for the initial-offset
+  // tests below.
+  void WriteInitialOffsetLog() {
+    for (int i = 0; i < 4; i++) {
+      std::string record(initial_offset_record_sizes_[i],
+                         static_cast<char>('a' + i));
+      Write(record);
+    }
+  }
+
+  // Verifies that a reader whose initial offset is "offset_past_end" bytes
+  // past the end of the log returns no records at all.
+  void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
+    WriteInitialOffsetLog();
+    unique_ptr<StringSource> source(new StringSource(reader_contents_));
+    unique_ptr<Reader> offset_reader(
+      new Reader(std::move(source), &report_, true/*checksum*/,
+                 WrittenBytes() + offset_past_end));
+    Slice record;
+    std::string scratch;
+    ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch));
+  }
+
+  // Verifies that a reader starting at "initial_offset" returns the record
+  // with index "expected_record_offset" as its first record, checking its
+  // size, its reported LastRecordOffset() and its fill character.
+  void CheckInitialOffsetRecord(uint64_t initial_offset,
+                                int expected_record_offset) {
+    WriteInitialOffsetLog();
+    unique_ptr<StringSource> source(new StringSource(reader_contents_));
+    unique_ptr<Reader> offset_reader(
+      new Reader(std::move(source), &report_, true/*checksum*/,
+                 initial_offset));
+    Slice record;
+    std::string scratch;
+    ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
+    ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
+              record.size());
+    ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
+              offset_reader->LastRecordOffset());
+    ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
+  }
+
+};
+
+// Sizes of the four records written by WriteInitialOffsetLog().
+size_t LogTest::initial_offset_record_sizes_[] =
+    {10000,  // Two sizable records in first block
+     10000,
+     2 * log::kBlockSize - 1000,  // Span three blocks
+     1};
+
+// Byte offset at which each of the records above starts in the log
+// (each prior record contributes kHeaderSize per physical fragment).
+uint64_t LogTest::initial_offset_last_record_offsets_[] =
+    {0,
+     kHeaderSize + 10000,
+     2 * (kHeaderSize + 10000),
+     2 * (kHeaderSize + 10000) +
+         (2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
+
+
+// Basic round-trip tests: records written through Writer must come back
+// from Reader unchanged, followed by "EOF".
+TEST(LogTest, Empty) {
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, ReadWrite) {
+  Write("foo");
+  Write("bar");
+  Write("");
+  Write("xxxx");
+  ASSERT_EQ("foo", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("xxxx", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ("EOF", Read());  // Make sure reads at eof work
+}
+
+// Many small records forcing the writer across many block boundaries.
+TEST(LogTest, ManyBlocks) {
+  for (int i = 0; i < 100000; i++) {
+    Write(NumberString(i));
+  }
+  for (int i = 0; i < 100000; i++) {
+    ASSERT_EQ(NumberString(i), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+// Records larger than a block must be fragmented and reassembled.
+TEST(LogTest, Fragmentation) {
+  Write("small");
+  Write(BigString("medium", 50000));
+  Write(BigString("large", 100000));
+  ASSERT_EQ("small", Read());
+  ASSERT_EQ(BigString("medium", 50000), Read());
+  ASSERT_EQ(BigString("large", 100000), Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer) {
+  // Make a trailer that is exactly the same length as an empty record.
+  const int n = kBlockSize - 2*kHeaderSize;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
+  Write("");
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer2) {
+  // Make a trailer that is exactly the same length as an empty record.
+  const int n = kBlockSize - 2*kHeaderSize;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(0U, DroppedBytes());
+  ASSERT_EQ("", ReportMessage());
+}
+
+TEST(LogTest, ShortTrailer) {
+  const int n = kBlockSize - 2*kHeaderSize + 4;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
+  Write("");
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, AlignedEof) {
+  const int n = kBlockSize - 2*kHeaderSize + 4;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+// Same seed for writer and reader: the generated strings must match.
+TEST(LogTest, RandomRead) {
+  const int N = 500;
+  Random write_rnd(301);
+  for (int i = 0; i < N; i++) {
+    Write(RandomSkewedString(i, &write_rnd));
+  }
+  Random read_rnd(301);
+  for (int i = 0; i < N; i++) {
+    ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+// Tests of all the error paths in log_reader.cc follow: each corrupts the
+// written log in a specific way and checks both the surviving records and
+// the (bytes dropped, message) pair reported to the corruption reporter.
+
+TEST(LogTest, ReadError) {
+  Write("foo");
+  ForceError();
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST(LogTest, BadRecordType) {
+  Write("foo");
+  // Type is stored in header[6]
+  IncrementByte(6, 100);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("unknown record type"));
+}
+
+TEST(LogTest, TruncatedTrailingRecordIsIgnored) {
+  Write("foo");
+  ShrinkSize(4);   // Drop all payload as well as a header byte
+  ASSERT_EQ("EOF", Read());
+  // Truncated last record is ignored, not treated as an error
+  ASSERT_EQ(0U, DroppedBytes());
+  ASSERT_EQ("", ReportMessage());
+}
+
+TEST(LogTest, BadLength) {
+  const int kPayloadSize = kBlockSize - kHeaderSize;
+  Write(BigString("bar", kPayloadSize));
+  Write("foo");
+  // Least significant size byte is stored in header[4].
+  IncrementByte(4, 1);
+  ASSERT_EQ("foo", Read());
+  ASSERT_EQ(kBlockSize, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("bad record length"));
+}
+
+TEST(LogTest, BadLengthAtEndIsIgnored) {
+  Write("foo");
+  ShrinkSize(1);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(0U, DroppedBytes());
+  ASSERT_EQ("", ReportMessage());
+}
+
+TEST(LogTest, ChecksumMismatch) {
+  Write("foo");
+  IncrementByte(0, 10);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(10U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("checksum mismatch"));
+}
+
+TEST(LogTest, UnexpectedMiddleType) {
+  Write("foo");
+  SetByte(6, kMiddleType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedLastType) {
+  Write("foo");
+  SetByte(6, kLastType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedFullType) {
+  Write("foo");
+  Write("bar");
+  SetByte(6, kFirstType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, UnexpectedFirstType) {
+  Write("foo");
+  Write(BigString("bar", 100000));
+  SetByte(6, kFirstType);
+  FixChecksum(0, 3);
+  ASSERT_EQ(BigString("bar", 100000), Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, MissingLastIsIgnored) {
+  Write(BigString("bar", kBlockSize));
+  // Remove the LAST block, including header.
+  ShrinkSize(14);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ("", ReportMessage());
+  ASSERT_EQ(0U, DroppedBytes());
+}
+
+TEST(LogTest, PartialLastIsIgnored) {
+  Write(BigString("bar", kBlockSize));
+  // Cause a bad record length in the LAST block.
+  ShrinkSize(1);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ("", ReportMessage());
+  ASSERT_EQ(0U, DroppedBytes());
+}
+
+TEST(LogTest, ErrorJoinsRecords) {
+  // Consider two fragmented records:
+  //    first(R1) last(R1) first(R2) last(R2)
+  // where the middle two fragments disappear.  We do not want
+  // first(R1),last(R2) to get joined and returned as a valid record.
+
+  // Write records that span two blocks
+  Write(BigString("foo", kBlockSize));
+  Write(BigString("bar", kBlockSize));
+  Write("correct");
+
+  // Wipe the middle block
+  for (unsigned int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
+    SetByte(offset, 'x');
+  }
+
+  ASSERT_EQ("correct", Read());
+  ASSERT_EQ("EOF", Read());
+  const unsigned int dropped = DroppedBytes();
+  ASSERT_LE(dropped, 2*kBlockSize + 100);
+  ASSERT_GE(dropped, 2*kBlockSize);
+}
+
+// Tests of reading with a non-zero initial offset: the reader must return
+// the first record that starts at or after the given byte offset.
+TEST(LogTest, ReadStart) {
+  CheckInitialOffsetRecord(0, 0);
+}
+
+TEST(LogTest, ReadSecondOneOff) {
+  CheckInitialOffsetRecord(1, 1);
+}
+
+TEST(LogTest, ReadSecondTenThousand) {
+  CheckInitialOffsetRecord(10000, 1);
+}
+
+TEST(LogTest, ReadSecondStart) {
+  CheckInitialOffsetRecord(10007, 1);
+}
+
+TEST(LogTest, ReadThirdOneOff) {
+  CheckInitialOffsetRecord(10008, 2);
+}
+
+TEST(LogTest, ReadThirdStart) {
+  CheckInitialOffsetRecord(20014, 2);
+}
+
+TEST(LogTest, ReadFourthOneOff) {
+  CheckInitialOffsetRecord(20015, 3);
+}
+
+TEST(LogTest, ReadFourthFirstBlockTrailer) {
+  CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
+}
+
+TEST(LogTest, ReadFourthMiddleBlock) {
+  CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
+}
+
+TEST(LogTest, ReadFourthLastBlock) {
+  CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
+}
+
+TEST(LogTest, ReadFourthStart) {
+  // NOTE(review): this uses 1000 where the first two record sizes are
+  // 10000; the resulting offset still lands inside record 2's span, so the
+  // reader skips to record 3 and the test passes -- confirm intended.
+  CheckInitialOffsetRecord(
+      2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
+      3);
+}
+
+TEST(LogTest, ReadEnd) {
+  CheckOffsetPastEndReturnsNoRecords(0);
+}
+
+TEST(LogTest, ReadPastEnd) {
+  CheckOffsetPastEndReturnsNoRecords(5);
+}
+
+// Tests of UnmarkEOF(): after hitting EOF, writing more data and clearing
+// the EOF condition must let the reader continue with the new records.
+TEST(LogTest, ClearEofSingleBlock) {
+  Write("foo");
+  Write("bar");
+  ForceEOF(3 + kHeaderSize + 2);
+  ASSERT_EQ("foo", Read());
+  UnmarkEOF();
+  ASSERT_EQ("bar", Read());
+  ASSERT_TRUE(IsEOF());
+  ASSERT_EQ("EOF", Read());
+  Write("xxx");
+  UnmarkEOF();
+  ASSERT_EQ("xxx", Read());
+  ASSERT_TRUE(IsEOF());
+}
+
+TEST(LogTest, ClearEofMultiBlock) {
+  size_t num_full_blocks = 5;
+  size_t n = (kBlockSize - kHeaderSize) * num_full_blocks + 25;
+  Write(BigString("foo", n));
+  Write(BigString("bar", n));
+  ForceEOF(n + num_full_blocks * kHeaderSize + 10);
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_TRUE(IsEOF());
+  UnmarkEOF();
+  ASSERT_EQ(BigString("bar", n), Read());
+  ASSERT_TRUE(IsEOF());
+  Write(BigString("xxx", n));
+  UnmarkEOF();
+  ASSERT_EQ(BigString("xxx", n), Read());
+  ASSERT_TRUE(IsEOF());
+}
+
+TEST(LogTest, ClearEofError) {
+  // If an error occurs during Read() in UnmarkEOF(), the records contained
+  // in the buffer should be returned on subsequent calls of ReadRecord()
+  // until no more full records are left, whereafter ReadRecord() should return
+  // false to indicate that it cannot read any further.
+
+  Write("foo");
+  Write("bar");
+  UnmarkEOF();
+  ASSERT_EQ("foo", Read());
+  ASSERT_TRUE(IsEOF());
+  Write("xxx");
+  ForceError(0);
+  UnmarkEOF();
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, ClearEofError2) {
+  Write("foo");
+  Write("bar");
+  UnmarkEOF();
+  ASSERT_EQ("foo", Read());
+  Write("xxx");
+  ForceError(3);
+  UnmarkEOF();
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("read error"));
+}
+
+}  // namespace log
+}  // namespace rocksdb
+
+// Runs every LogTest registered above via the test harness.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/log_writer.cc b/db/log_writer.cc
new file mode 100644 (file)
index 0000000..df601a4
--- /dev/null
@@ -0,0 +1,108 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_writer.h"
+
+#include <stdint.h>
+#include "rocksdb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace rocksdb {
+namespace log {
+
+// Takes ownership of "dest" and precomputes the crc32c of every record
+// type so EmitPhysicalRecord() can extend a cached value instead of
+// recomputing it per record.
+Writer::Writer(unique_ptr<WritableFile>&& dest)
+    : dest_(std::move(dest)),
+      block_offset_(0) {
+  for (int i = 0; i <= kMaxRecordType; i++) {
+    char t = static_cast<char>(i);
+    type_crc_[i] = crc32c::Value(&t, 1);
+  }
+}
+
+// The destination file is owned by dest_ and released by its destructor.
+Writer::~Writer() {
+}
+
+// Appends "slice" to the log, fragmenting it into FULL / FIRST / MIDDLE /
+// LAST physical records so that no physical record straddles a block
+// boundary.  Returns the first non-OK status from the underlying file.
+Status Writer::AddRecord(const Slice& slice) {
+  const char* ptr = slice.data();
+  size_t left = slice.size();
+
+  // Fragment the record if necessary and emit it.  Note that if slice
+  // is empty, we still want to iterate once to emit a single
+  // zero-length record
+  Status s;
+  bool begin = true;
+  do {
+    const int leftover = kBlockSize - block_offset_;
+    assert(leftover >= 0);
+    if (leftover < kHeaderSize) {
+      // Switch to a new block
+      if (leftover > 0) {
+        // Fill the trailer (literal below relies on kHeaderSize being 7)
+        assert(kHeaderSize == 7);
+        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
+      }
+      block_offset_ = 0;
+    }
+
+    // Invariant: we never leave < kHeaderSize bytes in a block.
+    assert(kBlockSize - block_offset_ - kHeaderSize >= 0);
+
+    const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
+    const size_t fragment_length = (left < avail) ? left : avail;
+
+    // Choose the record type from whether this fragment is the first
+    // and/or the last piece of the logical record.
+    RecordType type;
+    const bool end = (left == fragment_length);
+    if (begin && end) {
+      type = kFullType;
+    } else if (begin) {
+      type = kFirstType;
+    } else if (end) {
+      type = kLastType;
+    } else {
+      type = kMiddleType;
+    }
+
+    s = EmitPhysicalRecord(type, ptr, fragment_length);
+    ptr += fragment_length;
+    left -= fragment_length;
+    begin = false;
+  } while (s.ok() && left > 0);
+  return s;
+}
+
+// Writes a single physical record: a 7-byte header (crc32c over type and
+// payload in buf[0..3], little-endian length in buf[4..5], record type in
+// buf[6]) followed by the "n" payload bytes, then flushes the file.
+Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
+  assert(n <= 0xffff);  // Must fit in two bytes
+  assert(block_offset_ + kHeaderSize + n <= kBlockSize);
+
+  // Format the header
+  char buf[kHeaderSize];
+  buf[4] = static_cast<char>(n & 0xff);
+  buf[5] = static_cast<char>(n >> 8);
+  buf[6] = static_cast<char>(t);
+
+  // Compute the crc of the record type and the payload.
+  uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
+  crc = crc32c::Mask(crc);                 // Adjust for storage
+  EncodeFixed32(buf, crc);                 // Fills buf[0..3]
+
+  // Write the header and the payload
+  Status s = dest_->Append(Slice(buf, kHeaderSize));
+  if (s.ok()) {
+    s = dest_->Append(Slice(ptr, n));
+    if (s.ok()) {
+      s = dest_->Flush();
+    }
+  }
+  block_offset_ += kHeaderSize + n;
+  return s;
+}
+
+}  // namespace log
+}  // namespace rocksdb
diff --git a/db/log_writer.h b/db/log_writer.h
new file mode 100644 (file)
index 0000000..d7b7aff
--- /dev/null
@@ -0,0 +1,55 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+#include "db/log_format.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class WritableFile;
+
+using std::unique_ptr;
+
+namespace log {
+
+// Appends records to a log file in the block-based format declared in
+// db/log_format.h.
+class Writer {
+ public:
+  // Create a writer that will append data to "*dest".
+  // "*dest" must be initially empty.
+  // "*dest" must remain live while this Writer is in use.
+  explicit Writer(unique_ptr<WritableFile>&& dest);
+  ~Writer();
+
+  // Appends "slice" as one logical record, fragmenting across blocks as
+  // needed.  Returns the first non-OK file status, if any.
+  Status AddRecord(const Slice& slice);
+
+  // Exposes the underlying file (e.g. for tests that corrupt the log).
+  WritableFile* file() { return dest_.get(); }
+  const WritableFile* file() const { return dest_.get(); }
+
+ private:
+  unique_ptr<WritableFile> dest_;
+  int block_offset_;       // Current offset in block
+
+  // crc32c values for all supported record types.  These are
+  // pre-computed to reduce the overhead of computing the crc of the
+  // record type stored in the header.
+  uint32_t type_crc_[kMaxRecordType + 1];
+
+  Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
+
+  // No copying allowed
+  Writer(const Writer&);
+  void operator=(const Writer&);
+};
+
+}  // namespace log
+}  // namespace rocksdb
diff --git a/db/memtable.cc b/db/memtable.cc
new file mode 100644 (file)
index 0000000..424efe8
--- /dev/null
@@ -0,0 +1,600 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/memtable.h"
+
+#include <memory>
+#include <algorithm>
+#include <limits>
+
+#include "db/dbformat.h"
+#include "db/merge_context.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice_transform.h"
+#include "util/arena.h"
+#include "util/coding.h"
+#include "util/murmurhash.h"
+#include "util/mutexlock.h"
+#include "util/perf_context_imp.h"
+#include "util/statistics.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+// Constructs an empty memtable.  The MemTableRep implementation comes from
+// options.memtable_factory; stripe locks are only allocated when
+// inplace_update_support is enabled, and the prefix bloom filter only when
+// a prefix extractor and a positive bloom-bits option are both present.
+MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
+    : comparator_(cmp),
+      refs_(0),
+      kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)),
+      kWriteBufferSize(options.write_buffer_size),
+      // NOTE(review): the arena is sized with the raw option value while
+      // kArenaBlockSize holds the optimized size -- confirm this is intended.
+      arena_(options.arena_block_size),
+      table_(options.memtable_factory->CreateMemTableRep(
+          comparator_, &arena_, options.prefix_extractor.get())),
+      num_entries_(0),
+      flush_in_progress_(false),
+      flush_completed_(false),
+      file_number_(0),
+      first_seqno_(0),
+      mem_next_logfile_number_(0),
+      locks_(options.inplace_update_support ? options.inplace_update_num_locks
+                                            : 0),
+      prefix_extractor_(options.prefix_extractor.get()),
+      should_flush_(ShouldFlushNow()) {
+  // if should_flush_ == true without an entry inserted, something must have
+  // gone wrong already.
+  assert(!should_flush_);
+  if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
+    prefix_bloom_.reset(new DynamicBloom(options.memtable_prefix_bloom_bits,
+                                         options.bloom_locality,
+                                         options.memtable_prefix_bloom_probes));
+  }
+}
+
+// A memtable may only be destroyed once every reference has been dropped.
+MemTable::~MemTable() {
+  assert(refs_ == 0);
+}
+
+// Approximate total memory charged to this memtable (arena plus rep),
+// saturating at the maximum size_t instead of overflowing.
+size_t MemTable::ApproximateMemoryUsage() {
+  size_t arena_usage = arena_.ApproximateMemoryUsage();
+  size_t table_usage = table_->ApproximateMemoryUsage();
+  // let MAX_USAGE =  std::numeric_limits<size_t>::max()
+  // then if arena_usage + total_usage >= MAX_USAGE, return MAX_USAGE.
+  // the following variation is to avoid numeric overflow.
+  if (arena_usage >= std::numeric_limits<size_t>::max() - table_usage) {
+    return std::numeric_limits<size_t>::max();
+  }
+  // otherwise, return the actual usage
+  return arena_usage + table_usage;
+}
+
+// Decides, after each insert, whether this memtable should be flushed.
+bool MemTable::ShouldFlushNow() const {
+  // Most of the time we cannot allocate arena blocks that exactly match the
+  // buffer size. Thus we have to decide if we should over-allocate or
+  // under-allocate.
+  // This constant variable can be interpreted as: if we still have more than
+  // "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to over
+  // allocate one more block.
+  const double kAllowOverAllocationRatio = 0.6;
+
+  // If the arena still has room for new block allocation, we can safely say
+  // it shouldn't flush.
+  auto allocated_memory =
+      table_->ApproximateMemoryUsage() + arena_.MemoryAllocatedBytes();
+
+  // if we can still allocate one more block without exceeding the
+  // over-allocation ratio, then we should not flush.
+  if (allocated_memory + kArenaBlockSize <
+      kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) {
+    return false;
+  }
+
+  // if user keeps adding entries that exceeds kWriteBufferSize, we need to
+  // flush earlier even though we still have much available memory left.
+  if (allocated_memory >
+      kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) {
+    return true;
+  }
+
+  // In this code path, Arena has already allocated its "last block", which
+  // means the total allocated memory size is either:
+  //  (1) "moderately" over allocated the memory (no more than `0.6 * arena
+  // block size`). Or,
+  //  (2) the allocated memory is less than write buffer size, but we'll stop
+  // here since if we allocate a new arena block, we'll over allocate too much
+  // more (half of the arena block size) memory.
+  //
+  // In either case, to avoid over-allocate, the last block will stop allocation
+  // when its usage reaches a certain ratio, which we carefully choose "0.75
+  // full" as the stop condition because it addresses the following issue with
+  // great simplicity: What if the next inserted entry's size is
+  // bigger than AllocatedAndUnused()?
+  //
+  // The answer is: if the entry size is also bigger than 0.25 *
+  // kArenaBlockSize, a dedicated block will be allocated for it; otherwise
+  // arena will anyway skip the AllocatedAndUnused() and allocate a new, empty
+  // and regular block. In either case, we *overly* over-allocated.
+  //
+  // Therefore, setting the last block to be at most "0.75 full" avoids both
+  // cases.
+  //
+  // NOTE: the average percentage of waste space of this approach can be counted
+  // as: "arena block size * 0.25 / write buffer size". User who specify a small
+  // write buffer size and/or big arena block size may suffer.
+  return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
+}
+
+// Compares two length-prefixed internal keys as stored in the memtable.
+int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
+                                        const char* prefix_len_key2) const {
+  // Internal keys are encoded as length-prefixed strings.
+  Slice k1 = GetLengthPrefixedSlice(prefix_len_key1);
+  Slice k2 = GetLengthPrefixedSlice(prefix_len_key2);
+  return comparator.Compare(k1, k2);
+}
+
+// Compares a length-prefixed internal key against an already-decoded one.
+int MemTable::KeyComparator::operator()(const char* prefix_len_key,
+                                        const Slice& key)
+    const {
+  // Internal keys are encoded as length-prefixed strings.
+  Slice a = GetLengthPrefixedSlice(prefix_len_key);
+  return comparator.Compare(a, key);
+}
+
+// Extracts the user key from an encoded entry: strip the length prefix,
+// then drop the trailing 8 bytes (the internal-key tag).
+Slice MemTableRep::UserKey(const char* key) const {
+  Slice slice = GetLengthPrefixedSlice(key);
+  return Slice(slice.data(), slice.size() - 8);
+}
+
+// Default allocation: carve "len" bytes out of the arena; the raw buffer
+// doubles as the opaque key handle.
+KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
+  *buf = arena_->Allocate(len);
+  return static_cast<KeyHandle>(*buf);
+}
+
+// Encode a suitable internal key target for "target" and return it.
+// Uses *scratch as scratch space, and the returned pointer will point
+// into this scratch space.  The encoding is a varint32 length prefix
+// followed by the raw key bytes, matching the memtable entry format.
+const char* EncodeKey(std::string* scratch, const Slice& target) {
+  scratch->clear();
+  PutVarint32(scratch, target.size());
+  scratch->append(target.data(), target.size());
+  return scratch->data();
+}
+
+// Iterator over the contents of a memtable.  When the memtable has a
+// prefix extractor and total order is not enforced, Seek() first consults
+// the memtable's prefix bloom filter and bails out without touching the
+// underlying rep when the bloom says the prefix is absent.
+class MemTableIterator: public Iterator {
+ public:
+  MemTableIterator(const MemTable& mem, const ReadOptions& options,
+                   bool enforce_total_order)
+      : bloom_(nullptr),
+        prefix_extractor_(mem.prefix_extractor_),
+        valid_(false) {
+    if (prefix_extractor_ != nullptr && !enforce_total_order) {
+      bloom_ = mem.prefix_bloom_.get();
+      iter_.reset(mem.table_->GetDynamicPrefixIterator());
+    } else {
+      iter_.reset(mem.table_->GetIterator());
+    }
+  }
+
+  virtual bool Valid() const { return valid_; }
+  virtual void Seek(const Slice& k) {
+    // A bloom miss on the target's prefix means no matching entry exists.
+    if (bloom_ != nullptr &&
+        !bloom_->MayContain(prefix_extractor_->Transform(ExtractUserKey(k)))) {
+      valid_ = false;
+      return;
+    }
+    iter_->Seek(k, nullptr);
+    valid_ = iter_->Valid();
+  }
+  virtual void SeekToFirst() {
+    iter_->SeekToFirst();
+    valid_ = iter_->Valid();
+  }
+  virtual void SeekToLast() {
+    iter_->SeekToLast();
+    valid_ = iter_->Valid();
+  }
+  virtual void Next() {
+    assert(Valid());
+    iter_->Next();
+    valid_ = iter_->Valid();
+  }
+  virtual void Prev() {
+    assert(Valid());
+    iter_->Prev();
+    valid_ = iter_->Valid();
+  }
+  // The rep stores length-prefixed key and value back to back; decode both.
+  virtual Slice key() const {
+    assert(Valid());
+    return GetLengthPrefixedSlice(iter_->key());
+  }
+  virtual Slice value() const {
+    assert(Valid());
+    Slice key_slice = GetLengthPrefixedSlice(iter_->key());
+    return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+  }
+
+  virtual Status status() const { return Status::OK(); }
+
+ private:
+  DynamicBloom* bloom_;
+  const SliceTransform* const prefix_extractor_;
+  std::unique_ptr<MemTableRep::Iterator> iter_;
+  bool valid_;
+
+  // No copying allowed
+  MemTableIterator(const MemTableIterator&);
+  void operator=(const MemTableIterator&);
+};
+
+// Returns a heap-allocated iterator over this memtable; the caller owns it.
+Iterator* MemTable::NewIterator(const ReadOptions& options,
+    bool enforce_total_order) {
+  return new MemTableIterator(*this, options, enforce_total_order);
+}
+
+// Returns the stripe lock guarding in-place updates for "key".  Only
+// meaningful when inplace_update_support is enabled (locks_ is empty
+// otherwise).
+port::RWMutex* MemTable::GetLock(const Slice& key) {
+  static murmur_hash hash;
+  return &locks_[hash(key) % locks_.size()];
+}
+
+// Encodes (key, s, type) -> value as a single entry and inserts it into
+// the underlying MemTableRep, updating the prefix bloom, the first
+// sequence number and the flush decision.
+void MemTable::Add(SequenceNumber s, ValueType type,
+                   const Slice& key, /* user key */
+                   const Slice& value) {
+  // Format of an entry is concatenation of:
+  //  key_size     : varint32 of internal_key.size()
+  //  key bytes    : char[internal_key.size()]
+  //  value_size   : varint32 of value.size()
+  //  value bytes  : char[value.size()]
+  size_t key_size = key.size();
+  size_t val_size = value.size();
+  size_t internal_key_size = key_size + 8;  // user key + 8-byte tag
+  const size_t encoded_len =
+      VarintLength(internal_key_size) + internal_key_size +
+      VarintLength(val_size) + val_size;
+  char* buf = nullptr;
+  KeyHandle handle = table_->Allocate(encoded_len, &buf);
+  assert(buf != nullptr);
+  char* p = EncodeVarint32(buf, internal_key_size);
+  memcpy(p, key.data(), key_size);
+  p += key_size;
+  // Tag: sequence number in the high 56 bits, value type in the low 8.
+  EncodeFixed64(p, (s << 8) | type);
+  p += 8;
+  p = EncodeVarint32(p, val_size);
+  memcpy(p, value.data(), val_size);
+  assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
+  table_->Insert(handle);
+  num_entries_++;
+
+  if (prefix_bloom_) {
+    assert(prefix_extractor_);
+    prefix_bloom_->Add(prefix_extractor_->Transform(key));
+  }
+
+  // The first sequence number inserted into the memtable
+  assert(first_seqno_ == 0 || s > first_seqno_);
+  if (first_seqno_ == 0) {
+    first_seqno_ = s;
+  }
+
+  should_flush_ = ShouldFlushNow();
+}
+
+// Callback from MemTable::Get()
+namespace {
+
+// Bundles all in/out state threaded through the SaveValue() callback below.
+struct Saver {
+  Status* status;
+  const LookupKey* key;
+  bool* found_final_value;  // Is value set correctly? Used by KeyMayExist
+  bool* merge_in_progress;
+  std::string* value;
+  const MergeOperator* merge_operator;
+  // the merge operations encountered;
+  MergeContext* merge_context;
+  MemTable* mem;
+  Logger* logger;
+  Statistics* statistics;
+  bool inplace_update_support;
+};
+}  // namespace
+
+static bool SaveValue(void* arg, const char* entry) {
+  Saver* s = reinterpret_cast<Saver*>(arg);
+  MergeContext* merge_context = s->merge_context;
+  const MergeOperator* merge_operator = s->merge_operator;
+
+  assert(s != nullptr && merge_context != nullptr);
+
+  // entry format is:
+  //    klength  varint32
+  //    userkey  char[klength-8]
+  //    tag      uint64
+  //    vlength  varint32
+  //    value    char[vlength]
+  // Check that it belongs to same user key.  We do not check the
+  // sequence number since the Seek() call above should have skipped
+  // all entries with overly large sequence numbers.
+  uint32_t key_length;
+  const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+  if (s->mem->GetInternalKeyComparator().user_comparator()->Compare(
+          Slice(key_ptr, key_length - 8), s->key->user_key()) == 0) {
+    // Correct user key
+    const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+    switch (static_cast<ValueType>(tag & 0xff)) {
+      case kTypeValue: {
+        if (s->inplace_update_support) {
+          s->mem->GetLock(s->key->user_key())->ReadLock();
+        }
+        Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+        *(s->status) = Status::OK();
+        if (*(s->merge_in_progress)) {
+          assert(merge_operator);
+          if (!merge_operator->FullMerge(s->key->user_key(), &v,
+                                         merge_context->GetOperands(), s->value,
+                                         s->logger)) {
+            RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
+            *(s->status) =
+                Status::Corruption("Error: Could not perform merge.");
+          }
+        } else {
+          s->value->assign(v.data(), v.size());
+        }
+        if (s->inplace_update_support) {
+          s->mem->GetLock(s->key->user_key())->Unlock();
+        }
+        *(s->found_final_value) = true;
+        return false;
+      }
+      case kTypeDeletion: {
+        if (*(s->merge_in_progress)) {
+          assert(merge_operator);
+          *(s->status) = Status::OK();
+          if (!merge_operator->FullMerge(s->key->user_key(), nullptr,
+                                         merge_context->GetOperands(), s->value,
+                                         s->logger)) {
+            RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
+            *(s->status) =
+                Status::Corruption("Error: Could not perform merge.");
+          }
+        } else {
+          *(s->status) = Status::NotFound();
+        }
+        *(s->found_final_value) = true;
+        return false;
+      }
+      case kTypeMerge: {
+        std::string merge_result;  // temporary area for merge results later
+        Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+        *(s->merge_in_progress) = true;
+        merge_context->PushOperand(v);
+        return true;
+      }
+      default:
+        assert(false);
+        return true;
+    }
+  }
+
+  // s->state could be Corrupt, merge or notfound
+  return false;
+}
+
+bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
+                   MergeContext& merge_context, const Options& options) {
+  PERF_TIMER_AUTO(get_from_memtable_time);
+
+  Slice user_key = key.user_key();
+  bool found_final_value = false;
+  bool merge_in_progress = s->IsMergeInProgress();
+
+  if (prefix_bloom_ &&
+      !prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) {
+    // iter is null if prefix bloom says the key does not exist
+  } else {
+    Saver saver;
+    saver.status = s;
+    saver.found_final_value = &found_final_value;
+    saver.merge_in_progress = &merge_in_progress;
+    saver.key = &key;
+    saver.value = value;
+    saver.status = s;
+    saver.mem = this;
+    saver.merge_context = &merge_context;
+    saver.merge_operator = options.merge_operator.get();
+    saver.logger = options.info_log.get();
+    saver.inplace_update_support = options.inplace_update_support;
+    saver.statistics = options.statistics.get();
+    table_->Get(key, &saver, SaveValue);
+  }
+
+  // No change to value, since we have not yet found a Put/Delete
+  if (!found_final_value && merge_in_progress) {
+    *s = Status::MergeInProgress("");
+  }
+  PERF_TIMER_STOP(get_from_memtable_time);
+  PERF_COUNTER_ADD(get_from_memtable_count, 1);
+  return found_final_value;
+}
+
+// In-place update: if the newest entry for `key` is a plain value and the new
+// value fits in the existing encoded slot, overwrite it under the per-key
+// write lock; otherwise fall back to a regular Add().  See the pseudocode on
+// the declaration in memtable.h.
+void MemTable::Update(SequenceNumber seq,
+                      const Slice& key,
+                      const Slice& value) {
+  LookupKey lkey(key, seq);
+  Slice mem_key = lkey.memtable_key();
+
+  std::unique_ptr<MemTableRep::Iterator> iter(
+    table_->GetIterator(lkey.user_key()));
+  iter->Seek(lkey.internal_key(), mem_key.data());
+
+  if (iter->Valid()) {
+    // entry format is:
+    //    key_length  varint32
+    //    userkey  char[klength-8]
+    //    tag      uint64
+    //    vlength  varint32
+    //    value    char[vlength]
+    // Check that it belongs to same user key.  We do not check the
+    // sequence number since the Seek() call above should have skipped
+    // all entries with overly large sequence numbers.
+    const char* entry = iter->key();
+    uint32_t key_length = 0;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Compare(
+        Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      switch (static_cast<ValueType>(tag & 0xff)) {
+        case kTypeValue: {
+          Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+          uint32_t prev_size = prev_value.size();
+          uint32_t new_size = value.size();
+
+          // Update value, if new value size  <= previous value size
+          if (new_size <= prev_size ) {
+            // Rewrite the vlength varint, then the value bytes, holding the
+            // per-key lock so concurrent readers see a consistent entry.
+            char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
+                                     new_size);
+            WriteLock wl(GetLock(lkey.user_key()));
+            memcpy(p, value.data(), value.size());
+            assert((unsigned)((p + value.size()) - entry) ==
+                   (unsigned)(VarintLength(key_length) + key_length +
+                              VarintLength(value.size()) + value.size()));
+            return;
+          }
+        }
+        // NOTE: intentional fall-through — when the new value does not fit
+        // in the existing slot, control drops into `default` and a fresh
+        // entry is added instead.
+        default:
+          // If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData
+          // we don't have enough space for update inplace
+            Add(seq, kTypeValue, key, value);
+            return;
+      }
+    }
+  }
+
+  // key doesn't exist
+  Add(seq, kTypeValue, key, value);
+}
+
+// Applies options.inplace_callback to the newest entry for `key` when that
+// entry is a plain value.  Returns true when the update was handled —
+// in place (UPDATED_INPLACE), via a fresh Add (UPDATED), or declined by the
+// callback (UPDATE_FAILED).  Returns false when the newest entry is not a
+// kTypeValue or the key is absent, so the caller must fall back.
+bool MemTable::UpdateCallback(SequenceNumber seq,
+                              const Slice& key,
+                              const Slice& delta,
+                              const Options& options) {
+  LookupKey lkey(key, seq);
+  Slice memkey = lkey.memtable_key();
+
+  std::unique_ptr<MemTableRep::Iterator> iter(
+    table_->GetIterator(lkey.user_key()));
+  iter->Seek(lkey.internal_key(), memkey.data());
+
+  if (iter->Valid()) {
+    // entry format is:
+    //    key_length  varint32
+    //    userkey  char[klength-8]
+    //    tag      uint64
+    //    vlength  varint32
+    //    value    char[vlength]
+    // Check that it belongs to same user key.  We do not check the
+    // sequence number since the Seek() call above should have skipped
+    // all entries with overly large sequence numbers.
+    const char* entry = iter->key();
+    uint32_t key_length = 0;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Compare(
+        Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      switch (static_cast<ValueType>(tag & 0xff)) {
+        case kTypeValue: {
+          Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+          uint32_t  prev_size = prev_value.size();
+
+          char* prev_buffer = const_cast<char*>(prev_value.data());
+          uint32_t  new_prev_size = prev_size;
+
+          std::string str_value;
+          // Hold the per-key lock while the callback mutates the entry.
+          WriteLock wl(GetLock(lkey.user_key()));
+          auto status = options.inplace_callback(prev_buffer, &new_prev_size,
+                                                    delta, &str_value);
+          if (status == UpdateStatus::UPDATED_INPLACE) {
+            // Value already updated by callback.
+            assert(new_prev_size <= prev_size);
+            if (new_prev_size < prev_size) {
+              // overwrite the new prev_size
+              char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
+                                       new_prev_size);
+              if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
+                // shift the value buffer as well.
+                memcpy(p, prev_buffer, new_prev_size);
+              }
+            }
+            RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED);
+            should_flush_ = ShouldFlushNow();
+            return true;
+          } else if (status == UpdateStatus::UPDATED) {
+            // Callback produced a replacement value; append it as a new entry.
+            Add(seq, kTypeValue, key, Slice(str_value));
+            RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN);
+            should_flush_ = ShouldFlushNow();
+            return true;
+          } else if (status == UpdateStatus::UPDATE_FAILED) {
+            // No action required. Return.
+            should_flush_ = ShouldFlushNow();
+            return true;
+          }
+        }
+        // Falls through to `default` for any unrecognized callback status.
+        default:
+          break;
+      }
+    }
+  }
+  // If the latest value is not kTypeValue
+  // or key doesn't exist
+  return false;
+}
+
+// Counts merge entries for `key` starting from the newest, stopping at the
+// first non-merge entry, a different user key, or the end of the table.
+size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
+  Slice memkey = key.memtable_key();
+
+  // A total ordered iterator is costly for some memtablerep (prefix aware
+  // reps). By passing in the user key, we allow efficient iterator creation.
+  // The iterator only needs to be ordered within the same user key.
+  std::unique_ptr<MemTableRep::Iterator> iter(
+      table_->GetIterator(key.user_key()));
+  iter->Seek(key.internal_key(), memkey.data());
+
+  size_t num_successive_merges = 0;
+
+  for (; iter->Valid(); iter->Next()) {
+    const char* entry = iter->key();
+    uint32_t key_length = 0;
+    const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    // Stop once we leave the run of entries for this user key.
+    if (comparator_.comparator.user_comparator()->Compare(
+            Slice(iter_key_ptr, key_length - 8), key.user_key()) != 0) {
+      break;
+    }
+
+    // Stop at the first entry that is not a merge operand.
+    const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8);
+    if (static_cast<ValueType>(tag & 0xff) != kTypeMerge) {
+      break;
+    }
+
+    ++num_successive_merges;
+  }
+
+  return num_successive_merges;
+}
+
+// Default Get() implementation for MemTableRep: scan the entries for this
+// user key and invoke the callback on each until it returns false.
+void MemTableRep::Get(const LookupKey& k, void* callback_args,
+                      bool (*callback_func)(void* arg, const char* entry)) {
+  // Own the iterator so it is released on every path.  The raw pointer from
+  // GetIterator() was previously leaked here; every other call site in this
+  // file wraps it in a unique_ptr.
+  std::unique_ptr<Iterator> iter(GetIterator(k.user_key()));
+  for (iter->Seek(k.internal_key(), k.memtable_key().data());
+       iter->Valid() && callback_func(callback_args, iter->key());
+       iter->Next()) {
+  }
+}
+
+}  // namespace rocksdb
diff --git a/db/memtable.h b/db/memtable.h
new file mode 100644 (file)
index 0000000..7e9af35
--- /dev/null
@@ -0,0 +1,217 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <string>
+#include <memory>
+#include <deque>
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "db/version_edit.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "util/arena.h"
+#include "util/dynamic_bloom.h"
+
+namespace rocksdb {
+
+class Mutex;
+class MemTableIterator;
+class MergeContext;
+
+// An in-memory, reference-counted write buffer.  Entries live in a
+// MemTableRep (table_) allocated from an arena until the memtable is marked
+// immutable and flushed to storage.  Apart from the per-key RW locks used by
+// the in-place-update paths, no internal synchronization is visible here —
+// callers appear to rely on external synchronization (see the comment on
+// ApproximateMemoryUsage()).
+class MemTable {
+ public:
+  struct KeyComparator : public MemTableRep::KeyComparator {
+    const InternalKeyComparator comparator;
+    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
+    // NOTE(review): this overload lacks `override` while the one below has
+    // it — confirm it matches a virtual in the base class.
+    virtual int operator()(const char* prefix_len_key1,
+                           const char* prefix_len_key2) const;
+    virtual int operator()(const char* prefix_len_key,
+                           const Slice& key) const override;
+  };
+
+  // MemTables are reference counted.  The initial reference count
+  // is zero and the caller must call Ref() at least once.
+  explicit MemTable(const InternalKeyComparator& comparator,
+                    const Options& options);
+
+  ~MemTable();
+
+  // Increase reference count.
+  void Ref() { ++refs_; }
+
+  // Drop reference count.
+  // If the refcount goes to zero return this memtable, otherwise return null
+  MemTable* Unref() {
+    --refs_;
+    assert(refs_ >= 0);
+    if (refs_ <= 0) {
+      return this;
+    }
+    return nullptr;
+  }
+
+  // Returns an estimate of the number of bytes of data in use by this
+  // data structure.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  size_t ApproximateMemoryUsage();
+
+  // This method heuristically determines if the memtable should continue to
+  // host more data.
+  bool ShouldFlush() const { return should_flush_; }
+
+  // Return an iterator that yields the contents of the memtable.
+  //
+  // The caller must ensure that the underlying MemTable remains live
+  // while the returned iterator is live.  The keys returned by this
+  // iterator are internal keys encoded by AppendInternalKey in the
+  // db/dbformat.{h,cc} module.
+  //
+  // By default, it returns an iterator for prefix seek if prefix_extractor
+  // is configured in Options.
+  Iterator* NewIterator(const ReadOptions& options,
+                        bool enforce_total_order = false);
+
+  // Add an entry into memtable that maps key to value at the
+  // specified sequence number and with the specified type.
+  // Typically value will be empty if type==kTypeDeletion.
+  void Add(SequenceNumber seq, ValueType type,
+           const Slice& key,
+           const Slice& value);
+
+  // If memtable contains a value for key, store it in *value and return true.
+  // If memtable contains a deletion for key, store a NotFound() error
+  // in *status and return true.
+  // If memtable contains Merge operation as the most recent entry for a key,
+  //   and the merge process does not stop (not reaching a value or delete),
+  //   prepend the current merge operand to *operands.
+  //   store MergeInProgress in s, and return false.
+  // Else, return false.
+  bool Get(const LookupKey& key, std::string* value, Status* s,
+           MergeContext& merge_context, const Options& options);
+
+  // Attempts to update the new_value inplace, else does normal Add
+  // Pseudocode
+  //   if key exists in current memtable && prev_value is of type kTypeValue
+  //     if new sizeof(new_value) <= sizeof(prev_value)
+  //       update inplace
+  //     else add(key, new_value)
+  //   else add(key, new_value)
+  void Update(SequenceNumber seq,
+              const Slice& key,
+              const Slice& value);
+
+  // If prev_value for key exits, attempts to update it inplace.
+  // else returns false
+  // Pseudocode
+  //   if key exists in current memtable && prev_value is of type kTypeValue
+  //     new_value = delta(prev_value)
+  //     if sizeof(new_value) <= sizeof(prev_value)
+  //       update inplace
+  //     else add(key, new_value)
+  //   else return false
+  bool UpdateCallback(SequenceNumber seq,
+                      const Slice& key,
+                      const Slice& delta,
+                      const Options& options);
+
+  // Returns the number of successive merge entries starting from the newest
+  // entry for the key up to the last non-merge entry or last entry for the
+  // key in the memtable.
+  size_t CountSuccessiveMergeEntries(const LookupKey& key);
+
+  // Get total number of entries in the mem table.
+  uint64_t GetNumEntries() const { return num_entries_; }
+
+  // Returns the edits area that is needed for flushing the memtable
+  VersionEdit* GetEdits() { return &edit_; }
+
+  // Returns the sequence number of the first element that was inserted
+  // into the memtable
+  SequenceNumber GetFirstSequenceNumber() { return first_seqno_; }
+
+  // Returns the next active logfile number when this memtable is about to
+  // be flushed to storage
+  uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
+
+  // Sets the next active logfile number when this memtable is about to
+  // be flushed to storage
+  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
+
+  // Notify the underlying storage that no more items will be added
+  void MarkImmutable() { table_->MarkReadOnly(); }
+
+  // return true if the current MemTableRep supports merge operator.
+  bool IsMergeOperatorSupported() const {
+    return table_->IsMergeOperatorSupported();
+  }
+
+  // return true if the current MemTableRep supports snapshots.
+  bool IsSnapshotSupported() const { return table_->IsSnapshotSupported(); }
+
+  // Get the lock associated for the key
+  port::RWMutex* GetLock(const Slice& key);
+
+  const InternalKeyComparator& GetInternalKeyComparator() const {
+    return comparator_.comparator;
+  }
+
+  const Arena& TEST_GetArena() const { return arena_; }
+
+ private:
+  // Dynamically check if we can add more incoming entries.
+  bool ShouldFlushNow() const;
+
+  friend class MemTableIterator;
+  friend class MemTableBackwardIterator;
+  friend class MemTableList;
+
+  KeyComparator comparator_;  // internal-key ordering used by table_
+  int refs_;                  // reference count; see Ref()/Unref()
+  const size_t kArenaBlockSize;
+  const size_t kWriteBufferSize;
+  Arena arena_;                    // arena for this memtable (TEST_GetArena)
+  unique_ptr<MemTableRep> table_;  // the actual key/value container
+
+  uint64_t num_entries_;  // entry count reported by GetNumEntries()
+
+  // These are used to manage memtable flushes to storage
+  bool flush_in_progress_; // started the flush
+  bool flush_completed_;   // finished the flush
+  uint64_t file_number_;    // filled up after flush is complete
+
+  // The updates to be applied to the transaction log when this
+  // memtable is flushed to storage.
+  VersionEdit edit_;
+
+  // The sequence number of the kv that was inserted first
+  SequenceNumber first_seqno_;
+
+  // The log files earlier than this number can be deleted.
+  uint64_t mem_next_logfile_number_;
+
+  // rw locks for inplace updates
+  std::vector<port::RWMutex> locks_;
+
+  // No copying allowed
+  MemTable(const MemTable&);
+  void operator=(const MemTable&);
+
+  const SliceTransform* const prefix_extractor_;
+  // Bloom filter over key prefixes, consulted by Get() before probing table_.
+  std::unique_ptr<DynamicBloom> prefix_bloom_;
+
+  // a flag indicating if a memtable has met the criteria to flush
+  bool should_flush_;
+};
+
+extern const char* EncodeKey(std::string* scratch, const Slice& target);
+
+}  // namespace rocksdb
diff --git a/db/memtable_list.cc b/db/memtable_list.cc
new file mode 100644 (file)
index 0000000..2354219
--- /dev/null
@@ -0,0 +1,277 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "db/memtable_list.h"
+
+#include <string>
+#include "rocksdb/db.h"
+#include "db/memtable.h"
+#include "db/version_set.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "util/coding.h"
+#include "util/log_buffer.h"
+
+namespace rocksdb {
+
+class InternalKeyComparator;
+class Mutex;
+class VersionSet;
+
+// Clone-construct from `old` (if any), taking an extra reference on every
+// copied memtable so the new version keeps them alive independently.
+MemTableListVersion::MemTableListVersion(MemTableListVersion* old) {
+  if (old != nullptr) {
+    memlist_ = old->memlist_;
+    size_ = old->size_;
+    for (auto& m : memlist_) {
+      m->Ref();
+    }
+  }
+}
+
+// Increase this version's reference count by one.
+void MemTableListVersion::Ref() { ++refs_; }
+
+// Drop one reference.  When the count hits zero, unref every memtable,
+// collect the ones that died into *to_delete, and destroy this version.
+void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
+  assert(refs_ >= 1);
+  --refs_;
+  if (refs_ == 0) {
+    // if to_delete is equal to nullptr it means we're confident
+    // that refs_ will not be zero
+    assert(to_delete != nullptr);
+    for (const auto& m : memlist_) {
+      MemTable* x = m->Unref();
+      if (x != nullptr) {
+        to_delete->push_back(x);
+      }
+    }
+    delete this;
+  }
+}
+
+// Number of memtables held by this version.
+int MemTableListVersion::size() const { return size_; }
+
+// Returns the total number of memtables in the list
+// (delegates to the current version).
+int MemTableList::size() const {
+  assert(num_flush_not_started_ <= current_->size_);
+  return current_->size_;
+}
+
+// Search all the memtables starting from the most recent one.
+// Return the most recent value found, if any.
+// Operands stores the list of merge operations to apply, so far.
+bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
+                              Status* s, MergeContext& merge_context,
+                              const Options& options) {
+  // memlist_ is ordered newest-first (see Add()), so the first hit wins.
+  for (auto& memtable : memlist_) {
+    if (memtable->Get(key, value, s, merge_context, options)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Append one iterator per memtable (newest first) to *iterator_list.
+// Ownership of the iterators passes to the caller.
+void MemTableListVersion::AddIterators(const ReadOptions& options,
+                                       std::vector<Iterator*>* iterator_list) {
+  for (auto& m : memlist_) {
+    iterator_list->push_back(m->NewIterator(options));
+  }
+}
+
+// Sum of entry counts across all memtables in this version.
+uint64_t MemTableListVersion::GetTotalNumEntries() const {
+  uint64_t total_num = 0;
+  for (auto& m : memlist_) {
+    total_num += m->GetNumEntries();
+  }
+  return total_num;
+}
+
+// caller is responsible for referencing m
+void MemTableListVersion::Add(MemTable* m) {
+  assert(refs_ == 1);  // only when refs_ == 1 is MemTableListVersion mutable
+  memlist_.push_front(m);  // newest memtables live at the front
+  ++size_;
+}
+
+// caller is responsible for unreferencing m
+void MemTableListVersion::Remove(MemTable* m) {
+  assert(refs_ == 1);  // only when refs_ == 1 is MemTableListVersion mutable
+  memlist_.remove(m);
+  --size_;
+}
+
+// Returns true if there is at least one memtable on which flush has
+// not yet started.
+bool MemTableList::IsFlushPending() const {
+  // A flush is pending either when one was explicitly requested, or when
+  // enough unflushed memtables have accumulated to reach
+  // min_write_buffer_number_to_merge_.
+  if ((flush_requested_ && num_flush_not_started_ >= 1) ||
+      (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
+    assert(imm_flush_needed.NoBarrier_Load() != nullptr);
+    return true;
+  }
+  return false;
+}
+
+// Returns the memtables that need to be flushed.
+void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
+  const auto& memlist = current_->memlist_;
+  // Walk oldest-to-newest (reverse of list order) so the picked memtables
+  // come out in creation order.
+  for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+    MemTable* m = *it;
+    if (!m->flush_in_progress_) {
+      assert(!m->flush_completed_);
+      num_flush_not_started_--;
+      if (num_flush_not_started_ == 0) {
+        // Nothing left to start flushing; clear the "flush needed" signal.
+        imm_flush_needed.Release_Store(nullptr);
+      }
+      m->flush_in_progress_ = true;  // flushing will start very soon
+      ret->push_back(m);
+    }
+  }
+  flush_requested_ = false;  // start-flush request is complete
+}
+
+// Reset the state of the given memtables so a later flush attempt can pick
+// them up again, and drop `file_number` from the pending output set.
+void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
+                                         uint64_t file_number,
+                                         std::set<uint64_t>* pending_outputs) {
+  assert(!mems.empty());
+
+  // If the flush was not successful, then just reset state.
+  // Maybe a succeeding attempt to flush will be successful.
+  for (MemTable* m : mems) {
+    assert(m->flush_in_progress_);
+    assert(m->file_number_ == 0);
+
+    m->flush_in_progress_ = false;
+    m->flush_completed_ = false;
+    m->edit_.Clear();
+    num_flush_not_started_++;
+  }
+  pending_outputs->erase(file_number);
+  // Re-arm the "flush needed" signal for background threads.
+  imm_flush_needed.Release_Store(reinterpret_cast<void *>(1));
+}
+
+// Record a successful flush in the manifest file.
+// Marks `mems` as flushed, then — under a single-committer guard — commits
+// finished memtables to the manifest in creation order, removing them from
+// the current version (or resetting them for retry if the commit fails).
+// REQUIRES: *mu held on entry; LogAndApply may release and reacquire it.
+Status MemTableList::InstallMemtableFlushResults(
+    ColumnFamilyData* cfd, const autovector<MemTable*>& mems, VersionSet* vset,
+    port::Mutex* mu, Logger* info_log, uint64_t file_number,
+    std::set<uint64_t>& pending_outputs, autovector<MemTable*>* to_delete,
+    Directory* db_directory, LogBuffer* log_buffer) {
+  mu->AssertHeld();
+
+  // flush was successful
+  for (size_t i = 0; i < mems.size(); ++i) {
+    // All the edits are associated with the first memtable of this batch.
+    assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
+
+    mems[i]->flush_completed_ = true;
+    mems[i]->file_number_ = file_number;
+  }
+
+  // if some other thread is already committing, then return
+  Status s;
+  if (commit_in_progress_) {
+    return s;
+  }
+
+  // Only a single thread can be executing this piece of code
+  commit_in_progress_ = true;
+
+  // scan all memtables from the earliest, and commit those
+  // (in that order) that have finished flushing. Memtables
+  // are always committed in the order that they were created.
+  while (!current_->memlist_.empty() && s.ok()) {
+    MemTable* m = current_->memlist_.back();  // get the last element
+    if (!m->flush_completed_) {
+      break;
+    }
+
+    LogToBuffer(log_buffer, "[%s] Level-0 commit table #%lu started",
+                cfd->GetName().c_str(), (unsigned long)m->file_number_);
+
+    // this can release and reacquire the mutex.
+    s = vset->LogAndApply(cfd, &m->edit_, mu, db_directory);
+
+    // we will be changing the version in the next code path,
+    // so we better create a new one, since versions are immutable
+    InstallNewVersion();
+
+    // All the later memtables that have the same filenum
+    // are part of the same batch. They can be committed now.
+    uint64_t mem_id = 1;  // how many memtables has been flushed.
+    do {
+      if (s.ok()) { // commit new state
+        LogToBuffer(log_buffer,
+                    "[%s] Level-0 commit table #%lu: memtable #%lu done",
+                    cfd->GetName().c_str(), (unsigned long)m->file_number_,
+                    (unsigned long)mem_id);
+        current_->Remove(m);
+        assert(m->file_number_ > 0);
+
+        // pending_outputs can be cleared only after the newly created file
+        // has been written to a committed version so that other concurrently
+        // executing compaction threads do not mistakenly assume that this
+        // file is not live.
+        pending_outputs.erase(m->file_number_);
+        if (m->Unref() != nullptr) {
+          to_delete->push_back(m);
+        }
+      } else {
+        // commit failed. setup state so that we can flush again.
+        Log(info_log,
+            "Level-0 commit table #%lu: memtable #%lu failed",
+            (unsigned long)m->file_number_,
+            (unsigned long)mem_id);
+        m->flush_completed_ = false;
+        m->flush_in_progress_ = false;
+        m->edit_.Clear();
+        num_flush_not_started_++;
+        pending_outputs.erase(m->file_number_);
+        m->file_number_ = 0;
+        // re-arm the flush-needed signal so the flush is retried
+        imm_flush_needed.Release_Store((void *)1);
+      }
+      ++mem_id;
+    } while (!current_->memlist_.empty() && (m = current_->memlist_.back()) &&
+             m->file_number_ == file_number);
+  }
+  commit_in_progress_ = false;
+  return s;
+}
+
+// New memtables are inserted at the front of the list.
+void MemTableList::Add(MemTable* m) {
+  assert(current_->size_ >= num_flush_not_started_);
+  InstallNewVersion();
+  // this method is used to move mutable memtable into an immutable list.
+  // since mutable memtable is already refcounted by the DBImpl,
+  // and when moving to the immutable list we don't unref it,
+  // we don't have to ref the memtable here. we just take over the
+  // reference from the DBImpl.
+  current_->Add(m);
+  m->MarkImmutable();
+  num_flush_not_started_++;
+  if (num_flush_not_started_ == 1) {
+    // first unflushed memtable: signal background flush threads
+    imm_flush_needed.Release_Store((void *)1);
+  }
+}
+
+// Returns an estimate of the number of bytes of data in use,
+// summed over every memtable in the current version.
+size_t MemTableList::ApproximateMemoryUsage() {
+  size_t size = 0;
+  for (auto& memtable : current_->memlist_) {
+    size += memtable->ApproximateMemoryUsage();
+  }
+  return size;
+}
+
+// Copy-on-write of the current version: keep it when we hold the only
+// reference, otherwise clone it so existing readers keep their snapshot.
+void MemTableList::InstallNewVersion() {
+  if (current_->refs_ == 1) {
+    // we're the only one using the version, just keep using it
+  } else {
+    // somebody else holds the current version, we need to create new one
+    MemTableListVersion* version = current_;
+    current_ = new MemTableListVersion(current_);
+    current_->Ref();
+    version->Unref();
+  }
+}
+
+}  // namespace rocksdb
diff --git a/db/memtable_list.h b/db/memtable_list.h
new file mode 100644 (file)
index 0000000..d85380b
--- /dev/null
@@ -0,0 +1,152 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+
+#include <string>
+#include <list>
+#include <vector>
+#include <set>
+#include <deque>
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/iterator.h"
+
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "db/memtable.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "util/autovector.h"
+#include "util/log_buffer.h"
+
+namespace rocksdb {
+
+class ColumnFamilyData;
+class InternalKeyComparator;
+class Mutex;
+
+// keeps a list of immutable memtables (a std::list, newest at the front).
+// The list is immutable if refcount is bigger than one. It is used as a
+// state for Get() and Iterator code paths
+class MemTableListVersion {
+ public:
+  explicit MemTableListVersion(MemTableListVersion* old = nullptr);
+
+  // Reference counting; Unref() appends dead memtables to *to_delete.
+  void Ref();
+  void Unref(autovector<MemTable*>* to_delete = nullptr);
+
+  int size() const;
+
+  // Search all the memtables starting from the most recent one.
+  // Return the most recent value found, if any.
+  bool Get(const LookupKey& key, std::string* value, Status* s,
+           MergeContext& merge_context, const Options& options);
+
+  // Appends one iterator per memtable; caller owns the iterators.
+  void AddIterators(const ReadOptions& options,
+                    std::vector<Iterator*>* iterator_list);
+
+  uint64_t GetTotalNumEntries() const;
+
+ private:
+  // REQUIRE: m is mutable memtable
+  void Add(MemTable* m);
+  // REQUIRE: m is mutable memtable
+  void Remove(MemTable* m);
+
+  friend class MemTableList;
+  std::list<MemTable*> memlist_;  // memtables, newest at the front
+  int size_ = 0;                  // number of entries in memlist_
+  int refs_ = 0;                  // reference count (Ref()/Unref())
+};
+
+// This class stores references to all the immutable memtables.
+// The memtables are flushed to L0 as soon as possible and in
+// any order. If there are more than one immutable memtable, their
+// flushes can occur concurrently.  However, they are 'committed'
+// to the manifest in FIFO order to maintain correctness and
+// recoverability from a crash.
+class MemTableList {
+ public:
+  // A list of memtables.
+  explicit MemTableList(int min_write_buffer_number_to_merge)
+      : min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
+        current_(new MemTableListVersion()),
+        num_flush_not_started_(0),
+        commit_in_progress_(false),
+        flush_requested_(false) {
+    imm_flush_needed.Release_Store(nullptr);
+    current_->Ref();
+  }
+  // NOTE(review): the constructor Ref()s current_, but this destructor never
+  // Unref()s it — confirm the owner releases the version elsewhere.
+  ~MemTableList() {}
+
+  MemTableListVersion* current() { return current_; }
+
+  // so that background threads can detect non-nullptr pointer to
+  // determine whether there is anything more to start flushing.
+  port::AtomicPointer imm_flush_needed;
+
+  // Returns the total number of memtables in the list
+  int size() const;
+
+  // Returns true if there is at least one memtable on which flush has
+  // not yet started.
+  bool IsFlushPending() const;
+
+  // Returns the earliest memtables that needs to be flushed. The returned
+  // memtables are guaranteed to be in the ascending order of created time.
+  void PickMemtablesToFlush(autovector<MemTable*>* mems);
+
+  // Reset status of the given memtable list back to pending state so that
+  // they can get picked up again on the next round of flush.
+  void RollbackMemtableFlush(const autovector<MemTable*>& mems,
+                             uint64_t file_number,
+                             std::set<uint64_t>* pending_outputs);
+
+  // Commit a successful flush in the manifest file
+  Status InstallMemtableFlushResults(ColumnFamilyData* cfd,
+                                     const autovector<MemTable*>& m,
+                                     VersionSet* vset, port::Mutex* mu,
+                                     Logger* info_log, uint64_t file_number,
+                                     std::set<uint64_t>& pending_outputs,
+                                     autovector<MemTable*>* to_delete,
+                                     Directory* db_directory,
+                                     LogBuffer* log_buffer);
+
+  // New memtables are inserted at the front of the list.
+  // Takes ownership of the referenced held on *m by the caller of Add().
+  void Add(MemTable* m);
+
+  // Returns an estimate of the number of bytes of data in use.
+  size_t ApproximateMemoryUsage();
+
+  // Request a flush of all existing memtables to storage
+  void FlushRequested() { flush_requested_ = true; }
+
+  // NOTE(review): copy control is commented out, so the implicitly-generated
+  // copy members are in effect; the comment below likely meant "No copying".
+  // Copying allowed
+  // MemTableList(const MemTableList&);
+  // void operator=(const MemTableList&);
+
+ private:
+  // DB mutex held
+  void InstallNewVersion();
+
+  int min_write_buffer_number_to_merge_;
+
+  MemTableListVersion* current_;
+
+  // the number of elements that still need flushing
+  int num_flush_not_started_;
+
+  // committing in progress
+  bool commit_in_progress_;
+
+  // Requested a flush of all memtables to storage
+  bool flush_requested_;
+
+};
+
+}  // namespace rocksdb
diff --git a/db/merge_context.h b/db/merge_context.h
new file mode 100644 (file)
index 0000000..bf483a8
--- /dev/null
@@ -0,0 +1,69 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include <string>
+#include <deque>
+
+namespace rocksdb {
+
+// Shared empty list returned by MergeContext::GetOperands() when no operand
+// has been pushed.  (`const` at namespace scope has internal linkage, so each
+// translation unit including this header gets its own copy.)
+const std::deque<std::string> empty_operand_list;
+
+// The merge context for merging a user key.
+// When doing a Get(), DB will create such a class and pass it when
+// issuing Get() operation to memtables and version_set. The operands
+// will be fetched from the context when issuing partial or full merge.
+// Operands are kept newest-first: PushOperand() pushes to the front,
+// so GetOperand(0) is the most recently pushed operand.
+class MergeContext {
+public:
+  // Clear all the operands
+  void Clear() {
+    if (operand_list) {
+      operand_list->clear();
+    }
+  }
+  // Replace all operands with merge_result, which are expected to be the
+  // merge result of them.  Leaves merge_result in a moved-from state.
+  void PushPartialMergeResult(std::string& merge_result) {
+    assert (operand_list);
+    operand_list->clear();
+    operand_list->push_front(std::move(merge_result));
+  }
+  // Push a merge operand
+  void PushOperand(const Slice& operand_slice) {
+    Initialize();
+    operand_list->push_front(operand_slice.ToString());
+  }
+  // return total number of operands in the list
+  size_t GetNumOperands() const {
+    if (!operand_list) {
+      return 0;
+    }
+    return operand_list->size();
+  }
+  // Get the operand at the index.
+  // REQUIRES: 0 <= index < GetNumOperands().  The returned Slice references
+  // storage owned by this context.
+  Slice GetOperand(int index) const {
+    assert (operand_list);
+    return (*operand_list)[index];
+  }
+  // Return all the operands.
+  const std::deque<std::string>& GetOperands() const {
+    if (!operand_list) {
+      return empty_operand_list;
+    }
+    return *operand_list;
+  }
+private:
+  // Lazily allocate the operand deque on first use.
+  void Initialize() {
+    if (!operand_list) {
+      operand_list.reset(new std::deque<std::string>());
+    }
+  }
+  std::unique_ptr<std::deque<std::string>> operand_list;
+};
+
+} // namespace rocksdb
+
diff --git a/db/merge_helper.cc b/db/merge_helper.cc
new file mode 100644 (file)
index 0000000..0e36f6a
--- /dev/null
@@ -0,0 +1,209 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "merge_helper.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "util/statistics.h"
+#include <string>
+#include <stdio.h>
+
+namespace rocksdb {
+
+// PRE:  iter points to the first merge type entry
+// POST: iter points to the first entry beyond the merge process (or the end)
+//       keys_, operands_ are updated to reflect the merge result.
+//       keys_ stores the list of keys encountered while merging.
+//       operands_ stores the list of merge operands encountered while merging.
+//       keys_[i] corresponds to operands_[i] for each i.
+//
+// stop_before == 0 means "no snapshot restriction" (valid sequence numbers
+// are > 0). If steps is non-null it is incremented once per iterator advance,
+// letting the caller account for entries consumed here.
+void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
+                             bool at_bottom, Statistics* stats, int* steps) {
+  // Get a copy of the internal key, before it's invalidated by iter->Next()
+  // Also maintain the list of merge operands seen.
+  keys_.clear();
+  operands_.clear();
+  keys_.push_front(iter->key().ToString());
+  operands_.push_front(iter->value().ToString());
+
+  success_ = false;   // Will become true if we hit Put/Delete or bottom
+
+  // We need to parse the internal key again as the parsed key is
+  // backed by the internal key!
+  // Assume no internal key corruption as it has been successfully parsed
+  // by the caller.
+  // Invariant: keys_.back() will not change. Hence, orig_ikey is always valid.
+  ParsedInternalKey orig_ikey;
+  ParseInternalKey(keys_.back(), &orig_ikey);
+
+  bool hit_the_next_user_key = false;
+  std::string merge_result;  // Temporary value for merge results
+  if (steps) {
+    ++(*steps);
+  }
+  for (iter->Next(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey ikey;
+    assert(operands_.size() >= 1);        // Should be invariants!
+    assert(keys_.size() == operands_.size());
+
+    if (!ParseInternalKey(iter->key(), &ikey)) {
+      // stop at corrupted key
+      if (assert_valid_internal_key_) {
+        assert(!"corrupted internal key is not expected");
+      }
+      break;
+    }
+
+    if (user_comparator_->Compare(ikey.user_key, orig_ikey.user_key) != 0) {
+      // hit a different user key, stop right here
+      hit_the_next_user_key = true;
+      break;
+    }
+
+    // stop_before == 0 disables this check entirely.
+    if (stop_before && ikey.sequence <= stop_before) {
+      // hit an entry that's visible by the previous snapshot, can't touch that
+      break;
+    }
+
+    // At this point we are guaranteed that we need to process this key.
+
+    if (kTypeDeletion == ikey.type) {
+      // hit a delete
+      //   => merge nullptr with operands_
+      //   => store result in operands_.back() (and update keys_.back())
+      //   => change the entry type to kTypeValue for keys_.back()
+      // We are done! Return a success if the merge passes.
+      success_ = user_merge_operator_->FullMerge(ikey.user_key, nullptr,
+                                                 operands_, &merge_result,
+                                                 logger_);
+
+      // We store the result in keys_.back() and operands_.back()
+      // if nothing went wrong (i.e.: no operand corruption on disk)
+      if (success_) {
+        std::string& key = keys_.back();  // The original key encountered
+        orig_ikey.type = kTypeValue;
+        UpdateInternalKey(&key[0], key.size(),
+                          orig_ikey.sequence, orig_ikey.type);
+        // swap avoids copying the merged value into the deque slot.
+        swap(operands_.back(), merge_result);
+      } else {
+        RecordTick(stats, NUMBER_MERGE_FAILURES);
+      }
+
+      // move iter to the next entry (before doing anything else)
+      iter->Next();
+      if (steps) {
+        ++(*steps);
+      }
+      return;
+    }
+
+    if (kTypeValue == ikey.type) {
+      // hit a put
+      //   => merge the put value with operands_
+      //   => store result in operands_.back() (and update keys_.back())
+      //   => change the entry type to kTypeValue for keys_.back()
+      // We are done! Success!
+      const Slice value = iter->value();
+      success_ = user_merge_operator_->FullMerge(ikey.user_key, &value,
+                                                 operands_, &merge_result,
+                                                 logger_);
+
+      // We store the result in keys_.back() and operands_.back()
+      // if nothing went wrong (i.e.: no operand corruption on disk)
+      if (success_) {
+        std::string& key = keys_.back();  // The original key encountered
+        orig_ikey.type = kTypeValue;
+        UpdateInternalKey(&key[0], key.size(),
+                          orig_ikey.sequence, orig_ikey.type);
+        // swap avoids copying the merged value into the deque slot.
+        swap(operands_.back(), merge_result);
+      } else {
+        RecordTick(stats, NUMBER_MERGE_FAILURES);
+      }
+
+      // move iter to the next entry
+      iter->Next();
+      if (steps) {
+        ++(*steps);
+      }
+      return;
+    }
+
+    if (kTypeMerge == ikey.type) {
+      // hit a merge
+      //   => merge the operand into the front of the operands_ list
+      //   => use the user's associative merge function to determine how.
+      //   => then continue because we haven't yet seen a Put/Delete.
+      assert(!operands_.empty()); // Should have at least one element in it
+
+      // keep queuing keys and operands until we either meet a put / delete
+      // request or later did a partial merge.
+      keys_.push_front(iter->key().ToString());
+      operands_.push_front(iter->value().ToString());
+      if (steps) {
+        ++(*steps);
+      }
+    }
+  }
+
+  // We are sure we have seen this key's entire history if we are at the
+  // last level and exhausted all internal keys of this user key.
+  // NOTE: !iter->Valid() does not necessarily mean we hit the
+  // beginning of a user key, as versions of a user key might be
+  // split into multiple files (even files on the same level)
+  // and some files might not be included in the compaction/merge.
+  //
+  // There are also cases where we have seen the root of history of this
+  // key without being sure of it. Then, we simply miss the opportunity
+  // to combine the keys. Since VersionSet::SetupOtherInputs() always makes
+  // sure that all merge-operands on the same level get compacted together,
+  // this will simply lead to these merge operands moving to the next level.
+  //
+  // So, we only perform the following logic (to merge all operands together
+  // without a Put/Delete) if we are certain that we have seen the end of key.
+  bool surely_seen_the_beginning = hit_the_next_user_key && at_bottom;
+  if (surely_seen_the_beginning) {
+    // do a final merge with nullptr as the existing value and say
+    // bye to the merge type (it's now converted to a Put)
+    assert(kTypeMerge == orig_ikey.type);
+    assert(operands_.size() >= 1);
+    assert(operands_.size() == keys_.size());
+    success_ = user_merge_operator_->FullMerge(orig_ikey.user_key, nullptr,
+                                               operands_, &merge_result,
+                                               logger_);
+
+    if (success_) {
+      std::string& key = keys_.back();  // The original key encountered
+      orig_ikey.type = kTypeValue;
+      UpdateInternalKey(&key[0], key.size(),
+                        orig_ikey.sequence, orig_ikey.type);
+
+      // The final value() is always stored in operands_.back()
+      swap(operands_.back(),merge_result);
+    } else {
+      RecordTick(stats, NUMBER_MERGE_FAILURES);
+      // Do nothing if not success_. Leave keys() and operands() as they are.
+    }
+  } else {
+    // We haven't seen the beginning of the key nor a Put/Delete.
+    // Attempt to use the user's associative merge function to
+    // merge the stacked merge operands into a single operand.
+
+    // Partial merge only runs with at least two operands and only once the
+    // configured threshold is met. On success, all operands collapse into
+    // one; keys_ keeps only its last element (the first key encountered,
+    // i.e. the one with the largest sequence number), preserving the
+    // keys_/operands_ parallel-list invariant.
+    if (operands_.size() >= 2 &&
+        operands_.size() >= min_partial_merge_operands_ &&
+        user_merge_operator_->PartialMergeMulti(
+            orig_ikey.user_key,
+            std::deque<Slice>(operands_.begin(), operands_.end()),
+            &merge_result, logger_)) {
+      // Merging of operands (associative merge) was successful.
+      // Replace operands with the merge result
+      operands_.clear();
+      operands_.push_front(std::move(merge_result));
+      keys_.erase(keys_.begin(), keys_.end() - 1);
+    }
+  }
+}
+
+} // namespace rocksdb
diff --git a/db/merge_helper.h b/db/merge_helper.h
new file mode 100644 (file)
index 0000000..fef153e
--- /dev/null
@@ -0,0 +1,105 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef MERGE_HELPER_H
+#define MERGE_HELPER_H
+
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include <string>
+#include <deque>
+
+namespace rocksdb {
+
+class Comparator;
+class Iterator;
+class Logger;
+class MergeOperator;
+class Statistics;
+
+class MergeHelper {
+ public:
+  MergeHelper(const Comparator* user_comparator,
+              const MergeOperator* user_merge_operator, Logger* logger,
+              unsigned min_partial_merge_operands,
+              bool assert_valid_internal_key)
+      : user_comparator_(user_comparator),
+        user_merge_operator_(user_merge_operator),
+        logger_(logger),
+        min_partial_merge_operands_(min_partial_merge_operands),
+        assert_valid_internal_key_(assert_valid_internal_key),
+        keys_(),
+        operands_(),
+        success_(false) {}
+
+  // Merge entries until we hit
+  //     - a corrupted key
+  //     - a Put/Delete,
+  //     - a different user key,
+  //     - a specific sequence number (snapshot boundary),
+  //  or - the end of iteration
+  // iter: (IN)  points to the first merge type entry
+  //       (OUT) points to the first entry not included in the merge process
+  // stop_before: (IN) a sequence number that merge should not cross.
+  //                   0 means no restriction
+  // at_bottom:   (IN) true if the iterator covers the bottem level, which means
+  //                   we could reach the start of the history of this user key.
+  void MergeUntil(Iterator* iter, SequenceNumber stop_before = 0,
+                  bool at_bottom = false, Statistics* stats = nullptr,
+                  int* steps = nullptr);
+
+  // Query the merge result
+  // These are valid until the next MergeUntil call
+  // If the merging was successful:
+  //   - IsSuccess() will be true
+  //   - key() will have the latest sequence number of the merges.
+  //           The type will be Put or Merge. See IMPORTANT 1 note, below.
+  //   - value() will be the result of merging all the operands together
+  //   - The user should ignore keys() and values().
+  //
+  //   IMPORTANT 1: the key type could change after the MergeUntil call.
+  //        Put/Delete + Merge + ... + Merge => Put
+  //        Merge + ... + Merge => Merge
+  //
+  // If the merge operator is not associative, and if a Put/Delete is not found
+  // then the merging will be unsuccessful. In this case:
+  //   - IsSuccess() will be false
+  //   - keys() contains the list of internal keys seen in order of iteration.
+  //   - values() contains the list of values (merges) seen in the same order.
+  //              values() is parallel to keys() so that the first entry in
+  //              keys() is the key associated with the first entry in values()
+  //              and so on. These lists will be the same length.
+  //              All of these pairs will be merges over the same user key.
+  //              See IMPORTANT 2 note below.
+  //   - The user should ignore key() and value().
+  //
+  //   IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
+  //                So keys().back() was the first key seen by iterator.
+  // TODO: Re-style this comment to be like the first one
+  bool IsSuccess() { return success_; }
+  Slice key() { assert(success_); return Slice(keys_.back()); }
+  Slice value() { assert(success_); return Slice(operands_.back()); }
+  const std::deque<std::string>& keys() { assert(!success_); return keys_; }
+  const std::deque<std::string>& values() {
+    assert(!success_); return operands_;
+  }
+
+ private:
+  const Comparator* user_comparator_;
+  const MergeOperator* user_merge_operator_;
+  Logger* logger_;
+  unsigned min_partial_merge_operands_;
+  bool assert_valid_internal_key_; // enforce no internal key corruption?
+
+  // the scratch area that holds the result of MergeUntil
+  // valid up to the next MergeUntil call
+  std::deque<std::string> keys_;    // Keeps track of the sequence of keys seen
+  std::deque<std::string> operands_;  // Parallel with keys_; stores the values
+  bool success_;
+};
+
+} // namespace rocksdb
+
+#endif
diff --git a/db/merge_operator.cc b/db/merge_operator.cc
new file mode 100644 (file)
index 0000000..a14df8a
--- /dev/null
@@ -0,0 +1,77 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+/**
+ * Back-end implementation details specific to the Merge Operator.
+ */
+
+#include "rocksdb/merge_operator.h"
+
+namespace rocksdb {
+
+// The default implementation of PartialMergeMulti, which invokes
+// PartialMerge multiple times internally and merges two operands at
+// a time, folding left-to-right:
+//   result = PartialMerge(...PartialMerge(op[0], op[1])..., op[n-1])
+bool MergeOperator::PartialMergeMulti(const Slice& key,
+                                      const std::deque<Slice>& operand_list,
+                                      std::string* new_value,
+                                      Logger* logger) const {
+  assert(operand_list.size() >= 2);
+  // Simply loop through the operands
+  std::string temp_value;
+  Slice temp_slice(operand_list[0]);
+
+  for (size_t i = 1; i < operand_list.size(); ++i) {
+    auto& operand = operand_list[i];
+    if (!PartialMerge(key, temp_slice, operand, &temp_value, logger)) {
+      // Any single failure aborts the whole multi-merge.
+      return false;
+    }
+    // Move the intermediate result into *new_value (swap avoids a copy) and
+    // re-point temp_slice at it for the next round. temp_slice must be
+    // refreshed AFTER the swap, since the swap relocates the string buffer
+    // it would otherwise reference.
+    swap(temp_value, *new_value);
+    temp_slice = Slice(*new_value);
+  }
+
+  // The result will be in *new_value. All merges succeeded.
+  return true;
+}
+
+// Given a "real" merge from the library, call the user's
+// associative merge function one-by-one on each of the operands.
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::FullMerge(
+    const Slice& key,
+    const Slice* existing_value,
+    const std::deque<std::string>& operand_list,
+    std::string* new_value,
+    Logger* logger) const {
+
+  // Simply loop through the operands, folding each one into the running
+  // "existing value" with the user's binary Merge(). existing_value may be
+  // nullptr on the first round (no prior value for this key).
+  Slice temp_existing;
+  std::string temp_value;
+  for (const auto& operand : operand_list) {
+    Slice value(operand);
+    if (!Merge(key, existing_value, value, &temp_value, logger)) {
+      return false;
+    }
+    // Publish the intermediate result into *new_value and make it the
+    // existing value for the next round. temp_existing must be refreshed
+    // AFTER the swap, since the swap relocates the string buffer.
+    swap(temp_value, *new_value);
+    temp_existing = Slice(*new_value);
+    existing_value = &temp_existing;
+  }
+
+  // The result will be in *new_value. All merges succeeded.
+  return true;
+}
+
+// Call the user defined simple merge on the operands;
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::PartialMerge(
+    const Slice& key,
+    const Slice& left_operand,
+    const Slice& right_operand,
+    std::string* new_value,
+    Logger* logger) const {
+  // For an associative operator, the user's single Merge() can combine any
+  // two operands, so partial merge is just Merge() with the left operand
+  // standing in for the existing value.
+  return Merge(key, &left_operand, right_operand, new_value, logger);
+}
+
+} // namespace rocksdb
diff --git a/db/merge_test.cc b/db/merge_test.cc
new file mode 100644 (file)
index 0000000..9bdf543
--- /dev/null
@@ -0,0 +1,472 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <assert.h>
+#include <memory>
+#include <iostream>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/write_batch_internal.h"
+#include "utilities/merge_operators.h"
+#include "util/testharness.h"
+#include "utilities/db_ttl.h"
+
+using namespace std;
+using namespace rocksdb;
+
+namespace {
+  // Global call counters used by the tests below to verify exactly when the
+  // merge operator is invoked (memtable-time folding vs compaction-time
+  // full/partial merges). Reset before each measured section.
+  int numMergeOperatorCalls;
+  void resetNumMergeOperatorCalls() {
+    numMergeOperatorCalls = 0;
+  }
+
+  int num_partial_merge_calls;
+  void resetNumPartialMergeCalls() {
+    num_partial_merge_calls = 0;
+  }
+}
+
+class CountMergeOperator : public AssociativeMergeOperator {
+ public:
+  CountMergeOperator() {
+    mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
+  }
+
+  virtual bool Merge(const Slice& key,
+                     const Slice* existing_value,
+                     const Slice& value,
+                     std::string* new_value,
+                     Logger* logger) const override {
+    ++numMergeOperatorCalls;
+    if (existing_value == nullptr) {
+      new_value->assign(value.data(), value.size());
+      return true;
+    }
+
+    return mergeOperator_->PartialMerge(
+        key,
+        *existing_value,
+        value,
+        new_value,
+        logger);
+  }
+
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value, Logger* logger) const {
+    ++num_partial_merge_calls;
+    return mergeOperator_->PartialMergeMulti(key, operand_list, new_value,
+                                             logger);
+  }
+
+  virtual const char* Name() const override {
+    return "UInt64AddOperator";
+  }
+
+ private:
+  std::shared_ptr<MergeOperator> mergeOperator_;
+};
+
+namespace {
+std::shared_ptr<DB> OpenDb(const string& dbname, const bool ttl = false,
+                           const size_t max_successive_merges = 0,
+                           const uint32_t min_partial_merge_operands = 2) {
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = std::make_shared<CountMergeOperator>();
+  options.max_successive_merges = max_successive_merges;
+  options.min_partial_merge_operands = min_partial_merge_operands;
+  Status s;
+  DestroyDB(dbname, Options());
+  if (ttl) {
+    cout << "Opening database with TTL\n";
+    DBWithTTL* db_with_ttl;
+    s = DBWithTTL::Open(options, dbname, &db_with_ttl);
+    db = db_with_ttl;
+  } else {
+    s = DB::Open(options, dbname, &db);
+  }
+  if (!s.ok()) {
+    cerr << s.ToString() << endl;
+    assert(false);
+  }
+  return std::shared_ptr<DB>(db);
+}
+}  // namespace
+
+// Imagine we are maintaining a set of uint64 counters.
+// Each counter has a distinct name. And we would like
+// to support four high level operations:
+// set, add, get and remove
+// This is a quick implementation without a Merge operation.
+class Counters {
+
+ protected:
+  std::shared_ptr<DB> db_;
+
+  WriteOptions put_option_;
+  ReadOptions get_option_;
+  WriteOptions delete_option_;
+
+  uint64_t default_;
+
+ public:
+  explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+      : db_(db),
+        put_option_(),
+        get_option_(),
+        delete_option_(),
+        default_(defaultCount) {
+    assert(db_);
+  }
+
+  virtual ~Counters() {}
+
+  // public interface of Counters.
+  // All four functions return false
+  // if the underlying level db operation failed.
+
+  // mapped to a levedb Put
+  bool set(const string& key, uint64_t value) {
+    // just treat the internal rep of int64 as the string
+    Slice slice((char *)&value, sizeof(value));
+    auto s = db_->Put(put_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      cerr << s.ToString() << endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Delete
+  bool remove(const string& key) {
+    auto s = db_->Delete(delete_option_, key);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Get
+  bool get(const string& key, uint64_t *value) {
+    string str;
+    auto s = db_->Get(get_option_, key, &str);
+
+    if (s.IsNotFound()) {
+      // return default value if not found;
+      *value = default_;
+      return true;
+    } else if (s.ok()) {
+      // deserialization
+      if (str.size() != sizeof(uint64_t)) {
+        cerr << "value corruption\n";
+        return false;
+      }
+      *value = DecodeFixed64(&str[0]);
+      return true;
+    } else {
+      cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // 'add' is implemented as get -> modify -> set
+  // An alternative is a single merge operation, see MergeBasedCounters
+  virtual bool add(const string& key, uint64_t value) {
+    uint64_t base = default_;
+    return get(key, &base) && set(key, base + value);
+  }
+
+
+  // convenience functions for testing
+  void assert_set(const string& key, uint64_t value) {
+    assert(set(key, value));
+  }
+
+  void assert_remove(const string& key) {
+    assert(remove(key));
+  }
+
+  uint64_t assert_get(const string& key) {
+    uint64_t value = default_;
+    int result = get(key, &value);
+    assert(result);
+    if (result == 0) exit(1); // Disable unused variable warning.
+    return value;
+  }
+
+  void assert_add(const string& key, uint64_t value) {
+    int result = add(key, value);
+    assert(result);
+    if (result == 0) exit(1); // Disable unused variable warning. 
+  }
+};
+
+// Implement 'add' directly with the new Merge operation
+class MergeBasedCounters : public Counters {
+ private:
+  WriteOptions merge_option_; // for merge
+
+ public:
+  explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+      : Counters(db, defaultCount),
+        merge_option_() {
+  }
+
+  // mapped to a rocksdb Merge operation
+  virtual bool add(const string& key, uint64_t value) override {
+    char encoded[sizeof(uint64_t)];
+    EncodeFixed64(encoded, value);
+    Slice slice(encoded, sizeof(uint64_t));
+    auto s = db_->Merge(merge_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      cerr << s.ToString() << endl;
+      return false;
+    }
+  }
+};
+
+namespace {
+// Print every (key, decoded uint64 value) pair in db to stdout.
+void dumpDb(DB* db) {
+  unique_ptr<Iterator> iter(db->NewIterator(ReadOptions()));
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    const uint64_t decoded = DecodeFixed64(iter->value().data());
+    cout << iter->key().ToString() << ": "  << decoded << endl;
+    iter->Next();
+  }
+  assert(iter->status().ok());  // Check for any errors found during the scan
+}
+
+// Exercise the Counters interface end to end (set / get / remove / add),
+// optionally flushing and compacting between steps when test_compaction is
+// true so results are checked against SST files as well as the memtable.
+void testCounters(Counters& counters, DB* db, bool test_compaction) {
+
+  FlushOptions o;
+  o.wait = true;
+
+  counters.assert_set("a", 1);
+
+  if (test_compaction) db->Flush(o);
+
+  assert(counters.assert_get("a") == 1);
+
+  counters.assert_remove("b");
+
+  // default value is 0 if non-existent
+  assert(counters.assert_get("b") == 0);
+
+  counters.assert_add("a", 2);
+
+  if (test_compaction) db->Flush(o);
+
+  // 1+2 = 3
+  assert(counters.assert_get("a")== 3);
+
+  dumpDb(db);
+
+  std::cout << "1\n";
+
+  // 1+...+49 = ?
+  uint64_t sum = 0;
+  for (int i = 1; i < 50; i++) {
+    counters.assert_add("b", i);
+    sum += i;
+  }
+  assert(counters.assert_get("b") == sum);
+
+  std::cout << "2\n";
+  dumpDb(db);
+
+  std::cout << "3\n";
+
+  if (test_compaction) {
+    db->Flush(o);
+
+    cout << "Compaction started ...\n";
+    db->CompactRange(nullptr, nullptr);
+    cout << "Compaction ended\n";
+
+    dumpDb(db);
+
+    // Values must survive flush + full compaction unchanged.
+    assert(counters.assert_get("a")== 3);
+    assert(counters.assert_get("b") == sum);
+  }
+}
+
+// Verify the max_successive_merges memtable optimization: stacked merges for
+// one key are folded eagerly on every (max_num_merges + 1)-th write, and
+// Get() only has to merge the remainder accumulated since the last fold.
+void testSuccessiveMerge(
+    Counters& counters, int max_num_merges, int num_merges) {
+
+  counters.assert_remove("z");
+  uint64_t sum = 0;
+
+  for (int i = 1; i <= num_merges; ++i) {
+    resetNumMergeOperatorCalls();
+    counters.assert_add("z", i);
+    sum += i;
+
+    // Folding fires exactly when the stack reaches max_num_merges + 1.
+    if (i % (max_num_merges + 1) == 0) {
+      assert(numMergeOperatorCalls == max_num_merges + 1);
+    } else {
+      assert(numMergeOperatorCalls == 0);
+    }
+
+    resetNumMergeOperatorCalls();
+    assert(counters.assert_get("z") == sum);
+    // Get() merges only the operands left unfolded in the memtable.
+    assert(numMergeOperatorCalls == i % (max_num_merges + 1));
+  }
+}
+
+// Verify when PartialMergeMulti runs during compaction: it should fire
+// exactly once iff the number of stacked operands is within
+// [min_merge, max_merge], and never once a Put terminates the stack.
+void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge,
+                      int count) {
+  FlushOptions o;
+  o.wait = true;
+
+  // Test case 1: partial merge should be called when the number of merge
+  //              operands exceeds the threshold.
+  uint64_t tmp_sum = 0;
+  resetNumPartialMergeCalls();
+  for (int i = 1; i <= count; i++) {
+    counters->assert_add("b", i);
+    tmp_sum += i;
+  }
+  db->Flush(o);
+  db->CompactRange(nullptr, nullptr);
+  ASSERT_EQ(tmp_sum, counters->assert_get("b"));
+  if (count > max_merge) {
+    // in this case, FullMerge should be called instead.
+    ASSERT_EQ(num_partial_merge_calls, 0);
+  } else {
+    // if count >= min_merge, then partial merge should be called once.
+    ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1));
+  }
+
+  // Test case 2: partial merge should not be called when a put is found.
+  resetNumPartialMergeCalls();
+  tmp_sum = 0;
+  db->Put(rocksdb::WriteOptions(), "c", "10");
+  for (int i = 1; i <= count; i++) {
+    counters->assert_add("c", i);
+    tmp_sum += i;
+  }
+  db->Flush(o);
+  db->CompactRange(nullptr, nullptr);
+  ASSERT_EQ(tmp_sum, counters->assert_get("c"));
+  ASSERT_EQ(num_partial_merge_calls, 0);
+}
+
+// Apply many Merge operations for a single key inside one WriteBatch and
+// check the in-memtable folding count, then check that Get() merges only
+// the unfolded remainder.
+void testSingleBatchSuccessiveMerge(
+    DB* db,
+    int max_num_merges,
+    int num_merges) {
+  assert(num_merges > max_num_merges);
+
+  Slice key("BatchSuccessiveMerge");
+  uint64_t merge_value = 1;
+  Slice merge_value_slice((char *)&merge_value, sizeof(merge_value));
+
+  // Create the batch
+  WriteBatch batch;
+  for (int i = 0; i < num_merges; ++i) {
+    batch.Merge(key, merge_value_slice);
+  }
+
+  // Apply to memtable and count the number of merges
+  resetNumMergeOperatorCalls();
+  {
+    Status s = db->Write(WriteOptions(), &batch);
+    assert(s.ok());
+  }
+  // Merges are folded in groups of (max_num_merges + 1); only the remainder
+  // stays unfolded in the memtable.
+  assert(numMergeOperatorCalls ==
+      num_merges - (num_merges % (max_num_merges + 1)));
+
+  // Get the value
+  resetNumMergeOperatorCalls();
+  string get_value_str;
+  {
+    Status s = db->Get(ReadOptions(), key, &get_value_str);
+    assert(s.ok());
+  }
+  assert(get_value_str.size() == sizeof(uint64_t));
+  uint64_t get_value = DecodeFixed64(&get_value_str[0]);
+  ASSERT_EQ(get_value, num_merges * merge_value);
+  ASSERT_EQ(numMergeOperatorCalls, (num_merges % (max_num_merges + 1)));
+}
+
+// Test driver: runs the counter tests against a plain (or TTL) DB, then
+// exercises in-memtable successive merging and partial merge across a range
+// of min/max operand thresholds. Passing argc > 1 enables the compaction
+// phases of the later testCounters runs.
+void runTest(int argc, const string& dbname, const bool use_ttl = false) {
+  auto db = OpenDb(dbname, use_ttl);
+
+  {
+    cout << "Test read-modify-write counters... \n";
+    Counters counters(db, 0);
+    testCounters(counters, db.get(), true);
+  }
+
+  bool compact = false;
+  if (argc > 1) {
+    compact = true;
+    cout << "Turn on Compaction\n";
+  }
+
+  {
+    cout << "Test merge-based counters... \n";
+    MergeBasedCounters counters(db, 0);
+    testCounters(counters, db.get(), compact);
+  }
+
+  // NOTE(review): DestroyDB runs while the handle is still open and is only
+  // released just after -- presumably benign for this test; confirm.
+  // The 'db' locals in the scopes below intentionally shadow this one.
+  DestroyDB(dbname, Options());
+  db.reset();
+
+  {
+    cout << "Test merge in memtable... \n";
+    size_t max_merge = 5;
+    auto db = OpenDb(dbname, use_ttl, max_merge);
+    MergeBasedCounters counters(db, 0);
+    testCounters(counters, db.get(), compact);
+    testSuccessiveMerge(counters, max_merge, max_merge * 2);
+    testSingleBatchSuccessiveMerge(db.get(), 5, 7);
+    DestroyDB(dbname, Options());
+  }
+
+  {
+    cout << "Test Partial-Merge\n";
+    size_t max_merge = 100;
+    // Sweep operand counts just below, at, and just above each threshold.
+    for (uint32_t min_merge = 5; min_merge < 25; min_merge += 5) {
+      for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) {
+        auto db = OpenDb(dbname, use_ttl, max_merge, min_merge);
+        MergeBasedCounters counters(db, 0);
+        testPartialMerge(&counters, db.get(), max_merge, min_merge, count);
+        DestroyDB(dbname, Options());
+      }
+      {
+        auto db = OpenDb(dbname, use_ttl, max_merge, min_merge);
+        MergeBasedCounters counters(db, 0);
+        testPartialMerge(&counters, db.get(), max_merge, min_merge,
+                         min_merge * 10);
+        DestroyDB(dbname, Options());
+      }
+    }
+  }
+}
+}  // namespace
+
+int main(int argc, char *argv[]) {
+  //TODO: Make this test like a general rocksdb unit-test
+  // Any extra command-line argument enables the compaction phases inside
+  // runTest (only argc is consulted; argv contents are unused).
+  runTest(argc, test::TmpDir() + "/merge_testdb");
+  runTest(argc, test::TmpDir() + "/merge_testdbttl", true); // Run test on TTL database
+  printf("Passed all tests!\n");
+  return 0;
+}
diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc
new file mode 100644 (file)
index 0000000..a182fb5
--- /dev/null
@@ -0,0 +1,358 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <algorithm>
+#include <iostream>
+#include <vector>
+#include "/usr/include/valgrind/callgrind.h"
+
+#include "rocksdb/db.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/memtablerep.h"
+#include "util/histogram.h"
+#include "util/stop_watch.h"
+#include "util/testharness.h"
+
+
+// Test configuration knobs, overridable via command-line flags parsed in
+// main() below.
+bool FLAGS_random_key = false;  // shuffle key insertion order when true
+// NOTE(review): "memetable" is a typo for "memtable", but it matches the
+// "--use_set_based_memetable" flag string parsed in main(); renaming would
+// silently break existing invocations.
+bool FLAGS_use_set_based_memetable = false;
+int FLAGS_total_keys = 100;
+int FLAGS_write_buffer_size = 1000000000;
+int FLAGS_max_write_buffer_number = 8;
+int FLAGS_min_write_buffer_number_to_merge = 7;
+
+// Path to the database on file system
+const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test";
+
+namespace rocksdb {
+
+// Opens (creating if missing) the test DB at kDbName, configuring the write
+// buffers from the FLAGS_* globals. When FLAGS_use_set_based_memetable is
+// set, installs a HashSkipList memtable with a zero-length fixed-prefix
+// transform. Asserts on open failure; returns a shared_ptr owning the DB.
+std::shared_ptr<DB> OpenDb() {
+    DB* db;
+    Options options;
+    options.create_if_missing = true;
+    options.write_buffer_size = FLAGS_write_buffer_size;
+    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+    options.min_write_buffer_number_to_merge =
+      FLAGS_min_write_buffer_number_to_merge;
+
+    if (FLAGS_use_set_based_memetable) {
+      // Zero-length prefix: every key shares one prefix bucket.
+      auto prefix_extractor = rocksdb::NewFixedPrefixTransform(0);
+      options.memtable_factory.reset(
+          NewHashSkipListRepFactory(prefix_extractor));
+    }
+
+    Status s = DB::Open(options, kDbName,  &db);
+    ASSERT_OK(s);
+    return std::shared_ptr<DB>(db);
+}
+
+class PerfContextTest { };
+
+// Writes FLAGS_total_keys keys, deletes all but the last one, then measures
+// user-key comparison counts and wall time for Get / SeekToFirst / Seek /
+// Next over the tombstone-heavy range, printing histograms to stdout.
+// (Fix: "uesr" -> "user" in the report strings.)
+TEST(PerfContextTest, SeekIntoDeletion) {
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    db->Put(write_options, key, value);
+  }
+
+  // Delete every key except the last; subsequent Gets must see NotFound.
+  for (int i = 0; i < FLAGS_total_keys -1 ; ++i) {
+    std::string key = "k" + std::to_string(i);
+    db->Delete(write_options, key);
+  }
+
+  HistogramImpl hist_get;
+  HistogramImpl hist_get_time;
+  for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value;
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    auto status = db->Get(read_options, key, &value);
+    auto elapsed_nanos = timer.ElapsedNanos();
+    ASSERT_TRUE(status.IsNotFound());
+    hist_get.Add(perf_context.user_key_comparison_count);
+    hist_get_time.Add(elapsed_nanos);
+  }
+
+  std::cout << "Get user key comparison: \n" << hist_get.ToString()
+            << "Get time: \n" << hist_get_time.ToString();
+
+  HistogramImpl hist_seek_to_first;
+  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+  perf_context.Reset();
+  StopWatchNano timer(Env::Default(), true);
+  iter->SeekToFirst();
+  hist_seek_to_first.Add(perf_context.user_key_comparison_count);
+  auto elapsed_nanos = timer.ElapsedNanos();
+
+  std::cout << "SeekToFirst user key comparison: \n" << hist_seek_to_first.ToString()
+            << "ikey skipped: " << perf_context.internal_key_skipped_count << "\n"
+            << "idelete skipped: " << perf_context.internal_delete_skipped_count << "\n"
+            << "elapsed: " << elapsed_nanos << "\n";
+
+  HistogramImpl hist_seek;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    // Fresh iterator per key so each Seek pays the full cost.
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    std::string key = "k" + std::to_string(i);
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    iter->Seek(key);
+    auto elapsed_nanos = timer.ElapsedNanos();
+    hist_seek.Add(perf_context.user_key_comparison_count);
+    std::cout << "seek cmp: " << perf_context.user_key_comparison_count
+              << " ikey skipped " << perf_context.internal_key_skipped_count
+              << " idelete skipped " << perf_context.internal_delete_skipped_count
+              << " elapsed: " << elapsed_nanos << "ns\n";
+
+    perf_context.Reset();
+    ASSERT_TRUE(iter->Valid());
+    StopWatchNano timer2(Env::Default(), true);
+    iter->Next();
+    auto elapsed_nanos2 = timer2.ElapsedNanos();
+    std::cout << "next cmp: " << perf_context.user_key_comparison_count
+              << "elapsed: " << elapsed_nanos2 << "ns\n";
+  }
+
+  std::cout << "Seek user key comparison: \n" << hist_seek.ToString();
+}
+
+// Measures the intrinsic cost of StopWatchNano itself: one million
+// back-to-back ElapsedNanos(reset) calls, histogrammed.
+TEST(PerfContextTest, StopWatchNanoOverhead) {
+  // profile the timer cost by itself!
+  const int kTotalIterations = 1000000;
+  std::vector<uint64_t> timings(kTotalIterations);
+
+  StopWatchNano timer(Env::Default(), true);
+  for (auto& timing : timings) {
+    timing = timer.ElapsedNanos(true /* reset */);
+  }
+
+  HistogramImpl histogram;
+  for (const auto timing : timings) {
+    histogram.Add(timing);
+  }
+
+  std::cout << histogram.ToString();
+}
+
+// Same as StopWatchNanoOverhead but for the microsecond StopWatch. Since
+// ElapsedMicros is cumulative (no reset), consecutive readings are
+// differenced before being added to the histogram.
+TEST(PerfContextTest, StopWatchOverhead) {
+  // profile the timer cost by itself!
+  const int kTotalIterations = 1000000;
+  std::vector<uint64_t> timings(kTotalIterations);
+
+  StopWatch timer(Env::Default());
+  for (auto& timing : timings) {
+    timing = timer.ElapsedMicros();
+  }
+
+  HistogramImpl histogram;
+  uint64_t prev_timing = 0;
+  for (const auto timing : timings) {
+    histogram.Add(timing - prev_timing);
+    prev_timing = timing;
+  }
+
+  std::cout << histogram.ToString();
+}
+
+// Inserts FLAGS_total_keys key/value pairs (optionally in shuffled order)
+// into a fresh DB and, for each Put and Get, records the perf_context
+// timers and user-key comparison counts into histograms, which are printed
+// to stdout. Called once per perf level by KeyComparisonCount.
+// (Fix: "uesr" -> "user" in the report strings.)
+void ProfileKeyComparison() {
+  DestroyDB(kDbName, Options());    // Start this test with a fresh DB
+
+  auto db = OpenDb();
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  HistogramImpl hist_put;
+  HistogramImpl hist_get;
+  HistogramImpl hist_get_snapshot;
+  HistogramImpl hist_get_memtable;
+  HistogramImpl hist_get_post_process;
+  HistogramImpl hist_num_memtable_checked;
+  HistogramImpl hist_write_pre_post;
+  HistogramImpl hist_write_wal_time;
+  HistogramImpl hist_write_memtable_time;
+
+  std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+
+  std::vector<int> keys;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    keys.push_back(i);
+  }
+
+  if (FLAGS_random_key) {
+    std::random_shuffle(keys.begin(), keys.end());
+  }
+
+  for (const int i : keys) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    perf_context.Reset();
+    db->Put(write_options, key, value);
+    hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time);
+    hist_write_wal_time.Add(perf_context.write_wal_time);
+    hist_write_memtable_time.Add(perf_context.write_memtable_time);
+    hist_put.Add(perf_context.user_key_comparison_count);
+
+    perf_context.Reset();
+    db->Get(read_options, key, &value);
+    hist_get_snapshot.Add(perf_context.get_snapshot_time);
+    hist_get_memtable.Add(perf_context.get_from_memtable_time);
+    hist_num_memtable_checked.Add(perf_context.get_from_memtable_count);
+    hist_get_post_process.Add(perf_context.get_post_process_time);
+    hist_get.Add(perf_context.user_key_comparison_count);
+  }
+
+  std::cout << "Put user key comparison: \n" << hist_put.ToString()
+            << "Get user key comparison: \n" << hist_get.ToString();
+  std::cout << "Put(): Pre and Post Process Time: \n"
+            << hist_write_pre_post.ToString()
+            << " Writing WAL time: \n"
+            << hist_write_wal_time.ToString() << "\n"
+            << " Writing Mem Table time: \n"
+            << hist_write_memtable_time.ToString() << "\n";
+
+  std::cout << "Get(): Time to get snapshot: \n"
+            << hist_get_snapshot.ToString()
+            << " Time to get value from memtables: \n"
+            << hist_get_memtable.ToString() << "\n"
+            << " Number of memtables checked: \n"
+            << hist_num_memtable_checked.ToString() << "\n"
+            << " Time to post process: \n"
+            << hist_get_post_process.ToString() << "\n";
+}
+
+// Runs the Put/Get profiling pass once under each perf level so their
+// reported numbers can be compared: counting only, disabled, and full
+// timing.
+TEST(PerfContextTest, KeyComparisonCount) {
+  SetPerfLevel(kEnableCount);
+  ProfileKeyComparison();
+
+  SetPerfLevel(kDisable);
+  ProfileKeyComparison();
+
+  SetPerfLevel(kEnableTime);
+  ProfileKeyComparison();
+}
+
+// make perf_context_test
+// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison
+// For one memtable:
+// ./perf_context_test --write_buffer_size=500000 --total_keys=10000
+// For two memtables:
+// ./perf_context_test --write_buffer_size=250000 --total_keys=10000
+// Specify --random_key=1 to shuffle the key before insertion
+// Results show that, for sequential insertion, worst-case Seek Key comparison
+// is close to the total number of keys (linear), when there is only one
+// memtable. When there are two memtables, even the avg Seek Key comparison
+// starts to become linear to the input size.
+
+// Inserts FLAGS_total_keys pairs (optionally shuffled), histogramming Put
+// latency against WAL-write time, then histograms user-key comparisons for
+// a fresh-iterator Seek per key and for a full forward scan via Next.
+TEST(PerfContextTest, SeekKeyComparison) {
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+
+  std::vector<int> keys;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    keys.push_back(i);
+  }
+
+  if (FLAGS_random_key) {
+    std::random_shuffle(keys.begin(), keys.end());
+  }
+
+  HistogramImpl hist_put_time;
+  HistogramImpl hist_wal_time;
+  HistogramImpl hist_time_diff;
+
+  SetPerfLevel(kEnableTime);
+  StopWatchNano timer(Env::Default());
+  for (const int i : keys) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    perf_context.Reset();
+    timer.Start();
+    db->Put(write_options, key, value);
+    auto put_time = timer.ElapsedNanos();
+    hist_put_time.Add(put_time);
+    hist_wal_time.Add(perf_context.write_wal_time);
+    // Time spent in Put outside of the WAL write.
+    hist_time_diff.Add(put_time - perf_context.write_wal_time);
+  }
+
+  std::cout << "Put time:\n" << hist_put_time.ToString()
+            << "WAL time:\n" << hist_wal_time.ToString()
+            << "time diff:\n" << hist_time_diff.ToString();
+
+  HistogramImpl hist_seek;
+  HistogramImpl hist_next;
+
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    // Fresh iterator per Seek so each one pays the full positioning cost.
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    perf_context.Reset();
+    iter->Seek(key);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->value().ToString(), value);
+    hist_seek.Add(perf_context.user_key_comparison_count);
+  }
+
+  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+  for (iter->SeekToFirst(); iter->Valid();) {
+    perf_context.Reset();
+    iter->Next();  // advances the loop; no separate increment clause
+    hist_next.Add(perf_context.user_key_comparison_count);
+  }
+
+  std::cout << "Seek:\n" << hist_seek.ToString()
+            << "Next:\n" << hist_next.ToString();
+}
+
+}
+
+// Entry point: parses "--flag=value" arguments into the FLAGS_* globals
+// (unrecognized arguments are silently ignored), prints the DB path, and
+// runs every registered test.
+int main(int argc, char** argv) {
+
+  for (int i = 1; i < argc; i++) {
+    int n;
+    char junk;
+
+    // sscanf == 1 means the integer matched and nothing trailed it.
+    if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
+      FLAGS_write_buffer_size = n;
+    }
+
+    if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) {
+      FLAGS_total_keys = n;
+    }
+
+    if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 &&
+        (n == 0 || n == 1)) {
+      FLAGS_random_key = n;
+    }
+
+    // Flag name intentionally matches the (misspelled) global it sets.
+    if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 &&
+        (n == 0 || n == 1)) {
+      FLAGS_use_set_based_memetable = n;
+    }
+
+  }
+
+  std::cout << kDbName << "\n";
+
+  rocksdb::test::RunAllTests();
+  return 0;
+}
diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc
new file mode 100644 (file)
index 0000000..517ef0a
--- /dev/null
@@ -0,0 +1,834 @@
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <algorithm>
+#include <set>
+
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/meta_blocks.h"
+#include "table/plain_table_factory.h"
+#include "table/plain_table_reader.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "utilities/merge_operators.h"
+
+using std::unique_ptr;
+
+namespace rocksdb {
+
+// Test fixture owning a plain-table DB in a temp directory. The DB is
+// destroyed on construction and destruction so every test starts clean.
+class PlainTableDBTest {
+ protected:
+ // NOTE(review): empty protected section, immediately overridden by
+ // private: below — likely leftover boilerplate.
+ private:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+
+  // Options used by the most recent (Try)Reopen; consumed by
+  // DestroyAndReopen so destruction matches how the DB was opened.
+  Options last_options_;
+
+ public:
+  PlainTableDBTest() : env_(Env::Default()) {
+    dbname_ = test::TmpDir() + "/plain_table_db_test";
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+    Reopen();
+  }
+
+  ~PlainTableDBTest() {
+    delete db_;
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+  }
+
+  // Return the current option configuration.
+  Options CurrentOptions() {
+    Options options;
+    options.table_factory.reset(NewPlainTableFactory(16, 2, 0.8, 3));
+    options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+    options.allow_mmap_reads = true;
+    return options;
+  }
+
+  // db_ is always opened via DB::Open, so the downcast is safe here.
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  // Reopen with the given options (or defaults); asserts success.
+  void Reopen(Options* options = nullptr) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Close() {
+    delete db_;
+    db_ = nullptr;
+  }
+
+  void DestroyAndReopen(Options* options = nullptr) {
+    //Destroy using last options
+    Destroy(&last_options_);
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Destroy(Options* options) {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, *options));
+  }
+
+  // Open into caller-supplied handle without touching db_/last_options_.
+  Status PureReopen(Options* options, DB** db) {
+    return DB::Open(*options, dbname_, db);
+  }
+
+  // Close and reopen db_, remembering the effective options.
+  Status TryReopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opts;
+    if (options != nullptr) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    last_options_ = opts;
+
+    return DB::Open(opts, dbname_, &db_);
+  }
+
+  Status Put(const Slice& k, const Slice& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  Status Delete(const std::string& k) {
+    return db_->Delete(WriteOptions(), k);
+  }
+
+  // Get helper: maps NotFound to "NOT_FOUND" and other errors to their
+  // string form so tests can ASSERT_EQ on the result.
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+
+  int NumTableFilesAtLevel(int level) {
+    std::string property;
+    ASSERT_TRUE(
+        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
+                         &property));
+    return atoi(property.c_str());
+  }
+
+  // Return spread of files per level
+  std::string FilesPerLevel() {
+    std::string result;
+    int last_non_zero_offset = 0;
+    for (int level = 0; level < db_->NumberLevels(); level++) {
+      int f = NumTableFilesAtLevel(level);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    // Trim trailing ",0" runs.
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+
+  // Renders an iterator's position as "key->value" or "(invalid)".
+  std::string IterStatus(Iterator* iter) {
+    std::string result;
+    if (iter->Valid()) {
+      result = iter->key().ToString() + "->" + iter->value().ToString();
+    } else {
+      result = "(invalid)";
+    }
+    return result;
+  }
+};
+
+// A freshly opened plain-table DB is usable and contains no keys.
+TEST(PlainTableDBTest, Empty) {
+  ASSERT_TRUE(dbfull() != nullptr);
+  ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
+}
+
+// PlainTableReader subclass used to verify bloom-filter behavior: every
+// MatchBloom call asserts that, when *expect_bloom_not_match_ is set by the
+// test, the bloom filter really does report a miss.
+class TestPlainTableReader : public PlainTableReader {
+ public:
+  TestPlainTableReader(const EnvOptions& storage_options,
+                       const InternalKeyComparator& icomparator,
+                       uint64_t file_size, int bloom_bits_per_key,
+                       double hash_table_ratio, size_t index_sparseness,
+                       const TableProperties* table_properties,
+                       unique_ptr<RandomAccessFile>&& file,
+                       const Options& options, bool* expect_bloom_not_match)
+      : PlainTableReader(options, std::move(file), storage_options, icomparator,
+                         file_size, bloom_bits_per_key, hash_table_ratio,
+                         index_sparseness, table_properties),
+        expect_bloom_not_match_(expect_bloom_not_match) {
+    // Build the in-memory index eagerly so reads work immediately.
+    Status s = PopulateIndex(const_cast<TableProperties*>(table_properties));
+    ASSERT_TRUE(s.ok());
+  }
+
+  virtual ~TestPlainTableReader() {}
+
+ private:
+  virtual bool MatchBloom(uint32_t hash) const override {
+    bool ret = PlainTableReader::MatchBloom(hash);
+    // If the test expects a bloom miss, a positive match is a failure.
+    ASSERT_TRUE(!*expect_bloom_not_match_ || !ret);
+    return ret;
+  }
+  bool* expect_bloom_not_match_;  // owned by the test, not this reader
+};
+
+extern const uint64_t kPlainTableMagicNumber;
+// Factory that builds TestPlainTableReader instances so tests can observe
+// bloom-filter matches via the shared expect_bloom_not_match flag.
+class TestPlainTableFactory : public PlainTableFactory {
+ public:
+  explicit TestPlainTableFactory(bool* expect_bloom_not_match,
+                                 uint32_t user_key_len =
+                                     kPlainTableVariableLength,
+                                 int bloom_bits_per_key = 0,
+                                 double hash_table_ratio = 0.75,
+                                 size_t index_sparseness = 16)
+      // NOTE(review): user_key_len and hash_table_ratio are each forwarded
+      // twice to the base constructor while bloom_bits_per_key and
+      // index_sparseness are not forwarded at all — verify against
+      // PlainTableFactory's parameter list; this looks like a copy/paste
+      // slip (harmless here only because NewTableReader below uses the
+      // members stored in this class, not the base's).
+      : PlainTableFactory(user_key_len, user_key_len, hash_table_ratio,
+                          hash_table_ratio),
+        bloom_bits_per_key_(bloom_bits_per_key),
+        hash_table_ratio_(hash_table_ratio),
+        index_sparseness_(index_sparseness),
+        expect_bloom_not_match_(expect_bloom_not_match) {}
+
+  // Reads the table's properties from the file footer, then hands them to
+  // a TestPlainTableReader built with this factory's stored parameters.
+  Status NewTableReader(const Options& options, const EnvOptions& soptions,
+                        const InternalKeyComparator& internal_comparator,
+                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                        unique_ptr<TableReader>* table) const override {
+    TableProperties* props = nullptr;
+    auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+                                 options.env, options.info_log.get(), &props);
+    ASSERT_TRUE(s.ok());
+
+    std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
+        soptions, internal_comparator, file_size, bloom_bits_per_key_,
+        hash_table_ratio_, index_sparseness_, props, std::move(file), options,
+        expect_bloom_not_match_));
+
+    *table = std::move(new_reader);
+    return s;
+  }
+
+ private:
+  int bloom_bits_per_key_;
+  double hash_table_ratio_;
+  size_t index_sparseness_;
+  bool* expect_bloom_not_match_;  // owned by the test
+};
+
+// Flushes a small memtable to a plain-table file under hash-based and
+// total-order indexes, with and without bloom bits, and checks the
+// user-collected table properties and readback values.
+TEST(PlainTableDBTest, Flush) {
+  // bloom_bits takes exactly the values 0 and 117.
+  for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+    for (int total_order = 0; total_order <= 1; total_order++) {
+      Options options = CurrentOptions();
+      options.create_if_missing = true;
+      // Set only one bucket to force bucket conflict.
+      // Test index interval for the same prefix to be 1, 2 and 4
+      if (total_order) {
+        options.table_factory.reset(
+            NewTotalOrderPlainTableFactory(16, bloom_bits, 2));
+      } else {
+        options.table_factory.reset(NewPlainTableFactory(16, bloom_bits));
+      }
+      DestroyAndReopen(&options);
+
+      ASSERT_OK(Put("1000000000000foo", "v1"));
+      ASSERT_OK(Put("0000000000000bar", "v2"));
+      ASSERT_OK(Put("1000000000000foo", "v3"));
+      dbfull()->TEST_FlushMemTable();
+
+      TablePropertiesCollection ptc;
+      reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+      ASSERT_EQ(1U, ptc.size());
+      auto row = ptc.begin();
+      auto tp = row->second;
+      // Index sizes differ between hash-based and total-order layouts.
+      ASSERT_EQ(
+          total_order ? "4" : "12",
+          (tp->user_collected_properties).at("plain_table_hash_table_size"));
+      ASSERT_EQ(
+          total_order ? "9" : "0",
+          (tp->user_collected_properties).at("plain_table_sub_index_size"));
+
+      ASSERT_EQ("v3", Get("1000000000000foo"));
+      ASSERT_EQ("v2", Get("0000000000000bar"));
+    }
+  }
+}
+
+// Exercises repeated flushes (overwrite, new key, delete) across multiple
+// plain-table files, then uses TestPlainTableFactory's shared flag to
+// assert bloom-filter misses for absent keys.
+TEST(PlainTableDBTest, Flush2) {
+  // bloom_bits takes exactly the values 0 and 117.
+  for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+    for (int total_order = 0; total_order <= 1; total_order++) {
+      bool expect_bloom_not_match = false;
+      Options options = CurrentOptions();
+      options.create_if_missing = true;
+      // Set only one bucket to force bucket conflict.
+      // Test index interval for the same prefix to be 1, 2 and 4
+      if (total_order) {
+        options.prefix_extractor = nullptr;
+        options.table_factory.reset(new TestPlainTableFactory(
+            &expect_bloom_not_match, 16, bloom_bits, 0, 2));
+      } else {
+        options.table_factory.reset(
+            new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits));
+      }
+      DestroyAndReopen(&options);
+      ASSERT_OK(Put("0000000000000bar", "b"));
+      ASSERT_OK(Put("1000000000000foo", "v1"));
+      dbfull()->TEST_FlushMemTable();
+
+      ASSERT_OK(Put("1000000000000foo", "v2"));
+      dbfull()->TEST_FlushMemTable();
+      ASSERT_EQ("v2", Get("1000000000000foo"));
+
+      ASSERT_OK(Put("0000000000000eee", "v3"));
+      dbfull()->TEST_FlushMemTable();
+      ASSERT_EQ("v3", Get("0000000000000eee"));
+
+      ASSERT_OK(Delete("0000000000000bar"));
+      dbfull()->TEST_FlushMemTable();
+      ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
+
+      ASSERT_OK(Put("0000000000000eee", "v5"));
+      ASSERT_OK(Put("9000000000000eee", "v5"));
+      dbfull()->TEST_FlushMemTable();
+      ASSERT_EQ("v5", Get("0000000000000eee"));
+
+      // Test Bloom Filter
+      if (bloom_bits > 0) {
+        // Neither key nor value should exist.
+        expect_bloom_not_match = true;
+        ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
+
+        // Key doesn't exist any more but prefix exists.
+        if (total_order) {
+          ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
+          ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
+        }
+        expect_bloom_not_match = false;
+      }
+    }
+  }
+}
+
+// Verifies iterator Seek/Next semantics over a single flushed plain-table
+// file, in both hash-based and total-order modes, including seeks to
+// non-existent keys and bloom-filter behavior for absent prefixes.
+TEST(PlainTableDBTest, Iterator) {
+  // bloom_bits takes exactly the values 0 and 117.
+  for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+    for (int total_order = 0; total_order <= 1; total_order++) {
+      bool expect_bloom_not_match = false;
+      Options options = CurrentOptions();
+      options.create_if_missing = true;
+      // Set only one bucket to force bucket conflict.
+      // Test index interval for the same prefix to be 1, 2 and 4
+      if (total_order) {
+        options.prefix_extractor = nullptr;
+        options.table_factory.reset(new TestPlainTableFactory(
+            &expect_bloom_not_match, 16, bloom_bits, 0, 2));
+      } else {
+        options.table_factory.reset(
+            new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits));
+      }
+      DestroyAndReopen(&options);
+
+      ASSERT_OK(Put("1000000000foo002", "v_2"));
+      ASSERT_OK(Put("0000000000000bar", "random"));
+      ASSERT_OK(Put("1000000000foo001", "v1"));
+      ASSERT_OK(Put("3000000000000bar", "bar_v"));
+      ASSERT_OK(Put("1000000000foo003", "v__3"));
+      ASSERT_OK(Put("1000000000foo004", "v__4"));
+      ASSERT_OK(Put("1000000000foo005", "v__5"));
+      ASSERT_OK(Put("1000000000foo007", "v__7"));
+      ASSERT_OK(Put("1000000000foo008", "v__8"));
+      dbfull()->TEST_FlushMemTable();
+      ASSERT_EQ("v1", Get("1000000000foo001"));
+      ASSERT_EQ("v__3", Get("1000000000foo003"));
+      Iterator* iter = dbfull()->NewIterator(ReadOptions());
+      // Seek to a key below the smallest "foo" key lands on foo001.
+      iter->Seek("1000000000foo000");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("1000000000foo001", iter->key().ToString());
+      ASSERT_EQ("v1", iter->value().ToString());
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("1000000000foo002", iter->key().ToString());
+      ASSERT_EQ("v_2", iter->value().ToString());
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("1000000000foo003", iter->key().ToString());
+      ASSERT_EQ("v__3", iter->value().ToString());
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("1000000000foo004", iter->key().ToString());
+      ASSERT_EQ("v__4", iter->value().ToString());
+
+      iter->Seek("3000000000000bar");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("3000000000000bar", iter->key().ToString());
+      ASSERT_EQ("bar_v", iter->value().ToString());
+
+      iter->Seek("1000000000foo000");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("1000000000foo001", iter->key().ToString());
+      ASSERT_EQ("v1", iter->value().ToString());
+
+      iter->Seek("1000000000foo005");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("1000000000foo005", iter->key().ToString());
+      ASSERT_EQ("v__5", iter->value().ToString());
+
+      // foo006 is absent; seek skips forward to foo007.
+      iter->Seek("1000000000foo006");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("1000000000foo007", iter->key().ToString());
+      ASSERT_EQ("v__7", iter->value().ToString());
+
+      iter->Seek("1000000000foo008");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("1000000000foo008", iter->key().ToString());
+      ASSERT_EQ("v__8", iter->value().ToString());
+
+      if (total_order == 0) {
+        iter->Seek("1000000000foo009");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("3000000000000bar", iter->key().ToString());
+      }
+
+      // Test Bloom Filter
+      if (bloom_bits > 0) {
+        if (!total_order) {
+          // Neither key nor value should exist.
+          expect_bloom_not_match = true;
+          iter->Seek("2not000000000bar");
+          ASSERT_TRUE(!iter->Valid());
+          ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+          expect_bloom_not_match = false;
+        } else {
+          expect_bloom_not_match = true;
+          ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+          expect_bloom_not_match = false;
+        }
+      }
+
+      delete iter;
+    }
+  }
+}
+
+namespace {
+// Builds a key consisting of `length` copies of character `c`.
+std::string MakeLongKey(size_t length, char c) {
+  return std::string(length, c);
+}
+}  // namespace
+
+// Iterates over keys of widely varying lengths (16-90 bytes) in a
+// total-order plain table to verify large/variable-length key handling.
+TEST(PlainTableDBTest, IteratorLargeKeys) {
+  Options options = CurrentOptions();
+  options.table_factory.reset(NewTotalOrderPlainTableFactory(0, 0, 16));
+  options.create_if_missing = true;
+  options.prefix_extractor.reset();
+  DestroyAndReopen(&options);
+
+  // Leading characters '0'..'6' keep the keys in sorted order.
+  std::string key_list[] = {
+      MakeLongKey(30, '0'),
+      MakeLongKey(16, '1'),
+      MakeLongKey(32, '2'),
+      MakeLongKey(60, '3'),
+      MakeLongKey(90, '4'),
+      MakeLongKey(50, '5'),
+      MakeLongKey(26, '6')
+  };
+
+  for (size_t i = 0; i < 7; i++) {
+    ASSERT_OK(Put(key_list[i], std::to_string(i)));
+  }
+
+  dbfull()->TEST_FlushMemTable();
+
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+  iter->Seek(key_list[0]);
+
+  for (size_t i = 0; i < 7; i++) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(key_list[i], iter->key().ToString());
+    ASSERT_EQ(std::to_string(i), iter->value().ToString());
+    iter->Next();
+  }
+
+  ASSERT_TRUE(!iter->Valid());
+
+  delete iter;
+}
+
+// A test comparator which compare two strings in this way:
+// (1) first compare prefix of 8 bytes in alphabet order,
+// (2) if two strings share the same prefix, sort the other part of the string
+//     in the reverse alphabet order.
+// NOTE: assumes every compared key is at least 8 bytes long.
+class SimpleSuffixReverseComparator : public Comparator {
+ public:
+  SimpleSuffixReverseComparator() {}
+
+  virtual const char* Name() const { return "SimpleSuffixReverseComparator"; }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    Slice prefix_a = Slice(a.data(), 8);
+    Slice prefix_b = Slice(b.data(), 8);
+    int prefix_comp = prefix_a.compare(prefix_b);
+    if (prefix_comp != 0) {
+      return prefix_comp;
+    } else {
+      // Same prefix: order the remainder in reverse.
+      Slice suffix_a = Slice(a.data() + 8, a.size() - 8);
+      Slice suffix_b = Slice(b.data() + 8, b.size() - 8);
+      return -(suffix_a.compare(suffix_b));
+    }
+  }
+  // No-op key shortening: correctness does not require it.
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const {}
+
+  virtual void FindShortSuccessor(std::string* key) const {}
+};
+
+// Same iterator exercise as Iterator, but under SimpleSuffixReverseComparator:
+// within a shared 8-byte prefix the suffixes sort in reverse, so forward
+// iteration over "foo" keys goes foo008 -> foo007 -> ... -> foo001.
+TEST(PlainTableDBTest, IteratorReverseSuffixComparator) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  // Set only one bucket to force bucket conflict.
+  // Test index interval for the same prefix to be 1, 2 and 4
+  SimpleSuffixReverseComparator comp;
+  options.comparator = &comp;
+  DestroyAndReopen(&options);
+
+  ASSERT_OK(Put("1000000000foo002", "v_2"));
+  ASSERT_OK(Put("0000000000000bar", "random"));
+  ASSERT_OK(Put("1000000000foo001", "v1"));
+  ASSERT_OK(Put("3000000000000bar", "bar_v"));
+  ASSERT_OK(Put("1000000000foo003", "v__3"));
+  ASSERT_OK(Put("1000000000foo004", "v__4"));
+  ASSERT_OK(Put("1000000000foo005", "v__5"));
+  ASSERT_OK(Put("1000000000foo007", "v__7"));
+  ASSERT_OK(Put("1000000000foo008", "v__8"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("v1", Get("1000000000foo001"));
+  ASSERT_EQ("v__3", Get("1000000000foo003"));
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+  // foo009 sorts before foo008 under this comparator.
+  iter->Seek("1000000000foo009");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo008", iter->key().ToString());
+  ASSERT_EQ("v__8", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo007", iter->key().ToString());
+  ASSERT_EQ("v__7", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo005", iter->key().ToString());
+  ASSERT_EQ("v__5", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo004", iter->key().ToString());
+  ASSERT_EQ("v__4", iter->value().ToString());
+
+  iter->Seek("3000000000000bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("3000000000000bar", iter->key().ToString());
+  ASSERT_EQ("bar_v", iter->value().ToString());
+
+  iter->Seek("1000000000foo005");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo005", iter->key().ToString());
+  ASSERT_EQ("v__5", iter->value().ToString());
+
+  // foo006 is absent; the next key in reverse-suffix order is foo005.
+  iter->Seek("1000000000foo006");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo005", iter->key().ToString());
+  ASSERT_EQ("v__5", iter->value().ToString());
+
+  iter->Seek("1000000000foo008");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo008", iter->key().ToString());
+  ASSERT_EQ("v__8", iter->value().ToString());
+
+  // foo000 sorts after every other "foo" key, so the seek lands on the
+  // next prefix ("3000000000000bar").
+  iter->Seek("1000000000foo000");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("3000000000000bar", iter->key().ToString());
+
+  delete iter;
+}
+
+TEST(PlainTableDBTest, HashBucketConflict) {
+  for (unsigned char i = 1; i <= 3; i++) {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    // Set only one bucket to force bucket conflict.
+    // Test index interval for the same prefix to be 1, 2 and 4
+    options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 2 ^ i));
+    DestroyAndReopen(&options);
+    ASSERT_OK(Put("5000000000000fo0", "v1"));
+    ASSERT_OK(Put("5000000000000fo1", "v2"));
+    ASSERT_OK(Put("5000000000000fo2", "v"));
+    ASSERT_OK(Put("2000000000000fo0", "v3"));
+    ASSERT_OK(Put("2000000000000fo1", "v4"));
+    ASSERT_OK(Put("2000000000000fo2", "v"));
+    ASSERT_OK(Put("2000000000000fo3", "v"));
+
+    dbfull()->TEST_FlushMemTable();
+
+    ASSERT_EQ("v1", Get("5000000000000fo0"));
+    ASSERT_EQ("v2", Get("5000000000000fo1"));
+    ASSERT_EQ("v3", Get("2000000000000fo0"));
+    ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+    ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+    ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+    ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+    ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+    ReadOptions ro;
+    Iterator* iter = dbfull()->NewIterator(ro);
+
+    iter->Seek("5000000000000fo0");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+    iter->Seek("5000000000000fo1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+    iter->Seek("2000000000000fo0");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+    iter->Seek("2000000000000fo1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+    iter->Seek("2000000000000bar");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+    iter->Seek("5000000000000bar");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+    iter->Seek("2000000000000fo8");
+    ASSERT_TRUE(!iter->Valid() ||
+                options.comparator->Compare(iter->key(), "20000001") > 0);
+
+    iter->Seek("5000000000000fo8");
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("1000000000000fo2");
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("3000000000000fo2");
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("8000000000000fo2");
+    ASSERT_TRUE(!iter->Valid());
+
+    delete iter;
+  }
+}
+
+TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
+  for (unsigned char i = 1; i <= 3; i++) {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    SimpleSuffixReverseComparator comp;
+    options.comparator = &comp;
+    // Set only one bucket to force bucket conflict.
+    // Test index interval for the same prefix to be 1, 2 and 4
+    options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 1 << (i - 1)));
+    DestroyAndReopen(&options);
+    ASSERT_OK(Put("5000000000000fo0", "v1"));
+    ASSERT_OK(Put("5000000000000fo1", "v2"));
+    ASSERT_OK(Put("5000000000000fo2", "v"));
+    ASSERT_OK(Put("2000000000000fo0", "v3"));
+    ASSERT_OK(Put("2000000000000fo1", "v4"));
+    ASSERT_OK(Put("2000000000000fo2", "v"));
+    ASSERT_OK(Put("2000000000000fo3", "v"));
+
+    dbfull()->TEST_FlushMemTable();
+
+    ASSERT_EQ("v1", Get("5000000000000fo0"));
+    ASSERT_EQ("v2", Get("5000000000000fo1"));
+    ASSERT_EQ("v3", Get("2000000000000fo0"));
+    ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+    ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+    ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+    ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+    ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+    ReadOptions ro;
+    Iterator* iter = dbfull()->NewIterator(ro);
+
+    iter->Seek("5000000000000fo1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+    iter->Seek("5000000000000fo1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+    iter->Seek("2000000000000fo1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+    iter->Seek("2000000000000fo1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+    iter->Seek("2000000000000var");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("2000000000000fo3", iter->key().ToString());
+
+    iter->Seek("5000000000000var");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("5000000000000fo2", iter->key().ToString());
+
+    std::string seek_key = "2000000000000bar";
+    iter->Seek(seek_key);
+    ASSERT_TRUE(!iter->Valid() ||
+                options.prefix_extractor->Transform(iter->key()) !=
+                    options.prefix_extractor->Transform(seek_key));
+
+    iter->Seek("1000000000000fo2");
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("3000000000000fo2");
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("8000000000000fo2");
+    ASSERT_TRUE(!iter->Valid());
+
+    delete iter;
+  }
+}
+
+TEST(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  // Set only one bucket to force bucket conflict.
+  // Use a fixed index interval (5) for keys sharing the same prefix.
+  options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 5));
+  DestroyAndReopen(&options);
+  ASSERT_OK(Put("5000000000000fo0", "v1"));
+  ASSERT_OK(Put("5000000000000fo1", "v2"));
+  ASSERT_OK(Put("5000000000000fo2", "v3"));
+
+  dbfull()->TEST_FlushMemTable();
+
+  ASSERT_EQ("v1", Get("5000000000000fo0"));
+  ASSERT_EQ("v2", Get("5000000000000fo1"));
+  ASSERT_EQ("v3", Get("5000000000000fo2"));
+
+  ASSERT_EQ("NOT_FOUND", Get("8000000000000bar"));
+  ASSERT_EQ("NOT_FOUND", Get("1000000000000bar"));
+
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+
+  iter->Seek("5000000000000bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+  iter->Seek("5000000000000fo8");
+  ASSERT_TRUE(!iter->Valid());
+
+  iter->Seek("1000000000000fo2");
+  ASSERT_TRUE(!iter->Valid());
+
+  iter->Seek("8000000000000fo2");
+  ASSERT_TRUE(!iter->Valid());
+
+  delete iter;
+}
+
+static std::string Key(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "key_______%06d", i);
+  return std::string(buf);
+}
+
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+TEST(PlainTableDBTest, CompactionTrigger) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100 << 10; //100KB
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  options.level0_file_num_compaction_trigger = 3;
+  Reopen(&options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+      num++) {
+    std::vector<std::string> values;
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      values.push_back(RandomString(&rnd, 10000));
+      ASSERT_OK(Put(Key(i), values[i]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+  }
+
+  //generate one more file in level-0, and should trigger level-0 compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < 12; i++) {
+    values.push_back(RandomString(&rnd, 10000));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/prefix_test.cc b/db/prefix_test.cc
new file mode 100644 (file)
index 0000000..18036bb
--- /dev/null
@@ -0,0 +1,479 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/memtablerep.h"
+#include "util/histogram.h"
+#include "util/stop_watch.h"
+#include "util/testharness.h"
+
+DEFINE_bool(trigger_deadlock, false,
+            "issue delete in range scan to trigger PrefixHashMap deadlock");
+DEFINE_uint64(bucket_count, 100000, "number of buckets");
+DEFINE_uint64(num_locks, 10001, "number of locks");
+DEFINE_bool(random_prefix, false, "randomize prefix");
+DEFINE_uint64(total_prefixes, 100000, "total number of prefixes");
+DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix");
+DEFINE_int64(write_buffer_size, 33554432, "");
+DEFINE_int64(max_write_buffer_number, 2, "");
+DEFINE_int64(min_write_buffer_number_to_merge, 1, "");
+DEFINE_int32(skiplist_height, 4, "");
+DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
+DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
+DEFINE_int32(value_size, 40, "");
+
+// Path to the database on file system
+const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test";
+
+namespace rocksdb {
+
+struct TestKey {
+  uint64_t prefix;
+  uint64_t sorted;
+
+  TestKey(uint64_t prefix, uint64_t sorted) : prefix(prefix), sorted(sorted) {}
+};
+
+// return a slice backed by test_key
+inline Slice TestKeyToSlice(const TestKey& test_key) {
+  return Slice((const char*)&test_key, sizeof(test_key));
+}
+
+inline const TestKey* SliceToTestKey(const Slice& slice) {
+  return (const TestKey*)slice.data();
+}
+
+class TestKeyComparator : public Comparator {
+ public:
+
+  // Compare needs to be aware of the possibility of a and/or b is
+  // prefix only
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    const TestKey* key_a = SliceToTestKey(a);
+    const TestKey* key_b = SliceToTestKey(b);
+    if (key_a->prefix != key_b->prefix) {
+      if (key_a->prefix < key_b->prefix) return -1;
+      if (key_a->prefix > key_b->prefix) return 1;
+    } else {
+      ASSERT_TRUE(key_a->prefix == key_b->prefix);
+      // note, both a and b could be prefix only
+      if (a.size() != b.size()) {
+        // one of them is prefix
+        ASSERT_TRUE(
+          (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) ||
+          (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey)));
+        if (a.size() < b.size()) return -1;
+        if (a.size() > b.size()) return 1;
+      } else {
+        // both a and b are prefix
+        if (a.size() == sizeof(uint64_t)) {
+          return 0;
+        }
+
+        // both a and b are whole key
+        ASSERT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey));
+        if (key_a->sorted < key_b->sorted) return -1;
+        if (key_a->sorted > key_b->sorted) return 1;
+        if (key_a->sorted == key_b->sorted) return 0;
+      }
+    }
+    return 0;
+  }
+
+  virtual const char* Name() const override {
+    return "TestKeyComparator";
+  }
+
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+  }
+
+  virtual void FindShortSuccessor(std::string* key) const {}
+
+};
+
+namespace {
+void PutKey(DB* db, WriteOptions write_options, uint64_t prefix,
+            uint64_t suffix, const Slice& value) {
+  TestKey test_key(prefix, suffix);
+  Slice key = TestKeyToSlice(test_key);
+  ASSERT_OK(db->Put(write_options, key, value));
+}
+
+void SeekIterator(Iterator* iter, uint64_t prefix, uint64_t suffix) {
+  TestKey test_key(prefix, suffix);
+  Slice key = TestKeyToSlice(test_key);
+  iter->Seek(key);
+}
+
+const std::string kNotFoundResult = "NOT_FOUND";
+
+std::string Get(DB* db, const ReadOptions& read_options, uint64_t prefix,
+                uint64_t suffix) {
+  TestKey test_key(prefix, suffix);
+  Slice key = TestKeyToSlice(test_key);
+
+  std::string result;
+  Status s = db->Get(read_options, key, &result);
+  if (s.IsNotFound()) {
+    result = kNotFoundResult;
+  } else if (!s.ok()) {
+    result = s.ToString();
+  }
+  return result;
+}
+}  // namespace
+
+class PrefixTest {
+ public:
+  std::shared_ptr<DB> OpenDb() {
+    DB* db;
+
+    options.create_if_missing = true;
+    options.write_buffer_size = FLAGS_write_buffer_size;
+    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+    options.min_write_buffer_number_to_merge =
+      FLAGS_min_write_buffer_number_to_merge;
+
+    options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits;
+    options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes;
+
+    Status s = DB::Open(options, kDbName,  &db);
+    ASSERT_OK(s);
+    return std::shared_ptr<DB>(db);
+  }
+
+  void FirstOption() {
+    option_config_ = kBegin;
+  }
+
+  bool NextOptions(int bucket_count) {
+    // skip some options
+    option_config_++;
+    if (option_config_ < kEnd) {
+      options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+      switch(option_config_) {
+        case kHashSkipList:
+          options.memtable_factory.reset(
+              NewHashSkipListRepFactory(bucket_count, FLAGS_skiplist_height));
+          return true;
+        case kHashLinkList:
+          options.memtable_factory.reset(
+              NewHashLinkListRepFactory(bucket_count));
+          return true;
+        default:
+          return false;
+      }
+    }
+    return false;
+  }
+
+  PrefixTest() : option_config_(kBegin) {
+    options.comparator = new TestKeyComparator();
+  }
+  ~PrefixTest() {
+    delete options.comparator;
+  }
+ protected:
+  enum OptionConfig {
+    kBegin,
+    kHashSkipList,
+    kHashLinkList,
+    kEnd
+  };
+  int option_config_;
+  Options options;
+};
+
+TEST(PrefixTest, TestResult) {
+  for (int num_buckets = 1; num_buckets <= 2; num_buckets++) {
+    FirstOption();
+    while (NextOptions(num_buckets)) {
+      std::cout << "*** Mem table: " << options.memtable_factory->Name()
+                << " number of buckets: " << num_buckets
+                << std::endl;
+      DestroyDB(kDbName, Options());
+      auto db = OpenDb();
+      WriteOptions write_options;
+      ReadOptions read_options;
+
+      // 1. Insert one row.
+      Slice v16("v16");
+      PutKey(db.get(), write_options, 1, 6, v16);
+      std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+      SeekIterator(iter.get(), 1, 6);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(!iter->Valid());
+
+      SeekIterator(iter.get(), 2, 0);
+      ASSERT_TRUE(!iter->Valid());
+
+      ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+      ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 5));
+      ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 7));
+      ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 0, 6));
+      ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 2, 6));
+
+      // 2. Insert an entry for the same prefix as the last entry in the bucket.
+      Slice v17("v17");
+      PutKey(db.get(), write_options, 1, 7, v17);
+      iter.reset(db->NewIterator(read_options));
+      SeekIterator(iter.get(), 1, 7);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      SeekIterator(iter.get(), 1, 6);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(!iter->Valid());
+
+      SeekIterator(iter.get(), 2, 0);
+      ASSERT_TRUE(!iter->Valid());
+
+      // 3. Insert an entry for the same prefix as the head of the bucket.
+      Slice v15("v15");
+      PutKey(db.get(), write_options, 1, 5, v15);
+      iter.reset(db->NewIterator(read_options));
+
+      SeekIterator(iter.get(), 1, 7);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v15 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v15 == iter->value());
+
+      ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
+      ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+      ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
+
+      // 4. Insert an entry with a larger prefix
+      Slice v22("v22");
+      PutKey(db.get(), write_options, 2, 2, v22);
+      iter.reset(db->NewIterator(read_options));
+
+      SeekIterator(iter.get(), 2, 2);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v22 == iter->value());
+      SeekIterator(iter.get(), 2, 0);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v22 == iter->value());
+
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v15 == iter->value());
+
+      SeekIterator(iter.get(), 1, 7);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      // 5. Insert an entry with a smaller prefix
+      Slice v02("v02");
+      PutKey(db.get(), write_options, 0, 2, v02);
+      iter.reset(db->NewIterator(read_options));
+
+      SeekIterator(iter.get(), 0, 2);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v02 == iter->value());
+      SeekIterator(iter.get(), 0, 0);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v02 == iter->value());
+
+      SeekIterator(iter.get(), 2, 0);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v22 == iter->value());
+
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v15 == iter->value());
+
+      SeekIterator(iter.get(), 1, 7);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      // 6. Insert to the beginning and the end of the first prefix
+      Slice v13("v13");
+      Slice v18("v18");
+      PutKey(db.get(), write_options, 1, 3, v13);
+      PutKey(db.get(), write_options, 1, 8, v18);
+      iter.reset(db->NewIterator(read_options));
+      SeekIterator(iter.get(), 1, 7);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      SeekIterator(iter.get(), 1, 3);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v13 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v15 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v18 == iter->value());
+
+      SeekIterator(iter.get(), 0, 0);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v02 == iter->value());
+
+      SeekIterator(iter.get(), 2, 0);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v22 == iter->value());
+
+      ASSERT_EQ(v22.ToString(), Get(db.get(), read_options, 2, 2));
+      ASSERT_EQ(v02.ToString(), Get(db.get(), read_options, 0, 2));
+      ASSERT_EQ(v13.ToString(), Get(db.get(), read_options, 1, 3));
+      ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
+      ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+      ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
+      ASSERT_EQ(v18.ToString(), Get(db.get(), read_options, 1, 8));
+    }
+  }
+}
+
+TEST(PrefixTest, DynamicPrefixIterator) {
+  while (NextOptions(FLAGS_bucket_count)) {
+    std::cout << "*** Mem table: " << options.memtable_factory->Name()
+        << std::endl;
+    DestroyDB(kDbName, Options());
+    auto db = OpenDb();
+    WriteOptions write_options;
+    ReadOptions read_options;
+
+    std::vector<uint64_t> prefixes;
+    for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
+      prefixes.push_back(i);
+    }
+
+    if (FLAGS_random_prefix) {
+      std::random_shuffle(prefixes.begin(), prefixes.end());
+    }
+
+    HistogramImpl hist_put_time;
+    HistogramImpl hist_put_comparison;
+
+    // insert x random prefix, each with y continuous element.
+    for (auto prefix : prefixes) {
+       for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
+        TestKey test_key(prefix, sorted);
+
+        Slice key = TestKeyToSlice(test_key);
+        std::string value(FLAGS_value_size, 0);
+
+        perf_context.Reset();
+        StopWatchNano timer(Env::Default(), true);
+        ASSERT_OK(db->Put(write_options, key, value));
+        hist_put_time.Add(timer.ElapsedNanos());
+        hist_put_comparison.Add(perf_context.user_key_comparison_count);
+      }
+    }
+
+    std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
+              << "Put time: \n" << hist_put_time.ToString();
+
+    // test seek existing keys
+    HistogramImpl hist_seek_time;
+    HistogramImpl hist_seek_comparison;
+
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+    for (auto prefix : prefixes) {
+      TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
+      Slice key = TestKeyToSlice(test_key);
+      std::string value = "v" + std::to_string(0);
+
+      perf_context.Reset();
+      StopWatchNano timer(Env::Default(), true);
+      auto key_prefix = options.prefix_extractor->Transform(key);
+      uint64_t total_keys = 0;
+      for (iter->Seek(key);
+           iter->Valid() && iter->key().starts_with(key_prefix);
+           iter->Next()) {
+        if (FLAGS_trigger_deadlock) {
+          std::cout << "Behold the deadlock!\n";
+          db->Delete(write_options, iter->key());
+        }
+        total_keys++;
+      }
+      hist_seek_time.Add(timer.ElapsedNanos());
+      hist_seek_comparison.Add(perf_context.user_key_comparison_count);
+      ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2);
+    }
+
+    std::cout << "Seek key comparison: \n"
+              << hist_seek_comparison.ToString()
+              << "Seek time: \n"
+              << hist_seek_time.ToString();
+
+    // test non-existing keys
+    HistogramImpl hist_no_seek_time;
+    HistogramImpl hist_no_seek_comparison;
+
+    for (auto prefix = FLAGS_total_prefixes;
+         prefix < FLAGS_total_prefixes + 10000;
+         prefix++) {
+      TestKey test_key(prefix, 0);
+      Slice key = TestKeyToSlice(test_key);
+
+      perf_context.Reset();
+      StopWatchNano timer(Env::Default(), true);
+      iter->Seek(key);
+      hist_no_seek_time.Add(timer.ElapsedNanos());
+      hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
+      ASSERT_TRUE(!iter->Valid());
+    }
+
+    std::cout << "non-existing Seek key comparison: \n"
+              << hist_no_seek_comparison.ToString()
+              << "non-existing Seek time: \n"
+              << hist_no_seek_time.ToString();
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  std::cout << kDbName << "\n";
+
+  rocksdb::test::RunAllTests();
+  return 0;
+}
diff --git a/db/repair.cc b/db/repair.cc
new file mode 100644 (file)
index 0000000..8ae64b2
--- /dev/null
@@ -0,0 +1,403 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// We recover the contents of the descriptor from the other files we find.
+// (1) Any log files are first converted to tables
+// (2) We scan every table to compute
+//     (a) smallest/largest for the table
+//     (b) largest sequence number in the table
+// (3) We generate descriptor contents:
+//      - log number is set to zero
+//      - next-file-number is set to 1 + largest file number we found
+//      - last-sequence-number is set to largest sequence# found across
+//        all tables (see 2c)
+//      - compaction pointers are cleared
+//      - every table file is added at level 0
+//
+// Possible optimization 1:
+//   (a) Compute total size and use to pick appropriate max-level M
+//   (b) Sort tables by largest sequence# in the table
+//   (c) For each table: if it overlaps earlier table, place in level-0,
+//       else place in level-M.
+// Possible optimization 2:
+//   Store per-table metadata (smallest, largest, largest-seq#, ...)
+//   in the table's meta section to speed up ScanTable.
+
+#ifndef ROCKSDB_LITE
+
+#include "db/builder.h"
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+
+namespace {
+
+class Repairer {
+ public:
+  Repairer(const std::string& dbname, const Options& options)
+      : dbname_(dbname),
+        env_(options.env),
+        icmp_(options.comparator),
+        ipolicy_(options.filter_policy),
+        options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
+        raw_table_cache_(
+            // TableCache can be small since we expect each table to be opened
+            // once.
+            NewLRUCache(10, options_.table_cache_numshardbits,
+                        options_.table_cache_remove_scan_count_limit)),
+        next_file_number_(1) {
+    table_cache_ = new TableCache(dbname_, &options_, storage_options_,
+                                  raw_table_cache_.get());
+    edit_ = new VersionEdit();
+  }
+
+  ~Repairer() {
+    delete table_cache_;
+    raw_table_cache_.reset();
+    delete edit_;
+  }
+
+  Status Run() {
+    Status status = FindFiles();
+    if (status.ok()) {
+      ConvertLogFilesToTables();
+      ExtractMetaData();
+      status = WriteDescriptor();
+    }
+    if (status.ok()) {
+      unsigned long long bytes = 0;
+      for (size_t i = 0; i < tables_.size(); i++) {
+        bytes += tables_[i].meta.file_size;
+      }
+      Log(options_.info_log,
+          "**** Repaired rocksdb %s; "
+          "recovered %d files; %llu bytes. "
+          "Some data may have been lost. "
+          "****",
+          dbname_.c_str(),
+          static_cast<int>(tables_.size()),
+          bytes);
+    }
+    return status;
+  }
+
+ private:
+  struct TableInfo {
+    FileMetaData meta;
+    SequenceNumber min_sequence;
+    SequenceNumber max_sequence;
+  };
+
+  std::string const dbname_;
+  Env* const env_;
+  InternalKeyComparator const icmp_;
+  InternalFilterPolicy const ipolicy_;
+  Options const options_;
+  std::shared_ptr<Cache> raw_table_cache_;
+  TableCache* table_cache_;
+  VersionEdit* edit_;
+
+  std::vector<std::string> manifests_;
+  std::vector<uint64_t> table_numbers_;
+  std::vector<uint64_t> logs_;
+  std::vector<TableInfo> tables_;
+  uint64_t next_file_number_;
+  const EnvOptions storage_options_;
+
+  Status FindFiles() {
+    std::vector<std::string> filenames;
+    Status status = env_->GetChildren(dbname_, &filenames);
+    if (!status.ok()) {
+      return status;
+    }
+    if (filenames.empty()) {
+      return Status::Corruption(dbname_, "repair found no files");
+    }
+
+    uint64_t number;
+    FileType type;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type)) {
+        if (type == kDescriptorFile) {
+          manifests_.push_back(filenames[i]);
+        } else {
+          if (number + 1 > next_file_number_) {
+            next_file_number_ = number + 1;
+          }
+          if (type == kLogFile) {
+            logs_.push_back(number);
+          } else if (type == kTableFile) {
+            table_numbers_.push_back(number);
+          } else {
+            // Ignore other files
+          }
+        }
+      }
+    }
+    return status;
+  }
+
+  void ConvertLogFilesToTables() {
+    for (size_t i = 0; i < logs_.size(); i++) {
+      std::string logname = LogFileName(dbname_, logs_[i]);
+      Status status = ConvertLogToTable(logs_[i]);
+      if (!status.ok()) {
+        Log(options_.info_log, "Log #%llu: ignoring conversion error: %s",
+            (unsigned long long) logs_[i],
+            status.ToString().c_str());
+      }
+      ArchiveFile(logname);
+    }
+  }
+
+  Status ConvertLogToTable(uint64_t log) {
+    struct LogReporter : public log::Reader::Reporter {
+      Env* env;
+      std::shared_ptr<Logger> info_log;
+      uint64_t lognum;
+      virtual void Corruption(size_t bytes, const Status& s) {
+        // We print error messages for corruption, but continue repairing.
+        Log(info_log, "Log #%llu: dropping %d bytes; %s",
+            (unsigned long long) lognum,
+            static_cast<int>(bytes),
+            s.ToString().c_str());
+      }
+    };
+
+    // Open the log file
+    std::string logname = LogFileName(dbname_, log);
+    unique_ptr<SequentialFile> lfile;
+    Status status = env_->NewSequentialFile(logname, &lfile, storage_options_);
+    if (!status.ok()) {
+      return status;
+    }
+
+    // Create the log reader.
+    LogReporter reporter;
+    reporter.env = env_;
+    reporter.info_log = options_.info_log;
+    reporter.lognum = log;
+    // NOTE(review): the stated intent is for log::Reader to verify
+    // checksums so that corrupted commits are skipped entirely instead of
+    // propagating bad information (like overly large sequence numbers),
+    // but the call below passes false/*do not checksum*/ -- confirm intent.
+    log::Reader reader(std::move(lfile), &reporter, false/*do not checksum*/,
+                       0/*initial_offset*/);
+
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+    MemTable* mem = new MemTable(icmp_, options_);
+    auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_);
+    mem->Ref();
+    int counter = 0;
+    while (reader.ReadRecord(&record, &scratch)) {
+      if (record.size() < 12) {
+        reporter.Corruption(
+            record.size(), Status::Corruption("log record too small"));
+        continue;
+      }
+      WriteBatchInternal::SetContents(&batch, record);
+      status = WriteBatchInternal::InsertInto(&batch, cf_mems_default);
+      if (status.ok()) {
+        counter += WriteBatchInternal::Count(&batch);
+      } else {
+        Log(options_.info_log, "Log #%llu: ignoring %s",
+            (unsigned long long) log,
+            status.ToString().c_str());
+        status = Status::OK();  // Keep going with rest of file
+      }
+    }
+
+    // Do not record a version edit for this conversion to a Table
+    // since ExtractMetaData() will also generate edits.
+    FileMetaData meta;
+    meta.number = next_file_number_++;
+    ReadOptions ro;
+    Iterator* iter = mem->NewIterator(ro, true /* enforce_total_order */);
+    status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_,
+                        iter, &meta, icmp_, 0, 0, kNoCompression);
+    delete iter;
+    delete mem->Unref();
+    delete cf_mems_default;
+    mem = nullptr;
+    if (status.ok()) {
+      if (meta.file_size > 0) {
+        table_numbers_.push_back(meta.number);
+      }
+    }
+    Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
+        (unsigned long long) log,
+        counter,
+        (unsigned long long) meta.number,
+        status.ToString().c_str());
+    return status;
+  }
+
  // Scan every table file recorded during log conversion.  Tables that
  // fail ScanTable() are logged and archived (moved to the "lost"
  // directory) instead of being added to tables_.
  void ExtractMetaData() {
    for (size_t i = 0; i < table_numbers_.size(); i++) {
      TableInfo t;
      t.meta.number = table_numbers_[i];
      Status status = ScanTable(&t);
      if (!status.ok()) {
        std::string fname = TableFileName(dbname_, table_numbers_[i]);
        Log(options_.info_log, "Table #%llu: ignoring %s",
            (unsigned long long) table_numbers_[i],
            status.ToString().c_str());
        ArchiveFile(fname);
      } else {
        tables_.push_back(t);
      }
    }
  }
+
+  Status ScanTable(TableInfo* t) {
+    std::string fname = TableFileName(dbname_, t->meta.number);
+    int counter = 0;
+    Status status = env_->GetFileSize(fname, &t->meta.file_size);
+    if (status.ok()) {
+      FileMetaData dummy_meta(t->meta.number, t->meta.file_size);
+      Iterator* iter = table_cache_->NewIterator(
+          ReadOptions(), storage_options_, icmp_, dummy_meta);
+      bool empty = true;
+      ParsedInternalKey parsed;
+      t->min_sequence = 0;
+      t->max_sequence = 0;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        if (!ParseInternalKey(key, &parsed)) {
+          Log(options_.info_log, "Table #%llu: unparsable key %s",
+              (unsigned long long) t->meta.number,
+              EscapeString(key).c_str());
+          continue;
+        }
+
+        counter++;
+        if (empty) {
+          empty = false;
+          t->meta.smallest.DecodeFrom(key);
+        }
+        t->meta.largest.DecodeFrom(key);
+        if (parsed.sequence < t->min_sequence) {
+          t->min_sequence = parsed.sequence;
+        }
+        if (parsed.sequence > t->max_sequence) {
+          t->max_sequence = parsed.sequence;
+        }
+      }
+      if (!iter->status().ok()) {
+        status = iter->status();
+      }
+      delete iter;
+    }
+    Log(options_.info_log, "Table #%llu: %d entries %s",
+        (unsigned long long) t->meta.number,
+        counter,
+        status.ToString().c_str());
+    return status;
+  }
+
  // Build and atomically install a fresh MANIFEST describing the salvaged
  // tables: write the edit to a temp file, archive the old manifests, then
  // rename the temp file into place and point CURRENT at it.  On any
  // failure the temp file is deleted and the error returned.
  Status WriteDescriptor() {
    std::string tmp = TempFileName(dbname_, 1);
    unique_ptr<WritableFile> file;
    Status status = env_->NewWritableFile(
        tmp, &file, env_->OptimizeForManifestWrite(storage_options_));
    if (!status.ok()) {
      return status;
    }

    // The recovered last-sequence is the max over all scanned tables.
    SequenceNumber max_sequence = 0;
    for (size_t i = 0; i < tables_.size(); i++) {
      if (max_sequence < tables_[i].max_sequence) {
        max_sequence = tables_[i].max_sequence;
      }
    }

    edit_->SetComparatorName(icmp_.user_comparator()->Name());
    edit_->SetLogNumber(0);
    edit_->SetNextFile(next_file_number_);
    edit_->SetLastSequence(max_sequence);

    for (size_t i = 0; i < tables_.size(); i++) {
      // TODO(opt): separate out into multiple levels
      const TableInfo& t = tables_[i];
      edit_->AddFile(0, t.meta.number, t.meta.file_size,
                    t.meta.smallest, t.meta.largest,
                    t.min_sequence, t.max_sequence);
    }

    //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
    {
      // Scoped so the log::Writer (and the file it owns) is closed before
      // we rename the temp file below.
      log::Writer log(std::move(file));
      std::string record;
      edit_->EncodeTo(&record);
      status = log.AddRecord(record);
    }

    if (!status.ok()) {
      env_->DeleteFile(tmp);
    } else {
      // Discard older manifests
      for (size_t i = 0; i < manifests_.size(); i++) {
        ArchiveFile(dbname_ + "/" + manifests_[i]);
      }

      // Install new manifest
      status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
      if (status.ok()) {
        status = SetCurrentFile(env_, dbname_, 1);
      } else {
        env_->DeleteFile(tmp);
      }
    }
    return status;
  }
+
+  void ArchiveFile(const std::string& fname) {
+    // Move into another directory.  E.g., for
+    //    dir/foo
+    // rename to
+    //    dir/lost/foo
+    const char* slash = strrchr(fname.c_str(), '/');
+    std::string new_dir;
+    if (slash != nullptr) {
+      new_dir.assign(fname.data(), slash - fname.data());
+    }
+    new_dir.append("/lost");
+    env_->CreateDir(new_dir);  // Ignore error
+    std::string new_file = new_dir;
+    new_file.append("/");
+    new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
+    Status s = env_->RenameFile(fname, new_file);
+    Log(options_.info_log, "Archiving %s: %s\n",
+        fname.c_str(), s.ToString().c_str());
+  }
+};
+}  // namespace
+
// Public entry point: run the Repairer over the database at dbname,
// rebuilding what it can from the logs and tables found there (see the
// Repairer class above for the exact recovery steps).
Status RepairDB(const std::string& dbname, const Options& options) {
  Repairer repairer(dbname, options);
  return repairer.Run();
}
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc
new file mode 100644 (file)
index 0000000..affa614
--- /dev/null
@@ -0,0 +1,794 @@
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <algorithm>
+#include <set>
+
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/table_builder.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "utilities/merge_operators.h"
+
+using std::unique_ptr;
+
+// IS THIS FILE STILL NEEDED?
+namespace rocksdb {
+
+// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
+// as production quality.
+// SimpleTable requires the input key size to be fixed 16 bytes, value cannot
+// be longer than 150000 bytes and stored data on disk in this format:
+// +--------------------------------------------+  <= key1 offset
+// | key1            | value_size (4 bytes) |   |
+// +----------------------------------------+   |
+// | value1                                     |
+// |                                            |
+// +----------------------------------------+---+  <= key2 offset
+// | key2            | value_size (4 bytes) |   |
+// +----------------------------------------+   |
+// | value2                                     |
+// |                                            |
+// |        ......                              |
+// +-----------------+--------------------------+   <= index_block_offset
+// | key1            | key1 offset (8 bytes)    |
+// +-----------------+--------------------------+
+// | key2            | key2 offset (8 bytes)    |
+// +-----------------+--------------------------+
+// | key3            | key3 offset (8 bytes)    |
+// +-----------------+--------------------------+
+// |        ......                              |
+// +-----------------+------------+-------------+
+// | index_block_offset (8 bytes) |
+// +------------------------------+
+
+// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
+// as production quality.
+class SimpleTableReader: public TableReader {
+public:
+  // Attempt to open the table that is stored in bytes [0..file_size)
+  // of "file", and read the metadata entries necessary to allow
+  // retrieving data from the table.
+  //
+  // If successful, returns ok and sets "*table" to the newly opened
+  // table.  The client should delete "*table" when no longer needed.
+  // If there was an error while initializing the table, sets "*table"
+  // to nullptr and returns a non-ok status.  Does not take ownership of
+  // "*source", but the client must ensure that "source" remains live
+  // for the duration of the returned table's lifetime.
+  //
+  // *file must remain live while this Table is in use.
+  static Status Open(const Options& options, const EnvOptions& soptions,
+                     unique_ptr<RandomAccessFile> && file, uint64_t file_size,
+                     unique_ptr<TableReader>* table_reader);
+
+  Iterator* NewIterator(const ReadOptions&) override;
+
+  Status Get(const ReadOptions&, const Slice& key, void* arg,
+             bool (*handle_result)(void* arg, const ParsedInternalKey& k,
+                                   const Slice& v, bool),
+             void (*mark_key_may_exist)(void*) = nullptr) override;
+
+  uint64_t ApproximateOffsetOf(const Slice& key) override;
+
+  void SetupForCompaction() override;
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const override;
+
+  ~SimpleTableReader();
+
+private:
+  struct Rep;
+  Rep* rep_;
+
+  explicit SimpleTableReader(Rep* rep) {
+    rep_ = rep;
+  }
+  friend class TableCache;
+  friend class SimpleTableIterator;
+
+  Status GetOffset(const Slice& target, uint64_t* offset);
+
+  // No copying allowed
+  explicit SimpleTableReader(const TableReader&) = delete;
+  void operator=(const TableReader&) = delete;
+};
+
+// Iterator to iterate SimpleTable
+class SimpleTableIterator: public Iterator {
+public:
+  explicit SimpleTableIterator(SimpleTableReader* table);
+  ~SimpleTableIterator();
+
+  bool Valid() const;
+
+  void SeekToFirst();
+
+  void SeekToLast();
+
+  void Seek(const Slice& target);
+
+  void Next();
+
+  void Prev();
+
+  Slice key() const;
+
+  Slice value() const;
+
+  Status status() const;
+
+private:
+  SimpleTableReader* table_;
+  uint64_t offset_;
+  uint64_t next_offset_;
+  Slice key_;
+  Slice value_;
+  char tmp_str_[4];
+  char* key_str_;
+  char* value_str_;
+  int value_str_len_;
+  Status status_;
+  // No copying allowed
+  SimpleTableIterator(const SimpleTableIterator&) = delete;
+  void operator=(const Iterator&) = delete;
+};
+
// Internal state shared by SimpleTableReader and SimpleTableIterator.
struct SimpleTableReader::Rep {
  ~Rep() {
  }
  Rep(const EnvOptions& storage_options, uint64_t index_start_offset,
      int num_entries) :
      soptions(storage_options), index_start_offset(index_start_offset),
      num_entries(num_entries) {
  }

  Options options;
  const EnvOptions& soptions;
  Status status;
  unique_ptr<RandomAccessFile> file;
  // File offset where the index block begins (== end of the data area).
  uint64_t index_start_offset;
  // Number of index entries, derived from the file size in Open().
  int num_entries;
  // NOTE(review): never populated anywhere visible in this file, so
  // GetTableProperties() returns a null pointer — confirm intended.
  std::shared_ptr<TableProperties> table_properties;

  // Fixed on-disk layout constants (see the format diagram above).
  const static int user_key_size = 16;
  const static int offset_length = 8;
  const static int key_footer_len = 8;

  static int GetInternalKeyLength() {
    return user_key_size + key_footer_len;
  }
};
+
// Releases the Rep (and with it the owned RandomAccessFile).
SimpleTableReader::~SimpleTableReader() {
  delete rep_;
}
+
+Status SimpleTableReader::Open(const Options& options,
+                               const EnvOptions& soptions,
+                               unique_ptr<RandomAccessFile> && file,
+                               uint64_t size,
+                               unique_ptr<TableReader>* table_reader) {
+  char footer_space[Rep::offset_length];
+  Slice footer_input;
+  Status s = file->Read(size - Rep::offset_length, Rep::offset_length,
+                        &footer_input, footer_space);
+  if (s.ok()) {
+    uint64_t index_start_offset = DecodeFixed64(footer_space);
+
+    int num_entries = (size - Rep::offset_length - index_start_offset)
+        / (Rep::GetInternalKeyLength() + Rep::offset_length);
+    SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions,
+                                                             index_start_offset,
+                                                             num_entries);
+
+    rep->file = std::move(file);
+    rep->options = options;
+    table_reader->reset(new SimpleTableReader(rep));
+  }
+  return s;
+}
+
// No-op: SimpleTable needs no special setup for compaction reads.
void SimpleTableReader::SetupForCompaction() {
}
+
// Returns the stored properties pointer.  NOTE(review): table_properties
// is never set in this file, so this is always null — confirm callers
// tolerate that.
std::shared_ptr<const TableProperties> SimpleTableReader::GetTableProperties()
    const {
  return rep_->table_properties;
}
+
// Returns a new heap-allocated iterator over this table; caller owns it.
// The ReadOptions argument is currently unused.
Iterator* SimpleTableReader::NewIterator(const ReadOptions& options) {
  return new SimpleTableIterator(this);
}
+
+Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
+  uint32_t left = 0;
+  uint32_t right = rep_->num_entries - 1;
+  char key_chars[Rep::GetInternalKeyLength()];
+  Slice tmp_slice;
+
+  uint32_t target_offset = 0;
+  while (left <= right) {
+    uint32_t mid = (left + right + 1) / 2;
+
+    uint64_t offset_to_read = rep_->index_start_offset
+        + (Rep::GetInternalKeyLength() + Rep::offset_length) * mid;
+    Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(),
+                                &tmp_slice, key_chars);
+    if (!s.ok()) {
+      return s;
+    }
+
+    InternalKeyComparator ikc(rep_->options.comparator);
+    int compare_result = ikc.Compare(tmp_slice, target);
+
+    if (compare_result < 0) {
+      if (left == right) {
+        target_offset = right + 1;
+        break;
+      }
+      left = mid;
+    } else {
+      if (left == right) {
+        target_offset = left;
+        break;
+      }
+      right = mid - 1;
+    }
+  }
+
+  if (target_offset >= (uint32_t) rep_->num_entries) {
+    *offset = rep_->index_start_offset;
+    return Status::OK();
+  }
+
+  char value_offset_chars[Rep::offset_length];
+
+  int64_t offset_for_value_offset = rep_->index_start_offset
+      + (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset
+      + Rep::GetInternalKeyLength();
+  Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length,
+                              &tmp_slice, value_offset_chars);
+  if (s.ok()) {
+    *offset = DecodeFixed64(value_offset_chars);
+  }
+  return s;
+}
+
+Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k,
+                              void* arg,
+                              bool (*saver)(void*, const ParsedInternalKey&,
+                                            const Slice&, bool),
+                              void (*mark_key_may_exist)(void*)) {
+  Status s;
+  SimpleTableIterator* iter = new SimpleTableIterator(this);
+  for (iter->Seek(k); iter->Valid(); iter->Next()) {
+    ParsedInternalKey parsed_key;
+    if (!ParseInternalKey(iter->key(), &parsed_key)) {
+      return Status::Corruption(Slice());
+    }
+
+    if (!(*saver)(arg, parsed_key, iter->value(), true)) {
+      break;
+    }
+  }
+  s = iter->status();
+  delete iter;
+  return s;
+}
+
// Not implemented for SimpleTable; always reports offset 0.
uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) {
  return 0;
}
+
+SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) :
+    table_(table) {
+  key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()];
+  value_str_len_ = -1;
+  SeekToFirst();
+}
+
// Free the owned key buffer, and the value buffer if it was ever
// allocated (value_str_len_ stays -1 until the first allocation).
SimpleTableIterator::~SimpleTableIterator() {
 delete[] key_str_;
 if (value_str_len_ >= 0) {
   delete[] value_str_;
 }
}
+
// Valid while the current entry lies in the data area, i.e. before the
// index block begins.
bool SimpleTableIterator::Valid() const {
  return offset_ < table_->rep_->index_start_offset;
}
+
// Rewind to offset 0 and load the first entry.
void SimpleTableIterator::SeekToFirst() {
  next_offset_ = 0;
  Next();
}
+
// Backward positioning is not supported by this format.
void SimpleTableIterator::SeekToLast() {
  assert(false);
}
+
// Position at the first entry with key >= target via the index; a failed
// index lookup is recorded in status_ but Next() is still attempted.
void SimpleTableIterator::Seek(const Slice& target) {
  Status s = table_->GetOffset(target, &next_offset_);
  if (!s.ok()) {
    status_ = s;
  }
  Next();
}
+
// Load the entry at next_offset_: internal key, then a 4-byte value size,
// then the value bytes, advancing next_offset_ past each piece.  Becomes
// !Valid() once next_offset_ reaches the index block.
// NOTE(review): the Status of each Read below is captured but never
// checked, so a short/failed read silently yields garbage — confirm this
// is acceptable for a test-only format.
void SimpleTableIterator::Next() {
  offset_ = next_offset_;
  if (offset_ >= table_->rep_->index_start_offset) {
    return;
  }
  Slice result;
  int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength();

  Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result,
                                      key_str_);
  next_offset_ += internal_key_size;
  key_ = result;

  Slice value_size_slice;
  s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_);
  next_offset_ += 4;
  uint32_t value_size = DecodeFixed32(tmp_str_);

  Slice value_slice;
  // Grow the value buffer only when the new value is larger than any
  // previously seen one.
  if ((int) value_size > value_str_len_) {
    if (value_str_len_ >= 0) {
      delete[] value_str_;
    }
    value_str_ = new char[value_size];
    value_str_len_ = value_size;
  }
  s = table_->rep_->file->Read(next_offset_, value_size, &value_slice,
                               value_str_);
  next_offset_ += value_size;
  value_ = value_slice;
}
+
// Backward iteration is not supported by this format.
void SimpleTableIterator::Prev() {
  assert(false);
}
+
+Slice SimpleTableIterator::key() const {
+  Log(table_->rep_->options.info_log, "key!!!!");
+  return key_;
+}
+
// Return the current value.  REQUIRES: Valid().
Slice SimpleTableIterator::value() const {
  return value_;
}
+
// Status of the last failed index lookup (reads in Next() do not set it).
Status SimpleTableIterator::status() const {
  return status_;
}
+
// Builder for the SimpleTable format (UNIT TEST ONLY): appends fixed-size
// key / length-prefixed value pairs, then an index and footer in Finish().
class SimpleTableBuilder: public TableBuilder {
public:
  // Create a builder that will store the contents of the table it is
  // building in *file.  Does not close the file.  It is up to the
  // caller to close the file after calling Finish(). The output file
  // will be part of level specified by 'level'.  A value of -1 means
  // that the caller does not know which level the output file will reside.
  SimpleTableBuilder(const Options& options, WritableFile* file,
                     CompressionType compression_type);

  // REQUIRES: Either Finish() or Abandon() has been called.
  ~SimpleTableBuilder();

  // Add key,value to the table being constructed.
  // REQUIRES: key is after any previously added key according to comparator.
  // REQUIRES: Finish(), Abandon() have not been called
  void Add(const Slice& key, const Slice& value) override;

  // Return non-ok iff some error has been detected.
  Status status() const override;

  // Finish building the table.  Stops using the file passed to the
  // constructor after this function returns.
  // REQUIRES: Finish(), Abandon() have not been called
  Status Finish() override;

  // Indicate that the contents of this builder should be abandoned.  Stops
  // using the file passed to the constructor after this function returns.
  // If the caller is not going to call Finish(), it must call Abandon()
  // before destroying this builder.
  // REQUIRES: Finish(), Abandon() have not been called
  void Abandon() override;

  // Number of calls to Add() so far.
  uint64_t NumEntries() const override;

  // Size of the file generated so far.  If invoked after a successful
  // Finish() call, returns the size of the final generated file.
  uint64_t FileSize() const override;

private:
  struct Rep;
  Rep* rep_;  // Owned; deleted in the destructor.

  // No copying allowed
  SimpleTableBuilder(const SimpleTableBuilder&) = delete;
  void operator=(const SimpleTableBuilder&) = delete;
};
+
// Internal builder state: output file, running offset, and the in-memory
// index that Finish() appends after the data area.
struct SimpleTableBuilder::Rep {
  Options options;
  WritableFile* file;           // Not owned; caller closes it.
  uint64_t offset = 0;          // Bytes written to the data area so far.
  // NOTE(review): never assigned anywhere in this file; status() below
  // ignores it and returns OK unconditionally.
  Status status;

  uint64_t num_entries = 0;

  bool closed = false;  // Either Finish() or Abandon() has been called.

  // Fixed on-disk layout constants; must match SimpleTableReader::Rep.
  const static int user_key_size = 16;
  const static int offset_length = 8;
  const static int key_footer_len = 8;

  static int GetInternalKeyLength() {
    return user_key_size + key_footer_len;
  }

  // Accumulated index: (internal key, fixed64 data offset) per entry.
  std::string index;

  Rep(const Options& opt, WritableFile* f) :
      options(opt), file(f) {
  }
  ~Rep() {
  }
};
+
// Construct a builder writing to *file.  compression_type is accepted for
// interface compatibility but unused: SimpleTable stores data as-is.
SimpleTableBuilder::SimpleTableBuilder(const Options& options,
                                       WritableFile* file,
                                       CompressionType compression_type) :
    rep_(new SimpleTableBuilder::Rep(options, file)) {
}
+
// Releases the Rep; does not close or own the output file.
SimpleTableBuilder::~SimpleTableBuilder() {
  delete (rep_);
}
+
// Append one (key, value) pair: record the key and current offset in the
// in-memory index, then write key, fixed32 value size, and value bytes.
// NOTE(review): the Append() return statuses are ignored — confirm this
// is acceptable for a test-only format.
void SimpleTableBuilder::Add(const Slice& key, const Slice& value) {
  // Keys must be exactly internal-key sized (16-byte user key + footer).
  assert((int ) key.size() == Rep::GetInternalKeyLength());

  // Update index
  rep_->index.append(key.data(), key.size());
  PutFixed64(&(rep_->index), rep_->offset);

  // Write key-value pair
  rep_->file->Append(key);
  rep_->offset += Rep::GetInternalKeyLength();

  std::string size;
  int value_size = value.size();
  PutFixed32(&size, value_size);
  Slice sizeSlice(size);
  rep_->file->Append(sizeSlice);
  rep_->file->Append(value);
  rep_->offset += value_size + 4;

  rep_->num_entries++;
}
+
// Always OK.  NOTE(review): rep_->status is never consulted, so write
// errors in Add()/Finish() are not surfaced here.
Status SimpleTableBuilder::status() const {
  return Status::OK();
}
+
// Append the accumulated index block after the data, then the fixed64
// footer holding the index offset.  Append statuses are ignored and OK is
// returned unconditionally (test-only format).
Status SimpleTableBuilder::Finish() {
  Rep* r = rep_;
  assert(!r->closed);
  r->closed = true;

  // Data area ends here; this offset becomes the footer value.
  uint64_t index_offset = rep_->offset;
  Slice index_slice(rep_->index);
  rep_->file->Append(index_slice);
  rep_->offset += index_slice.size();

  std::string index_offset_str;
  PutFixed64(&index_offset_str, index_offset);
  Slice foot_slice(index_offset_str);
  rep_->file->Append(foot_slice);
  rep_->offset += foot_slice.size();

  return Status::OK();
}
+
// Mark the builder closed without writing index/footer.
void SimpleTableBuilder::Abandon() {
  rep_->closed = true;
}
+
// Number of Add() calls so far.
uint64_t SimpleTableBuilder::NumEntries() const {
  return rep_->num_entries;
}
+
// Bytes written so far (final file size after Finish()).
uint64_t SimpleTableBuilder::FileSize() const {
  return rep_->offset;
}
+
// TableFactory plugged into Options::table_factory by the tests below;
// produces SimpleTableReader/SimpleTableBuilder instances.
class SimpleTableFactory: public TableFactory {
public:
  ~SimpleTableFactory() {
  }
  SimpleTableFactory() {
  }
  const char* Name() const override {
    return "SimpleTable";
  }
  Status NewTableReader(const Options& options, const EnvOptions& soptions,
                        const InternalKeyComparator& internal_key,
                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                        unique_ptr<TableReader>* table_reader) const;

  TableBuilder* NewTableBuilder(const Options& options,
                                const InternalKeyComparator& internal_key,
                                WritableFile* file,
                                CompressionType compression_type) const;
};
+
// Forward to SimpleTableReader::Open; the comparator argument is unused.
Status SimpleTableFactory::NewTableReader(
    const Options& options, const EnvOptions& soptions,
    const InternalKeyComparator& internal_key,
    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
    unique_ptr<TableReader>* table_reader) const {

  return SimpleTableReader::Open(options, soptions, std::move(file), file_size,
                                 table_reader);
}
+
// Return a new heap-allocated builder; caller owns it.  The comparator
// argument is unused.
TableBuilder* SimpleTableFactory::NewTableBuilder(
    const Options& options, const InternalKeyComparator& internal_key,
    WritableFile* file, CompressionType compression_type) const {
  return new SimpleTableBuilder(options, file, compression_type);
}
+
// Test fixture: opens a scratch DB that CurrentOptions() configures to
// use SimpleTableFactory, plus small helpers for puts/gets/reopen and
// level inspection.
class SimpleTableDBTest {
protected:
public:
  std::string dbname_;
  Env* env_;
  DB* db_;

  // Options used by the most recent (Try)Reopen, so DestroyAndReopen can
  // destroy with the same configuration.
  Options last_options_;

  SimpleTableDBTest() :
      env_(Env::Default()) {
    dbname_ = test::TmpDir() + "/simple_table_db_test";
    ASSERT_OK(DestroyDB(dbname_, Options()));
    db_ = nullptr;
    Reopen();
  }

  ~SimpleTableDBTest() {
    delete db_;
    ASSERT_OK(DestroyDB(dbname_, Options()));
  }

  // Return the current option configuration.
  Options CurrentOptions() {
    Options options;
    options.table_factory.reset(new SimpleTableFactory());
    return options;
  }

  // NOTE(review): assumes db_ always points at a DBImpl.
  DBImpl* dbfull() {
    return reinterpret_cast<DBImpl*>(db_);
  }

  void Reopen(Options* options = nullptr) {
    ASSERT_OK(TryReopen(options));
  }

  void Close() {
    delete db_;
    db_ = nullptr;
  }

  void DestroyAndReopen(Options* options = nullptr) {
    //Destroy using last options
    Destroy(&last_options_);
    ASSERT_OK(TryReopen(options));
  }

  void Destroy(Options* options) {
    delete db_;
    db_ = nullptr;
    ASSERT_OK(DestroyDB(dbname_, *options));
  }

  // Open an additional handle without touching db_.
  Status PureReopen(Options* options, DB** db) {
    return DB::Open(*options, dbname_, db);
  }

  // Close db_ and reopen it with the given options, or with fresh
  // SimpleTable defaults when options is null.
  Status TryReopen(Options* options = nullptr) {
    delete db_;
    db_ = nullptr;
    Options opts;
    if (options != nullptr) {
      opts = *options;
    } else {
      opts = CurrentOptions();
      opts.create_if_missing = true;
    }
    last_options_ = opts;

    return DB::Open(opts, dbname_, &db_);
  }

  Status Put(const Slice& k, const Slice& v) {
    return db_->Put(WriteOptions(), k, v);
  }

  Status Delete(const std::string& k) {
    return db_->Delete(WriteOptions(), k);
  }

  // Read k; returns "NOT_FOUND" for a missing key, or the error string
  // for any other failure.
  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
    ReadOptions options;
    options.snapshot = snapshot;
    std::string result;
    Status s = db_->Get(options, k, &result);
    if (s.IsNotFound()) {
      result = "NOT_FOUND";
    } else if (!s.ok()) {
      result = s.ToString();
    }
    return result;
  }


  // Number of table files at the given level, via the DB property.
  int NumTableFilesAtLevel(int level) {
    std::string property;
    ASSERT_TRUE(
        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
                         &property));
    return atoi(property.c_str());
  }

  // Return spread of files per level
  std::string FilesPerLevel() {
    std::string result;
    int last_non_zero_offset = 0;
    for (int level = 0; level < db_->NumberLevels(); level++) {
      int f = NumTableFilesAtLevel(level);
      char buf[100];
      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
      result += buf;
      if (f > 0) {
        last_non_zero_offset = result.size();
      }
    }
    result.resize(last_non_zero_offset);
    return result;
  }

  // "key->value" for a valid iterator, "(invalid)" otherwise.
  std::string IterStatus(Iterator* iter) {
    std::string result;
    if (iter->Valid()) {
      result = iter->key().ToString() + "->" + iter->value().ToString();
    } else {
      result = "(invalid)";
    }
    return result;
  }
};
+
// A freshly created DB returns NOT_FOUND for any lookup.
TEST(SimpleTableDBTest, Empty) {
  ASSERT_TRUE(db_ != nullptr);
  ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
}
+
// Basic put/get and overwrite semantics (keys padded to the fixed
// 16-byte user-key size SimpleTable requires).
TEST(SimpleTableDBTest, ReadWrite) {
  ASSERT_OK(Put("0000000000000foo", "v1"));
  ASSERT_EQ("v1", Get("0000000000000foo"));
  ASSERT_OK(Put("0000000000000bar", "v2"));
  ASSERT_OK(Put("0000000000000foo", "v3"));
  ASSERT_EQ("v3", Get("0000000000000foo"));
  ASSERT_EQ("v2", Get("0000000000000bar"));
}
+
// Values survive a memtable flush into a SimpleTable file.
TEST(SimpleTableDBTest, Flush) {
  ASSERT_OK(Put("0000000000000foo", "v1"));
  ASSERT_OK(Put("0000000000000bar", "v2"));
  ASSERT_OK(Put("0000000000000foo", "v3"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v3", Get("0000000000000foo"));
  ASSERT_EQ("v2", Get("0000000000000bar"));
}
+
// Overwrites and deletes remain correct across multiple flushes, i.e.
// across multiple SimpleTable files at level 0.
TEST(SimpleTableDBTest, Flush2) {
  ASSERT_OK(Put("0000000000000bar", "b"));
  ASSERT_OK(Put("0000000000000foo", "v1"));
  dbfull()->TEST_FlushMemTable();

  ASSERT_OK(Put("0000000000000foo", "v2"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v2", Get("0000000000000foo"));

  ASSERT_OK(Put("0000000000000eee", "v3"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v3", Get("0000000000000eee"));

  ASSERT_OK(Delete("0000000000000bar"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));

  ASSERT_OK(Put("0000000000000eee", "v5"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v5", Get("0000000000000eee"));
}
+
// Build a fixed-width test key: "key_______" followed by i zero-padded to
// six digits (matches SimpleTable's 16-byte user-key requirement).
static std::string Key(int i) {
  char formatted[100];
  snprintf(formatted, sizeof(formatted), "key_______%06d", i);
  return std::string(formatted);
}
+
// Return a random string of the given length via the test harness RNG.
static std::string RandomString(Random* rnd, int len) {
  std::string r;
  test::RandomString(rnd, len, &r);
  return r;
}
+
// Filling level-0 up to level0_file_num_compaction_trigger files kicks
// off a compaction that drains level 0 into a single level-1 file.
TEST(SimpleTableDBTest, CompactionTrigger) {
  Options options = CurrentOptions();
  options.write_buffer_size = 100 << 10; //100KB
  options.num_levels = 3;
  options.max_mem_compaction_level = 0;
  options.level0_file_num_compaction_trigger = 3;
  Reopen(&options);

  Random rnd(301);

  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
      num++) {
    std::vector<std::string> values;
    // Write 120KB (12 values, each 10K)
    for (int i = 0; i < 12; i++) {
      values.push_back(RandomString(&rnd, 10000));
      ASSERT_OK(Put(Key(i), values[i]));
    }
    dbfull()->TEST_WaitForFlushMemTable();
    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
  }

  //generate one more file in level-0, and should trigger level-0 compaction
  std::vector<std::string> values;
  for (int i = 0; i < 12; i++) {
    values.push_back(RandomString(&rnd, 10000));
    ASSERT_OK(Put(Key(i), values[i]));
  }
  dbfull()->TEST_WaitForCompact();

  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
}
+
+}  // namespace rocksdb
+
// Test entry point: run every TEST registered above.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
diff --git a/db/skiplist.h b/db/skiplist.h
new file mode 100644 (file)
index 0000000..751f7c3
--- /dev/null
@@ -0,0 +1,429 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread safety
+// -------------
+//
+// Writes require external synchronization, most likely a mutex.
+// Reads require a guarantee that the SkipList will not be destroyed
+// while the read is in progress.  Apart from that, reads progress
+// without any internal locking or synchronization.
+//
+// Invariants:
+//
+// (1) Allocated nodes are never deleted until the SkipList is
+// destroyed.  This is trivially guaranteed by the code since we
+// never delete any skip list nodes.
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the SkipList.
+// Only Insert() modifies the list, and it is careful to initialize
+// a node and use release-stores to publish the nodes in one or
+// more lists.
+//
+// ... prev vs. next pointer ordering ...
+//
+
#pragma once
#include <assert.h>
#include <stdlib.h>
#include "port/port.h"
#include "util/arena.h"
#include "util/random.h"
+
+namespace rocksdb {
+
template<typename Key, class Comparator>
class SkipList {
 private:
  struct Node;

 public:
  // Create a new SkipList object that will use "cmp" for comparing keys,
  // and will allocate memory using "*arena".  Objects allocated in the arena
  // must remain allocated for the lifetime of the skiplist object.
  explicit SkipList(Comparator cmp, Arena* arena,
                    int32_t max_height = 12, int32_t branching_factor = 4);

  // Insert key into the list.
  // REQUIRES: nothing that compares equal to key is currently in the list.
  void Insert(const Key& key);

  // Returns true iff an entry that compares equal to key is in the list.
  bool Contains(const Key& key) const;

  // Iteration over the contents of a skip list
  class Iterator {
   public:
    // Initialize an iterator over the specified list.
    // The returned iterator is not valid.
    explicit Iterator(const SkipList* list);

    // Change the underlying skiplist used for this iterator
    // This enables us not changing the iterator without deallocating
    // an old one and then allocating a new one
    void SetList(const SkipList* list);

    // Returns true iff the iterator is positioned at a valid node.
    bool Valid() const;

    // Returns the key at the current position.
    // REQUIRES: Valid()
    const Key& key() const;

    // Advances to the next position.
    // REQUIRES: Valid()
    void Next();

    // Advances to the previous position.
    // REQUIRES: Valid()
    void Prev();

    // Advance to the first entry with a key >= target
    void Seek(const Key& target);

    // Position at the first entry in list.
    // Final state of iterator is Valid() iff list is not empty.
    void SeekToFirst();

    // Position at the last entry in list.
    // Final state of iterator is Valid() iff list is not empty.
    void SeekToLast();

   private:
    const SkipList* list_;
    Node* node_;
    // Intentionally copyable
  };

 private:
  // Tower-height cap and inverse probability of a tower growing one extra
  // level (see RandomHeight()).  Both fixed at construction.
  const int32_t kMaxHeight_;
  const int32_t kBranching_;

  // Immutable after construction
  Comparator const compare_;
  Arena* const arena_;    // Arena used for allocations of nodes

  // Dummy head node; its key is never compared.
  Node* const head_;

  // Modified only by Insert().  Read racily by readers, but stale
  // values are ok.
  port::AtomicPointer max_height_;   // Height of the entire list

  // Used for optimizing sequential insert patterns.  prev_[i] records the
  // node after which the last insert was linked at level i; prev_height_
  // is that insert's tower height.  Written only by Insert().
  Node** prev_;
  int32_t prev_height_;

  // Racy read of the current list height; a stale value only costs extra
  // traversal work, never correctness.
  inline int GetMaxHeight() const {
    return static_cast<int>(
        reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()));
  }

  // Read/written only by Insert().
  Random rnd_;

  Node* NewNode(const Key& key, int height);
  int RandomHeight();
  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }

  // Return true if key is greater than the data stored in "n"
  bool KeyIsAfterNode(const Key& key, Node* n) const;

  // Return the earliest node that comes at or after key.
  // Return nullptr if there is no such node.
  //
  // If prev is non-nullptr, fills prev[level] with pointer to previous
  // node at "level" for every level in [0..max_height_-1].
  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;

  // Return the latest node with a key < key.
  // Return head_ if there is no such node.
  Node* FindLessThan(const Key& key) const;

  // Return the last node in the list.
  // Return head_ if list is empty.
  Node* FindLast() const;

  // No copying allowed
  SkipList(const SkipList&);
  void operator=(const SkipList&);
};
+
// Implementation details follow
template<typename Key, class Comparator>
struct SkipList<Key, Comparator>::Node {
  explicit Node(const Key& k) : key(k) { }

  Key const key;

  // Accessors/mutators for links.  Wrapped in methods so we can
  // add the appropriate barriers as necessary.
  Node* Next(int n) {
    assert(n >= 0);
    // Use an 'acquire load' so that we observe a fully initialized
    // version of the returned Node.
    return reinterpret_cast<Node*>(next_[n].Acquire_Load());
  }
  void SetNext(int n, Node* x) {
    assert(n >= 0);
    // Use a 'release store' so that anybody who reads through this
    // pointer observes a fully initialized version of the inserted node.
    next_[n].Release_Store(x);
  }

  // No-barrier variants that can be safely used in a few locations.
  Node* NoBarrier_Next(int n) {
    assert(n >= 0);
    return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
  }
  void NoBarrier_SetNext(int n, Node* x) {
    assert(n >= 0);
    next_[n].NoBarrier_Store(x);
  }

 private:
  // Array of length equal to the node height.  next_[0] is lowest level link.
  // Declared with one element; NewNode() over-allocates so the array really
  // extends for the node's full height (C-style flexible-array idiom).
  port::AtomicPointer next_[1];
};
+
template<typename Key, class Comparator>
typename SkipList<Key, Comparator>::Node*
SkipList<Key, Comparator>::NewNode(const Key& key, int height) {
  // Reserve (height - 1) extra AtomicPointer slots right after the Node so
  // that next_[0..height-1] are all addressable (see Node::next_ comment).
  char* mem = arena_->AllocateAligned(
      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
  return new (mem) Node(key);
}
+
template<typename Key, class Comparator>
inline SkipList<Key, Comparator>::Iterator::Iterator(const SkipList* list) {
  SetList(list);
}

// Retarget the iterator at a (possibly different) list; the current
// position is discarded and the iterator becomes invalid.
template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::SetList(const SkipList* list) {
  list_ = list;
  node_ = nullptr;
}

template<typename Key, class Comparator>
inline bool SkipList<Key, Comparator>::Iterator::Valid() const {
  return node_ != nullptr;
}

template<typename Key, class Comparator>
inline const Key& SkipList<Key, Comparator>::Iterator::key() const {
  assert(Valid());
  return node_->key;
}

template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::Next() {
  assert(Valid());
  node_ = node_->Next(0);
}

template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::Prev() {
  // Instead of using explicit "prev" links, we just search for the
  // last node that falls before key.
  assert(Valid());
  node_ = list_->FindLessThan(node_->key);
  if (node_ == list_->head_) {
    // Stepped off the front of the list; mark the iterator invalid.
    node_ = nullptr;
  }
}

template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::Seek(const Key& target) {
  node_ = list_->FindGreaterOrEqual(target, nullptr);
}

template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::SeekToFirst() {
  node_ = list_->head_->Next(0);
}

template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::SeekToLast() {
  node_ = list_->FindLast();
  if (node_ == list_->head_) {
    // Empty list: FindLast() returned the dummy head.
    node_ = nullptr;
  }
}
+
+template<typename Key, class Comparator>
+int SkipList<Key, Comparator>::RandomHeight() {
+  // Increase height with probability 1 in kBranching
+  int height = 1;
+  while (height < kMaxHeight_ && ((rnd_.Next() % kBranching_) == 0)) {
+    height++;
+  }
+  assert(height > 0);
+  assert(height <= kMaxHeight_);
+  return height;
+}
+
// Return true iff key sorts strictly after node n's key.
template<typename Key, class Comparator>
bool SkipList<Key, Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
  // nullptr n is considered infinite
  return (n != nullptr) && (compare_(n->key, key) < 0);
}
+
template<typename Key, class Comparator>
typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::
  FindGreaterOrEqual(const Key& key, Node** prev) const {
  // Use prev as an optimization hint and fallback to slow path
  // NOTE(review): the fast path trusts prev[0] (the last insertion point
  // that Insert() passes in) and prev_height_; confirm Insert() keeps these
  // consistent across list-height growth before relying on this elsewhere.
  if (prev && !KeyIsAfterNode(key, prev[0]->Next(0))) {
    Node* x = prev[0];
    Node* next = x->Next(0);
    if ((x == head_) || KeyIsAfterNode(key, x)) {
      // Adjust all relevant insertion points to the previous entry
      for (int i = 1; i < prev_height_; i++) {
        prev[i] = x;
      }
      return next;
    }
  }
  // Normal lookup
  Node* x = head_;
  int level = GetMaxHeight() - 1;
  while (true) {
    Node* next = x->Next(level);
    // Make sure the lists are sorted.
    // If x points to head_ or next points nullptr, it is trivially satisfied.
    assert((x == head_) || (next == nullptr) || KeyIsAfterNode(next->key, x));
    if (KeyIsAfterNode(key, next)) {
      // Keep searching in this list
      x = next;
    } else {
      // key <= next->key: record the insertion point and drop a level.
      if (prev != nullptr) prev[level] = x;
      if (level == 0) {
        return next;
      } else {
        // Switch to next list
        level--;
      }
    }
  }
}
+
// Return the latest node whose key is strictly < key; head_ if none exists.
template<typename Key, class Comparator>
typename SkipList<Key, Comparator>::Node*
SkipList<Key, Comparator>::FindLessThan(const Key& key) const {
  Node* x = head_;
  int level = GetMaxHeight() - 1;
  while (true) {
    // Loop invariant: x is head_ or sorts strictly before key.
    assert(x == head_ || compare_(x->key, key) < 0);
    Node* next = x->Next(level);
    if (next == nullptr || compare_(next->key, key) >= 0) {
      if (level == 0) {
        return x;
      } else {
        // Switch to next list
        level--;
      }
    } else {
      x = next;
    }
  }
}
+
// Return the last node in the list, or head_ if the list is empty.
template<typename Key, class Comparator>
typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
    const {
  Node* x = head_;
  int level = GetMaxHeight() - 1;
  while (true) {
    Node* next = x->Next(level);
    if (next == nullptr) {
      if (level == 0) {
        return x;
      } else {
        // Switch to next list
        level--;
      }
    } else {
      x = next;
    }
  }
}
+
template<typename Key, class Comparator>
SkipList<Key, Comparator>::SkipList(const Comparator cmp, Arena* arena,
                                   int32_t max_height,
                                   int32_t branching_factor)
    : kMaxHeight_(max_height),
      kBranching_(branching_factor),
      compare_(cmp),
      arena_(arena),
      // NOTE(review): head_'s key is built from 0, which assumes Key is
      // constructible from an integer literal -- confirm for new Key types.
      head_(NewNode(0 /* any key will do */, max_height)),
      max_height_(reinterpret_cast<void*>(1)),
      prev_height_(1),
      rnd_(0xdeadbeef) {
  assert(kMaxHeight_ > 0);
  assert(kBranching_ > 0);
  // Allocate the prev_ Node* array, directly from the passed-in arena.
  // prev_ does not need to be freed, as its life cycle is tied up with
  // the arena as a whole.
  prev_ = (Node**) arena_->AllocateAligned(sizeof(Node*) * kMaxHeight_);
  for (int i = 0; i < kMaxHeight_; i++) {
    head_->SetNext(i, nullptr);
    prev_[i] = head_;
  }
}
+
// Insert key into the list.
// REQUIRES: external synchronization (single writer) and that no entry
// comparing equal to key is already present.
template<typename Key, class Comparator>
void SkipList<Key, Comparator>::Insert(const Key& key) {
  // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
  // here since Insert() is externally synchronized.
  Node* x = FindGreaterOrEqual(key, prev_);

  // Our data structure does not allow duplicate insertion
  assert(x == nullptr || !Equal(key, x->key));

  int height = RandomHeight();
  if (height > GetMaxHeight()) {
    // Newly exposed levels start out linked directly from head_.
    for (int i = GetMaxHeight(); i < height; i++) {
      prev_[i] = head_;
    }
    //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);

    // It is ok to mutate max_height_ without any synchronization
    // with concurrent readers.  A concurrent reader that observes
    // the new value of max_height_ will see either the old value of
    // new level pointers from head_ (nullptr), or a new value set in
    // the loop below.  In the former case the reader will
    // immediately drop to the next level since nullptr sorts after all
    // keys.  In the latter case the reader will use the new node.
    max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
  }

  x = NewNode(key, height);
  for (int i = 0; i < height; i++) {
    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" in prev[i].
    x->NoBarrier_SetNext(i, prev_[i]->NoBarrier_Next(i));
    prev_[i]->SetNext(i, x);
  }
  // Remember the new node as the hint for the next (likely sequential)
  // insert; FindGreaterOrEqual() consults prev_[0] on its fast path.
  prev_[0] = x;
  prev_height_ = height;
}
+
+template<typename Key, class Comparator>
+bool SkipList<Key, Comparator>::Contains(const Key& key) const {
+  Node* x = FindGreaterOrEqual(key, nullptr);
+  if (x != nullptr && Equal(key, x->key)) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}  // namespace rocksdb
diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc
new file mode 100644 (file)
index 0000000..b87ddcb
--- /dev/null
@@ -0,0 +1,383 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/skiplist.h"
+#include <set>
+#include "rocksdb/env.h"
+#include "util/arena.h"
+#include "util/hash.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
typedef uint64_t Key;

// Three-way comparator over integer test keys: negative when lhs < rhs,
// positive when lhs > rhs, zero when equal.
struct TestComparator {
  int operator()(const Key& lhs, const Key& rhs) const {
    if (lhs == rhs) {
      return 0;
    }
    return (lhs < rhs) ? -1 : +1;
  }
};
+
// Empty fixture: the TEST() macros below only need a type name.
class SkipTest { };

// An empty list contains nothing, and every seek leaves the iterator
// invalid.
TEST(SkipTest, Empty) {
  Arena arena;
  TestComparator cmp;
  SkipList<Key, TestComparator> list(cmp, &arena);
  ASSERT_TRUE(!list.Contains(10));

  SkipList<Key, TestComparator>::Iterator iter(&list);
  ASSERT_TRUE(!iter.Valid());
  iter.SeekToFirst();
  ASSERT_TRUE(!iter.Valid());
  iter.Seek(100);
  ASSERT_TRUE(!iter.Valid());
  iter.SeekToLast();
  ASSERT_TRUE(!iter.Valid());
}
+
// Insert up to N distinct random keys (mirrored into a std::set model), then
// verify Contains(), seeks, forward iteration, and backward iteration all
// agree with the model.
TEST(SkipTest, InsertAndLookup) {
  const int N = 2000;
  const int R = 5000;
  Random rnd(1000);
  std::set<Key> keys;
  Arena arena;
  TestComparator cmp;
  SkipList<Key, TestComparator> list(cmp, &arena);
  for (int i = 0; i < N; i++) {
    Key key = rnd.Next() % R;
    // Only insert keys the model did not already contain (the list forbids
    // duplicates).
    if (keys.insert(key).second) {
      list.Insert(key);
    }
  }

  // Membership must match the model over the whole key range.
  for (int i = 0; i < R; i++) {
    if (list.Contains(i)) {
      ASSERT_EQ(keys.count(i), 1U);
    } else {
      ASSERT_EQ(keys.count(i), 0U);
    }
  }

  // Simple iterator tests
  {
    SkipList<Key, TestComparator>::Iterator iter(&list);
    ASSERT_TRUE(!iter.Valid());

    iter.Seek(0);
    ASSERT_TRUE(iter.Valid());
    ASSERT_EQ(*(keys.begin()), iter.key());

    iter.SeekToFirst();
    ASSERT_TRUE(iter.Valid());
    ASSERT_EQ(*(keys.begin()), iter.key());

    iter.SeekToLast();
    ASSERT_TRUE(iter.Valid());
    ASSERT_EQ(*(keys.rbegin()), iter.key());
  }

  // Forward iteration test
  for (int i = 0; i < R; i++) {
    SkipList<Key, TestComparator>::Iterator iter(&list);
    iter.Seek(i);

    // Compare against model iterator
    std::set<Key>::iterator model_iter = keys.lower_bound(i);
    // Only check the first few entries after each seek target.
    for (int j = 0; j < 3; j++) {
      if (model_iter == keys.end()) {
        ASSERT_TRUE(!iter.Valid());
        break;
      } else {
        ASSERT_TRUE(iter.Valid());
        ASSERT_EQ(*model_iter, iter.key());
        ++model_iter;
        iter.Next();
      }
    }
  }

  // Backward iteration test
  {
    SkipList<Key, TestComparator>::Iterator iter(&list);
    iter.SeekToLast();

    // Compare against model iterator
    for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
         model_iter != keys.rend();
         ++model_iter) {
      ASSERT_TRUE(iter.Valid());
      ASSERT_EQ(*model_iter, iter.key());
      iter.Prev();
    }
    ASSERT_TRUE(!iter.Valid());
  }
}
+
+// We want to make sure that with a single writer and multiple
+// concurrent readers (with no synchronization other than when a
+// reader's iterator is created), the reader always observes all the
+// data that was present in the skip list when the iterator was
+// constructor.  Because insertions are happening concurrently, we may
+// also observe new values that were inserted since the iterator was
+// constructed, but we should never miss any values that were present
+// at iterator construction time.
+//
+// We generate multi-part keys:
+//     <key,gen,hash>
+// where:
+//     key is in range [0..K-1]
+//     gen is a generation number for key
+//     hash is hash(key,gen)
+//
+// The insertion code picks a random key, sets gen to be 1 + the last
+// generation number inserted for that key, and sets hash to Hash(key,gen).
+//
+// At the beginning of a read, we snapshot the last inserted
+// generation number for each key.  We then iterate, including random
+// calls to Next() and Seek().  For every key we encounter, we
+// check that it is either expected given the initial snapshot or has
+// been concurrently added since the iterator started.
class ConcurrentTest {
 private:
  static const uint32_t K = 4;

  // Unpack the three fields of a composite key (layout described above).
  static uint64_t key(Key key) { return (key >> 40); }
  static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
  static uint64_t hash(Key key) { return key & 0xff; }

  static uint64_t HashNumbers(uint64_t k, uint64_t g) {
    uint64_t data[2] = { k, g };
    return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
  }

  // Pack (k, g, hash(k,g)) into a single 64-bit key.
  static Key MakeKey(uint64_t k, uint64_t g) {
    assert(sizeof(Key) == sizeof(uint64_t));
    assert(k <= K);  // We sometimes pass K to seek to the end of the skiplist
    assert(g <= 0xffffffffu);
    return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
  }

  // Check the embedded hash byte, catching torn or corrupt reads.
  static bool IsValidKey(Key k) {
    return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
  }

  static Key RandomTarget(Random* rnd) {
    switch (rnd->Next() % 10) {
      case 0:
        // Seek to beginning
        return MakeKey(0, 0);
      case 1:
        // Seek to end
        return MakeKey(K, 0);
      default:
        // Seek to middle
        return MakeKey(rnd->Next() % K, 0);
    }
  }

  // Per-key generation
  struct State {
    port::AtomicPointer generation[K];
    void Set(int k, intptr_t v) {
      generation[k].Release_Store(reinterpret_cast<void*>(v));
    }
    intptr_t Get(int k) {
      return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
    }

    State() {
      for (unsigned int k = 0; k < K; k++) {
        Set(k, 0);
      }
    }
  };

  // Current state of the test
  State current_;

  Arena arena_;

  // SkipList is not protected by mu_.  We just use a single writer
  // thread to modify it.
  SkipList<Key, TestComparator> list_;

 public:
  ConcurrentTest() : list_(TestComparator(), &arena_) {}

  // REQUIRES: External synchronization
  void WriteStep(Random* rnd) {
    const uint32_t k = rnd->Next() % K;
    const intptr_t g = current_.Get(k) + 1;
    const Key key = MakeKey(k, g);
    list_.Insert(key);
    // Publish the new generation only after the key is linked into the list
    // (release store), so readers that see g also find the key.
    current_.Set(k, g);
  }

  // Safe to call concurrently with WriteStep() from other threads.
  void ReadStep(Random* rnd) {
    // Remember the initial committed state of the skiplist.
    State initial_state;
    for (unsigned int k = 0; k < K; k++) {
      initial_state.Set(k, current_.Get(k));
    }

    Key pos = RandomTarget(rnd);
    SkipList<Key, TestComparator>::Iterator iter(&list_);
    iter.Seek(pos);
    while (true) {
      Key current;
      if (!iter.Valid()) {
        // Treat end-of-list as the sentinel key past the last real key.
        current = MakeKey(K, 0);
      } else {
        current = iter.key();
        ASSERT_TRUE(IsValidKey(current)) << current;
      }
      ASSERT_LE(pos, current) << "should not go backwards";

      // Verify that everything in [pos,current) was not present in
      // initial_state.
      while (pos < current) {
        ASSERT_LT(key(pos), K) << pos;

        // Note that generation 0 is never inserted, so it is ok if
        // <*,0,*> is missing.
        ASSERT_TRUE((gen(pos) == 0U) ||
                    (gen(pos) > (uint64_t)initial_state.Get(key(pos)))
                    ) << "key: " << key(pos)
                      << "; gen: " << gen(pos)
                      << "; initgen: "
                      << initial_state.Get(key(pos));

        // Advance to next key in the valid key space
        if (key(pos) < key(current)) {
          pos = MakeKey(key(pos) + 1, 0);
        } else {
          pos = MakeKey(key(pos), gen(pos) + 1);
        }
      }

      if (!iter.Valid()) {
        break;
      }

      // Randomly alternate between stepping forward and re-seeking ahead.
      if (rnd->Next() % 2) {
        iter.Next();
        pos = MakeKey(key(pos), gen(pos) + 1);
      } else {
        Key new_target = RandomTarget(rnd);
        if (new_target > pos) {
          pos = new_target;
          iter.Seek(new_target);
        }
      }
    }
  }
};
+const uint32_t ConcurrentTest::K;
+
// Simple test that does single-threaded testing of the ConcurrentTest
// scaffolding.
TEST(SkipTest, ConcurrentWithoutThreads) {
  ConcurrentTest test;
  Random rnd(test::RandomSeed());
  // Interleave reads and writes on one thread to exercise the checker logic
  // itself before the multi-threaded runs below rely on it.
  for (int i = 0; i < 10000; i++) {
    test.ReadStep(&rnd);
    test.WriteStep(&rnd);
  }
}
+
// Shared state between the test driver (single writer) and a
// ConcurrentReader thread: the ConcurrentTest under test, a quit flag, and
// a mutex/condvar state machine for STARTING -> RUNNING -> DONE handoffs.
class TestState {
 public:
  ConcurrentTest t_;
  int seed_;
  // Set (to any non-null value) by the driver to ask the reader to stop.
  port::AtomicPointer quit_flag_;

  enum ReaderState {
    STARTING,
    RUNNING,
    DONE
  };

  explicit TestState(int s)
      : seed_(s),
        quit_flag_(nullptr),
        state_(STARTING),
        state_cv_(&mu_) {}

  // Block until the reader reports state s.
  void Wait(ReaderState s) {
    mu_.Lock();
    while (state_ != s) {
      state_cv_.Wait();
    }
    mu_.Unlock();
  }

  // Publish a new state and wake the (single) waiter.
  void Change(ReaderState s) {
    mu_.Lock();
    state_ = s;
    state_cv_.Signal();
    mu_.Unlock();
  }

 private:
  port::Mutex mu_;
  ReaderState state_;
  port::CondVar state_cv_;
};
+
+static void ConcurrentReader(void* arg) {
+  TestState* state = reinterpret_cast<TestState*>(arg);
+  Random rnd(state->seed_);
+  int64_t reads = 0;
+  state->Change(TestState::RUNNING);
+  while (!state->quit_flag_.Acquire_Load()) {
+    state->t_.ReadStep(&rnd);
+    ++reads;
+  }
+  state->Change(TestState::DONE);
+}
+
+static void RunConcurrent(int run) {
+  const int seed = test::RandomSeed() + (run * 100);
+  Random rnd(seed);
+  const int N = 1000;
+  const int kSize = 1000;
+  for (int i = 0; i < N; i++) {
+    if ((i % 100) == 0) {
+      fprintf(stderr, "Run %d of %d\n", i, N);
+    }
+    TestState state(seed + 1);
+    Env::Default()->Schedule(ConcurrentReader, &state);
+    state.Wait(TestState::RUNNING);
+    for (int i = 0; i < kSize; i++) {
+      state.t_.WriteStep(&rnd);
+    }
+    state.quit_flag_.Release_Store(&state);  // Any non-nullptr arg will do
+    state.Wait(TestState::DONE);
+  }
+}
+
// Multi-threaded stress runs; each uses a distinct run number so the random
// seeds differ across tests.
TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
TEST(SkipTest, Concurrent5) { RunConcurrent(5); }

}  // namespace rocksdb

// Test entry point: runs every TEST registered in this translation unit.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
diff --git a/db/snapshot.h b/db/snapshot.h
new file mode 100644 (file)
index 0000000..2c2e3ea
--- /dev/null
@@ -0,0 +1,86 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
class SnapshotList;

// Snapshots are kept in a doubly-linked list in the DB.
// Each SnapshotImpl corresponds to a particular sequence number.
class SnapshotImpl : public Snapshot {
 public:
  SequenceNumber number_;  // const after creation

 private:
  // Links are managed exclusively by the owning SnapshotList.
  friend class SnapshotList;

  // SnapshotImpl is kept in a doubly-linked circular list
  SnapshotImpl* prev_;
  SnapshotImpl* next_;

  SnapshotList* list_;                 // just for sanity checks
};
+
+class SnapshotList {
+ public:
+  SnapshotList() {
+    list_.prev_ = &list_;
+    list_.next_ = &list_;
+    list_.number_ = 0xFFFFFFFFL;      // placeholder marker, for debugging
+  }
+
+  bool empty() const { return list_.next_ == &list_; }
+  SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
+  SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
+
+  const SnapshotImpl* New(SequenceNumber seq) {
+    SnapshotImpl* s = new SnapshotImpl;
+    s->number_ = seq;
+    s->list_ = this;
+    s->next_ = &list_;
+    s->prev_ = list_.prev_;
+    s->prev_->next_ = s;
+    s->next_->prev_ = s;
+    return s;
+  }
+
+  void Delete(const SnapshotImpl* s) {
+    assert(s->list_ == this);
+    s->prev_->next_ = s->next_;
+    s->next_->prev_ = s->prev_;
+    delete s;
+  }
+
+  // retrieve all snapshot numbers. They are sorted in ascending order.
+  void getAll(std::vector<SequenceNumber>& ret) {
+    if (empty()) return;
+    SnapshotImpl* s = &list_;
+    while (s->next_ != &list_) {
+      ret.push_back(s->next_->number_);
+      s = s ->next_;
+    }
+  }
+
+  // get the sequence number of the most recent snapshot
+  const SequenceNumber GetNewest() {
+    if (empty()) {
+      return 0;
+    }
+    return newest()->number_;
+  }
+
+ private:
+  // Dummy head of doubly-linked list of snapshots
+  SnapshotImpl list_;
+};
+
+}  // namespace rocksdb
diff --git a/db/table_cache.cc b/db/table_cache.cc
new file mode 100644 (file)
index 0000000..2321d03
--- /dev/null
@@ -0,0 +1,197 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/filename.h"
+#include "db/version_edit.h"
+
+#include "rocksdb/statistics.h"
+#include "table/table_reader.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
// Cache deleter: invoked when a table entry is evicted; frees the reader.
static void DeleteEntry(const Slice& key, void* value) {
  TableReader* table_reader = reinterpret_cast<TableReader*>(value);
  delete table_reader;
}

// Iterator cleanup callback: drops the cache reference held on behalf of a
// live iterator (arg1 = the cache, arg2 = the handle).
static void UnrefEntry(void* arg1, void* arg2) {
  Cache* cache = reinterpret_cast<Cache*>(arg1);
  Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
  cache->Release(h);
}

// Build the cache key from the file number's raw bytes.
// NOTE(review): the returned Slice aliases *file_number, so the pointee must
// outlive every use of the slice; all callers here pass a local and consume
// the slice immediately.
static Slice GetSliceForFileNumber(uint64_t* file_number) {
  return Slice(reinterpret_cast<const char*>(file_number),
               sizeof(*file_number));
}
+
TableCache::TableCache(const std::string& dbname, const Options* options,
                       const EnvOptions& storage_options, Cache* const cache)
    : env_(options->env),
      dbname_(dbname),
      options_(options),
      storage_options_(storage_options),
      cache_(cache) {}

// The cache itself is owned by the caller; nothing to tear down here.
TableCache::~TableCache() {
}

// Map a cache handle back to the TableReader stored in it.
TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) {
  return reinterpret_cast<TableReader*>(cache_->Value(handle));
}

// Drop one reference on a handle obtained from FindTable().
void TableCache::ReleaseHandle(Cache::Handle* handle) {
  cache_->Release(handle);
}
+
// Look up the reader for file_number in the cache, opening the table file on
// a miss.  On success *handle references the cached reader and the caller
// must release it via ReleaseHandle()/UnrefEntry.  When no_io is set, a
// cache miss returns Status::Incomplete instead of touching storage;
// *table_io (if non-null) is set to true when storage IO was performed.
Status TableCache::FindTable(const EnvOptions& toptions,
                             const InternalKeyComparator& internal_comparator,
                             uint64_t file_number, uint64_t file_size,
                             Cache::Handle** handle, bool* table_io,
                             const bool no_io) {
  Status s;
  Slice key = GetSliceForFileNumber(&file_number);
  *handle = cache_->Lookup(key);
  if (*handle == nullptr) {
    if (no_io) { // Dont do IO and return a not-found status
      return Status::Incomplete("Table not found in table_cache, no_io is set");
    }
    if (table_io != nullptr) {
      *table_io = true;    // we had to do IO from storage
    }
    std::string fname = TableFileName(dbname_, file_number);
    unique_ptr<RandomAccessFile> file;
    unique_ptr<TableReader> table_reader;
    s = env_->NewRandomAccessFile(fname, &file, toptions);
    RecordTick(options_->statistics.get(), NO_FILE_OPENS);
    if (s.ok()) {
      if (options_->advise_random_on_open) {
        file->Hint(RandomAccessFile::RANDOM);
      }
      StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS);
      s = options_->table_factory->NewTableReader(
          *options_, toptions, internal_comparator, std::move(file), file_size,
          &table_reader);
    }

    if (!s.ok()) {
      assert(table_reader == nullptr);
      RecordTick(options_->statistics.get(), NO_FILE_ERRORS);
      // We do not cache error results so that if the error is transient,
      // or somebody repairs the file, we recover automatically.
    } else {
      // On success NewTableReader took ownership of `file` (it was moved),
      // and the cache now owns the reader (freed via DeleteEntry on evict).
      assert(file.get() == nullptr);
      *handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry);
    }
  }
  return s;
}
+
// Return an iterator over the given file.  If file_meta already carries a
// reader it is used directly; otherwise the reader is pinned in the cache
// for the iterator's lifetime and released via the RegisterCleanup hook.
// On lookup failure an error iterator carrying the status is returned.
Iterator* TableCache::NewIterator(const ReadOptions& options,
                                  const EnvOptions& toptions,
                                  const InternalKeyComparator& icomparator,
                                  const FileMetaData& file_meta,
                                  TableReader** table_reader_ptr,
                                  bool for_compaction) {
  if (table_reader_ptr != nullptr) {
    *table_reader_ptr = nullptr;
  }
  TableReader* table_reader = file_meta.table_reader;
  Cache::Handle* handle = nullptr;
  Status s;
  if (table_reader == nullptr) {
    s = FindTable(toptions, icomparator, file_meta.number, file_meta.file_size,
                  &handle, nullptr, options.read_tier == kBlockCacheTier);
    if (!s.ok()) {
      return NewErrorIterator(s);
    }
    table_reader = GetTableReaderFromHandle(handle);
  }

  Iterator* result = table_reader->NewIterator(options);
  if (handle != nullptr) {
    // Tie the cache reference's lifetime to the iterator.
    result->RegisterCleanup(&UnrefEntry, cache_, handle);
  }
  if (table_reader_ptr != nullptr) {
    *table_reader_ptr = table_reader;
  }

  if (for_compaction) {
    table_reader->SetupForCompaction();
  }

  return result;
}
+
// Point lookup for internal key k in the given file, forwarding matches to
// the saver callback.  When the reader is not cached and read_tier forbids
// IO, the key is conservatively flagged "may exist" and OK is returned.
Status TableCache::Get(const ReadOptions& options,
                       const InternalKeyComparator& internal_comparator,
                       const FileMetaData& file_meta, const Slice& k, void* arg,
                       bool (*saver)(void*, const ParsedInternalKey&,
                                     const Slice&, bool),
                       bool* table_io, void (*mark_key_may_exist)(void*)) {
  TableReader* t = file_meta.table_reader;
  Status s;
  Cache::Handle* handle = nullptr;
  if (!t) {
    s = FindTable(storage_options_, internal_comparator, file_meta.number,
                  file_meta.file_size, &handle, table_io,
                  options.read_tier == kBlockCacheTier);
    if (s.ok()) {
      t = GetTableReaderFromHandle(handle);
    }
  }
  if (s.ok()) {
    s = t->Get(options, k, arg, saver, mark_key_may_exist);
    if (handle != nullptr) {
      ReleaseHandle(handle);
    }
  } else if (options.read_tier && s.IsIncomplete()) {
    // Couldnt find Table in cache but treat as kFound if no_io set
    // NOTE(review): assumes mark_key_may_exist is non-null whenever a
    // block-cache-only read reaches here -- confirm against callers.
    (*mark_key_may_exist)(arg);
    return Status::OK();
  }
  return s;
}
// Fetch the table properties for the given file, loading its reader through
// the cache when file_meta does not already carry one.  The temporary cache
// reference is released before returning.
Status TableCache::GetTableProperties(
    const EnvOptions& toptions,
    const InternalKeyComparator& internal_comparator,
    const FileMetaData& file_meta,
    std::shared_ptr<const TableProperties>* properties, bool no_io) {
  Status s;
  auto table_reader = file_meta.table_reader;
  // table already been pre-loaded?
  if (table_reader) {
    *properties = table_reader->GetTableProperties();

    return s;
  }

  bool table_io;
  Cache::Handle* table_handle = nullptr;
  s = FindTable(toptions, internal_comparator, file_meta.number,
                file_meta.file_size, &table_handle, &table_io, no_io);
  if (!s.ok()) {
    return s;
  }
  assert(table_handle);
  auto table = GetTableReaderFromHandle(table_handle);
  *properties = table->GetTableProperties();
  ReleaseHandle(table_handle);
  return s;
}
+
// Remove the entry for file_number from the cache; its TableReader is freed
// (via DeleteEntry) once all outstanding handles are released.
void TableCache::Evict(Cache* cache, uint64_t file_number) {
  cache->Erase(GetSliceForFileNumber(&file_number));
}
+
+}  // namespace rocksdb
diff --git a/db/table_cache.h b/db/table_cache.h
new file mode 100644 (file)
index 0000000..e8cd7ea
--- /dev/null
@@ -0,0 +1,94 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#pragma once
+#include <string>
+#include <stdint.h>
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "table/table_reader.h"
+
+namespace rocksdb {
+
+class Env;
+struct FileMetaData;
+
+// TODO(sdong): try to come up with a better API to pass the file information
+//              other than simply passing FileMetaData.
+class TableCache {
+ public:
+  TableCache(const std::string& dbname, const Options* options,
+             const EnvOptions& storage_options, Cache* cache);
+  ~TableCache();
+
+  // Return an iterator for the specified file number (the corresponding
+  // file length must be exactly "file_size" bytes).  If "tableptr" is
+  // non-nullptr, also sets "*tableptr" to point to the Table object
+  // underlying the returned iterator, or nullptr if no Table object underlies
+  // the returned iterator.  The returned "*tableptr" object is owned by
+  // the cache and should not be deleted, and is valid for as long as the
+  // returned iterator is live.
+  Iterator* NewIterator(const ReadOptions& options, const EnvOptions& toptions,
+                        const InternalKeyComparator& internal_comparator,
+                        const FileMetaData& file_meta,
+                        TableReader** table_reader_ptr = nullptr,
+                        bool for_compaction = false);
+
+  // If a seek to internal key "k" in specified file finds an entry,
+  // call (*handle_result)(arg, found_key, found_value) repeatedly until
+  // it returns false.
+  Status Get(const ReadOptions& options,
+             const InternalKeyComparator& internal_comparator,
+             const FileMetaData& file_meta, const Slice& k, void* arg,
+             bool (*handle_result)(void*, const ParsedInternalKey&,
+                                   const Slice&, bool),
+             bool* table_io, void (*mark_key_may_exist)(void*) = nullptr);
+
+  // Evict any entry for the specified file number
+  static void Evict(Cache* cache, uint64_t file_number);
+
+  // Find the table reader for the given file, loading it into the cache if
+  // necessary (unless no_io is set). On success the returned Cache::Handle
+  // must be released via ReleaseHandle().
+  Status FindTable(const EnvOptions& toptions,
+                   const InternalKeyComparator& internal_comparator,
+                   uint64_t file_number, uint64_t file_size, Cache::Handle**,
+                   bool* table_io = nullptr, const bool no_io = false);
+
+  // Get TableReader from a cache handle.
+  TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
+
+  // Get the table properties of a given table.
+  // @no_io: indicates if we should load table to the cache if it is not present
+  //         in table cache yet.
+  // @returns: `properties` will be reset on success. Please note that we will
+  //            return Status::Incomplete() if table is not present in cache and
+  //            we set `no_io` to be true.
+  Status GetTableProperties(const EnvOptions& toptions,
+                            const InternalKeyComparator& internal_comparator,
+                            const FileMetaData& file_meta,
+                            std::shared_ptr<const TableProperties>* properties,
+                            bool no_io = false);
+
+  // Release the handle from a cache
+  void ReleaseHandle(Cache::Handle* handle);
+
+ private:
+  Env* const env_;                     // not owned
+  const std::string dbname_;
+  const Options* options_;             // not owned
+  const EnvOptions& storage_options_;  // held by reference; must outlive us
+  Cache* const cache_;                 // not owned
+};
+
+}  // namespace rocksdb
diff --git a/db/table_properties_collector.cc b/db/table_properties_collector.cc
new file mode 100644 (file)
index 0000000..25bd700
--- /dev/null
@@ -0,0 +1,83 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/table_properties_collector.h"
+
+#include "db/dbformat.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+// Parse each internal key and count deletion entries; rejects keys that do
+// not decode as internal keys.
+Status InternalKeyPropertiesCollector::Add(
+    const Slice& key, const Slice& value) {
+  ParsedInternalKey ikey;
+  if (!ParseInternalKey(key, &ikey)) {
+    return Status::InvalidArgument("Invalid internal key");
+  }
+
+  if (ikey.type == ValueType::kTypeDeletion) {
+    ++deleted_keys_;
+  }
+
+  return Status::OK();
+}
+
+// Store the deletion count, varint64-encoded, under kDeletedKeys. The key
+// must not already be present in `properties`.
+Status InternalKeyPropertiesCollector::Finish(
+    UserCollectedProperties* properties) {
+  assert(properties);
+  assert(properties->find(
+        InternalKeyTablePropertiesNames::kDeletedKeys) == properties->end());
+  std::string val;
+
+  PutVarint64(&val, deleted_keys_);
+  properties->insert({ InternalKeyTablePropertiesNames::kDeletedKeys, val });
+
+  return Status::OK();
+}
+
+// Human-readable view: the deletion count rendered as decimal text.
+UserCollectedProperties
+InternalKeyPropertiesCollector::GetReadableProperties() const {
+  return {
+    { "kDeletedKeys", std::to_string(deleted_keys_) }
+  };
+}
+
+
+// Strip the internal-key framing and forward only the user key to the
+// wrapped collector.
+Status UserKeyTablePropertiesCollector::Add(
+    const Slice& key, const Slice& value) {
+  ParsedInternalKey ikey;
+  if (!ParseInternalKey(key, &ikey)) {
+    return Status::InvalidArgument("Invalid internal key");
+  }
+
+  return collector_->Add(ikey.user_key, value);
+}
+
+// Delegates directly to the wrapped collector.
+Status UserKeyTablePropertiesCollector::Finish(
+    UserCollectedProperties* properties) {
+  return collector_->Finish(properties);
+}
+
+// Delegates directly to the wrapped collector.
+UserCollectedProperties
+UserKeyTablePropertiesCollector::GetReadableProperties() const {
+  return collector_->GetReadableProperties();
+}
+
+
+const std::string InternalKeyTablePropertiesNames::kDeletedKeys
+  = "rocksdb.deleted.keys";
+
+// Decode the varint64 deletion count stored under kDeletedKeys.
+// Returns 0 when the property is absent or cannot be decoded.
+uint64_t GetDeletedKeys(
+    const UserCollectedProperties& props) {
+  auto pos = props.find(InternalKeyTablePropertiesNames::kDeletedKeys);
+  if (pos == props.end()) {
+    return 0;
+  }
+  Slice raw = pos->second;
+  uint64_t val = 0;
+  return GetVarint64(&raw, &val) ? val : 0;
+}
+
+}  // namespace rocksdb
diff --git a/db/table_properties_collector.h b/db/table_properties_collector.h
new file mode 100644 (file)
index 0000000..6cf5629
--- /dev/null
@@ -0,0 +1,72 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file defines a collection of statistics collectors.
+#pragma once
+
+#include "rocksdb/table_properties.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+struct InternalKeyTablePropertiesNames {
+  // Property-map key under which InternalKeyPropertiesCollector stores the
+  // varint64-encoded deletion count ("rocksdb.deleted.keys").
+  static const std::string kDeletedKeys;
+};
+
+// Collecting the statistics for internal keys. Visible only by internal
+// rocksdb modules.
+class InternalKeyPropertiesCollector : public TablePropertiesCollector {
+ public:
+  // Expects encoded internal keys; counts kTypeDeletion entries.
+  virtual Status Add(const Slice& key, const Slice& value) override;
+
+  // Publishes the deletion count under
+  // InternalKeyTablePropertiesNames::kDeletedKeys.
+  virtual Status Finish(UserCollectedProperties* properties) override;
+
+  virtual const char* Name() const override {
+    return "InternalKeyPropertiesCollector";
+  }
+
+  UserCollectedProperties GetReadableProperties() const override;
+
+ private:
+  uint64_t deleted_keys_ = 0;  // kTypeDeletion entries seen so far
+};
+
+// When rocksdb creates a new table, it will encode all "user keys" into
+// "internal keys", which contains meta information of a given entry.
+//
+// This class extracts user key from the encoded internal key when Add() is
+// invoked.
+class UserKeyTablePropertiesCollector : public TablePropertiesCollector {
+ public:
+  // Takes ownership of the raw pointer by wrapping it in a shared_ptr and
+  // delegating to the shared_ptr constructor below.
+  explicit UserKeyTablePropertiesCollector(
+      TablePropertiesCollector* collector) :
+      UserKeyTablePropertiesCollector(
+        std::shared_ptr<TablePropertiesCollector>(collector)
+    ) {
+  }
+
+  // Shares ownership of the wrapped collector.
+  explicit UserKeyTablePropertiesCollector(
+      std::shared_ptr<TablePropertiesCollector> collector) :
+      collector_(collector) {
+  }
+
+  virtual ~UserKeyTablePropertiesCollector() { }
+
+  // Decodes the internal key and forwards only the user key to collector_.
+  virtual Status Add(const Slice& key, const Slice& value) override;
+
+  virtual Status Finish(UserCollectedProperties* properties) override;
+
+  virtual const char* Name() const override { return collector_->Name(); }
+
+  UserCollectedProperties GetReadableProperties() const override;
+
+ protected:
+  std::shared_ptr<TablePropertiesCollector> collector_;
+};
+
+}  // namespace rocksdb
diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc
new file mode 100644 (file)
index 0000000..ea15260
--- /dev/null
@@ -0,0 +1,312 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/table_properties_collector.h"
+#include "rocksdb/table.h"
+#include "table/block_based_table_factory.h"
+#include "table/meta_blocks.h"
+#include "table/plain_table_factory.h"
+#include "table/table_builder.h"
+#include "util/coding.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+// Empty fixture class; its name tags the TEST() cases below for the
+// rocksdb test harness.
+class TablePropertiesTest {
+};
+
+// TODO(kailiu) the following classes should be moved to some more general
+// places, so that other tests can also make use of them.
+// `FakeWritableFile` and `FakeRandomeAccessFile` bypass the real file system
+// and therefore enable us to quickly setup the tests.
+// In-memory WritableFile: appends accumulate into a string that tests can
+// read back via contents(). Close/Flush/Sync are no-ops.
+class FakeWritableFile : public WritableFile {
+ public:
+  ~FakeWritableFile() { }
+
+  const std::string& contents() const { return contents_; }
+
+  virtual Status Close() { return Status::OK(); }
+  virtual Status Flush() { return Status::OK(); }
+  virtual Status Sync() { return Status::OK(); }
+
+  virtual Status Append(const Slice& data) {
+    contents_.append(data.data(), data.size());
+    return Status::OK();
+  }
+
+ private:
+  std::string contents_;
+};
+
+
+// In-memory RandomAccessFile serving reads from a copied string buffer.
+// (Class name carries a historical typo: "Randome" -> "Random".)
+class FakeRandomeAccessFile : public RandomAccessFile {
+ public:
+  explicit FakeRandomeAccessFile(const Slice& contents)
+      : contents_(contents.data(), contents.size()) {
+  }
+
+  virtual ~FakeRandomeAccessFile() { }
+
+  uint64_t Size() const { return contents_.size(); }
+
+  // Reads are clamped to the buffer end; an offset past the end is an error.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                       char* scratch) const {
+    if (offset > contents_.size()) {
+      return Status::InvalidArgument("invalid Read offset");
+    }
+    if (offset + n > contents_.size()) {
+      n = contents_.size() - offset;
+    }
+    memcpy(scratch, &contents_[offset], n);
+    *result = Slice(scratch, n);
+    return Status::OK();
+  }
+
+ private:
+  std::string contents_;
+};
+
+
+// Logger that silently discards all output; used to suppress log writes
+// from SanitizeOptions() during the tests.
+class DumbLogger : public Logger {
+ public:
+  virtual void Logv(const char* format, va_list ap) { }
+  virtual size_t GetLogFileSize() const { return 0; }
+};
+
+// Utilities test functions
+// Utilities test functions
+namespace {
+// Create a fresh in-memory file plus a TableBuilder (of whatever format
+// options.table_factory produces) writing into it.
+void MakeBuilder(const Options& options,
+                 const InternalKeyComparator& internal_comparator,
+                 std::unique_ptr<FakeWritableFile>* writable,
+                 std::unique_ptr<TableBuilder>* builder) {
+  writable->reset(new FakeWritableFile);
+  builder->reset(options.table_factory->NewTableBuilder(
+      options, internal_comparator, writable->get(), options.compression));
+}
+}  // namespace
+
+// Collects keys that starts with "A" in a table.
+// Collects keys that starts with "A" in a table. Emits two properties in
+// Finish(): a fixed marker and the varint32-encoded count.
+class RegularKeysStartWithA: public TablePropertiesCollector {
+ public:
+   const char* Name() const { return "RegularKeysStartWithA"; }
+
+   Status Finish(UserCollectedProperties* properties) {
+     std::string encoded;
+     PutVarint32(&encoded, count_);
+     *properties = UserCollectedProperties {
+       { "TablePropertiesTest", "Rocksdb" },
+       { "Count", encoded }
+     };
+     return Status::OK();
+   }
+
+   Status Add(const Slice& user_key, const Slice& value) {
+     // simply assume all user keys are not empty.
+     if (user_key.data()[0] == 'A') {
+       ++count_;
+     }
+     return Status::OK();
+   }
+
+  virtual UserCollectedProperties GetReadableProperties() const {
+    return UserCollectedProperties{};
+  }
+
+
+ private:
+  uint32_t count_ = 0;  // keys seen so far whose first byte is 'A'
+};
+
+extern uint64_t kBlockBasedTableMagicNumber;
+extern uint64_t kPlainTableMagicNumber;
+namespace {
+// Builds a table containing seven fixed entries (three keys starting with
+// 'A'), reads its properties back via ReadTableProperties(), and verifies
+// that RegularKeysStartWithA's output survived the round trip.
+void TestCustomizedTablePropertiesCollector(
+    uint64_t magic_number, bool encode_as_internal, const Options& options,
+    const InternalKeyComparator& internal_comparator) {
+  // make sure the entries will be inserted with order.
+  std::map<std::string, std::string> kvs = {
+    {"About   ", "val5"},  // starts with 'A'
+    {"Abstract", "val2"},  // starts with 'A'
+    {"Around  ", "val7"},  // starts with 'A'
+    {"Beyond  ", "val3"},
+    {"Builder ", "val1"},
+    {"Cancel  ", "val4"},
+    {"Find    ", "val6"},
+  };
+
+  // -- Step 1: build table
+  std::unique_ptr<TableBuilder> builder;
+  std::unique_ptr<FakeWritableFile> writable;
+  MakeBuilder(options, internal_comparator, &writable, &builder);
+
+  for (const auto& kv : kvs) {
+    if (encode_as_internal) {
+      InternalKey ikey(kv.first, 0, ValueType::kTypeValue);
+      builder->Add(ikey.Encode(), kv.second);
+    } else {
+      builder->Add(kv.first, kv.second);
+    }
+  }
+  ASSERT_OK(builder->Finish());
+
+  // -- Step 2: Read properties
+  FakeRandomeAccessFile readable(writable->contents());
+  TableProperties* props;
+  Status s = ReadTableProperties(
+      &readable,
+      writable->contents().size(),
+      magic_number,
+      Env::Default(),
+      nullptr,
+      &props
+  );
+  std::unique_ptr<TableProperties> props_guard(props);
+  ASSERT_OK(s);
+
+  auto user_collected = props->user_collected_properties;
+
+  ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest"));
+
+  // Exactly three of the fixture keys start with 'A'.
+  uint32_t starts_with_A = 0;
+  Slice key(user_collected.at("Count"));
+  ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+  ASSERT_EQ(3u, starts_with_A);
+}
+}  // namespace
+
+TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) {
+  // Test properties collectors with internal keys or regular keys
+  // for block based table
+  for (bool encode_as_internal : { true, false }) {
+    Options options;
+    auto collector = new RegularKeysStartWithA();
+    if (encode_as_internal) {
+      // Wrap the collector so it receives user keys even though the builder
+      // is fed internal keys.
+      options.table_properties_collectors = {
+        std::make_shared<UserKeyTablePropertiesCollector>(collector)
+      };
+    } else {
+      options.table_properties_collectors.resize(1);
+      options.table_properties_collectors[0].reset(collector);
+    }
+    test::PlainInternalKeyComparator ikc(options.comparator);
+    TestCustomizedTablePropertiesCollector(kBlockBasedTableMagicNumber,
+                                           encode_as_internal, options, ikc);
+  }
+
+  // test plain table
+  Options options;
+  options.table_properties_collectors.push_back(
+      std::make_shared<RegularKeysStartWithA>()
+  );
+  options.table_factory = std::make_shared<PlainTableFactory>(8, 8, 0);
+  test::PlainInternalKeyComparator ikc(options.comparator);
+  TestCustomizedTablePropertiesCollector(kPlainTableMagicNumber, true, options,
+                                         ikc);
+}
+
+namespace {
+// Builds a table with three puts and four deletions, then checks that the
+// deletion count round-trips through the table properties. When `sanitized`
+// is true the collector is installed via SanitizeOptions(), which is
+// expected to wrap it so it can handle internal keys.
+void TestInternalKeyPropertiesCollector(
+    uint64_t magic_number,
+    bool sanitized,
+    std::shared_ptr<TableFactory> table_factory) {
+  InternalKey keys[] = {
+    InternalKey("A       ", 0, ValueType::kTypeValue),
+    InternalKey("B       ", 0, ValueType::kTypeValue),
+    InternalKey("C       ", 0, ValueType::kTypeValue),
+    InternalKey("W       ", 0, ValueType::kTypeDeletion),
+    InternalKey("X       ", 0, ValueType::kTypeDeletion),
+    InternalKey("Y       ", 0, ValueType::kTypeDeletion),
+    InternalKey("Z       ", 0, ValueType::kTypeDeletion),
+  };
+
+  std::unique_ptr<TableBuilder> builder;
+  std::unique_ptr<FakeWritableFile> writable;
+  Options options;
+  test::PlainInternalKeyComparator pikc(options.comparator);
+
+  options.table_factory = table_factory;
+  if (sanitized) {
+    options.table_properties_collectors = {
+      std::make_shared<RegularKeysStartWithA>()
+    };
+    // with sanitization, even regular properties collector will be able to
+    // handle internal keys.
+    auto comparator = options.comparator;
+    // HACK: Set options.info_log to avoid writing log in
+    // SanitizeOptions().
+    options.info_log = std::make_shared<DumbLogger>();
+    options = SanitizeOptions("db",            // just a place holder
+                              &pikc, nullptr,  // don't care filter policy
+                              options);
+    // SanitizeOptions() may replace the comparator; restore the user one.
+    options.comparator = comparator;
+  } else {
+    options.table_properties_collectors = {
+      std::make_shared<InternalKeyPropertiesCollector>()
+    };
+  }
+
+  MakeBuilder(options, pikc, &writable, &builder);
+  for (const auto& k : keys) {
+    builder->Add(k.Encode(), "val");
+  }
+
+  ASSERT_OK(builder->Finish());
+
+  FakeRandomeAccessFile readable(writable->contents());
+  TableProperties* props;
+  Status s = ReadTableProperties(
+      &readable,
+      writable->contents().size(),
+      magic_number,
+      Env::Default(),
+      nullptr,
+      &props
+  );
+  ASSERT_OK(s);
+
+  std::unique_ptr<TableProperties> props_guard(props);
+  auto user_collected = props->user_collected_properties;
+  // Four of the fixture keys are kTypeDeletion.
+  uint64_t deleted = GetDeletedKeys(user_collected);
+  ASSERT_EQ(4u, deleted);
+
+  if (sanitized) {
+    // Only "A       " starts with 'A' among the fixture keys.
+    uint32_t starts_with_A = 0;
+    Slice key(user_collected.at("Count"));
+    ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+    ASSERT_EQ(1u, starts_with_A);
+  }
+}
+}  // namespace
+
+// Exercises InternalKeyPropertiesCollector both through SanitizeOptions()
+// (sanitize = true) and installed directly (sanitize = false), for both
+// table formats.
+TEST(TablePropertiesTest, InternalKeyPropertiesCollector) {
+  TestInternalKeyPropertiesCollector(
+      kBlockBasedTableMagicNumber,
+      true /* sanitize */,
+      std::make_shared<BlockBasedTableFactory>()
+  );
+  // BUG FIX: this call passed `true` while its comment said "not sanitize",
+  // so the unsanitized block-based path was never exercised.
+  TestInternalKeyPropertiesCollector(
+      kBlockBasedTableMagicNumber,
+      false /* not sanitize */,
+      std::make_shared<BlockBasedTableFactory>()
+  );
+  TestInternalKeyPropertiesCollector(
+      kPlainTableMagicNumber,
+      false /* not sanitize */,
+      std::make_shared<PlainTableFactory>(8, 8, 0)
+  );
+}
+
+}  // namespace rocksdb
+
+// Runs every TEST() case registered with the rocksdb test harness.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/tailing_iter.cc b/db/tailing_iter.cc
new file mode 100644 (file)
index 0000000..67b59b2
--- /dev/null
@@ -0,0 +1,221 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+#include "db/tailing_iter.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+#include "db/db_impl.h"
+#include "db/db_iter.h"
+#include "db/column_family.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/merger.h"
+
+namespace rocksdb {
+
+// Sub-iterators are created lazily on the first Seek()/SeekToFirst();
+// until then status() reports InvalidArgument and Valid() is false.
+TailingIterator::TailingIterator(Env* const env, DBImpl* db,
+    const ReadOptions& read_options, ColumnFamilyData* cfd)
+    : env_(env),
+      db_(db),
+      read_options_(read_options),
+      cfd_(cfd),
+      super_version_(nullptr),
+      current_(nullptr),
+      status_(Status::InvalidArgument("Seek() not called on this iterator")) {}
+
+// Releases the referenced super version (and purges obsolete files if this
+// was the last reference).
+TailingIterator::~TailingIterator() {
+  Cleanup();
+}
+
+// Valid whenever UpdateCurrent() found a positioned sub-iterator.
+bool TailingIterator::Valid() const {
+  return current_ != nullptr;
+}
+
+// Re-creates the sub-iterators if the column family's super version has
+// changed, then positions both at their first record.
+void TailingIterator::SeekToFirst() {
+  if (!IsCurrentVersion()) {
+    CreateIterators();
+  }
+
+  mutable_->SeekToFirst();
+  immutable_->SeekToFirst();
+  UpdateCurrent();
+}
+
+void TailingIterator::Seek(const Slice& target) {
+  if (!IsCurrentVersion()) {
+    CreateIterators();
+  }
+
+  mutable_->Seek(target);
+
+  // We maintain the interval (prev_key_, immutable_->key()] such that there
+  // are no records with keys within that range in immutable_ other than
+  // immutable_->key(). Since immutable_ can't change in this version, we don't
+  // need to do a seek if 'target' belongs to that interval (i.e. immutable_ is
+  // already at the correct position)!
+  //
+  // If prefix seek is used and immutable_ is not valid, seek if target has a
+  // different prefix than prev_key.
+  //
+  // prev_key_ is updated by Next(). SeekImmutable() sets prev_key_ to
+  // 'target' -- in this case, prev_key_ is included in the interval, so
+  // prev_inclusive_ has to be set.
+
+  const Comparator* cmp = cfd_->user_comparator();
+  // Note the bool-to-int trick: `Compare(...) >= !is_prev_inclusive_` means
+  // prev_key_ >= target when the interval includes prev_key_, and
+  // prev_key_ > target (strict) when it does not.
+  if (!is_prev_set_ || cmp->Compare(prev_key_, target) >= !is_prev_inclusive_ ||
+      (immutable_->Valid() && cmp->Compare(target, immutable_->key()) > 0) ||
+      (cfd_->options()->prefix_extractor != nullptr && !IsSamePrefix(target))) {
+    SeekImmutable(target);
+  }
+
+  UpdateCurrent();
+}
+
+// Advance past the current record. If the super version changed since the
+// last operation, first rebuild the iterators and re-seek to the current
+// key so Next() is relative to the same position.
+void TailingIterator::Next() {
+  assert(Valid());
+
+  if (!IsCurrentVersion()) {
+    // save the current key, create new iterators and then seek
+    std::string current_key = key().ToString();
+    Slice key_slice(current_key.data(), current_key.size());
+
+    CreateIterators();
+    Seek(key_slice);
+
+    if (!Valid() || key().compare(key_slice) != 0) {
+      // record with current_key no longer exists
+      return;
+    }
+
+  } else if (current_ == immutable_.get()) {
+    // immutable iterator is advanced -- update prev_key_
+    prev_key_ = key().ToString();
+    is_prev_inclusive_ = false;
+    is_prev_set_ = true;
+  }
+
+  current_->Next();
+  UpdateCurrent();
+}
+
+// Key of whichever sub-iterator currently has the smallest record.
+Slice TailingIterator::key() const {
+  assert(Valid());
+  return current_->key();
+}
+
+// Value of whichever sub-iterator currently has the smallest record.
+Slice TailingIterator::value() const {
+  assert(Valid());
+  return current_->value();
+}
+
+// Own status (set by Prev()/SeekToLast()/the constructor) takes precedence
+// over the sub-iterators' statuses.
+Status TailingIterator::status() const {
+  if (!status_.ok()) {
+    return status_;
+  } else if (!mutable_->status().ok()) {
+    return mutable_->status();
+  } else {
+    return immutable_->status();
+  }
+}
+
+// Backward iteration is unsupported; record the error in status_
+// (UpdateCurrent() clears it on the next successful forward operation).
+void TailingIterator::Prev() {
+  status_ = Status::NotSupported("This iterator doesn't support Prev()");
+}
+
+// Unsupported, like Prev(); see the class comment in tailing_iter.h.
+void TailingIterator::SeekToLast() {
+  status_ = Status::NotSupported("This iterator doesn't support SeekToLast()");
+}
+
+void TailingIterator::Cleanup() {
+  // Release old super version if necessary
+  mutable_.reset();
+  immutable_.reset();
+  if (super_version_ != nullptr && super_version_->Unref()) {
+    // Last reference: clean up under the DB mutex, but delete any files
+    // that became obsolete outside of it.
+    DBImpl::DeletionState deletion_state;
+    db_->mutex_.Lock();
+    super_version_->Cleanup();
+    db_->FindObsoleteFiles(deletion_state, false, true);
+    db_->mutex_.Unlock();
+    delete super_version_;
+    if (deletion_state.HaveSomethingToDelete()) {
+      db_->PurgeObsoleteFiles(deletion_state);
+    }
+  }
+}
+
+// Drop the old super version and rebuild both sub-iterators against the
+// column family's current super version.
+void TailingIterator::CreateIterators() {
+  Cleanup();
+  super_version_= cfd_->GetReferencedSuperVersion(&(db_->mutex_));
+
+  Iterator* mutable_iter = super_version_->mem->NewIterator(read_options_);
+  // create a DBIter that only uses memtable content; see NewIterator()
+  mutable_.reset(
+      NewDBIterator(env_, *cfd_->options(), cfd_->user_comparator(),
+                    mutable_iter, kMaxSequenceNumber));
+
+  std::vector<Iterator*> list;
+  super_version_->imm->AddIterators(read_options_, &list);
+  super_version_->current->AddIterators(
+      read_options_, *cfd_->soptions(), &list);
+  Iterator* immutable_iter =
+      NewMergingIterator(&cfd_->internal_comparator(), &list[0], list.size());
+
+  // create a DBIter over the immutable memtables and SST files; see
+  // NewIterator()
+  immutable_.reset(
+      NewDBIterator(env_, *cfd_->options(), cfd_->user_comparator(),
+                    immutable_iter, kMaxSequenceNumber));
+
+  current_ = nullptr;
+  is_prev_set_ = false;
+}
+
+// Point current_ at the sub-iterator with the smaller user key; the mutable
+// iterator wins ties (immutable_ is chosen only when strictly smaller).
+void TailingIterator::UpdateCurrent() {
+  current_ = nullptr;
+
+  if (mutable_->Valid()) {
+    current_ = mutable_.get();
+  }
+  const Comparator* cmp = cfd_->user_comparator();
+  if (immutable_->Valid() &&
+      (current_ == nullptr ||
+       cmp->Compare(immutable_->key(), current_->key()) < 0)) {
+    current_ = immutable_.get();
+  }
+
+  if (!status_.ok()) {
+    // reset status that was set by Prev() or SeekToLast()
+    status_ = Status::OK();
+  }
+}
+
+// True iff our referenced super version is still the column family's
+// current one (i.e. the sub-iterators do not need rebuilding).
+bool TailingIterator::IsCurrentVersion() const {
+  return super_version_ != nullptr &&
+         super_version_->version_number == cfd_->GetSuperVersionNumber();
+}
+
+// True iff target shares prev_key_'s prefix under the column family's
+// prefix extractor; only meaningful when is_prev_set_ holds.
+bool TailingIterator::IsSamePrefix(const Slice& target) const {
+  const SliceTransform* extractor = cfd_->options()->prefix_extractor.get();
+
+  assert(extractor);
+  assert(is_prev_set_);
+
+  return extractor->Transform(target)
+    .compare(extractor->Transform(prev_key_)) == 0;
+}
+
+// Seek immutable_ and record target as the (inclusive) left endpoint of the
+// interval described in Seek().
+void TailingIterator::SeekImmutable(const Slice& target) {
+  prev_key_ = target.ToString();
+  is_prev_inclusive_ = true;
+  is_prev_set_ = true;
+
+  immutable_->Seek(target);
+}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/db/tailing_iter.h b/db/tailing_iter.h
new file mode 100644 (file)
index 0000000..6b9c513
--- /dev/null
@@ -0,0 +1,97 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+class DBImpl;
+class Env;
+struct SuperVersion;
+class ColumnFamilyData;
+
+/**
+ * TailingIterator is a special type of iterator that doesn't use an (implicit)
+ * snapshot. In other words, it can be used to read data that was added to the
+ * db after the iterator had been created.
+ *
+ * TailingIterator is optimized for sequential reading. It doesn't support
+ * Prev() and SeekToLast() operations.
+ */
+class TailingIterator : public Iterator {
+ public:
+  TailingIterator(Env* const env, DBImpl* db, const ReadOptions& read_options,
+                  ColumnFamilyData* cfd);
+  virtual ~TailingIterator();
+
+  virtual bool Valid() const override;
+  virtual void SeekToFirst() override;
+  virtual void SeekToLast() override;   // not supported; sets error status
+  virtual void Seek(const Slice& target) override;
+  virtual void Next() override;
+  virtual void Prev() override;         // not supported; sets error status
+  virtual Slice key() const override;
+  virtual Slice value() const override;
+  virtual Status status() const override;
+
+ private:
+  // releases the referenced super version and resets both sub-iterators
+  void Cleanup();
+
+  Env* const env_;
+  DBImpl* const db_;
+  const ReadOptions read_options_;
+  ColumnFamilyData* const cfd_;
+  SuperVersion* super_version_;
+
+  // TailingIterator merges the contents of the two iterators below (one using
+  // mutable memtable contents only, other over SSTs and immutable memtables).
+  // See DBIter::GetTailingIteratorPair().
+  std::unique_ptr<Iterator> mutable_;
+  std::unique_ptr<Iterator> immutable_;
+
+  // points to either mutable_ or immutable_
+  Iterator* current_;
+
+  // key that precedes immutable iterator's current key
+  std::string prev_key_;
+
+  // unless is_prev_set_ is true, prev_key_ is not valid and shouldn't be
+  // used; reset by CreateIterators()
+  bool is_prev_set_;
+
+  // prev_key_ was set by SeekImmutable(), which means that the interval of
+  // keys covered by immutable_ is [prev_key_, current], i.e. it includes the
+  // left endpoint
+  bool is_prev_inclusive_;
+
+  // internal iterator status
+  Status status_;
+
+  // check if this iterator's version matches DB's version
+  bool IsCurrentVersion() const;
+
+  // check if SeekImmutable() is needed due to target having a different prefix
+  // than prev_key_ (used when in prefix seek mode)
+  bool IsSamePrefix(const Slice& target) const;
+
+  // creates mutable_ and immutable_ iterators and updates version_number_
+  void CreateIterators();
+
+  // set current_ to be one of the iterators with the smallest key
+  void UpdateCurrent();
+
+  // seek on immutable_ and update prev_key
+  void SeekImmutable(const Slice& target);
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc
new file mode 100644 (file)
index 0000000..82e58f1
--- /dev/null
@@ -0,0 +1,261 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+#include "db/transaction_log_impl.h"
+#include "db/write_batch_internal.h"
+
+namespace rocksdb {
+
+// Constructs the iterator over the WAL files in 'files' and immediately
+// positions it at the first batch whose last sequence number is >= 'seq'
+// (via SeekToStartSequence).  Takes ownership of 'files'; 'files' and
+// 'dbimpl' must be non-null (asserted).
+TransactionLogIteratorImpl::TransactionLogIteratorImpl(
+    const std::string& dir, const DBOptions* options,
+    const TransactionLogIterator::ReadOptions& read_options,
+    const EnvOptions& soptions, const SequenceNumber seq,
+    std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl)
+    : dir_(dir),
+      options_(options),
+      read_options_(read_options),
+      soptions_(soptions),
+      startingSequenceNumber_(seq),
+      files_(std::move(files)),
+      started_(false),
+      isValid_(false),
+      currentFileIndex_(0),
+      currentBatchSeq_(0),
+      currentLastSeq_(0),
+      dbimpl_(dbimpl) {
+  assert(files_ != nullptr);
+  assert(dbimpl_ != nullptr);
+
+  // Wire the log reporter to the DB's env and info log before any read.
+  reporter_.env = options_->env;
+  reporter_.info_log = options_->info_log.get();
+  SeekToStartSequence(); // Seek till starting sequence
+}
+
+// Opens 'logFile' for sequential reading.  Archived files are opened from
+// the archive directory directly; live WAL files are opened from the DB
+// directory, with a fallback to the archive directory because the file may
+// have been archived between listing and opening.
+Status TransactionLogIteratorImpl::OpenLogFile(
+    const LogFile* logFile,
+    unique_ptr<SequentialFile>* file) {
+  Env* env = options_->env;
+  if (logFile->Type() == kArchivedLogFile) {
+    std::string fname = ArchivedLogFileName(dir_, logFile->LogNumber());
+    return env->NewSequentialFile(fname, file, soptions_);
+  } else {
+    std::string fname = LogFileName(dir_, logFile->LogNumber());
+    Status status = env->NewSequentialFile(fname, file, soptions_);
+    if (!status.ok()) {
+      //  If cannot open file in DB directory.
+      //  Try the archive dir, as it could have moved in the meanwhile.
+      fname = ArchivedLogFileName(dir_, logFile->LogNumber());
+      status = env->NewSequentialFile(fname, file, soptions_);
+    }
+    return status;
+  }
+}
+
+// Returns the current batch and its starting sequence number.  Ownership of
+// the WriteBatch is transferred to the caller (currentBatch_ is moved out),
+// so a second call without an intervening Next() yields a null batch ptr.
+BatchResult TransactionLogIteratorImpl::GetBatch()  {
+  assert(isValid_);  //  cannot call in a non valid state.
+  BatchResult result;
+  result.sequence = currentBatchSeq_;
+  result.writeBatchPtr = std::move(currentBatch_);
+  return result;
+}
+
+// Status of the most recent seek/read; OK when the iterator is healthy.
+Status TransactionLogIteratorImpl::status() {
+  return currentStatus_;
+}
+
+// Valid only once the initial seek succeeded (started_) and the current
+// batch was decoded without detecting a gap (isValid_).
+bool TransactionLogIteratorImpl::Valid() {
+  return started_ && isValid_;
+}
+
+// Reads the next record from the current log reader, but refuses to read
+// once the iterator has already caught up with the DB's latest sequence
+// number, so incomplete tail entries are never surfaced.
+bool TransactionLogIteratorImpl::RestrictedRead(
+    Slice* record,
+    std::string* scratch) {
+  // Don't read if no more complete entries to read from logs
+  if (currentLastSeq_ >= dbimpl_->GetLatestSequenceNumber()) {
+    return false;
+  }
+  return currentLogReader_->ReadRecord(record, scratch);
+}
+
+// Scans the file at 'startFileIndex' for the first batch whose last
+// sequence number is >= startingSequenceNumber_.  With strict == true the
+// found batch must start exactly at the target sequence or the seek fails
+// with Corruption.  On success both started_ and isValid_ are set; on
+// failure currentStatus_ explains why.
+void TransactionLogIteratorImpl::SeekToStartSequence(
+    uint64_t startFileIndex,
+    bool strict) {
+  std::string scratch;
+  Slice record;
+  started_ = false;
+  isValid_ = false;
+  if (files_->size() <= startFileIndex) {
+    return;
+  }
+  Status s = OpenLogReader(files_->at(startFileIndex).get());
+  if (!s.ok()) {
+    currentStatus_ = s;
+    return;
+  }
+  while (RestrictedRead(&record, &scratch)) {
+    // records shorter than 12 bytes cannot hold a batch header
+    // (presumably 8-byte seq + 4-byte count -- confirm in write_batch.cc)
+    if (record.size() < 12) {
+      reporter_.Corruption(
+        record.size(), Status::Corruption("very small log record"));
+      continue;
+    }
+    UpdateCurrentWriteBatch(record);
+    if (currentLastSeq_ >= startingSequenceNumber_) {
+      if (strict && currentBatchSeq_ != startingSequenceNumber_) {
+        currentStatus_ = Status::Corruption("Gap in sequence number. Could not "
+                                            "seek to required sequence number");
+        reporter_.Info(currentStatus_.ToString().c_str());
+        return;
+      } else if (strict) {
+        reporter_.Info("Could seek required sequence number. Iterator will "
+                       "continue.");
+      }
+      isValid_ = true;
+      started_ = true; // set started_ as we could seek till starting sequence
+      return;
+    } else {
+      isValid_ = false;
+    }
+  }
+
+  // Could not find start sequence in first file. Normally this must be the
+  // only file. Otherwise log the error and let the iterator return next entry
+  // If strict is set, we want to seek exactly till the start sequence and it
+  // should have been present in the file we scanned above
+  if (strict) {
+    currentStatus_ = Status::Corruption("Gap in sequence number. Could not "
+                                        "seek to required sequence number");
+    reporter_.Info(currentStatus_.ToString().c_str());
+  } else if (files_->size() != 1) {
+    currentStatus_ = Status::Corruption("Start sequence was not found, "
+                                        "skipping to the next available");
+    reporter_.Info(currentStatus_.ToString().c_str());
+    // Let NextImpl find the next available entry. started_ remains false
+    // because we don't want to check for gaps while moving to start sequence
+    NextImpl(true);
+  }
+}
+
+// Public Next(): delegates to NextImpl in non-internal mode, which enforces
+// sequence-number continuity between consecutive batches.
+void TransactionLogIteratorImpl::Next() {
+  return NextImpl(false);
+}
+
+void TransactionLogIteratorImpl::NextImpl(bool internal) {
+  std::string scratch;
+  Slice record;
+  isValid_ = false;
+  if (!internal && !started_) {
+    // Runs every time until we can seek to the start sequence
+    return SeekToStartSequence();
+  }
+  while(true) {
+    assert(currentLogReader_);
+    if (currentLogReader_->IsEOF()) {
+      currentLogReader_->UnmarkEOF();
+    }
+    while (RestrictedRead(&record, &scratch)) {
+      if (record.size() < 12) {
+        reporter_.Corruption(
+          record.size(), Status::Corruption("very small log record"));
+        continue;
+      } else {
+        // started_ should be true if called by application
+        assert(internal || started_);
+        // started_ should be false if called internally
+        assert(!internal || !started_);
+        UpdateCurrentWriteBatch(record);
+        if (internal && !started_) {
+          started_ = true;
+        }
+        return;
+      }
+    }
+
+    // Open the next file
+    if (currentFileIndex_ < files_->size() - 1) {
+      ++currentFileIndex_;
+      Status status =OpenLogReader(files_->at(currentFileIndex_).get());
+      if (!status.ok()) {
+        isValid_ = false;
+        currentStatus_ = status;
+        return;
+      }
+    } else {
+      isValid_ = false;
+      if (currentLastSeq_ == dbimpl_->GetLatestSequenceNumber()) {
+        currentStatus_ = Status::OK();
+      } else {
+        currentStatus_ = Status::Corruption("NO MORE DATA LEFT");
+      }
+      return;
+    }
+  }
+}
+
+// Returns true iff 'batch' starts at exactly 'expectedSeq'.  On mismatch
+// logs an informational message (the caller will reseek) and returns false.
+bool TransactionLogIteratorImpl::IsBatchExpected(
+    const WriteBatch* batch,
+    const SequenceNumber expectedSeq) {
+  assert(batch);
+  SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch);
+  if (batchSeq != expectedSeq) {
+    char buf[200];
+    snprintf(buf, sizeof(buf),
+             "Discontinuity in log records. Got seq=%lu, Expected seq=%lu, "
+             "Last flushed seq=%lu.Log iterator will reseek the correct "
+             "batch.",
+             (unsigned long)batchSeq,
+             (unsigned long)expectedSeq,
+             (unsigned long)dbimpl_->GetLatestSequenceNumber());
+    reporter_.Info(buf);
+    return false;
+  }
+  return true;
+}
+
+void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
+  std::unique_ptr<WriteBatch> batch(new WriteBatch());
+  WriteBatchInternal::SetContents(batch.get(), record);
+
+  SequenceNumber expectedSeq = currentLastSeq_ + 1;
+  // If the iterator has started, then confirm that we get continuous batches
+  if (started_ && !IsBatchExpected(batch.get(), expectedSeq)) {
+    // Seek to the batch having expected sequence number
+    if (expectedSeq < files_->at(currentFileIndex_)->StartSequence()) {
+      // Expected batch must lie in the previous log file
+      // Avoid underflow.
+      if (currentFileIndex_ != 0) {
+        currentFileIndex_--;
+      }
+    }
+    startingSequenceNumber_ = expectedSeq;
+    // currentStatus_ will be set to Ok if reseek succeeds
+    currentStatus_ = Status::NotFound("Gap in sequence numbers");
+    return SeekToStartSequence(currentFileIndex_, true);
+  }
+
+  currentBatchSeq_ = WriteBatchInternal::Sequence(batch.get());
+  currentLastSeq_ = currentBatchSeq_ +
+                    WriteBatchInternal::Count(batch.get()) - 1;
+  // currentBatchSeq_ can only change here
+  assert(currentLastSeq_ <= dbimpl_->GetLatestSequenceNumber());
+
+  currentBatch_ = move(batch);
+  isValid_ = true;
+  currentStatus_ = Status::OK();
+}
+
+// Opens 'logFile' and replaces currentLogReader_ with a fresh log::Reader
+// over it.  Checksum verification follows read_options_; initial offset 0.
+Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) {
+  unique_ptr<SequentialFile> file;
+  Status status = OpenLogFile(logFile, &file);
+  if (!status.ok()) {
+    return status;
+  }
+  assert(file);
+  currentLogReader_.reset(new log::Reader(std::move(file), &reporter_,
+                                          read_options_.verify_checksums_, 0));
+  return Status::OK();
+}
+}  //  namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h
new file mode 100644 (file)
index 0000000..319b01c
--- /dev/null
@@ -0,0 +1,120 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+#pragma once
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/transaction_log.h"
+#include "db/db_impl.h"
+#include "db/log_reader.h"
+#include "db/filename.h"
+
+namespace rocksdb {
+
+// Routes log::Reader corruption reports and informational messages to the
+// DB's info log.  'env' and 'info_log' are raw non-owning pointers and must
+// be assigned before use (done in TransactionLogIteratorImpl's ctor).
+struct LogReporter : public log::Reader::Reporter {
+  Env* env;
+  Logger* info_log;
+  // Presumably overrides log::Reader::Reporter::Corruption -- 'override'
+  // is not written here; confirm against the base class declaration.
+  virtual void Corruption(size_t bytes, const Status& s) {
+    Log(info_log, "dropping %zu bytes; %s", bytes, s.ToString().c_str());
+  }
+  virtual void Info(const char* s) {
+    Log(info_log, "%s", s);
+  }
+};
+
+// Concrete LogFile: an immutable value object describing one WAL file
+// (log number, live/archived type, starting sequence number, size).
+class LogFileImpl : public LogFile {
+ public:
+  LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
+              uint64_t sizeBytes) :
+    logNumber_(logNum),
+    type_(logType),
+    startSequence_(startSeq),
+    sizeFileBytes_(sizeBytes) {
+  }
+
+  // Path relative to the WAL directory (the "" prefix is the dir).
+  std::string PathName() const {
+    if (type_ == kArchivedLogFile) {
+      return ArchivedLogFileName("", logNumber_);
+    }
+    return LogFileName("", logNumber_);
+  }
+
+  uint64_t LogNumber() const { return logNumber_; }
+
+  WalFileType Type() const { return type_; }
+
+  SequenceNumber StartSequence() const { return startSequence_; }
+
+  uint64_t SizeFileBytes() const { return sizeFileBytes_; }
+
+  // Orders WAL files by log number (used when sorting file lists).
+  bool operator < (const LogFile& that) const {
+    return LogNumber() < that.LogNumber();
+  }
+
+ private:
+  uint64_t logNumber_;
+  WalFileType type_;
+  SequenceNumber startSequence_;
+  uint64_t sizeFileBytes_;
+
+};
+
+// Iterator over the DB's write-ahead log files, yielding WriteBatches in
+// sequence-number order starting from a caller-chosen sequence number.
+class TransactionLogIteratorImpl : public TransactionLogIterator {
+ public:
+  TransactionLogIteratorImpl(
+      const std::string& dir, const DBOptions* options,
+      const TransactionLogIterator::ReadOptions& read_options,
+      const EnvOptions& soptions, const SequenceNumber seqNum,
+      std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl);
+
+  // NOTE(review): these four presumably override the TransactionLogIterator
+  // interface but are not marked 'override' -- confirm against base class.
+  virtual bool Valid();
+
+  virtual void Next();
+
+  virtual Status status();
+
+  virtual BatchResult GetBatch();
+
+ private:
+  const std::string& dir_;
+  const DBOptions* options_;
+  const TransactionLogIterator::ReadOptions read_options_;
+  const EnvOptions& soptions_;
+  SequenceNumber startingSequenceNumber_;
+  std::unique_ptr<VectorLogPtr> files_;
+  bool started_;
+  bool isValid_;  // not valid when it starts of.
+  Status currentStatus_;
+  size_t currentFileIndex_;
+  std::unique_ptr<WriteBatch> currentBatch_;
+  unique_ptr<log::Reader> currentLogReader_;
+  Status OpenLogFile(const LogFile* logFile, unique_ptr<SequentialFile>* file);
+  LogReporter reporter_;
+  SequenceNumber currentBatchSeq_; // sequence number at start of current batch
+  SequenceNumber currentLastSeq_; // last sequence in the current batch
+  DBImpl const * const dbimpl_; // The db on whose log files this iterates
+
+  // Reads from transaction log only if the writebatch record has been written
+  bool RestrictedRead(Slice* record, std::string* scratch);
+  // Seeks to startingSequenceNumber reading from startFileIndex in files_.
+  // If strict is set,then must get a batch starting with startingSequenceNumber
+  void SeekToStartSequence(uint64_t startFileIndex = 0, bool strict = false);
+  // Implementation of Next. SeekToStartSequence calls it internally with
+  // internal=true to let it find next entry even if it has to jump gaps because
+  // the iterator may start off from the first available entry but promises to
+  // be continuous after that
+  void NextImpl(bool internal = false);
+  // Check if batch is expected, else return false
+  bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expectedSeq);
+  // Update current batch if a continuous batch is found, else return false
+  void UpdateCurrentWriteBatch(const Slice& record);
+  Status OpenLogReader(const LogFile* file);
+};
+}  //  namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/db/version_edit.cc b/db/version_edit.cc
new file mode 100644 (file)
index 0000000..24d7f0d
--- /dev/null
@@ -0,0 +1,364 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/version_set.h"
+#include "util/coding.h"
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+// Tag numbers for serialized VersionEdit.  These numbers are written to
+// disk and should not be changed.
+enum Tag {
+  // Tags 1-7 and 9 are inherited from LevelDB's manifest format.
+  kComparator           = 1,
+  kLogNumber            = 2,
+  kNextFileNumber       = 3,
+  kLastSequence         = 4,
+  kCompactPointer       = 5,
+  kDeletedFile          = 6,
+  kNewFile              = 7,
+  // 8 was used for large value refs
+  kPrevLogNumber        = 9,
+
+  // these are new formats divergent from open source leveldb
+  kNewFile2             = 100,  // store smallest & largest seqno
+
+  kColumnFamily         = 200,  // specify column family for version edit
+  kColumnFamilyAdd      = 201,  // create column family; name as payload
+  kColumnFamilyDrop     = 202,  // drop the current column family; no payload
+  kMaxColumnFamily      = 203,
+};
+
+// Resets every field to its empty/default state so the edit can be reused;
+// called by the constructor as well.
+void VersionEdit::Clear() {
+  comparator_.clear();
+  max_level_ = 0;
+  log_number_ = 0;
+  prev_log_number_ = 0;
+  last_sequence_ = 0;
+  next_file_number_ = 0;
+  max_column_family_ = 0;
+  has_comparator_ = false;
+  has_log_number_ = false;
+  has_prev_log_number_ = false;
+  has_next_file_number_ = false;
+  has_last_sequence_ = false;
+  has_max_column_family_ = false;
+  deleted_files_.clear();
+  new_files_.clear();
+  column_family_ = 0;
+  // These members are bools; use bool literals instead of 0.
+  is_column_family_add_ = false;
+  is_column_family_drop_ = false;
+  column_family_name_.clear();
+}
+
+// Serializes this edit into 'dst' as a sequence of (varint32 tag, payload)
+// records; payload layouts per tag match the Tag enum above and must stay
+// byte-compatible with DecodeFrom().
+void VersionEdit::EncodeTo(std::string* dst) const {
+  if (has_comparator_) {
+    PutVarint32(dst, kComparator);
+    PutLengthPrefixedSlice(dst, comparator_);
+  }
+  if (has_log_number_) {
+    PutVarint32(dst, kLogNumber);
+    PutVarint64(dst, log_number_);
+  }
+  if (has_prev_log_number_) {
+    PutVarint32(dst, kPrevLogNumber);
+    PutVarint64(dst, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    PutVarint32(dst, kNextFileNumber);
+    PutVarint64(dst, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    PutVarint32(dst, kLastSequence);
+    PutVarint64(dst, last_sequence_);
+  }
+  if (has_max_column_family_) {
+    PutVarint32(dst, kMaxColumnFamily);
+    PutVarint32(dst, max_column_family_);
+  }
+
+  for (const auto& deleted : deleted_files_) {
+    PutVarint32(dst, kDeletedFile);
+    PutVarint32(dst, deleted.first /* level */);
+    PutVarint64(dst, deleted.second /* file number */);
+  }
+
+  // New files are always written with the newer kNewFile2 record, which
+  // additionally carries the smallest/largest sequence numbers.
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    PutVarint32(dst, kNewFile2);
+    PutVarint32(dst, new_files_[i].first);  // level
+    PutVarint64(dst, f.number);
+    PutVarint64(dst, f.file_size);
+    PutLengthPrefixedSlice(dst, f.smallest.Encode());
+    PutLengthPrefixedSlice(dst, f.largest.Encode());
+    PutVarint64(dst, f.smallest_seqno);
+    PutVarint64(dst, f.largest_seqno);
+  }
+
+  // 0 is default and does not need to be explicitly written
+  if (column_family_ != 0) {
+    PutVarint32(dst, kColumnFamily);
+    PutVarint32(dst, column_family_);
+  }
+
+  if (is_column_family_add_) {
+    PutVarint32(dst, kColumnFamilyAdd);
+    PutLengthPrefixedSlice(dst, Slice(column_family_name_));
+  }
+
+  if (is_column_family_drop_) {
+    PutVarint32(dst, kColumnFamilyDrop);
+  }
+}
+
+// Parses a length-prefixed slice from 'input' and decodes it into 'dst'.
+// Returns false when the input is truncated.
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+  Slice encoded;
+  if (!GetLengthPrefixedSlice(input, &encoded)) {
+    return false;
+  }
+  dst->DecodeFrom(encoded);
+  return true;
+}
+
+// Parses a varint32 level from 'input' into *level, tracking the largest
+// level seen in max_level_.  Returns false on truncated input; 'msg' is
+// accepted for signature compatibility but never written here.
+bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) {
+  uint32_t parsed;
+  if (!GetVarint32(input, &parsed)) {
+    return false;
+  }
+  *level = parsed;
+  if (*level > max_level_) {
+    max_level_ = *level;
+  }
+  return true;
+}
+
+// Parses a record produced by EncodeTo() into this edit (after Clear()).
+// Stops at the first malformed payload or unknown tag and returns a
+// Corruption status naming the offending field; OK otherwise.
+Status VersionEdit::DecodeFrom(const Slice& src) {
+  Clear();
+  Slice input = src;
+  const char* msg = nullptr;
+  uint32_t tag;
+
+  // Temporary storage for parsing
+  int level;
+  uint64_t number;
+  FileMetaData f;
+  Slice str;
+  InternalKey key;
+
+  while (msg == nullptr && GetVarint32(&input, &tag)) {
+    switch (tag) {
+      case kComparator:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          comparator_ = str.ToString();
+          has_comparator_ = true;
+        } else {
+          msg = "comparator name";
+        }
+        break;
+
+      case kLogNumber:
+        if (GetVarint64(&input, &log_number_)) {
+          has_log_number_ = true;
+        } else {
+          msg = "log number";
+        }
+        break;
+
+      case kPrevLogNumber:
+        if (GetVarint64(&input, &prev_log_number_)) {
+          has_prev_log_number_ = true;
+        } else {
+          msg = "previous log number";
+        }
+        break;
+
+      case kNextFileNumber:
+        if (GetVarint64(&input, &next_file_number_)) {
+          has_next_file_number_ = true;
+        } else {
+          msg = "next file number";
+        }
+        break;
+
+      case kLastSequence:
+        if (GetVarint64(&input, &last_sequence_)) {
+          has_last_sequence_ = true;
+        } else {
+          msg = "last sequence number";
+        }
+        break;
+
+      case kMaxColumnFamily:
+        if (GetVarint32(&input, &max_column_family_)) {
+          has_max_column_family_ = true;
+        } else {
+          msg = "max column family";
+        }
+        break;
+
+      case kCompactPointer:
+        if (GetLevel(&input, &level, &msg) &&
+            GetInternalKey(&input, &key)) {
+          // we don't use compact pointers anymore,
+          // but we should not fail if they are still
+          // in manifest
+        } else {
+          if (!msg) {
+            msg = "compaction pointer";
+          }
+        }
+        break;
+
+      case kDeletedFile:
+        if (GetLevel(&input, &level, &msg) &&
+            GetVarint64(&input, &number)) {
+          deleted_files_.insert(std::make_pair(level, number));
+        } else {
+          if (!msg) {
+            msg = "deleted file";
+          }
+        }
+        break;
+
+      // Old-style new-file record without sequence numbers; still accepted
+      // for manifests written before kNewFile2 existed.
+      case kNewFile:
+        if (GetLevel(&input, &level, &msg) &&
+            GetVarint64(&input, &f.number) &&
+            GetVarint64(&input, &f.file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest)) {
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          if (!msg) {
+            msg = "new-file entry";
+          }
+        }
+        break;
+
+      case kNewFile2:
+        if (GetLevel(&input, &level, &msg) &&
+            GetVarint64(&input, &f.number) &&
+            GetVarint64(&input, &f.file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest) &&
+            GetVarint64(&input, &f.smallest_seqno) &&
+            GetVarint64(&input, &f.largest_seqno) ) {
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          if (!msg) {
+            msg = "new-file2 entry";
+          }
+        }
+        break;
+
+      case kColumnFamily:
+        if (!GetVarint32(&input, &column_family_)) {
+          if (!msg) {
+            msg = "set column family id";
+          }
+        }
+        break;
+
+      case kColumnFamilyAdd:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          is_column_family_add_ = true;
+          column_family_name_ = str.ToString();
+        } else {
+          if (!msg) {
+            msg = "column family add";
+          }
+        }
+        break;
+
+      case kColumnFamilyDrop:
+        is_column_family_drop_ = true;
+        break;
+
+      default:
+        msg = "unknown tag";
+        break;
+    }
+  }
+
+  // Leftover bytes after the last parsed tag also count as corruption.
+  if (msg == nullptr && !input.empty()) {
+    msg = "invalid tag";
+  }
+
+  Status result;
+  if (msg != nullptr) {
+    result = Status::Corruption("VersionEdit", msg);
+  }
+  return result;
+}
+
+std::string VersionEdit::DebugString(bool hex_key) const {
+  std::string r;
+  r.append("VersionEdit {");
+  if (has_comparator_) {
+    r.append("\n  Comparator: ");
+    r.append(comparator_);
+  }
+  if (has_log_number_) {
+    r.append("\n  LogNumber: ");
+    AppendNumberTo(&r, log_number_);
+  }
+  if (has_prev_log_number_) {
+    r.append("\n  PrevLogNumber: ");
+    AppendNumberTo(&r, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    r.append("\n  NextFile: ");
+    AppendNumberTo(&r, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    r.append("\n  LastSeq: ");
+    AppendNumberTo(&r, last_sequence_);
+  }
+  for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+       iter != deleted_files_.end();
+       ++iter) {
+    r.append("\n  DeleteFile: ");
+    AppendNumberTo(&r, iter->first);
+    r.append(" ");
+    AppendNumberTo(&r, iter->second);
+  }
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    r.append("\n  AddFile: ");
+    AppendNumberTo(&r, new_files_[i].first);
+    r.append(" ");
+    AppendNumberTo(&r, f.number);
+    r.append(" ");
+    AppendNumberTo(&r, f.file_size);
+    r.append(" ");
+    r.append(f.smallest.DebugString(hex_key));
+    r.append(" .. ");
+    r.append(f.largest.DebugString(hex_key));
+  }
+  r.append("\n  ColumnFamily: ");
+  AppendNumberTo(&r, column_family_);
+  if (is_column_family_add_) {
+    r.append("\n  ColumnFamilyAdd: ");
+    r.append(column_family_name_);
+  }
+  if (is_column_family_drop_) {
+    r.append("\n  ColumnFamilyDrop");
+  }
+  if (has_max_column_family_) {
+    r.append("\n  MaxColumnFamily: ");
+    AppendNumberTo(&r, max_column_family_);
+  }
+  r.append("\n}\n");
+  return r;
+}
+
+}  // namespace rocksdb
diff --git a/db/version_edit.h b/db/version_edit.h
new file mode 100644 (file)
index 0000000..acaec8a
--- /dev/null
@@ -0,0 +1,176 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <set>
+#include <utility>
+#include <vector>
+#include <string>
+#include "rocksdb/cache.h"
+#include "db/dbformat.h"
+
+namespace rocksdb {
+
+class VersionSet;
+
+struct FileMetaData {
+  int refs;
+  int allowed_seeks;          // Seeks allowed until compaction
+  uint64_t number;
+  uint64_t file_size;         // File size in bytes
+  InternalKey smallest;       // Smallest internal key served by table
+  InternalKey largest;        // Largest internal key served by table
+  bool being_compacted;       // Is this file undergoing compaction?
+  SequenceNumber smallest_seqno;// The smallest seqno in this file
+  SequenceNumber largest_seqno; // The largest seqno in this file
+
+  // Needs to be disposed when refs becomes 0.
+  Cache::Handle* table_reader_handle;
+  // Table reader in table_reader_handle
+  TableReader* table_reader;
+
+  FileMetaData(uint64_t number, uint64_t file_size)
+      : refs(0),
+        allowed_seeks(1 << 30),
+        number(number),
+        file_size(file_size),
+        being_compacted(false),
+        table_reader_handle(nullptr),
+        table_reader(nullptr) {}
+  FileMetaData() : FileMetaData(0, 0) {}
+};
+
+class VersionEdit {
+ public:
+  VersionEdit() { Clear(); }
+  ~VersionEdit() { }
+
+  void Clear();
+
+  void SetComparatorName(const Slice& name) {
+    has_comparator_ = true;
+    comparator_ = name.ToString();
+  }
+  void SetLogNumber(uint64_t num) {
+    has_log_number_ = true;
+    log_number_ = num;
+  }
+  void SetPrevLogNumber(uint64_t num) {
+    has_prev_log_number_ = true;
+    prev_log_number_ = num;
+  }
+  void SetNextFile(uint64_t num) {
+    has_next_file_number_ = true;
+    next_file_number_ = num;
+  }
+  void SetLastSequence(SequenceNumber seq) {
+    has_last_sequence_ = true;
+    last_sequence_ = seq;
+  }
+  void SetMaxColumnFamily(uint32_t max_column_family) {
+    has_max_column_family_ = true;
+    max_column_family_ = max_column_family;
+  }
+
+  // Add the specified file at the specified number.
+  // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
+  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+  void AddFile(int level, uint64_t file,
+               uint64_t file_size,
+               const InternalKey& smallest,
+               const InternalKey& largest,
+               const SequenceNumber& smallest_seqno,
+               const SequenceNumber& largest_seqno) {
+    assert(smallest_seqno <= largest_seqno);
+    FileMetaData f;
+    f.number = file;
+    f.file_size = file_size;
+    f.smallest = smallest;
+    f.largest = largest;
+    f.smallest_seqno = smallest_seqno;
+    f.largest_seqno = largest_seqno;
+    new_files_.push_back(std::make_pair(level, f));
+  }
+
+  // Delete the specified "file" from the specified "level".
+  void DeleteFile(int level, uint64_t file) {
+    deleted_files_.insert({level, file});
+  }
+
+  // Number of edits
+  int NumEntries() {
+    return new_files_.size() + deleted_files_.size();
+  }
+
+  bool IsColumnFamilyManipulation() {
+    return is_column_family_add_ || is_column_family_drop_;
+  }
+
+  void SetColumnFamily(uint32_t column_family_id) {
+    column_family_ = column_family_id;
+  }
+
+  // set column family ID by calling SetColumnFamily()
+  void AddColumnFamily(const std::string& name) {
+    assert(!is_column_family_drop_);
+    assert(!is_column_family_add_);
+    assert(NumEntries() == 0);
+    is_column_family_add_ = true;
+    column_family_name_ = name;
+  }
+
+  // set column family ID by calling SetColumnFamily()
+  void DropColumnFamily() {
+    assert(!is_column_family_drop_);
+    assert(!is_column_family_add_);
+    assert(NumEntries() == 0);
+    is_column_family_drop_ = true;
+  }
+
+  void EncodeTo(std::string* dst) const;
+  Status DecodeFrom(const Slice& src);
+
+  std::string DebugString(bool hex_key = false) const;
+
+ private:
+  friend class VersionSet;
+
+  typedef std::set< std::pair<int, uint64_t>> DeletedFileSet;
+
+  bool GetLevel(Slice* input, int* level, const char** msg);
+
+  int max_level_;
+  std::string comparator_;
+  uint64_t log_number_;
+  uint64_t prev_log_number_;
+  uint64_t next_file_number_;
+  uint32_t max_column_family_;
+  SequenceNumber last_sequence_;
+  bool has_comparator_;
+  bool has_log_number_;
+  bool has_prev_log_number_;
+  bool has_next_file_number_;
+  bool has_last_sequence_;
+  bool has_max_column_family_;
+
+  DeletedFileSet deleted_files_;
+  std::vector<std::pair<int, FileMetaData>> new_files_;
+
+  // Each version edit record should have column_family_id set
+  // If it's not set, it is default (0)
+  uint32_t column_family_;
+  // a version edit can be either column_family add or
+  // column_family drop. If it's column family add,
+  // it also includes column family name.
+  bool is_column_family_drop_;
+  bool is_column_family_add_;
+  std::string column_family_name_;
+};
+
+}  // namespace rocksdb
diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc
new file mode 100644 (file)
index 0000000..7842b32
--- /dev/null
@@ -0,0 +1,65 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+// Round-trips 'edit' through EncodeTo/DecodeFrom and checks that
+// re-encoding the parsed edit reproduces the original bytes exactly.
+static void TestEncodeDecode(const VersionEdit& edit) {
+  std::string encoded, encoded2;
+  edit.EncodeTo(&encoded);
+  VersionEdit parsed;
+  Status s = parsed.DecodeFrom(encoded);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  parsed.EncodeTo(&encoded2);
+  ASSERT_EQ(encoded, encoded2);
+}
+
+// Empty fixture type; the TEST macro from util/testharness keys off it.
+class VersionEditTest { };
+
+// Round-trips an edit that accumulates file adds/deletes plus the scalar
+// metadata fields, re-checking after each incremental change.
+TEST(VersionEditTest, EncodeDecode) {
+  static const uint64_t kBig = 1ull << 50;
+
+  VersionEdit edit;
+  for (int i = 0; i < 4; i++) {
+    TestEncodeDecode(edit);
+    edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
+                 InternalKey("foo", kBig + 500 + i, kTypeValue),
+                 InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
+                 kBig + 500 + i,
+                 kBig + 600 + i);
+    edit.DeleteFile(4, kBig + 700 + i);
+  }
+
+  edit.SetComparatorName("foo");
+  edit.SetLogNumber(kBig + 100);
+  edit.SetNextFile(kBig + 200);
+  edit.SetLastSequence(kBig + 1000);
+  TestEncodeDecode(edit);
+}
+
+// Round-trips the column-family add and drop record variants.
+TEST(VersionEditTest, ColumnFamilyTest) {
+  VersionEdit edit;
+  edit.SetColumnFamily(2);
+  edit.AddColumnFamily("column_family");
+  edit.SetMaxColumnFamily(5);
+  TestEncodeDecode(edit);
+
+  edit.Clear();
+  edit.SetColumnFamily(3);
+  edit.DropColumnFamily();
+  TestEncodeDecode(edit);
+}
+
+}  // namespace rocksdb
+
+// Test entry point: runs every TEST registered via util/testharness.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/version_set.cc b/db/version_set.cc
new file mode 100644 (file)
index 0000000..00d9caf
--- /dev/null
@@ -0,0 +1,2780 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#define __STDC_FORMAT_MACROS
+#include "db/version_set.h"
+
+#include <inttypes.h>
+#include <algorithm>
+#include <map>
+#include <set>
+#include <climits>
+#include <unordered_map>
+#include <stdio.h>
+
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/table_cache.h"
+#include "db/compaction.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "table/table_reader.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "table/format.h"
+#include "table/plain_table_factory.h"
+#include "table/meta_blocks.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+// Sums the sizes of the files in `files`.  NOTE(review): the loop condition
+// stops at the first nullptr entry rather than skipping it, so entries after
+// a null are not counted -- presumably callers never pass interior nulls;
+// confirm.
+static uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  uint64_t sum = 0;
+  for (size_t i = 0; i < files.size() && files[i]; i++) {
+    sum += files[i]->file_size;
+  }
+  return sum;
+}
+
+// Unlinks this version from the VersionSet's circular doubly-linked list
+// and drops one reference from every file it points at.  Files whose
+// refcount reaches zero are handed to vset_->obsolete_files_ for later
+// deletion; their cached table handles are released here first.
+Version::~Version() {
+  assert(refs_ == 0);
+
+  // Remove from linked list
+  prev_->next_ = next_;
+  next_->prev_ = prev_;
+
+  // Drop references to files
+  for (int level = 0; level < num_levels_; level++) {
+    for (size_t i = 0; i < files_[level].size(); i++) {
+      FileMetaData* f = files_[level][i];
+      assert(f->refs > 0);
+      f->refs--;
+      if (f->refs <= 0) {
+        if (f->table_reader_handle) {
+          // Release the table cache handle before the file goes obsolete.
+          cfd_->table_cache()->ReleaseHandle(f->table_reader_handle);
+          f->table_reader_handle = nullptr;
+        }
+        vset_->obsolete_files_.push_back(f);
+      }
+    }
+  }
+  delete[] files_;
+}
+
+int FindFileInRange(const InternalKeyComparator& icmp,
+    const std::vector<FileMetaData*>& files,
+    const Slice& key,
+    uint32_t left,
+    uint32_t right) {
+  while (left < right) {
+    uint32_t mid = (left + right) / 2;
+    const FileMetaData* f = files[mid];
+    if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) {
+      // Key at "mid.largest" is < "target".  Therefore all
+      // files at or before "mid" are uninteresting.
+      left = mid + 1;
+    } else {
+      // Key at "mid.largest" is >= "target".  Therefore all files
+      // after "mid" are uninteresting.
+      right = mid;
+    }
+  }
+  return right;
+}
+
+// Returns the smallest index i such that files[i]->largest >= key under
+// `icmp`, or files.size() if no such file exists.  `files` must be sorted
+// by largest key (i.e. a level > 0 file list).
+int FindFile(const InternalKeyComparator& icmp,
+             const std::vector<FileMetaData*>& files,
+             const Slice& key) {
+  return FindFileInRange(icmp, files, key, 0, files.size());
+}
+
+// Returns true iff *user_key sorts strictly after f's largest user key.
+// A nullptr user_key stands for "-infinity" and is never after any file.
+static bool AfterFile(const Comparator* ucmp,
+                      const Slice* user_key, const FileMetaData* f) {
+  if (user_key == nullptr) {
+    return false;
+  }
+  return ucmp->Compare(*user_key, f->largest.user_key()) > 0;
+}
+
+// Returns true iff *user_key sorts strictly before f's smallest user key.
+// A nullptr user_key stands for "+infinity" and is never before any file.
+static bool BeforeFile(const Comparator* ucmp,
+                       const Slice* user_key, const FileMetaData* f) {
+  if (user_key == nullptr) {
+    return false;
+  }
+  return ucmp->Compare(*user_key, f->smallest.user_key()) < 0;
+}
+
+// Returns true iff some file in `files` overlaps the user-key range
+// [*smallest_user_key, *largest_user_key].  nullptr bounds mean -infinity
+// and +infinity respectively.  When `disjoint_sorted_files` is true
+// (levels > 0) the files are sorted and non-overlapping, so a binary
+// search is used; otherwise every file is checked.
+bool SomeFileOverlapsRange(
+    const InternalKeyComparator& icmp,
+    bool disjoint_sorted_files,
+    const std::vector<FileMetaData*>& files,
+    const Slice* smallest_user_key,
+    const Slice* largest_user_key) {
+  const Comparator* ucmp = icmp.user_comparator();
+  if (!disjoint_sorted_files) {
+    // Need to check against all files
+    for (size_t i = 0; i < files.size(); i++) {
+      const FileMetaData* f = files[i];
+      if (AfterFile(ucmp, smallest_user_key, f) ||
+          BeforeFile(ucmp, largest_user_key, f)) {
+        // No overlap
+      } else {
+        return true;  // Overlap
+      }
+    }
+    return false;
+  }
+
+  // Binary search over file list
+  uint32_t index = 0;
+  if (smallest_user_key != nullptr) {
+    // Find the earliest possible internal key for smallest_user_key
+    InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek);
+    index = FindFile(icmp, files, small.Encode());
+  }
+
+  if (index >= files.size()) {
+    // beginning of range is after all files, so no overlap.
+    return false;
+  }
+
+  // Overlap unless the whole range ends before this file starts.
+  return !BeforeFile(ucmp, largest_user_key, files[index]);
+}
+
+namespace {
+// Used for LevelFileNumIterator to pass "block handle" value,
+// which actually means file information in this iterator.
+// It contains subset of fields of FileMetaData, that is sufficient
+// for table cache to use.  The struct is passed by copying its raw bytes
+// through a Slice (see LevelFileNumIterator::value()), so it must stay
+// trivially copyable.
+struct EncodedFileMetaData {
+  uint64_t number;   // file number
+  uint64_t file_size;   // file size
+  TableReader* table_reader;   // cached table reader
+};
+}  // namespace
+
+// An internal iterator.  For a given version/level pair, yields
+// information about the files in the level.  For a given entry, key()
+// is the largest key that occurs in the file, and value() is the raw
+// bytes of an EncodedFileMetaData struct holding the file number, file
+// size and cached table reader.
+class Version::LevelFileNumIterator : public Iterator {
+ public:
+  LevelFileNumIterator(const InternalKeyComparator& icmp,
+                       const std::vector<FileMetaData*>* flist)
+      : icmp_(icmp),
+        flist_(flist),
+        index_(flist->size()) {        // Marks as invalid
+  }
+  virtual bool Valid() const {
+    return index_ < flist_->size();
+  }
+  virtual void Seek(const Slice& target) {
+    index_ = FindFile(icmp_, *flist_, target);
+  }
+  virtual void SeekToFirst() { index_ = 0; }
+  virtual void SeekToLast() {
+    index_ = flist_->empty() ? 0 : flist_->size() - 1;
+  }
+  virtual void Next() {
+    assert(Valid());
+    index_++;
+  }
+  virtual void Prev() {
+    assert(Valid());
+    if (index_ == 0) {
+      index_ = flist_->size();  // Marks as invalid
+    } else {
+      index_--;
+    }
+  }
+  Slice key() const {
+    assert(Valid());
+    return (*flist_)[index_]->largest.Encode();
+  }
+  Slice value() const {
+    assert(Valid());
+    auto* file_meta = (*flist_)[index_];
+    current_value_.number = file_meta->number;
+    current_value_.file_size = file_meta->file_size;
+    current_value_.table_reader = file_meta->table_reader;
+    return Slice(reinterpret_cast<const char*>(&current_value_),
+                 sizeof(EncodedFileMetaData));
+  }
+  virtual Status status() const { return Status::OK(); }
+ private:
+  const InternalKeyComparator icmp_;
+  const std::vector<FileMetaData*>* const flist_;
+  uint32_t index_;
+  mutable EncodedFileMetaData current_value_;
+};
+
+// TwoLevelIteratorState implementation: given an EncodedFileMetaData blob
+// produced by LevelFileNumIterator, opens the corresponding table via the
+// table cache and returns an iterator over its contents.
+class Version::LevelFileIteratorState : public TwoLevelIteratorState {
+ public:
+  LevelFileIteratorState(TableCache* table_cache,
+    const ReadOptions& read_options, const EnvOptions& env_options,
+    const InternalKeyComparator& icomparator, bool for_compaction,
+    bool prefix_enabled)
+    : TwoLevelIteratorState(prefix_enabled),
+      table_cache_(table_cache), read_options_(read_options),
+      env_options_(env_options), icomparator_(icomparator),
+      for_compaction_(for_compaction) {}
+
+  // `meta_handle` must be the raw bytes of an EncodedFileMetaData; anything
+  // of a different size is treated as corruption.
+  Iterator* NewSecondaryIterator(const Slice& meta_handle) override {
+    if (meta_handle.size() != sizeof(EncodedFileMetaData)) {
+      return NewErrorIterator(
+          Status::Corruption("FileReader invoked with unexpected value"));
+    } else {
+      const EncodedFileMetaData* encoded_meta =
+          reinterpret_cast<const EncodedFileMetaData*>(meta_handle.data());
+      FileMetaData meta(encoded_meta->number, encoded_meta->file_size);
+      meta.table_reader = encoded_meta->table_reader;
+      return table_cache_->NewIterator(read_options_, env_options_,
+          icomparator_, meta, nullptr /* don't need reference to table*/,
+          for_compaction_);
+    }
+  }
+
+  // Always reports a possible match: file-level prefix filtering is not
+  // performed here (the argument is intentionally unused).
+  bool PrefixMayMatch(const Slice& internal_key) override {
+    return true;
+  }
+
+ private:
+  TableCache* table_cache_;
+  const ReadOptions read_options_;
+  const EnvOptions& env_options_;
+  const InternalKeyComparator& icomparator_;
+  bool for_compaction_;
+};
+
+// Collects the TableProperties of every file in this version into *props,
+// keyed by table file name.  Properties are read from the table cache when
+// possible; otherwise the properties block is read directly from the file.
+// Returns the first non-Incomplete error encountered.
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+  auto table_cache = cfd_->table_cache();
+  auto options = cfd_->options();
+  for (int level = 0; level < num_levels_; level++) {
+    for (const auto& file_meta : files_[level]) {
+      auto fname = TableFileName(vset_->dbname_, file_meta->number);
+      // 1. If the table is already present in table cache, load table
+      // properties from there.
+      std::shared_ptr<const TableProperties> table_properties;
+      Status s = table_cache->GetTableProperties(
+          vset_->storage_options_, cfd_->internal_comparator(), *file_meta,
+          &table_properties, true /* no io */);
+      if (s.ok()) {
+        props->insert({fname, table_properties});
+        continue;
+      }
+
+      // We only ignore error type `Incomplete` since it's by design that we
+      // disallow table when it's not in table cache.
+      if (!s.IsIncomplete()) {
+        return s;
+      }
+
+      // 2. Table is not present in table cache, we'll read the table properties
+      // directly from the properties block in the file.
+      std::unique_ptr<RandomAccessFile> file;
+      s = options->env->NewRandomAccessFile(fname, &file,
+                                            vset_->storage_options_);
+      if (!s.ok()) {
+        return s;
+      }
+
+      TableProperties* raw_table_properties;
+      // By setting the magic number to kInvalidTableMagicNumber, we can
+      // bypass the magic number check in the footer.
+      s = ReadTableProperties(
+          file.get(), file_meta->file_size,
+          Footer::kInvalidTableMagicNumber /* table's magic number */,
+          vset_->env_, options->info_log.get(), &raw_table_properties);
+      if (!s.ok()) {
+        return s;
+      }
+      RecordTick(options->statistics.get(),
+                 NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
+
+      // Transfer ownership of the raw pointer into the collection.
+      props->insert({fname, std::shared_ptr<const TableProperties>(
+                                raw_table_properties)});
+    }
+  }
+
+  return Status::OK();
+}
+
+// Appends to *iters one iterator per level-0 file plus one concatenating
+// two-level iterator per non-empty level > 0, which together cover the
+// entire contents of this version.
+void Version::AddIterators(const ReadOptions& read_options,
+                           const EnvOptions& soptions,
+                           std::vector<Iterator*>* iters) {
+  // Merge all level zero files together since they may overlap
+  for (const FileMetaData* file : files_[0]) {
+    iters->push_back(cfd_->table_cache()->NewIterator(
+        read_options, soptions, cfd_->internal_comparator(), *file));
+  }
+
+  // For levels > 0, we can use a concatenating iterator that sequentially
+  // walks through the non-overlapping files in the level, opening them
+  // lazily.
+  for (int level = 1; level < num_levels_; level++) {
+    if (!files_[level].empty()) {
+      iters->push_back(NewTwoLevelIterator(new LevelFileIteratorState(
+          cfd_->table_cache(), read_options, soptions,
+          cfd_->internal_comparator(), false /* for_compaction */,
+          cfd_->options()->prefix_extractor != nullptr),
+        new LevelFileNumIterator(cfd_->internal_comparator(), &files_[level])));
+    }
+  }
+}
+
+// Callback from TableCache::Get()
+// State shared between Version::Get and the TableCache::Get callbacks
+// (SaveValue / MarkKeyMayExist) below.
+namespace {
+enum SaverState {
+  kNotFound,   // no entry for the key seen yet
+  kFound,      // value located (or "may exist" for KeyMayExist)
+  kDeleted,    // a deletion entry terminated the search
+  kCorrupt,    // a merge failed or the key was corrupted
+  kMerge // saver contains the current merge result (the operands)
+};
+// Mutable scratch passed (as void*) through TableCache::Get.
+struct Saver {
+  SaverState state;
+  const Comparator* ucmp;       // user-key comparator
+  Slice user_key;               // the key being looked up
+  bool* value_found; // Is value set correctly? Used by KeyMayExist
+  std::string* value;           // output buffer for the result
+  const MergeOperator* merge_operator;
+  // the merge operations encountered;
+  MergeContext* merge_context;
+  Logger* logger;
+  bool didIO;    // did we do any disk io?
+  Statistics* statistics;
+};
+}
+
+// Called from TableCache::Get and Table::Get when the file/block in which
+// the key may exist is not present in the TableCache/BlockCache and the
+// caller is not permitted to do IO to be certain.  Sets state=kFound and
+// *value_found=false to tell the caller that the key may exist but could
+// not be confirmed from memory.
+static void MarkKeyMayExist(void* arg) {
+  Saver* s = reinterpret_cast<Saver*>(arg);
+  s->state = kFound;
+  if (s->value_found != nullptr) {
+    *(s->value_found) = false;
+  }
+}
+
+// Callback invoked by the table layer for each candidate entry, newest
+// first.  Returns true to continue the scan (only while collecting merge
+// operands), false to stop.  A kTypeValue or kTypeDeletion entry ends the
+// search, folding any pending merge operands via FullMerge; a failed merge
+// sets state to kCorrupt.
+static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
+                      const Slice& v, bool didIO) {
+  Saver* s = reinterpret_cast<Saver*>(arg);
+  MergeContext* merge_contex = s->merge_context;
+  std::string merge_result;  // temporary area for merge results later
+
+  assert(s != nullptr && merge_contex != nullptr);
+
+  // TODO: didIO and Merge?
+  s->didIO = didIO;
+  if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
+    // Key matches. Process it
+    switch (parsed_key.type) {
+      case kTypeValue:
+        if (kNotFound == s->state) {
+          s->state = kFound;
+          s->value->assign(v.data(), v.size());
+        } else if (kMerge == s->state) {
+          assert(s->merge_operator != nullptr);
+          s->state = kFound;
+          if (!s->merge_operator->FullMerge(s->user_key, &v,
+                                            merge_contex->GetOperands(),
+                                            s->value, s->logger)) {
+            RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
+            s->state = kCorrupt;
+          }
+        } else {
+          assert(false);
+        }
+        return false;
+
+      case kTypeDeletion:
+        if (kNotFound == s->state) {
+          s->state = kDeleted;
+        } else if (kMerge == s->state) {
+          // Deletion ends the history: merge the operands against nullptr.
+          s->state = kFound;
+          if (!s->merge_operator->FullMerge(s->user_key, nullptr,
+                                            merge_contex->GetOperands(),
+                                            s->value, s->logger)) {
+            RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
+            s->state = kCorrupt;
+          }
+        } else {
+          assert(false);
+        }
+        return false;
+
+      case kTypeMerge:
+        assert(s->state == kNotFound || s->state == kMerge);
+        s->state = kMerge;
+        merge_contex->PushOperand(v);
+        return true;
+
+      default:
+        assert(false);
+        break;
+    }
+  }
+
+  // s->state could be Corrupt, merge or notfound
+
+  return false;
+}
+
+namespace {
+bool NewestFirst(FileMetaData* a, FileMetaData* b) {
+  return a->number > b->number;
+}
+bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) {
+  if (a->smallest_seqno != b->smallest_seqno) {
+    return a->smallest_seqno > b->smallest_seqno;
+  }
+  if (a->largest_seqno != b->largest_seqno) {
+    return a->largest_seqno > b->largest_seqno;
+  }
+  // Break ties by file number
+  return NewestFirst(a, b);
+}
+bool BySmallestKey(FileMetaData* a, FileMetaData* b,
+                   const InternalKeyComparator* cmp) {
+  int r = cmp->Compare(a->smallest, b->smallest);
+  if (r != 0) {
+    return (r < 0);
+  }
+  // Break ties by file number
+  return (a->number < b->number);
+}
+}  // anonymous namespace
+
+// Constructs a Version for `cfd`.  cfd may be nullptr only for the dummy
+// sentinel version, in which case every cached pointer below is null and
+// num_levels_ is 0.  The version starts self-linked (next_ == prev_ ==
+// this) with a zero refcount; VersionSet links it into its list later.
+Version::Version(ColumnFamilyData* cfd, VersionSet* vset,
+                 uint64_t version_number)
+    : cfd_(cfd),
+      internal_comparator_((cfd == nullptr) ? nullptr
+                                            : &cfd->internal_comparator()),
+      user_comparator_((cfd == nullptr)
+                           ? nullptr
+                           : internal_comparator_->user_comparator()),
+      table_cache_((cfd == nullptr) ? nullptr : cfd->table_cache()),
+      merge_operator_((cfd == nullptr) ? nullptr
+                                       : cfd->options()->merge_operator.get()),
+      info_log_((cfd == nullptr) ? nullptr : cfd->options()->info_log.get()),
+      db_statistics_((cfd == nullptr) ? nullptr
+                                      : cfd->options()->statistics.get()),
+      vset_(vset),
+      next_(this),
+      prev_(this),
+      refs_(0),
+      // cfd is nullptr if Version is dummy
+      num_levels_(cfd == nullptr ? 0 : cfd->NumberLevels()),
+      files_(new std::vector<FileMetaData*>[num_levels_]),
+      files_by_size_(num_levels_),
+      next_file_to_compact_by_size_(num_levels_),
+      file_to_compact_(nullptr),
+      file_to_compact_level_(-1),
+      compaction_score_(num_levels_),
+      compaction_level_(num_levels_),
+      version_number_(version_number),
+      file_indexer_(num_levels_, cfd == nullptr ?  nullptr
+          : cfd->internal_comparator().user_comparator()) {
+}
+
+// Looks up the value of `k` in this version's files, searching level by
+// level (newest data first).  On return *status is OK (with *value filled
+// in), NotFound, Corruption, or an I/O error; *stats records the first file
+// that needed an extra seek so the caller can feed it to UpdateStats() for
+// seek-triggered compaction.  If *status is MergeInProgress on entry the
+// search starts in merge mode and operands already gathered in
+// *merge_context are folded into the final result.  value_found (optional)
+// supports KeyMayExist: set to false when the key may exist but no IO was
+// permitted to confirm it.
+void Version::Get(const ReadOptions& options,
+                  const LookupKey& k,
+                  std::string* value,
+                  Status* status,
+                  MergeContext* merge_context,
+                  GetStats* stats,
+                  bool* value_found) {
+  Slice ikey = k.internal_key();
+  Slice user_key = k.user_key();
+
+  assert(status->ok() || status->IsMergeInProgress());
+  Saver saver;
+  saver.state = status->ok()? kNotFound : kMerge;
+  saver.ucmp = user_comparator_;
+  saver.user_key = user_key;
+  saver.value_found = value_found;
+  saver.value = value;
+  saver.merge_operator = merge_operator_;
+  saver.merge_context = merge_context;
+  saver.logger = info_log_;
+  saver.didIO = false;
+  saver.statistics = db_statistics_;
+
+  stats->seek_file = nullptr;
+  stats->seek_file_level = -1;
+  FileMetaData* last_file_read = nullptr;
+  int last_file_read_level = -1;
+
+  // We can search level-by-level since entries never hop across
+  // levels. Therefore we are guaranteed that if we find data
+  // in an smaller level, later levels are irrelevant (unless we
+  // are MergeInProgress).
+
+  // The bounds below narrow the binary-search range on each level using
+  // the comparison results from the level above (see FileIndexer).
+  int32_t search_left_bound = 0;
+  int32_t search_right_bound = FileIndexer::kLevelMaxIndex;
+  for (int level = 0; level < num_levels_; ++level) {
+    int num_files = files_[level].size();
+    if (num_files == 0) {
+      // When current level is empty, the search bound generated from upper
+      // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is
+      // also empty.
+      assert(search_left_bound == 0);
+      assert(search_right_bound == -1 ||
+             search_right_bound == FileIndexer::kLevelMaxIndex);
+      // Since current level is empty, it will need to search all files in the
+      // next level
+      search_left_bound = 0;
+      search_right_bound = FileIndexer::kLevelMaxIndex;
+      continue;
+    }
+
+    // Get the list of files to search in this level
+    FileMetaData* const* files = &files_[level][0];
+
+    // Some files may overlap each other. We find
+    // all files that overlap user_key and process them in order from
+    // newest to oldest. In the context of merge-operator,
+    // this can occur at any level. Otherwise, it only occurs
+    // at Level-0 (since Put/Deletes are always compacted into a single entry).
+    int32_t start_index;
+    if (level == 0) {
+      // On Level-0, we read through all files to check for overlap.
+      start_index = 0;
+    } else {
+      // On Level-n (n>=1), files are sorted. Binary search to find the earliest
+      // file whose largest key >= ikey. Search left bound and right bound are
+      // used to narrow the range.
+      if (search_left_bound == search_right_bound) {
+        start_index = search_left_bound;
+      } else if (search_left_bound < search_right_bound) {
+        if (search_right_bound == FileIndexer::kLevelMaxIndex) {
+          search_right_bound = num_files - 1;
+        }
+        start_index = FindFileInRange(cfd_->internal_comparator(),
+            files_[level], ikey, search_left_bound, search_right_bound);
+      } else {
+        // search_left_bound > search_right_bound, key does not exist in this
+        // level. Since no comparison is done in this level, it will need to
+        // search all files in the next level.
+        search_left_bound = 0;
+        search_right_bound = FileIndexer::kLevelMaxIndex;
+        continue;
+      }
+    }
+    // Traverse each relevant file to find the desired key
+#ifndef NDEBUG
+    FileMetaData* prev_file = nullptr;
+#endif
+
+    for (int32_t i = start_index; i < num_files;) {
+      FileMetaData* f = files[i];
+      // Check if key is within a file's range. If search left bound and right
+      // bound point to the same find, we are sure key falls in range.
+      assert(level == 0 || i == start_index ||
+             user_comparator_->Compare(user_key, f->smallest.user_key()) <= 0);
+
+      int cmp_smallest = user_comparator_->Compare(user_key, f->smallest.user_key());
+      int cmp_largest = -1;
+      if (cmp_smallest >= 0) {
+        cmp_largest = user_comparator_->Compare(user_key, f->largest.user_key());
+      }
+
+      // Setup file search bound for the next level based on the comparison
+      // results
+      if (level > 0) {
+        file_indexer_.GetNextLevelIndex(level, i, cmp_smallest, cmp_largest,
+            &search_left_bound, &search_right_bound);
+      }
+      // Key falls out of current file's range
+      if (cmp_smallest < 0 || cmp_largest > 0) {
+        if (level == 0) {
+          ++i;
+          continue;
+        } else {
+          break;
+        }
+      }
+
+#ifndef NDEBUG
+      // Sanity check to make sure that the files are correctly sorted
+      if (prev_file) {
+        if (level != 0) {
+          int comp_sign =
+              internal_comparator_->Compare(prev_file->largest, f->smallest);
+          assert(comp_sign < 0);
+        } else {
+          // level == 0, the current file cannot be newer than the previous one.
+          if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
+            assert(!NewestFirstBySeqNo(f, prev_file));
+          } else {
+            assert(!NewestFirst(f, prev_file));
+          }
+        }
+      }
+      prev_file = f;
+#endif
+      bool tableIO = false;
+      *status = table_cache_->Get(options, *internal_comparator_, *f, ikey,
+                                  &saver, SaveValue, &tableIO, MarkKeyMayExist);
+      // TODO: examine the behavior for corrupted key
+      if (!status->ok()) {
+        return;
+      }
+
+      if (last_file_read != nullptr && stats->seek_file == nullptr) {
+        // We have had more than one seek for this read.  Charge the 1st file.
+        stats->seek_file = last_file_read;
+        stats->seek_file_level = last_file_read_level;
+      }
+
+      // If we did any IO as part of the read, then we remember it because
+      // it is a possible candidate for seek-based compaction. saver.didIO
+      // is true if the block had to be read in from storage and was not
+      // pre-exisiting in the block cache. Also, if this file was not pre-
+      // existing in the table cache and had to be freshly opened that needed
+      // the index blocks to be read-in, then tableIO is true. One thing
+      // to note is that the index blocks are not part of the block cache.
+      if (saver.didIO || tableIO) {
+        last_file_read = f;
+        last_file_read_level = level;
+      }
+
+      switch (saver.state) {
+        case kNotFound:
+          break;      // Keep searching in other files
+        case kFound:
+          return;
+        case kDeleted:
+          *status = Status::NotFound();  // Use empty error message for speed
+          return;
+        case kCorrupt:
+          *status = Status::Corruption("corrupted key for ", user_key);
+          return;
+        case kMerge:
+          break;
+      }
+      if (level > 0 && cmp_largest < 0) {
+        break;
+      } else {
+        ++i;
+      }
+    }
+  }
+
+
+  if (kMerge == saver.state) {
+    // merge_operands are in saver and we hit the beginning of the key history
+    // do a final merge of nullptr and operands;
+    if (merge_operator_->FullMerge(user_key, nullptr,
+                                   saver.merge_context->GetOperands(), value,
+                                   info_log_)) {
+      *status = Status::OK();
+    } else {
+      RecordTick(db_statistics_, NUMBER_MERGE_FAILURES);
+      *status = Status::Corruption("could not perform end-of-key merge for ",
+                                   user_key);
+    }
+  } else {
+    *status = Status::NotFound(); // Use an empty error message for speed
+  }
+}
+
+// Charges one seek against the file recorded in `stats`.  When that file's
+// allowed_seeks budget is exhausted and no seek-triggered compaction is
+// pending yet, the file is scheduled for compaction and true is returned;
+// otherwise returns false.
+bool Version::UpdateStats(const GetStats& stats) {
+  FileMetaData* seek_file = stats.seek_file;
+  if (seek_file == nullptr) {
+    return false;
+  }
+  seek_file->allowed_seeks--;
+  if (seek_file->allowed_seeks > 0 || file_to_compact_ != nullptr) {
+    return false;
+  }
+  file_to_compact_ = seek_file;
+  file_to_compact_level_ = stats.seek_file_level;
+  return true;
+}
+
+// Recomputes compaction_score_/compaction_level_ for every candidate level
+// and sorts them highest-score-first.  `size_being_compacted[level]` bytes
+// are subtracted so in-flight compactions are not counted twice.  Level-0
+// is scored by file count; levels >= 1 by bytes relative to their target
+// size.  max_compaction_score_ tracks levels >= 1 only (level-0 scores are
+// excluded from the max, see the comments below).
+void Version::ComputeCompactionScore(
+    std::vector<uint64_t>& size_being_compacted) {
+  double max_score = 0;
+  int max_score_level = 0;
+
+  // Universal compaction only ever scores "level 0".
+  int num_levels_to_check =
+      (cfd_->options()->compaction_style != kCompactionStyleUniversal)
+          ? NumberLevels() - 1
+          : 1;
+
+  for (int level = 0; level < num_levels_to_check; level++) {
+    double score;
+    if (level == 0) {
+      // We treat level-0 specially by bounding the number of files
+      // instead of number of bytes for two reasons:
+      //
+      // (1) With larger write-buffer sizes, it is nice not to do too
+      // many level-0 compactions.
+      //
+      // (2) The files in level-0 are merged on every read and
+      // therefore we wish to avoid too many files when the individual
+      // file size is small (perhaps because of a small write-buffer
+      // setting, or very high compression ratios, or lots of
+      // overwrites/deletions).
+      int numfiles = 0;
+      for (unsigned int i = 0; i < files_[level].size(); i++) {
+        if (!files_[level][i]->being_compacted) {
+          numfiles++;
+        }
+      }
+
+      // If we are slowing down writes, then we better compact that first
+      if (numfiles >= cfd_->options()->level0_stop_writes_trigger) {
+        score = 1000000;
+      } else if (numfiles >= cfd_->options()->level0_slowdown_writes_trigger) {
+        score = 10000;
+      } else {
+        score = static_cast<double>(numfiles) /
+                cfd_->options()->level0_file_num_compaction_trigger;
+      }
+    } else {
+      // Compute the ratio of current size to size limit.
+      const uint64_t level_bytes =
+          TotalFileSize(files_[level]) - size_being_compacted[level];
+      score = static_cast<double>(level_bytes) /
+              cfd_->compaction_picker()->MaxBytesForLevel(level);
+      if (max_score < score) {
+        max_score = score;
+        max_score_level = level;
+      }
+    }
+    compaction_level_[level] = level;
+    compaction_score_[level] = score;
+  }
+
+  // update the max compaction score in levels 1 to n-1
+  max_compaction_score_ = max_score;
+  max_compaction_score_level_ = max_score_level;
+
+  // sort all the levels based on their score. Higher scores get listed
+  // first. Use bubble sort because the number of entries are small.
+  for (int i = 0; i < NumberLevels() - 2; i++) {
+    for (int j = i + 1; j < NumberLevels() - 1; j++) {
+      if (compaction_score_[i] < compaction_score_[j]) {
+        double score = compaction_score_[i];
+        int level = compaction_level_[i];
+        compaction_score_[i] = compaction_score_[j];
+        compaction_level_[i] = compaction_level_[j];
+        compaction_score_[j] = score;
+        compaction_level_[j] = level;
+      }
+    }
+  }
+}
+
+namespace {
+
+// Compator that is used to sort files based on their size
+// In normal mode: descending size
+bool CompareSizeDescending(const Version::Fsize& first,
+                           const Version::Fsize& second) {
+  return (first.file->file_size > second.file->file_size);
+}
+// A static compator used to sort files based on their seqno
+// In universal style : descending seqno
+bool CompareSeqnoDescending(const Version::Fsize& first,
+                            const Version::Fsize& second) {
+  if (first.file->smallest_seqno > second.file->smallest_seqno) {
+    assert(first.file->largest_seqno > second.file->largest_seqno);
+    return true;
+  }
+  assert(first.file->largest_seqno <= second.file->largest_seqno);
+  return false;
+}
+
+} // anonymous namespace
+
+// Rebuilds files_by_size_: for each compactable level, a vector of indices
+// into files_[level] ordered by compaction priority -- descending seqno in
+// universal mode (all entries sorted), descending file size otherwise (only
+// the top number_of_files_to_sort_ entries are guaranteed sorted, via
+// partial_sort).  Also resets the per-level round-robin cursor.
+// Precondition: files_by_size_[level] is empty for every level touched.
+void Version::UpdateFilesBySize() {
+  // No need to sort the highest level because it is never compacted.
+  int max_level =
+      (cfd_->options()->compaction_style == kCompactionStyleUniversal)
+          ? NumberLevels()
+          : NumberLevels() - 1;
+
+  for (int level = 0; level < max_level; level++) {
+    const std::vector<FileMetaData*>& files = files_[level];
+    std::vector<int>& files_by_size = files_by_size_[level];
+    assert(files_by_size.size() == 0);
+
+    // populate a temp vector for sorting based on size
+    std::vector<Fsize> temp(files.size());
+    for (unsigned int i = 0; i < files.size(); i++) {
+      temp[i].index = i;
+      temp[i].file = files[i];
+    }
+
+    // sort the top number_of_files_to_sort_ based on file size
+    if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
+      // Universal: fully sort by descending seqno (num == temp.size()).
+      int num = temp.size();
+      std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
+                        CompareSeqnoDescending);
+    } else {
+      int num = Version::number_of_files_to_sort_;
+      if (num > (int)temp.size()) {
+        num = temp.size();
+      }
+      std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
+                        CompareSizeDescending);
+    }
+    assert(temp.size() == files.size());
+
+    // initialize files_by_size_
+    for (unsigned int i = 0; i < temp.size(); i++) {
+      files_by_size.push_back(temp[i].index);
+    }
+    next_file_to_compact_by_size_[level] = 0;
+    assert(files_[level].size() == files_by_size_[level].size());
+  }
+}
+
+// Adds a reference; must be paired with Unref().  NOTE(review): refs_ is a
+// plain int, so callers presumably serialize Ref/Unref under the DB mutex
+// -- confirm against callers.
+void Version::Ref() {
+  ++refs_;
+}
+
+// Drops one reference.  Deletes this Version and returns true when the
+// last reference goes away; otherwise returns false.
+bool Version::Unref() {
+  assert(refs_ >= 1);
+  if (--refs_ != 0) {
+    return false;
+  }
+  delete this;
+  return true;
+}
+
+bool Version::NeedsCompaction() const {
+  if (file_to_compact_ != nullptr) {
+    return true;
+  }
+  // In universal compaction case, this check doesn't really
+  // check the compaction condition, but checks num of files threshold
+  // only. We are not going to miss any compaction opportunity
+  // but it's likely that more compactions are scheduled but
+  // ending up with nothing to do. We can improve it later.
+  // TODO(sdong): improve this function to be accurate for universal
+  //              compactions.
+  int num_levels_to_check =
+      (cfd_->options()->compaction_style != kCompactionStyleUniversal)
+          ? NumberLevels() - 1
+          : 1;
+  for (int i = 0; i < num_levels_to_check; i++) {
+    if (compaction_score_[i] >= 1) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns true iff some file at `level` overlaps the user-key range
+// [*smallest_user_key, *largest_user_key].  nullptr bounds mean -infinity
+// / +infinity.  Levels > 0 are flagged as disjoint+sorted, enabling binary
+// search inside SomeFileOverlapsRange.
+bool Version::OverlapInLevel(int level,
+                             const Slice* smallest_user_key,
+                             const Slice* largest_user_key) {
+  return SomeFileOverlapsRange(cfd_->internal_comparator(), (level > 0),
+                               files_[level], smallest_user_key,
+                               largest_user_key);
+}
+
+// Picks the level where a flushed memtable covering
+// [smallest_user_key, largest_user_key] should be placed.  The output goes
+// to level 0 if it overlaps level-0 files; otherwise it is pushed down as
+// long as (a) the next level has no overlap, (b) the overlap with the
+// level after that stays under MaxGrandParentOverlapBytes, and (c) the
+// level stays below max_mem_compaction_level.
+int Version::PickLevelForMemTableOutput(
+    const Slice& smallest_user_key,
+    const Slice& largest_user_key) {
+  int level = 0;
+  if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
+    // Push to next level if there is no overlap in next level,
+    // and the #bytes overlapping in the level after that are limited.
+    InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
+    std::vector<FileMetaData*> overlaps;
+    int max_mem_compact_level = cfd_->options()->max_mem_compaction_level;
+    while (max_mem_compact_level > 0 && level < max_mem_compact_level) {
+      if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
+        break;
+      }
+      if (level + 2 >= num_levels_) {
+        // No grandparent level exists; descend one last time and stop.
+        level++;
+        break;
+      }
+      GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
+      const uint64_t sum = TotalFileSize(overlaps);
+      if (sum > cfd_->compaction_picker()->MaxGrandParentOverlapBytes(level)) {
+        break;
+      }
+      level++;
+    }
+  }
+
+  return level;
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+// A nullptr begin/end means unbounded on that side.
+// If hint_index is specified, then it points to a file in the
+// overlapping range.
+// The file_index returns a pointer to any file in an overlapping range.
+void Version::GetOverlappingInputs(int level,
+                                   const InternalKey* begin,
+                                   const InternalKey* end,
+                                   std::vector<FileMetaData*>* inputs,
+                                   int hint_index,
+                                   int* file_index) {
+  inputs->clear();
+  Slice user_begin, user_end;
+  if (begin != nullptr) {
+    user_begin = begin->user_key();
+  }
+  if (end != nullptr) {
+    user_end = end->user_key();
+  }
+  if (file_index) {
+    *file_index = -1;
+  }
+  const Comparator* user_cmp = cfd_->internal_comparator().user_comparator();
+  // Levels above 0 hold sorted, non-overlapping files, so a fully bounded
+  // range can be resolved by binary search instead of a linear scan.
+  if (begin != nullptr && end != nullptr && level > 0) {
+    GetOverlappingInputsBinarySearch(level, user_begin, user_end, inputs,
+      hint_index, file_index);
+    return;
+  }
+  for (size_t i = 0; i < files_[level].size(); ) {
+    FileMetaData* f = files_[level][i++];
+    const Slice file_start = f->smallest.user_key();
+    const Slice file_limit = f->largest.user_key();
+    if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) {
+      // "f" is completely before specified range; skip it
+    } else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) {
+      // "f" is completely after specified range; skip it
+    } else {
+      inputs->push_back(f);
+      if (level == 0) {
+        // Level-0 files may overlap each other.  So check if the newly
+        // added file has expanded the range.  If so, restart search.
+        if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) {
+          user_begin = file_start;
+          inputs->clear();
+          i = 0;
+        } else if (end != nullptr
+            && user_cmp->Compare(file_limit, user_end) > 0) {
+          user_end = file_limit;
+          inputs->clear();
+          i = 0;
+        }
+      } else if (file_index) {
+        *file_index = i-1;
+      }
+    }
+  }
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+// Employ binary search to find at least one file that overlaps the
+// specified range. From that file, iterate backwards and
+// forwards to find all overlapping files.
+// REQUIRES: level > 0 (files sorted by key and non-overlapping), both
+// bounds present.
+void Version::GetOverlappingInputsBinarySearch(
+    int level,
+    const Slice& user_begin,
+    const Slice& user_end,
+    std::vector<FileMetaData*>* inputs,
+    int hint_index,
+    int* file_index) {
+  assert(level > 0);
+  int min = 0;
+  int mid = 0;
+  int max = files_[level].size() -1;
+  bool foundOverlap = false;
+  const Comparator* user_cmp = cfd_->internal_comparator().user_comparator();
+
+  // if the caller already knows the index of a file that has overlap,
+  // then we can skip the binary search.
+  if (hint_index != -1) {
+    mid = hint_index;
+    foundOverlap = true;
+  }
+
+  while (!foundOverlap && min <= max) {
+    mid = (min + max)/2;
+    FileMetaData* f = files_[level][mid];
+    const Slice file_start = f->smallest.user_key();
+    const Slice file_limit = f->largest.user_key();
+    if (user_cmp->Compare(file_limit, user_begin) < 0) {
+      min = mid + 1;
+    } else if (user_cmp->Compare(user_end, file_start) < 0) {
+      max = mid - 1;
+    } else {
+      foundOverlap = true;
+      break;
+    }
+  }
+
+  // If there were no overlapping files, return immediately.
+  if (!foundOverlap) {
+    return;
+  }
+  // returns the index where an overlap is found
+  if (file_index) {
+    *file_index = mid;
+  }
+  ExtendOverlappingInputs(level, user_begin, user_end, inputs, mid);
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+// The midIndex specifies the index of at least one file that
+// overlaps the specified range. From that file, iterate backward
+// and forward to find all overlapping files.
+void Version::ExtendOverlappingInputs(
+    int level,
+    const Slice& user_begin,
+    const Slice& user_end,
+    std::vector<FileMetaData*>* inputs,
+    unsigned int midIndex) {
+
+  const Comparator* user_cmp = cfd_->internal_comparator().user_comparator();
+#ifndef NDEBUG
+  {
+    // assert that the file at midIndex overlaps with the range
+    assert(midIndex < files_[level].size());
+    FileMetaData* f = files_[level][midIndex];
+    const Slice fstart = f->smallest.user_key();
+    const Slice flimit = f->largest.user_key();
+    if (user_cmp->Compare(fstart, user_begin) >= 0) {
+      assert(user_cmp->Compare(fstart, user_end) <= 0);
+    } else {
+      assert(user_cmp->Compare(flimit, user_begin) >= 0);
+    }
+  }
+#endif
+  int startIndex = midIndex + 1;
+  int endIndex = midIndex;
+  // count is only written inside assert() expressions, so it is dead in
+  // release builds — hence the "unused" attribute.
+  int count __attribute__((unused)) = 0;
+
+  // check backwards from 'mid' to lower indices
+  for (int i = midIndex; i >= 0 ; i--) {
+    FileMetaData* f = files_[level][i];
+    const Slice file_limit = f->largest.user_key();
+    if (user_cmp->Compare(file_limit, user_begin) >= 0) {
+      startIndex = i;
+      assert((count++, true));
+    } else {
+      break;
+    }
+  }
+  // check forward from 'mid+1' to higher indices
+  for (unsigned int i = midIndex+1; i < files_[level].size(); i++) {
+    FileMetaData* f = files_[level][i];
+    const Slice file_start = f->smallest.user_key();
+    if (user_cmp->Compare(file_start, user_end) <= 0) {
+      assert((count++, true));
+      endIndex = i;
+    } else {
+      break;
+    }
+  }
+  assert(count == endIndex - startIndex + 1);
+
+  // insert overlapping files into vector
+  for (int i = startIndex; i <= endIndex; i++) {
+    FileMetaData* f = files_[level][i];
+    inputs->push_back(f);
+  }
+}
+
+// Returns true iff the first or last file in inputs contains
+// an overlapping user key to the file "just outside" of it (i.e.
+// just after the last file, or just before the first file)
+// REQUIRES: "*inputs" is a sorted list of non-overlapping files
+// (Presumably used by compaction-input selection to avoid splitting one
+// user key across compaction boundaries — confirm at call sites.)
+bool Version::HasOverlappingUserKey(
+    const std::vector<FileMetaData*>* inputs,
+    int level) {
+
+  // If inputs empty, there is no overlap.
+  // If level == 0, it is assumed that all needed files were already included.
+  if (inputs->empty() || level == 0){
+    return false;
+  }
+
+  const Comparator* user_cmp = cfd_->internal_comparator().user_comparator();
+  const std::vector<FileMetaData*>& files = files_[level];
+  const size_t kNumFiles = files.size();
+
+  // Check the last file in inputs against the file after it
+  size_t last_file = FindFile(cfd_->internal_comparator(), files,
+                              inputs->back()->largest.Encode());
+  assert(0 <= last_file && last_file < kNumFiles);  // File should exist!
+  if (last_file < kNumFiles-1) {                    // If not the last file
+    const Slice last_key_in_input = files[last_file]->largest.user_key();
+    const Slice first_key_after = files[last_file+1]->smallest.user_key();
+    if (user_cmp->Compare(last_key_in_input, first_key_after) == 0) {
+      // The last user key in input overlaps with the next file's first key
+      return true;
+    }
+  }
+
+  // Check the first file in inputs against the file just before it
+  size_t first_file = FindFile(cfd_->internal_comparator(), files,
+                               inputs->front()->smallest.Encode());
+  assert(0 <= first_file && first_file <= last_file);   // File should exist!
+  if (first_file > 0) {                                 // If not first file
+    const Slice& first_key_in_input = files[first_file]->smallest.user_key();
+    const Slice& last_key_before = files[first_file-1]->largest.user_key();
+    if (user_cmp->Compare(first_key_in_input, last_key_before) == 0) {
+      // The first user key in input overlaps with the previous file's last key
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Total size in bytes of all files at "level".
+int64_t Version::NumLevelBytes(int level) const {
+  assert(level >= 0);
+  assert(level < NumberLevels());
+  return TotalFileSize(files_[level]);
+}
+
+// Formats the per-level file counts into scratch ("files[4 2 0 ...]") and
+// returns scratch->buffer.  Stops early on truncation: snprintf returning
+// a value >= the remaining space means the buffer is full.
+const char* Version::LevelSummary(LevelSummaryStorage* scratch) const {
+  int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files[");
+  for (int i = 0; i < NumberLevels(); i++) {
+    int sz = sizeof(scratch->buffer) - len;
+    int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size()));
+    if (ret < 0 || ret >= sz) break;
+    len += ret;
+  }
+  snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]");
+  return scratch->buffer;
+}
+
+// Formats "#number(seq=smallest_seqno,sz=file_size,being_compacted) " for
+// each file at "level" into scratch and returns scratch->buffer; truncates
+// like LevelSummary when the buffer fills up.
+const char* Version::LevelFileSummary(FileSummaryStorage* scratch,
+                                      int level) const {
+  int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size[");
+  for (const auto& f : files_[level]) {
+    int sz = sizeof(scratch->buffer) - len;
+    int ret = snprintf(scratch->buffer + len, sz,
+                       "#%lu(seq=%lu,sz=%lu,%lu) ",
+                       (unsigned long)f->number,
+                       (unsigned long)f->smallest_seqno,
+                       (unsigned long)f->file_size,
+                       (unsigned long)f->being_compacted);
+    if (ret < 0 || ret >= sz)
+      break;
+    len += ret;
+  }
+  snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]");
+  return scratch->buffer;
+}
+
+// Returns the largest total number of next-level bytes that any single
+// file in levels [1, NumberLevels()-2] overlaps with.
+int64_t Version::MaxNextLevelOverlappingBytes() {
+  uint64_t result = 0;
+  std::vector<FileMetaData*> overlaps;
+  for (int level = 1; level < NumberLevels() - 1; level++) {
+    for (const auto& f : files_[level]) {
+      GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps);
+      const uint64_t sum = TotalFileSize(overlaps);
+      if (sum > result) {
+        result = sum;
+      }
+    }
+  }
+  return result;
+}
+
+// Inserts the file number of every file in this version (all levels)
+// into *live.
+void Version::AddLiveFiles(std::set<uint64_t>* live) {
+  for (int level = 0; level < NumberLevels(); level++) {
+    const std::vector<FileMetaData*>& files = files_[level];
+    for (const auto& file : files) {
+      live->insert(file->number);
+    }
+  }
+}
+
+// Multi-line, human-readable dump of every level's files (number, size and
+// key range); "hex" is forwarded to InternalKey::DebugString to control
+// key formatting.
+std::string Version::DebugString(bool hex) const {
+  std::string r;
+  for (int level = 0; level < num_levels_; level++) {
+    // E.g.,
+    //   --- level 1 ---
+    //   17:123['a' .. 'd']
+    //   20:43['e' .. 'g']
+    r.append("--- level ");
+    AppendNumberTo(&r, level);
+    r.append(" --- version# ");
+    AppendNumberTo(&r, version_number_);
+    r.append(" ---\n");
+    const std::vector<FileMetaData*>& files = files_[level];
+    for (size_t i = 0; i < files.size(); i++) {
+      r.push_back(' ');
+      AppendNumberTo(&r, files[i]->number);
+      r.push_back(':');
+      AppendNumberTo(&r, files[i]->file_size);
+      r.append("[");
+      r.append(files[i]->smallest.DebugString(hex));
+      r.append(" .. ");
+      r.append(files[i]->largest.DebugString(hex));
+      r.append("]\n");
+    }
+  }
+  return r;
+}
+
+// this is used to batch writes to the manifest file
+struct VersionSet::ManifestWriter {
+  Status status;          // outcome of the (possibly batched) manifest write
+  bool done;              // set by the batch leader once this edit is committed
+  port::CondVar cv;       // signalled when this writer becomes queue head or is done
+  ColumnFamilyData* cfd;  // column family the edit applies to (not owned)
+  VersionEdit* edit;      // edit to commit (not owned)
+
+  explicit ManifestWriter(port::Mutex* mu, ColumnFamilyData* cfd,
+                          VersionEdit* e)
+      : done(false), cv(mu), cfd(cfd), edit(e) {}
+};
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
+class VersionSet::Builder {
+ private:
+  // Helper to sort v->files_
+  // kLevel0LevelCompaction -- NewestFirst
+  // kLevel0UniversalCompaction -- NewestFirstBySeqNo
+  // kLevelNon0 -- BySmallestKey
+  struct FileComparator {
+    enum SortMethod {
+      kLevel0LevelCompaction = 0,
+      kLevel0UniversalCompaction = 1,
+      kLevelNon0 = 2,
+    } sort_method;
+    const InternalKeyComparator* internal_comparator;
+
+    bool operator()(FileMetaData* f1, FileMetaData* f2) const {
+      switch (sort_method) {
+        case kLevel0LevelCompaction:
+          return NewestFirst(f1, f2);
+        case kLevel0UniversalCompaction:
+          return NewestFirstBySeqNo(f1, f2);
+        case kLevelNon0:
+          return BySmallestKey(f1, f2, internal_comparator);
+      }
+      assert(false);
+      return false;
+    }
+  };
+
+  typedef std::set<FileMetaData*, FileComparator> FileSet;
+  // Per-level delta accumulated by Apply(): files deleted (by number) and
+  // files added (sorted by the level's comparator).
+  struct LevelState {
+    std::set<uint64_t> deleted_files;
+    FileSet* added_files;
+  };
+
+  ColumnFamilyData* cfd_;
+  Version* base_;            // version the edits are applied on top of (ref'd)
+  LevelState* levels_;       // one LevelState per level of base_
+  FileComparator level_zero_cmp_;
+  FileComparator level_nonzero_cmp_;
+
+ public:
+  // Takes a reference on the column family's current version; released in
+  // the destructor.
+  Builder(ColumnFamilyData* cfd) : cfd_(cfd), base_(cfd->current()) {
+    base_->Ref();
+    levels_ = new LevelState[base_->NumberLevels()];
+    level_zero_cmp_.sort_method =
+        (cfd_->options()->compaction_style == kCompactionStyleUniversal)
+            ? FileComparator::kLevel0UniversalCompaction
+            : FileComparator::kLevel0LevelCompaction;
+    level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0;
+    level_nonzero_cmp_.internal_comparator = &cfd->internal_comparator();
+
+    levels_[0].added_files = new FileSet(level_zero_cmp_);
+    for (int level = 1; level < base_->NumberLevels(); level++) {
+        levels_[level].added_files = new FileSet(level_nonzero_cmp_);
+    }
+  }
+
+  // Drops this builder's reference on every added-but-not-saved file
+  // (deleting file metadata and releasing its table-cache handle when the
+  // refcount hits zero) and unrefs the base version.
+  ~Builder() {
+    for (int level = 0; level < base_->NumberLevels(); level++) {
+      const FileSet* added = levels_[level].added_files;
+      std::vector<FileMetaData*> to_unref;
+      to_unref.reserve(added->size());
+      for (FileSet::const_iterator it = added->begin();
+          it != added->end(); ++it) {
+        to_unref.push_back(*it);
+      }
+      delete added;
+      for (uint32_t i = 0; i < to_unref.size(); i++) {
+        FileMetaData* f = to_unref[i];
+        f->refs--;
+        if (f->refs <= 0) {
+          if (f->table_reader_handle) {
+            cfd_->table_cache()->ReleaseHandle(f->table_reader_handle);
+            f->table_reader_handle = nullptr;
+          }
+          delete f;
+        }
+      }
+    }
+
+    delete[] levels_;
+    base_->Unref();
+  }
+
+  // Debug-build sanity check: files in v are correctly sorted per level and
+  // levels > 0 contain no overlapping key ranges.  No-op in release builds.
+  void CheckConsistency(Version* v) {
+#ifndef NDEBUG
+    // make sure the files are sorted correctly
+    for (int level = 0; level < v->NumberLevels(); level++) {
+      for (size_t i = 1; i < v->files_[level].size(); i++) {
+        auto f1 = v->files_[level][i - 1];
+        auto f2 = v->files_[level][i];
+        if (level == 0) {
+          assert(level_zero_cmp_(f1, f2));
+          if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
+            assert(f1->largest_seqno > f2->largest_seqno);
+          }
+        } else {
+          assert(level_nonzero_cmp_(f1, f2));
+
+          // Make sure there is no overlap in levels > 0
+          if (cfd_->internal_comparator().Compare(f1->largest, f2->smallest) >=
+              0) {
+            fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
+                    (f1->largest).DebugString().c_str(),
+                    (f2->smallest).DebugString().c_str());
+            abort();
+          }
+        }
+      }
+    }
+#endif
+  }
+
+  // Debug-build check that a file slated for deletion actually exists: in
+  // the base version, in a higher level's added set (trivial move), or in
+  // this level's added set (added by an earlier applied edit).
+  void CheckConsistencyForDeletes(VersionEdit* edit, unsigned int number,
+                                  int level) {
+#ifndef NDEBUG
+      // a file to be deleted better exist in the previous version
+      bool found = false;
+      for (int l = 0; !found && l < base_->NumberLevels(); l++) {
+        const std::vector<FileMetaData*>& base_files = base_->files_[l];
+        for (unsigned int i = 0; i < base_files.size(); i++) {
+          FileMetaData* f = base_files[i];
+          if (f->number == number) {
+            found =  true;
+            break;
+          }
+        }
+      }
+      // if the file did not exist in the previous version, then it
+      // is possibly moved from lower level to higher level in current
+      // version
+      for (int l = level+1; !found && l < base_->NumberLevels(); l++) {
+        const FileSet* added = levels_[l].added_files;
+        for (FileSet::const_iterator added_iter = added->begin();
+             added_iter != added->end(); ++added_iter) {
+          FileMetaData* f = *added_iter;
+          if (f->number == number) {
+            found = true;
+            break;
+          }
+        }
+      }
+
+      // maybe this file was added in a previous edit that was Applied
+      if (!found) {
+        const FileSet* added = levels_[level].added_files;
+        for (FileSet::const_iterator added_iter = added->begin();
+             added_iter != added->end(); ++added_iter) {
+          FileMetaData* f = *added_iter;
+          if (f->number == number) {
+            found = true;
+            break;
+          }
+        }
+      }
+      assert(found);
+#endif
+  }
+
+  // Apply all of the edits in *edit to the current state.
+  void Apply(VersionEdit* edit) {
+    CheckConsistency(base_);
+
+    // Delete files
+    const VersionEdit::DeletedFileSet& del = edit->deleted_files_;
+    for (const auto& del_file : del) {
+      const auto level = del_file.first;
+      const auto number = del_file.second;
+      levels_[level].deleted_files.insert(number);
+      CheckConsistencyForDeletes(edit, number, level);
+    }
+
+    // Add new files
+    for (const auto& new_file : edit->new_files_) {
+      const int level = new_file.first;
+      FileMetaData* f = new FileMetaData(new_file.second);
+      f->refs = 1;
+
+      // We arrange to automatically compact this file after
+      // a certain number of seeks.  Let's assume:
+      //   (1) One seek costs 10ms
+      //   (2) Writing or reading 1MB costs 10ms (100MB/s)
+      //   (3) A compaction of 1MB does 25MB of IO:
+      //         1MB read from this level
+      //         10-12MB read from next level (boundaries may be misaligned)
+      //         10-12MB written to next level
+      // This implies that 25 seeks cost the same as the compaction
+      // of 1MB of data.  I.e., one seek costs approximately the
+      // same as the compaction of 40KB of data.  We are a little
+      // conservative and allow approximately one seek for every 16KB
+      // of data before triggering a compaction.
+      f->allowed_seeks = (f->file_size / 16384);
+      if (f->allowed_seeks < 100) f->allowed_seeks = 100;
+
+      // A later add of the same file number cancels an earlier delete.
+      levels_[level].deleted_files.erase(f->number);
+      levels_[level].added_files->insert(f);
+    }
+  }
+
+  // Save the current state in *v.
+  void SaveTo(Version* v) {
+    CheckConsistency(base_);
+    CheckConsistency(v);
+
+    for (int level = 0; level < base_->NumberLevels(); level++) {
+      const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_;
+      // Merge the set of added files with the set of pre-existing files.
+      // Drop any deleted files.  Store the result in *v.
+      const auto& base_files = base_->files_[level];
+      auto base_iter = base_files.begin();
+      auto base_end = base_files.end();
+      const auto& added_files = *levels_[level].added_files;
+      v->files_[level].reserve(base_files.size() + added_files.size());
+
+      for (const auto& added : added_files) {
+        // Add all smaller files listed in base_
+        for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp);
+             base_iter != bpos;
+             ++base_iter) {
+          MaybeAddFile(v, level, *base_iter);
+        }
+
+        MaybeAddFile(v, level, added);
+      }
+
+      // Add remaining base files
+      for (; base_iter != base_end; ++base_iter) {
+        MaybeAddFile(v, level, *base_iter);
+      }
+    }
+
+    CheckConsistency(v);
+
+    v->file_indexer_.UpdateIndex(v->files_);
+  }
+
+  // Pre-loads a table-cache handle (and table reader) for every added file
+  // so reads against the new version do not miss the cache.  Called outside
+  // the DB mutex (see LogAndApply).
+  void LoadTableHandlers() {
+    for (int level = 0; level < cfd_->NumberLevels(); level++) {
+      for (auto& file_meta : *(levels_[level].added_files)) {
+        assert (!file_meta->table_reader_handle);
+        bool table_io;
+        cfd_->table_cache()->FindTable(
+            base_->vset_->storage_options_, cfd_->internal_comparator(),
+            file_meta->number, file_meta->file_size,
+            &file_meta->table_reader_handle, &table_io, false);
+        if (file_meta->table_reader_handle != nullptr) {
+          // Load table_reader
+          file_meta->table_reader =
+              cfd_->table_cache()->GetTableReaderFromHandle(
+                  file_meta->table_reader_handle);
+        }
+      }
+    }
+  }
+
+  // Appends f to v's "level" unless an applied edit deleted it; takes a
+  // reference on f and asserts non-overlap with the level's previous file.
+  void MaybeAddFile(Version* v, int level, FileMetaData* f) {
+    if (levels_[level].deleted_files.count(f->number) > 0) {
+      // File is deleted: do nothing
+    } else {
+      auto* files = &v->files_[level];
+      if (level > 0 && !files->empty()) {
+        // Must not overlap
+        assert(cfd_->internal_comparator().Compare(
+                   (*files)[files->size() - 1]->largest, f->smallest) < 0);
+      }
+      f->refs++;
+      files->push_back(f);
+    }
+  }
+};
+
+// Construct an empty VersionSet; manifest/file-number state is populated
+// later by Recover().
+VersionSet::VersionSet(const std::string& dbname, const DBOptions* options,
+                       const EnvOptions& storage_options, Cache* table_cache)
+    : column_family_set_(new ColumnFamilySet(dbname, options, storage_options,
+                                             table_cache)),
+      env_(options->env),
+      dbname_(dbname),
+      options_(options),
+      next_file_number_(2),
+      manifest_file_number_(0),  // Filled by Recover()
+      pending_manifest_file_number_(0),
+      last_sequence_(0),
+      prev_log_number_(0),
+      current_version_number_(0),
+      manifest_file_size_(0),
+      storage_options_(storage_options),
+      storage_options_compactions_(storage_options_) {}
+
+VersionSet::~VersionSet() {
+  // we need to delete column_family_set_ because its destructor depends on
+  // VersionSet
+  column_family_set_.reset();
+  // free FileMetaData records that were queued for deletion
+  for (auto file : obsolete_files_) {
+    delete file;
+  }
+  obsolete_files_.clear();
+}
+
+// Installs v as the column family's current version and links it into the
+// family's circular (dummy-headed) version list.  v must be freshly built
+// (refs_ == 0); the "current" reference is taken here via v->Ref(), and
+// the previous current version is unref'd.
+void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
+                               Version* v) {
+  // Make "v" current
+  assert(v->refs_ == 0);
+  Version* current = column_family_data->current();
+  assert(v != current);
+  if (current != nullptr) {
+    assert(current->refs_ > 0);
+    current->Unref();
+  }
+  column_family_data->SetCurrent(v);
+  v->Ref();
+
+  // Append to linked list
+  v->prev_ = column_family_data->dummy_versions()->prev_;
+  v->next_ = column_family_data->dummy_versions();
+  v->prev_->next_ = v;
+  v->next_->prev_ = v;
+}
+
+// Applies *edit — batched together with any compatible queued edits for the
+// same column family — to the family's current version, persists the batch
+// to the MANIFEST (rolling to a new descriptor file when needed, and
+// updating CURRENT), then installs the resulting Version.
+// REQUIRES: *mu held on entry; it is released around the expensive I/O and
+// re-acquired before returning.
+// column_family_data may be nullptr only for a column-family-add edit, in
+// which case *options supplies the new family's options.
+Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
+                               VersionEdit* edit, port::Mutex* mu,
+                               Directory* db_directory, bool new_descriptor_log,
+                               const ColumnFamilyOptions* options) {
+  mu->AssertHeld();
+
+  // column_family_data can be nullptr only if this is column_family_add.
+  // in that case, we also need to specify ColumnFamilyOptions
+  if (column_family_data == nullptr) {
+    assert(edit->is_column_family_add_);
+    assert(options != nullptr);
+  }
+
+  // queue our request; only the queue head (the "leader") performs the
+  // write, committing followers' edits along with its own
+  ManifestWriter w(mu, column_family_data, edit);
+  manifest_writers_.push_back(&w);
+  while (!w.done && &w != manifest_writers_.front()) {
+    w.cv.Wait();
+  }
+  if (w.done) {
+    return w.status;
+  }
+  if (column_family_data != nullptr && column_family_data->IsDropped()) {
+    // if column family is dropped by the time we get here, no need to write
+    // anything to the manifest
+    manifest_writers_.pop_front();
+    // Notify new head of write queue
+    if (!manifest_writers_.empty()) {
+      manifest_writers_.front()->cv.Signal();
+    }
+    return Status::OK();
+  }
+
+  std::vector<VersionEdit*> batch_edits;
+  Version* v = nullptr;
+  std::unique_ptr<Builder> builder(nullptr);
+
+  // process all requests in the queue
+  ManifestWriter* last_writer = &w;
+  assert(!manifest_writers_.empty());
+  assert(manifest_writers_.front() == &w);
+  if (edit->IsColumnFamilyManipulation()) {
+    // no group commits for column family add or drop
+    LogAndApplyCFHelper(edit);
+    batch_edits.push_back(edit);
+  } else {
+    v = new Version(column_family_data, this, current_version_number_++);
+    builder.reset(new Builder(column_family_data));
+    for (const auto& writer : manifest_writers_) {
+      if (writer->edit->IsColumnFamilyManipulation() ||
+          writer->cfd->GetID() != column_family_data->GetID()) {
+        // no group commits for column family add or drop
+        // also, group commits across column families are not supported
+        break;
+      }
+      last_writer = writer;
+      LogAndApplyHelper(column_family_data, builder.get(), v, last_writer->edit,
+                        mu);
+      batch_edits.push_back(last_writer->edit);
+    }
+    builder->SaveTo(v);
+  }
+
+  // Initialize new descriptor log file if necessary by creating
+  // a temporary file that contains a snapshot of the current version.
+  uint64_t new_manifest_file_size = 0;
+  Status s;
+
+  assert(pending_manifest_file_number_ == 0);
+  if (!descriptor_log_ ||
+      manifest_file_size_ > options_->max_manifest_file_size) {
+    pending_manifest_file_number_ = NewFileNumber();
+    batch_edits.back()->SetNextFile(next_file_number_);
+    new_descriptor_log = true;
+  } else {
+    pending_manifest_file_number_ = manifest_file_number_;
+  }
+
+  if (new_descriptor_log) {
+    // if we're writing out new snapshot make sure to persist max column family
+    if (column_family_set_->GetMaxColumnFamily() > 0) {
+      edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
+    }
+  }
+
+  // Unlock during expensive operations. New writes cannot get here
+  // because &w is ensuring that all new writes get queued.
+  {
+    std::vector<uint64_t> size_being_compacted;
+    if (!edit->IsColumnFamilyManipulation()) {
+      size_being_compacted.resize(v->NumberLevels() - 1);
+      // calculate the amount of data being compacted at every level
+      column_family_data->compaction_picker()->SizeBeingCompacted(
+          size_being_compacted);
+    }
+
+    mu->Unlock();
+
+    if (!edit->IsColumnFamilyManipulation() && options_->max_open_files == -1) {
+      // unlimited table cache. Pre-load table handle now.
+      // Need to do it out of the mutex.
+      builder->LoadTableHandlers();
+    }
+
+    // This is fine because everything inside of this block is serialized --
+    // only one thread can be here at the same time
+    if (new_descriptor_log) {
+      unique_ptr<WritableFile> descriptor_file;
+      s = env_->NewWritableFile(
+          DescriptorFileName(dbname_, pending_manifest_file_number_),
+          &descriptor_file, env_->OptimizeForManifestWrite(storage_options_));
+      if (s.ok()) {
+        descriptor_file->SetPreallocationBlockSize(
+            options_->manifest_preallocation_size);
+        descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
+        s = WriteSnapshot(descriptor_log_.get());
+      }
+    }
+
+    if (!edit->IsColumnFamilyManipulation()) {
+      // The calls to ComputeCompactionScore and UpdateFilesBySize are cpu-heavy
+      // and is best called outside the mutex.
+      v->ComputeCompactionScore(size_being_compacted);
+      v->UpdateFilesBySize();
+    }
+
+    // Write new record to MANIFEST log
+    if (s.ok()) {
+      for (auto& e : batch_edits) {
+        std::string record;
+        e->EncodeTo(&record);
+        s = descriptor_log_->AddRecord(record);
+        if (!s.ok()) {
+          break;
+        }
+      }
+      if (s.ok()) {
+        if (options_->use_fsync) {
+          StopWatch sw(env_, options_->statistics.get(),
+                       MANIFEST_FILE_SYNC_MICROS);
+          s = descriptor_log_->file()->Fsync();
+        } else {
+          StopWatch sw(env_, options_->statistics.get(),
+                       MANIFEST_FILE_SYNC_MICROS);
+          s = descriptor_log_->file()->Sync();
+        }
+      }
+      if (!s.ok()) {
+        // The write may have made it to disk despite the error; if every
+        // record is present in the manifest, treat the write as successful
+        // to keep in-memory and on-disk state consistent.
+        Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str());
+        bool all_records_in = true;
+        for (auto& e : batch_edits) {
+          std::string record;
+          e->EncodeTo(&record);
+          if (!ManifestContains(pending_manifest_file_number_, record)) {
+            all_records_in = false;
+            break;
+          }
+        }
+        if (all_records_in) {
+          Log(options_->info_log,
+              "MANIFEST contains log record despite error; advancing to new "
+              "version to prevent mismatch between in-memory and logged state"
+              " If paranoid is set, then the db is now in readonly mode.");
+          s = Status::OK();
+        }
+      }
+    }
+
+    // If we just created a new descriptor file, install it by writing a
+    // new CURRENT file that points to it.
+    if (s.ok() && new_descriptor_log) {
+      s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_);
+      if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) {
+        // delete old manifest file
+        Log(options_->info_log,
+            "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n",
+            manifest_file_number_, pending_manifest_file_number_);
+        // we don't care about an error here, PurgeObsoleteFiles will take care
+        // of it later
+        env_->DeleteFile(DescriptorFileName(dbname_, manifest_file_number_));
+      }
+      if (!options_->disableDataSync && db_directory != nullptr) {
+        db_directory->Fsync();
+      }
+    }
+
+    if (s.ok()) {
+      // find offset in manifest file where this version is stored.
+      new_manifest_file_size = descriptor_log_->file()->GetFileSize();
+    }
+
+    LogFlush(options_->info_log);
+    mu->Lock();
+  }
+
+  // Install the new version
+  if (s.ok()) {
+    if (edit->is_column_family_add_) {
+      // no group commit on column family add
+      assert(batch_edits.size() == 1);
+      assert(options != nullptr);
+      CreateColumnFamily(*options, edit);
+    } else if (edit->is_column_family_drop_) {
+      assert(batch_edits.size() == 1);
+      column_family_data->SetDropped();
+      if (column_family_data->Unref()) {
+        delete column_family_data;
+      }
+    } else {
+      uint64_t max_log_number_in_batch  = 0;
+      for (auto& e : batch_edits) {
+        if (e->has_log_number_) {
+          max_log_number_in_batch =
+              std::max(max_log_number_in_batch, e->log_number_);
+        }
+      }
+      if (max_log_number_in_batch != 0) {
+        assert(column_family_data->GetLogNumber() <= max_log_number_in_batch);
+        column_family_data->SetLogNumber(max_log_number_in_batch);
+      }
+      AppendVersion(column_family_data, v);
+    }
+
+    manifest_file_number_ = pending_manifest_file_number_;
+    manifest_file_size_ = new_manifest_file_size;
+    prev_log_number_ = edit->prev_log_number_;
+  } else {
+    Log(options_->info_log, "Error in committing version %lu to [%s]",
+        (unsigned long)v->GetVersionNumber(),
+        column_family_data->GetName().c_str());
+    delete v;
+    if (new_descriptor_log) {
+      descriptor_log_.reset();
+      env_->DeleteFile(
+          DescriptorFileName(dbname_, pending_manifest_file_number_));
+    }
+  }
+  pending_manifest_file_number_ = 0;
+
+  // wake up all the waiting writers
+  // (pop every writer whose edit was committed in this batch; mark followers
+  //  done with the shared status so they return without writing)
+  while (true) {
+    ManifestWriter* ready = manifest_writers_.front();
+    manifest_writers_.pop_front();
+    if (ready != &w) {
+      ready->status = s;
+      ready->done = true;
+      ready->cv.Signal();
+    }
+    if (ready == last_writer) break;
+  }
+  // Notify new head of write queue
+  if (!manifest_writers_.empty()) {
+    manifest_writers_.front()->cv.Signal();
+  }
+  return s;
+}
+
+// Prepares a column-family add/drop edit for logging: stamps it with the
+// current next-file number and last sequence. On a drop, the max column
+// family ID is also persisted so a later recovery never reuses the ID of
+// a dropped family.
+void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) {
+  assert(edit->IsColumnFamilyManipulation());
+  edit->SetNextFile(next_file_number_);
+  edit->SetLastSequence(last_sequence_);
+  if (edit->is_column_family_drop_) {
+    // if we drop column family, we have to make sure to save max column
+    // family, so that we don't reuse existing ID
+    edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
+  }
+}
+
+// Prepares an ordinary (non-CF-manipulation) edit for logging against `cfd`
+// and applies it to `builder`: fills in the previous log number (if absent),
+// the next-file number and the last sequence.
+// NOTE(review): `v` is not used in this helper — presumably kept for
+// signature symmetry with the caller; confirm before removing.
+// REQUIRES: `mu` (the DB mutex) is held.
+void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, Builder* builder,
+                                   Version* v, VersionEdit* edit,
+                                   port::Mutex* mu) {
+  mu->AssertHeld();
+  assert(!edit->IsColumnFamilyManipulation());
+
+  if (edit->has_log_number_) {
+    // Log numbers must never move backwards for a column family, and can
+    // never refer to a file number that has not been allocated yet.
+    assert(edit->log_number_ >= cfd->GetLogNumber());
+    assert(edit->log_number_ < next_file_number_);
+  }
+
+  if (!edit->has_prev_log_number_) {
+    edit->SetPrevLogNumber(prev_log_number_);
+  }
+  edit->SetNextFile(next_file_number_);
+  edit->SetLastSequence(last_sequence_);
+
+  builder->Apply(edit);
+}
+
+// Replays the MANIFEST referenced by the CURRENT file and rebuilds the
+// in-memory Version for every column family. `column_families` supplies
+// the options for each family expected to be live; a family present in
+// the manifest with no matching options fails recovery unless it is
+// dropped later in the manifest, or `read_only` is true.
+Status VersionSet::Recover(
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    bool read_only) {
+  std::unordered_map<std::string, ColumnFamilyOptions> cf_name_to_options;
+  for (auto cf : column_families) {
+    cf_name_to_options.insert({cf.name, cf.options});
+  }
+  // keeps track of column families in manifest that were not found in
+  // column families parameters. if those column families are not dropped
+  // by subsequent manifest records, Recover() will return failure status
+  std::unordered_map<int, std::string> column_families_not_found;
+
+  // Read "CURRENT" file, which contains a pointer to the current manifest file
+  std::string manifest_filename;
+  Status s = ReadFileToString(
+      env_, CurrentFileName(dbname_), &manifest_filename
+  );
+  if (!s.ok()) {
+    return s;
+  }
+  if (manifest_filename.empty() ||
+      manifest_filename.back() != '\n') {
+    return Status::Corruption("CURRENT file does not end with newline");
+  }
+  // remove the trailing '\n'
+  manifest_filename.resize(manifest_filename.size() - 1);
+  FileType type;
+  bool parse_ok =
+      ParseFileName(manifest_filename, &manifest_file_number_, &type);
+  if (!parse_ok || type != kDescriptorFile) {
+    return Status::Corruption("CURRENT file corrupted");
+  }
+
+  Log(options_->info_log, "Recovering from manifest file: %s\n",
+      manifest_filename.c_str());
+
+  manifest_filename = dbname_ + "/" + manifest_filename;
+  unique_ptr<SequentialFile> manifest_file;
+  s = env_->NewSequentialFile(manifest_filename, &manifest_file,
+                              storage_options_);
+  if (!s.ok()) {
+    return s;
+  }
+  uint64_t manifest_file_size;
+  s = env_->GetFileSize(manifest_filename, &manifest_file_size);
+  if (!s.ok()) {
+    return s;
+  }
+
+  bool have_log_number = false;
+  bool have_prev_log_number = false;
+  bool have_next_file = false;
+  bool have_last_sequence = false;
+  uint64_t next_file = 0;
+  uint64_t last_sequence = 0;
+  // NOTE(review): `log_number` is never assigned in the replay loop below
+  // (log numbers are tracked per column family via cfd->SetLogNumber()),
+  // so the MarkFileNumberUsed(log_number) call and the recovery log line
+  // at the bottom always see 0 here — confirm this is intended.
+  uint64_t log_number = 0;
+  uint64_t prev_log_number = 0;
+  uint32_t max_column_family = 0;
+  // Per-CF version builders, keyed by column family ID. The Builder
+  // pointers are owned here and deleted at the bottom of this function.
+  std::unordered_map<uint32_t, Builder*> builders;
+
+  // add default column family
+  auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName);
+  if (default_cf_iter == cf_name_to_options.end()) {
+    return Status::InvalidArgument("Default column family not specified");
+  }
+  VersionEdit default_cf_edit;
+  default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
+  default_cf_edit.SetColumnFamily(0);
+  ColumnFamilyData* default_cfd =
+      CreateColumnFamily(default_cf_iter->second, &default_cf_edit);
+  builders.insert({0, new Builder(default_cfd)});
+
+  {
+    VersionSet::LogReporter reporter;
+    reporter.status = &s;
+    log::Reader reader(std::move(manifest_file), &reporter, true /*checksum*/,
+                       0 /*initial_offset*/);
+    Slice record;
+    std::string scratch;
+    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
+      VersionEdit edit;
+      s = edit.DecodeFrom(record);
+      if (!s.ok()) {
+        break;
+      }
+
+      // Not found means that user didn't supply that column
+      // family option AND we encountered column family add
+      // record. Once we encounter column family drop record,
+      // we will delete the column family from
+      // column_families_not_found.
+      bool cf_in_not_found =
+          column_families_not_found.find(edit.column_family_) !=
+          column_families_not_found.end();
+      // in builders means that user supplied that column family
+      // option AND that we encountered column family add record
+      bool cf_in_builders =
+          builders.find(edit.column_family_) != builders.end();
+
+      // they can't both be true
+      assert(!(cf_in_not_found && cf_in_builders));
+
+      ColumnFamilyData* cfd = nullptr;
+
+      if (edit.is_column_family_add_) {
+        if (cf_in_builders || cf_in_not_found) {
+          s = Status::Corruption(
+              "Manifest adding the same column family twice");
+          break;
+        }
+        auto cf_options = cf_name_to_options.find(edit.column_family_name_);
+        if (cf_options == cf_name_to_options.end()) {
+          column_families_not_found.insert(
+              {edit.column_family_, edit.column_family_name_});
+        } else {
+          cfd = CreateColumnFamily(cf_options->second, &edit);
+          builders.insert({edit.column_family_, new Builder(cfd)});
+        }
+      } else if (edit.is_column_family_drop_) {
+        if (cf_in_builders) {
+          auto builder = builders.find(edit.column_family_);
+          assert(builder != builders.end());
+          delete builder->second;
+          builders.erase(builder);
+          cfd = column_family_set_->GetColumnFamily(edit.column_family_);
+          if (cfd->Unref()) {
+            delete cfd;
+            cfd = nullptr;
+          } else {
+            // who else can have reference to cfd!?
+            assert(false);
+          }
+        } else if (cf_in_not_found) {
+          column_families_not_found.erase(edit.column_family_);
+        } else {
+          s = Status::Corruption(
+              "Manifest - dropping non-existing column family");
+          break;
+        }
+      } else if (!cf_in_not_found) {
+        if (!cf_in_builders) {
+          s = Status::Corruption(
+              "Manifest record referencing unknown column family");
+          break;
+        }
+
+        cfd = column_family_set_->GetColumnFamily(edit.column_family_);
+        // this should never happen since cf_in_builders is true
+        assert(cfd != nullptr);
+        if (edit.max_level_ >= cfd->current()->NumberLevels()) {
+          s = Status::InvalidArgument(
+              "db has more levels than options.num_levels");
+          break;
+        }
+
+        // if it is not column family add or column family drop,
+        // then it's a file add/delete, which should be forwarded
+        // to builder
+        auto builder = builders.find(edit.column_family_);
+        assert(builder != builders.end());
+        builder->second->Apply(&edit);
+      }
+
+      if (cfd != nullptr) {
+        if (edit.has_log_number_) {
+          if (cfd->GetLogNumber() > edit.log_number_) {
+            Log(options_->info_log,
+                "MANIFEST corruption detected, but ignored - Log numbers in "
+                "records NOT monotonically increasing");
+          } else {
+            cfd->SetLogNumber(edit.log_number_);
+            have_log_number = true;
+          }
+        }
+        if (edit.has_comparator_ &&
+            edit.comparator_ != cfd->user_comparator()->Name()) {
+          s = Status::InvalidArgument(
+              cfd->user_comparator()->Name(),
+              "does not match existing comparator " + edit.comparator_);
+          break;
+        }
+      }
+
+      if (edit.has_prev_log_number_) {
+        prev_log_number = edit.prev_log_number_;
+        have_prev_log_number = true;
+      }
+
+      if (edit.has_next_file_number_) {
+        next_file = edit.next_file_number_;
+        have_next_file = true;
+      }
+
+      if (edit.has_max_column_family_) {
+        max_column_family = edit.max_column_family_;
+      }
+
+      if (edit.has_last_sequence_) {
+        last_sequence = edit.last_sequence_;
+        have_last_sequence = true;
+      }
+    }
+  }
+
+  if (s.ok()) {
+    if (!have_next_file) {
+      s = Status::Corruption("no meta-nextfile entry in descriptor");
+    } else if (!have_log_number) {
+      s = Status::Corruption("no meta-lognumber entry in descriptor");
+    } else if (!have_last_sequence) {
+      s = Status::Corruption("no last-sequence-number entry in descriptor");
+    }
+
+    if (!have_prev_log_number) {
+      prev_log_number = 0;
+    }
+
+    column_family_set_->UpdateMaxColumnFamily(max_column_family);
+
+    MarkFileNumberUsed(prev_log_number);
+    MarkFileNumberUsed(log_number);
+  }
+
+  // there were some column families in the MANIFEST that weren't specified
+  // in the argument. This is OK in read_only mode
+  if (read_only == false && column_families_not_found.size() > 0) {
+    std::string list_of_not_found;
+    for (const auto& cf : column_families_not_found) {
+      list_of_not_found += ", " + cf.second;
+    }
+    list_of_not_found = list_of_not_found.substr(2);
+    s = Status::InvalidArgument(
+        "You have to open all column families. Column families not opened: " +
+        list_of_not_found);
+  }
+
+  if (s.ok()) {
+    for (auto cfd : *column_family_set_) {
+      auto builders_iter = builders.find(cfd->GetID());
+      assert(builders_iter != builders.end());
+      auto builder = builders_iter->second;
+
+      if (options_->max_open_files == -1) {
+      // unlimited table cache. Pre-load table handle now.
+      // Need to do it out of the mutex.
+        builder->LoadTableHandlers();
+      }
+
+      Version* v = new Version(cfd, this, current_version_number_++);
+      builder->SaveTo(v);
+
+      // Install recovered version
+      std::vector<uint64_t> size_being_compacted(v->NumberLevels() - 1);
+      cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted);
+      v->ComputeCompactionScore(size_being_compacted);
+      v->UpdateFilesBySize();
+      AppendVersion(cfd, v);
+    }
+
+    manifest_file_size_ = manifest_file_size;
+    next_file_number_ = next_file + 1;
+    last_sequence_ = last_sequence;
+    prev_log_number_ = prev_log_number;
+
+    Log(options_->info_log, "Recovered from manifest file:%s succeeded,"
+        "manifest_file_number is %lu, next_file_number is %lu, "
+        "last_sequence is %lu, log_number is %lu,"
+        "prev_log_number is %lu,"
+        "max_column_family is %u\n",
+        manifest_filename.c_str(),
+        (unsigned long)manifest_file_number_,
+        (unsigned long)next_file_number_,
+        (unsigned long)last_sequence_,
+        (unsigned long)log_number,
+        (unsigned long)prev_log_number_,
+        column_family_set_->GetMaxColumnFamily());
+
+    for (auto cfd : *column_family_set_) {
+      Log(options_->info_log,
+          "Column family [%s] (ID %u), log number is %" PRIu64 "\n",
+          cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
+    }
+  }
+
+  // Free the owned per-CF builders.
+  for (auto builder : builders) {
+    delete builder.second;
+  }
+
+  return s;
+}
+
+// Lists the names of all column families recorded in the database's
+// current MANIFEST, without fully opening the database. The default
+// column family is always included; adds and drops in the manifest are
+// replayed so only families live at the end are returned (sorted by ID).
+Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
+                                      const std::string& dbname, Env* env) {
+  // these are just for performance reasons, not correctness,
+  // so we're fine using the defaults
+  EnvOptions soptions;
+  // Read "CURRENT" file, which contains a pointer to the current manifest file
+  std::string current;
+  Status s = ReadFileToString(env, CurrentFileName(dbname), &current);
+  if (!s.ok()) {
+    return s;
+  }
+  if (current.empty() || current[current.size()-1] != '\n') {
+    return Status::Corruption("CURRENT file does not end with newline");
+  }
+  // drop the trailing '\n'
+  current.resize(current.size() - 1);
+
+  std::string dscname = dbname + "/" + current;
+  unique_ptr<SequentialFile> file;
+  s = env->NewSequentialFile(dscname, &file, soptions);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Ordered map so the output is sorted by column family ID.
+  std::map<uint32_t, std::string> column_family_names;
+  // default column family is always implicitly there
+  column_family_names.insert({0, kDefaultColumnFamilyName});
+  VersionSet::LogReporter reporter;
+  reporter.status = &s;
+  log::Reader reader(std::move(file), &reporter, true /*checksum*/,
+                     0 /*initial_offset*/);
+  Slice record;
+  std::string scratch;
+  while (reader.ReadRecord(&record, &scratch) && s.ok()) {
+    VersionEdit edit;
+    s = edit.DecodeFrom(record);
+    if (!s.ok()) {
+      break;
+    }
+    if (edit.is_column_family_add_) {
+      if (column_family_names.find(edit.column_family_) !=
+          column_family_names.end()) {
+        s = Status::Corruption("Manifest adding the same column family twice");
+        break;
+      }
+      column_family_names.insert(
+          {edit.column_family_, edit.column_family_name_});
+    } else if (edit.is_column_family_drop_) {
+      if (column_family_names.find(edit.column_family_) ==
+          column_family_names.end()) {
+        s = Status::Corruption(
+            "Manifest - dropping non-existing column family");
+        break;
+      }
+      column_family_names.erase(edit.column_family_);
+    }
+  }
+
+  column_families->clear();
+  if (s.ok()) {
+    for (const auto& iter : column_family_names) {
+      column_families->push_back(iter.second);
+    }
+  }
+
+  return s;
+}
+
+#ifndef ROCKSDB_LITE
+// Rewrites the default column family's metadata so the DB uses only
+// `new_levels` levels. Requires that among levels [new_levels-1,
+// current_levels-1] at most one level contains files; those files are
+// moved to the new last level and a new manifest snapshot is logged.
+Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
+                                        const Options* options,
+                                        const EnvOptions& storage_options,
+                                        int new_levels) {
+  if (new_levels <= 1) {
+    return Status::InvalidArgument(
+        "Number of levels needs to be bigger than 1");
+  }
+
+  ColumnFamilyOptions cf_options(*options);
+  std::shared_ptr<Cache> tc(NewLRUCache(
+      options->max_open_files - 10, options->table_cache_numshardbits,
+      options->table_cache_remove_scan_count_limit));
+  VersionSet versions(dbname, options, storage_options, tc.get());
+  Status status;
+
+  // Recover only the default column family from the manifest.
+  std::vector<ColumnFamilyDescriptor> dummy;
+  ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName,
+                                          ColumnFamilyOptions(*options));
+  dummy.push_back(dummy_descriptor);
+  status = versions.Recover(dummy);
+  if (!status.ok()) {
+    return status;
+  }
+
+  Version* current_version =
+      versions.GetColumnFamilySet()->GetDefault()->current();
+  int current_levels = current_version->NumberLevels();
+
+  // Already at or below the requested level count — nothing to do.
+  if (current_levels <= new_levels) {
+    return Status::OK();
+  }
+
+  // Make sure there are file only on one level from
+  // (new_levels-1) to (current_levels-1)
+  int first_nonempty_level = -1;
+  int first_nonempty_level_filenum = 0;
+  for (int i = new_levels - 1; i < current_levels; i++) {
+    int file_num = current_version->NumLevelFiles(i);
+    if (file_num != 0) {
+      if (first_nonempty_level < 0) {
+        first_nonempty_level = i;
+        first_nonempty_level_filenum = file_num;
+      } else {
+        char msg[255];
+        snprintf(msg, sizeof(msg),
+                 "Found at least two levels containing files: "
+                 "[%d:%d],[%d:%d].\n",
+                 first_nonempty_level, first_nonempty_level_filenum, i,
+                 file_num);
+        return Status::InvalidArgument(msg);
+      }
+    }
+  }
+
+  std::vector<FileMetaData*>* old_files_list = current_version->files_;
+  // we need to allocate an array with the old number of levels size to
+  // avoid SIGSEGV in WriteSnapshot()
+  // however, all levels bigger or equal to new_levels will be empty
+  std::vector<FileMetaData*>* new_files_list =
+      new std::vector<FileMetaData*>[current_levels];
+  for (int i = 0; i < new_levels - 1; i++) {
+    new_files_list[i] = old_files_list[i];
+  }
+
+  // Relocate the single nonempty tail level to the new last level.
+  if (first_nonempty_level > 0) {
+    new_files_list[new_levels - 1] = old_files_list[first_nonempty_level];
+  }
+
+  // Directly rewire the Version's per-level file arrays in place.
+  delete[] current_version->files_;
+  current_version->files_ = new_files_list;
+  current_version->num_levels_ = new_levels;
+
+  // Log an (empty) edit under a dummy mutex to force a fresh manifest
+  // snapshot reflecting the reduced level count.
+  VersionEdit ve;
+  port::Mutex dummy_mutex;
+  MutexLock l(&dummy_mutex);
+  return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(), &ve,
+                              &dummy_mutex, nullptr, true);
+}
+
+// Reads the manifest at `dscname` and prints its contents to stdout
+// (used by offline dump tooling). With `verbose`, every edit record is
+// echoed; `hex` controls key formatting in the version debug strings.
+// Also leaves this VersionSet populated with the recovered state.
+Status VersionSet::DumpManifest(Options& options, std::string& dscname,
+                                bool verbose, bool hex) {
+  // Open the specified manifest file.
+  unique_ptr<SequentialFile> file;
+  Status s = options.env->NewSequentialFile(dscname, &file, storage_options_);
+  if (!s.ok()) {
+    return s;
+  }
+
+  bool have_prev_log_number = false;
+  bool have_next_file = false;
+  bool have_last_sequence = false;
+  uint64_t next_file = 0;
+  uint64_t last_sequence = 0;
+  uint64_t prev_log_number = 0;
+  int count = 0;
+  // Last comparator name seen per column family ID, for the summary print.
+  std::unordered_map<uint32_t, std::string> comparators;
+  // Owned per-CF version builders, keyed by column family ID.
+  std::unordered_map<uint32_t, Builder*> builders;
+
+  // add default column family
+  VersionEdit default_cf_edit;
+  default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
+  default_cf_edit.SetColumnFamily(0);
+  ColumnFamilyData* default_cfd =
+      CreateColumnFamily(ColumnFamilyOptions(options), &default_cf_edit);
+  builders.insert({0, new Builder(default_cfd)});
+
+  {
+    VersionSet::LogReporter reporter;
+    reporter.status = &s;
+    log::Reader reader(std::move(file), &reporter, true/*checksum*/,
+                       0/*initial_offset*/);
+    Slice record;
+    std::string scratch;
+    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
+      VersionEdit edit;
+      s = edit.DecodeFrom(record);
+      if (!s.ok()) {
+        break;
+      }
+
+      // Write out each individual edit
+      if (verbose) {
+        printf("*************************Edit[%d] = %s\n",
+                count, edit.DebugString(hex).c_str());
+      }
+      count++;
+
+      bool cf_in_builders =
+          builders.find(edit.column_family_) != builders.end();
+
+      if (edit.has_comparator_) {
+        comparators.insert({edit.column_family_, edit.comparator_});
+      }
+
+      ColumnFamilyData* cfd = nullptr;
+
+      if (edit.is_column_family_add_) {
+        if (cf_in_builders) {
+          s = Status::Corruption(
+              "Manifest adding the same column family twice");
+          break;
+        }
+        cfd = CreateColumnFamily(ColumnFamilyOptions(options), &edit);
+        builders.insert({edit.column_family_, new Builder(cfd)});
+      } else if (edit.is_column_family_drop_) {
+        if (!cf_in_builders) {
+          s = Status::Corruption(
+              "Manifest - dropping non-existing column family");
+          break;
+        }
+        auto builder_iter = builders.find(edit.column_family_);
+        delete builder_iter->second;
+        builders.erase(builder_iter);
+        comparators.erase(edit.column_family_);
+        cfd = column_family_set_->GetColumnFamily(edit.column_family_);
+        assert(cfd != nullptr);
+        // NOTE(review): unlike Recover(), cfd is deleted without checking
+        // the Unref() return value — confirm no other reference can exist
+        // in this offline/dump context.
+        cfd->Unref();
+        delete cfd;
+        cfd = nullptr;
+      } else {
+        if (!cf_in_builders) {
+          s = Status::Corruption(
+              "Manifest record referencing unknown column family");
+          break;
+        }
+
+        cfd = column_family_set_->GetColumnFamily(edit.column_family_);
+        // this should never happen since cf_in_builders is true
+        assert(cfd != nullptr);
+
+        // if it is not column family add or column family drop,
+        // then it's a file add/delete, which should be forwarded
+        // to builder
+        auto builder = builders.find(edit.column_family_);
+        assert(builder != builders.end());
+        builder->second->Apply(&edit);
+      }
+
+      if (cfd != nullptr && edit.has_log_number_) {
+        cfd->SetLogNumber(edit.log_number_);
+      }
+
+      if (edit.has_prev_log_number_) {
+        prev_log_number = edit.prev_log_number_;
+        have_prev_log_number = true;
+      }
+
+      if (edit.has_next_file_number_) {
+        next_file = edit.next_file_number_;
+        have_next_file = true;
+      }
+
+      if (edit.has_last_sequence_) {
+        last_sequence = edit.last_sequence_;
+        have_last_sequence = true;
+      }
+
+      if (edit.has_max_column_family_) {
+        column_family_set_->UpdateMaxColumnFamily(edit.max_column_family_);
+      }
+    }
+  }
+  file.reset();
+
+  if (s.ok()) {
+    if (!have_next_file) {
+      s = Status::Corruption("no meta-nextfile entry in descriptor");
+      printf("no meta-nextfile entry in descriptor");
+    } else if (!have_last_sequence) {
+      printf("no last-sequence-number entry in descriptor");
+      s = Status::Corruption("no last-sequence-number entry in descriptor");
+    }
+
+    if (!have_prev_log_number) {
+      prev_log_number = 0;
+    }
+  }
+
+  if (s.ok()) {
+    // Materialize and print a summary per surviving column family.
+    for (auto cfd : *column_family_set_) {
+      auto builders_iter = builders.find(cfd->GetID());
+      assert(builders_iter != builders.end());
+      auto builder = builders_iter->second;
+
+      Version* v = new Version(cfd, this, current_version_number_++);
+      builder->SaveTo(v);
+      std::vector<uint64_t> size_being_compacted(v->NumberLevels() - 1);
+      cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted);
+      v->ComputeCompactionScore(size_being_compacted);
+      v->UpdateFilesBySize();
+      delete builder;
+
+      printf("--------------- Column family \"%s\"  (ID %u) --------------\n",
+             cfd->GetName().c_str(), (unsigned int)cfd->GetID());
+      printf("log number: %lu\n", (unsigned long)cfd->GetLogNumber());
+      auto comparator = comparators.find(cfd->GetID());
+      if (comparator != comparators.end()) {
+        printf("comparator: %s\n", comparator->second.c_str());
+      } else {
+        printf("comparator: <NO COMPARATOR>\n");
+      }
+      printf("%s \n", v->DebugString(hex).c_str());
+      delete v;
+    }
+
+    next_file_number_ = next_file + 1;
+    last_sequence_ = last_sequence;
+    prev_log_number_ = prev_log_number;
+
+    printf(
+        "next_file_number %lu last_sequence "
+        "%lu  prev_log_number %lu max_column_family %u\n",
+        (unsigned long)next_file_number_, (unsigned long)last_sequence,
+        (unsigned long)prev_log_number,
+        column_family_set_->GetMaxColumnFamily());
+  }
+
+  return s;
+}
+#endif  // ROCKSDB_LITE
+
+// Ensures `number` can never be handed out again: bump next_file_number_
+// past it if necessary.
+void VersionSet::MarkFileNumberUsed(uint64_t number) {
+  if (next_file_number_ <= number) {
+    next_file_number_ = number + 1;
+  }
+}
+
+// Writes the complete current state to `log` (a freshly created
+// descriptor/MANIFEST): for each column family, one record describing
+// the family itself (name/ID/comparator) followed by one record listing
+// all of its files and its log number.
+Status VersionSet::WriteSnapshot(log::Writer* log) {
+  // TODO: Break up into multiple records to reduce memory usage on recovery?
+
+  // WARNING: This method doesn't hold a mutex!!
+
+  // This is done without DB mutex lock held, but only within single-threaded
+  // LogAndApply. Column family manipulations can only happen within LogAndApply
+  // (the same single thread), so we're safe to iterate.
+  for (auto cfd : *column_family_set_) {
+    {
+      // Store column family info
+      VersionEdit edit;
+      if (cfd->GetID() != 0) {
+        // default column family is always there,
+        // no need to explicitly write it
+        edit.AddColumnFamily(cfd->GetName());
+        edit.SetColumnFamily(cfd->GetID());
+      }
+      edit.SetComparatorName(
+          cfd->internal_comparator().user_comparator()->Name());
+      std::string record;
+      edit.EncodeTo(&record);
+      Status s = log->AddRecord(record);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    {
+      // Save files
+      VersionEdit edit;
+      edit.SetColumnFamily(cfd->GetID());
+
+      for (int level = 0; level < cfd->NumberLevels(); level++) {
+        for (const auto& f : cfd->current()->files_[level]) {
+          edit.AddFile(level,
+                       f->number,
+                       f->file_size,
+                       f->smallest,
+                       f->largest,
+                       f->smallest_seqno,
+                       f->largest_seqno);
+        }
+      }
+      edit.SetLogNumber(cfd->GetLogNumber());
+      std::string record;
+      edit.EncodeTo(&record);
+      Status s = log->AddRecord(record);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+// Opens the manifest file and reads all records
+// till it finds the record we are looking for.
+// Returns true iff `record` appears verbatim in manifest
+// `manifest_file_number`; a failure to reopen the manifest is logged and
+// treated as "not found".
+bool VersionSet::ManifestContains(uint64_t manifest_file_number,
+                                  const std::string& record) const {
+  std::string fname =
+      DescriptorFileName(dbname_, manifest_file_number);
+  Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str());
+  unique_ptr<SequentialFile> file;
+  Status s = env_->NewSequentialFile(fname, &file, storage_options_);
+  if (!s.ok()) {
+    Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str());
+    Log(options_->info_log,
+        "ManifestContains: is unable to reopen the manifest file  %s",
+        fname.c_str());
+    return false;
+  }
+  // No reporter: corruption simply ends the scan.
+  log::Reader reader(std::move(file), nullptr, true/*checksum*/, 0);
+  Slice r;
+  std::string scratch;
+  bool result = false;
+  while (reader.ReadRecord(&r, &scratch)) {
+    if (r == Slice(record)) {
+      result = true;
+      break;
+    }
+  }
+  Log(options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0);
+  return result;
+}
+
+
+// Returns the approximate byte offset within version `v`'s data at which
+// `ikey` would live: the summed sizes of all files entirely before the
+// key, plus the in-table offset within any file that straddles it.
+uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
+  uint64_t result = 0;
+  for (int level = 0; level < v->NumberLevels(); level++) {
+    const std::vector<FileMetaData*>& files = v->files_[level];
+    for (size_t i = 0; i < files.size(); i++) {
+      if (v->cfd_->internal_comparator().Compare(files[i]->largest, ikey) <=
+          0) {
+        // Entire file is before "ikey", so just add the file size
+        result += files[i]->file_size;
+      } else if (v->cfd_->internal_comparator().Compare(files[i]->smallest,
+                                                        ikey) > 0) {
+        // Entire file is after "ikey", so ignore
+        if (level > 0) {
+          // Files other than level 0 are sorted by meta->smallest, so
+          // no further files in this level will contain data for
+          // "ikey".
+          break;
+        }
+      } else {
+        // "ikey" falls in the range for this table.  Add the
+        // approximate offset of "ikey" within the table.
+        TableReader* table_reader_ptr;
+        Iterator* iter = v->cfd_->table_cache()->NewIterator(
+            ReadOptions(), storage_options_, v->cfd_->internal_comparator(),
+            *(files[i]), &table_reader_ptr);
+        if (table_reader_ptr != nullptr) {
+          result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode());
+        }
+        // The iterator exists only to pin the table reader; discard it.
+        delete iter;
+      }
+    }
+  }
+  return result;
+}
+
+// Appends the file numbers of every table file referenced by any live
+// Version (current or pinned by iterators) of any column family to
+// `live_list`.
+void VersionSet::AddLiveFiles(std::vector<uint64_t>* live_list) {
+  // pre-calculate space requirement
+  int64_t total_files = 0;
+  for (auto cfd : *column_family_set_) {
+    Version* dummy_versions = cfd->dummy_versions();
+    // Walk the circular doubly-linked list of live versions.
+    for (Version* v = dummy_versions->next_; v != dummy_versions;
+         v = v->next_) {
+      for (int level = 0; level < v->NumberLevels(); level++) {
+        total_files += v->files_[level].size();
+      }
+    }
+  }
+
+  // just one time extension to the right size
+  live_list->reserve(live_list->size() + total_files);
+
+  for (auto cfd : *column_family_set_) {
+    Version* dummy_versions = cfd->dummy_versions();
+    for (Version* v = dummy_versions->next_; v != dummy_versions;
+         v = v->next_) {
+      for (int level = 0; level < v->NumberLevels(); level++) {
+        for (const auto& f : v->files_[level]) {
+          live_list->push_back(f->number);
+        }
+      }
+    }
+  }
+}
+
+// Builds the merged input iterator over all input files of compaction
+// `c`: one iterator per level-0 file (they may overlap) plus one
+// concatenating iterator per non-zero input level, combined by a merging
+// iterator. Caller owns the returned iterator.
+Iterator* VersionSet::MakeInputIterator(Compaction* c) {
+  auto cfd = c->column_family_data();
+  ReadOptions read_options;
+  read_options.verify_checksums =
+    cfd->options()->verify_checksums_in_compaction;
+  // Compaction reads are one-shot; don't pollute the block cache.
+  read_options.fill_cache = false;
+
+  // Level-0 files have to be merged together.  For other levels,
+  // we will make a concatenating iterator per level.
+  // TODO(opt): use concatenating iterator for level-0 if there is no overlap
+  const int space = (c->level() == 0 ? c->inputs(0)->size() + 1 : 2);
+  Iterator** list = new Iterator*[space];
+  int num = 0;
+  for (int which = 0; which < 2; which++) {
+    if (!c->inputs(which)->empty()) {
+      if (c->level() + which == 0) {
+        for (const auto& file : *c->inputs(which)) {
+          list[num++] = cfd->table_cache()->NewIterator(
+              read_options, storage_options_compactions_,
+              cfd->internal_comparator(), *file, nullptr,
+              true /* for compaction */);
+        }
+      } else {
+        // Create concatenating iterator for the files from this level
+        list[num++] = NewTwoLevelIterator(new Version::LevelFileIteratorState(
+              cfd->table_cache(), read_options, storage_options_,
+              cfd->internal_comparator(), true /* for_compaction */,
+              false /* prefix enabled */),
+            new Version::LevelFileNumIterator(cfd->internal_comparator(),
+                                              c->inputs(which)));
+      }
+    }
+  }
+  assert(num <= space);
+  Iterator* result = NewMergingIterator(
+      &c->column_family_data()->internal_comparator(), list, num);
+  // The merging iterator copies what it needs; the temporary array can go.
+  delete[] list;
+  return result;
+}
+
+// verify that the files listed in this compaction are present
+// in the current version. Debug-only check (compiled out under NDEBUG,
+// where it always returns true).
+bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) {
+#ifndef NDEBUG
+  Version* version = c->column_family_data()->current();
+  if (c->input_version() != version) {
+    Log(options_->info_log,
+        "[%s] VerifyCompactionFileConsistency version mismatch",
+        c->column_family_data()->GetName().c_str());
+  }
+
+  // verify files in level
+  int level = c->level();
+  for (int i = 0; i < c->num_input_files(0); i++) {
+    uint64_t number = c->input(0,i)->number;
+
+    // look for this file in the current version
+    bool found = false;
+    for (unsigned int j = 0; j < version->files_[level].size(); j++) {
+      FileMetaData* f = version->files_[level][j];
+      if (f->number == number) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      return false; // input files nonexistent in current version
+    }
+  }
+  // verify level+1 files
+  level++;
+  for (int i = 0; i < c->num_input_files(1); i++) {
+    uint64_t number = c->input(1,i)->number;
+
+    // look for this file in the current version
+    bool found = false;
+    for (unsigned int j = 0; j < version->files_[level].size(); j++) {
+      FileMetaData* f = version->files_[level][j];
+      if (f->number == number) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      return false; // input files nonexistent in current version
+    }
+  }
+#endif
+  return true;     // everything good
+}
+
+// Looks up table file `number` across every column family's current
+// version. On success fills `filelevel`, `meta` and `cfd` with the
+// file's level, metadata and owning column family; returns NotFound
+// otherwise.
+Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel,
+                                      FileMetaData** meta,
+                                      ColumnFamilyData** cfd) {
+  for (auto cfd_iter : *column_family_set_) {
+    Version* version = cfd_iter->current();
+    for (int level = 0; level < version->NumberLevels(); level++) {
+      for (const auto& file : version->files_[level]) {
+        if (file->number == number) {
+          *meta = file;
+          *filelevel = level;
+          *cfd = cfd_iter;
+          return Status::OK();
+        }
+      }
+    }
+  }
+  return Status::NotFound("File not present in any level");
+}
+
+// Fills `metadata` with one entry per table file in every column
+// family's CURRENT version (older pinned versions are not included).
+void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+  for (auto cfd : *column_family_set_) {
+    for (int level = 0; level < cfd->NumberLevels(); level++) {
+      for (const auto& file : cfd->current()->files_[level]) {
+        LiveFileMetaData filemetadata;
+        filemetadata.column_family_name = cfd->GetName();
+        // Built with an empty dbname, so the name looks like
+        // "/<number>.sst" relative to the DB directory — presumably
+        // callers prepend the DB path; TODO confirm.
+        filemetadata.name = TableFileName("", file->number);
+        filemetadata.level = level;
+        filemetadata.size = file->file_size;
+        filemetadata.smallestkey = file->smallest.user_key().ToString();
+        filemetadata.largestkey = file->largest.user_key().ToString();
+        filemetadata.smallest_seqno = file->smallest_seqno;
+        filemetadata.largest_seqno = file->largest_seqno;
+        metadata->push_back(filemetadata);
+      }
+    }
+  }
+}
+
+// Hands the accumulated obsolete files over to the caller by appending
+// them to `files` and clearing the internal list (responsibility for the
+// FileMetaData pointers passes to the caller).
+void VersionSet::GetObsoleteFiles(std::vector<FileMetaData*>* files) {
+  files->insert(files->end(), obsolete_files_.begin(), obsolete_files_.end());
+  obsolete_files_.clear();
+}
+
+// Creates the in-memory state for the column family described by `edit`
+// (which must be an add record): a dummy head for its Version list, an
+// initial empty Version, a fresh memtable, and registration in the
+// ColumnFamilySet. Returns the new ColumnFamilyData.
+ColumnFamilyData* VersionSet::CreateColumnFamily(
+    const ColumnFamilyOptions& options, VersionEdit* edit) {
+  assert(edit->is_column_family_add_);
+
+  // Sentinel head of the circular version list; owned by the CFD.
+  Version* dummy_versions = new Version(nullptr, this);
+  auto new_cfd = column_family_set_->CreateColumnFamily(
+      edit->column_family_name_, edit->column_family_, dummy_versions, options);
+
+  Version* v = new Version(new_cfd, this, current_version_number_++);
+
+  AppendVersion(new_cfd, v);
+  new_cfd->CreateNewMemtable();
+  new_cfd->SetLogNumber(edit->log_number_);
+  return new_cfd;
+}
+
+}  // namespace rocksdb
diff --git a/db/version_set.h b/db/version_set.h
new file mode 100644 (file)
index 0000000..13a1383
--- /dev/null
@@ -0,0 +1,493 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The representation of a DBImpl consists of a set of Versions.  The
+// newest version is called "current".  Older versions may be kept
+// around to provide a consistent view to live iterators.
+//
+// Each Version keeps track of a set of Table files per level.  The
+// entire set of versions is maintained in a VersionSet.
+//
+// Version,VersionSet are thread-compatible, but require external
+// synchronization on all accesses.
+
+#pragma once
+#include <map>
+#include <memory>
+#include <set>
+#include <vector>
+#include <deque>
+#include <atomic>
+#include <limits>
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "db/table_cache.h"
+#include "db/compaction.h"
+#include "db/compaction_picker.h"
+#include "db/column_family.h"
+#include "db/log_reader.h"
+#include "db/file_indexer.h"
+
+namespace rocksdb {
+
+namespace log { class Writer; }
+
+class Compaction;
+class CompactionPicker;
+class Iterator;
+class LogBuffer;
+class LookupKey;
+class MemTable;
+class Version;
+class VersionSet;
+class MergeContext;
+class ColumnFamilyData;
+class ColumnFamilySet;
+class TableCache;
+
+// Return the smallest index i such that files[i]->largest >= key.
+// Return files.size() if there is no such file.
+// REQUIRES: "files" contains a sorted list of non-overlapping files.
+extern int FindFile(const InternalKeyComparator& icmp,
+                    const std::vector<FileMetaData*>& files,
+                    const Slice& key);
+
+// Returns true iff some file in "files" overlaps the user key range
+// [*smallest,*largest].
+// smallest==nullptr represents a key smaller than all keys in the DB.
+// largest==nullptr represents a key larger than all keys in the DB.
+// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
+//           in sorted order.
+extern bool SomeFileOverlapsRange(
+    const InternalKeyComparator& icmp,
+    bool disjoint_sorted_files,
+    const std::vector<FileMetaData*>& files,
+    const Slice* smallest_user_key,
+    const Slice* largest_user_key);
+
+// A Version holds one immutable snapshot of a column family's LSM state:
+// the set of table files at each level (see the file header comment).
+// Versions are reference-counted and linked into a per-cfd list.
+class Version {
+ public:
+  // Append to *iters a sequence of iterators that will
+  // yield the contents of this Version when merged together.
+  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+  void AddIterators(const ReadOptions&, const EnvOptions& soptions,
+                    std::vector<Iterator*>* iters);
+
+  // Lookup the value for key.  If found, store it in *val and
+  // return OK.  Else return a non-OK status.  Fills *stats.
+  // Uses *operands to store merge_operator operations to apply later
+  // REQUIRES: lock is not held
+  struct GetStats {
+    FileMetaData* seek_file;
+    int seek_file_level;
+  };
+  void Get(const ReadOptions&, const LookupKey& key, std::string* val,
+           Status* status, MergeContext* merge_context, GetStats* stats,
+           bool* value_found = nullptr);
+
+  // Adds "stats" into the current state.  Returns true if a new
+  // compaction may need to be triggered, false otherwise.
+  // REQUIRES: lock is held
+  bool UpdateStats(const GetStats& stats);
+
+  // Updates internal structures that keep track of compaction scores
+  // We use compaction scores to figure out which compaction to do next
+  // REQUIRES: If Version is not yet saved to current_, it can be called without
+  // a lock. Once a version is saved to current_, call only with mutex held
+  void ComputeCompactionScore(std::vector<uint64_t>& size_being_compacted);
+
+  // Reference count management (so Versions do not disappear out from
+  // under live iterators)
+  void Ref();
+  // Decrease reference count. Delete the object if no reference left
+  // and return true. Otherwise, return false.
+  bool Unref();
+
+  // Returns true iff some level needs a compaction.
+  bool NeedsCompaction() const;
+
+  // Returns the maximum compaction score for levels 1 to max
+  double MaxCompactionScore() const { return max_compaction_score_; }
+
+  // See field declaration
+  int MaxCompactionScoreLevel() const { return max_compaction_score_level_; }
+
+  // Store in "*inputs" all files in "level" that overlap [begin,end].
+  void GetOverlappingInputs(
+      int level,
+      const InternalKey* begin,         // nullptr means before all keys
+      const InternalKey* end,           // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      int hint_index = -1,              // index of overlap file
+      int* file_index = nullptr);          // return index of overlap file
+
+  void GetOverlappingInputsBinarySearch(
+      int level,
+      const Slice& begin,         // nullptr means before all keys
+      const Slice& end,           // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      int hint_index,             // index of overlap file
+      int* file_index);           // return index of overlap file
+
+  void ExtendOverlappingInputs(
+      int level,
+      const Slice& begin,         // nullptr means before all keys
+      const Slice& end,           // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      unsigned int index);                 // start extending from this index
+
+  // Returns true iff some file in the specified level overlaps
+  // some part of [*smallest_user_key,*largest_user_key].
+  // smallest_user_key==NULL represents a key smaller than all keys in the DB.
+  // largest_user_key==NULL represents a key larger than all keys in the DB.
+  bool OverlapInLevel(int level,
+                      const Slice* smallest_user_key,
+                      const Slice* largest_user_key);
+
+  // Returns true iff the first or last file in inputs contains
+  // an overlapping user key to the file "just outside" of it (i.e.
+  // just after the last file, or just before the first file)
+  // REQUIRES: "*inputs" is a sorted list of non-overlapping files
+  bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
+                             int level);
+
+
+  // Return the level at which we should place a new memtable compaction
+  // result that covers the range [smallest_user_key,largest_user_key].
+  int PickLevelForMemTableOutput(const Slice& smallest_user_key,
+                                 const Slice& largest_user_key);
+
+  int NumberLevels() const { return num_levels_; }
+
+  // REQUIRES: lock is held
+  int NumLevelFiles(int level) const { return files_[level].size(); }
+
+  // Return the combined file size of all files at the specified level.
+  int64_t NumLevelBytes(int level) const;
+
+  // Return a human-readable short (single-line) summary of the number
+  // of files per level.  Uses *scratch as backing store.
+  struct LevelSummaryStorage {
+    char buffer[100];
+  };
+  struct FileSummaryStorage {
+    char buffer[1000];
+  };
+  const char* LevelSummary(LevelSummaryStorage* scratch) const;
+  // Return a human-readable short (single-line) summary of files
+  // in a specified level.  Uses *scratch as backing store.
+  const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  int64_t MaxNextLevelOverlappingBytes();
+
+  // Add all files listed in the current version to *live.
+  void AddLiveFiles(std::set<uint64_t>* live);
+
+  // Return a human readable string that describes this version's contents.
+  std::string DebugString(bool hex = false) const;
+
+  // Returns the version number of this version
+  uint64_t GetVersionNumber() const { return version_number_; }
+
+  // REQUIRES: lock is held
+  // On success, *props will be populated with all SSTables' table properties.
+  // The keys of `props` are the sst file name, the values of `props` are the
+  // tables' properties, represented as shared_ptr.
+  Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
+
+  // used to sort files by size
+  struct Fsize {
+    int index;
+    FileMetaData* file;
+  };
+
+ private:
+  friend class Compaction;
+  friend class VersionSet;
+  friend class DBImpl;
+  friend class ColumnFamilyData;
+  friend class CompactionPicker;
+  friend class LevelCompactionPicker;
+  friend class UniversalCompactionPicker;
+
+  class LevelFileNumIterator;
+  class LevelFileIteratorState;
+
+  bool PrefixMayMatch(const ReadOptions& options, Iterator* level_iter,
+                      const Slice& internal_prefix) const;
+
+  // Sort all files for this version based on their file size and
+  // record results in files_by_size_. The largest files are listed first.
+  void UpdateFilesBySize();
+
+  ColumnFamilyData* cfd_;  // ColumnFamilyData to which this Version belongs
+  const InternalKeyComparator* internal_comparator_;
+  const Comparator* user_comparator_;
+  TableCache* table_cache_;
+  const MergeOperator* merge_operator_;
+  Logger* info_log_;
+  Statistics* db_statistics_;
+  VersionSet* vset_;            // VersionSet to which this Version belongs
+  Version* next_;               // Next version in linked list
+  Version* prev_;               // Previous version in linked list
+  int refs_;                    // Number of live refs to this version
+  int num_levels_;              // Number of levels
+
+  // List of files per level, files in each level are arranged
+  // in increasing order of keys
+  std::vector<FileMetaData*>* files_;
+
+  // A list for the same set of files that are stored in files_,
+  // but files in each level are now sorted based on file
+  // size. The file with the largest size is at the front.
+  // This vector stores the index of the file from files_.
+  std::vector<std::vector<int>> files_by_size_;
+
+  // An index into files_by_size_ that specifies the first
+  // file that is not yet compacted
+  std::vector<int> next_file_to_compact_by_size_;
+
+  // Only the first few entries of files_by_size_ are sorted.
+  // There is no need to sort all the files because it is likely
+  // that on a running system, we need to look at only the first
+  // few largest files because a new version is created every few
+  // seconds/minutes (because of concurrent compactions).
+  static const int number_of_files_to_sort_ = 50;
+
+  // Next file to compact based on seek stats.
+  FileMetaData* file_to_compact_;
+  int file_to_compact_level_;
+
+  // Level that should be compacted next and its compaction score.
+  // Score < 1 means compaction is not strictly needed.  These fields
+  // are initialized by Finalize().
+  // The most critical level to be compacted is listed first
+  // These are used to pick the best compaction level
+  std::vector<double> compaction_score_;
+  std::vector<int> compaction_level_;
+  double max_compaction_score_; // max score in l1 to ln-1
+  int max_compaction_score_level_; // level on which max score occurs
+
+  // A version number that uniquely represents this version. This is
+  // used for debugging and logging purposes only.
+  uint64_t version_number_;
+
+  // Constructor and destructor are private: Versions are created and
+  // destroyed only by VersionSet and friends.
+  Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0);
+  FileIndexer file_indexer_;
+
+  ~Version();
+
+  // re-initializes the index that is used to offset into files_by_size_
+  // to find the next compaction candidate file.
+  void ResetNextCompactionIndex(int level) {
+    next_file_to_compact_by_size_[level] = 0;
+  }
+
+  // No copying allowed
+  Version(const Version&);
+  void operator=(const Version&);
+};
+
+// VersionSet owns the manifest-backed state of the whole DB: the set of
+// column families, their live Versions, the file-number and sequence-number
+// counters, and the descriptor (MANIFEST) log writer.
+class VersionSet {
+ public:
+  VersionSet(const std::string& dbname, const DBOptions* options,
+             const EnvOptions& storage_options, Cache* table_cache);
+  ~VersionSet();
+
+  // Apply *edit to the current version to form a new descriptor that
+  // is both saved to persistent state and installed as the new
+  // current version.  Will release *mu while actually writing to the file.
+  // column_family_options has to be set if edit is column family add
+  // REQUIRES: *mu is held on entry.
+  // REQUIRES: no other thread concurrently calls LogAndApply()
+  Status LogAndApply(ColumnFamilyData* column_family_data, VersionEdit* edit,
+                     port::Mutex* mu, Directory* db_directory = nullptr,
+                     bool new_descriptor_log = false,
+                     const ColumnFamilyOptions* column_family_options =
+                         nullptr);
+
+  // Recover the last saved descriptor from persistent storage.
+  // If read_only == true, Recover() will not complain if some column families
+  // are not opened
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                 bool read_only = false);
+
+  // Reads a manifest file and returns a list of column families in
+  // column_families.
+  static Status ListColumnFamilies(std::vector<std::string>* column_families,
+                                   const std::string& dbname, Env* env);
+
+#ifndef ROCKSDB_LITE
+  // Try to reduce the number of levels. This call is valid only when
+  // a single level among [new max level, old max level] contains files.
+  // The call is static, since number of levels is immutable during
+  // the lifetime of a RocksDB instance. It reduces number of levels
+  // in a DB by applying changes to manifest.
+  // For example, a db currently has 7 levels [0-6], and a call to
+  // to reduce to 5 [0-4] can only be executed when only one level
+  // among [4-6] contains files.
+  static Status ReduceNumberOfLevels(const std::string& dbname,
+                                     const Options* options,
+                                     const EnvOptions& storage_options,
+                                     int new_levels);
+
+  // printf contents (for debugging)
+  Status DumpManifest(Options& options, std::string& manifestFileName,
+                      bool verbose, bool hex = false);
+
+#endif  // ROCKSDB_LITE
+
+  // Return the current manifest file number
+  uint64_t ManifestFileNumber() const { return manifest_file_number_; }
+
+  uint64_t PendingManifestFileNumber() const {
+    return pending_manifest_file_number_;
+  }
+
+  // Allocate and return a new file number
+  uint64_t NewFileNumber() { return next_file_number_++; }
+
+  // Arrange to reuse "file_number" unless a newer file number has
+  // already been allocated.
+  // REQUIRES: "file_number" was returned by a call to NewFileNumber().
+  void ReuseFileNumber(uint64_t file_number) {
+    if (next_file_number_ == file_number + 1) {
+      next_file_number_ = file_number;
+    }
+  }
+
+  // Return the last sequence number.
+  uint64_t LastSequence() const {
+    return last_sequence_.load(std::memory_order_acquire);
+  }
+
+  // Set the last sequence number to s.
+  // REQUIRES: s must not move the sequence number backwards.
+  void SetLastSequence(uint64_t s) {
+    assert(s >= last_sequence_);
+    last_sequence_.store(s, std::memory_order_release);
+  }
+
+  // Mark the specified file number as used.
+  void MarkFileNumberUsed(uint64_t number);
+
+  // Return the log file number for the log file that is currently
+  // being compacted, or zero if there is no such log file.
+  uint64_t PrevLogNumber() const { return prev_log_number_; }
+
+  // Returns the minimum log number such that all
+  // log numbers less than or equal to it can be deleted
+  uint64_t MinLogNumber() const {
+    uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+    for (auto cfd : *column_family_set_) {
+      if (min_log_num > cfd->GetLogNumber()) {
+        min_log_num = cfd->GetLogNumber();
+      }
+    }
+    return min_log_num;
+  }
+
+  // Create an iterator that reads over the compaction inputs for "*c".
+  // The caller should delete the iterator when no longer needed.
+  Iterator* MakeInputIterator(Compaction* c);
+
+  // Add all files listed in any live version to *live.
+  void AddLiveFiles(std::vector<uint64_t>* live_list);
+
+  // Return the approximate offset in the database of the data for
+  // "key" as of version "v".
+  uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
+
+  // Return the size of the current manifest file
+  uint64_t ManifestFileSize() const { return manifest_file_size_; }
+
+  // verify that the files that we started with for a compaction
+  // still exist in the current version and in the same original level.
+  // This ensures that a concurrent compaction did not erroneously
+  // pick the same files to compact.
+  bool VerifyCompactionFileConsistency(Compaction* c);
+
+  // Locate file "number" in any level of any column family; on success
+  // fills *filelevel, *metadata and *cfd.
+  Status GetMetadataForFile(uint64_t number, int* filelevel,
+                            FileMetaData** metadata, ColumnFamilyData** cfd);
+
+  void GetLiveFilesMetaData(
+    std::vector<LiveFileMetaData> *metadata);
+
+  void GetObsoleteFiles(std::vector<FileMetaData*>* files);
+
+  ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
+
+ private:
+  class Builder;
+  struct ManifestWriter;
+
+  friend class Version;
+
+  // Routes manifest-log read corruption into a caller-owned Status.
+  struct LogReporter : public log::Reader::Reporter {
+    Status* status;
+    virtual void Corruption(size_t bytes, const Status& s) {
+      if (this->status->ok()) *this->status = s;
+    }
+  };
+
+  // Save current contents to *log
+  Status WriteSnapshot(log::Writer* log);
+
+  void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
+
+  bool ManifestContains(uint64_t manifest_file_number,
+                        const std::string& record) const;
+
+  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& options,
+                                       VersionEdit* edit);
+
+  std::unique_ptr<ColumnFamilySet> column_family_set_;
+
+  Env* const env_;
+  const std::string dbname_;
+  const DBOptions* const options_;
+  uint64_t next_file_number_;
+  uint64_t manifest_file_number_;
+  uint64_t pending_manifest_file_number_;
+  std::atomic<uint64_t> last_sequence_;
+  uint64_t prev_log_number_;  // 0 or backing store for memtable being compacted
+
+  // Opened lazily
+  unique_ptr<log::Writer> descriptor_log_;
+
+  // generates an increasing version number for every new version
+  uint64_t current_version_number_;
+
+  // Queue of writers to the manifest file
+  std::deque<ManifestWriter*> manifest_writers_;
+
+  // Current size of manifest file
+  uint64_t manifest_file_size_;
+
+  std::vector<FileMetaData*> obsolete_files_;
+
+  // storage options for all reads and writes except compactions
+  const EnvOptions& storage_options_;
+
+  // storage options used for compactions. This is a copy of
+  // storage_options_ but with readaheads set to readahead_compactions_.
+  const EnvOptions storage_options_compactions_;
+
+  // No copying allowed
+  VersionSet(const VersionSet&);
+  void operator=(const VersionSet&);
+
+  void LogAndApplyCFHelper(VersionEdit* edit);
+  void LogAndApplyHelper(ColumnFamilyData* cfd, Builder* b, Version* v,
+                         VersionEdit* edit, port::Mutex* mu);
+};
+
+}  // namespace rocksdb
diff --git a/db/version_set_test.cc b/db/version_set_test.cc
new file mode 100644 (file)
index 0000000..1af95dd
--- /dev/null
@@ -0,0 +1,184 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+// Test fixture: builds a vector of FileMetaData and exercises FindFile()
+// and SomeFileOverlapsRange() against it.
+class FindFileTest {
+ public:
+  std::vector<FileMetaData*> files_;
+  bool disjoint_sorted_files_;
+
+  FindFileTest() : disjoint_sorted_files_(true) { }
+
+  ~FindFileTest() {
+    for (unsigned int i = 0; i < files_.size(); i++) {
+      delete files_[i];
+    }
+  }
+
+  // Appends a file covering key range [smallest, largest]; file numbers
+  // are assigned sequentially starting from 1.
+  void Add(const char* smallest, const char* largest,
+           SequenceNumber smallest_seq = 100,
+           SequenceNumber largest_seq = 100) {
+    FileMetaData* f = new FileMetaData;
+    f->number = files_.size() + 1;
+    f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
+    f->largest = InternalKey(largest, largest_seq, kTypeValue);
+    files_.push_back(f);
+  }
+
+  // Returns the index FindFile() picks for "key" at sequence number 100.
+  int Find(const char* key) {
+    InternalKey target(key, 100, kTypeValue);
+    InternalKeyComparator cmp(BytewiseComparator());
+    return FindFile(cmp, files_, target.Encode());
+  }
+
+  // Wraps SomeFileOverlapsRange(); nullptr bounds mean "unbounded".
+  bool Overlaps(const char* smallest, const char* largest) {
+    InternalKeyComparator cmp(BytewiseComparator());
+    Slice s(smallest != nullptr ? smallest : "");
+    Slice l(largest != nullptr ? largest : "");
+    return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_,
+                                 (smallest != nullptr ? &s : nullptr),
+                                 (largest != nullptr ? &l : nullptr));
+  }
+};
+
+// With no files, Find() returns 0 and nothing overlaps.
+TEST(FindFileTest, Empty) {
+  ASSERT_EQ(0, Find("foo"));
+  ASSERT_TRUE(! Overlaps("a", "z"));
+  ASSERT_TRUE(! Overlaps(nullptr, "z"));
+  ASSERT_TRUE(! Overlaps("a", nullptr));
+  ASSERT_TRUE(! Overlaps(nullptr, nullptr));
+}
+
+// One file ["p","q"]: Find() returns 0 for keys <= "q", 1 past it; overlap
+// checks cover both closed and open (nullptr) bounds.
+TEST(FindFileTest, Single) {
+  Add("p", "q");
+  ASSERT_EQ(0, Find("a"));
+  ASSERT_EQ(0, Find("p"));
+  ASSERT_EQ(0, Find("p1"));
+  ASSERT_EQ(0, Find("q"));
+  ASSERT_EQ(1, Find("q1"));
+  ASSERT_EQ(1, Find("z"));
+
+  ASSERT_TRUE(! Overlaps("a", "b"));
+  ASSERT_TRUE(! Overlaps("z1", "z2"));
+  ASSERT_TRUE(Overlaps("a", "p"));
+  ASSERT_TRUE(Overlaps("a", "q"));
+  ASSERT_TRUE(Overlaps("a", "z"));
+  ASSERT_TRUE(Overlaps("p", "p1"));
+  ASSERT_TRUE(Overlaps("p", "q"));
+  ASSERT_TRUE(Overlaps("p", "z"));
+  ASSERT_TRUE(Overlaps("p1", "p2"));
+  ASSERT_TRUE(Overlaps("p1", "z"));
+  ASSERT_TRUE(Overlaps("q", "q"));
+  ASSERT_TRUE(Overlaps("q", "q1"));
+
+  ASSERT_TRUE(! Overlaps(nullptr, "j"));
+  ASSERT_TRUE(! Overlaps("r", nullptr));
+  ASSERT_TRUE(Overlaps(nullptr, "p"));
+  ASSERT_TRUE(Overlaps(nullptr, "p1"));
+  ASSERT_TRUE(Overlaps("q", nullptr));
+  ASSERT_TRUE(Overlaps(nullptr, nullptr));
+}
+
+
+// Four disjoint files: Find() returns the first file whose largest key is
+// >= the probe, i.e. boundary keys map to the file that contains them.
+TEST(FindFileTest, Multiple) {
+  Add("150", "200");
+  Add("200", "250");
+  Add("300", "350");
+  Add("400", "450");
+  ASSERT_EQ(0, Find("100"));
+  ASSERT_EQ(0, Find("150"));
+  ASSERT_EQ(0, Find("151"));
+  ASSERT_EQ(0, Find("199"));
+  ASSERT_EQ(0, Find("200"));
+  ASSERT_EQ(1, Find("201"));
+  ASSERT_EQ(1, Find("249"));
+  ASSERT_EQ(1, Find("250"));
+  ASSERT_EQ(2, Find("251"));
+  ASSERT_EQ(2, Find("299"));
+  ASSERT_EQ(2, Find("300"));
+  ASSERT_EQ(2, Find("349"));
+  ASSERT_EQ(2, Find("350"));
+  ASSERT_EQ(3, Find("351"));
+  ASSERT_EQ(3, Find("400"));
+  ASSERT_EQ(3, Find("450"));
+  ASSERT_EQ(4, Find("451"));
+
+  ASSERT_TRUE(! Overlaps("100", "149"));
+  ASSERT_TRUE(! Overlaps("251", "299"));
+  ASSERT_TRUE(! Overlaps("451", "500"));
+  ASSERT_TRUE(! Overlaps("351", "399"));
+
+  ASSERT_TRUE(Overlaps("100", "150"));
+  ASSERT_TRUE(Overlaps("100", "200"));
+  ASSERT_TRUE(Overlaps("100", "300"));
+  ASSERT_TRUE(Overlaps("100", "400"));
+  ASSERT_TRUE(Overlaps("100", "500"));
+  ASSERT_TRUE(Overlaps("375", "400"));
+  ASSERT_TRUE(Overlaps("450", "450"));
+  ASSERT_TRUE(Overlaps("450", "500"));
+}
+
+// Same four files, but with one or both range bounds unbounded (nullptr).
+TEST(FindFileTest, MultipleNullBoundaries) {
+  Add("150", "200");
+  Add("200", "250");
+  Add("300", "350");
+  Add("400", "450");
+  ASSERT_TRUE(! Overlaps(nullptr, "149"));
+  ASSERT_TRUE(! Overlaps("451", nullptr));
+  ASSERT_TRUE(Overlaps(nullptr, nullptr));
+  ASSERT_TRUE(Overlaps(nullptr, "150"));
+  ASSERT_TRUE(Overlaps(nullptr, "199"));
+  ASSERT_TRUE(Overlaps(nullptr, "200"));
+  ASSERT_TRUE(Overlaps(nullptr, "201"));
+  ASSERT_TRUE(Overlaps(nullptr, "400"));
+  ASSERT_TRUE(Overlaps(nullptr, "800"));
+  ASSERT_TRUE(Overlaps("100", nullptr));
+  ASSERT_TRUE(Overlaps("200", nullptr));
+  ASSERT_TRUE(Overlaps("449", nullptr));
+  ASSERT_TRUE(Overlaps("450", nullptr));
+}
+
+// Overlap is decided on user keys only, even when the file's internal-key
+// sequence numbers are unusual (smallest_seq > largest_seq here).
+TEST(FindFileTest, OverlapSequenceChecks) {
+  Add("200", "200", 5000, 3000);
+  ASSERT_TRUE(! Overlaps("199", "199"));
+  ASSERT_TRUE(! Overlaps("201", "300"));
+  ASSERT_TRUE(Overlaps("200", "200"));
+  ASSERT_TRUE(Overlaps("190", "200"));
+  ASSERT_TRUE(Overlaps("200", "210"));
+}
+
+// With disjoint_sorted_files_ == false, SomeFileOverlapsRange must fall
+// back to a full scan instead of binary search.
+TEST(FindFileTest, OverlappingFiles) {
+  Add("150", "600");
+  Add("400", "500");
+  disjoint_sorted_files_ = false;
+  ASSERT_TRUE(! Overlaps("100", "149"));
+  ASSERT_TRUE(! Overlaps("601", "700"));
+  ASSERT_TRUE(Overlaps("100", "150"));
+  ASSERT_TRUE(Overlaps("100", "200"));
+  ASSERT_TRUE(Overlaps("100", "300"));
+  ASSERT_TRUE(Overlaps("100", "400"));
+  ASSERT_TRUE(Overlaps("100", "500"));
+  ASSERT_TRUE(Overlaps("375", "400"));
+  ASSERT_TRUE(Overlaps("450", "450"));
+  ASSERT_TRUE(Overlaps("450", "500"));
+  ASSERT_TRUE(Overlaps("450", "700"));
+  ASSERT_TRUE(Overlaps("600", "700"));
+}
+
+}  // namespace rocksdb
+
+// Test entry point: runs every TEST registered in this file.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/write_batch.cc b/db/write_batch.cc
new file mode 100644 (file)
index 0000000..734d1e3
--- /dev/null
@@ -0,0 +1,489 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+//    sequence: fixed64
+//    count: fixed32
+//    data: record[count]
+// record :=
+//    kTypeValue varstring varstring
+//    kTypeMerge varstring varstring
+//    kTypeDeletion varstring
+//    kTypeColumnFamilyValue varint32 varstring varstring
+//    kTypeColumnFamilyMerge varint32 varstring varstring
+//    kTypeColumnFamilyDeletion varint32 varstring
+// varstring :=
+//    len: varint32
+//    data: uint8[len]
+
+#include "rocksdb/write_batch.h"
+#include "rocksdb/options.h"
+#include "rocksdb/merge_operator.h"
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/memtable.h"
+#include "db/snapshot.h"
+#include "db/write_batch_internal.h"
+#include "util/coding.h"
+#include "util/statistics.h"
+#include <stdexcept>
+
+namespace rocksdb {
+
+// WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
+static const size_t kHeader = 12;
+
+// Reserve at least kHeader bytes (Clear() resizes rep_ to kHeader), or more
+// if the caller asked for a larger initial capacity.
+WriteBatch::WriteBatch(size_t reserved_bytes) {
+  rep_.reserve((reserved_bytes > kHeader) ? reserved_bytes : kHeader);
+  Clear();
+}
+
+WriteBatch::~WriteBatch() { }
+
+WriteBatch::Handler::~Handler() { }
+
+// Default implementations for the legacy (non-column-family) Handler hooks:
+// subclasses that only override the *CF variants never reach these.
+void WriteBatch::Handler::Put(const Slice& key, const Slice& value) {
+  // you need to either implement Put or PutCF
+  throw std::runtime_error("Handler::Put not implemented!");
+}
+
+void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) {
+  // you need to either implement Merge or MergeCF
+  throw std::runtime_error("Handler::Merge not implemented!");
+}
+
+void WriteBatch::Handler::Delete(const Slice& key) {
+  // you need to either implement Delete or DeleteCF
+  throw std::runtime_error("Handler::Delete not implemented!");
+}
+
+void WriteBatch::Handler::LogData(const Slice& blob) {
+  // If the user has not specified something to do with blobs, then we ignore
+  // them.
+}
+
+// Returning false stops Iterate() early; default is to visit every record.
+bool WriteBatch::Handler::Continue() {
+  return true;
+}
+
+// Resets the batch to the empty state: just the zeroed 12-byte header
+// (sequence number 0, count 0).
+void WriteBatch::Clear() {
+  rep_.clear();
+  rep_.resize(kHeader);
+}
+
+// Number of entries recorded in the batch header.
+int WriteBatch::Count() const {
+  return WriteBatchInternal::Count(this);
+}
+
+// Replays every record in the batch through *handler, in insertion order.
+// Returns Corruption if the serialized form is malformed or if the record
+// count in the header does not match the records actually decoded.
+Status WriteBatch::Iterate(Handler* handler) const {
+  Slice input(rep_);
+  if (input.size() < kHeader) {
+    return Status::Corruption("malformed WriteBatch (too small)");
+  }
+
+  input.remove_prefix(kHeader);
+  Slice key, value, blob;
+  int found = 0;
+  Status s;
+  while (s.ok() && !input.empty() && handler->Continue()) {
+    char tag = input[0];
+    input.remove_prefix(1);
+    uint32_t column_family = 0;  // default
+    switch (tag) {
+      // The kTypeColumnFamily* tags carry a varint32 column family id first;
+      // after consuming it, decoding falls through to the default-family case.
+      case kTypeColumnFamilyValue:
+        if (!GetVarint32(&input, &column_family)) {
+          return Status::Corruption("bad WriteBatch Put");
+        }
+      // intentional fallthrough
+      case kTypeValue:
+        if (GetLengthPrefixedSlice(&input, &key) &&
+            GetLengthPrefixedSlice(&input, &value)) {
+          s = handler->PutCF(column_family, key, value);
+          found++;
+        } else {
+          return Status::Corruption("bad WriteBatch Put");
+        }
+        break;
+      case kTypeColumnFamilyDeletion:
+        if (!GetVarint32(&input, &column_family)) {
+          return Status::Corruption("bad WriteBatch Delete");
+        }
+      // intentional fallthrough
+      case kTypeDeletion:
+        if (GetLengthPrefixedSlice(&input, &key)) {
+          s = handler->DeleteCF(column_family, key);
+          found++;
+        } else {
+          return Status::Corruption("bad WriteBatch Delete");
+        }
+        break;
+      case kTypeColumnFamilyMerge:
+        if (!GetVarint32(&input, &column_family)) {
+          return Status::Corruption("bad WriteBatch Merge");
+        }
+      // intentional fallthrough
+      case kTypeMerge:
+        if (GetLengthPrefixedSlice(&input, &key) &&
+            GetLengthPrefixedSlice(&input, &value)) {
+          s = handler->MergeCF(column_family, key, value);
+          found++;
+        } else {
+          return Status::Corruption("bad WriteBatch Merge");
+        }
+        break;
+      // Log-data blobs are metadata: they are delivered to the handler but
+      // deliberately not counted against the header's record count.
+      case kTypeLogData:
+        if (GetLengthPrefixedSlice(&input, &blob)) {
+          handler->LogData(blob);
+        } else {
+          return Status::Corruption("bad WriteBatch Blob");
+        }
+        break;
+      default:
+        return Status::Corruption("unknown WriteBatch tag");
+    }
+  }
+  if (!s.ok()) {
+    return s;
+  }
+  if (found != WriteBatchInternal::Count(this)) {
+    return Status::Corruption("WriteBatch has wrong count");
+  } else {
+    return Status::OK();
+  }
+}
+
+// The record count is the fixed32 at byte offset 8 of the header.
+int WriteBatchInternal::Count(const WriteBatch* b) {
+  return DecodeFixed32(b->rep_.data() + 8);
+}
+
+void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
+  EncodeFixed32(&b->rep_[8], n);
+}
+
+// The sequence number is the fixed64 in the first 8 bytes of the header.
+SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
+  return SequenceNumber(DecodeFixed64(b->rep_.data()));
+}
+
+void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
+  EncodeFixed64(&b->rep_[0], seq);
+}
+
+// Appends a Put record: bumps the header count, then writes the tag
+// (column-family-prefixed for non-default families) followed by the
+// length-prefixed key and value.
+void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
+                             const Slice& key, const Slice& value) {
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    // Default column family uses the compact legacy tag with no id.
+    b->rep_.push_back(static_cast<char>(kTypeValue));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSlice(&b->rep_, key);
+  PutLengthPrefixedSlice(&b->rep_, value);
+}
+
+namespace {
+// Maps a ColumnFamilyHandle to its numeric id; nullptr means the default
+// column family (id 0).
+inline uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
+  uint32_t column_family_id = 0;
+  if (column_family != nullptr) {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+    column_family_id = cfh->GetID();
+  }
+  return column_family_id;
+}
+}  // namespace
+
+// Public Put: resolves the column family handle to an id and delegates.
+void WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) {
+  WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, value);
+}
+
+// SliceParts variant of Put: the key/value fragments are concatenated into
+// single length-prefixed records, producing the same on-disk format as the
+// Slice overload above.
+void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
+                             const SliceParts& key, const SliceParts& value) {
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeValue));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSliceParts(&b->rep_, key);
+  PutLengthPrefixedSliceParts(&b->rep_, value);
+}
+
+// Public SliceParts Put: resolves the handle to an id and delegates.
+void WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+                     const SliceParts& value) {
+  WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, value);
+}
+
+// Appends a Delete record; same column family tagging scheme as Put.
+void WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
+                                const Slice& key) {
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeDeletion));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSlice(&b->rep_, key);
+}
+
+// Public Delete: resolves the handle to a column family id and delegates.
+void WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) {
+  WriteBatchInternal::Delete(this, GetColumnFamilyID(column_family), key);
+}
+
+// Appends a Merge record; same column family tagging scheme as Put.
+void WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
+                               const Slice& key, const Slice& value) {
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeMerge));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSlice(&b->rep_, key);
+  PutLengthPrefixedSlice(&b->rep_, value);
+}
+
+// Public Merge: resolves the handle to a column family id and delegates.
+void WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) {
+  WriteBatchInternal::Merge(this, GetColumnFamilyID(column_family), key, value);
+}
+
+// Appends an out-of-band blob. It is handed to Handler::LogData() during
+// iteration but is never applied to a memtable, and it does not bump the
+// batch's entry count.
+void WriteBatch::PutLogData(const Slice& blob) {
+  rep_.push_back(static_cast<char>(kTypeLogData));
+  PutLengthPrefixedSlice(&rep_, blob);
+}
+
+namespace {
+// WriteBatch::Handler that applies each record of a batch to the memtable
+// of the record's column family. Used both on the client write code-path
+// and when replaying write-ahead-log records during recovery.
+class MemTableInserter : public WriteBatch::Handler {
+ public:
+  SequenceNumber sequence_;  // sequence number assigned to the next record
+  ColumnFamilyMemTables* cf_mems_;  // resolves column family id -> memtable
+  bool recovery_;  // true when replaying a write-ahead log
+  uint64_t log_number_;  // number of the log being replayed (recovery only)
+  DBImpl* db_;
+  const bool dont_filter_deletes_;  // if false, Deletes may be filtered out
+
+  MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems,
+                   bool recovery, uint64_t log_number, DB* db,
+                   const bool dont_filter_deletes)
+      : sequence_(sequence),
+        cf_mems_(cf_mems),
+        recovery_(recovery),
+        log_number_(log_number),
+        db_(reinterpret_cast<DBImpl*>(db)),
+        dont_filter_deletes_(dont_filter_deletes) {
+    assert(cf_mems);
+    if (!dont_filter_deletes_) {
+      // Delete filtering calls db_->KeyMayExist(), so a DB is required.
+      assert(db_);
+    }
+  }
+
+  // Seeks cf_mems_ to column_family_id. Returns true if the update should
+  // be applied; on false, *s holds the status to report to the caller.
+  bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
+    bool found = cf_mems_->Seek(column_family_id);
+    if (recovery_ && (!found || log_number_ < cf_mems_->GetLogNumber())) {
+      // if in recovery environment:
+      // * If column family was not found, it might mean that the WAL write
+      // batch references to the column family that was dropped after the
+      // insert. We don't want to fail the whole write batch in that case -- we
+      // just ignore the update.
+      // * If log_number_ < cf_mems_->GetLogNumber(), this means that column
+      // family already contains updates from this log. We can't apply updates
+      // twice because of update-in-place or merge workloads -- ignore the
+      // update
+      *s = Status::OK();
+      return false;
+    }
+    if (!found) {
+      assert(!recovery_);
+      // If the column family was not found in non-recovery environment
+      // (client's write code-path), we have to fail the write and return
+      // the failure status to the client.
+      *s = Status::InvalidArgument(
+          "Invalid column family specified in write batch");
+      return false;
+    }
+    return true;
+  }
+
+  // Applies a Put, honoring the in-place-update options when enabled.
+  virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+                       const Slice& value) {
+    Status seek_status;
+    if (!SeekToColumnFamily(column_family_id, &seek_status)) {
+      ++sequence_;
+      return seek_status;
+    }
+    MemTable* mem = cf_mems_->GetMemTable();
+    const Options* options = cf_mems_->GetOptions();
+    if (!options->inplace_update_support) {
+      mem->Add(sequence_, kTypeValue, key, value);
+    } else if (options->inplace_callback == nullptr) {
+      mem->Update(sequence_, key, value);
+      RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED);
+    } else {
+      if (mem->UpdateCallback(sequence_, key, value, *options)) {
+        // Key was found and updated in place by the callback; nothing to add.
+      } else {
+        // key not found in memtable. Do sst get, update, add
+        SnapshotImpl read_from_snapshot;
+        read_from_snapshot.number_ = sequence_;
+        ReadOptions ropts;
+        ropts.snapshot = &read_from_snapshot;
+
+        std::string prev_value;
+        std::string merged_value;
+
+        auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+        if (cf_handle == nullptr) {
+          cf_handle = db_->DefaultColumnFamily();
+        }
+        Status s = db_->Get(ropts, cf_handle, key, &prev_value);
+
+        char* prev_buffer = const_cast<char*>(prev_value.c_str());
+        uint32_t prev_size = prev_value.size();
+        auto status = options->inplace_callback(s.ok() ? prev_buffer : nullptr,
+                                                s.ok() ? &prev_size : nullptr,
+                                                value, &merged_value);
+        if (status == UpdateStatus::UPDATED_INPLACE) {
+          // prev_value is updated in-place with final value.
+          mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size));
+          RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN);
+        } else if (status == UpdateStatus::UPDATED) {
+          // merged_value contains the final value.
+          mem->Add(sequence_, kTypeValue, key, Slice(merged_value));
+          RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN);
+        }
+      }
+    }
+    // Since all Puts are logged in transaction logs (if enabled), always bump
+    // sequence number. Even if the update eventually fails and does not result
+    // in memtable add/update.
+    sequence_++;
+    return Status::OK();
+  }
+
+  // Applies a Merge. If max_successive_merges is exceeded, the merge is
+  // resolved eagerly via a Get + FullMerge; otherwise the raw merge operand
+  // is added to the memtable.
+  virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
+                         const Slice& value) {
+    Status seek_status;
+    if (!SeekToColumnFamily(column_family_id, &seek_status)) {
+      ++sequence_;
+      return seek_status;
+    }
+    MemTable* mem = cf_mems_->GetMemTable();
+    const Options* options = cf_mems_->GetOptions();
+    bool perform_merge = false;
+
+    if (options->max_successive_merges > 0 && db_ != nullptr) {
+      LookupKey lkey(key, sequence_);
+
+      // Count the number of successive merges at the head
+      // of the key in the memtable
+      size_t num_merges = mem->CountSuccessiveMergeEntries(lkey);
+
+      if (num_merges >= options->max_successive_merges) {
+        perform_merge = true;
+      }
+    }
+
+    if (perform_merge) {
+      // 1) Get the existing value
+      std::string get_value;
+
+      // Pass in the sequence number so that we also include previous merge
+      // operations in the same batch.
+      SnapshotImpl read_from_snapshot;
+      read_from_snapshot.number_ = sequence_;
+      ReadOptions read_options;
+      read_options.snapshot = &read_from_snapshot;
+
+      auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+      if (cf_handle == nullptr) {
+        cf_handle = db_->DefaultColumnFamily();
+      }
+      db_->Get(read_options, cf_handle, key, &get_value);
+      Slice get_value_slice = Slice(get_value);
+
+      // 2) Apply this merge
+      auto merge_operator = options->merge_operator.get();
+      assert(merge_operator);
+
+      std::deque<std::string> operands;
+      operands.push_front(value.ToString());
+      std::string new_value;
+      if (!merge_operator->FullMerge(key, &get_value_slice, operands,
+                                     &new_value, options->info_log.get())) {
+          // Failed to merge!
+        RecordTick(options->statistics.get(), NUMBER_MERGE_FAILURES);
+
+          // Store the delta in memtable
+          perform_merge = false;
+      } else {
+        // 3) Add value to memtable
+        mem->Add(sequence_, kTypeValue, key, new_value);
+      }
+    }
+
+    if (!perform_merge) {
+      // Add merge operator to memtable
+      mem->Add(sequence_, kTypeMerge, key, value);
+    }
+
+    sequence_++;
+    return Status::OK();
+  }
+
+  // Applies a Delete. When delete filtering is enabled, a Delete whose key
+  // provably does not exist (per KeyMayExist) is dropped entirely.
+  virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+    Status seek_status;
+    if (!SeekToColumnFamily(column_family_id, &seek_status)) {
+      ++sequence_;
+      return seek_status;
+    }
+    MemTable* mem = cf_mems_->GetMemTable();
+    const Options* options = cf_mems_->GetOptions();
+    if (!dont_filter_deletes_ && options->filter_deletes) {
+      SnapshotImpl read_from_snapshot;
+      read_from_snapshot.number_ = sequence_;
+      ReadOptions ropts;
+      ropts.snapshot = &read_from_snapshot;
+      std::string value;
+      auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+      if (cf_handle == nullptr) {
+        cf_handle = db_->DefaultColumnFamily();
+      }
+      if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) {
+        RecordTick(options->statistics.get(), NUMBER_FILTERED_DELETES);
+        return Status::OK();
+      }
+    }
+    mem->Add(sequence_, kTypeDeletion, key, Slice());
+    sequence_++;
+    return Status::OK();
+  }
+};
+}  // namespace
+
+// Applies every record of *b to the memtables resolved via *memtables,
+// starting at the batch's stored sequence number. See the declaration in
+// write_batch_internal.h for the recovery/filtering semantics.
+Status WriteBatchInternal::InsertInto(const WriteBatch* b,
+                                      ColumnFamilyMemTables* memtables,
+                                      bool recovery, uint64_t log_number,
+                                      DB* db, const bool dont_filter_deletes) {
+  MemTableInserter inserter(WriteBatchInternal::Sequence(b), memtables,
+                            recovery, log_number, db, dont_filter_deletes);
+  return b->Iterate(&inserter);
+}
+
+// Replaces the batch's entire serialized representation. contents must
+// include at least the fixed header (kHeader bytes).
+void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+  assert(contents.size() >= kHeader);
+  b->rep_.assign(contents.data(), contents.size());
+}
+
+// Concatenates src's records onto dst and sums the entry counts. dst keeps
+// its own sequence number; src's header is skipped.
+void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) {
+  SetCount(dst, Count(dst) + Count(src));
+  assert(src->rep_.size() >= kHeader);
+  dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader);
+}
+
+}  // namespace rocksdb
diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h
new file mode 100644 (file)
index 0000000..85e85b3
--- /dev/null
@@ -0,0 +1,123 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+class MemTable;
+
+// Abstraction that maps a column family id to the memtable (and options)
+// a write-batch record should be applied to. Seek() must be called before
+// any of the getters; they refer to the last sought-to column family.
+class ColumnFamilyMemTables {
+ public:
+  virtual ~ColumnFamilyMemTables() {}
+  // Positions this object at the given column family; returns true iff the
+  // column family exists.
+  virtual bool Seek(uint32_t column_family_id) = 0;
+  // Log number of the sought-to column family; during recovery, updates
+  // from logs older than this have already been processed and are ignored.
+  virtual uint64_t GetLogNumber() const = 0;
+  virtual MemTable* GetMemTable() const = 0;
+  virtual const Options* GetOptions() const = 0;
+  virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0;
+};
+
+// Single-memtable implementation used when only the default column family
+// (id 0) exists; Seek() succeeds solely for id 0.
+class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
+ public:
+  ColumnFamilyMemTablesDefault(MemTable* mem, const Options* options)
+      : ok_(false), mem_(mem), options_(options) {}
+
+  bool Seek(uint32_t column_family_id) override {
+    ok_ = (column_family_id == 0);
+    return ok_;
+  }
+
+  uint64_t GetLogNumber() const override { return 0; }
+
+  MemTable* GetMemTable() const override {
+    assert(ok_);
+    return mem_;
+  }
+
+  const Options* GetOptions() const override {
+    assert(ok_);
+    return options_;
+  }
+
+  ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; }
+
+ private:
+  bool ok_;  // true after a successful Seek(0)
+  MemTable* mem_;
+  const Options* const options_;
+};
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+class WriteBatchInternal {
+ public:
+  // WriteBatch methods with column_family_id instead of ColumnFamilyHandle*
+  static void Put(WriteBatch* batch, uint32_t column_family_id,
+                  const Slice& key, const Slice& value);
+
+  static void Put(WriteBatch* batch, uint32_t column_family_id,
+                  const SliceParts& key, const SliceParts& value);
+
+  static void Delete(WriteBatch* batch, uint32_t column_family_id,
+                     const Slice& key);
+
+  static void Merge(WriteBatch* batch, uint32_t column_family_id,
+                    const Slice& key, const Slice& value);
+
+  // Return the number of entries in the batch.
+  static int Count(const WriteBatch* batch);
+
+  // Set the count for the number of entries in the batch.
+  static void SetCount(WriteBatch* batch, int n);
+
+  // Return the sequence number for the start of this batch.
+  static SequenceNumber Sequence(const WriteBatch* batch);
+
+  // Store the specified number as the sequence number for the start of
+  // this batch.
+  static void SetSequence(WriteBatch* batch, SequenceNumber seq);
+
+  // Return the full serialized representation (header + records).
+  static Slice Contents(const WriteBatch* batch) {
+    return Slice(batch->rep_);
+  }
+
+  // Size in bytes of the serialized representation.
+  static size_t ByteSize(const WriteBatch* batch) {
+    return batch->rep_.size();
+  }
+
+  // Replace the batch's serialized representation wholesale.
+  static void SetContents(WriteBatch* batch, const Slice& contents);
+
+  // Inserts batch entries into memtable
+  // If dont_filter_deletes is false AND options.filter_deletes is true,
+  // then --> Drops deletes in batch if db->KeyMayExist returns false
+  // If recovery == true, this means InsertInto is executed on a recovery
+  // code-path. WriteBatch referencing a dropped column family can be
+  // found on a recovery code-path and should be ignored (recovery should not
+  // fail). Additionally, the memtable will be updated only if
+  // memtables->GetLogNumber() >= log_number
+  // However, if recovery == false, any WriteBatch referencing
+  // non-existing column family will return a failure. Also, log_number is
+  // ignored in that case
+  static Status InsertInto(const WriteBatch* batch,
+                           ColumnFamilyMemTables* memtables,
+                           bool recovery = false, uint64_t log_number = 0,
+                           DB* db = nullptr,
+                           const bool dont_filter_deletes = true);
+
+  // Concatenate src's entries onto dst (dst keeps its sequence number).
+  static void Append(WriteBatch* dst, const WriteBatch* src);
+};
+
+}  // namespace rocksdb
diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc
new file mode 100644 (file)
index 0000000..febd35c
--- /dev/null
@@ -0,0 +1,323 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/db.h"
+
+#include <memory>
+#include "db/memtable.h"
+#include "db/column_family.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+// Replays the batch into a fresh memtable and renders the result as a
+// string of "Put(k, v)@seq" / "Merge(...)@seq" / "Delete(k)@seq" entries in
+// memtable (sorted) order; appends a status or "CountMismatch()" on error.
+static std::string PrintContents(WriteBatch* b) {
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto factory = std::make_shared<SkipListFactory>();
+  Options options;
+  options.memtable_factory = factory;
+  MemTable* mem = new MemTable(cmp, options);
+  mem->Ref();
+  std::string state;
+  ColumnFamilyMemTablesDefault cf_mems_default(mem, &options);
+  Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default);
+  int count = 0;
+  Iterator* iter = mem->NewIterator(ReadOptions());
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey ikey;
+    memset((void *)&ikey, 0, sizeof(ikey));
+    ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
+    switch (ikey.type) {
+      case kTypeValue:
+        state.append("Put(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case kTypeMerge:
+        state.append("Merge(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case kTypeDeletion:
+        state.append("Delete(");
+        state.append(ikey.user_key.ToString());
+        state.append(")");
+        count++;
+        break;
+      default:
+        assert(false);
+        break;
+    }
+    state.append("@");
+    state.append(NumberToString(ikey.sequence));
+  }
+  delete iter;
+  if (!s.ok()) {
+    state.append(s.ToString());
+  } else if (count != WriteBatchInternal::Count(b)) {
+    // The batch header claims more entries than reached the memtable.
+    state.append("CountMismatch()");
+  }
+  delete mem->Unref();
+  return state;
+}
+
+class WriteBatchTest { };
+
+// An empty batch prints nothing and reports a zero entry count.
+TEST(WriteBatchTest, Empty) {
+  WriteBatch batch;
+  ASSERT_EQ("", PrintContents(&batch));
+  ASSERT_EQ(0, WriteBatchInternal::Count(&batch));
+  ASSERT_EQ(0, batch.Count());
+}
+
+// Entries receive consecutive sequence numbers starting at the batch's
+// stored sequence number, in insertion order.
+TEST(WriteBatchTest, Multiple) {
+  WriteBatch batch;
+  batch.Put(Slice("foo"), Slice("bar"));
+  batch.Delete(Slice("box"));
+  batch.Put(Slice("baz"), Slice("boo"));
+  WriteBatchInternal::SetSequence(&batch, 100);
+  ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch));
+  ASSERT_EQ(3, WriteBatchInternal::Count(&batch));
+  ASSERT_EQ("Put(baz, boo)@102"
+            "Delete(box)@101"
+            "Put(foo, bar)@100",
+            PrintContents(&batch));
+  ASSERT_EQ(3, batch.Count());
+}
+
+// Truncating the serialized batch yields a Corruption status partway
+// through iteration; entries before the damage are still applied.
+TEST(WriteBatchTest, Corruption) {
+  WriteBatch batch;
+  batch.Put(Slice("foo"), Slice("bar"));
+  batch.Delete(Slice("box"));
+  WriteBatchInternal::SetSequence(&batch, 200);
+  Slice contents = WriteBatchInternal::Contents(&batch);
+  WriteBatchInternal::SetContents(&batch,
+                                  Slice(contents.data(),contents.size()-1));
+  ASSERT_EQ("Put(foo, bar)@200"
+            "Corruption: bad WriteBatch Delete",
+            PrintContents(&batch));
+}
+
+// Append concatenates src's records onto dst; dst keeps its own starting
+// sequence number and src's sequence number is ignored.
+TEST(WriteBatchTest, Append) {
+  WriteBatch b1, b2;
+  WriteBatchInternal::SetSequence(&b1, 200);
+  WriteBatchInternal::SetSequence(&b2, 300);
+  WriteBatchInternal::Append(&b1, &b2);
+  ASSERT_EQ("",
+            PrintContents(&b1));
+  ASSERT_EQ(0, b1.Count());
+  b2.Put("a", "va");
+  WriteBatchInternal::Append(&b1, &b2);
+  ASSERT_EQ("Put(a, va)@200",
+            PrintContents(&b1));
+  ASSERT_EQ(1, b1.Count());
+  b2.Clear();
+  b2.Put("b", "vb");
+  WriteBatchInternal::Append(&b1, &b2);
+  ASSERT_EQ("Put(a, va)@200"
+            "Put(b, vb)@201",
+            PrintContents(&b1));
+  ASSERT_EQ(2, b1.Count());
+  b2.Delete("foo");
+  WriteBatchInternal::Append(&b1, &b2);
+  ASSERT_EQ("Put(a, va)@200"
+            "Put(b, vb)@202"
+            "Put(b, vb)@201"
+            "Delete(foo)@203",
+            PrintContents(&b1));
+  ASSERT_EQ(4, b1.Count());
+}
+
+namespace {
+  // Handler that records every callback into a human-readable string so
+  // tests can assert on the exact record sequence seen during iteration.
+  struct TestHandler : public WriteBatch::Handler {
+    std::string seen;
+    virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+                       const Slice& value) {
+      if (column_family_id == 0) {
+        seen += "Put(" + key.ToString() + ", " + value.ToString() + ")";
+      } else {
+        seen += "PutCF(" + std::to_string(column_family_id) + ", " +
+                key.ToString() + ", " + value.ToString() + ")";
+      }
+      return Status::OK();
+    }
+    virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
+                         const Slice& value) {
+      if (column_family_id == 0) {
+        seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")";
+      } else {
+        seen += "MergeCF(" + std::to_string(column_family_id) + ", " +
+                key.ToString() + ", " + value.ToString() + ")";
+      }
+      return Status::OK();
+    }
+    virtual void LogData(const Slice& blob) {
+      seen += "LogData(" + blob.ToString() + ")";
+    }
+    virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+      if (column_family_id == 0) {
+        seen += "Delete(" + key.ToString() + ")";
+      } else {
+        seen += "DeleteCF(" + std::to_string(column_family_id) + ", " +
+                key.ToString() + ")";
+      }
+      return Status::OK();
+    }
+  };
+}
+
+// PutLogData blobs appear during iteration (in insertion order) but do not
+// count as entries and never reach the memtable.
+TEST(WriteBatchTest, Blob) {
+  WriteBatch batch;
+  batch.Put(Slice("k1"), Slice("v1"));
+  batch.Put(Slice("k2"), Slice("v2"));
+  batch.Put(Slice("k3"), Slice("v3"));
+  batch.PutLogData(Slice("blob1"));
+  batch.Delete(Slice("k2"));
+  batch.PutLogData(Slice("blob2"));
+  batch.Merge(Slice("foo"), Slice("bar"));
+  ASSERT_EQ(5, batch.Count());
+  ASSERT_EQ("Merge(foo, bar)@4"
+            "Put(k1, v1)@0"
+            "Delete(k2)@3"
+            "Put(k2, v2)@1"
+            "Put(k3, v3)@2",
+            PrintContents(&batch));
+
+  TestHandler handler;
+  batch.Iterate(&handler);
+  ASSERT_EQ(
+            "Put(k1, v1)"
+            "Put(k2, v2)"
+            "Put(k3, v3)"
+            "LogData(blob1)"
+            "Delete(k2)"
+            "LogData(blob2)"
+            "Merge(foo, bar)",
+            handler.seen);
+}
+
+// Iteration stops as soon as Handler::Continue() returns false; records
+// after the cutoff are never delivered.
+TEST(WriteBatchTest, Continue) {
+  WriteBatch batch;
+
+  struct Handler : public TestHandler {
+    int num_seen = 0;
+    virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+                       const Slice& value) {
+      ++num_seen;
+      return TestHandler::PutCF(column_family_id, key, value);
+    }
+    virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
+                         const Slice& value) {
+      ++num_seen;
+      return TestHandler::MergeCF(column_family_id, key, value);
+    }
+    virtual void LogData(const Slice& blob) {
+      ++num_seen;
+      TestHandler::LogData(blob);
+    }
+    virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+      ++num_seen;
+      return TestHandler::DeleteCF(column_family_id, key);
+    }
+    // Stop after three records have been seen.
+    virtual bool Continue() override {
+      return num_seen < 3;
+    }
+  } handler;
+
+  batch.Put(Slice("k1"), Slice("v1"));
+  batch.PutLogData(Slice("blob1"));
+  batch.Delete(Slice("k1"));
+  batch.PutLogData(Slice("blob2"));
+  batch.Merge(Slice("foo"), Slice("bar"));
+  batch.Iterate(&handler);
+  ASSERT_EQ(
+            "Put(k1, v1)"
+            "LogData(blob1)"
+            "Delete(k1)",
+            handler.seen);
+}
+
+// SliceParts Puts concatenate the key/value fragments into single records,
+// indistinguishable from single-Slice Puts of the joined strings.
+TEST(WriteBatchTest, PutGatherSlices) {
+  WriteBatch batch;
+  batch.Put(Slice("foo"), Slice("bar"));
+
+  {
+    // Try a write where the key is one slice but the value is two
+    Slice key_slice("baz");
+    Slice value_slices[2] = { Slice("header"), Slice("payload") };
+    batch.Put(SliceParts(&key_slice, 1),
+              SliceParts(value_slices, 2));
+  }
+
+  {
+    // One where the key is composite but the value is a single slice
+    Slice key_slices[3] = { Slice("key"), Slice("part2"), Slice("part3") };
+    Slice value_slice("value");
+    batch.Put(SliceParts(key_slices, 3),
+              SliceParts(&value_slice, 1));
+  }
+
+  WriteBatchInternal::SetSequence(&batch, 100);
+  ASSERT_EQ("Put(baz, headerpayload)@101"
+            "Put(foo, bar)@100"
+            "Put(keypart2part3, value)@102",
+            PrintContents(&batch));
+  ASSERT_EQ(3, batch.Count());
+}
+
+namespace {
+// Minimal handle stub that only reports a fixed column family id; all
+// other ColumnFamilyHandleImpl state is null and must not be touched.
+class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
+ public:
+  explicit ColumnFamilyHandleImplDummy(int id)
+      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {}
+  uint32_t GetID() const override { return id_; }
+
+ private:
+  uint32_t id_;
+};
+}  // namespace anonymous
+
+// Records tagged with non-default column families invoke the *CF handler
+// callbacks with the correct id; id 0 uses the plain Put/Delete/Merge form.
+TEST(WriteBatchTest, ColumnFamiliesBatchTest) {
+  WriteBatch batch;
+  ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
+  batch.Put(&zero, Slice("foo"), Slice("bar"));
+  batch.Put(&two, Slice("twofoo"), Slice("bar2"));
+  batch.Put(&eight, Slice("eightfoo"), Slice("bar8"));
+  batch.Delete(&eight, Slice("eightfoo"));
+  batch.Merge(&three, Slice("threethree"), Slice("3three"));
+  batch.Put(&zero, Slice("foo"), Slice("bar"));
+  batch.Merge(Slice("omom"), Slice("nom"));
+
+  TestHandler handler;
+  batch.Iterate(&handler);
+  ASSERT_EQ(
+      "Put(foo, bar)"
+      "PutCF(2, twofoo, bar2)"
+      "PutCF(8, eightfoo, bar8)"
+      "DeleteCF(8, eightfoo)"
+      "MergeCF(3, threethree, 3three)"
+      "Put(foo, bar)"
+      "Merge(omom, nom)",
+      handler.seen);
+}
+
+}  // namespace rocksdb
+
+// Test entry point: runs every TEST registered with the harness.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/doc/doc.css b/doc/doc.css
new file mode 100644 (file)
index 0000000..700c564
--- /dev/null
@@ -0,0 +1,89 @@
+/* Stylesheet for the RocksDB HTML documentation (doc/index.html). */
+body {
+  margin-left: 0.5in;
+  margin-right: 0.5in;
+  background: white;
+  color: black;
+}
+
+h1 {
+  margin-left: -0.2in;
+  font-size: 14pt;
+}
+h2 {
+  margin-left: -0in;
+  font-size: 12pt;
+}
+h3 {
+  margin-left: -0in;
+}
+h4 {
+  margin-left: -0in;
+}
+hr {
+  margin-left: -0in;
+}
+
+/* Definition lists: definition term bold */
+dt {
+  font-weight: bold;
+}
+
+address {
+  text-align: center;
+}
+code,samp,var {
+  color: blue;
+}
+kbd {
+  color: #600000;
+}
+/* Floating side-note boxes */
+div.note p {
+  float: right;
+  width: 3in;
+  margin-right: 0%;
+  padding: 1px;
+  border: 2px solid #6060a0;
+  background-color: #fffff0;
+}
+
+ul {
+  margin-top: -0em;
+  margin-bottom: -0em;
+}
+
+ol {
+  margin-top: -0em;
+  margin-bottom: -0em;
+}
+
+/* Bullet-less lists used for navigation/indexes */
+UL.nobullets {
+  list-style-type: none;
+  list-style-image: none;
+  margin-left: -1em;
+}
+
+p {
+  margin: 1em 0 1em 0;
+  padding: 0 0 0 0;
+}
+
+pre {
+  line-height: 1.3em;
+  padding: 0.4em 0 0.8em 0;
+  margin:  0 0 0 0;
+  border:  0 0 0 0;
+  color: blue;
+}
+
+/* Centered benchmark/data tables */
+.datatable {
+  margin-left: auto;
+  margin-right: auto;
+  margin-top: 2em;
+  margin-bottom: 2em;
+  border: 1px solid;
+}
+
+.datatable td,th {
+  padding: 0 0.5em 0 0.5em;
+  text-align: right;
+}
diff --git a/doc/index.html b/doc/index.html
new file mode 100644 (file)
index 0000000..71f515e
--- /dev/null
@@ -0,0 +1,831 @@
+<!DOCTYPE html>
+<html>
+<head>
+<link rel="stylesheet" type="text/css" href="doc.css" />
+<title>RocksDB</title>
+</head>
+
+<body>
+<h1>RocksDB</h1>
+<address>The Facebook Database Engineering Team</address>
+<address>Built on earlier work on leveldb by Sanjay Ghemawat
+               (sanjay@google.com) and Jeff Dean (jeff@google.com)</address>
+<p>
+The <code>rocksdb</code> library provides a persistent key value store.  Keys and
+values are arbitrary byte arrays.  The keys are ordered within the key
+value store according to a user-specified comparator function.
+
+<p>
+<h1>Opening A Database</h1>
+<p>
+A <code>rocksdb</code> database has a name which corresponds to a file system
+directory.  All of the contents of database are stored in this
+directory.  The following example shows how to open a database,
+creating it if necessary:
+<p>
+<pre>
+  #include &lt;assert&gt;
+  #include "rocksdb/db.h"
+
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.create_if_missing = true;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &amp;db);
+  assert(status.ok());
+  ...
+</pre>
+If you want to raise an error if the database already exists, add
+the following line before the <code>rocksdb::DB::Open</code> call:
+<pre>
+  options.error_if_exists = true;
+</pre>
+<h1>Status</h1>
+<p>
+You may have noticed the <code>rocksdb::Status</code> type above.  Values of this
+type are returned by most functions in <code>rocksdb</code> that may encounter an
+error.  You can check if such a result is ok, and also print an
+associated error message:
+<p>
+<pre>
+   rocksdb::Status s = ...;
+   if (!s.ok()) cerr &lt;&lt; s.ToString() &lt;&lt; endl;
+</pre>
+<h1>Closing A Database</h1>
+<p>
+When you are done with a database, just delete the database object.
+Example:
+<p>
+<pre>
+  ... open the db as described above ...
+  ... do something with db ...
+  delete db;
+</pre>
+<h1>Reads And Writes</h1>
+<p>
+The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to
+modify/query the database.  For example, the following code
+moves the value stored under key1 to key2.
+<pre>
+  std::string value;
+  rocksdb::Status s = db-&gt;Get(rocksdb::ReadOptions(), key1, &amp;value);
+  if (s.ok()) s = db-&gt;Put(rocksdb::WriteOptions(), key2, value);
+  if (s.ok()) s = db-&gt;Delete(rocksdb::WriteOptions(), key1);
+</pre>
+
+<h1>Atomic Updates</h1>
+<p>
+Note that if the process dies after the Put of key2 but before the
+delete of key1, the same value may be left stored under multiple keys.
+Such problems can be avoided by using the <code>WriteBatch</code> class to
+atomically apply a set of updates:
+<p>
+<pre>
+  #include "rocksdb/write_batch.h"
+  ...
+  std::string value;
+  rocksdb::Status s = db-&gt;Get(rocksdb::ReadOptions(), key1, &amp;value);
+  if (s.ok()) {
+    rocksdb::WriteBatch batch;
+    batch.Delete(key1);
+    batch.Put(key2, value);
+    s = db-&gt;Write(rocksdb::WriteOptions(), &amp;batch);
+  }
+</pre>
+The <code>WriteBatch</code> holds a sequence of edits to be made to the database,
+and these edits within the batch are applied in order.  Note that we
+called <code>Delete</code> before <code>Put</code> so that if <code>key1</code> is identical to <code>key2</code>,
+we do not end up erroneously dropping the value entirely.
+<p>
+Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to
+speed up bulk updates by placing lots of individual mutations into the
+same batch.
+
+<h1>Synchronous Writes</h1>
+By default, each write to <code>rocksdb</code> is asynchronous: it
+returns after pushing the write from the process into the operating
+system.  The transfer from operating system memory to the underlying
+persistent storage happens asynchronously.  The <code>sync</code> flag
+can be turned on for a particular write to make the write operation
+not return until the data being written has been pushed all the way to
+persistent storage.  (On Posix systems, this is implemented by calling
+either <code>fsync(...)</code> or <code>fdatasync(...)</code> or
+<code>msync(..., MS_SYNC)</code> before the write operation returns.)
+<pre>
+  rocksdb::WriteOptions write_options;
+  write_options.sync = true;
+  db-&gt;Put(write_options, ...);
+</pre>
+Asynchronous writes are often more than a thousand times as fast as
+synchronous writes.  The downside of asynchronous writes is that a
+crash of the machine may cause the last few updates to be lost.  Note
+that a crash of just the writing process (i.e., not a reboot) will not
+cause any loss since even when <code>sync</code> is false, an update
+is pushed from the process memory into the operating system before it
+is considered done.
+
+<p>
+Asynchronous writes can often be used safely.  For example, when
+loading a large amount of data into the database you can handle lost
+updates by restarting the bulk load after a crash.  A hybrid scheme is
+also possible where every Nth write is synchronous, and in the event
+of a crash, the bulk load is restarted just after the last synchronous
+write finished by the previous run.  (The synchronous write can update
+a marker that describes where to restart on a crash.)
+
+<p>
+<code>WriteBatch</code> provides an alternative to asynchronous writes.
+Multiple updates may be placed in the same <code>WriteBatch</code> and
+applied together using a synchronous write (i.e.,
+<code>write_options.sync</code> is set to true).  The extra cost of
+the synchronous write will be amortized across all of the writes in
+the batch.
+
+<p>
+We also provide a way to completely disable Write Ahead Log for a
+particular write. If you set write_option.disableWAL to true, the
+write will not go to the log at all and may be lost in an event of
+process crash.
+
+<p>
+When opening a DB, you can disable syncing of data files by setting
+Options::disableDataSync to true. This can be useful when doing
+bulk-loading or big idempotent operations. Once the operation is
+finished, you can manually call sync() to flush all dirty buffers
+to stable storage.
+
+<p>
+RocksDB by default uses faster fdatasync() to sync files. If you want
+to use fsync(), you can set Options::use_fsync to true. You should set
+this to true on filesystems like ext3 that can lose files after a
+reboot.
+
+<p>
+<h1>Concurrency</h1>
+<p>
+A database may only be opened by one process at a time.
+The <code>rocksdb</code> implementation acquires a lock from the
+operating system to prevent misuse.  Within a single process, the
+same <code>rocksdb::DB</code> object may be safely shared by multiple
+concurrent threads.  I.e., different threads may write into or fetch
+iterators or call <code>Get</code> on the same database without any
+external synchronization (the leveldb implementation will
+automatically do the required synchronization).  However other objects
+(like Iterator and WriteBatch) may require external synchronization.
+If two threads share such an object, they must protect access to it
+using their own locking protocol.  More details are available in
+the public header files.
+
+<p>
+<h1>Merge operators</h1>
+<p>
+Merge operators provide efficient support for read-modify-write operation.
+More on the interface and implementation can be found on:
+<p>
+<a href="https://github.com/facebook/rocksdb/wiki/Merge-Operator">
+    Merge Operator</a>
+<p>
+<a href="https://github.com/facebook/rocksdb/wiki/Merge-Operator-Implementation">
+    Merge Operator Implementation</a>
+
+<p>
+<h1>Iteration</h1>
+<p>
+The following example demonstrates how to print all key,value pairs
+in a database.
+<p>
+<pre>
+  rocksdb::Iterator* it = db-&gt;NewIterator(rocksdb::ReadOptions());
+  for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
+    cout &lt;&lt; it-&gt;key().ToString() &lt;&lt; ": "  &lt;&lt; it-&gt;value().ToString() &lt;&lt; endl;
+  }
+  assert(it-&gt;status().ok());  // Check for any errors found during the scan
+  delete it;
+</pre>
+The following variation shows how to process just the keys in the
+range <code>[start,limit)</code>:
+<p>
+<pre>
+  for (it-&gt;Seek(start);
+       it-&gt;Valid() &amp;&amp; it-&gt;key().ToString() &lt; limit;
+       it-&gt;Next()) {
+    ...
+  }
+</pre>
+You can also process entries in reverse order.  (Caveat: reverse
+iteration may be somewhat slower than forward iteration.)
+<p>
+<pre>
+  for (it-&gt;SeekToLast(); it-&gt;Valid(); it-&gt;Prev()) {
+    ...
+  }
+</pre>
+<h1>Snapshots</h1>
+<p>
+Snapshots provide consistent read-only views over the entire state of
+the key-value store.  <code>ReadOptions::snapshot</code> may be non-NULL to indicate
+that a read should operate on a particular version of the DB state.
+If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an
+implicit snapshot of the current state.
+<p>
+Snapshots are created by the DB::GetSnapshot() method:
+<p>
+<pre>
+  rocksdb::ReadOptions options;
+  options.snapshot = db-&gt;GetSnapshot();
+  ... apply some updates to db ...
+  rocksdb::Iterator* iter = db-&gt;NewIterator(options);
+  ... read using iter to view the state when the snapshot was created ...
+  delete iter;
+  db-&gt;ReleaseSnapshot(options.snapshot);
+</pre>
+Note that when a snapshot is no longer needed, it should be released
+using the DB::ReleaseSnapshot interface.  This allows the
+implementation to get rid of state that was being maintained just to
+support reading as of that snapshot.
+<h1>Slice</h1>
+<p>
+The return values of the <code>it->key()</code> and <code>it->value()</code> calls above
+are instances of the <code>rocksdb::Slice</code> type.  <code>Slice</code> is a simple
+structure that contains a length and a pointer to an external byte
+array.  Returning a <code>Slice</code> is a cheaper alternative to returning a
+<code>std::string</code> since we do not need to copy potentially large keys and
+values.  In addition, <code>rocksdb</code> methods do not return null-terminated
+C-style strings since <code>rocksdb</code> keys and values are allowed to
+contain '\0' bytes.
+<p>
+C++ strings and null-terminated C-style strings can be easily converted
+to a Slice:
+<p>
+<pre>
+   rocksdb::Slice s1 = "hello";
+
+   std::string str("world");
+   rocksdb::Slice s2 = str;
+</pre>
+A Slice can be easily converted back to a C++ string:
+<pre>
+   std::string str = s1.ToString();
+   assert(str == std::string("hello"));
+</pre>
+Be careful when using Slices since it is up to the caller to ensure that
+the external byte array into which the Slice points remains live while
+the Slice is in use.  For example, the following is buggy:
+<p>
+<pre>
+   rocksdb::Slice slice;
+   if (...) {
+     std::string str = ...;
+     slice = str;
+   }
+   Use(slice);
+</pre>
+When the <code>if</code> statement goes out of scope, <code>str</code> will be destroyed and the
+backing storage for <code>slice</code> will disappear.
+<p>
+<h1>Comparators</h1>
+<p>
+The preceding examples used the default ordering function for key,
+which orders bytes lexicographically.  You can however supply a custom
+comparator when opening a database.  For example, suppose each
+database key consists of two numbers and we should sort by the first
+number, breaking ties by the second number.  First, define a proper
+subclass of <code>rocksdb::Comparator</code> that expresses these rules:
+<p>
+<pre>
+  class TwoPartComparator : public rocksdb::Comparator {
+   public:
+    // Three-way comparison function:
+    //   if a &lt; b: negative result
+    //   if a &gt; b: positive result
+    //   else: zero result
+    int Compare(const rocksdb::Slice&amp; a, const rocksdb::Slice&amp; b) const {
+      int a1, a2, b1, b2;
+      ParseKey(a, &amp;a1, &amp;a2);
+      ParseKey(b, &amp;b1, &amp;b2);
+      if (a1 &lt; b1) return -1;
+      if (a1 &gt; b1) return +1;
+      if (a2 &lt; b2) return -1;
+      if (a2 &gt; b2) return +1;
+      return 0;
+    }
+
+    // Ignore the following methods for now:
+    const char* Name() const { return "TwoPartComparator"; }
+    void FindShortestSeparator(std::string*, const rocksdb::Slice&amp;) const { }
+    void FindShortSuccessor(std::string*) const { }
+  };
+</pre>
+Now create a database using this custom comparator:
+<p>
+<pre>
+  TwoPartComparator cmp;
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.create_if_missing = true;
+  options.comparator = &amp;cmp;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &amp;db);
+  ...
+</pre>
+<h2>Backwards compatibility</h2>
+<p>
+The result of the comparator's <code>Name</code> method is attached to the
+database when it is created, and is checked on every subsequent
+database open.  If the name changes, the <code>rocksdb::DB::Open</code> call will
+fail.  Therefore, change the name if and only if the new key format
+and comparison function are incompatible with existing databases, and
+it is ok to discard the contents of all existing databases.
+<p>
+You can however still gradually evolve your key format over time with
+a little bit of pre-planning.  For example, you could store a version
+number at the end of each key (one byte should suffice for most uses).
+When you wish to switch to a new key format (e.g., adding an optional
+third part to the keys processed by <code>TwoPartComparator</code>),
+(a) keep the same comparator name (b) increment the version number
+for new keys (c) change the comparator function so it uses the
+version numbers found in the keys to decide how to interpret them.
+
+
+<p>
+<h1>MemTable and Table factories</h1>
+<p>
+By default, we keep the data in memory in skiplist memtable and the data
+on disk in a table format described here:
+<a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Table-Format">
+    RocksDB Table Format</a>.
+<p>
+Since one of the goals of RocksDB is to have
+different parts of the system easily pluggable, we support different
+implementations of both memtable and table format. You can supply
+your own memtable factory by setting <code>Options::memtable_factory</code>
+and your own table factory by setting <code>Options::table_factory</code>.
+For available memtable factories, please refer to
+<code>rocksdb/memtablerep.h</code> and for table factories to
+<code>rocksdb/table.h</code>. These features are both in active development,
+so please be wary of any API changes that might break your application
+going forward.
+<p>
+You can also read more about memtables here:
+<a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide#memtables">
+Memtables wiki
+</a>
+
+<p>
+<h1>Performance</h1>
+<p>
+Performance can be tuned by changing the default values of the
+types defined in <code>include/rocksdb/options.h</code>.
+
+<p>
+<h2>Block size</h2>
+<p>
+<code>rocksdb</code> groups adjacent keys together into the same block and such a
+block is the unit of transfer to and from persistent storage.  The
+default block size is approximately 4096 uncompressed bytes.
+Applications that mostly do bulk scans over the contents of the
+database may wish to increase this size.  Applications that do a lot
+of point reads of small values may wish to switch to a smaller block
+size if performance measurements indicate an improvement.  There isn't
+much benefit in using blocks smaller than one kilobyte, or larger than
+a few megabytes.  Also note that compression will be more effective
+with larger block sizes. To change block size parameter, use
+<code>Options::block_size</code>.
+<p>
+<h2>Write buffer</h2>
+<p>
+<code>Options::write_buffer_size</code> specifies the amount of data
+to build up in memory before converting to a sorted on-disk file.
+Larger values increase performance, especially during bulk loads.
+Up to max_write_buffer_number write buffers may be held in memory
+at the same time,
+so you may wish to adjust this parameter to control memory usage.
+Also, a larger write buffer will result in a longer recovery time
+the next time the database is opened.
+Related option is
+<code>Options::max_write_buffer_number</code>, which is maximum number
+of write buffers that are built up in memory. The default is 2, so that
+when 1 write buffer is being flushed to storage, new writes can continue
+to the other write buffer.
+<code>Options::min_write_buffer_number_to_merge</code> is the minimum number
+of write buffers that will be merged together before writing to storage.
+If set to 1, then all write buffers are flushed to L0 as individual files and
+this increases read amplification because a get request has to check in all
+of these files. Also, an in-memory merge may result in writing less
+data to storage if there are duplicate records in each of these
+individual write buffers.  Default: 1
+<p>
+<h2>Compression</h2>
+<p>
+Each block is individually compressed before being written to
+persistent storage.  Compression is on by default since the default
+compression method is very fast, and is automatically disabled for
+uncompressible data.  In rare cases, applications may want to disable
+compression entirely, but should only do so if benchmarks show a
+performance improvement:
+<p>
+<pre>
+  rocksdb::Options options;
+  options.compression = rocksdb::kNoCompression;
+  ... rocksdb::DB::Open(options, name, ...) ....
+</pre>
+<h2>Cache</h2>
+<p>
+The contents of the database are stored in a set of files in the
+filesystem and each file stores a sequence of compressed blocks.  If
+<code>options.block_cache</code> is non-NULL, it is used to cache frequently
+used uncompressed block contents. If <code>options.block_cache_compressed</code>
+is non-NULL, it is used to cache frequently used compressed blocks. Compressed
+cache is an alternative to OS cache, which also caches compressed blocks. If
+compressed cache is used, the OS cache will be disabled automatically by setting
+<code>options.allow_os_buffer</code> to false.
+<p>
+<pre>
+  #include "rocksdb/cache.h"
+
+  rocksdb::Options options;
+  options.block_cache = rocksdb::NewLRUCache(100 * 1048576);  // 100MB uncompressed cache
+  options.block_cache_compressed = rocksdb::NewLRUCache(100 * 1048576);  // 100MB compressed cache
+  rocksdb::DB* db;
+  rocksdb::DB::Open(options, name, &amp;db);
+  ... use the db ...
+  delete db;
+  delete options.block_cache;
+  delete options.block_cache_compressed;
+</pre>
+<p>
+When performing a bulk read, the application may wish to disable
+caching so that the data processed by the bulk read does not end up
+displacing most of the cached contents.  A per-iterator option can be
+used to achieve this:
+<p>
+<pre>
+  rocksdb::ReadOptions options;
+  options.fill_cache = false;
+  rocksdb::Iterator* it = db-&gt;NewIterator(options);
+  for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
+    ...
+  }
+</pre>
+<p>
+You can also disable block cache by setting <code>options.no_block_cache</code>
+to true.
+<h2>Key Layout</h2>
+<p>
+Note that the unit of disk transfer and caching is a block.  Adjacent
+keys (according to the database sort order) will usually be placed in
+the same block.  Therefore the application can improve its performance
+by placing keys that are accessed together near each other and placing
+infrequently used keys in a separate region of the key space.
+<p>
+For example, suppose we are implementing a simple file system on top
+of <code>rocksdb</code>.  The types of entries we might wish to store are:
+<p>
+<pre>
+   filename -&gt; permission-bits, length, list of file_block_ids
+   file_block_id -&gt; data
+</pre>
+We might want to prefix <code>filename</code> keys with one letter (say '/') and the
+<code>file_block_id</code> keys with a different letter (say '0') so that scans
+over just the metadata do not force us to fetch and cache bulky file
+contents.
+<p>
+<h2>Filters</h2>
+<p>
+Because of the way <code>rocksdb</code> data is organized on disk,
+a single <code>Get()</code> call may involve multiple reads from disk.
+The optional <code>FilterPolicy</code> mechanism can be used to reduce
+the number of disk reads substantially.
+<pre>
+   rocksdb::Options options;
+   options.filter_policy = NewBloomFilter(10);
+   rocksdb::DB* db;
+   rocksdb::DB::Open(options, "/tmp/testdb", &amp;db);
+   ... use the database ...
+   delete db;
+   delete options.filter_policy;
+</pre>
+The preceding code associates a
+<a href="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</a>
+based filtering policy with the database.  Bloom filter based
+filtering relies on keeping some number of bits of data in memory per
+key (in this case 10 bits per key since that is the argument we passed
+to NewBloomFilter).  This filter will reduce the number of unnecessary
+disk reads needed for <code>Get()</code> calls by a factor of
+approximately 100.  Increasing the bits per key will lead to a
+larger reduction at the cost of more memory usage.  We recommend that
+applications whose working set does not fit in memory and that do a
+lot of random reads set a filter policy.
+<p>
+If you are using a custom comparator, you should ensure that the filter
+policy you are using is compatible with your comparator.  For example,
+consider a comparator that ignores trailing spaces when comparing keys.
+<code>NewBloomFilter</code> must not be used with such a comparator.
+Instead, the application should provide a custom filter policy that
+also ignores trailing spaces.  For example:
+<pre>
+  class CustomFilterPolicy : public rocksdb::FilterPolicy {
+   private:
+    FilterPolicy* builtin_policy_;
+   public:
+    CustomFilterPolicy() : builtin_policy_(NewBloomFilter(10)) { }
+    ~CustomFilterPolicy() { delete builtin_policy_; }
+
+    const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
+
+    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      std::vector&lt;Slice&gt; trimmed(n);
+      for (int i = 0; i &lt; n; i++) {
+        trimmed[i] = RemoveTrailingSpaces(keys[i]);
+      }
+      return builtin_policy_-&gt;CreateFilter(&amp;trimmed[0], n, dst);
+    }
+
+    bool KeyMayMatch(const Slice&amp; key, const Slice&amp; filter) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      return builtin_policy_-&gt;KeyMayMatch(RemoveTrailingSpaces(key), filter);
+    }
+  };
+</pre>
+<p>
+Advanced applications may provide a filter policy that does not use
+a bloom filter but uses some other mechanism for summarizing a set
+of keys.  See <code>rocksdb/filter_policy.h</code> for detail.
+<p>
+<h1>Checksums</h1>
+<p>
+<code>rocksdb</code> associates checksums with all data it stores in the file system.
+There are two separate controls provided over how aggressively these
+checksums are verified:
+<p>
+<ul>
+<li> <code>ReadOptions::verify_checksums</code> may be set to true to force
+  checksum verification of all data that is read from the file system on
+  behalf of a particular read.  By default, no such verification is
+  done.
+<p>
+<li> <code>Options::paranoid_checks</code> may be set to true before opening a
+  database to make the database implementation raise an error as soon as
+  it detects an internal corruption.  Depending on which portion of the
+  database has been corrupted, the error may be raised when the database
+  is opened, or later by another database operation.  By default,
+  paranoid checking is off so that the database can be used even if
+  parts of its persistent storage have been corrupted.
+<p>
+  If a database is corrupted (perhaps it cannot be opened when
+  paranoid checking is turned on), the <code>rocksdb::RepairDB</code> function
+  may be used to recover as much of the data as possible.
+<p>
+</ul>
+
+<p>
+<h1>Compaction</h1>
+<p>
+You can read more on Compactions here:
+<a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide#multi-threaded-compactions">
+    Multi-threaded compactions
+</a>
+<p>
+Here we give overview of the options that impact behavior of Compactions:
+<ul>
+<p>
+<li><code>Options::compaction_style</code> - RocksDB currently supports two
+compaction algorithms - Universal  style and Level style. This option switches
+between the two.  Can be kCompactionStyleUniversal or kCompactionStyleLevel.
+If this is kCompactionStyleUniversal, then you can configure universal style
+parameters with <code>Options::compaction_options_universal</code>.
+<p>
+<li><code>Options::disable_auto_compactions</code> - Disable automatic compactions.
+Manual compactions can still be issued on this database.
+<p>
+<li><code>Options::compaction_filter</code> - Allows an application to modify/delete
+a key-value during background compaction. The client must provide
+compaction_filter_factory if it requires a new compaction filter to be used
+for different compaction processes. Client should specify only one of filter
+or factory.
+<p>
+<li><code>Options::compaction_filter_factory</code> - a factory that provides
+compaction filter objects which allow an application to modify/delete a
+key-value during background compaction.
+</ul>
+<p>
+Other options impacting performance of compactions and when they get triggered
+are:
+<ul>
+<p>
+<li> <code>Options::access_hint_on_compaction_start</code> - Specify the file access
+pattern once a compaction is started. It will be applied to all input files of a compaction. Default: NORMAL
+<p>
+<li> <code>Options::level0_file_num_compaction_trigger</code> -  Number of files to trigger level-0 compaction.
+A negative value means that level-0 compaction will not be triggered by number of files at all.
+<p>
+<li> <code>Options::max_mem_compaction_level</code> -  Maximum level to which a new compacted memtable is pushed if it
+does not create overlap.  We try to push to level 2 to avoid the relatively expensive level 0=>1 compactions and to avoid some
+expensive manifest file operations.  We do not push all the way to the largest level since that can generate a lot of wasted disk
+space if the same key space is being repeatedly overwritten.
+<p>
+<li> <code>Options::target_file_size_base</code> and <code>Options::target_file_size_multiplier</code> -
+Target file size for compaction.  target_file_size_base is per-file size for level-1.
+Target file size for level L can be calculated by target_file_size_base * (target_file_size_multiplier ^ (L-1))
+For example, if target_file_size_base is 2MB and target_file_size_multiplier is 10, then each file on level-1 will
+be 2MB, and each file on level 2 will be 20MB, and each file on level-3 will be 200MB. Default target_file_size_base is 2MB
+and default target_file_size_multiplier is 1.
+<p>
+<li> <code>Options::expanded_compaction_factor</code> -  Maximum number of bytes in all compacted files.  We avoid expanding
+the lower level file set of a compaction if it would make the total compaction cover more than
+(expanded_compaction_factor * targetFileSizeLevel()) many bytes.
+<p>
+<li> <code>Options::source_compaction_factor</code> -    Maximum number of bytes in all source files to be compacted in a
+single compaction run. We avoid picking too many files in the source level so that we do not exceed the total source bytes
+for compaction to exceed (source_compaction_factor * targetFileSizeLevel()) many bytes.
+Default:1, i.e. pick maxfilesize amount of data as the source of a compaction.
+<p>
+<li> <code>Options::max_grandparent_overlap_factor</code> -   Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
+stop building a single file in a level->level+1 compaction.
+<p>
+<li> <code>Options::disable_seek_compaction</code> -  Disable compaction triggered by seek.
+With bloomfilter and fast storage, a miss on one level is very cheap if the file handle is cached in table cache
+(which is true if max_open_files is large).
+<p>
+<li> <code>Options::max_background_compactions</code> - Maximum number of concurrent background jobs, submitted to
+the default LOW priority thread pool
+</ul>
+
+<p>
+You can learn more about all of those options in <code>rocksdb/options.h</code>
+
+<h2> Universal style compaction specific settings</h2>
+<p>
+If you're using Universal style compaction, there is an object <code>CompactionOptionsUniversal</code>
+that holds all the different options for that compaction. The exact definition is in
+<code>rocksdb/universal_compaction.h</code> and you can set it in <code>Options::compaction_options_universal</code>.
+Here we give short overview of options in <code>CompactionOptionsUniversal</code>:
+<ul>
+<p>
+<li> <code>CompactionOptionsUniversal::size_ratio</code> - Percentage flexibility while comparing file size. If the candidate file(s)
+   size is 1% smaller than the next file's size, then include next file into
+   this candidate set.  Default: 1
+<p>
+<li> <code>CompactionOptionsUniversal::min_merge_width</code> - The minimum number of files in a single compaction run. Default: 2
+<p>
+<li> <code>CompactionOptionsUniversal::max_merge_width</code> - The maximum number of files in a single compaction run. Default: UINT_MAX
+<p>
+<li> <code>CompactionOptionsUniversal::max_size_amplification_percent</code> - The size amplification is defined as the amount (in percentage) of
+additional storage needed to store a single byte of data in the database.  For example, a size amplification of 2% means that a database that
+contains 100 bytes of user-data may occupy up to 102 bytes of physical storage. By this definition, a fully compacted database has
+a size amplification of 0%. Rocksdb uses the following heuristic to calculate size amplification: it assumes that all files excluding
+the earliest file contribute to the size amplification.  Default: 200, which means that a 100 byte database could require up to
+300 bytes of storage.
+<p>
+<li> <code>CompactionOptionsUniversal::compression_size_percent</code> - If this option is set to be -1 (the default value), all the output files
+will follow compression type specified.  If this option is not negative, we will try to make sure compressed
+size is just above this value. In normal cases, at least this percentage
+of data will be compressed.
+When we are compacting to a new file, here is the criteria whether
+it needs to be compressed: assuming here are the list of files sorted
+by generation time: [ A1...An B1...Bm C1...Ct ],
+where A1 is the newest and Ct is the oldest, and we are going to compact
+B1...Bm, we calculate the total size of all the files as total_size, as
+well as  the total size of C1...Ct as total_C, the compaction output file
+will be compressed iff total_C / total_size &lt; this percentage
+<p>
+<li> <code>CompactionOptionsUniversal::stop_style</code> - The algorithm used to stop picking files into a single compaction run.
+Can be kCompactionStopStyleSimilarSize (pick files of similar size) or kCompactionStopStyleTotalSize (total size of picked files > next file).
+Default: kCompactionStopStyleTotalSize
+</ul>
+
+<h1>Thread pools</h1>
+<p>
+A thread pool is associated with Env environment object. The client has to create a thread pool by setting the number of background
+threads using method <code>Env::SetBackgroundThreads()</code> defined in <code>rocksdb/env.h</code>.
+We use the thread pool for compactions and memtable flushes.
+Since memtable flushes are in critical code path (stalling memtable flush can stall writes, increasing p99), we suggest
+having two thread pools - with priorities HIGH and LOW. Memtable flushes can be set up to be scheduled on HIGH thread pool.
+There are two options available for configuration of background compactions and flushes:
+<ul>
+<p>
+<li> <code>Options::max_background_compactions</code> - Maximum number of concurrent background jobs,
+submitted to the default LOW priority thread pool
+<p>
+<li> <code>Options::max_background_flushes</code> - Maximum number of concurrent background memtable flush jobs, submitted to
+the HIGH priority thread pool.  By default, all background jobs (major compaction and memtable flush) go
+to the LOW priority pool. If this option is set to a positive number, memtable flush jobs will be submitted to the HIGH priority pool.
+It is important when the same Env is shared by multiple db instances.  Without a separate pool, long running major compaction jobs could
+potentially block memtable flush jobs of other db instances, leading to unnecessary Put stalls.
+</ul>
+<p>
+<pre>
+  #include "rocksdb/env.h"
+  #include "rocksdb/db.h"
+
+  auto env = rocksdb::Env::Default();
+  env->SetBackgroundThreads(2, rocksdb::Env::LOW);
+  env->SetBackgroundThreads(1, rocksdb::Env::HIGH);
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.env = env;
+  options.max_background_compactions = 2;
+  options.max_background_flushes = 1;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &amp;db);
+  assert(status.ok());
+  ...
+</pre>
+<h1>Approximate Sizes</h1>
+<p>
+The <code>GetApproximateSizes</code> method can be used to get the approximate
+number of bytes of file system space used by one or more key ranges.
+<p>
+<pre>
+   rocksdb::Range ranges[2];
+   ranges[0] = rocksdb::Range("a", "c");
+   ranges[1] = rocksdb::Range("x", "z");
+   uint64_t sizes[2];
+   rocksdb::Status s = db-&gt;GetApproximateSizes(ranges, 2, sizes);
+</pre>
+The preceding call will set <code>sizes[0]</code> to the approximate number of
+bytes of file system space used by the key range <code>[a..c)</code> and
+<code>sizes[1]</code> to the approximate number of bytes used by the key range
+<code>[x..z)</code>.
+<p>
+<h1>Environment</h1>
+<p>
+All file operations (and other operating system calls) issued by the
+<code>rocksdb</code> implementation are routed through a <code>rocksdb::Env</code> object.
+Sophisticated clients may wish to provide their own <code>Env</code>
+implementation to get better control.  For example, an application may
+introduce artificial delays in the file IO paths to limit the impact
+of <code>rocksdb</code> on other activities in the system.
+<p>
+<pre>
+  class SlowEnv : public rocksdb::Env {
+    .. implementation of the Env interface ...
+  };
+
+  SlowEnv env;
+  rocksdb::Options options;
+  options.env = &amp;env;
+  Status s = rocksdb::DB::Open(options, ...);
+</pre>
+<h1>Porting</h1>
+<p>
+<code>rocksdb</code> may be ported to a new platform by providing platform
+specific implementations of the types/methods/functions exported by
+<code>rocksdb/port/port.h</code>.  See <code>rocksdb/port/port_example.h</code> for more
+details.
+<p>
+In addition, the new platform may need a new default <code>rocksdb::Env</code>
+implementation.  See <code>rocksdb/util/env_posix.h</code> for an example.
+
+<h1>Statistics</h1>
+<p>
+To be able to efficiently tune your application, it is always helpful if you
+have access to usage statistics. You can collect those statistics by setting
+<code>Options::table_properties_collectors</code> or
+<code>Options::statistics</code>. For more information, refer to
+<code>rocksdb/table_properties.h</code> and <code>rocksdb/statistics.h</code>.
+These should not add significant overhead to your application and we
+recommend exporting them to other monitoring tools.
+
+<h1>Purging WAL files</h1>
+<p>
+By default, old write-ahead logs are deleted automatically when they fall out
+of scope and the application doesn't need them anymore. There are options that
+enable the user to archive the logs and then delete them lazily, either in
+TTL fashion or based on size limit.
+
+The options are <code>Options::WAL_ttl_seconds</code> and
+<code>Options::WAL_size_limit_MB</code>. Here is how they can be used:
+<ul>
+<li>
+<p>
+If both set to 0, logs will be deleted asap and will never get into the archive.
+<li>
+<p>
+If <code>WAL_ttl_seconds</code> is 0 and WAL_size_limit_MB is not 0, WAL
+files will be checked every 10 min and if total size is greater than
+<code>WAL_size_limit_MB</code>, they will be deleted starting with the
+earliest until size_limit is met. All empty files will be deleted.
+<li>
+<p>
+If <code>WAL_ttl_seconds</code> is not 0 and WAL_size_limit_MB is 0, then
+WAL files will be checked every <code>WAL_ttl_seconds / 2</code> and those
+that are older than WAL_ttl_seconds will be deleted.
+<li>
+<p>
+If both are not 0, WAL files will be checked every 10 min and both
+checks will be performed with ttl being first.
+</ul>
+
+<h1>Other Information</h1>
+<p>
+Details about the <code>rocksdb</code> implementation may be found in
+the following documents:
+<ul>
+<li> <a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide">
+  RocksDB Architecture Guide</a>
+<li> <a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Table-Format">
+    Format of an immutable Table file</a>
+<li> <a href="log_format.txt">Format of a log file</a>
+</ul>
+
+</body>
+</html>
diff --git a/doc/log_format.txt b/doc/log_format.txt
new file mode 100644 (file)
index 0000000..3a0414b
--- /dev/null
@@ -0,0 +1,75 @@
+The log file contents are a sequence of 32KB blocks.  The only
+exception is that the tail of the file may contain a partial block.
+
+Each block consists of a sequence of records:
+   block := record* trailer?
+   record :=
+       checksum: uint32        // crc32c of type and data[]
+       length: uint16
+       type: uint8             // One of FULL, FIRST, MIDDLE, LAST
+       data: uint8[length]
+
+A record never starts within the last six bytes of a block (since it
+won't fit).  Any leftover bytes here form the trailer, which must
+consist entirely of zero bytes and must be skipped by readers.  
+
+Aside: if exactly seven bytes are left in the current block, and a new
+non-zero length record is added, the writer must emit a FIRST record
+(which contains zero bytes of user data) to fill up the trailing seven
+bytes of the block and then emit all of the user data in subsequent
+blocks.
+
+More types may be added in the future.  Some Readers may skip record
+types they do not understand, others may report that some data was
+skipped.
+
+FULL == 1
+FIRST == 2
+MIDDLE == 3
+LAST == 4
+
+The FULL record contains the contents of an entire user record.
+
+FIRST, MIDDLE, LAST are types used for user records that have been
+split into multiple fragments (typically because of block boundaries).
+FIRST is the type of the first fragment of a user record, LAST is the
+type of the last fragment of a user record, and MIDDLE is the type of all
+interior fragments of a user record.
+
+Example: consider a sequence of user records:
+   A: length 1000
+   B: length 97270
+   C: length 8000
+A will be stored as a FULL record in the first block.
+
+B will be split into three fragments: first fragment occupies the rest
+of the first block, second fragment occupies the entirety of the
+second block, and the third fragment occupies a prefix of the third
+block.  This will leave six bytes free in the third block, which will
+be left empty as the trailer.
+
+C will be stored as a FULL record in the fourth block.
+
+===================
+
+Some benefits over the recordio format:
+
+(1) We do not need any heuristics for resyncing - just go to next
+block boundary and scan.  If there is a corruption, skip to the next
+block.  As a side-benefit, we do not get confused when part of the
+contents of one log file are embedded as a record inside another log
+file.
+
+(2) Splitting at approximate boundaries (e.g., for mapreduce) is
+simple: find the next block boundary and skip records until we
+hit a FULL or FIRST record.
+
+(3) We do not need extra buffering for large records.
+
+Some downsides compared to recordio format:
+
+(1) No packing of tiny records.  This could be fixed by adding a new
+record type, so it is a shortcoming of the current implementation,
+not necessarily the format.
+
+(2) No compression.  Again, this could be fixed by adding new record types.
diff --git a/doc/rockslogo.jpg b/doc/rockslogo.jpg
new file mode 100644 (file)
index 0000000..363905a
Binary files /dev/null and b/doc/rockslogo.jpg differ
diff --git a/doc/rockslogo.png b/doc/rockslogo.png
new file mode 100644 (file)
index 0000000..1961360
Binary files /dev/null and b/doc/rockslogo.png differ
diff --git a/hdfs/README b/hdfs/README
new file mode 100644 (file)
index 0000000..9b7d0a6
--- /dev/null
@@ -0,0 +1,26 @@
+This directory contains the hdfs extensions needed to make rocksdb store
+files in HDFS.
+
+The hdfs.h file is copied from the Apache Hadoop 1.0 source code. 
+It defines the libhdfs library
+(http://hadoop.apache.org/common/docs/r0.20.2/libhdfs.html) to access 
+data in HDFS.  The libhdfs.a is copied from the Apache Hadoop 1.0 build. 
+It implements the API defined in hdfs.h. If your hadoop cluster is running
+a different hadoop release, then install these two files manually from your
+hadoop distribution and then recompile rocksdb.
+
+The env_hdfs.h file defines the rocksdb objects that are needed to talk to an
+underlying filesystem. 
+
+If you want to compile rocksdb with hdfs support, please set the following
+environment variables appropriately:
+   USE_HDFS=1
+   JAVA_HOME=/usr/local/jdk-6u22-64
+   LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/jdk-6u22-64/jre/lib/amd64/server:/usr/local/jdk-6u22-64/jre/lib/amd64/:./snappy/libs
+   make clean all db_bench
+
+To run db_bench,
+  set CLASSPATH to include your hadoop distribution
+  db_bench --hdfs="hdfs://hbaseudbperf001.snc1.facebook.com:9000"
+
+
diff --git a/hdfs/env_hdfs.h b/hdfs/env_hdfs.h
new file mode 100644 (file)
index 0000000..303cd81
--- /dev/null
@@ -0,0 +1,323 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#pragma once
+#include <algorithm>
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include <iostream>
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+
+#ifdef USE_HDFS
+#include "hdfs/hdfs.h"
+
+namespace rocksdb {
+
+static const std::string kProto = "hdfs://";
+static const std::string pathsep = "/";
+
+// Thrown during execution when there is an issue with the supplied
+// arguments.
+class HdfsUsageException : public std::exception { };
+
+// A simple exception that indicates something went wrong that is not
+// recoverable.  The intention is for the message to be printed (with
+// nothing else) and the process terminate.
+class HdfsFatalException : public std::exception {
+public:
+  explicit HdfsFatalException(const std::string& s) : what_(s) { }
+  virtual ~HdfsFatalException() throw() { }
+  virtual const char* what() const throw() {
+    return what_.c_str();
+  }
+private:
+  const std::string what_;
+};
+
+//
+// The HDFS environment for rocksdb. This class overrides all the
+// file/dir access methods and delegates the thread-mgmt methods to the
+// default posix environment.
+//
+class HdfsEnv : public Env {
+
+ public:
+  explicit HdfsEnv(const std::string& fsname) : fsname_(fsname) {
+    posixEnv = Env::Default();
+    fileSys_ = connectToPath(fsname_);
+  }
+
+  virtual ~HdfsEnv() {
+    fprintf(stderr, "Destroying HdfsEnv::Default()\n");
+    hdfsDisconnect(fileSys_);
+  }
+
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   SequentialFile** result);
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     RandomAccessFile** result);
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 WritableFile** result);
+
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options);
+
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result);
+
+  virtual bool FileExists(const std::string& fname);
+
+  virtual Status GetChildren(const std::string& path,
+                             std::vector<std::string>* result);
+
+  virtual Status DeleteFile(const std::string& fname);
+
+  virtual Status CreateDir(const std::string& name);
+
+  virtual Status CreateDirIfMissing(const std::string& name);
+
+  virtual Status DeleteDir(const std::string& name);
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* size);
+
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime);
+
+  virtual Status RenameFile(const std::string& src, const std::string& target);
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock);
+
+  virtual Status UnlockFile(FileLock* lock);
+
+  virtual Status NewLogger(const std::string& fname, Logger** result);
+
+  virtual void Schedule(void (*function)(void* arg), void* arg,
+                        Priority pri = LOW) {
+    posixEnv->Schedule(function, arg, pri);
+  }
+
+  virtual void StartThread(void (*function)(void* arg), void* arg) {
+    posixEnv->StartThread(function, arg);
+  }
+
+  virtual void WaitForJoin() { posixEnv->WaitForJoin(); }
+
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const
+      override {
+    return posixEnv->GetThreadPoolQueueLen(pri);
+  }
+
+  virtual Status GetTestDirectory(std::string* path) {
+    return posixEnv->GetTestDirectory(path);
+  }
+
+  virtual uint64_t NowMicros() {
+    return posixEnv->NowMicros();
+  }
+
+  virtual void SleepForMicroseconds(int micros) {
+    posixEnv->SleepForMicroseconds(micros);
+  }
+
+  virtual Status GetHostName(char* name, uint64_t len) {
+    return posixEnv->GetHostName(name, len);
+  }
+
+  virtual Status GetCurrentTime(int64_t* unix_time) {
+    return posixEnv->GetCurrentTime(unix_time);
+  }
+
+  virtual Status GetAbsolutePath(const std::string& db_path,
+      std::string* output_path) {
+    return posixEnv->GetAbsolutePath(db_path, output_path);
+  }
+
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) {
+    posixEnv->SetBackgroundThreads(number, pri);
+  }
+
+  virtual std::string TimeToString(uint64_t number) {
+    return posixEnv->TimeToString(number);
+  }
+
+  static uint64_t gettid() {
+    assert(sizeof(pthread_t) <= sizeof(uint64_t));
+    return (uint64_t)pthread_self();
+  }
+
+ private:
+  std::string fsname_;  // string of the form "hdfs://hostname:port/"
+  hdfsFS fileSys_;      //  a single FileSystem object for all files
+  Env*  posixEnv;       // This object is derived from Env, but not from
+                        // posixEnv. We have posixEnv as an encapsulated
+                        // object here so that we can use posix timers,
+                        // posix threads, etc.
+
+  /**
+   * If the URI is specified of the form hdfs://server:port/path,
+   * then connect to the specified cluster
+   * else connect to default.
+   */
+  hdfsFS connectToPath(const std::string& uri) {
+    if (uri.empty()) {
+      return nullptr;
+    }
+    if (uri.find(kProto) != 0) {
+      // uri doesn't start with hdfs:// -> use default:0, which is special
+      // to libhdfs.
+      return hdfsConnectNewInstance("default", 0);
+    }
+    const std::string hostport = uri.substr(kProto.length());
+
+    std::vector <std::string> parts;
+    split(hostport, ':', parts);
+    if (parts.size() != 2) {
+      throw HdfsFatalException("Bad uri for hdfs " + uri);
+    }
+    // parts[0] = hosts, parts[1] = port/xxx/yyy
+    std::string host(parts[0]);
+    std::string remaining(parts[1]);
+
+    int rem = remaining.find(pathsep);
+    std::string portStr = (rem == 0 ? remaining :
+                           remaining.substr(0, rem));
+
+    tPort port;
+    port = atoi(portStr.c_str());
+    if (port == 0) {
+      throw HdfsFatalException("Bad host-port for hdfs " + uri);
+    }
+    hdfsFS fs = hdfsConnectNewInstance(host.c_str(), port);
+    return fs;
+  }
+
+  void split(const std::string &s, char delim,
+             std::vector<std::string> &elems) {
+    elems.clear();
+    size_t prev = 0;
+    size_t pos = s.find(delim);
+    while (pos != std::string::npos) {
+      elems.push_back(s.substr(prev, pos));
+      prev = pos + 1;
+      pos = s.find(delim, prev);
+    }
+    elems.push_back(s.substr(prev, s.size()));
+  }
+};
+
+}  // namespace rocksdb
+
+#else // USE_HDFS
+
+
+namespace rocksdb {
+
+static const Status notsup;
+
+class HdfsEnv : public Env {
+
+ public:
+  explicit HdfsEnv(const std::string& fsname) {
+    fprintf(stderr, "You have not build rocksdb with HDFS support\n");
+    fprintf(stderr, "Please see hdfs/README for details\n");
+    throw std::exception();
+  }
+
+  virtual ~HdfsEnv() {
+  }
+
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options);
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options) {
+    return notsup;
+  }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) {
+    return notsup;
+  }
+
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options) {
+    return notsup;
+  }
+
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result) {
+    return notsup;
+  }
+
+  virtual bool FileExists(const std::string& fname){return false;}
+
+  virtual Status GetChildren(const std::string& path,
+                             std::vector<std::string>* result){return notsup;}
+
+  virtual Status DeleteFile(const std::string& fname){return notsup;}
+
+  virtual Status CreateDir(const std::string& name){return notsup;}
+
+  virtual Status CreateDirIfMissing(const std::string& name){return notsup;}
+
+  virtual Status DeleteDir(const std::string& name){return notsup;}
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* size){return notsup;}
+
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* time) {
+    return notsup;
+  }
+
+  virtual Status RenameFile(const std::string& src, const std::string& target){return notsup;}
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock){return notsup;}
+
+  virtual Status UnlockFile(FileLock* lock){return notsup;}
+
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result){return notsup;}
+
+  virtual void Schedule(void (*function)(void* arg), void* arg,
+                        Priority pri = LOW) {}
+
+  virtual void StartThread(void (*function)(void* arg), void* arg) {}
+
+  virtual void WaitForJoin() {}
+
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
+    return 0;
+  }
+
+  virtual Status GetTestDirectory(std::string* path) {return notsup;}
+
+  virtual uint64_t NowMicros() {return 0;}
+
+  virtual void SleepForMicroseconds(int micros) {}
+
+  virtual Status GetHostName(char* name, uint64_t len) {return notsup;}
+
+  virtual Status GetCurrentTime(int64_t* unix_time) {return notsup;}
+
+  virtual Status GetAbsolutePath(const std::string& db_path,
+      std::string* outputpath) {return notsup;}
+
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) {}
+
+  virtual std::string TimeToString(uint64_t number) { return "";}
+};
+}
+
+#endif // USE_HDFS
diff --git a/hdfs/hdfs.h b/hdfs/hdfs.h
new file mode 100644 (file)
index 0000000..8e8dfec
--- /dev/null
@@ -0,0 +1,477 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef LIBHDFS_HDFS_H
+#define LIBHDFS_HDFS_H
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+#include <errno.h>
+
+#include <jni.h>
+
+#ifndef O_RDONLY
+#define O_RDONLY 1
+#endif
+
+#ifndef O_WRONLY 
+#define O_WRONLY 2
+#endif
+
+#ifndef EINTERNAL
+#define EINTERNAL 255 
+#endif
+
+
+/** All APIs set errno to meaningful values */
+#ifdef __cplusplus
+extern  "C" {
+#endif
+
+    /**
+     * Some utility decls used in libhdfs.
+     */
+
+    typedef int32_t   tSize; /// size of data for read/write io ops 
+    typedef time_t    tTime; /// time type in seconds
+    typedef int64_t   tOffset;/// offset within the file
+    typedef uint16_t  tPort; /// port
+    typedef enum tObjectKind {
+        kObjectKindFile = 'F',
+        kObjectKindDirectory = 'D',
+    } tObjectKind;
+
+
+    /**
+     * The C reflection of org.apache.org.hadoop.FileSystem .
+     */
+    typedef void* hdfsFS;
+
+    
+    /**
+     * The C equivalent of org.apache.org.hadoop.FSData(Input|Output)Stream .
+     */
+    enum hdfsStreamType
+    {
+        UNINITIALIZED = 0,
+        INPUT = 1,
+        OUTPUT = 2,
+    };
+
+    
+    /**
+     * The 'file-handle' to a file in hdfs.
+     */
+    struct hdfsFile_internal {
+        void* file;
+        enum hdfsStreamType type;
+    };
+    typedef struct hdfsFile_internal* hdfsFile;
+      
+
+    /** 
+     * hdfsConnectAsUser - Connect to a hdfs file system as a specific user
+     * Connect to the hdfs.
+     * @param host A string containing either a host name, or an ip address
+     * of the namenode of a hdfs cluster. 'host' should be passed as NULL if
+     * you want to connect to local filesystem. 'host' should be passed as
+     * 'default' (and port as 0) to used the 'configured' filesystem
+     * (core-site/core-default.xml).
+     * @param port The port on which the server is listening.
+     * @param user the user name (this is hadoop domain user). Or NULL is equivalent to hdfsConnect(host, port)
+     * @param groups the groups (these are hadoop domain groups)
+     * @return Returns a handle to the filesystem or NULL on error.
+     */
+     hdfsFS hdfsConnectAsUser(const char* host, tPort port, const char *user , const char *groups[], int groups_size );
+
+
+    /** 
+     * hdfsConnect - Connect to a hdfs file system.
+     * Connect to the hdfs.
+     * @param host A string containing either a host name, or an ip address
+     * of the namenode of a hdfs cluster. 'host' should be passed as NULL if
+     * you want to connect to local filesystem. 'host' should be passed as
+     * 'default' (and port as 0) to used the 'configured' filesystem
+     * (core-site/core-default.xml).
+     * @param port The port on which the server is listening.
+     * @return Returns a handle to the filesystem or NULL on error.
+     */
+     hdfsFS hdfsConnect(const char* host, tPort port);
+
+
+    /**
+     * These are the same as hdfsConnectAsUser except that every invocation returns a new FileSystem handle.
+     * Applications should call a hdfsDisconnect for every call to hdfsConnectAsUserNewInstance.
+     */
+     hdfsFS hdfsConnectAsUserNewInstance(const char* host, tPort port, const char *user , const char *groups[], int groups_size );
+     hdfsFS hdfsConnectNewInstance(const char* host, tPort port);
+     hdfsFS hdfsConnectPath(const char* uri);
+
+    /** 
+     * hdfsDisconnect - Disconnect from the hdfs file system.
+     * Disconnect from hdfs.
+     * @param fs The configured filesystem handle.
+     * @return Returns 0 on success, -1 on error.  
+     */
+    int hdfsDisconnect(hdfsFS fs);
+        
+
+    /** 
+     * hdfsOpenFile - Open a hdfs file in given mode.
+     * @param fs The configured filesystem handle.
+     * @param path The full path to the file.
+     * @param flags - an | of bits/fcntl.h file flags - supported flags are O_RDONLY, O_WRONLY (meaning create or overwrite i.e., implies O_TRUNC), 
+     * O_WRONLY|O_APPEND. Other flags are generally ignored other than (O_RDWR || (O_EXCL & O_CREAT)) which return NULL and set errno equal ENOTSUP.
+     * @param bufferSize Size of buffer for read/write - pass 0 if you want
+     * to use the default configured values.
+     * @param replication Block replication - pass 0 if you want to use
+     * the default configured values.
+     * @param blocksize Size of block - pass 0 if you want to use the
+     * default configured values.
+     * @return Returns the handle to the open file or NULL on error.
+     */
+    hdfsFile hdfsOpenFile(hdfsFS fs, const char* path, int flags,
+                          int bufferSize, short replication, tSize blocksize);
+
+
+    /** 
+     * hdfsCloseFile - Close an open file. 
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @return Returns 0 on success, -1 on error.  
+     */
+    int hdfsCloseFile(hdfsFS fs, hdfsFile file);
+
+
+    /** 
+     * hdfsExists - Checks if a given path exists on the filesystem 
+     * @param fs The configured filesystem handle.
+     * @param path The path to look for
+     * @return Returns 0 on exists, 1 on non-exists, -1/-2 on error.  
+     */
+    int hdfsExists(hdfsFS fs, const char *path);
+
+
+    /** 
+     * hdfsSeek - Seek to given offset in file. 
+     * This works only for files opened in read-only mode. 
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @param desiredPos Offset into the file to seek into.
+     * @return Returns 0 on success, -1 on error.  
+     */
+    int hdfsSeek(hdfsFS fs, hdfsFile file, tOffset desiredPos); 
+
+
+    /** 
+     * hdfsTell - Get the current offset in the file, in bytes.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @return Current offset, -1 on error.
+     */
+    tOffset hdfsTell(hdfsFS fs, hdfsFile file);
+
+
+    /** 
+     * hdfsRead - Read data from an open file.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @param buffer The buffer to copy read bytes into.
+     * @param length The length of the buffer.
+     * @return Returns the number of bytes actually read, possibly less
+     * than length; -1 on error.
+     */
+    tSize hdfsRead(hdfsFS fs, hdfsFile file, void* buffer, tSize length);
+
+
+    /** 
+     * hdfsPread - Positional read of data from an open file.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @param position Position from which to read
+     * @param buffer The buffer to copy read bytes into.
+     * @param length The length of the buffer.
+     * @return Returns the number of bytes actually read, possibly less than
+     * length; -1 on error.
+     */
+    tSize hdfsPread(hdfsFS fs, hdfsFile file, tOffset position,
+                    void* buffer, tSize length);
+
+
+    /** 
+     * hdfsWrite - Write data into an open file.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @param buffer The data.
+     * @param length The no. of bytes to write. 
+     * @return Returns the number of bytes written, -1 on error.
+     */
+    tSize hdfsWrite(hdfsFS fs, hdfsFile file, const void* buffer,
+                    tSize length);
+
+
+    /** 
+     * hdfsFlush - Flush the data. 
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsFlush(hdfsFS fs, hdfsFile file);
+
+    /**
+     * hdfsSync - Sync the data to persistent store.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @return Returns 0 on success, -1 on error.
+     */
+    int hdfsSync(hdfsFS fs, hdfsFile file);
+
+    /**
+     * hdfsGetNumReplicasInPipeline - get number of remaining replicas in 
+     * pipeline
+     * @param fs The configured filesystem handle
+     * @param file the file handle
+     * @return returns the # of datanodes in the write pipeline; -1 on error
+     */
+   int hdfsGetNumCurrentReplicas(hdfsFS, hdfsFile file);
+
+    /**
+     * hdfsAvailable - Number of bytes that can be read from this
+     * input stream without blocking.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @return Returns available bytes; -1 on error. 
+     */
+    int hdfsAvailable(hdfsFS fs, hdfsFile file);
+
+
+    /**
+     * hdfsCopy - Copy file from one filesystem to another.
+     * @param srcFS The handle to source filesystem.
+     * @param src The path of source file. 
+     * @param dstFS The handle to destination filesystem.
+     * @param dst The path of destination file. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsCopy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst);
+
+
+    /**
+     * hdfsMove - Move file from one filesystem to another.
+     * @param srcFS The handle to source filesystem.
+     * @param src The path of source file. 
+     * @param dstFS The handle to destination filesystem.
+     * @param dst The path of destination file. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsMove(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst);
+
+
+    /**
+     * hdfsDelete - Delete file. 
+     * @param fs The configured filesystem handle.
+     * @param path The path of the file. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsDelete(hdfsFS fs, const char* path);
+
+
+    /**
+     * hdfsRename - Rename file. 
+     * @param fs The configured filesystem handle.
+     * @param oldPath The path of the source file. 
+     * @param newPath The path of the destination file. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsRename(hdfsFS fs, const char* oldPath, const char* newPath);
+
+
+    /** 
+     * hdfsGetWorkingDirectory - Get the current working directory for
+     * the given filesystem.
+     * @param fs The configured filesystem handle.
+     * @param buffer The user-buffer to copy path of cwd into. 
+     * @param bufferSize The length of user-buffer.
+     * @return Returns buffer, NULL on error.
+     */
+    char* hdfsGetWorkingDirectory(hdfsFS fs, char *buffer, size_t bufferSize);
+
+
+    /** 
+     * hdfsSetWorkingDirectory - Set the working directory. All relative
+     * paths will be resolved relative to it.
+     * @param fs The configured filesystem handle.
+     * @param path The path of the new 'cwd'. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsSetWorkingDirectory(hdfsFS fs, const char* path);
+
+
+    /** 
+     * hdfsCreateDirectory - Make the given file and all non-existent
+     * parents into directories.
+     * @param fs The configured filesystem handle.
+     * @param path The path of the directory. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsCreateDirectory(hdfsFS fs, const char* path);
+
+
+    /** 
+     * hdfsSetReplication - Set the replication of the specified
+     * file to the supplied value
+     * @param fs The configured filesystem handle.
+     * @param path The path of the file. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsSetReplication(hdfsFS fs, const char* path, int16_t replication);
+
+
    /** 
     * hdfsFileInfo - Information about a file/directory.
     * Instances are dynamically allocated by hdfsListDirectory /
     * hdfsGetPathInfo and must be released with hdfsFreeFileInfo.
     */
    typedef struct  {
        tObjectKind mKind;   /* file or directory */
        char *mName;         /* the name of the file */
        tTime mLastMod;      /* the last modification time for the file in seconds */
        tOffset mSize;       /* the size of the file in bytes */
        short mReplication;    /* the count of replicas */
        tOffset mBlockSize;  /* the block size for the file */
        char *mOwner;        /* the owner of the file */
        char *mGroup;        /* the group associated with the file */
        short mPermissions;  /* the permissions associated with the file */
        tTime mLastAccess;    /* the last access time for the file in seconds */
    } hdfsFileInfo;
+
+
+    /** 
+     * hdfsListDirectory - Get list of files/directories for a given
+     * directory-path. hdfsFreeFileInfo should be called to deallocate memory if
+     * the function returns non-NULL value.
+     * @param fs The configured filesystem handle.
+     * @param path The path of the directory. 
+     * @param numEntries Set to the number of files/directories in path.
+     * @return Returns a dynamically-allocated array of hdfsFileInfo
+     * objects; NULL if empty or on error.
+     * on error, numEntries will be -1.
+     */
+    hdfsFileInfo *hdfsListDirectory(hdfsFS fs, const char* path,
+                                    int *numEntries);
+
+
+    /** 
+     * hdfsGetPathInfo - Get information about a path as a (dynamically
+     * allocated) single hdfsFileInfo struct. hdfsFreeFileInfo should be
+     * called when the pointer is no longer needed.
+     * @param fs The configured filesystem handle.
+     * @param path The path of the file. 
+     * @return Returns a dynamically-allocated hdfsFileInfo object;
+     * NULL on error.
+     */
+    hdfsFileInfo *hdfsGetPathInfo(hdfsFS fs, const char* path);
+
+
+    /** 
+     * hdfsFreeFileInfo - Free up the hdfsFileInfo array (including fields) 
+     * @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo
+     * objects.
+     * @param numEntries The size of the array.
+     */
+    void hdfsFreeFileInfo(hdfsFileInfo *hdfsFileInfo, int numEntries);
+
+
+    /** 
+     * hdfsGetHosts - Get hostnames where a particular block (determined by
+     * pos & blocksize) of a file is stored. The last element in the array
+     * is NULL. Due to replication, a single block could be present on
+     * multiple hosts.
+     * @param fs The configured filesystem handle.
+     * @param path The path of the file. 
+     * @param start The start of the block.
+     * @param length The length of the block.
+     * @return Returns a dynamically-allocated 2-d array of blocks-hosts;
+     * NULL on error.
+     */
+    char*** hdfsGetHosts(hdfsFS fs, const char* path, 
+            tOffset start, tOffset length);
+
+
+    /** 
+     * hdfsFreeHosts - Free up the structure returned by hdfsGetHosts.
+     * @param blockHosts The dynamically-allocated 2-d array of block-hosts
+     * returned by hdfsGetHosts.
+     */
+    void hdfsFreeHosts(char ***blockHosts);
+
+
+    /** 
+     * hdfsGetDefaultBlockSize - Get the optimum blocksize.
+     * @param fs The configured filesystem handle.
+     * @return Returns the blocksize; -1 on error. 
+     */
+    tOffset hdfsGetDefaultBlockSize(hdfsFS fs);
+
+
+    /** 
+     * hdfsGetCapacity - Return the raw capacity of the filesystem.  
+     * @param fs The configured filesystem handle.
+     * @return Returns the raw-capacity; -1 on error. 
+     */
+    tOffset hdfsGetCapacity(hdfsFS fs);
+
+
+    /** 
+     * hdfsGetUsed - Return the total raw size of all files in the filesystem.
+     * @param fs The configured filesystem handle.
+     * @return Returns the total-size; -1 on error. 
+     */
+    tOffset hdfsGetUsed(hdfsFS fs);
+
+    /** 
+     * hdfsChown 
+     * @param fs The configured filesystem handle.
+     * @param path the path to the file or directory
+     * @param owner this is a string in Hadoop land. Set to null or "" if only setting group
+     * @param group  this is a string in Hadoop land. Set to null or "" if only setting user
+     * @return 0 on success else -1
+     */
+    int hdfsChown(hdfsFS fs, const char* path, const char *owner, const char *group);
+
+    /** 
+     * hdfsChmod
+     * @param fs The configured filesystem handle.
+     * @param path the path to the file or directory
+     * @param mode the bitmask to set it to
+     * @return 0 on success else -1
+     */
+      int hdfsChmod(hdfsFS fs, const char* path, short mode);
+
+    /** 
+     * hdfsUtime
+     * @param fs The configured filesystem handle.
+     * @param path the path to the file or directory
+     * @param mtime new modification time or 0 for only set access time in seconds
+     * @param atime new access time or 0 for only set modification time in seconds
+     * @return 0 on success else -1
+     */
+    int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime);
+    
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*LIBHDFS_HDFS_H*/
+
+/**
+ * vim: ts=4: sw=4: et
+ */
diff --git a/hdfs/libhdfs.a b/hdfs/libhdfs.a
new file mode 100644 (file)
index 0000000..4d1f19f
Binary files /dev/null and b/hdfs/libhdfs.a differ
diff --git a/helpers/memenv/memenv.cc b/helpers/memenv/memenv.cc
new file mode 100644 (file)
index 0000000..185e7d8
--- /dev/null
@@ -0,0 +1,395 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+#include <map>
+#include <string.h>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+namespace {
+
class FileState {
 public:
  // FileStates are reference counted. The initial reference count is zero
  // and the caller must call Ref() at least once.
  FileState() : refs_(0), size_(0) {}

  // Increase the reference count.
  void Ref() {
    MutexLock lock(&refs_mutex_);
    ++refs_;
  }

  // Decrease the reference count. Delete if this is the last reference.
  void Unref() {
    bool do_delete = false;

    {
      MutexLock lock(&refs_mutex_);
      --refs_;
      assert(refs_ >= 0);
      if (refs_ <= 0) {
        do_delete = true;
      }
    }

    // Deletion happens outside the critical section: refs_mutex_ is a
    // member, so it must not be held while this object is destroyed.
    if (do_delete) {
      delete this;
    }
  }

  // Logical file size in bytes (unsynchronized; see note on size_ below).
  uint64_t Size() const { return size_; }

  // Copies up to n bytes starting at offset into *result. If the range lies
  // within a single block, *result points directly into that block and
  // scratch is untouched; otherwise bytes are assembled into scratch.
  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const {
    if (offset > size_) {
      return Status::IOError("Offset greater than file size.");
    }
    // Clamp the request to the bytes actually present past offset.
    const uint64_t available = size_ - offset;
    if (n > available) {
      n = available;
    }
    if (n == 0) {
      *result = Slice();
      return Status::OK();
    }

    size_t block = offset / kBlockSize;
    size_t block_offset = offset % kBlockSize;

    if (n <= kBlockSize - block_offset) {
      // The requested bytes are all in the first block.
      *result = Slice(blocks_[block] + block_offset, n);
      return Status::OK();
    }

    // The range crosses block boundaries: copy block by block into scratch.
    size_t bytes_to_copy = n;
    char* dst = scratch;

    while (bytes_to_copy > 0) {
      size_t avail = kBlockSize - block_offset;
      if (avail > bytes_to_copy) {
        avail = bytes_to_copy;
      }
      memcpy(dst, blocks_[block] + block_offset, avail);

      bytes_to_copy -= avail;
      dst += avail;
      block++;
      block_offset = 0;  // Only the first block can start mid-way.
    }

    *result = Slice(scratch, n);
    return Status::OK();
  }

  // Appends data to the end of the file, allocating fresh kBlockSize blocks
  // as needed. Not synchronized; see note on blocks_/size_ below.
  Status Append(const Slice& data) {
    const char* src = data.data();
    size_t src_len = data.size();

    while (src_len > 0) {
      size_t avail;
      size_t offset = size_ % kBlockSize;

      if (offset != 0) {
        // There is some room in the last block.
        avail = kBlockSize - offset;
      } else {
        // No room in the last block; push new one.
        blocks_.push_back(new char[kBlockSize]);
        avail = kBlockSize;
      }

      if (avail > src_len) {
        avail = src_len;
      }
      memcpy(blocks_.back() + offset, src, avail);
      src_len -= avail;
      src += avail;
      size_ += avail;
    }

    return Status::OK();
  }

 private:
  // Private since only Unref() should be used to delete it.
  ~FileState() {
    for (std::vector<char*>::iterator i = blocks_.begin(); i != blocks_.end();
         ++i) {
      delete [] *i;
    }
  }

  // No copying allowed.
  FileState(const FileState&);
  void operator=(const FileState&);

  port::Mutex refs_mutex_;
  int refs_;  // Protected by refs_mutex_;

  // The following fields are not protected by any mutex. They are only mutable
  // while the file is being written, and concurrent access is not allowed
  // to writable files.
  std::vector<char*> blocks_;  // Fixed-size heap blocks owned by this object.
  uint64_t size_;              // Logical file size in bytes.

  enum { kBlockSize = 8 * 1024 };
};
+
+class SequentialFileImpl : public SequentialFile {
+ public:
+  explicit SequentialFileImpl(FileState* file) : file_(file), pos_(0) {
+    file_->Ref();
+  }
+
+  ~SequentialFileImpl() {
+    file_->Unref();
+  }
+
+  virtual Status Read(size_t n, Slice* result, char* scratch) {
+    Status s = file_->Read(pos_, n, result, scratch);
+    if (s.ok()) {
+      pos_ += result->size();
+    }
+    return s;
+  }
+
+  virtual Status Skip(uint64_t n) {
+    if (pos_ > file_->Size()) {
+      return Status::IOError("pos_ > file_->Size()");
+    }
+    const size_t available = file_->Size() - pos_;
+    if (n > available) {
+      n = available;
+    }
+    pos_ += n;
+    return Status::OK();
+  }
+
+ private:
+  FileState* file_;
+  size_t pos_;
+};
+
+class RandomAccessFileImpl : public RandomAccessFile {
+ public:
+  explicit RandomAccessFileImpl(FileState* file) : file_(file) {
+    file_->Ref();
+  }
+
+  ~RandomAccessFileImpl() {
+    file_->Unref();
+  }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    return file_->Read(offset, n, result, scratch);
+  }
+
+ private:
+  FileState* file_;
+};
+
+class WritableFileImpl : public WritableFile {
+ public:
+  WritableFileImpl(FileState* file) : file_(file) {
+    file_->Ref();
+  }
+
+  ~WritableFileImpl() {
+    file_->Unref();
+  }
+
+  virtual Status Append(const Slice& data) {
+    return file_->Append(data);
+  }
+
+  virtual Status Close() { return Status::OK(); }
+  virtual Status Flush() { return Status::OK(); }
+  virtual Status Sync() { return Status::OK(); }
+
+ private:
+  FileState* file_;
+};
+
// Directory handle for the in-memory filesystem. Nothing is ever persisted,
// so Fsync has nothing to do.
class InMemoryDirectory : public Directory {
 public:
  virtual Status Fsync() { return Status::OK(); }
};
+
// An Env that keeps every file entirely in memory (a name -> FileState map).
// Anything not overridden here is forwarded to the wrapped base Env via
// EnvWrapper.
class InMemoryEnv : public EnvWrapper {
 public:
  explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }

  virtual ~InMemoryEnv() {
    // Drop this Env's reference on each file; a FileState deletes itself
    // once its last reference (e.g. a still-open file handle) is released.
    for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
      i->second->Unref();
    }
  }

  // Partial implementation of the Env interface.

  // Opens an existing file for sequential reads; IOError if absent.
  virtual Status NewSequentialFile(const std::string& fname,
                                   unique_ptr<SequentialFile>* result,
                                   const EnvOptions& soptions) {
    MutexLock lock(&mutex_);
    if (file_map_.find(fname) == file_map_.end()) {
      *result = NULL;
      return Status::IOError(fname, "File not found");
    }

    result->reset(new SequentialFileImpl(file_map_[fname]));
    return Status::OK();
  }

  // Opens an existing file for positional reads; IOError if absent.
  virtual Status NewRandomAccessFile(const std::string& fname,
                                     unique_ptr<RandomAccessFile>* result,
                                     const EnvOptions& soptions) {
    MutexLock lock(&mutex_);
    if (file_map_.find(fname) == file_map_.end()) {
      *result = NULL;
      return Status::IOError(fname, "File not found");
    }

    result->reset(new RandomAccessFileImpl(file_map_[fname]));
    return Status::OK();
  }

  // Creates a writable file, replacing (truncating) any existing file with
  // the same name.
  virtual Status NewWritableFile(const std::string& fname,
                                 unique_ptr<WritableFile>* result,
                                 const EnvOptions& soptions) {
    MutexLock lock(&mutex_);
    if (file_map_.find(fname) != file_map_.end()) {
      DeleteFileInternal(fname);
    }

    FileState* file = new FileState();
    file->Ref();  // The map itself holds one reference.
    file_map_[fname] = file;

    result->reset(new WritableFileImpl(file));
    return Status::OK();
  }

  // Directories are purely nominal in this Env; hand back a no-op handle.
  virtual Status NewDirectory(const std::string& name,
                              unique_ptr<Directory>* result) {
    result->reset(new InMemoryDirectory());
    return Status::OK();
  }

  virtual bool FileExists(const std::string& fname) {
    MutexLock lock(&mutex_);
    return file_map_.find(fname) != file_map_.end();
  }

  // Lists immediate children of dir: an entry named "<dir>/<child>" is
  // reported as "<child>". Matching is a simple path-prefix comparison.
  virtual Status GetChildren(const std::string& dir,
                             std::vector<std::string>* result) {
    MutexLock lock(&mutex_);
    result->clear();

    for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
      const std::string& filename = i->first;

      if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' &&
          Slice(filename).starts_with(Slice(dir))) {
        result->push_back(filename.substr(dir.size() + 1));
      }
    }

    return Status::OK();
  }

  // Removes fname from the map and drops the map's reference; no-op if the
  // file is absent. Requires: caller holds mutex_ (true at every call site).
  void DeleteFileInternal(const std::string& fname) {
    if (file_map_.find(fname) == file_map_.end()) {
      return;
    }

    file_map_[fname]->Unref();
    file_map_.erase(fname);
  }

  virtual Status DeleteFile(const std::string& fname) {
    MutexLock lock(&mutex_);
    if (file_map_.find(fname) == file_map_.end()) {
      return Status::IOError(fname, "File not found");
    }

    DeleteFileInternal(fname);
    return Status::OK();
  }

  // Directory creation/removal are no-ops; see NewDirectory above.
  virtual Status CreateDir(const std::string& dirname) {
    return Status::OK();
  }

  virtual Status CreateDirIfMissing(const std::string& dirname) {
    return Status::OK();
  }

  virtual Status DeleteDir(const std::string& dirname) {
    return Status::OK();
  }

  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) {
    MutexLock lock(&mutex_);
    if (file_map_.find(fname) == file_map_.end()) {
      return Status::IOError(fname, "File not found");
    }

    *file_size = file_map_[fname]->Size();
    return Status::OK();
  }

  // Modification times are not tracked for in-memory files.
  virtual Status GetFileModificationTime(const std::string& fname,
                                         uint64_t* time) {
    return Status::NotSupported("getFileMTime", "Not supported in MemEnv");
  }

  // Renames src to target, silently replacing any existing target.
  virtual Status RenameFile(const std::string& src,
                            const std::string& target) {
    MutexLock lock(&mutex_);
    if (file_map_.find(src) == file_map_.end()) {
      return Status::IOError(src, "File not found");
    }

    DeleteFileInternal(target);
    file_map_[target] = file_map_[src];
    file_map_.erase(src);
    return Status::OK();
  }

  // Locking is nominal: any name can always be "locked"; the returned
  // FileLock merely has to be passed back to UnlockFile.
  virtual Status LockFile(const std::string& fname, FileLock** lock) {
    *lock = new FileLock;
    return Status::OK();
  }

  virtual Status UnlockFile(FileLock* lock) {
    delete lock;
    return Status::OK();
  }

  virtual Status GetTestDirectory(std::string* path) {
    *path = "/test";
    return Status::OK();
  }

 private:
  // Map from filenames to FileState objects, representing a simple file system.
  typedef std::map<std::string, FileState*> FileSystem;
  port::Mutex mutex_;
  FileSystem file_map_;  // Protected by mutex_.
};
+
+}  // namespace
+
// Returns a new Env whose files live entirely in memory; ownership of the
// returned Env passes to the caller. NOTE(review): base_env is handed to
// EnvWrapper, so it presumably must outlive the returned Env — confirm
// against EnvWrapper's contract.
Env* NewMemEnv(Env* base_env) {
  return new InMemoryEnv(base_env);
}
+
+}  // namespace rocksdb
diff --git a/helpers/memenv/memenv_test.cc b/helpers/memenv/memenv_test.cc
new file mode 100644 (file)
index 0000000..ea3ed61
--- /dev/null
@@ -0,0 +1,231 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "util/testharness.h"
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
// Test fixture: every test body runs with a fresh in-memory Env wrapping
// Env::Default().
class MemEnvTest {
 public:
  Env* env_;                   // In-memory Env under test; owned by the fixture.
  const EnvOptions soptions_;  // Default options passed to the file factories.

  MemEnvTest()
      : env_(NewMemEnv(Env::Default())) {
  }
  ~MemEnvTest() {
    delete env_;
  }
};
+
// Covers the basic file lifecycle: creation, existence checks, sizes,
// directory listing, rename, and deletion.
TEST(MemEnvTest, Basics) {
  uint64_t file_size;
  unique_ptr<WritableFile> writable_file;
  std::vector<std::string> children;

  ASSERT_OK(env_->CreateDir("/dir"));

  // Check that the directory is empty.
  ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
  ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
  ASSERT_OK(env_->GetChildren("/dir", &children));
  ASSERT_EQ(0U, children.size());

  // Create a file.
  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
  writable_file.reset();

  // Check that the file exists.
  ASSERT_TRUE(env_->FileExists("/dir/f"));
  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
  ASSERT_EQ(0U, file_size);
  ASSERT_OK(env_->GetChildren("/dir", &children));
  ASSERT_EQ(1U, children.size());
  ASSERT_EQ("f", children[0]);

  // Write to the file.
  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
  ASSERT_OK(writable_file->Append("abc"));
  writable_file.reset();

  // Check for expected size.
  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
  ASSERT_EQ(3U, file_size);

  // Check that renaming works.
  ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
  ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
  ASSERT_TRUE(!env_->FileExists("/dir/f"));
  ASSERT_TRUE(env_->FileExists("/dir/g"));
  ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
  ASSERT_EQ(3U, file_size);

  // Check that opening non-existent file fails.
  unique_ptr<SequentialFile> seq_file;
  unique_ptr<RandomAccessFile> rand_file;
  ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file,
                                       soptions_).ok());
  ASSERT_TRUE(!seq_file);
  ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file,
                                         soptions_).ok());
  ASSERT_TRUE(!rand_file);

  // Check that deleting works.
  ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
  ASSERT_OK(env_->DeleteFile("/dir/g"));
  ASSERT_TRUE(!env_->FileExists("/dir/g"));
  ASSERT_OK(env_->GetChildren("/dir", &children));
  ASSERT_EQ(0U, children.size());
  ASSERT_OK(env_->DeleteDir("/dir"));
}
+
// Writes a small file and reads it back both sequentially (with Skip and
// past-EOF reads) and via positional random access.
TEST(MemEnvTest, ReadWrite) {
  unique_ptr<WritableFile> writable_file;
  unique_ptr<SequentialFile> seq_file;
  unique_ptr<RandomAccessFile> rand_file;
  Slice result;
  char scratch[100];

  ASSERT_OK(env_->CreateDir("/dir"));

  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
  ASSERT_OK(writable_file->Append("hello "));
  ASSERT_OK(writable_file->Append("world"));
  writable_file.reset();

  // Read sequentially.
  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
  ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello".
  ASSERT_EQ(0, result.compare("hello"));
  ASSERT_OK(seq_file->Skip(1));
  ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world".
  ASSERT_EQ(0, result.compare("world"));
  ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF.
  ASSERT_EQ(0U, result.size());
  ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file.
  ASSERT_OK(seq_file->Read(1000, &result, scratch));
  ASSERT_EQ(0U, result.size());

  // Random reads.
  ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_));
  ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world".
  ASSERT_EQ(0, result.compare("world"));
  ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello".
  ASSERT_EQ(0, result.compare("hello"));
  ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d".
  ASSERT_EQ(0, result.compare("d"));

  // Too high offset.
  ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok());
}
+
// LockFile/UnlockFile are nominal no-ops in MemEnv; verify they report
// success.
TEST(MemEnvTest, Locks) {
  FileLock* lock;

  // These are no-ops, but we test they return success.
  ASSERT_OK(env_->LockFile("some file", &lock));
  ASSERT_OK(env_->UnlockFile(lock));
}
+
// Covers GetTestDirectory and the no-op WritableFile Sync/Flush/Close.
TEST(MemEnvTest, Misc) {
  std::string test_dir;
  ASSERT_OK(env_->GetTestDirectory(&test_dir));
  ASSERT_TRUE(!test_dir.empty());

  unique_ptr<WritableFile> writable_file;
  // No CreateDir needed first: MemEnv files are keyed purely by name.
  ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_));

  // These are no-ops, but we test they return success.
  ASSERT_OK(writable_file->Sync());
  ASSERT_OK(writable_file->Flush());
  ASSERT_OK(writable_file->Close());
  writable_file.reset();
}
+
+TEST(MemEnvTest, LargeWrite) {
+  const size_t kWriteSize = 300 * 1024;
+  char* scratch = new char[kWriteSize * 2];
+
+  std::string write_data;
+  for (size_t i = 0; i < kWriteSize; ++i) {
+    write_data.append(1, static_cast<char>(i));
+  }
+
+  unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("foo"));
+  ASSERT_OK(writable_file->Append(write_data));
+  writable_file.reset();
+
+  unique_ptr<SequentialFile> seq_file;
+  Slice result;
+  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
+  ASSERT_EQ(0, result.compare("foo"));
+
+  size_t read = 0;
+  std::string read_data;
+  while (read < kWriteSize) {
+    ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch));
+    read_data.append(result.data(), result.size());
+    read += result.size();
+  }
+  ASSERT_TRUE(write_data == read_data);
+  delete [] scratch;
+}
+
+TEST(MemEnvTest, DBTest) {
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_;
+  DB* db;
+
+  const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+  const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
+  ASSERT_OK(DB::Open(options, "/dir/db", &db));
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+  }
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  Iterator* iterator = db->NewIterator(ReadOptions());
+  iterator->SeekToFirst();
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_TRUE(keys[i] == iterator->key());
+    ASSERT_TRUE(vals[i] == iterator->value());
+    iterator->Next();
+  }
+  ASSERT_TRUE(!iterator->Valid());
+  delete iterator;
+
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  delete db;
+}
+
+}  // namespace rocksdb
+
// Entry point: runs every TEST registered above via the project's harness.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
new file mode 100644 (file)
index 0000000..013ee5d
--- /dev/null
@@ -0,0 +1,575 @@
+/*  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+  This source code is licensed under the BSD-style license found in the
+  LICENSE file in the root directory of this source tree. An additional grant
+  of patent rights can be found in the PATENTS file in the same directory.
+ Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+  Use of this source code is governed by a BSD-style license that can be
+  found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+  C bindings for leveldb.  May be useful as a stable ABI that can be
+  used by programs that keep leveldb in a shared library, or for
+  a JNI api.
+
+  Does not support:
+  . getters for the option types
+  . custom comparators that implement key shortening
+  . capturing post-write-snapshot
+  . custom iter, db, env, cache implementations using just the C bindings
+
+  Some conventions:
+
+  (1) We expose just opaque struct pointers and functions to clients.
+  This allows us to change internal representations without having to
+  recompile clients.
+
+  (2) For simplicity, there is no equivalent to the Slice type.  Instead,
+  the caller has to pass the pointer and length as separate
+  arguments.
+
+  (3) Errors are represented by a null-terminated c string.  NULL
+  means no error.  All operations that can raise an error are passed
+  a "char** errptr" as the last argument.  One of the following must
+  be true on entry:
+     *errptr == NULL
+     *errptr points to a malloc()ed null-terminated error message
+  On success, a leveldb routine leaves *errptr unchanged.
+  On failure, leveldb frees the old value of *errptr and
+  set *errptr to a malloc()ed error message.
+
+  (4) Bools have the type unsigned char (0 == false; rest == true)
+
+  (5) All of the pointer arguments must be non-NULL.
+*/
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_
+#define STORAGE_ROCKSDB_INCLUDE_C_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Exported types */
+
+typedef struct rocksdb_t                 rocksdb_t;
+typedef struct rocksdb_cache_t           rocksdb_cache_t;
+typedef struct rocksdb_comparator_t      rocksdb_comparator_t;
+typedef struct rocksdb_env_t             rocksdb_env_t;
+typedef struct rocksdb_filelock_t        rocksdb_filelock_t;
+typedef struct rocksdb_filterpolicy_t    rocksdb_filterpolicy_t;
+typedef struct rocksdb_flushoptions_t    rocksdb_flushoptions_t;
+typedef struct rocksdb_iterator_t        rocksdb_iterator_t;
+typedef struct rocksdb_logger_t          rocksdb_logger_t;
+typedef struct rocksdb_mergeoperator_t   rocksdb_mergeoperator_t;
+typedef struct rocksdb_options_t         rocksdb_options_t;
+typedef struct rocksdb_randomfile_t      rocksdb_randomfile_t;
+typedef struct rocksdb_readoptions_t     rocksdb_readoptions_t;
+typedef struct rocksdb_seqfile_t         rocksdb_seqfile_t;
+typedef struct rocksdb_slicetransform_t  rocksdb_slicetransform_t;
+typedef struct rocksdb_snapshot_t        rocksdb_snapshot_t;
+typedef struct rocksdb_writablefile_t    rocksdb_writablefile_t;
+typedef struct rocksdb_writebatch_t      rocksdb_writebatch_t;
+typedef struct rocksdb_writeoptions_t    rocksdb_writeoptions_t;
+typedef struct rocksdb_universal_compaction_options_t rocksdb_universal_compaction_options_t;
+typedef struct rocksdb_livefiles_t     rocksdb_livefiles_t;
+
+/* DB operations */
+
+extern rocksdb_t* rocksdb_open(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr);
+
+extern rocksdb_t* rocksdb_open_for_read_only(
+    const rocksdb_options_t* options,
+    const char* name,
+    unsigned char error_if_log_file_exist,
+    char** errptr);
+
+extern void rocksdb_close(rocksdb_t* db);
+
+extern void rocksdb_put(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr);
+
+extern void rocksdb_delete(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    char** errptr);
+
+extern void rocksdb_merge(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr);
+
+extern void rocksdb_write(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_t* batch,
+    char** errptr);
+
+/* Returns NULL if not found.  A malloc()ed array otherwise.
+   Stores the length of the array in *vallen. */
+extern char* rocksdb_get(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr);
+
+extern rocksdb_iterator_t* rocksdb_create_iterator(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options);
+
+extern const rocksdb_snapshot_t* rocksdb_create_snapshot(
+    rocksdb_t* db);
+
+extern void rocksdb_release_snapshot(
+    rocksdb_t* db,
+    const rocksdb_snapshot_t* snapshot);
+
+/* Returns NULL if property name is unknown.
+   Else returns a pointer to a malloc()-ed null-terminated value. */
+extern char* rocksdb_property_value(
+    rocksdb_t* db,
+    const char* propname);
+
+extern void rocksdb_approximate_sizes(
+    rocksdb_t* db,
+    int num_ranges,
+    const char* const* range_start_key, const size_t* range_start_key_len,
+    const char* const* range_limit_key, const size_t* range_limit_key_len,
+    uint64_t* sizes);
+
+extern void rocksdb_compact_range(
+    rocksdb_t* db,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len);
+
+extern void rocksdb_delete_file(
+    rocksdb_t* db,
+    const char* name);
+
+extern const rocksdb_livefiles_t* rocksdb_livefiles(
+    rocksdb_t* db);
+
+extern void rocksdb_flush(
+    rocksdb_t* db,
+    const rocksdb_flushoptions_t* options,
+    char** errptr);
+
+extern void rocksdb_disable_file_deletions(
+    rocksdb_t* db,
+    char** errptr);
+
+extern void rocksdb_enable_file_deletions(
+    rocksdb_t* db,
+    unsigned char force,
+    char** errptr);
+
+/* Management operations */
+
+extern void rocksdb_destroy_db(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr);
+
+extern void rocksdb_repair_db(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr);
+
+/* Iterator */
+
+extern void rocksdb_iter_destroy(rocksdb_iterator_t*);
+extern unsigned char rocksdb_iter_valid(const rocksdb_iterator_t*);
+extern void rocksdb_iter_seek_to_first(rocksdb_iterator_t*);
+extern void rocksdb_iter_seek_to_last(rocksdb_iterator_t*);
+extern void rocksdb_iter_seek(rocksdb_iterator_t*, const char* k, size_t klen);
+extern void rocksdb_iter_next(rocksdb_iterator_t*);
+extern void rocksdb_iter_prev(rocksdb_iterator_t*);
+extern const char* rocksdb_iter_key(const rocksdb_iterator_t*, size_t* klen);
+extern const char* rocksdb_iter_value(const rocksdb_iterator_t*, size_t* vlen);
+extern void rocksdb_iter_get_error(const rocksdb_iterator_t*, char** errptr);
+
+/* Write batch */
+
+extern rocksdb_writebatch_t* rocksdb_writebatch_create();
+extern void rocksdb_writebatch_destroy(rocksdb_writebatch_t*);
+extern void rocksdb_writebatch_clear(rocksdb_writebatch_t*);
+extern int rocksdb_writebatch_count(rocksdb_writebatch_t*);
+extern void rocksdb_writebatch_put(
+    rocksdb_writebatch_t*,
+    const char* key, size_t klen,
+    const char* val, size_t vlen);
+extern void rocksdb_writebatch_merge(
+    rocksdb_writebatch_t*,
+    const char* key, size_t klen,
+    const char* val, size_t vlen);
+extern void rocksdb_writebatch_delete(
+    rocksdb_writebatch_t*,
+    const char* key, size_t klen);
+extern void rocksdb_writebatch_iterate(
+    rocksdb_writebatch_t*,
+    void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen));
+extern const char* rocksdb_writebatch_data(rocksdb_writebatch_t*, size_t *size);
+
+/* Options */
+
+extern rocksdb_options_t* rocksdb_options_create();
+extern void rocksdb_options_destroy(rocksdb_options_t*);
+extern void rocksdb_options_set_comparator(
+    rocksdb_options_t*,
+    rocksdb_comparator_t*);
+extern void rocksdb_options_set_merge_operator(rocksdb_options_t*,
+                                               rocksdb_mergeoperator_t*);
+extern void rocksdb_options_set_compression_per_level(
+  rocksdb_options_t* opt,
+  int* level_values,
+  size_t num_levels);
+extern void rocksdb_options_set_filter_policy(
+    rocksdb_options_t*,
+    rocksdb_filterpolicy_t*);
+extern void rocksdb_options_set_create_if_missing(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_error_if_exists(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_paranoid_checks(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*);
+extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*);
+extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int);
+extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int);
+extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*);
+extern void rocksdb_options_set_cache_compressed(rocksdb_options_t*, rocksdb_cache_t*);
+extern void rocksdb_options_set_block_size(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_block_restart_interval(rocksdb_options_t*, int);
+extern void rocksdb_options_set_compression_options(
+    rocksdb_options_t*, int, int, int);
+extern void rocksdb_options_set_whole_key_filtering(rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_prefix_extractor(
+    rocksdb_options_t*, rocksdb_slicetransform_t*);
+extern void rocksdb_options_set_num_levels(rocksdb_options_t*, int);
+extern void rocksdb_options_set_level0_file_num_compaction_trigger(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_level0_slowdown_writes_trigger(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_level0_stop_writes_trigger(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_mem_compaction_level(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_target_file_size_base(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_target_file_size_multiplier(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_bytes_for_level_base(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_max_bytes_for_level_multiplier(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_expanded_compaction_factor(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_grandparent_overlap_factor(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
+    rocksdb_options_t*, int* level_values, size_t num_levels);
+extern void rocksdb_options_enable_statistics(rocksdb_options_t*);
+
+extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int);
+extern void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_background_compactions(rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_background_flushes(rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_log_file_size(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_keep_log_file_num(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_soft_rate_limit(rocksdb_options_t*, double);
+extern void rocksdb_options_set_hard_rate_limit(rocksdb_options_t*, double);
+extern void rocksdb_options_set_rate_limit_delay_max_milliseconds(
+    rocksdb_options_t*, unsigned int);
+extern void rocksdb_options_set_max_manifest_file_size(
+    rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_no_block_cache(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_table_cache_numshardbits(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_table_cache_remove_scan_count_limit(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_arena_block_size(
+    rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_use_fsync(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_db_stats_log_interval(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_db_log_dir(
+    rocksdb_options_t*, const char*);
+extern void rocksdb_options_set_wal_dir(
+    rocksdb_options_t*, const char*);
+extern void rocksdb_options_set_WAL_ttl_seconds(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_WAL_size_limit_MB(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_manifest_preallocation_size(
+    rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_purge_redundant_kvs_while_flush(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_allow_os_buffer(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_allow_mmap_reads(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_allow_mmap_writes(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_is_fd_close_on_exec(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_skip_log_error_on_recovery(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_stats_dump_period_sec(
+    rocksdb_options_t*, unsigned int);
+extern void rocksdb_options_set_block_size_deviation(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_advise_random_on_open(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_access_hint_on_compaction_start(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_use_adaptive_mutex(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_bytes_per_sync(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_verify_checksums_in_compaction(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_filter_deletes(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_max_sequential_skip_in_iterations(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int);
+extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int);
+extern void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t*, int);
+extern void rocksdb_options_set_delete_obsolete_files_period_micros(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int);
+extern void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t*);
+extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*);
+extern void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t*, size_t, int32_t, int32_t);
+extern void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_plain_table_factory(rocksdb_options_t*, uint32_t, int, double, size_t);
+
+/* NOTE(review): the two declarations below duplicate
+   rocksdb_options_set_max_bytes_for_level_base and
+   rocksdb_options_set_stats_dump_period_sec, which are already declared
+   earlier in this header.  Redundant declarations are harmless in C,
+   but the duplicates should be removed. */
+extern void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt, uint64_t n);
+extern void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt, unsigned int sec);
+
+extern void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level);
+
+extern void rocksdb_options_set_memtable_prefix_bloom_bits(
+    rocksdb_options_t*, uint32_t);
+extern void rocksdb_options_set_memtable_prefix_bloom_probes(
+    rocksdb_options_t*, uint32_t);
+extern void rocksdb_options_set_max_successive_merges(
+    rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_min_partial_merge_operands(
+    rocksdb_options_t*, uint32_t);
+extern void rocksdb_options_set_bloom_locality(
+    rocksdb_options_t*, uint32_t);
+extern void rocksdb_options_set_allow_thread_local(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_inplace_update_support(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_inplace_update_num_locks(
+    rocksdb_options_t*, size_t);
+
+/* Compression algorithms accepted by rocksdb_options_set_compression().
+   NOTE(review): the values appear to mirror rocksdb::CompressionType --
+   verify they stay in sync with include/rocksdb/options.h. */
+enum {
+  rocksdb_no_compression = 0,
+  rocksdb_snappy_compression = 1,
+  rocksdb_zlib_compression = 2,
+  rocksdb_bz2_compression = 3,
+  rocksdb_lz4_compression = 4,
+  rocksdb_lz4hc_compression = 5
+};
+extern void rocksdb_options_set_compression(rocksdb_options_t*, int);
+
+/* Compaction styles accepted by rocksdb_options_set_compaction_style().
+   NOTE(review): values presumably mirror rocksdb::CompactionStyle --
+   verify against the C++ options header. */
+enum {
+  rocksdb_level_compaction = 0,
+  rocksdb_universal_compaction = 1
+};
+extern void rocksdb_options_set_compaction_style(rocksdb_options_t*, int);
+extern void rocksdb_options_set_universal_compaction_options(rocksdb_options_t*, rocksdb_universal_compaction_options_t*);
+/* Comparator */
+
+extern rocksdb_comparator_t* rocksdb_comparator_create(
+    void* state,
+    void (*destructor)(void*),
+    int (*compare)(
+        void*,
+        const char* a, size_t alen,
+        const char* b, size_t blen),
+    const char* (*name)(void*));
+extern void rocksdb_comparator_destroy(rocksdb_comparator_t*);
+
+/* Filter policy */
+
+extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*create_filter)(
+        void*,
+        const char* const* key_array, const size_t* key_length_array,
+        int num_keys,
+        size_t* filter_length),
+    unsigned char (*key_may_match)(
+        void*,
+        const char* key, size_t length,
+        const char* filter, size_t filter_length),
+    void (*delete_filter)(
+        void*,
+        const char* filter, size_t filter_length),
+    const char* (*name)(void*));
+extern void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t*);
+
+extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(
+    int bits_per_key);
+
+/* Merge Operator */
+
+extern rocksdb_mergeoperator_t* rocksdb_mergeoperator_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*full_merge)(
+        void*,
+        const char* key, size_t key_length,
+        const char* existing_value, size_t existing_value_length,
+        const char* const* operands_list, const size_t* operands_list_length,
+        int num_operands,
+        unsigned char* success, size_t* new_value_length),
+    char* (*partial_merge)(
+        void*,
+        const char* key, size_t key_length,
+        const char* const* operands_list, const size_t* operands_list_length,
+        int num_operands,
+        unsigned char* success, size_t* new_value_length),
+    void (*delete_value)(
+        void*,
+        const char* value, size_t value_length),
+    const char* (*name)(void*));
+extern void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t*);
+
+/* Read options */
+
+extern rocksdb_readoptions_t* rocksdb_readoptions_create();
+extern void rocksdb_readoptions_destroy(rocksdb_readoptions_t*);
+extern void rocksdb_readoptions_set_verify_checksums(
+    rocksdb_readoptions_t*,
+    unsigned char);
+extern void rocksdb_readoptions_set_fill_cache(
+    rocksdb_readoptions_t*, unsigned char);
+extern void rocksdb_readoptions_set_snapshot(
+    rocksdb_readoptions_t*,
+    const rocksdb_snapshot_t*);
+extern void rocksdb_readoptions_set_read_tier(
+    rocksdb_readoptions_t*, int);
+extern void rocksdb_readoptions_set_tailing(
+    rocksdb_readoptions_t*, unsigned char);
+
+/* Write options */
+
+extern rocksdb_writeoptions_t* rocksdb_writeoptions_create();
+extern void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t*);
+extern void rocksdb_writeoptions_set_sync(
+    rocksdb_writeoptions_t*, unsigned char);
+extern void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable);
+
+/* Flush options */
+
+extern rocksdb_flushoptions_t* rocksdb_flushoptions_create();
+extern void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t*);
+extern void rocksdb_flushoptions_set_wait(
+    rocksdb_flushoptions_t*, unsigned char);
+
+/* Cache */
+
+extern rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity);
+extern void rocksdb_cache_destroy(rocksdb_cache_t* cache);
+
+/* Env */
+
+extern rocksdb_env_t* rocksdb_create_default_env();
+extern void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n);
+extern void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n);
+extern void rocksdb_env_destroy(rocksdb_env_t*);
+
+/* SliceTransform */
+
+extern rocksdb_slicetransform_t* rocksdb_slicetransform_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*transform)(
+        void*,
+        const char* key, size_t length,
+        size_t* dst_length),
+    unsigned char (*in_domain)(
+        void*,
+        const char* key, size_t length),
+    unsigned char (*in_range)(
+        void*,
+        const char* key, size_t length),
+    const char* (*name)(void*));
+extern rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t);
+extern void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t*);
+
+/* Universal Compaction options */
+
+enum {
+  rocksdb_similar_size_compaction_stop_style = 0,
+  rocksdb_total_size_compaction_stop_style = 1
+};
+
+extern rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() ;
+extern void rocksdb_universal_compaction_options_set_size_ratio(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_min_merge_width(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_max_merge_width(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_compression_size_percent(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_stop_style(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_destroy(
+  rocksdb_universal_compaction_options_t*);
+
+extern int rocksdb_livefiles_count(
+  const rocksdb_livefiles_t*);
+extern const char* rocksdb_livefiles_name(
+  const rocksdb_livefiles_t*,
+  int index);
+extern int rocksdb_livefiles_level(
+  const rocksdb_livefiles_t*,
+  int index);
+extern size_t rocksdb_livefiles_size(
+  const rocksdb_livefiles_t*,
+  int index);
+extern const char* rocksdb_livefiles_smallestkey(
+  const rocksdb_livefiles_t*,
+  int index,
+  size_t* size);
+extern const char* rocksdb_livefiles_largestkey(
+  const rocksdb_livefiles_t*,
+  int index,
+  size_t* size);
+extern void rocksdb_livefiles_destroy(
+  const rocksdb_livefiles_t*);
+
+#ifdef __cplusplus
+}  /* end extern "C" */
+#endif
+
+#endif  /* STORAGE_ROCKSDB_INCLUDE_C_H_ */
diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h
new file mode 100644 (file)
index 0000000..65d44b6
--- /dev/null
@@ -0,0 +1,140 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Cache is an interface that maps keys to values.  It has internal
+// synchronization and may be safely accessed concurrently from
+// multiple threads.  It may automatically evict entries to make room
+// for new entries.  Values have a specified charge against the cache
+// capacity.  For example, a cache where the values are variable
+// length strings, may use the length of the string as the charge for
+// the string.
+//
+// A builtin cache implementation with a least-recently-used eviction
+// policy is provided.  Clients may use their own implementations if
+// they want something more sophisticated (like scan-resistance, a
+// custom eviction policy, variable cache sizing, etc.)
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_CACHE_H_
+#define STORAGE_ROCKSDB_INCLUDE_CACHE_H_
+
+#include <memory>
+#include <stdint.h>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+using std::shared_ptr;
+
+class Cache;
+
+// Create a new cache with a fixed size capacity. The cache is sharded
+// to 2^numShardBits shards, by hash of the key. The total capacity
+// is divided and evenly assigned to each shard. Inside each shard,
+// eviction is done in two passes: first try to free space by evicting
+// entries that are among the removeScanCountLimit least-recently-used
+// entries and are not referenced by anything other than the cache
+// itself, in least-used order. If not enough space is freed, further
+// free the remaining entries in least-used order.
+//
+// The functions without parameter numShardBits and/or removeScanCountLimit
+// use default values. removeScanCountLimit's default value is 0, which
+// means a strict LRU order inside each shard.
+extern shared_ptr<Cache> NewLRUCache(size_t capacity);
+extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits);
+extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
+                                     int removeScanCountLimit);
+
+class Cache {
+ public:
+  Cache() { }
+
+  // Destroys all existing entries by calling the "deleter"
+  // function that was passed to the constructor.
+  virtual ~Cache();
+
+  // Opaque handle to an entry stored in the cache.
+  struct Handle { };
+
+  // Insert a mapping from key->value into the cache and assign it
+  // the specified charge against the total cache capacity.
+  //
+  // Returns a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  //
+  // When the inserted entry is no longer needed, the key and
+  // value will be passed to "deleter".
+  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value)) = 0;
+
+  // If the cache has no mapping for "key", returns nullptr.
+  //
+  // Else return a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  virtual Handle* Lookup(const Slice& key) = 0;
+
+  // Release a mapping returned by a previous Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void Release(Handle* handle) = 0;
+
+  // Return the value encapsulated in a handle returned by a
+  // successful Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void* Value(Handle* handle) = 0;
+
+  // If the cache contains entry for key, erase it.  Note that the
+  // underlying entry will be kept around until all existing handles
+  // to it have been released.
+  virtual void Erase(const Slice& key) = 0;
+
+  // Return a new numeric id.  May be used by multiple clients who are
+  // sharing the same cache to partition the key space.  Typically the
+  // client will allocate a new id at startup and prepend the id to
+  // its cache keys.
+  virtual uint64_t NewId() = 0;
+
+  // returns the maximum configured capacity of the cache
+  virtual size_t GetCapacity() const = 0;
+
+  // returns the memory size for the entries residing in the cache.
+  virtual size_t GetUsage() const = 0;
+
+  // Call this on shutdown if you want to speed it up. Cache will disown
+  // any underlying data and will not free it on delete. This call will leak
+  // memory - call this only if you're shutting down the process.
+  // Any attempts of using cache after this call will fail terribly.
+  // Always delete the DB object before calling this method!
+  virtual void DisownData() {
+    // default implementation is noop
+  };
+  // NOTE(review): the trailing ';' above is a stray empty declaration
+  // after the function body -- compilers accept it, but it should be
+  // removed.
+
+  // Apply callback to all entries in the cache
+  // If thread_safe is true, it will also lock the accesses. Otherwise, it will
+  // access the cache without the lock held
+  virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                      bool thread_safe) = 0;
+
+ private:
+  // NOTE(review): none of the members below are referenced by this
+  // abstract interface -- they look like leftovers carried over from
+  // LevelDB's cache.h and are candidates for removal (confirm no
+  // implementation relies on them first).
+  void LRU_Remove(Handle* e);
+  void LRU_Append(Handle* e);
+  void Unref(Handle* e);
+
+  struct Rep;
+  Rep* rep_;
+
+  // No copying allowed
+  Cache(const Cache&);
+  void operator=(const Cache&);
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_CACHE_H_
diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h
new file mode 100644 (file)
index 0000000..59b0509
--- /dev/null
@@ -0,0 +1,198 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
+#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
+
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+class Slice;
+class SliceTransform;
+
+// Context information of a compaction run
+// NOTE(review): CompactionFilter::Context (declared below) carries the
+// same two fields; consider consolidating the two structs.
+struct CompactionFilterContext {
+  // Does this compaction run include all data files
+  bool is_full_compaction;
+  // Is this compaction requested by the client (true),
+  // or is it occurring as an automatic compaction process
+  bool is_manual_compaction;
+};
+
+// CompactionFilter allows an application to modify/delete a key-value at
+// the time of compaction.
+
+class CompactionFilter {
+ public:
+  // Context information of a compaction run
+  // NOTE(review): duplicates ::rocksdb::CompactionFilterContext declared
+  // above; consider consolidating the two structs.
+  struct Context {
+    // Does this compaction run include all data files
+    bool is_full_compaction;
+    // Is this compaction requested by the client (true),
+    // or is it occurring as an automatic compaction process
+    bool is_manual_compaction;
+  };
+
+  virtual ~CompactionFilter() {}
+
+  // The compaction process invokes this
+  // method for kv that is being compacted. A return value
+  // of false indicates that the kv should be preserved in the
+  // output of this compaction run and a return value of true
+  // indicates that this key-value should be removed from the
+  // output of the compaction.  The application can inspect
+  // the existing value of the key and make decision based on it.
+  //
+  // When the value is to be preserved, the application has the option
+  // to modify the existing_value and pass it back through new_value.
+  // value_changed needs to be set to true in this case.
+  //
+  // If multithreaded compaction is being used *and* a single CompactionFilter
+  // instance was supplied via Options::compaction_filter, this method may be
+  // called from different threads concurrently.  The application must ensure
+  // that the call is thread-safe.
+  //
+  // If the CompactionFilter was created by a factory, then it will only ever
+  // be used by a single thread that is doing the compaction run, and this
+  // call does not need to be thread-safe.  However, multiple filters may be
+  // in existence and operating concurrently.
+  virtual bool Filter(int level,
+                      const Slice& key,
+                      const Slice& existing_value,
+                      std::string* new_value,
+                      bool* value_changed) const = 0;
+
+  // Returns a name that identifies this compaction filter.
+  // The name will be printed to LOG file on start up for diagnosis.
+  virtual const char* Name() const = 0;
+};
+
+// CompactionFilterV2 that buffers kv pairs sharing the same prefix and let
+// application layer to make individual decisions for all the kv pairs in the
+// buffer.
+class CompactionFilterV2 {
+ public:
+  virtual ~CompactionFilterV2() {}
+
+  // The compaction process invokes this method for all the kv pairs
+  // sharing the same prefix. It is a "roll-up" version of CompactionFilter.
+  //
+  // Each entry in the return vector indicates if the corresponding kv should
+  // be preserved in the output of this compaction run. The application can
+  // inspect the existing values of the keys and make decision based on it.
+  //
+  // When a value is to be preserved, the application has the option
+  // to modify the entry in existing_values and pass it back through an entry
+  // in new_values. A corresponding values_changed entry needs to be set to
+  // true in this case. Note that the new_values vector contains only changed
+  // values, i.e. new_values.size() <= values_changed.size().
+  //
+  typedef std::vector<Slice> SliceVector;
+  virtual std::vector<bool> Filter(int level,
+                                   const SliceVector& keys,
+                                   const SliceVector& existing_values,
+                                   std::vector<std::string>* new_values,
+                                   std::vector<bool>* values_changed)
+    const = 0;
+
+  // Returns a name that identifies this compaction filter.
+  // The name will be printed to LOG file on start up for diagnosis.
+  virtual const char* Name() const = 0;
+};
+
+// Each compaction will create a new CompactionFilter, allowing the
+// application to know about different compactions.
+class CompactionFilterFactory {
+ public:
+  virtual ~CompactionFilterFactory() { }
+
+  // Create the filter to be used for a single compaction run.
+  // Ownership of the returned filter is transferred to the caller
+  // via the unique_ptr.
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) = 0;
+
+  // Returns a name that identifies this compaction filter factory.
+  virtual const char* Name() const = 0;
+};
+
+// Default implementation of CompactionFilterFactory which does not
+// return any filter
+class DefaultCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+  // Returns a null filter, i.e. no key-value pairs are filtered out
+  // during compaction.
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    return std::unique_ptr<CompactionFilter>(nullptr);
+  }
+
+  virtual const char* Name() const override {
+    return "DefaultCompactionFilterFactory";
+  }
+};
+
+// Each compaction will create a new CompactionFilterV2
+//
+// CompactionFilterFactoryV2 enables application to specify a prefix and use
+// CompactionFilterV2 to filter kv-pairs in batches. Each batch contains all
+// the kv-pairs sharing the same prefix.
+//
+// This is useful for applications that require grouping kv-pairs in
+// compaction filter to make a purge/no-purge decision. For example, if the
+// key prefix is user id and the rest of key represents the type of value.
+// This batching filter will come in handy if the application's compaction
+// filter requires knowledge of all types of values for any user id.
+//
+class CompactionFilterFactoryV2 {
+ public:
+  // NOTE: CompactionFilterFactoryV2 will not delete prefix_extractor
+  explicit CompactionFilterFactoryV2(const SliceTransform* prefix_extractor)
+    : prefix_extractor_(prefix_extractor) { }
+
+  virtual ~CompactionFilterFactoryV2() { }
+
+  // Create the batched (V2) filter for a single compaction run.
+  // Ownership of the returned filter is transferred to the caller.
+  virtual std::unique_ptr<CompactionFilterV2> CreateCompactionFilterV2(
+    const CompactionFilterContext& context) = 0;
+
+  // Returns a name that identifies this compaction filter factory.
+  virtual const char* Name() const = 0;
+
+  // Returns the (possibly null) prefix extractor; see constructor NOTE
+  // regarding ownership.
+  const SliceTransform* GetPrefixExtractor() const {
+    return prefix_extractor_;
+  }
+
+  // Replaces the prefix extractor.  As with the constructor, the
+  // factory does not take ownership of the pointer.
+  void SetPrefixExtractor(const SliceTransform* prefix_extractor) {
+    prefix_extractor_ = prefix_extractor;
+  }
+
+ private:
+  // Prefix extractor for compaction filter v2
+  // Keys sharing the same prefix will be buffered internally.
+  // Client can implement a Filter callback function to operate on the buffer
+  const SliceTransform* prefix_extractor_;
+};
+
+// Default implementation of CompactionFilterFactoryV2 which does not
+// return any filter
+class DefaultCompactionFilterFactoryV2 : public CompactionFilterFactoryV2 {
+ public:
+  // NOTE(review): `explicit` on a zero-argument constructor has no
+  // effect and could be dropped.
+  explicit DefaultCompactionFilterFactoryV2()
+      : CompactionFilterFactoryV2(nullptr) { }
+
+  // Returns a null filter, i.e. no key-value pairs are filtered out
+  // during compaction.
+  virtual std::unique_ptr<CompactionFilterV2>
+  CreateCompactionFilterV2(
+      const CompactionFilterContext& context) override {
+    return std::unique_ptr<CompactionFilterV2>(nullptr);
+  }
+
+  virtual const char* Name() const override {
+    return "DefaultCompactionFilterFactoryV2";
+  }
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h
new file mode 100644 (file)
index 0000000..f3a8499
--- /dev/null
@@ -0,0 +1,67 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+// A Comparator object provides a total order across slices that are
+// used as keys in an sstable or a database.  A Comparator implementation
+// must be thread-safe since rocksdb may invoke its methods concurrently
+// from multiple threads.
+class Comparator {
+ public:
+  virtual ~Comparator();
+
+  // Three-way comparison.  Returns value:
+  //   < 0 iff "a" < "b",
+  //   == 0 iff "a" == "b",
+  //   > 0 iff "a" > "b"
+  virtual int Compare(const Slice& a, const Slice& b) const = 0;
+
+  // The name of the comparator.  Used to check for comparator
+  // mismatches (i.e., a DB created with one comparator is
+  // accessed using a different comparator).
+  //
+  // The client of this package should switch to a new name whenever
+  // the comparator implementation changes in a way that will cause
+  // the relative ordering of any two keys to change.
+  //
+  // Names starting with "rocksdb." are reserved and should not be used
+  // by any clients of this package.
+  virtual const char* Name() const = 0;
+
+  // Advanced functions: these are used to reduce the space requirements
+  // for internal data structures like index blocks.
+
+  // If *start < limit, changes *start to a short string in [start,limit).
+  // Simple comparator implementations may return with *start unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const = 0;
+
+  // Changes *key to a short string >= *key.
+  // Simple comparator implementations may return with *key unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortSuccessor(std::string* key) const = 0;
+};
+
+// Return a builtin comparator that uses lexicographic byte-wise
+// ordering.  The result remains the property of this module and
+// must not be deleted.
+extern const Comparator* BytewiseComparator();
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
new file mode 100644 (file)
index 0000000..e743b4c
--- /dev/null
@@ -0,0 +1,495 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_
+#define STORAGE_ROCKSDB_INCLUDE_DB_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <memory>
+#include <vector>
+#include <string>
+#include <unordered_map>
+#include "rocksdb/version.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/transaction_log.h"
+
+namespace rocksdb {
+
+using std::unique_ptr;
+
+class ColumnFamilyHandle {
+ public:
+  virtual ~ColumnFamilyHandle() {}
+};
+extern const std::string kDefaultColumnFamilyName;
+
+struct ColumnFamilyDescriptor {
+  std::string name;
+  ColumnFamilyOptions options;
+  ColumnFamilyDescriptor()
+      : name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {}
+  ColumnFamilyDescriptor(const std::string& _name,
+                         const ColumnFamilyOptions& _options)
+      : name(_name), options(_options) {}
+};
+
+static const int kMajorVersion = __ROCKSDB_MAJOR__;
+static const int kMinorVersion = __ROCKSDB_MINOR__;
+
+struct Options;
+struct ReadOptions;
+struct WriteOptions;
+struct FlushOptions;
+struct TableProperties;
+class WriteBatch;
+class Env;
+
+// Metadata associated with each SST file.
+struct LiveFileMetaData {
+  std::string column_family_name;  // Name of the column family
+  std::string name;                // Name of the file
+  int level;               // Level at which this file resides.
+  size_t size;             // File size in bytes.
+  std::string smallestkey; // Smallest user defined key in the file.
+  std::string largestkey;  // Largest user defined key in the file.
+  SequenceNumber smallest_seqno; // smallest seqno in file
+  SequenceNumber largest_seqno;  // largest seqno in file
+};
+
+// Abstract handle to particular state of a DB.
+// A Snapshot is an immutable object and can therefore be safely
+// accessed from multiple threads without any external synchronization.
+class Snapshot {
+ protected:
+  virtual ~Snapshot();
+};
+
+// A range of keys
+struct Range {
+  Slice start;          // Included in the range
+  Slice limit;          // Not included in the range
+
+  Range() { }
+  Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
+};
+
+// A collections of table properties objects, where
+//  key: is the table's file name.
+//  value: the table properties object of the given table.
+typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
+    TablePropertiesCollection;
+
+// A DB is a persistent ordered map from keys to values.
+// A DB is safe for concurrent access from multiple threads without
+// any external synchronization.
+class DB {
+ public:
+  // Open the database with the specified "name".
+  // Stores a pointer to a heap-allocated database in *dbptr and returns
+  // OK on success.
+  // Stores nullptr in *dbptr and returns a non-OK status on error.
+  // Caller should delete *dbptr when it is no longer needed.
+  static Status Open(const Options& options,
+                     const std::string& name,
+                     DB** dbptr);
+
+  // Open the database for read only. All DB interfaces
+  // that modify data, like put/delete, will return error.
+  // If the db is opened in read only mode, then no compactions
+  // will happen.
+  static Status OpenForReadOnly(const Options& options,
+      const std::string& name, DB** dbptr,
+      bool error_if_log_file_exist = false);
+
+  // Open the database for read only with column families. When opening DB with
+  // read only, you can specify only a subset of column families in the
+  // database that should be opened. However, you always need to specify default
+  // column family. The default column family name is 'default' and it's stored
+  // in rocksdb::kDefaultColumnFamilyName
+  static Status OpenForReadOnly(
+      const DBOptions& db_options, const std::string& name,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+      bool error_if_log_file_exist = false);
+
+  // Open DB with column families.
+  // db_options specify database specific options
+  // column_families is the vector of all column families in the database,
+  // containing column family name and options. You need to open ALL column
+  // families in the database. To get the list of column families, you can use
+  // ListColumnFamilies(). Also, you can open only a subset of column families
+  // for read-only access.
+  // The default column family name is 'default' and it's stored
+  // in rocksdb::kDefaultColumnFamilyName.
+  // If everything is OK, handles will on return be the same size
+  // as column_families --- handles[i] will be a handle that you
+  // will use to operate on column family column_family[i]
+  static Status Open(const DBOptions& db_options, const std::string& name,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
+
+  // ListColumnFamilies will open the DB specified by argument name
+  // and return the list of all column families in that DB
+  // through column_families argument. The ordering of
+  // column families in column_families is unspecified.
+  static Status ListColumnFamilies(const DBOptions& db_options,
+                                   const std::string& name,
+                                   std::vector<std::string>* column_families);
+
+  DB() { }
+  virtual ~DB();
+
+  // Create a column_family and return the handle of column family
+  // through the argument handle.
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family_name,
+                                    ColumnFamilyHandle** handle);
+
+  // Drop a column family specified by column_family handle. This call
+  // only records a drop record in the manifest and prevents the column
+  // family from flushing and compacting.
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
+
+  // Set the database entry for "key" to "value".
+  // Returns OK on success, and a non-OK status on error.
+  // Note: consider setting options.sync = true.
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) = 0;
+  virtual Status Put(const WriteOptions& options, const Slice& key,
+                     const Slice& value) {
+    return Put(options, DefaultColumnFamily(), key, value);
+  }
+
+  // Remove the database entry (if any) for "key".  Returns OK on
+  // success, and a non-OK status on error.  It is not an error if "key"
+  // did not exist in the database.
+  // Note: consider setting options.sync = true.
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) = 0;
+  virtual Status Delete(const WriteOptions& options, const Slice& key) {
+    return Delete(options, DefaultColumnFamily(), key);
+  }
+
+  // Merge the database entry for "key" with "value".  Returns OK on success,
+  // and a non-OK status on error. The semantics of this operation is
+  // determined by the user provided merge_operator when opening DB.
+  // Note: consider setting options.sync = true.
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) = 0;
+  virtual Status Merge(const WriteOptions& options, const Slice& key,
+                       const Slice& value) {
+    return Merge(options, DefaultColumnFamily(), key, value);
+  }
+
+  // Apply the specified updates to the database.
+  // Returns OK on success, non-OK on failure.
+  // Note: consider setting options.sync = true.
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
+
+  // If the database contains an entry for "key" store the
+  // corresponding value in *value and return OK.
+  //
+  // If there is no entry for "key" leave *value unchanged and return
+  // a status for which Status::IsNotFound() returns true.
+  //
+  // May return some other Status on an error.
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     std::string* value) = 0;
+  virtual Status Get(const ReadOptions& options, const Slice& key, std::string* value) {
+    return Get(options, DefaultColumnFamily(), key, value);
+  }
+
+  // If keys[i] does not exist in the database, then the i'th returned
+  // status will be one for which Status::IsNotFound() is true, and
+  // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
+  // the i'th returned status will have Status::ok() true, and (*values)[i]
+  // will store the value associated with keys[i].
+  //
+  // (*values) will always be resized to be the same size as (keys).
+  // Similarly, the number of returned statuses will be the number of keys.
+  // Note: keys will not be "de-duplicated". Duplicate keys will return
+  // duplicate values in order.
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values) {
+    return MultiGet(options, std::vector<ColumnFamilyHandle*>(
+                                 keys.size(), DefaultColumnFamily()),
+                    keys, values);
+  }
+
+  // If the key definitely does not exist in the database, then this method
+  // returns false, else true. If the caller wants to obtain value when the key
+  // is found in memory, a bool for 'value_found' must be passed. 'value_found'
+  // will be true on return if value has been set properly.
+  // This check is potentially lighter-weight than invoking DB::Get(). One way
+  // to make this lighter weight is to avoid doing any IOs.
+  // Default implementation here returns true and sets 'value_found' to false
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value, bool* value_found = nullptr) {
+    if (value_found != nullptr) {
+      *value_found = false;
+    }
+    return true;
+  }
+  virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
+                           std::string* value, bool* value_found = nullptr) {
+    return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found);
+  }
+
+  // Return a heap-allocated iterator over the contents of the database.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  //
+  // Caller should delete the iterator when it is no longer needed.
+  // The returned iterator should be deleted before this db is deleted.
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) = 0;
+  virtual Iterator* NewIterator(const ReadOptions& options) {
+    return NewIterator(options, DefaultColumnFamily());
+  }
+  // Returns iterators from a consistent database state across multiple
+  // column families. Iterators are heap allocated and need to be deleted
+  // before the db is deleted
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) = 0;
+
+  // Return a handle to the current DB state.  Iterators created with
+  // this handle will all observe a stable snapshot of the current DB
+  // state.  The caller must call ReleaseSnapshot(result) when the
+  // snapshot is no longer needed.
+  //
+  // nullptr will be returned if the DB fails to take a snapshot or does
+  // not support snapshot.
+  virtual const Snapshot* GetSnapshot() = 0;
+
+  // Release a previously acquired snapshot.  The caller must not
+  // use "snapshot" after this call.
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
+
+  // DB implementations can export properties about their state
+  // via this method.  If "property" is a valid property understood by this
+  // DB implementation, fills "*value" with its current value and returns
+  // true.  Otherwise returns false.
+  //
+  //
+  // Valid property names include:
+  //
+  //  "rocksdb.num-files-at-level<N>" - return the number of files at level <N>,
+  //     where <N> is an ASCII representation of a level number (e.g. "0").
+  //  "rocksdb.stats" - returns a multi-line string that describes statistics
+  //     about the internal operation of the DB.
+  //  "rocksdb.sstables" - returns a multi-line string that describes all
+  //     of the sstables that make up the db contents.
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property, std::string* value) = 0;
+  virtual bool GetProperty(const Slice& property, std::string* value) {
+    return GetProperty(DefaultColumnFamily(), property, value);
+  }
+
+  // For each i in [0,n-1], store in "sizes[i]", the approximate
+  // file system space used by keys in "[range[i].start .. range[i].limit)".
+  //
+  // Note that the returned sizes measure file system space usage, so
+  // if the user data compresses by a factor of ten, the returned
+  // sizes will be one-tenth the size of the corresponding user data size.
+  //
+  // The results may not include the sizes of recently written data.
+  virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                   const Range* range, int n,
+                                   uint64_t* sizes) = 0;
+  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes) {
+    GetApproximateSizes(DefaultColumnFamily(), range, n, sizes);
+  }
+
+  // Compact the underlying storage for the key range [*begin,*end].
+  // The actual compaction interval might be superset of [*begin, *end].
+  // In particular, deleted and overwritten versions are discarded,
+  // and the data is rearranged to reduce the cost of operations
+  // needed to access the data.  This operation should typically only
+  // be invoked by users who understand the underlying implementation.
+  //
+  // begin==nullptr is treated as a key before all keys in the database.
+  // end==nullptr is treated as a key after all keys in the database.
+  // Therefore the following call will compact the entire database:
+  //    db->CompactRange(nullptr, nullptr);
+  // Note that after the entire database is compacted, all data are pushed
+  // down to the last level containing any data. If the total data size
+  // after compaction is reduced, that level might not be appropriate for
+  // hosting all the files. In this case, client could set reduce_level
+  // to true, to move the files back to the minimum level capable of holding
+  // the data set or a given level (specified by non-negative target_level).
+  virtual Status CompactRange(ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end,
+                              bool reduce_level = false,
+                              int target_level = -1) = 0;
+  virtual Status CompactRange(const Slice* begin, const Slice* end,
+                              bool reduce_level = false,
+                              int target_level = -1) {
+    return CompactRange(DefaultColumnFamily(), begin, end, reduce_level,
+                        target_level);
+  }
+
+  // Number of levels used for this DB.
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
+  virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
+
+  // Maximum level to which a new compacted memtable is pushed if it
+  // does not create overlap.
+  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
+  virtual int MaxMemCompactionLevel() {
+    return MaxMemCompactionLevel(DefaultColumnFamily());
+  }
+
+  // Number of files in level-0 that would stop writes.
+  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
+  virtual int Level0StopWriteTrigger() {
+    return Level0StopWriteTrigger(DefaultColumnFamily());
+  }
+
+  // Get DB name -- the exact same name that was provided as an argument to
+  // DB::Open()
+  virtual const std::string& GetName() const = 0;
+
+  // Get Env object from the DB
+  virtual Env* GetEnv() const = 0;
+
+  // Get DB Options that we use
+  virtual const Options& GetOptions(ColumnFamilyHandle* column_family)
+      const = 0;
+  virtual const Options& GetOptions() const {
+    return GetOptions(DefaultColumnFamily());
+  }
+
+  // Flush all mem-table data.
+  virtual Status Flush(const FlushOptions& options,
+                       ColumnFamilyHandle* column_family) = 0;
+  virtual Status Flush(const FlushOptions& options) {
+    return Flush(options, DefaultColumnFamily());
+  }
+
+  // The sequence number of the most recent transaction.
+  virtual SequenceNumber GetLatestSequenceNumber() const = 0;
+
+#ifndef ROCKSDB_LITE
+
+  // Prevent file deletions. Compactions will continue to occur,
+  // but no obsolete files will be deleted. Calling this multiple
+  // times have the same effect as calling it once.
+  virtual Status DisableFileDeletions() = 0;
+
+  // Allow compactions to delete obsolete files.
+  // If force == true, the call to EnableFileDeletions() will guarantee that
+  // file deletions are enabled after the call, even if DisableFileDeletions()
+  // was called multiple times before.
+  // If force == false, EnableFileDeletions will only enable file deletion
+  // after it's been called at least as many times as DisableFileDeletions(),
+  // enabling the two methods to be called by two threads concurrently without
+  // synchronization -- i.e., file deletions will be enabled only after both
+  // threads call EnableFileDeletions()
+  virtual Status EnableFileDeletions(bool force = true) = 0;
+
+  // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup
+
+  // THIS METHOD IS DEPRECATED. Use the GetLiveFilesMetaData to get more
+  // detailed information on the live files.
+  // Retrieve the list of all files in the database. The files are
+  // relative to the dbname and are not absolute paths. The valid size of the
+  // manifest file is returned in manifest_file_size. The manifest file is an
+  // ever growing file, but only the portion specified by manifest_file_size is
+  // valid for this snapshot.
+  // Setting flush_memtable to true does Flush before recording the live files.
+  // Setting flush_memtable to false is useful when we don't want to wait for
+  // flush which may have to wait for compaction to complete taking an
+  // indeterminate time.
+  //
+  // In case you have multiple column families, even if flush_memtable is true,
+  // you still need to call GetSortedWalFiles after GetLiveFiles to compensate
+  // for new data that arrived to already-flushed column families while other
+  // column families were flushing
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) = 0;
+
+  // Retrieve the sorted list of all wal files with earliest file first
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
+
+  // Sets iter to an iterator that is positioned at a write-batch containing
+  // seq_number. If the sequence number is non existent, it returns an iterator
+  // at the first available seq_no after the requested seq_no
+  // Returns Status::OK if iterator is valid
+  // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
+  // use this api, else the WAL files will get
+  // cleared aggressively and the iterator might keep getting invalid before
+  // an update is read.
+  virtual Status GetUpdatesSince(
+      SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions&
+          read_options = TransactionLogIterator::ReadOptions()) = 0;
+
+  // Delete the file name from the db directory and update the internal state to
+  // reflect that. Supports deletion of sst and log files only. 'name' must be
+  // path relative to the db directory. eg. 000001.sst, /archive/000003.log
+  virtual Status DeleteFile(std::string name) = 0;
+
+  // Returns a list of all table files with their level, start key
+  // and end key
+  virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {}
+
+#endif  // ROCKSDB_LITE
+
+  // Sets the globally unique ID created at database creation time by invoking
+  // Env::GenerateUniqueId(), in identity. Returns Status::OK if identity could
+  // be set properly
+  virtual Status GetDbIdentity(std::string& identity) = 0;
+
+  // Returns default column family handle
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
+
+#ifndef ROCKSDB_LITE
+  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+                                          TablePropertiesCollection* props) = 0;
+  virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+    return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
+  }
+#endif  // ROCKSDB_LITE
+
+ private:
+  // No copying allowed
+  DB(const DB&);
+  void operator=(const DB&);
+};
+
+// Destroy the contents of the specified database.
+// Be very careful using this method.
+Status DestroyDB(const std::string& name, const Options& options);
+
+#ifndef ROCKSDB_LITE
+// If a DB cannot be opened, you may attempt to call this method to
+// resurrect as much of the contents of the database as possible.
+// Some data may be lost, so be careful when calling this function
+// on a database that contains important information.
+Status RepairDB(const std::string& dbname, const Options& options);
+#endif
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_DB_H_
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
new file mode 100644 (file)
index 0000000..6a96351
--- /dev/null
@@ -0,0 +1,772 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the rocksdb implementation to access
+// operating system functionality like the filesystem etc.  Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine-grained control; e.g., to rate limit file system operations.
+//
+// All Env implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_
+#define STORAGE_ROCKSDB_INCLUDE_ENV_H_
+
+#include <cstdarg>
+#include <string>
+#include <memory>
+#include <vector>
+#include <stdint.h>
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class FileLock;
+class Logger;
+class RandomAccessFile;
+class SequentialFile;
+class Slice;
+class WritableFile;
+class RandomRWFile;
+class Directory;
+struct DBOptions;
+
+using std::unique_ptr;
+using std::shared_ptr;
+
+
+// Options while opening a file to read/write
+struct EnvOptions {
+
+  // construct with default Options
+  EnvOptions();
+
+  // construct from Options
+  explicit EnvOptions(const DBOptions& options);
+
+  // If true, then allow caching of data in environment buffers
+  bool use_os_buffer = true;
+
+   // If true, then use mmap to read data
+  bool use_mmap_reads = false;
+
+   // If true, then use mmap to write data
+  bool use_mmap_writes = true;
+
+  // If true, set the FD_CLOEXEC on open fd.
+  bool set_fd_cloexec = true;
+
+  // Allows OS to incrementally sync files to disk while they are being
+  // written, in the background. Issue one request for every bytes_per_sync
+  // written. 0 turns it off.
+  // Default: 0
+  uint64_t bytes_per_sync = 0;
+
+  // If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which
+  // means that file size won't change as part of preallocation.
+  // If false, preallocation will also change the file size. This option will
+  // improve the performance in workloads where you sync the data on every
+  // write. By default, we set it to true for MANIFEST writes and false for
+  // WAL writes
+  bool fallocate_with_keep_size = true;
+};
+
+class Env {
+ public:
+  Env() { }
+  virtual ~Env();
+
+  // Return a default environment suitable for the current operating
+  // system.  Sophisticated users may wish to provide their own Env
+  // implementation instead of relying on this default environment.
+  //
+  // The result of Default() belongs to rocksdb and must never be deleted.
+  static Env* Default();
+
+  // Create a brand new sequentially-readable file with the specified name.
+  // On success, stores a pointer to the new file in *result and returns OK.
+  // On failure stores nullptr in *result and returns non-OK.  If the file does
+  // not exist, returns a non-OK status.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options)
+                                   = 0;
+
+  // Create a brand new random access read-only file with the
+  // specified name.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.  If the file does not exist, returns a non-OK
+  // status.
+  //
+  // The returned file may be concurrently accessed by multiple threads.
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options)
+                                     = 0;
+
+  // Create an object that writes to a new file with the specified
+  // name.  Deletes any existing file with the same name and creates a
+  // new file.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) = 0;
+
+  // Create an object that both reads and writes to a file on
+  // specified offsets (random access). If file already exists,
+  // does not overwrite it. On success, stores a pointer to the
+  // new file in *result and returns OK. On failure stores nullptr
+  // in *result and returns non-OK.
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options) = 0;
+
+  // Create an object that represents a directory. Will fail if directory
+  // doesn't exist. If the directory exists, it will open the directory
+  // and create a new Directory object.
+  //
+  // On success, stores a pointer to the new Directory in
+  // *result and returns OK. On failure stores nullptr in *result and
+  // returns non-OK.
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result) = 0;
+
+  // Returns true iff the named file exists.
+  virtual bool FileExists(const std::string& fname) = 0;
+
+  // Store in *result the names of the children of the specified directory.
+  // The names are relative to "dir".
+  // Original contents of *results are dropped.
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) = 0;
+
+  // Delete the named file.
+  virtual Status DeleteFile(const std::string& fname) = 0;
+
+  // Create the specified directory. Returns error if directory exists.
+  virtual Status CreateDir(const std::string& dirname) = 0;
+
+  // Creates directory if missing. Returns OK if it exists, or if it was
+  // successfully created.
+  virtual Status CreateDirIfMissing(const std::string& dirname) = 0;
+
+  // Delete the specified directory.
+  virtual Status DeleteDir(const std::string& dirname) = 0;
+
+  // Store the size of fname in *file_size.
+  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
+
+  // Store the last modification time of fname in *file_mtime.
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime) = 0;
+  // Rename file src to target.
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) = 0;
+
+  // Lock the specified file.  Used to prevent concurrent access to
+  // the same db by multiple processes.  On failure, stores nullptr in
+  // *lock and returns non-OK.
+  //
+  // On success, stores a pointer to the object that represents the
+  // acquired lock in *lock and returns OK.  The caller should call
+  // UnlockFile(*lock) to release the lock.  If the process exits,
+  // the lock will be automatically released.
+  //
+  // If somebody else already holds the lock, finishes immediately
+  // with a failure.  I.e., this call does not wait for existing locks
+  // to go away.
+  //
+  // May create the named file if it does not already exist.
+  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
+
+  // Release the lock acquired by a previous successful call to LockFile.
+  // REQUIRES: lock was returned by a successful LockFile() call
+  // REQUIRES: lock has not already been unlocked.
+  virtual Status UnlockFile(FileLock* lock) = 0;
+
+  enum Priority { LOW, HIGH, TOTAL };
+
+  // Arrange to run "(*function)(arg)" once in a background thread, in
+  // the thread pool specified by pri. By default, jobs go to the 'LOW'
+  // priority thread pool.
+
+  // "function" may run in an unspecified thread.  Multiple functions
+  // added to the same Env may run concurrently in different threads.
+  // I.e., the caller may not assume that background work items are
+  // serialized.
+  virtual void Schedule(
+      void (*function)(void* arg),
+      void* arg,
+      Priority pri = LOW) = 0;
+
+  // Start a new thread, invoking "function(arg)" within the new thread.
+  // When "function(arg)" returns, the thread will be destroyed.
+  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
+
+  // Wait for all threads started by StartThread to terminate.
+  virtual void WaitForJoin() {}
+
+  // Get thread pool queue length for specific thread pool.
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
+    return 0;
+  }
+
+  // *path is set to a temporary directory that can be used for testing. It may
+  // or may not have just been created. The directory may or may not differ
+  // between runs of the same process, but subsequent calls will return the
+  // same directory.
+  virtual Status GetTestDirectory(std::string* path) = 0;
+
+  // Create and return a log file for storing informational messages.
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result) = 0;
+
+  // Returns the number of micro-seconds since some fixed point in time. Only
+  // useful for computing deltas of time.
+  virtual uint64_t NowMicros() = 0;
+
+  // Returns the number of nano-seconds since some fixed point in time. Only
+  // useful for computing deltas of time in one run.
+  // Default implementation simply relies on NowMicros
+  virtual uint64_t NowNanos() {
+    return NowMicros() * 1000;
+  }
+
+  // Sleep/delay the thread for the prescribed number of micro-seconds.
+  virtual void SleepForMicroseconds(int micros) = 0;
+
+  // Get the current host name.
+  virtual Status GetHostName(char* name, uint64_t len) = 0;
+
+  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+  virtual Status GetCurrentTime(int64_t* unix_time) = 0;
+
+  // Get full directory name for this db.
+  virtual Status GetAbsolutePath(const std::string& db_path,
+      std::string* output_path) = 0;
+
+  // The number of background worker threads of a specific thread pool
+  // for this environment. 'LOW' is the default pool.
+  // default number: 1
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;
+
+  // Converts seconds-since-Jan-01-1970 to a printable string
+  virtual std::string TimeToString(uint64_t time) = 0;
+
+  // Generates a unique id that can be used to identify a db
+  virtual std::string GenerateUniqueId();
+
+  // OptimizeForLogWrite will create a new EnvOptions object that is a copy of
+  // the EnvOptions in the parameters, but is optimized for writing log files.
+  // Default implementation returns the copy of the same object.
+  virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const;
+  // OptimizeForManifestWrite will create a new EnvOptions object that is a copy
+  // of the EnvOptions in the parameters, but is optimized for writing manifest
+  // files. Default implementation returns the copy of the same object.
+  virtual EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options)
+      const;
+
+ private:
+  // No copying allowed
+  Env(const Env&);
+  void operator=(const Env&);
+};
+
+// A file abstraction for reading sequentially through a file
+class SequentialFile {
+ public:
+  SequentialFile() { }
+  virtual ~SequentialFile();
+
+  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
+  // written by this routine.  Sets "*result" to the data that was
+  // read (including if fewer than "n" bytes were successfully read).
+  // May set "*result" to point at data in "scratch[0..n-1]", so
+  // "scratch[0..n-1]" must be live when "*result" is used.
+  // If an error was encountered, returns a non-OK status.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
+
+  // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower than reading the same data, but may be faster.
+  //
+  // If end of file is reached, skipping will stop at the end of the
+  // file, and Skip will return OK.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Skip(uint64_t n) = 0;
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class RandomAccessFile {
+ public:
+  RandomAccessFile() { }
+  virtual ~RandomAccessFile();
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used.  If an error was encountered, returns a non-OK
+  // status.
+  //
+  // Safe for concurrent use by multiple threads.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+
+  // Tries to get an unique ID for this file that will be the same each time
+  // the file is opened (and will stay the same while the file is open).
+  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+  // ID can be created this function returns the length of the ID and places it
+  // in "id"; otherwise, this function returns 0, in which case "id"
+  // may not have been modified.
+  //
+  // This function guarantees, for IDs from a given environment, two unique ids
+  // cannot be made equal to each other by adding arbitrary bytes to one of
+  // them. That is, no unique ID is the prefix of another.
+  //
+  // This function guarantees that the returned ID will not be interpretable as
+  // a single varint.
+  //
+  // Note: these IDs are only valid for the duration of the process.
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return 0; // Default implementation to prevent issues with backwards
+              // compatibility.
+  };  // NOTE(review): stray ';' after function body -- harmless, but remove it to silence -pedantic
+
+
+  enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
+
+  virtual void Hint(AccessPattern pattern) {}
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+};
+
+// A file abstraction for sequential writing.  The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class WritableFile {
+ public:
+  WritableFile() : last_preallocated_block_(0), preallocation_block_size_ (0) {
+  }
+  virtual ~WritableFile();
+
+  virtual Status Append(const Slice& data) = 0;
+  virtual Status Close() = 0;
+  virtual Status Flush() = 0;
+  virtual Status Sync() = 0; // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual Status Fsync() {
+    return Sync();
+  }
+
+  /*
+   * Get the size of valid data in the file.
+   */
+  virtual uint64_t GetFileSize() {
+    return 0;
+  }
+
+  /*
+   * Get and set the default pre-allocation block size for writes to
+   * this file.  If non-zero, then Allocate will be used to extend the
+   * underlying storage of a file (generally via fallocate) if the Env
+   * instance supports it.
+   */
+  void SetPreallocationBlockSize(size_t size) {
+    preallocation_block_size_ = size;
+  }
+
+  virtual void GetPreallocationStatus(size_t* block_size,
+                                      size_t* last_allocated_block) {
+    *last_allocated_block = last_preallocated_block_;
+    *block_size = preallocation_block_size_;
+  }
+
+  // For documentation, refer to RandomAccessFile::GetUniqueId()
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return 0; // Default implementation to prevent issues with backwards
+              // compatibility.
+  }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  // This call has no effect on dirty pages in the cache.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+
+ protected:
+  // PrepareWrite performs any necessary preparation for a write
+  // before the write actually occurs.  This allows for pre-allocation
+  // of space on devices where it can result in less file
+  // fragmentation and/or less waste from over-zealous filesystem
+  // pre-allocation.
+  void PrepareWrite(size_t offset, size_t len) {
+    if (preallocation_block_size_ == 0) {
+      return;
+    }
+    // If this write would cross one or more preallocation blocks,
+    // determine what the last preallocation block necessary to
+    // cover this write would be and Allocate to that point.
+    const auto block_size = preallocation_block_size_;
+    size_t new_last_preallocated_block =
+      (offset + len + block_size - 1) / block_size;
+    if (new_last_preallocated_block > last_preallocated_block_) {
+      size_t num_spanned_blocks =
+        new_last_preallocated_block - last_preallocated_block_;
+      Allocate(block_size * last_preallocated_block_,
+               block_size * num_spanned_blocks);
+      last_preallocated_block_ = new_last_preallocated_block;
+    }
+  }
+
+  /*
+   * Pre-allocate space for a file.  Default implementation is a no-op.
+   */
+  virtual Status Allocate(off_t offset, off_t len) {
+    return Status::OK();
+  }
+
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // Default implementation does nothing.
+  virtual Status RangeSync(off_t offset, off_t nbytes) {
+    return Status::OK();
+  }
+
+ private:
+  size_t last_preallocated_block_;
+  size_t preallocation_block_size_;
+  // No copying allowed
+  WritableFile(const WritableFile&);
+  void operator=(const WritableFile&);
+};
+
+// A file abstraction for random reading and writing.
+class RandomRWFile {
+ public:
+  RandomRWFile() {}
+  virtual ~RandomRWFile() {}
+
+  // Write data from Slice data to file starting from offset
+  // Returns IOError on failure, but does not guarantee
+  // atomicity of a write.  Returns OK status on success.
+  //
+  // Safe for concurrent use.
+  virtual Status Write(uint64_t offset, const Slice& data) = 0;
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used.  If an error was encountered, returns a non-OK
+  // status.
+  //
+  // Safe for concurrent use by multiple threads.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+  virtual Status Close() = 0; // closes the file
+  virtual Status Sync() = 0; // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual Status Fsync() {
+    return Sync();
+  }
+
+  /*
+   * Pre-allocate space for a file.  Default implementation is a no-op.
+   */
+  virtual Status Allocate(off_t offset, off_t len) {
+    return Status::OK();
+  }
+
+ private:
+  // No copying allowed
+  RandomRWFile(const RandomRWFile&);
+  void operator=(const RandomRWFile&);
+};
+
+// Directory object represents collection of files and implements
+// filesystem operations that can be executed on directories.
+class Directory {
+ public:
+  virtual ~Directory() {}
+  // Fsync the directory to make its metadata (e.g. newly created files) durable.
+  virtual Status Fsync() = 0;
+};
+
+enum InfoLogLevel : unsigned char {  // log-message severities, in ascending order
+  DEBUG_LEVEL = 0,
+  INFO_LEVEL,
+  WARN_LEVEL,
+  ERROR_LEVEL,
+  FATAL_LEVEL,
+  NUM_INFO_LOG_LEVELS,
+};
+
+// An interface for writing log messages.
+class Logger {
+ public:
+  enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 };  // sentinel returned by GetLogFileSize()
+  explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL)
+      : log_level_(log_level) {}
+  virtual ~Logger();
+
+  // Write an entry to the log file with the specified format.
+  virtual void Logv(const char* format, va_list ap) = 0;
+
+  // Write an entry to the log file with the specified log level
+  // and format.  Any log with level under the internal log level
+  // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
+  // printed.
+  void Logv(const InfoLogLevel log_level, const char* format, va_list ap) {
+    static const char* kInfoLogLevelNames[5] = {"DEBUG", "INFO", "WARN",
+                                                "ERROR", "FATAL"};
+    if (log_level < log_level_) {
+      return;
+    }
+
+    if (log_level == InfoLogLevel::INFO_LEVEL) {
+      // Doesn't print log level if it is INFO level.
+      // This is to avoid unexpected performance regression after we add
+      // the feature of log level. All the logs before we add the feature
+      // are INFO level. We don't want to add extra costs to those existing
+      // logging.
+      Logv(format, ap);
+    } else {
+      char new_format[500];
+      snprintf(new_format, sizeof(new_format) - 1, "[%s] %s",  // snprintf null-terminates; long formats are truncated
+               kInfoLogLevelNames[log_level], format);
+      Logv(new_format, ap);
+    }
+  }
+  virtual size_t GetLogFileSize() const {
+    return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE;  // NOTE(review): -1 converts to SIZE_MAX because the return type is size_t; compare against the sentinel, not < 0
+  }
+  // Flush to the OS buffers
+  virtual void Flush() {}
+  virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; }
+  virtual void SetInfoLogLevel(const InfoLogLevel log_level) {
+    log_level_ = log_level;
+  }
+
+ private:
+  // No copying allowed
+  Logger(const Logger&);
+  void operator=(const Logger&);
+  InfoLogLevel log_level_;
+};
+
+
+// Identifies a locked file.  Instances are created by Env::LockFile()
+// and released by Env::UnlockFile().
+class FileLock {
+ public:
+  FileLock() { }
+  virtual ~FileLock();
+ private:
+  // No copying allowed
+  FileLock(const FileLock&);
+  void operator=(const FileLock&);
+};
+
+extern void LogFlush(const shared_ptr<Logger>& info_log);
+
+extern void Log(const InfoLogLevel log_level,
+                const shared_ptr<Logger>& info_log, const char* format, ...);
+
+// a set of log functions with different log levels.
+extern void Debug(const shared_ptr<Logger>& info_log, const char* format, ...);
+extern void Info(const shared_ptr<Logger>& info_log, const char* format, ...);
+extern void Warn(const shared_ptr<Logger>& info_log, const char* format, ...);
+extern void Error(const shared_ptr<Logger>& info_log, const char* format, ...);
+extern void Fatal(const shared_ptr<Logger>& info_log, const char* format, ...);
+
+// Log the specified data to *info_log if info_log is non-nullptr.
+// The default info log level is InfoLogLevel::ERROR_LEVEL.
+extern void Log(const shared_ptr<Logger>& info_log, const char* format, ...)
+#   if defined(__GNUC__) || defined(__clang__)
+    __attribute__((__format__ (__printf__, 2, 3)))
+#   endif
+    ;
+
+extern void LogFlush(Logger *info_log);
+
+extern void Log(const InfoLogLevel log_level, Logger* info_log,
+                const char* format, ...);
+
+// The default info log level is InfoLogLevel::ERROR_LEVEL.
+extern void Log(Logger* info_log, const char* format, ...)
+#   if defined(__GNUC__) || defined(__clang__)
+    __attribute__((__format__ (__printf__, 2, 3)))
+#   endif
+    ;
+
+// a set of log functions with different log levels.
+extern void Debug(Logger* info_log, const char* format, ...);
+extern void Info(Logger* info_log, const char* format, ...);
+extern void Warn(Logger* info_log, const char* format, ...);
+extern void Error(Logger* info_log, const char* format, ...);
+extern void Fatal(Logger* info_log, const char* format, ...);
+
+// A utility routine: write "data" to the named file.
+extern Status WriteStringToFile(Env* env, const Slice& data,
+                                const std::string& fname,
+                                bool should_sync = false);
+
+// A utility routine: read contents of named file into *data
+extern Status ReadFileToString(Env* env, const std::string& fname,
+                               std::string* data);
+
+// An implementation of Env that forwards all calls to another Env.
+// May be useful to clients who wish to override just part of the
+// functionality of another Env.
+class EnvWrapper : public Env {
+ public:
+  // Initialize an EnvWrapper that delegates all calls to *t
+  explicit EnvWrapper(Env* t) : target_(t) { }
+  virtual ~EnvWrapper();
+
+  // Return the target to which this Env forwards all calls
+  Env* target() const { return target_; }
+
+  // Boilerplate forwarding every method to target().  NOTE(review): the 'virtual' keyword is applied inconsistently below; all of these are overrides of Env's virtuals.
+  Status NewSequentialFile(const std::string& f,
+                           unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) {
+    return target_->NewSequentialFile(f, r, options);
+  }
+  Status NewRandomAccessFile(const std::string& f,
+                             unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& options) {
+    return target_->NewRandomAccessFile(f, r, options);
+  }
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) {
+    return target_->NewWritableFile(f, r, options);
+  }
+  Status NewRandomRWFile(const std::string& f, unique_ptr<RandomRWFile>* r,
+                         const EnvOptions& options) {
+    return target_->NewRandomRWFile(f, r, options);
+  }
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result) {
+    return target_->NewDirectory(name, result);
+  }
+  bool FileExists(const std::string& f) { return target_->FileExists(f); }
+  Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
+    return target_->GetChildren(dir, r);
+  }
+  Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
+  Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
+  Status CreateDirIfMissing(const std::string& d) {
+    return target_->CreateDirIfMissing(d);
+  }
+  Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
+  Status GetFileSize(const std::string& f, uint64_t* s) {
+    return target_->GetFileSize(f, s);
+  }
+
+  Status GetFileModificationTime(const std::string& fname,
+                                 uint64_t* file_mtime) {
+    return target_->GetFileModificationTime(fname, file_mtime);
+  }
+
+  Status RenameFile(const std::string& s, const std::string& t) {
+    return target_->RenameFile(s, t);
+  }
+  Status LockFile(const std::string& f, FileLock** l) {
+    return target_->LockFile(f, l);
+  }
+  Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
+  void Schedule(void (*f)(void*), void* a, Priority pri) {
+    return target_->Schedule(f, a, pri);
+  }
+  void StartThread(void (*f)(void*), void* a) {
+    return target_->StartThread(f, a);
+  }
+  void WaitForJoin() { return target_->WaitForJoin(); }
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
+    return target_->GetThreadPoolQueueLen(pri);
+  }
+  virtual Status GetTestDirectory(std::string* path) {
+    return target_->GetTestDirectory(path);
+  }
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result) {
+    return target_->NewLogger(fname, result);
+  }
+  uint64_t NowMicros() {
+    return target_->NowMicros();
+  }
+  void SleepForMicroseconds(int micros) {
+    target_->SleepForMicroseconds(micros);
+  }
+  Status GetHostName(char* name, uint64_t len) {
+    return target_->GetHostName(name, len);
+  }
+  Status GetCurrentTime(int64_t* unix_time) {
+    return target_->GetCurrentTime(unix_time);
+  }
+  Status GetAbsolutePath(const std::string& db_path,
+      std::string* output_path) {
+    return target_->GetAbsolutePath(db_path, output_path);
+  }
+  void SetBackgroundThreads(int num, Priority pri) {
+    return target_->SetBackgroundThreads(num, pri);
+  }
+  std::string TimeToString(uint64_t time) {
+    return target_->TimeToString(time);
+  }
+
+ private:
+  Env* target_;
+};
+
+// Returns a new environment that stores its data in memory and delegates
+// all non-file-storage tasks to base_env. The caller must delete the result
+// when it is no longer needed.
+// *base_env must remain live while the result is in use.
+Env* NewMemEnv(Env* base_env);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_ENV_H_
diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h
new file mode 100644 (file)
index 0000000..fa44db4
--- /dev/null
@@ -0,0 +1,74 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A database can be configured with a custom FilterPolicy object.
+// This object is responsible for creating a small filter from a set
+// of keys.  These filters are stored in rocksdb and are consulted
+// automatically by rocksdb to decide whether or not to read some
+// information from disk. In many cases, a filter can cut down the
+// number of disk seeks from a handful to a single disk seek per
+// DB::Get() call.
+//
+// Most people will want to use the builtin bloom filter support (see
+// NewBloomFilterPolicy() below).
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
+#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+class FilterPolicy {
+ public:
+  virtual ~FilterPolicy();  // virtual: instances are deleted through base-class pointers
+
+  // Return the name of this policy.  Note that if the filter encoding
+  // changes in an incompatible way, the name returned by this method
+  // must be changed.  Otherwise, old incompatible filters may be
+  // passed to methods of this type.
+  virtual const char* Name() const = 0;
+
+  // keys[0,n-1] contains a list of keys (potentially with duplicates)
+  // that are ordered according to the user supplied comparator.
+  // Append a filter that summarizes keys[0,n-1] to *dst.
+  //
+  // Warning: do not change the initial contents of *dst.  Instead,
+  // append the newly constructed filter to *dst.
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
+      const = 0;
+
+  // "filter" contains the data appended by a preceding call to
+  // CreateFilter() on this class.  This method must return true if
+  // the key was in the list of keys passed to CreateFilter().
+  // This method may return true or false if the key was not on the
+  // list, but it should aim to return false with a high probability.
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
+};
+
+// Return a new filter policy that uses a bloom filter with approximately
+// the specified number of bits per key.  A good value for bits_per_key
+// is 10, which yields a filter with ~ 1% false positive rate.
+//
+// Callers must delete the result after any database that is using the
+// result has been closed.
+//
+// Note: if you are using a custom comparator that ignores some parts
+// of the keys being compared, you must not use NewBloomFilterPolicy()
+// and must provide your own FilterPolicy that also ignores the
+// corresponding parts of the keys.  For example, if the comparator
+// ignores trailing spaces, it would be incorrect to use a
+// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
+// trailing spaces in keys.
+extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
diff --git a/include/rocksdb/flush_block_policy.h b/include/rocksdb/flush_block_policy.h
new file mode 100644 (file)
index 0000000..8340ad6
--- /dev/null
@@ -0,0 +1,58 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+class BlockBuilder;
+struct Options;
+
+// FlushBlockPolicy provides a configurable way to determine when to flush a
+// block in the block based tables.
+class FlushBlockPolicy {
+ public:
+  // Keep track of the key/value sequences and return the boolean value to
+  // determine if table builder should flush current data block.
+  virtual bool Update(const Slice& key,
+                      const Slice& value) = 0;
+
+  virtual ~FlushBlockPolicy() { }
+};
+
+class FlushBlockPolicyFactory {
+ public:
+  // Return the name of the flush block policy.
+  virtual const char* Name() const = 0;
+
+  // Return a new flush block policy that flushes data blocks by data size.
+  // FlushBlockPolicy may need to access the metadata of the data block
+  // builder to determine when to flush the blocks.
+  //
+  // Callers must delete the result after any database that is using the
+  // result has been closed.
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const Options& options, const BlockBuilder& data_block_builder) const = 0;
+
+  virtual ~FlushBlockPolicyFactory() { }
+};
+
+class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {  // default factory: flushes data blocks by size
+ public:
+  FlushBlockBySizePolicyFactory() {}
+
+  virtual const char* Name() const override {
+    return "FlushBlockBySizePolicyFactory";
+  }
+
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const Options& options,
+      const BlockBuilder& data_block_builder) const override;
+};
+
+}  // namespace rocksdb
diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h
new file mode 100644 (file)
index 0000000..7538e9c
--- /dev/null
@@ -0,0 +1,106 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An iterator yields a sequence of key/value pairs from a source.
+// The following class defines the interface.  Multiple implementations
+// are provided by this library.  In particular, iterators are provided
+// to access the contents of a Table or a DB.
+//
+// Multiple threads can invoke const methods on an Iterator without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Iterator must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class Iterator {
+ public:
+  Iterator();
+  virtual ~Iterator();
+
+  // An iterator is either positioned at a key/value pair, or
+  // not valid.  This method returns true iff the iterator is valid.
+  virtual bool Valid() const = 0;
+
+  // Position at the first key in the source.  The iterator is Valid()
+  // after this call iff the source is not empty.
+  virtual void SeekToFirst() = 0;
+
+  // Position at the last key in the source.  The iterator is
+  // Valid() after this call iff the source is not empty.
+  virtual void SeekToLast() = 0;
+
+  // Position at the first key in the source that is at or past target.
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or past target.
+  virtual void Seek(const Slice& target) = 0;
+
+  // Moves to the next entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Moves to the previous entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the first entry in source.
+  // REQUIRES: Valid()
+  virtual void Prev() = 0;
+
+  // Return the key for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice key() const = 0;
+
+  // Return the value for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice value() const = 0;
+
+  // If an error has occurred, return it.  Else return an ok status.
+  // If non-blocking IO is requested and this operation cannot be
+  // satisfied without doing some IO, then this returns Status::Incomplete().
+  virtual Status status() const = 0;
+
+  // Clients are allowed to register function/arg1/arg2 triples that
+  // will be invoked when this iterator is destroyed.
+  //
+  // Note that unlike all of the preceding methods, this method is
+  // not abstract and therefore clients should not override it.
+  typedef void (*CleanupFunction)(void* arg1, void* arg2);
+  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
+
+ private:
+  struct Cleanup {
+    CleanupFunction function;
+    void* arg1;
+    void* arg2;
+    Cleanup* next;
+  };
+  Cleanup cleanup_;
+
+  // No copying allowed
+  Iterator(const Iterator&);
+  void operator=(const Iterator&);
+};
+
+// Return an empty iterator (yields nothing).
+extern Iterator* NewEmptyIterator();
+
+// Return an empty iterator with the specified status.
+extern Iterator* NewErrorIterator(const Status& status);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
diff --git a/include/rocksdb/ldb_tool.h b/include/rocksdb/ldb_tool.h
new file mode 100644 (file)
index 0000000..46bacc8
--- /dev/null
@@ -0,0 +1,18 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#ifndef ROCKSDB_LITE
+#pragma once
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+class LDBTool {
+ public:
+  void Run(int argc, char** argv, Options = Options());  // runs the ldb tool on the given argv; the unnamed parameter supplies DB Options
+};
+
+} // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h
new file mode 100644 (file)
index 0000000..445edcc
--- /dev/null
@@ -0,0 +1,265 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file contains the interface that must be implemented by any collection
+// to be used as the backing store for a MemTable. Such a collection must
+// satisfy the following properties:
+//  (1) It does not store duplicate items.
+//  (2) It uses MemTableRep::KeyComparator to compare items for iteration and
+//     equality.
+//  (3) It can be accessed concurrently by multiple readers, and reads may run
+//     concurrently with a write. However, it needn't support multiple concurrent writes.
+//  (4) Items are never deleted.
+// The liberal use of assertions is encouraged to enforce (1).
+//
+// The factory will be passed an Arena object when a new MemTableRep is
+// requested. The API for this object is in rocksdb/arena.h.
+//
+// Users can implement their own memtable representations. We include three
+// types built in:
+//  - SkipListRep: This is the default; it is backed by a skip list.
+//  - HashSkipListRep: The memtable rep that is best used for keys that are
+//  structured like "prefix:suffix" where iteration within a prefix is
+//  common and iteration across different prefixes is rare. It is backed by
+//  a hash map where each bucket is a skip list.
+//  - VectorRep: This is backed by an unordered std::vector. On iteration, the
+// vector is sorted. It is intelligent about sorting; once the MarkReadOnly()
+// has been called, the vector will only be sorted once. It is optimized for
+// random-write-heavy workloads.
+//
+// The last two implementations are designed for situations in which
+// iteration over the entire collection is rare since doing so requires all the
+// keys to be copied into a sorted data structure.
+
+#pragma once
+
+#include <memory>
+#include <stdint.h>
+
+namespace rocksdb {
+
+class Arena;
+class LookupKey;
+class Slice;
+class SliceTransform;
+
+typedef void* KeyHandle;
+
+class MemTableRep {
+ public:
+  // KeyComparator provides a means to compare keys, which are internal keys
+  // concatenated with values.
+  class KeyComparator {
+   public:
+    // Compare the keys pointed to by prefix_len_key1 and prefix_len_key2.
+    // Return a negative value if key1 < key2, 0 if equal, positive otherwise.
+    virtual int operator()(const char* prefix_len_key1,
+                           const char* prefix_len_key2) const = 0;
+
+    virtual int operator()(const char* prefix_len_key,
+                           const Slice& key) const = 0;
+
+    virtual ~KeyComparator() { }
+  };
+
+  explicit MemTableRep(Arena* arena) : arena_(arena) {}
+
+  // Allocate a buffer of len bytes for storing a key. The idea is that a
+  // specific memtable representation knows its underlying data structure
+  // better; by allowing it to allocate memory, it can place correlated data
+  // in consecutive memory areas to make processor prefetching more efficient.
+  virtual KeyHandle Allocate(const size_t len, char** buf);
+
+  // Insert key into the collection. (The caller will pack key and value into a
+  // single buffer and pass that in as the parameter to Insert).
+  // REQUIRES: nothing that compares equal to key is currently in the
+  // collection.
+  virtual void Insert(KeyHandle handle) = 0;
+
+  // Returns true iff an entry that compares equal to key is in the collection.
+  virtual bool Contains(const char* key) const = 0;
+
+  // Notify this table rep that it will no longer be added to. By default, does
+  // nothing.
+  virtual void MarkReadOnly() { }
+
+  // Look up key k in the mem table: starting from the first entry whose
+  // user_key matches that of k, invoke callback_func() with callback_args
+  // directly forwarded as the first parameter and the memtable entry as the
+  // second parameter. If the callback returns false, the scan terminates;
+  // otherwise it continues with the next entry.
+  // It is safe for Get() to stop after having visited all entries that
+  // could match k.user_key(), or even earlier.
+  //
+  // Default implementation:
+  // dynamically construct an iterator, seek to k, and invoke the callback
+  // for each entry the iterator yields.
+  virtual void Get(const LookupKey& k, void* callback_args,
+                   bool (*callback_func)(void* arg, const char* entry));
+
+  // Report an approximation of how much memory has been used other than memory
+  // that was allocated through the arena.
+  virtual size_t ApproximateMemoryUsage() = 0;
+
+  virtual ~MemTableRep() { }
+
+  // Iteration over the contents of the collection.
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified collection.
+    // The returned iterator is not valid.
+    // explicit Iterator(const MemTableRep* collection);
+    virtual ~Iterator() {}
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const = 0;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const = 0;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() = 0;
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() = 0;
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() = 0;
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() = 0;
+  };
+
+  // Return an iterator over the keys in this representation.
+  virtual Iterator* GetIterator() = 0;
+
+  // Return an iterator over at least the keys with the specified user key. The
+  // iterator may also allow access to other keys, but doesn't have to. Default:
+  // GetIterator().
+  virtual Iterator* GetIterator(const Slice& user_key) { return GetIterator(); }
+
+  // Return an iterator that has a special Seek semantics. The result of
+  // a Seek might only include keys with the same prefix as the target key.
+  virtual Iterator* GetDynamicPrefixIterator() { return GetIterator(); }
+
+  // Return true if the current MemTableRep supports merge operator.
+  // Default: true
+  virtual bool IsMergeOperatorSupported() const { return true; }
+
+  // Return true if the current MemTableRep supports snapshot
+  // Default: true
+  virtual bool IsSnapshotSupported() const { return true; }
+
+ protected:
+  // When *key is an internal key concatenated with the value, returns the
+  // user key.
+  virtual Slice UserKey(const char* key) const;
+
+  Arena* arena_;
+};
+
+// This is the base class for all factories that are used by RocksDB to create
+// new MemTableRep objects
+class MemTableRepFactory {
+ public:
+  virtual ~MemTableRepFactory() {}
+  // Creates a new MemTableRep.  NOTE(review): the returned object appears to
+  // be owned by the caller -- confirm against the implementations.
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+      Arena*, const SliceTransform*) = 0;
+  // Human-readable name of this factory.
+  virtual const char* Name() const = 0;
+};
+
+// This uses a skip list to store keys. It is the default memtable
+// representation.
+class SkipListFactory : public MemTableRepFactory {
+ public:
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Arena*,
+                                         const SliceTransform*) override;
+  virtual const char* Name() const override { return "SkipListFactory"; }
+};
+
+#ifndef ROCKSDB_LITE
+// This creates MemTableReps that are backed by an std::vector. On iteration,
+// the vector is sorted. This is useful for workloads where iteration is very
+// rare and writes are generally not issued after reads begin.
+//
+// Parameters:
+//   count: Passed to the constructor of the underlying std::vector of each
+//     VectorRep. On initialization, space for at least count entries is
+//     reserved (presumably elements, not bytes -- confirm in implementation).
+class VectorRepFactory : public MemTableRepFactory {
+  const size_t count_;
+
+ public:
+  explicit VectorRepFactory(size_t count = 0) : count_(count) { }
+  virtual MemTableRep* CreateMemTableRep(
+      const MemTableRep::KeyComparator&, Arena*,
+      const SliceTransform*) override;
+  virtual const char* Name() const override {
+    return "VectorRepFactory";
+  }
+};
+
+// This factory creates memtable reps that contain a fixed array of buckets,
+// each pointing to a skiplist (null if the bucket is empty).
+// bucket_count: number of fixed array buckets
+// skiplist_height: the max height of the skiplist
+// skiplist_branching_factor: probabilistic size ratio between adjacent
+//                            linked lists in the skiplist
+extern MemTableRepFactory* NewHashSkipListRepFactory(
+    size_t bucket_count = 1000000, int32_t skiplist_height = 4,
+    int32_t skiplist_branching_factor = 4
+);
+
+// This factory creates memtables backed by a hashed linked list:
+// a fixed array of buckets, each pointing to a sorted singly
+// linked list (null if the bucket is empty).
+// bucket_count: number of fixed array buckets
+extern MemTableRepFactory* NewHashLinkListRepFactory(
+    size_t bucket_count = 50000);
+
+// This factory creates a cuckoo-hashing based mem-table representation.
+// Cuckoo-hash is a closed-hash strategy, in which all key/value pairs
+// are stored in the bucket array itself instead of in some data structures
+// external to the bucket array.  In addition, each key in cuckoo hash
+// has a constant number of possible buckets in the bucket array.  These
+// two properties together make cuckoo hash more memory efficient and
+// give it a constant worst-case read time.  Cuckoo hash is best suited for
+// point-lookup workload.
+//
+// When inserting a key / value, it first checks whether one of its possible
+// buckets is empty.  If so, the key / value will be inserted to that vacant
+// bucket.  Otherwise, one of the keys originally stored in one of these
+// possible buckets will be "kicked out" and move to one of its possible
+// buckets (and possibly kicks out another victim.)  In the current
+// implementation, such "kick-out" path is bounded.  If it cannot find a
+// "kick-out" path for a specific key, this key will be stored in a backup
+// structure, and the current memtable will be forced to become immutable.
+//
+// Note that currently this mem-table representation does not support
+// snapshot (i.e., it only queries latest state) and iterators.  In addition,
+// MultiGet operation might also lose its atomicity due to the lack of
+// snapshot support.
+//
+// Parameters:
+//   write_buffer_size: the write buffer size in bytes.
+//   average_data_size: the average size of key + value in bytes.  This value
+//     together with write_buffer_size will be used to compute the number
+//     of buckets.
+//   hash_function_count: the number of hash functions that will be used by
+//     the cuckoo-hash.  The number also equals the number of possible
+//     buckets each key will have.
+extern MemTableRepFactory* NewHashCuckooRepFactory(
+    size_t write_buffer_size, size_t average_data_size = 64,
+    unsigned int hash_function_count = 4);
+#endif  // ROCKSDB_LITE
+}  // namespace rocksdb
diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h
new file mode 100644 (file)
index 0000000..2ae64c1
--- /dev/null
@@ -0,0 +1,182 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
+
+#include <memory>
+#include <string>
+#include <deque>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+class Slice;
+class Logger;
+
+// The Merge Operator
+//
+// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only
+// the client knows. It could be numeric addition, list append, string
+// concatenation, edit data structure, ... , anything.
+// The library, on the other hand, is concerned with the exercise of this
+// interface, at the right time (during get, iteration, compaction...)
+//
+// To use merge, the client needs to provide an object implementing one of
+// the following interfaces:
+//  a) AssociativeMergeOperator - for most simple semantics (always take
+//    two values, and merge them into one value, which is then put back
+//    into rocksdb); numeric addition and string concatenation are examples;
+//
+//  b) MergeOperator - the generic class for all the more abstract / complex
+//    operations; one method (FullMerge) to merge a Put/Delete value with a
+//    merge operand; and another method (PartialMerge) that merges multiple
+//    operands together. this is especially useful if your key values have
+//    complex structures but you would still like to support client-specific
+//    incremental updates.
+//
+// AssociativeMergeOperator is simpler to implement. MergeOperator is simply
+// more powerful.
+//
+// Refer to rocksdb-merge wiki for more details and example implementations.
+//
+class MergeOperator {
+ public:
+  virtual ~MergeOperator() {}
+
+  // Gives the client a way to express the read -> modify -> write semantics
+  // key:      (IN)    The key that's associated with this merge operation.
+  //                   Client could multiplex the merge operator based on it
+  //                   if the key space is partitioned and different subspaces
+  //                   refer to different types of data which have different
+  //                   merge operation semantics
+  // existing: (IN)    null indicates that the key does not exist before this op
+  // operand_list:(IN) the sequence of merge operations to apply, front() first.
+  // new_value:(OUT)   Client is responsible for filling the merge result here
+  // logger:   (IN)    Client could use this to log errors during merge.
+  //
+  // Return true on success.
+  // All values passed in will be client-specific values. So if this method
+  // returns false, it is because client specified bad data or there was
+  // internal corruption. This will be treated as an error by the library.
+  //
+  // Also make use of the *logger for error messages.
+  virtual bool FullMerge(const Slice& key,
+                         const Slice* existing_value,
+                         const std::deque<std::string>& operand_list,
+                         std::string* new_value,
+                         Logger* logger) const = 0;
+
+  // This function performs merge(left_op, right_op)
+  // when both the operands are themselves merge operation types
+  // that you would have passed to a DB::Merge() call in the same order
+  // (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)).
+  //
+  // PartialMerge should combine them into a single merge operation that is
+  // saved into *new_value, and then it should return true.
+  // *new_value should be constructed such that a call to
+  // DB::Merge(key, *new_value) would yield the same result as a call
+  // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op).
+  //
+  // The default implementation of PartialMergeMulti will use this function
+  // as a helper, for backward compatibility.  Any successor class of
+  // MergeOperator should either implement PartialMerge or PartialMergeMulti,
+  // although implementing PartialMergeMulti is suggested as it is in general
+  // more effective to merge multiple operands at a time instead of two
+  // operands at a time.
+  //
+  // If it is impossible or infeasible to combine the two operations,
+  // leave new_value unchanged and return false. The library will
+  // internally keep track of the operations, and apply them in the
+  // correct order once a base-value (a Put/Delete/End-of-Database) is seen.
+  //
+  // TODO: Presently there is no way to differentiate between error/corruption
+  // and simply "return false". For now, the client should simply return
+  // false in any case it cannot perform partial-merge, regardless of reason.
+  // If there is corruption in the data, handle it in the FullMerge() function,
+  // and return false there.  The default implementation of PartialMerge will
+  // always return false.
+  virtual bool PartialMerge(const Slice& key, const Slice& left_operand,
+                            const Slice& right_operand, std::string* new_value,
+                            Logger* logger) const {
+    return false;
+  }
+
+  // This function performs merge when all the operands are themselves merge
+  // operation types that you would have passed to a DB::Merge() call in the
+  // same order (front() first)
+  // (i.e. DB::Merge(key, operand_list[0]), followed by
+  //  DB::Merge(key, operand_list[1]), ...)
+  //
+  // PartialMergeMulti should combine them into a single merge operation that is
+  // saved into *new_value, and then it should return true.  *new_value should
+  // be constructed such that a call to DB::Merge(key, *new_value) would yield
+  // the same result as sequential individual calls to DB::Merge(key, operand)
+  // for each operand in operand_list from front() to back().
+  //
+  // The PartialMergeMulti function will be called only when the list of
+  // operands are long enough. The minimum amount of operands that will be
+  // passed to the function are specified by the "min_partial_merge_operands"
+  // option.
+  //
+  // In the default implementation, PartialMergeMulti will invoke PartialMerge
+  // multiple times, where each time it only merges two operands.  Developers
+  // should either implement PartialMergeMulti, or implement PartialMerge which
+  // is served as the helper function of the default PartialMergeMulti.
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value, Logger* logger) const;
+
+  // The name of the MergeOperator. Used to check for MergeOperator
+  // mismatches (i.e., a DB created with one MergeOperator is
+  // accessed using a different MergeOperator)
+  // TODO: the name is currently not stored persistently and thus
+  //       no checking is enforced. Client is responsible for providing
+  //       consistent MergeOperator between DB opens.
+  virtual const char* Name() const = 0;
+};
+
+// The simpler, associative merge operator.
+class AssociativeMergeOperator : public MergeOperator {
+ public:
+  virtual ~AssociativeMergeOperator() {}
+
+  // Gives the client a way to express the read -> modify -> write semantics
+  // key:           (IN) The key that's associated with this merge operation.
+  // existing_value:(IN) null indicates the key does not exist before this op
+  // value:         (IN) the value to update/merge the existing_value with
+  // new_value:    (OUT) Client is responsible for filling the merge result here
+  // logger:        (IN) Client could use this to log errors during merge.
+  //
+  // Return true on success.
+  // All values passed in will be client-specific values. So if this method
+  // returns false, it is because client specified bad data or there was
+  // internal corruption. The client should assume that this will be treated
+  // as an error by the library.
+  virtual bool Merge(const Slice& key,
+                     const Slice* existing_value,
+                     const Slice& value,
+                     std::string* new_value,
+                     Logger* logger) const = 0;
+
+
+ private:
+  // Default implementations of the MergeOperator interface, kept private so
+  // that clients override Merge() above instead of these.
+  virtual bool FullMerge(const Slice& key,
+                         const Slice* existing_value,
+                         const std::deque<std::string>& operand_list,
+                         std::string* new_value,
+                         Logger* logger) const override;
+
+  virtual bool PartialMerge(const Slice& key,
+                            const Slice& left_operand,
+                            const Slice& right_operand,
+                            std::string* new_value,
+                            Logger* logger) const override;
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
new file mode 100644 (file)
index 0000000..c283a5e
--- /dev/null
@@ -0,0 +1,922 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
+#define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
+
+#include <stddef.h>
+#include <string>
+#include <memory>
+#include <vector>
+#include <stdint.h>
+
+#include "rocksdb/version.h"
+#include "rocksdb/universal_compaction.h"
+
+namespace rocksdb {
+
+class Cache;
+class CompactionFilter;
+class CompactionFilterFactory;
+class CompactionFilterFactoryV2;
+class Comparator;
+class Env;
+enum InfoLogLevel : unsigned char;
+class FilterPolicy;
+class Logger;
+class MergeOperator;
+class Snapshot;
+class TableFactory;
+class MemTableRepFactory;
+class TablePropertiesCollector;
+class Slice;
+class SliceTransform;
+class Statistics;
+class InternalKeyComparator;
+
+using std::shared_ptr;
+
+// DB contents are stored in a set of blocks, each of which holds a
+// sequence of key,value pairs.  Each block may be compressed before
+// being stored in a file.  The following enum describes which
+// compression method (if any) is used to compress a block.
+enum CompressionType : char {
+  // NOTE: do not change the values of existing entries, as these are
+  // part of the persistent format on disk.
+  kNoCompression = 0x0, kSnappyCompression = 0x1, kZlibCompression = 0x2,
+  kBZip2Compression = 0x3, kLZ4Compression = 0x4, kLZ4HCCompression = 0x5
+};
+
+enum CompactionStyle : char {
+  kCompactionStyleLevel = 0x0,     // level-based compaction style
+  kCompactionStyleUniversal = 0x1  // universal compaction style
+};
+
+// Compression options for different compression algorithms like Zlib
+struct CompressionOptions {
+  int window_bits;  // zlib windowBits; default -14 (negative presumably
+                    // selects raw deflate, i.e. no zlib header -- confirm)
+  int level;        // compression level; -1 presumably means library default
+  int strategy;     // zlib strategy; 0 is Z_DEFAULT_STRATEGY
+  CompressionOptions() : window_bits(-14), level(-1), strategy(0) {}
+  CompressionOptions(int wbits, int _lev, int _strategy)
+      : window_bits(wbits), level(_lev), strategy(_strategy) {}
+};
+
+enum UpdateStatus {    // Return status for the in-place update callback
+  UPDATE_FAILED   = 0, // Nothing to update
+  UPDATED_INPLACE = 1, // Value updated in place
+  UPDATED         = 2, // No in-place update. Merged value set
+};
+
+struct Options;
+
+struct ColumnFamilyOptions {
+  // -------------------
+  // Parameters that affect behavior
+
+  // Comparator used to define the order of keys in the table.
+  // Default: a comparator that uses lexicographic byte-wise ordering
+  //
+  // REQUIRES: The client must ensure that the comparator supplied
+  // here has the same name and orders keys *exactly* the same as the
+  // comparator provided to previous open calls on the same DB.
+  const Comparator* comparator;
+
+  // REQUIRES: The client must provide a merge operator if Merge operation
+  // needs to be accessed. Calling Merge on a DB without a merge operator
+  // would result in Status::NotSupported. The client must ensure that the
+  // merge operator supplied here has the same name and *exactly* the same
+  // semantics as the merge operator provided to previous open calls on
+  // the same DB. The only exception is reserved for upgrade, where a DB
+  // previously without a merge operator is introduced to Merge operation
+  // for the first time. It's necessary to specify a merge operator when
+  // opening the DB in this case.
+  // Default: nullptr
+  shared_ptr<MergeOperator> merge_operator;
+
+  // A single CompactionFilter instance to call into during compaction.
+  // Allows an application to modify/delete a key-value during background
+  // compaction.
+  //
+  // If the client requires a new compaction filter to be used for different
+  // compaction runs, it can specify compaction_filter_factory instead of this
+  // option.  The client should specify only one of the two.
+  // compaction_filter takes precedence over compaction_filter_factory if
+  // client specifies both.
+  //
+  // If multithreaded compaction is being used, the supplied CompactionFilter
+  // instance may be used from different threads concurrently and so should be
+  // thread-safe.
+  //
+  // Default: nullptr
+  const CompactionFilter* compaction_filter;
+
+  // This is a factory that provides compaction filter objects which allow
+  // an application to modify/delete a key-value during background compaction.
+  //
+  // A new filter will be created on each compaction run.  If multithreaded
+  // compaction is being used, each created CompactionFilter will only be used
+  // from a single thread and so does not need to be thread-safe.
+  //
+  // Default: a factory that doesn't provide any object
+  std::shared_ptr<CompactionFilterFactory> compaction_filter_factory;
+
+  // Version TWO of the compaction_filter_factory
+  // It supports rolling compaction
+  //
+  // Default: a factory that doesn't provide any object
+  std::shared_ptr<CompactionFilterFactoryV2> compaction_filter_factory_v2;
+
+  // -------------------
+  // Parameters that affect performance
+
+  // Amount of data to build up in memory (backed by an unsorted log
+  // on disk) before converting to a sorted on-disk file.
+  //
+  // Larger values increase performance, especially during bulk loads.
+  // Up to max_write_buffer_number write buffers may be held in memory
+  // at the same time,
+  // so you may wish to adjust this parameter to control memory usage.
+  // Also, a larger write buffer will result in a longer recovery time
+  // the next time the database is opened.
+  //
+  // Default: 4MB
+  size_t write_buffer_size;
+
+  // The maximum number of write buffers that are built up in memory.
+  // The default is 2, so that when 1 write buffer is being flushed to
+  // storage, new writes can continue to the other write buffer.
+  // Default: 2
+  int max_write_buffer_number;
+
+  // The minimum number of write buffers that will be merged together
+  // before writing to storage.  If set to 1, then
+  // all write buffers are flushed to L0 as individual files and this increases
+  // read amplification because a get request has to check in all of these
+  // files. Also, an in-memory merge may result in writing lesser
+  // data to storage if there are duplicate records in each of these
+  // individual write buffers.  Default: 1
+  int min_write_buffer_number_to_merge;
+
+  // Control over blocks (user data is stored in a set of blocks, and
+  // a block is the unit of reading from disk).
+
+  // If non-NULL use the specified cache for blocks.
+  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
+  // Default: nullptr
+  shared_ptr<Cache> block_cache;
+
+  // If non-NULL use the specified cache for compressed blocks.
+  // If NULL, rocksdb will not use a compressed block cache.
+  // Default: nullptr
+  shared_ptr<Cache> block_cache_compressed;
+
+  // Approximate size of user data packed per block.  Note that the
+  // block size specified here corresponds to uncompressed data.  The
+  // actual size of the unit read from disk may be smaller if
+  // compression is enabled.  This parameter can be changed dynamically.
+  //
+  // Default: 4K
+  size_t block_size;
+
+  // Number of keys between restart points for delta encoding of keys.
+  // This parameter can be changed dynamically.  Most clients should
+  // leave this parameter alone.
+  //
+  // Default: 16
+  int block_restart_interval;
+
+  // Compress blocks using the specified compression algorithm.  This
+  // parameter can be changed dynamically.
+  //
+  // Default: kSnappyCompression, which gives lightweight but fast
+  // compression.
+  //
+  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+  //    ~200-500MB/s compression
+  //    ~400-800MB/s decompression
+  // Note that these speeds are significantly faster than most
+  // persistent storage speeds, and therefore it is typically never
+  // worth switching to kNoCompression.  Even if the input data is
+  // incompressible, the kSnappyCompression implementation will
+  // efficiently detect that and will switch to uncompressed mode.
+  CompressionType compression;
+
+  // Different levels can have different compression policies. There
+  // are cases where most lower levels would like to quick compression
+  // algorithm while the higher levels (which have more data) use
+  // compression algorithms that have better compression but could
+  // be slower. This array, if non nullptr, should have an entry for
+  // each level of the database. This array, if non nullptr, overrides the
+  // value specified in the previous field 'compression'. The caller is
+  // responsible for allocating memory and initializing the values in it
+  // before invoking Open(). The caller is responsible for freeing this
+  // array and it could be freed anytime after the return from Open().
+  // This could have been a std::vector but that makes the equivalent
+  // java/C api hard to construct.
+  std::vector<CompressionType> compression_per_level;
+
+  // different options for compression algorithms
+  CompressionOptions compression_opts;
+
+  // If non-nullptr, use the specified filter policy to reduce disk reads.
+  // Many applications will benefit from passing the result of
+  // NewBloomFilterPolicy() here.
+  //
+  // Default: nullptr
+  const FilterPolicy* filter_policy;
+
+  // If non-nullptr, use the specified function to determine the
+  // prefixes for keys.  These prefixes will be placed in the filter.
+  // Depending on the workload, this can reduce the number of read-IOP
+  // cost for scans when a prefix is passed via ReadOptions to
+  // db.NewIterator().  For prefix filtering to work properly,
+  // "prefix_extractor" and "comparator" must be such that the following
+  // properties hold:
+  //
+  // 1) key.starts_with(prefix(key))
+  // 2) Compare(prefix(key), key) <= 0.
+  // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
+  // 4) prefix(prefix(key)) == prefix(key)
+  //
+  // Default: nullptr
+  std::shared_ptr<const SliceTransform> prefix_extractor;
+
+  // If true, place whole keys in the filter (not just prefixes).
+  // This must generally be true for gets to be efficient.
+  //
+  // Default: true
+  bool whole_key_filtering;
+
+  // Number of levels for this database
+  int num_levels;
+
+  // Number of files to trigger level-0 compaction. A value <0 means that
+  // level-0 compaction will not be triggered by number of files at all.
+  //
+  // Default: 4
+  int level0_file_num_compaction_trigger;
+
+  // Soft limit on number of level-0 files. We start slowing down writes at this
+  // point. A value <0 means that no writing slow down will be triggered by
+  // number of files in level-0.
+  int level0_slowdown_writes_trigger;
+
+  // Maximum number of level-0 files.  We stop writes at this point.
+  int level0_stop_writes_trigger;
+
+  // Maximum level to which a new compacted memtable is pushed if it
+  // does not create overlap.  We try to push to level 2 to avoid the
+  // relatively expensive level 0=>1 compactions and to avoid some
+  // expensive manifest file operations.  We do not push all the way to
+  // the largest level since that can generate a lot of wasted disk
+  // space if the same key space is being repeatedly overwritten.
+  int max_mem_compaction_level;
+
+  // Target file size for compaction.
+  // target_file_size_base is per-file size for level-1.
+  // Target file size for level L can be calculated by
+  // target_file_size_base * (target_file_size_multiplier ^ (L-1))
+  // For example, if target_file_size_base is 2MB and
+  // target_file_size_multiplier is 10, then each file on level-1 will
+  // be 2MB, and each file on level 2 will be 20MB,
+  // and each file on level-3 will be 200MB.
+
+  // by default target_file_size_base is 2MB.
+  int target_file_size_base;
+  // by default target_file_size_multiplier is 1, which means
+  // by default files in different levels will have similar size.
+  int target_file_size_multiplier;
+
+  // Control maximum total data size for a level.
+  // max_bytes_for_level_base is the max total for level-1.
+  // Maximum number of bytes for level L can be calculated as
+  // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
+  // For example, if max_bytes_for_level_base is 20MB, and if
+  // max_bytes_for_level_multiplier is 10, total data size for level-1
+  // will be 20MB, total file size for level-2 will be 200MB,
+  // and total file size for level-3 will be 2GB.
+
+  // by default 'max_bytes_for_level_base' is 10MB.
+  uint64_t max_bytes_for_level_base;
+  // by default 'max_bytes_for_level_multiplier' is 10.
+  int max_bytes_for_level_multiplier;
+
+  // Different max-size multipliers for different levels.
+  // These are multiplied by max_bytes_for_level_multiplier to arrive
+  // at the max-size of each level.
+  // Default: 1
+  std::vector<int> max_bytes_for_level_multiplier_additional;
+
+  // Maximum number of bytes in all compacted files.  We avoid expanding
+  // the lower level file set of a compaction if it would make the
+  // total compaction cover more than
+  // (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
+  int expanded_compaction_factor;
+
+  // Maximum number of bytes in all source files to be compacted in a
+  // single compaction run. We avoid picking too many files in the
+  // source level so that we do not exceed the total source bytes
+  // for compaction to exceed
+  // (source_compaction_factor * targetFileSizeLevel()) many bytes.
+  // Default:1, i.e. pick maxfilesize amount of data as the source of
+  // a compaction.
+  int source_compaction_factor;
+
+  // Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
+  // stop building a single file in a level->level+1 compaction.
+  int max_grandparent_overlap_factor;
+
+  // Disable compaction triggered by seek.
+  // With bloomfilter and fast storage, a miss on one level
+  // is very cheap if the file handle is cached in table cache
+  // (which is true if max_open_files is large).
+  bool disable_seek_compaction;
+
+  // Puts are delayed 0-1 ms when any level has a compaction score that exceeds
+  // soft_rate_limit. This is ignored when == 0.0.
+  // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
+  // hold, RocksDB will set soft_rate_limit = hard_rate_limit
+  // Default: 0 (disabled)
+  double soft_rate_limit;
+
+  // Puts are delayed 1ms at a time when any level has a compaction score that
+  // exceeds hard_rate_limit. This is ignored when <= 1.0.
+  // Default: 0 (disabled)
+  double hard_rate_limit;
+
+  // Max time a put will be stalled when hard_rate_limit is enforced. If 0, then
+  // there is no limit.
+  // Default: 1000
+  unsigned int rate_limit_delay_max_milliseconds;
+
+  // Disable block cache. If this is set to true,
+  // then no block cache should be used, and the block_cache should
+  // point to a nullptr object.
+  // Default: false
+  bool no_block_cache;
+
+  // size of one block in arena memory allocation.
+  // If <= 0, a proper value is automatically calculated (usually 1/10 of
+  // writer_buffer_size).
+  //
+  // There are two additional restrictions on the specified size:
+  // (1) size should be in the range of [4096, 2 << 30] and
+  // (2) be the multiple of the CPU word (which helps with the memory
+  // alignment).
+  //
+  // We'll automatically check and adjust the size number to make sure it
+  // conforms to the restrictions.
+  //
+  // Default: 0
+  size_t arena_block_size;
+
+  // Disable automatic compactions. Manual compactions can still
+  // be issued on this column family
+  bool disable_auto_compactions;
+
+  // Purge duplicate/deleted keys when a memtable is flushed to storage.
+  // Default: true
+  bool purge_redundant_kvs_while_flush;
+
+  // This is used to close a block before it reaches the configured
+  // 'block_size'. If the percentage of free space in the current block is less
+  // than this specified number and adding a new record to the block will
+  // exceed the configured block size, then this block will be closed and the
+  // new record will be written to the next block.
+  // Default is 10.
+  int block_size_deviation;
+
+  // The compaction style. Default: kCompactionStyleLevel
+  CompactionStyle compaction_style;
+
+  // If true, compaction will verify checksum on every read that happens
+  // as part of compaction
+  // Default: true
+  bool verify_checksums_in_compaction;
+
+  // The options needed to support Universal Style compactions
+  CompactionOptionsUniversal compaction_options_universal;
+
+  // Use KeyMayExist API to filter deletes when this is true.
+  // If KeyMayExist returns false, i.e. the key definitely does not exist, then
+  // the delete is a noop. KeyMayExist only incurs in-memory look up.
+  // This optimization avoids writing the delete to storage when appropriate.
+  // Default: false
+  bool filter_deletes;
+
+  // An iteration->Next() sequentially skips over keys with the same
+  // user-key unless this option is set. This number specifies the number
+  // of keys (with the same userkey) that will be sequentially
+  // skipped before a reseek is issued.
+  // Default: 8
+  uint64_t max_sequential_skip_in_iterations;
+
+  // This is a factory that provides MemTableRep objects.
+  // Default: a factory that provides a skip-list-based implementation of
+  // MemTableRep.
+  std::shared_ptr<MemTableRepFactory> memtable_factory;
+
+  // This is a factory that provides TableFactory objects.
+  // Default: a factory that provides a default implementation of
+  // Table and TableBuilder.
+  std::shared_ptr<TableFactory> table_factory;
+
+  // This option allows users to collect their own interested statistics of
+  // the tables.
+  // Default: empty vector -- no user-defined statistics collection will be
+  // performed.
+  typedef std::vector<std::shared_ptr<TablePropertiesCollector>>
+      TablePropertiesCollectors;
+  TablePropertiesCollectors table_properties_collectors;
+
+  // Allows thread-safe inplace updates.
+  // If inplace_callback function is not set,
+  //   Put(key, new_value) will update inplace the existing_value iff
+  //   * key exists in current memtable
+  //   * new sizeof(new_value) <= sizeof(existing_value)
+  //   * existing_value for that key is a put i.e. kTypeValue
+  // If inplace_callback function is set, check doc for inplace_callback.
+  // Default: false.
+  bool inplace_update_support;
+
+  // Number of locks used for inplace update
+  // Default: 10000, if inplace_update_support = true, else 0.
+  size_t inplace_update_num_locks;
+
+  // existing_value - pointer to previous value (from both memtable and sst).
+  //                  nullptr if key doesn't exist
+  // existing_value_size - pointer to size of existing_value).
+  //                       nullptr if key doesn't exist
+  // delta_value - Delta value to be merged with the existing_value.
+  //               Stored in transaction logs.
+  // merged_value - Set when delta is applied on the previous value.
+
+  // Applicable only when inplace_update_support is true,
+  // this callback function is called at the time of updating the memtable
+  // as part of a Put operation, lets say Put(key, delta_value). It allows the
+  // 'delta_value' specified as part of the Put operation to be merged with
+  // an 'existing_value' of the key in the database.
+
+  // If the merged value is smaller in size than the 'existing_value',
+  // then this function can update the 'existing_value' buffer inplace and
+  // the corresponding 'existing_value'_size pointer, if it wishes to.
+  // The callback should return UpdateStatus::UPDATED_INPLACE.
+  // (In this case, the snapshot-semantics of the rocksdb
+  // Iterator is not atomic anymore).
+
+  // If the merged value is larger in size than the 'existing_value' or the
+  // application does not wish to modify the 'existing_value' buffer inplace,
+  // then the merged value should be returned via *merge_value. It is set by
+  // merging the 'existing_value' and the Put 'delta_value'. The callback should
+  // return UpdateStatus::UPDATED in this case. This merged value will be added
+  // to the memtable.
+
+  // If merging fails or the application does not wish to take any action,
+  // then the callback should return UpdateStatus::UPDATE_FAILED.
+
+  // Please remember that the original call from the application is Put(key,
+  // delta_value). So the transaction log (if enabled) will still contain (key,
+  // delta_value). The 'merged_value' is not stored in the transaction log.
+  // Hence the inplace_callback function should be consistent across db reopens.
+
+  // Default: nullptr
+  UpdateStatus (*inplace_callback)(char* existing_value,
+                                   uint32_t* existing_value_size,
+                                   Slice delta_value,
+                                   std::string* merged_value);
+
+  // if prefix_extractor is set and bloom_bits is not 0, create prefix bloom
+  // for memtable
+  uint32_t memtable_prefix_bloom_bits;
+
+  // number of hash probes per key
+  uint32_t memtable_prefix_bloom_probes;
+
+  // Control locality of bloom filter probes to improve cache miss rate.
+  // This option only applies to memtable prefix bloom and plaintable
+  // prefix bloom. It essentially limits the max number of cache lines each
+  // bloom filter check can touch.
+  // This optimization is turned off when set to 0. The number should never
+  // be greater than number of probes. This option can boost performance
+  // for in-memory workload but should use with care since it can cause
+  // higher false positive rate.
+  // Default: 0
+  uint32_t bloom_locality;
+
+  // Maximum number of successive merge operations on a key in the memtable.
+  //
+  // When a merge operation is added to the memtable and the maximum number of
+  // successive merges is reached, the value of the key will be calculated and
+  // inserted into the memtable instead of the merge operation. This will
+  // ensure that there are never more than max_successive_merges merge
+  // operations in the memtable.
+  //
+  // Default: 0 (disabled)
+  size_t max_successive_merges;
+
+  // The number of partial merge operands to accumulate before partial
+  // merge will be performed. Partial merge will not be called
+  // if the list of values to merge is less than min_partial_merge_operands.
+  //
+  // If min_partial_merge_operands < 2, then it will be treated as 2.
+  //
+  // Default: 2
+  uint32_t min_partial_merge_operands;
+
+  // Create ColumnFamilyOptions with default values for all fields
+  ColumnFamilyOptions();
+  // Create ColumnFamilyOptions from Options
+  explicit ColumnFamilyOptions(const Options& options);
+
+  void Dump(Logger* log) const;
+};
+
+struct DBOptions {
+  // If true, the database will be created if it is missing.
+  // Default: false
+  bool create_if_missing;
+
+  // If true, an error is raised if the database already exists.
+  // Default: false
+  bool error_if_exists;
+
+  // If true, the implementation will do aggressive checking of the
+  // data it is processing and will stop early if it detects any
+  // errors.  This may have unforeseen ramifications: for example, a
+  // corruption of one DB entry may cause a large number of entries to
+  // become unreadable or for the entire DB to become unopenable.
+  // If any of the writes to the database fails (Put, Delete, Merge, Write),
+  // the database will switch to read-only mode and fail all other
+  // Write operations.
+  // Default: true
+  bool paranoid_checks;
+
+  // Use the specified object to interact with the environment,
+  // e.g. to read/write files, schedule background work, etc.
+  // Default: Env::Default()
+  Env* env;
+
+  // Any internal progress/error information generated by the db will
+  // be written to info_log if it is non-nullptr, or to a file stored
+  // in the same directory as the DB contents if info_log is nullptr.
+  // Default: nullptr
+  shared_ptr<Logger> info_log;
+
+  InfoLogLevel info_log_level;  // assumed minimum severity logged to info_log -- TODO confirm
+
+  // Number of open files that can be used by the DB.  You may need to
+  // increase this if your database has a large working set. Value -1 means
+  // files opened are always kept open. You can estimate number of files based
+  // on target_file_size_base and target_file_size_multiplier for level-based
+  // compaction. For universal-style compaction, you can usually set it to -1.
+  // Default: 5000
+  int max_open_files;
+
+  // Once write-ahead logs exceed this size, we will start forcing the flush of
+  // column families whose memtables are backed by the oldest live WAL file
+  // (i.e. the ones that are causing all the space amplification). If set to 0
+  // (default), we will dynamically choose the WAL size limit to be
+  // [sum of all write_buffer_size * max_write_buffer_number] * 2
+  // Default: 0
+  uint64_t max_total_wal_size;
+
+  // If non-null, then we should collect metrics about database operations
+  // Statistics objects should not be shared between DB instances as
+  // it does not use any locks to prevent concurrent updates.
+  shared_ptr<Statistics> statistics;
+
+  // If true, then the contents of data files are not synced
+  // to stable storage. Their contents remain in the OS buffers till the
+  // OS decides to flush them. This option is good for bulk-loading
+  // of data. Once the bulk-loading is complete, please issue a
+  // sync to the OS to flush all dirty buffers to stable storage.
+  // Default: false
+  bool disableDataSync;
+
+  // If true, then every store to stable storage will issue a fsync.
+  // If false, then every store to stable storage will issue a fdatasync.
+  // This parameter should be set to true while storing data to
+  // filesystem like ext3 that can lose files after a reboot.
+  // Default: false
+  bool use_fsync;
+
+  // This number controls how often a new scribe log about
+  // db deploy stats is written out.
+  // -1 indicates no logging at all.
+  // Default value is 1800 (half an hour).
+  int db_stats_log_interval;
+
+  // This specifies the info LOG dir.
+  // If it is empty, the log files will be in the same dir as data.
+  // If it is non empty, the log files will be in the specified dir,
+  // and the db data dir's absolute path will be used as the log file
+  // name's prefix.
+  std::string db_log_dir;
+
+  // This specifies the absolute dir path for write-ahead logs (WAL).
+  // If it is empty, the log files will be in the same dir as data,
+  //   dbname is used as the data dir by default
+  // If it is non empty, the log files will be kept in the specified dir.
+  // When destroying the db,
+  //   all log files in wal_dir and the dir itself is deleted
+  std::string wal_dir;
+
+  // The periodicity when obsolete files get deleted. The default
+  // value is 6 hours. The files that get out of scope by compaction
+  // process will still get automatically deleted on every compaction,
+  // regardless of this setting
+  uint64_t delete_obsolete_files_period_micros;
+
+  // Maximum number of concurrent background compaction jobs, submitted to
+  // the default LOW priority thread pool.
+  // If you're increasing this, also consider increasing number of threads in
+  // LOW priority thread pool. For more information, see
+  // Env::SetBackgroundThreads
+  // Default: 1
+  int max_background_compactions;
+
+  // Maximum number of concurrent background memtable flush jobs, submitted to
+  // the HIGH priority thread pool.
+  //
+  // By default, all background jobs (major compaction and memtable flush) go
+  // to the LOW priority pool. If this option is set to a positive number,
+  // memtable flush jobs will be submitted to the HIGH priority pool.
+  // It is important when the same Env is shared by multiple db instances.
+  // Without a separate pool, long running major compaction jobs could
+  // potentially block memtable flush jobs of other db instances, leading to
+  // unnecessary Put stalls.
+  //
+  // If you're increasing this, also consider increasing number of threads in
+  // HIGH priority thread pool. For more information, see
+  // Env::SetBackgroundThreads
+  // Default: 1
+  int max_background_flushes;
+
+  // Specify the maximal size of the info log file. If the log file
+  // is larger than `max_log_file_size`, a new info log file will
+  // be created.
+  // If max_log_file_size == 0, all logs will be written to one
+  // log file.
+  size_t max_log_file_size;
+
+  // Time for the info log file to roll (in seconds).
+  // If specified with non-zero value, log file will be rolled
+  // if it has been active longer than `log_file_time_to_roll`.
+  // Default: 0 (disabled)
+  size_t log_file_time_to_roll;
+
+  // Maximal info log files to be kept.
+  // Default: 1000
+  size_t keep_log_file_num;
+
+  // manifest file is rolled over on reaching this limit.
+  // The older manifest file will be deleted.
+  // The default value is MAX_INT so that roll-over does not take place.
+  uint64_t max_manifest_file_size;
+
+  // Number of shards used for table cache.
+  int table_cache_numshardbits;
+
+  // During data eviction of table's LRU cache, it would be inefficient
+  // to strictly follow LRU because this piece of memory will not really
+  // be released unless its refcount falls to zero. Instead, make two
+  // passes: the first pass will release items with refcount = 1,
+  // and if not enough space releases after scanning the number of
+  // elements specified by this parameter, we will remove items in LRU
+  // order.
+  int table_cache_remove_scan_count_limit;
+
+  // The following two fields affect how archived logs will be deleted.
+  // 1. If both set to 0, logs will be deleted asap and will not get into
+  //    the archive.
+  // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+  //    WAL files will be checked every 10 min and if total size is greater
+  //    than WAL_size_limit_MB, they will be deleted starting with the
+  //    earliest until size_limit is met. All empty files will be deleted.
+  // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+  //    WAL files will be checked every WAL_ttl_seconds / 2 and those that
+  //    are older than WAL_ttl_seconds will be deleted.
+  // 4. If both are not 0, WAL files will be checked every 10 min and both
+  //    checks will be performed with ttl being first.
+  uint64_t WAL_ttl_seconds;
+  uint64_t WAL_size_limit_MB;
+
+  // Number of bytes to preallocate (via fallocate) the manifest
+  // files.  Default is 4mb, which is reasonable to reduce random IO
+  // as well as prevent overallocation for mounts that preallocate
+  // large amounts of data (such as xfs's allocsize option).
+  size_t manifest_preallocation_size;
+
+  // Data being read from file storage may be buffered in the OS
+  // Default: true
+  bool allow_os_buffer;
+
+  // Allow the OS to mmap file for reading sst tables. Default: false
+  bool allow_mmap_reads;
+
+  // Allow the OS to mmap file for writing. Default: false
+  bool allow_mmap_writes;
+
+  // Disable child process inherit open files. Default: true
+  bool is_fd_close_on_exec;
+
+  // Skip log corruption error on recovery (If client is ok with
+  // losing most recent changes)
+  // Default: false
+  bool skip_log_error_on_recovery;
+
+  // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
+  // Default: 3600 (1 hour)
+  unsigned int stats_dump_period_sec;
+
+  // If set true, will hint the underlying file system that the file
+  // access pattern is random, when a sst file is opened.
+  // Default: true
+  bool advise_random_on_open;
+
+  // Specify the file access pattern once a compaction is started.
+  // It will be applied to all input files of a compaction.
+  // Default: NORMAL
+  enum {
+    NONE,
+    NORMAL,
+    SEQUENTIAL,
+    WILLNEED
+  } access_hint_on_compaction_start;
+
+  // Use adaptive mutex, which spins in the user space before resorting
+  // to kernel. This could reduce context switch when the mutex is not
+  // heavily contended. However, if the mutex is hot, we could end up
+  // wasting spin time.
+  // Default: false
+  bool use_adaptive_mutex;
+
+  // Allows OS to incrementally sync files to disk while they are being
+  // written, asynchronously, in the background.
+  // Issue one request for every bytes_per_sync written. 0 turns it off.
+  // Default: 0
+  uint64_t bytes_per_sync;
+
+  // Allow RocksDB to use thread local storage to optimize performance.
+  // Default: true
+  bool allow_thread_local;
+
+  // Create DBOptions with default values for all fields
+  DBOptions();
+  // Create DBOptions from Options
+  explicit DBOptions(const Options& options);
+
+  void Dump(Logger* log) const;
+};
+
+// Options to control the behavior of a database (passed to DB::Open)
+struct Options : public DBOptions, public ColumnFamilyOptions {
+  // Create an Options object with default values for all fields.
+  Options() :
+    DBOptions(),
+    ColumnFamilyOptions() {}
+
+  Options(const DBOptions& db_options,
+          const ColumnFamilyOptions& column_family_options)
+      : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
+
+  void Dump(Logger* log) const;
+
+  // Set appropriate parameters for bulk loading.
+  // The reason that this is a function that returns "this" instead of a
+  // constructor is to enable chaining of multiple similar calls in the
+  // future.
+
+  // All data will be in level 0 without any automatic compaction.
+  // It's recommended to manually call CompactRange(NULL, NULL) before reading
+  // from the database, because otherwise the read can be very slow.
+  Options* PrepareForBulkLoad();
+};
+
+// ReadTier controls which storage tiers a read request may consult.
+// An application can issue a read request (via Get/Iterators) and specify
+// if that read should process data that ALREADY resides on a specified cache
+// level. For example, if an application specifies kBlockCacheTier then the
+// Get call will process data that is already processed in the memtable or
+// the block cache. It will not page in data from the OS cache or data that
+// resides in storage.
+enum ReadTier {
+  kReadAllTier = 0x0,    // data in memtable, block cache, OS cache or storage
+  kBlockCacheTier = 0x1  // data in memtable or block cache
+};
+
+// Options that control read operations
+struct ReadOptions {
+  // If true, all data read from underlying storage will be
+  // verified against corresponding checksums.
+  // Default: true
+  bool verify_checksums;
+
+  // Should the "data block"/"index block"/"filter block" read for this
+  // iteration be cached in memory?
+  // Callers may wish to set this field to false for bulk scans.
+  // Default: true
+  bool fill_cache;
+
+  // If this option is set and memtable implementation allows, Seek
+  // might only return keys with the same prefix as the seek-key
+  //
+  // ! DEPRECATED: prefix_seek is on by default when prefix_extractor
+  // is configured
+  // bool prefix_seek;
+
+  // If "snapshot" is non-nullptr, read as of the supplied snapshot
+  // (which must belong to the DB that is being read and which must
+  // not have been released).  If "snapshot" is nullptr, use an implicit
+  // snapshot of the state at the beginning of this read operation.
+  // Default: nullptr
+  const Snapshot* snapshot;
+
+  // If "prefix" is non-nullptr, and ReadOptions is being passed to
+  // db.NewIterator, only return results when the key begins with this
+  // prefix.  This field is ignored by other calls (e.g., Get).
+  // Options.prefix_extractor must also be set, and
+  // prefix_extractor.InRange(prefix) must be true.  The iterator
+  // returned by NewIterator when this option is set will behave just
+  // as if the underlying store did not contain any non-matching keys,
+  // with two exceptions.  Seek() only accepts keys starting with the
+  // prefix, and SeekToLast() is not supported.  prefix filter with this
+  // option will sometimes reduce the number of read IOPs.
+  // Default: nullptr
+  //
+  // ! DEPRECATED
+  // const Slice* prefix;
+
+  // Specify if this read request should process data that ALREADY
+  // resides on a particular cache. If the required data is not
+  // found at the specified cache, then Status::Incomplete is returned.
+  // Default: kReadAllTier
+  ReadTier read_tier;
+
+  // Specify to create a tailing iterator -- a special iterator that has a
+  // view of the complete database (i.e. it can also be used to read newly
+  // added data) and is optimized for sequential reads. It will return records
+  // that were inserted into the database after the creation of the iterator.
+  // Default: false
+  // Not supported in ROCKSDB_LITE mode!
+  bool tailing;
+
+  ReadOptions()
+      : verify_checksums(true),
+        fill_cache(true),
+        snapshot(nullptr),
+        read_tier(kReadAllTier),
+        tailing(false) {}
+  ReadOptions(bool cksum, bool cache)
+      : verify_checksums(cksum),
+        fill_cache(cache),
+        snapshot(nullptr),
+        read_tier(kReadAllTier),
+        tailing(false) {}
+};
+
+// Options that control write operations
+struct WriteOptions {
+  // If true, the write will be flushed from the operating system
+  // buffer cache (by calling WritableFile::Sync()) before the write
+  // is considered complete.  If this flag is true, writes will be
+  // slower.
+  //
+  // If this flag is false, and the machine crashes, some recent
+  // writes may be lost.  Note that if it is just the process that
+  // crashes (i.e., the machine does not reboot), no writes will be
+  // lost even if sync==false.
+  //
+  // In other words, a DB write with sync==false has similar
+  // crash semantics as the "write()" system call.  A DB write
+  // with sync==true has similar crash semantics to a "write()"
+  // system call followed by "fdatasync()".
+  //
+  // Default: false
+  bool sync;
+
+  // If true, writes will not first go to the write ahead log,
+  // and the write may get lost after a crash.
+  bool disableWAL;
+
+  WriteOptions() : sync(false), disableWAL(false) {}
+};
+
+// Options that control flush operations
+struct FlushOptions {
+  // If true, the call will block until the flush is done.
+  // Default: true
+  bool wait;
+
+  FlushOptions() : wait(true) {}
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h
new file mode 100644 (file)
index 0000000..0704ea2
--- /dev/null
@@ -0,0 +1,75 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
+#define STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
+
+#include <stdint.h>
+#include <string>
+
+namespace rocksdb {
+
+enum PerfLevel {
+  kDisable        = 0,  // collect no perf stats at all
+  kEnableCount    = 1,  // collect count stats only
+  kEnableTime     = 2   // collect both count and time stats
+};
+
+// set the perf stats level
+void SetPerfLevel(PerfLevel level);
+
+// A thread local context for gathering performance counters efficiently
+// and transparently.
+
+struct PerfContext {
+
+  void Reset(); // reset all performance counters to zero
+
+  std::string ToString() const;
+
+  uint64_t user_key_comparison_count; // total number of user key comparisons
+  uint64_t block_cache_hit_count;     // total number of block cache hits
+  uint64_t block_read_count;          // total number of block reads (with IO)
+  uint64_t block_read_byte;           // total number of bytes from block reads
+  uint64_t block_read_time;           // total time spent on block reads
+  uint64_t block_checksum_time;       // total time spent on block checksum
+  uint64_t block_decompress_time;     // total time spent on block decompression
+  // total number of internal keys skipped over during iteration (overwritten or
+  // deleted, to be more specific, hidden by a put or delete of the same key)
+  uint64_t internal_key_skipped_count;
+  // total number of deletes skipped over during iteration
+  uint64_t internal_delete_skipped_count;
+
+  uint64_t get_snapshot_time;          // total time spent on getting snapshot
+  uint64_t get_from_memtable_time;     // total time spent on querying memtables
+  uint64_t get_from_memtable_count;    // number of memtables queried
+  // total time spent after Get() finds a key
+  uint64_t get_post_process_time;
+  uint64_t get_from_output_files_time; // total time reading from output files
+  // total time spent on seeking child iters
+  uint64_t seek_child_seek_time;
+  // number of seeks issued in child iterators
+  uint64_t seek_child_seek_count;
+  uint64_t seek_min_heap_time;         // total time spent on the merge heap
+  // total time spent on seeking the internal entries
+  uint64_t seek_internal_seek_time;
+  // total time spent on iterating internal entries to find the next user entry
+  uint64_t find_next_user_entry_time;
+  // total time spent on pre or post processing when writing a record
+  uint64_t write_pre_and_post_process_time;
+  uint64_t write_wal_time;            // total time spent on writing to WAL
+  // total time spent on writing to memtables
+  uint64_t write_memtable_time;
+};
+
+#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
+extern PerfContext perf_context;
+#else
+extern __thread PerfContext perf_context;
+#endif
+
+}
+
+#endif
diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h
new file mode 100644 (file)
index 0000000..2253715
--- /dev/null
@@ -0,0 +1,136 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Slice is a simple structure containing a pointer into some external
+// storage and a size.  The user of a Slice must ensure that the slice
+// is not used after the corresponding external storage has been
+// deallocated.
+//
+// Multiple threads can invoke const methods on a Slice without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Slice must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_H_
+#define STORAGE_ROCKSDB_INCLUDE_SLICE_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+#include <string>
+
+namespace rocksdb {
+
class Slice {
 public:
  // Construct a slice that refers to an empty array.
  Slice() : data_(""), size_(0) { }

  // Construct a slice that refers to d[0,n-1].
  Slice(const char* d, size_t n) : data_(d), size_(n) { }

  // Construct a slice that refers to the contents of "s".
  /* implicit */
  Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }

  // Construct a slice that refers to s[0,strlen(s)-1].
  /* implicit */
  Slice(const char* s) : data_(s), size_(strlen(s)) { }

  // Pointer to the first byte of the referenced data.
  const char* data() const { return data_; }

  // Number of bytes in the referenced data.
  size_t size() const { return size_; }

  // True iff the slice references zero bytes.
  bool empty() const { return size_ == 0; }

  // Byte at position n.
  // REQUIRES: n < size()
  char operator[](size_t n) const {
    assert(n < size());
    return data_[n];
  }

  // Reset this slice to reference an empty array.
  void clear() {
    data_ = "";
    size_ = 0;
  }

  // Advance past the first "n" bytes of the slice.
  void remove_prefix(size_t n) {
    assert(n <= size());
    data_ += n;
    size_ -= n;
  }

  // Copy the referenced bytes into a std::string.  When "hex" is true,
  // each byte is rendered instead as an uppercase two-digit hex pair.
  std::string ToString(bool hex = false) const {
    if (!hex) {
      return std::string(data_, size_);
    }
    static const char kHexDigits[] = "0123456789ABCDEF";
    std::string result;
    result.reserve(size_ * 2);
    for (size_t pos = 0; pos < size_; ++pos) {
      const unsigned char byte = static_cast<unsigned char>(data_[pos]);
      result.push_back(kHexDigits[byte >> 4]);
      result.push_back(kHexDigits[byte & 0x0F]);
    }
    return result;
  }

  // Three-way comparison.  Returns value:
  //   <  0 iff "*this" <  "b",
  //   == 0 iff "*this" == "b",
  //   >  0 iff "*this" >  "b"
  int compare(const Slice& b) const;

  // True iff "x" is a prefix of "*this".
  bool starts_with(const Slice& x) const {
    if (x.size_ > size_) {
      return false;
    }
    return memcmp(data_, x.data_, x.size_) == 0;
  }

 // private: make these public for rocksdbjni access
  const char* data_;
  size_t size_;

  // Intentionally copyable
};
+
+// A set of Slices that are virtually concatenated together.  'parts' points
+// to an array of Slices.  The number of elements in the array is 'num_parts'.
+struct SliceParts {
+  SliceParts(const Slice* _parts, int _num_parts) :
+      parts(_parts), num_parts(_num_parts) { }
+
+  const Slice* parts;
+  int num_parts;
+};
+
+inline bool operator==(const Slice& x, const Slice& y) {
+  return ((x.size() == y.size()) &&
+          (memcmp(x.data(), y.data(), x.size()) == 0));
+}
+
// Inequality, defined as the negation of operator==.
inline bool operator!=(const Slice& x, const Slice& y) {
  return !(x == y);
}
+
+inline int Slice::compare(const Slice& b) const {
+  const int min_len = (size_ < b.size_) ? size_ : b.size_;
+  int r = memcmp(data_, b.data_, min_len);
+  if (r == 0) {
+    if (size_ < b.size_) r = -1;
+    else if (size_ > b.size_) r = +1;
+  }
+  return r;
+}
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_SLICE_H_
diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h
new file mode 100644 (file)
index 0000000..a784550
--- /dev/null
@@ -0,0 +1,47 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Class for specifying user-defined functions which perform a
+// transformation on a slice.  It is not required that every slice
+// belong to the domain and/or range of a function.  Subclasses should
+// define InDomain and InRange to determine which slices are in either
+// of these sets respectively.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
+#define STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+class SliceTransform {
+ public:
+  virtual ~SliceTransform() {};
+
+  // Return the name of this transformation.
+  virtual const char* Name() const = 0;
+
+  // transform a src in domain to a dst in the range
+  virtual Slice Transform(const Slice& src) const = 0;
+
+  // determine whether this is a valid src upon the function applies
+  virtual bool InDomain(const Slice& src) const = 0;
+
+  // determine whether dst=Transform(src) for some src
+  virtual bool InRange(const Slice& dst) const = 0;
+};
+
+extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len);
+
+extern const SliceTransform* NewNoopTransform();
+
+}
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
new file mode 100644 (file)
index 0000000..dcd82f6
--- /dev/null
@@ -0,0 +1,268 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
+#define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <memory>
+#include <vector>
+
+namespace rocksdb {
+
/**
 * Keep adding tickers here.
 *  1. Any ticker should be added before TICKER_ENUM_MAX.
 *  2. Add a readable string in TickersNameMap below for the newly added ticker.
 */
enum Tickers {
  // total block cache misses
  // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
  //                               BLOCK_CACHE_FILTER_MISS +
  //                               BLOCK_CACHE_DATA_MISS;
  BLOCK_CACHE_MISS,
  // total block cache hit
  // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
  //                              BLOCK_CACHE_FILTER_HIT +
  //                              BLOCK_CACHE_DATA_HIT;
  BLOCK_CACHE_HIT,
  // # of blocks added to block cache.
  BLOCK_CACHE_ADD,
  // # of times cache miss when accessing index block from block cache.
  BLOCK_CACHE_INDEX_MISS,
  // # of times cache hit when accessing index block from block cache.
  BLOCK_CACHE_INDEX_HIT,
  // # of times cache miss when accessing filter block from block cache.
  BLOCK_CACHE_FILTER_MISS,
  // # of times cache hit when accessing filter block from block cache.
  BLOCK_CACHE_FILTER_HIT,
  // # of times cache miss when accessing data block from block cache.
  BLOCK_CACHE_DATA_MISS,
  // # of times cache hit when accessing data block from block cache.
  BLOCK_CACHE_DATA_HIT,
  // # of times bloom filter has avoided file reads.
  BLOOM_FILTER_USEFUL,

  // # of memtable hits.
  MEMTABLE_HIT,
  // # of memtable misses.
  MEMTABLE_MISS,

  /**
   * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
   * There are 3 reasons currently.
   */
  COMPACTION_KEY_DROP_NEWER_ENTRY,  // key was written with a newer value.
  COMPACTION_KEY_DROP_OBSOLETE,     // The key is obsolete.
  COMPACTION_KEY_DROP_USER,  // user compaction function has dropped the key.

  // Number of keys written to the database via the Put and Write calls
  NUMBER_KEYS_WRITTEN,
  // Number of keys read,
  NUMBER_KEYS_READ,
  // Number keys updated, if inplace update is enabled
  NUMBER_KEYS_UPDATED,
  // Bytes written / read
  BYTES_WRITTEN,
  BYTES_READ,
  // Counts of file closes, file opens, and file errors.
  NO_FILE_CLOSES,
  NO_FILE_OPENS,
  NO_FILE_ERRORS,
  // Time system had to wait to do L0-L1 compactions
  STALL_L0_SLOWDOWN_MICROS,
  // Time system had to wait to move memtable to L1.
  STALL_MEMTABLE_COMPACTION_MICROS,
  // write throttle because of too many files in L0
  STALL_L0_NUM_FILES_MICROS,
  RATE_LIMIT_DELAY_MILLIS,
  NO_ITERATORS,  // number of iterators currently open

  // Number of MultiGet calls, keys read, and bytes read
  NUMBER_MULTIGET_CALLS,
  NUMBER_MULTIGET_KEYS_READ,
  NUMBER_MULTIGET_BYTES_READ,

  // Number of delete records that were not required to be
  // written to storage because key does not exist
  NUMBER_FILTERED_DELETES,
  NUMBER_MERGE_FAILURES,
  SEQUENCE_NUMBER,

  // number of times bloom was checked before creating iterator on a
  // file, and the number of times the check was useful in avoiding
  // iterator creation (and thus likely IOPs).
  BLOOM_FILTER_PREFIX_CHECKED,
  BLOOM_FILTER_PREFIX_USEFUL,

  // Number of times we had to reseek inside an iteration to skip
  // over large number of keys with same userkey.
  NUMBER_OF_RESEEKS_IN_ITERATION,

  // Record the number of calls to GetUpdatesSince. Useful to keep track of
  // transaction log iterator refreshes
  GET_UPDATES_SINCE_CALLS,
  BLOCK_CACHE_COMPRESSED_MISS,  // miss in the compressed block cache
  BLOCK_CACHE_COMPRESSED_HIT,   // hit in the compressed block cache
  WAL_FILE_SYNCED,              // Number of times WAL sync is done
  WAL_FILE_BYTES,               // Number of bytes written to WAL
  // Writes can be processed by requesting thread or by the thread at the
  // head of the writers queue.
  WRITE_DONE_BY_SELF,
  WRITE_DONE_BY_OTHER,
  WRITE_WITH_WAL,       // Number of Write calls that request WAL
  COMPACT_READ_BYTES,   // Bytes read during compaction
  COMPACT_WRITE_BYTES,  // Bytes written during compaction

  // Number of table's properties loaded directly from file, without creating
  // table reader object.
  NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
  NUMBER_SUPERVERSION_ACQUIRES,
  NUMBER_SUPERVERSION_RELEASES,
  NUMBER_SUPERVERSION_CLEANUPS,
  // Sentinel: must remain last.  Used to size per-ticker arrays.
  TICKER_ENUM_MAX
};
+
// The order of items listed in Tickers should be the same as
// the order listed in TickersNameMap.  The string is the externally
// visible, dotted name of the ticker.
const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
    {BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"},
    {BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"},
    {BLOCK_CACHE_ADD, "rocksdb.block.cache.add"},
    {BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"},
    {BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"},
    {BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"},
    {BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"},
    {BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"},
    {BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"},
    {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"},
    {MEMTABLE_HIT, "rocksdb.memtable.hit"},
    {MEMTABLE_MISS, "rocksdb.memtable.miss"},
    {COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"},
    {COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"},
    {COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"},
    {NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"},
    {NUMBER_KEYS_READ, "rocksdb.number.keys.read"},
    {NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"},
    {BYTES_WRITTEN, "rocksdb.bytes.written"},
    {BYTES_READ, "rocksdb.bytes.read"},
    {NO_FILE_CLOSES, "rocksdb.no.file.closes"},
    {NO_FILE_OPENS, "rocksdb.no.file.opens"},
    {NO_FILE_ERRORS, "rocksdb.no.file.errors"},
    {STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"},
    {STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"},
    {STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"},
    {RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"},
    {NO_ITERATORS, "rocksdb.num.iterators"},
    {NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"},
    {NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"},
    {NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"},
    {NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"},
    {NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"},
    {SEQUENCE_NUMBER, "rocksdb.sequence.number"},
    {BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"},
    {BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"},
    {NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"},
    {GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"},
    {BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"},
    {BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"},
    {WAL_FILE_SYNCED, "rocksdb.wal.synced"},
    {WAL_FILE_BYTES, "rocksdb.wal.bytes"},
    {WRITE_DONE_BY_SELF, "rocksdb.write.self"},
    {WRITE_DONE_BY_OTHER, "rocksdb.write.other"},
    {WRITE_WITH_WAL, "rocksdb.write.wal"},
    {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"},
    {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"},
    {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
     "rocksdb.number.direct.load.table.properties"},
    {NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"},
    {NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"},
    {NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"},
};
+
/**
 * Keep adding histograms here.
 * Any histogram should have a value less than HISTOGRAM_ENUM_MAX.
 * Add any new histogram before HISTOGRAM_ENUM_MAX and add a string
 * representation for it in HistogramsNameMap below.
 */
enum Histograms {
  DB_GET,
  DB_WRITE,
  COMPACTION_TIME,
  TABLE_SYNC_MICROS,
  COMPACTION_OUTFILE_SYNC_MICROS,
  WAL_FILE_SYNC_MICROS,
  MANIFEST_FILE_SYNC_MICROS,
  // TIME SPENT IN IO DURING TABLE OPEN
  TABLE_OPEN_IO_MICROS,
  DB_MULTIGET,
  READ_BLOCK_COMPACTION_MICROS,
  READ_BLOCK_GET_MICROS,
  WRITE_RAW_BLOCK_MICROS,

  STALL_L0_SLOWDOWN_COUNT,
  STALL_MEMTABLE_COMPACTION_COUNT,
  STALL_L0_NUM_FILES_COUNT,
  HARD_RATE_LIMIT_DELAY_COUNT,
  SOFT_RATE_LIMIT_DELAY_COUNT,
  NUM_FILES_IN_SINGLE_COMPACTION,
  // Sentinel: must remain last.  Used to size per-histogram arrays.
  HISTOGRAM_ENUM_MAX,
};
+
// The order of items listed here should be the same as the order listed
// in the Histograms enum above.
const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
  { DB_GET, "rocksdb.db.get.micros" },
  { DB_WRITE, "rocksdb.db.write.micros" },
  { COMPACTION_TIME, "rocksdb.compaction.times.micros" },
  { TABLE_SYNC_MICROS, "rocksdb.table.sync.micros" },
  { COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros" },
  { WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros" },
  { MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros" },
  { TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros" },
  { DB_MULTIGET, "rocksdb.db.multiget.micros" },
  { READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros" },
  { READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros" },
  { WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros" },
  { STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"},
  { STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"},
  { STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"},
  { HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"},
  { SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"},
  { NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction" },
};
+
// Summary statistics for a single histogram; filled in by
// Statistics::histogramData().
struct HistogramData {
  double median;              // 50th percentile of recorded values
  double percentile95;        // 95th percentile
  double percentile99;        // 99th percentile
  double average;             // arithmetic mean
  double standard_deviation;  // standard deviation of recorded values
};
+
// Analyze the performance of a db.  Implementations collect cumulative
// counters (Tickers) and value distributions (Histograms).
class Statistics {
 public:
  virtual ~Statistics() {}

  // Return the cumulative count of the given ticker.
  virtual long getTickerCount(Tickers tickerType) = 0;
  // Add 'count' to the given ticker.
  // NOTE(review): the default count of 0 makes the call a no-op increment --
  // presumably callers always pass an explicit count; confirm before relying
  // on the default.
  virtual void recordTick(Tickers tickerType, uint64_t count = 0) = 0;
  // Overwrite the given ticker with an absolute value.
  virtual void setTickerCount(Tickers tickerType, uint64_t count) = 0;
  // Record a single value into the given histogram.
  virtual void measureTime(Histograms histogramType, uint64_t time) = 0;

  // Fill '*data' with summary statistics for the given histogram.
  virtual void histogramData(Histograms type, HistogramData* const data) = 0;
  // String representation of the statistic object.
  std::string ToString();
};

// Create a concrete DBStatistics object
std::shared_ptr<Statistics> CreateDBStatistics();
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h
new file mode 100644 (file)
index 0000000..dbd41fc
--- /dev/null
@@ -0,0 +1,145 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation.  It may indicate success,
+// or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on a Status without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Status must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_STATUS_H_
+#define STORAGE_ROCKSDB_INCLUDE_STATUS_H_
+
+#include <string>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+class Status {
+ public:
+  // Create a success status.
+  Status() : code_(kOk), state_(nullptr) { }
+  ~Status() { delete[] state_; }
+
+  // Copy the specified status.
+  Status(const Status& s);
+  void operator=(const Status& s);
+
+  // Return a success status.
+  static Status OK() { return Status(); }
+
+  // Return error status of an appropriate type.
+  static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kNotFound, msg, msg2);
+  }
+  // Fast path for not found without malloc;
+  static Status NotFound() {
+    return Status(kNotFound);
+  }
+  static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kCorruption, msg, msg2);
+  }
+  static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kNotSupported, msg, msg2);
+  }
+  static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kInvalidArgument, msg, msg2);
+  }
+  static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIOError, msg, msg2);
+  }
+  static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kMergeInProgress, msg, msg2);
+  }
+  static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIncomplete, msg, msg2);
+  }
+  static Status ShutdownInProgress(const Slice& msg,
+                                   const Slice& msg2 = Slice()) {
+    return Status(kShutdownInProgress, msg, msg2);
+  }
+
+  // Returns true iff the status indicates success.
+  bool ok() const { return code() == kOk; }
+
+  // Returns true iff the status indicates a NotFound error.
+  bool IsNotFound() const { return code() == kNotFound; }
+
+  // Returns true iff the status indicates a Corruption error.
+  bool IsCorruption() const { return code() == kCorruption; }
+
+  // Returns true iff the status indicates a NotSupported error.
+  bool IsNotSupported() const { return code() == kNotSupported; }
+
+  // Returns true iff the status indicates an InvalidArgument error.
+  bool IsInvalidArgument() const { return code() == kInvalidArgument; }
+
+  // Returns true iff the status indicates an IOError.
+  bool IsIOError() const { return code() == kIOError; }
+
+  // Returns true iff the status indicates an MergeInProgress.
+  bool IsMergeInProgress() const { return code() == kMergeInProgress; }
+
+  // Returns true iff the status indicates Incomplete
+  bool IsIncomplete() const { return code() == kIncomplete; }
+
+  // Returns true iff the status indicates Incomplete
+  bool IsShutdownInProgress() const { return code() == kShutdownInProgress; }
+
+  // Return a string representation of this status suitable for printing.
+  // Returns the string "OK" for success.
+  std::string ToString() const;
+
+ private:
+  enum Code {
+    kOk = 0,
+    kNotFound = 1,
+    kCorruption = 2,
+    kNotSupported = 3,
+    kInvalidArgument = 4,
+    kIOError = 5,
+    kMergeInProgress = 6,
+    kIncomplete = 7,
+    kShutdownInProgress = 8
+  };
+
+  // A nullptr state_ (which is always the case for OK) means the message
+  // is empty.
+  // of the following form:
+  //    state_[0..3] == length of message
+  //    state_[4..]  == message
+  Code code_;
+  const char* state_;
+
+  Code code() const {
+    return code_;
+  }
+  explicit Status(Code code) : code_(code), state_(nullptr) { }
+  Status(Code code, const Slice& msg, const Slice& msg2);
+  static const char* CopyState(const char* s);
+};
+
+inline Status::Status(const Status& s) {
+  code_ = s.code_;
+  state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
+}
+inline void Status::operator=(const Status& s) {
+  // The following condition catches both aliasing (when this == &s),
+  // and the common case where both s and *this are ok.
+  code_ = s.code_;
+  if (state_ != s.state_) {
+    delete[] state_;
+    state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
+  }
+}
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_STATUS_H_
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
new file mode 100644 (file)
index 0000000..14a505a
--- /dev/null
@@ -0,0 +1,193 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Currently we support two types of tables: plain table and block-based table.
+//   1. Block-based table: this is the default table type that we inherited from
+//      LevelDB, which was designed for storing data in hard disk or flash
+//      device.
+//   2. Plain table: it is one of RocksDB's SST file format optimized
+//      for low query latency on pure-memory or really low-latency media.
+//
+// A tutorial of rocksdb table formats is available here:
+//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
+//
+// Example code is also available
+//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples
+
+#pragma once
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+// -- Block-based Table
+class FlushBlockPolicyFactory;
+class RandomAccessFile;
+class TableBuilder;
+class TableReader;
+class WritableFile;
+struct EnvOptions;
+struct Options;
+
+using std::unique_ptr;
+
// Checksum algorithms supported for protecting table blocks.
enum ChecksumType : char {
  kNoChecksum = 0x0,  // not yet supported. Will fail
  kCRC32c = 0x1,
  kxxHash = 0x2,
};
+
// For advanced user only
struct BlockBasedTableOptions {
  // @flush_block_policy_factory creates the instances of flush block policy,
  // which provides a configurable way to determine when to flush a block in
  // the block based tables.  If not set, table builder will use the default
  // block flush policy, which cuts blocks by block size (please refer to
  // `FlushBlockBySizePolicy`).
  std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;

  // TODO(kailiu) Temporarily disable this feature by making the default value
  // to be false.
  //
  // Indicates whether index/filter blocks are put into the block cache.
  // If not specified, each "table reader" object will pre-load the
  // index/filter block during table initialization.
  bool cache_index_and_filter_blocks = false;

  // The index type that will be used for this table.
  enum IndexType : char {
    // A space efficient index block that is optimized for
    // binary-search-based index.
    kBinarySearch,

    // The hash index, if enabled, will do the hash lookup when
    // `Options.prefix_extractor` is provided.
    kHashSearch,
  };

  IndexType index_type = kBinarySearch;

  // Use the specified checksum type. Newly created table files will be
  // protected with this checksum type. Old table files will still be readable,
  // even though they have different checksum type.
  ChecksumType checksum = kCRC32c;
};
+
// Table Properties that are specific to block-based table properties.
struct BlockBasedTablePropertyNames {
  // value of this property is a fixed int32 number.
  static const std::string kIndexType;
};
+
+// Create default block based table factory.
+extern TableFactory* NewBlockBasedTableFactory(
+    const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
+
+#ifndef ROCKSDB_LITE
+// -- Plain Table with prefix-only seek
// For this factory, you need to set Options.prefix_extractor properly to make it
// work. Look-up starts with a prefix hash lookup for the key prefix. Inside the
+// hash bucket found, a binary search is executed for hash conflicts. Finally,
+// a linear search is used.
+// @user_key_len: plain table has optimization for fix-sized keys, which can be
+//                specified via user_key_len.  Alternatively, you can pass
+//                `kPlainTableVariableLength` if your keys have variable
+//                lengths.
// @bloom_bits_per_prefix: the number of bits used for bloom filter per prefix.
//                         You may disable it by passing a zero.
+// @hash_table_ratio: the desired utilization of the hash table used for prefix
+//                    hashing. hash_table_ratio = number of prefixes / #buckets
+//                    in the hash table
+// @index_sparseness: inside each prefix, need to build one index record for how
+//                    many keys for binary search inside each hash bucket.
+const uint32_t kPlainTableVariableLength = 0;
+extern TableFactory* NewPlainTableFactory(uint32_t user_key_len =
+                                              kPlainTableVariableLength,
+                                          int bloom_bits_per_prefix = 10,
+                                          double hash_table_ratio = 0.75,
+                                          size_t index_sparseness = 16);
+
+// -- Plain Table
+// This factory of plain table ignores Options.prefix_extractor and assumes no
+// hashable prefix available to the key structure. Lookup will be based on
+// binary search index only. Total order seek() can be issued.
+// @user_key_len: plain table has optimization for fix-sized keys, which can be
+//                specified via user_key_len.  Alternatively, you can pass
+//                `kPlainTableVariableLength` if your keys have variable
+//                lengths.
// @bloom_bits_per_key: the number of bits used for bloom filter per key. You
//                  may disable it by passing a zero.
+// @index_sparseness: need to build one index record for how many keys for
+//                    binary search.
+extern TableFactory* NewTotalOrderPlainTableFactory(
+    uint32_t user_key_len = kPlainTableVariableLength,
+    int bloom_bits_per_key = 0, size_t index_sparseness = 16);
+
+#endif  // ROCKSDB_LITE
+
// A base class for table factories.
class TableFactory {
 public:
  virtual ~TableFactory() {}

  // The type of the table.
  //
  // The client of this package should switch to a new name whenever
  // the table format implementation changes.
  //
  // Names starting with "rocksdb." are reserved and should not be used
  // by any clients of this package.
  virtual const char* Name() const = 0;

  // Returns a Table object table that can fetch data from file specified
  // in parameter file. It's the caller's responsibility to make sure
  // file is in the correct format.
  //
  // NewTableReader() is called in two places:
  // (1) TableCache::FindTable() calls the function when table cache miss
  //     and cache the table object returned.
  // (2) SstFileReader (for SST Dump) opens the table and dump the table
  //     contents using the iterator of the table.
  // options and soptions are options. options is the general options.
  // Multiple configuration fields can be accessed from there, including and
  // not limited to block cache and key comparators.
  // file is a file handler to handle the file for the table
  // file_size is the physical file size of the file
  // table_reader is the output table reader
  virtual Status NewTableReader(
      const Options& options, const EnvOptions& soptions,
      const InternalKeyComparator& internal_comparator,
      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
      unique_ptr<TableReader>* table_reader) const = 0;

  // Return a table builder to write to a file for this table type.
  //
  // It is called in several places:
  // (1) When flushing memtable to a level-0 output file, it creates a table
  //     builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
  // (2) During compaction, it gets the builder for writing compaction output
  //     files in DBImpl::OpenCompactionOutputFile().
  // (3) When recovering from transaction logs, it creates a table builder to
  //     write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
  //     by calling BuildTable())
  // (4) When running Repairer, it creates a table builder to convert logs to
  //     SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
  //
  // options is the general options. Multiple configuration fields can be
  // accessed from there, including and not limited to compression options.
  // file is a handle of a writable file. It is the caller's responsibility to
  // keep the file open and close the file after closing the table builder.
  // compression_type is the compression type to use in this table.
  virtual TableBuilder* NewTableBuilder(
      const Options& options, const InternalKeyComparator& internal_comparator,
      WritableFile* file, CompressionType compression_type) const = 0;
};
+
+}  // namespace rocksdb
diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h
new file mode 100644 (file)
index 0000000..aa8b8a0
--- /dev/null
@@ -0,0 +1,112 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <string>
+#include <map>
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+// -- Table Properties
+// Other than basic table properties, each table may also have the user
+// collected properties.
+// The values of the user-collected properties are encoded as raw bytes --
+// users have to interpret these values by themselves.
+// Note: To do prefix seek/scan in `UserCollectedProperties`, you can do
+// something similar to:
+//
+// UserCollectedProperties props = ...;
+// for (auto pos = props.lower_bound(prefix);
+//      pos != props.end() && pos->first.compare(0, prefix.size(), prefix) == 0;
+//      ++pos) {
+//   ...
+// }
+typedef std::map<const std::string, std::string> UserCollectedProperties;
+
+// TableProperties contains a bunch of read-only properties of its associated
+// table.
+struct TableProperties {
+ public:
+  // the total size of all data blocks.
+  uint64_t data_size = 0;
+  // the size of index block.
+  uint64_t index_size = 0;
+  // the size of filter block.
+  uint64_t filter_size = 0;
+  // total raw key size
+  uint64_t raw_key_size = 0;
+  // total raw value size
+  uint64_t raw_value_size = 0;
+  // the number of blocks in this table
+  uint64_t num_data_blocks = 0;
+  // the number of entries in this table
+  uint64_t num_entries = 0;
+  // format version, reserved for backward compatibility
+  uint64_t format_version = 0;
+  // If 0, key is variable length. Otherwise number of bytes for each key.
+  uint64_t fixed_key_len = 0;
+
+  // The name of the filter policy used in this table.
+  // If no filter policy is used, `filter_policy_name` will be an empty string.
+  std::string filter_policy_name;
+
+  // user collected properties
+  UserCollectedProperties user_collected_properties;
+
+  // convert this object to a human readable form
+  //   @prop_delim: delimiter for each property.
+  std::string ToString(const std::string& prop_delim = "; ",
+                       const std::string& kv_delim = "=") const;
+};
+
+// table properties' human-readable names in the property block.
+struct TablePropertiesNames {
+  static const std::string kDataSize;
+  static const std::string kIndexSize;
+  static const std::string kFilterSize;
+  static const std::string kRawKeySize;
+  static const std::string kRawValueSize;
+  static const std::string kNumDataBlocks;
+  static const std::string kNumEntries;
+  static const std::string kFormatVersion;
+  static const std::string kFixedKeyLen;
+  static const std::string kFilterPolicy;
+};
+
+extern const std::string kPropertiesBlock;
+
+// `TablePropertiesCollector` provides the mechanism for users to collect
+// their own interested properties. This class is essentially a collection
+//  of callback functions that will be invoked during table building.
+class TablePropertiesCollector {
+ public:
+  virtual ~TablePropertiesCollector() {}
+
+  // Add() will be called when a new key/value pair is inserted into the table.
+  // @params key    the original key that is inserted into the table.
+  // @params value  the original value that is inserted into the table.
+  virtual Status Add(const Slice& key, const Slice& value) = 0;
+
+  // Finish() will be called when a table has already been built and is ready
+  // for writing the properties block.
+  // @params properties  User will add their collected statistics to
+  // `properties`.
+  virtual Status Finish(UserCollectedProperties* properties) = 0;
+
+  // The name of the properties collector can be used for debugging purpose.
+  virtual const char* Name() const = 0;
+
+  // Return the human-readable properties, where the key is property name and
+  // the value is the human-readable form of value.
+  virtual UserCollectedProperties GetReadableProperties() const = 0;
+};
+
+// Extra properties
+// Below is a list of non-basic properties that are collected by the database
+// itself, especially some properties regarding the internal keys (which
+// are unknown to `table`).
+extern uint64_t GetDeletedKeys(const UserCollectedProperties& props);
+
+}  // namespace rocksdb
diff --git a/include/rocksdb/transaction_log.h b/include/rocksdb/transaction_log.h
new file mode 100644 (file)
index 0000000..30443bb
--- /dev/null
@@ -0,0 +1,104 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include <memory>
+#include <vector>
+
+namespace rocksdb {
+
+class LogFile;
+typedef std::vector<std::unique_ptr<LogFile>> VectorLogPtr;
+
+enum  WalFileType {
+  /* Indicates that WAL file is in archive directory. WAL files are moved from
+   * the main db directory to archive directory once they are not live and stay
+   * there until cleaned up. Files are cleaned depending on archive size
+   * (Options::WAL_size_limit_MB) and time since last cleaning
+   * (Options::WAL_ttl_seconds).
+   */
+  kArchivedLogFile = 0,
+
+  /* Indicates that WAL file is live and resides in the main db directory */
+  kAliveLogFile = 1
+} ;
+
+class LogFile {
+ public:
+  LogFile() {}
+  virtual ~LogFile() {}
+
+  // Returns log file's pathname relative to the main db dir
+  // Eg. For a live-log-file = /000003.log
+  //     For an archived-log-file = /archive/000003.log
+  virtual std::string PathName() const = 0;
+
+
+  // Primary identifier for log file.
+  // This is directly proportional to creation time of the log file
+  virtual uint64_t LogNumber() const = 0;
+
+  // Log file can be either alive or archived
+  virtual WalFileType Type() const = 0;
+
+  // Starting sequence number of writebatch written in this log file
+  virtual SequenceNumber StartSequence() const = 0;
+
+  // Size of log file on disk in Bytes
+  virtual uint64_t SizeFileBytes() const = 0;
+};
+
+struct BatchResult {
+  SequenceNumber sequence = 0;
+  std::unique_ptr<WriteBatch> writeBatchPtr;
+};
+
+// A TransactionLogIterator is used to iterate over the transactions in a db.
+// One run of the iterator is continuous, i.e. the iterator will stop at the
+// beginning of any gap in sequences
+class TransactionLogIterator {
+ public:
+  TransactionLogIterator() {}
+  virtual ~TransactionLogIterator() {}
+
+  // An iterator is either positioned at a WriteBatch or not valid.
+  // This method returns true if the iterator is valid.
+  // Can read data from a valid iterator.
+  virtual bool Valid() = 0;
+
+  // Moves the iterator to the next WriteBatch.
+  // REQUIRES: Valid() to be true.
+  virtual void Next() = 0;
+
+  // Returns ok if the iterator is valid.
+  // Returns the Error when something has gone wrong.
+  virtual Status status() = 0;
+
+  // If valid return's the current write_batch and the sequence number of the
+  // earliest transaction contained in the batch.
+  // ONLY use if Valid() is true and status() is OK.
+  virtual BatchResult GetBatch() = 0;
+
+  // The read options for TransactionLogIterator.
+  struct ReadOptions {
+    // If true, all data read from underlying storage will be
+    // verified against corresponding checksums.
+    // Default: true
+    bool verify_checksums_;
+
+    ReadOptions() : verify_checksums_(true) {}
+
+    explicit ReadOptions(bool verify_checksums)
+        : verify_checksums_(verify_checksums) {}
+  };
+};
+} //  namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h
new file mode 100644 (file)
index 0000000..f20bf82
--- /dev/null
@@ -0,0 +1,20 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_TYPES_H_
+#define STORAGE_ROCKSDB_INCLUDE_TYPES_H_
+
+#include <stdint.h>
+
+namespace rocksdb {
+
+// Define all public custom types here.
+
+// Represents a sequence number in a WAL file.
+typedef uint64_t SequenceNumber;
+
+}  //  namespace rocksdb
+
+#endif //  STORAGE_ROCKSDB_INCLUDE_TYPES_H_
diff --git a/include/rocksdb/universal_compaction.h b/include/rocksdb/universal_compaction.h
new file mode 100644 (file)
index 0000000..eaf47e5
--- /dev/null
@@ -0,0 +1,83 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
+#define STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
+
+#include <stdint.h>
+#include <climits>
+
+namespace rocksdb {
+
+//
+// Algorithm used to make a compaction request stop picking new files
+// into a single compaction run
+//
+enum CompactionStopStyle {
+  kCompactionStopStyleSimilarSize, // pick files of similar size
+  kCompactionStopStyleTotalSize    // total size of picked files > next file
+};
+
+class CompactionOptionsUniversal {
+ public:
+
+  // Percentage flexibility while comparing file size. If the candidate file(s)
+  // size is 1% smaller than the next file's size, then include next file into
+  // this candidate set. Default: 1
+  unsigned int size_ratio;
+
+  // The minimum number of files in a single compaction run. Default: 2
+  unsigned int min_merge_width;
+
+  // The maximum number of files in a single compaction run. Default: UINT_MAX
+  unsigned int max_merge_width;
+
+  // The size amplification is defined as the amount (in percentage) of
+  // additional storage needed to store a single byte of data in the database.
+  // For example, a size amplification of 2% means that a database that
+  // contains 100 bytes of user-data may occupy up to 102 bytes of
+  // physical storage. By this definition, a fully compacted database has
+  // a size amplification of 0%. Rocksdb uses the following heuristic
+  // to calculate size amplification: it assumes that all files excluding
+  // the earliest file contribute to the size amplification.
+  // Default: 200, which means that a 100 byte database could require up to
+  // 300 bytes of storage.
+  unsigned int max_size_amplification_percent;
+
+  // If this option is set to be -1 (the default value), all the output files
+  // will follow compression type specified.
+  //
+  // If this option is not negative, we will try to make sure compressed
+  // size is just above this value. In normal cases, at least this percentage
+  // of data will be compressed.
+  // When we are compacting to a new file, here is the criteria whether
+  // it needs to be compressed: assuming here are the list of files sorted
+  // by generation time:
+  //    A1...An B1...Bm C1...Ct
+  // where A1 is the newest and Ct is the oldest, and we are going to compact
+  // B1...Bm, we calculate the total size of all the files as total_size, as
+  // well as  the total size of C1...Ct as total_C, the compaction output file
+  // will be compressed iff
+  //   total_C / total_size < this percentage
+  int compression_size_percent;
+
+  // The algorithm used to stop picking files into a single compaction run
+  // Default: kCompactionStopStyleTotalSize
+  CompactionStopStyle stop_style;
+
+  // Default set of parameters
+  CompactionOptionsUniversal() :
+    size_ratio(1),
+    min_merge_width(2),
+    max_merge_width(UINT_MAX),
+    max_size_amplification_percent(200),
+    compression_size_percent(-1),
+    stop_style(kCompactionStopStyleTotalSize) {
+  }
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
new file mode 100644 (file)
index 0000000..6aeabc2
--- /dev/null
@@ -0,0 +1,6 @@
+#pragma once
+
+// Also update Makefile if you change these
+#define __ROCKSDB_MAJOR__ 3
+#define __ROCKSDB_MINOR__ 0
+#define __ROCKSDB_PATCH__ 0
diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h
new file mode 100644 (file)
index 0000000..74ee2ad
--- /dev/null
@@ -0,0 +1,158 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch holds a collection of updates to apply atomically to a DB.
+//
+// The updates are applied in the order in which they are added
+// to the WriteBatch.  For example, the value of "key" will be "v3"
+// after the following batch is written:
+//
+//    batch.Put("key", "v1");
+//    batch.Delete("key");
+//    batch.Put("key", "v2");
+//    batch.Put("key", "v3");
+//
+// Multiple threads can invoke const methods on a WriteBatch without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same WriteBatch must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
+#define STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
+
+#include <string>
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class Slice;
+class ColumnFamilyHandle;
+struct SliceParts;
+
+class WriteBatch {
+ public:
+  explicit WriteBatch(size_t reserved_bytes = 0);
+  ~WriteBatch();
+
+  // Store the mapping "key->value" in the database.
+  void Put(ColumnFamilyHandle* column_family, const Slice& key,
+           const Slice& value);
+  void Put(const Slice& key, const Slice& value) {
+    Put(nullptr, key, value);
+  }
+
+  // Variant of Put() that gathers output like writev(2).  The key and value
+  // that will be written to the database are concatenations of arrays of
+  // slices.
+  void Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+           const SliceParts& value);
+  void Put(const SliceParts& key, const SliceParts& value) {
+    Put(nullptr, key, value);
+  }
+
+  // Merge "value" with the existing value of "key" in the database.
+  // "key->merge(existing, value)"
+  void Merge(ColumnFamilyHandle* column_family, const Slice& key,
+             const Slice& value);
+  void Merge(const Slice& key, const Slice& value) {
+    Merge(nullptr, key, value);
+  }
+
+  // If the database contains a mapping for "key", erase it.  Else do nothing.
+  void Delete(ColumnFamilyHandle* column_family, const Slice& key);
+  void Delete(const Slice& key) { Delete(nullptr, key); }
+
+  // Append a blob of arbitrary size to the records in this batch. The blob will
+  // be stored in the transaction log but not in any other file. In particular,
+  // it will not be persisted to the SST files. When iterating over this
+  // WriteBatch, WriteBatch::Handler::LogData will be called with the contents
+  // of the blob as it is encountered. Blobs, puts, deletes, and merges will be
+  // encountered in the same order in which they were inserted. The blob will
+  // NOT consume sequence number(s) and will NOT increase the count of the batch
+  //
+  // Example application: add timestamps to the transaction log for use in
+  // replication.
+  void PutLogData(const Slice& blob);
+
+  // Clear all updates buffered in this batch.
+  void Clear();
+
+  // Support for iterating over the contents of a batch.
+  class Handler {
+   public:
+    virtual ~Handler();
+    // default implementation will just call Put without column family for
+    // backwards compatibility. If the column family is not default,
+    // the function is noop
+    virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+                         const Slice& value) {
+      if (column_family_id == 0) {
+        // Put() historically doesn't return status. We didn't want to be
+        // backwards incompatible so we didn't change the return status
+        // (this is a public API). We do an ordinary put and return Status::OK()
+        Put(key, value);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and PutCF not implemented");
+    }
+    virtual void Put(const Slice& key, const Slice& value);
+    // Merge and LogData are not pure virtual. Otherwise, we would break
+    // existing clients of Handler on a source code level. The default
+    // implementation of Merge simply throws a runtime exception.
+    virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
+                           const Slice& value) {
+      if (column_family_id == 0) {
+        Merge(key, value);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and MergeCF not implemented");
+    }
+    virtual void Merge(const Slice& key, const Slice& value);
+    // The default implementation of LogData does nothing.
+    virtual void LogData(const Slice& blob);
+    virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+      if (column_family_id == 0) {
+        Delete(key);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and DeleteCF not implemented");
+    }
+    virtual void Delete(const Slice& key);
+    // Continue is called by WriteBatch::Iterate. If it returns false,
+    // iteration is halted. Otherwise, it continues iterating. The default
+    // implementation always returns true.
+    virtual bool Continue();
+  };
+  Status Iterate(Handler* handler) const;
+
+  // Retrieve the serialized version of this batch.
+  const std::string& Data() const { return rep_; }
+
+  // Retrieve data size of the batch.
+  size_t GetDataSize() const { return rep_.size(); }
+
+  // Returns the number of updates in the batch
+  int Count() const;
+
+  // Constructor with a serialized string object
+  explicit WriteBatch(std::string rep): rep_(rep) {}
+
+ private:
+  friend class WriteBatchInternal;
+
+  std::string rep_;  // See comment in write_batch.cc for the format of rep_
+
+  // Intentionally copyable
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
diff --git a/include/utilities/backupable_db.h b/include/utilities/backupable_db.h
new file mode 100644 (file)
index 0000000..617fe8a
--- /dev/null
@@ -0,0 +1,251 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <string>
+#include <map>
+#include <vector>
+
+#include "utilities/stackable_db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+struct BackupableDBOptions {
+  // Where to keep the backup files. Has to be different than dbname_
+  // Best to set this to dbname_ + "/backups"
+  // Required
+  std::string backup_dir;
+
+  // Backup Env object. It will be used for backup file I/O. If it's
+  // nullptr, backups will be written out using DBs Env. If it's
+  // non-nullptr, backup's I/O will be performed using this object.
+  // If you want to have backups on HDFS, use HDFS Env here!
+  // Default: nullptr
+  Env* backup_env;
+
+  // If share_table_files == true, backup will assume that table files with
+  // same name have the same contents. This enables incremental backups and
+  // avoids unnecessary data copies.
+  // If share_table_files == false, each backup will be on its own and will
+  // not share any data with other backups.
+  // default: true
+  bool share_table_files;
+
+  // Backup info and error messages will be written to info_log
+  // if non-nullptr.
+  // Default: nullptr
+  Logger* info_log;
+
+  // If sync == true, we can guarantee you'll get consistent backup even
+  // on a machine crash/reboot. Backup process is slower with sync enabled.
+  // If sync == false, we don't guarantee anything on machine reboot. However,
+  // chances are some of the backups are consistent.
+  // Default: true
+  bool sync;
+
+  // If true, it will delete whatever backups there are already
+  // Default: false
+  bool destroy_old_data;
+
+  // If false, we won't backup log files. This option can be useful for backing
+  // up in-memory databases where log file are persisted, but table files are in
+  // memory.
+  // Default: true
+  bool backup_log_files;
+
+  // Max bytes that can be transferred in a second during backup.
+  // If 0, go as fast as you can
+  // Default: 0
+  uint64_t backup_rate_limit;
+
+  // Max bytes that can be transferred in a second during restore.
+  // If 0, go as fast as you can
+  // Default: 0
+  uint64_t restore_rate_limit;
+
+  // Only used if share_table_files is set to true. If true, will consider that
+  // backups can come from different databases, hence an sst is not uniquely
+  // identified by its name, but by the triple (file name, crc32, file length)
+  // Default: false
+  // Note: this is an experimental option, and you'll need to set it manually
+  // *turn it on only if you know what you're doing*
+  bool share_files_with_checksum;
+
+  void Dump(Logger* logger) const;
+
+  explicit BackupableDBOptions(const std::string& _backup_dir,
+                               Env* _backup_env = nullptr,
+                               bool _share_table_files = true,
+                               Logger* _info_log = nullptr, bool _sync = true,
+                               bool _destroy_old_data = false,
+                               bool _backup_log_files = true,
+                               uint64_t _backup_rate_limit = 0,
+                               uint64_t _restore_rate_limit = 0)
+      : backup_dir(_backup_dir),
+        backup_env(_backup_env),
+        share_table_files(_share_table_files),
+        info_log(_info_log),
+        sync(_sync),
+        destroy_old_data(_destroy_old_data),
+        backup_log_files(_backup_log_files),
+        backup_rate_limit(_backup_rate_limit),
+        restore_rate_limit(_restore_rate_limit),
+        share_files_with_checksum(false) {
+    assert(share_table_files || !share_files_with_checksum);
+  }
+};
+
+struct RestoreOptions {
+  // If true, restore won't overwrite the existing log files in wal_dir. It will
+  // also move all log files from archive directory to wal_dir. Use this option
+  // in combination with BackupableDBOptions::backup_log_files = false for
+  // persisting in-memory databases.
+  // Default: false
+  bool keep_log_files;
+
+  explicit RestoreOptions(bool _keep_log_files = false)
+      : keep_log_files(_keep_log_files) {}
+};
+
+typedef uint32_t BackupID;
+
+struct BackupInfo {
+  BackupID backup_id;
+  int64_t timestamp;
+  uint64_t size;
+
+  BackupInfo() {}
+  BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size)
+      : backup_id(_backup_id), timestamp(_timestamp), size(_size) {}
+};
+
+class BackupEngineReadOnly {
+ public:
+  virtual ~BackupEngineReadOnly() {}
+
+  static BackupEngineReadOnly* NewReadOnlyBackupEngine(
+      Env* db_env, const BackupableDBOptions& options);
+
+  // You can GetBackupInfo safely, even with other BackupEngine performing
+  // backups on the same directory
+  virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) = 0;
+
+  // Restoring DB from backup is NOT safe when there is another BackupEngine
+  // running that might call DeleteBackup() or PurgeOldBackups(). It is caller's
+  // responsibility to synchronize the operation, i.e. don't delete the backup
+  // when you're restoring from it
+  virtual Status RestoreDBFromBackup(
+      BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) = 0;
+  virtual Status RestoreDBFromLatestBackup(
+      const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) = 0;
+};
+
+// Please see the documentation in BackupableDB and RestoreBackupableDB
+class BackupEngine {
+ public:
+  virtual ~BackupEngine() {}
+
+  static BackupEngine* NewBackupEngine(Env* db_env,
+                                       const BackupableDBOptions& options);
+
+  virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false) = 0;
+  virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0;
+  virtual Status DeleteBackup(BackupID backup_id) = 0;
+  virtual void StopBackup() = 0;
+
+  virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) = 0;
+  virtual Status RestoreDBFromBackup(
+      BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) = 0;
+  virtual Status RestoreDBFromLatestBackup(
+      const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) = 0;
+};
+
+// Stack your DB with BackupableDB to be able to backup the DB
+class BackupableDB : public StackableDB {
+ public:
+  // BackupableDBOptions have to be the same as the ones used in a previous
+  // incarnation of the DB
+  //
+  // BackupableDB owns the pointer `DB* db` now. You should not delete it or
+  // use it after the invocation of BackupableDB
+  BackupableDB(DB* db, const BackupableDBOptions& options);
+  virtual ~BackupableDB();
+
+  // Captures the state of the database in the latest backup
+  // NOT a thread safe call
+  Status CreateNewBackup(bool flush_before_backup = false);
+  // Returns info about backups in backup_info
+  void GetBackupInfo(std::vector<BackupInfo>* backup_info);
+  // deletes old backups, keeping latest num_backups_to_keep alive
+  Status PurgeOldBackups(uint32_t num_backups_to_keep);
+  // deletes a specific backup
+  Status DeleteBackup(BackupID backup_id);
+  // Call this from another thread if you want to stop the backup
+  // that is currently happening. It will return immediately, will
+  // not wait for the backup to stop.
+  // The backup will stop ASAP and the call to CreateNewBackup will
+  // return Status::Incomplete(). It will not clean up after itself, but
+  // the state will remain consistent. The state will be cleaned up
+  // next time you create BackupableDB or RestoreBackupableDB.
+  void StopBackup();
+
+ private:
+  BackupEngine* backup_engine_;
+};
+
+// Use this class to access information about backups and restore from them
+class RestoreBackupableDB {
+ public:
+  RestoreBackupableDB(Env* db_env, const BackupableDBOptions& options);
+  ~RestoreBackupableDB();
+
+  // Returns info about backups in backup_info
+  void GetBackupInfo(std::vector<BackupInfo>* backup_info);
+
+  // restore from backup with backup_id
+  // IMPORTANT -- if options_.share_table_files == true and you restore DB
+  // from some backup that is not the latest, and you start creating new
+  // backups from the new DB, they will probably fail
+  //
+  // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3.
+  // If you add new data to the DB and try creating a new backup now, the
+  // database will diverge from backups 4 and 5 and the new backup will fail.
+  // If you want to create new backup, you will first have to delete backups 4
+  // and 5.
+  Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir,
+                             const std::string& wal_dir,
+                             const RestoreOptions& restore_options =
+                                 RestoreOptions());
+
+  // restore from the latest backup
+  Status RestoreDBFromLatestBackup(const std::string& db_dir,
+                                   const std::string& wal_dir,
+                                   const RestoreOptions& restore_options =
+                                       RestoreOptions());
+  // deletes old backups, keeping latest num_backups_to_keep alive
+  Status PurgeOldBackups(uint32_t num_backups_to_keep);
+  // deletes a specific backup
+  Status DeleteBackup(BackupID backup_id);
+
+ private:
+  BackupEngine* backup_engine_;
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/include/utilities/db_ttl.h b/include/utilities/db_ttl.h
new file mode 100644 (file)
index 0000000..e99744d
--- /dev/null
@@ -0,0 +1,68 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "utilities/stackable_db.h"
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+// Database with TTL support.
+//
+// USE-CASES:
+// This API should be used to open the db when key-values inserted are
+//  meant to be removed from the db in a non-strict 'ttl' amount of time
+//  Therefore, this guarantees that key-values inserted will remain in the
+//  db for >= ttl amount of time and the db will make efforts to remove the
+//  key-values as soon as possible after ttl seconds of their insertion.
+//
+// BEHAVIOUR:
+// TTL is accepted in seconds
+// (int32_t)Timestamp(creation) is suffixed to values in Put internally
+// Expired TTL values deleted in compaction only:(Timestamp+ttl<time_now)
+// Get/Iterator may return expired entries(compaction not run on them yet)
+// Different TTL may be used during different Opens
+// Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2
+//          Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5
+// read_only=true opens in the usual read-only mode. Compactions will not be
+//  triggered(neither manual nor automatic), so no expired entries removed
+//
+// CONSTRAINTS:
+// Not specifying/passing or non-positive TTL behaves like TTL = infinity
+//
+// !!!WARNING!!!:
+// Calling DB::Open directly to re-open a db created by this API will get
+//  corrupt values(timestamp suffixed) and no ttl effect will be there
+//  during the second Open, so use this API consistently to open the db
+// Be careful when passing ttl with a small positive value because the
+//  whole database may be deleted in a small amount of time
+
+class DBWithTTL : public StackableDB {
+ public:
+  virtual Status CreateColumnFamilyWithTtl(
+      const ColumnFamilyOptions& options, const std::string& column_family_name,
+      ColumnFamilyHandle** handle, int ttl) = 0;
+
+  static Status Open(const Options& options, const std::string& dbname,
+                     DBWithTTL** dbptr, int32_t ttl = 0,
+                     bool read_only = false);
+
+  static Status Open(const DBOptions& db_options, const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     DBWithTTL** dbptr, std::vector<int32_t> ttls,
+                     bool read_only = false);
+
+ protected:
+  explicit DBWithTTL(DB* db) : StackableDB(db) {}
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/include/utilities/geo_db.h b/include/utilities/geo_db.h
new file mode 100644 (file)
index 0000000..87ff5e6
--- /dev/null
@@ -0,0 +1,105 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#ifndef ROCKSDB_LITE
+#pragma once
+#include <string>
+#include <vector>
+
+#include "utilities/stackable_db.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+//
+// Configurable options needed for setting up a Geo database
+//
+struct GeoDBOptions {
+  // Info and error messages will be written to info_log
+  // if non-nullptr.
+  // Default: nullptr
+  Logger* info_log;
+
+  explicit GeoDBOptions(Logger* _info_log = nullptr):info_log(_info_log) { }
+};
+
+//
+// A position in the earth's geoid
+//
+class GeoPosition {
+ public:
+  double latitude;
+  double longitude;
+
+  explicit GeoPosition(double la = 0, double lo = 0) :
+    latitude(la), longitude(lo) {
+  }
+};
+
+//
+// Description of an object on the Geoid. It is located by a GPS location,
+// and is identified by the id. The value associated with this object is
+// an opaque string 'value'. Different objects identified by unique id's
+// can have the same gps-location associated with them.
+//
+class GeoObject {
+ public:
+  GeoPosition position;
+  std::string id;
+  std::string value;
+
+  GeoObject() {}
+
+  GeoObject(const GeoPosition& pos, const std::string& i,
+            const std::string& val) :
+    position(pos), id(i), value(val) {
+  }
+};
+
+//
+// Stack your DB with GeoDB to be able to get geo-spatial support
+//
+class GeoDB : public StackableDB {
+ public:
+  // GeoDBOptions have to be the same as the ones used in a previous
+  // incarnation of the DB
+  //
+  // GeoDB owns the pointer `DB* db` now. You should not delete it or
+  // use it after the invocation of GeoDB
+  // GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {}
+  GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {}
+  virtual ~GeoDB() {}
+
+  // Insert a new object into the location database. The object is
+  // uniquely identified by the id. If an object with the same id already
+  // exists in the db, then the old one is overwritten by the new
+  // object being inserted here.
+  virtual Status Insert(const GeoObject& object) = 0;
+
+  // Retrieve the value of the object located at the specified GPS
+  // location and is identified by the 'id'.
+  virtual Status GetByPosition(const GeoPosition& pos,
+                               const Slice& id, std::string* value) = 0;
+
+  // Retrieve the value of the object identified by the 'id'. This method
+  // could be potentially slower than GetByPosition
+  virtual Status GetById(const Slice& id, GeoObject*  object) = 0;
+
+  // Delete the specified object
+  virtual Status Remove(const Slice& id) = 0;
+
+  // Returns a list of all items within a circular radius from the
+  // specified gps location. If 'number_of_values' is specified,
+  // then this call returns at most that many number of objects.
+  // The radius is specified in 'meters'.
+  virtual Status SearchRadial(const GeoPosition& pos,
+                              double radius,
+                              std::vector<GeoObject>* values,
+                              int number_of_values = INT_MAX) = 0;
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/include/utilities/stackable_db.h b/include/utilities/stackable_db.h
new file mode 100644 (file)
index 0000000..7927c2a
--- /dev/null
@@ -0,0 +1,215 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+// This class contains APIs to stack rocksdb wrappers. Eg. Stack TTL over base db
+class StackableDB : public DB {
+ public:
+  // StackableDB is the owner of db now!
+  explicit StackableDB(DB* db) : db_(db) {}
+
+  ~StackableDB() {
+    delete db_;
+  }
+
+  virtual DB* GetBaseDB() {
+    return db_;
+  }
+
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family_name,
+                                    ColumnFamilyHandle** handle) {
+    return db_->CreateColumnFamily(options, column_family_name, handle);
+  }
+
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) {
+    return db_->DropColumnFamily(column_family);
+  }
+
+  using DB::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& val) override {
+    return db_->Put(options, column_family, key, val);
+  }
+
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     std::string* value) override {
+    return db_->Get(options, column_family, key, value);
+  }
+
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override {
+    return db_->MultiGet(options, column_family, keys, values);
+  }
+
+  using DB::KeyMayExist;
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value,
+                           bool* value_found = nullptr) override {
+    return db_->KeyMayExist(options, column_family, key, value, value_found);
+  }
+
+  using DB::Delete;
+  virtual Status Delete(const WriteOptions& wopts,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) override {
+    return db_->Delete(wopts, column_family, key);
+  }
+
+  using DB::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) override {
+    return db_->Merge(options, column_family, key, value);
+  }
+
+
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates)
+    override {
+      return db_->Write(opts, updates);
+  }
+
+  using DB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& opts,
+                                ColumnFamilyHandle* column_family) override {
+    return db_->NewIterator(opts, column_family);
+  }
+
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) {
+    return db_->NewIterators(options, column_families, iterators);
+  }
+
+
+  virtual const Snapshot* GetSnapshot() override {
+    return db_->GetSnapshot();
+  }
+
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
+    return db_->ReleaseSnapshot(snapshot);
+  }
+
+  using DB::GetProperty;
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property, std::string* value) override {
+      return db_->GetProperty(column_family, property, value);
+  }
+
+  using DB::GetApproximateSizes;
+  virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                   const Range* r, int n,
+                                   uint64_t* sizes) override {
+      return db_->GetApproximateSizes(column_family, r, n, sizes);
+  }
+
+  using DB::CompactRange;
+  virtual Status CompactRange(ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end,
+                              bool reduce_level = false,
+                              int target_level = -1) override {
+    return db_->CompactRange(column_family, begin, end, reduce_level,
+                             target_level);
+  }
+
+  using DB::NumberLevels;
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
+    return db_->NumberLevels(column_family);
+  }
+
+  using DB::MaxMemCompactionLevel;
+  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family)
+      override {
+    return db_->MaxMemCompactionLevel(column_family);
+  }
+
+  using DB::Level0StopWriteTrigger;
+  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family)
+      override {
+    return db_->Level0StopWriteTrigger(column_family);
+  }
+
+  virtual const std::string& GetName() const override {
+    return db_->GetName();
+  }
+
+  virtual Env* GetEnv() const override {
+    return db_->GetEnv();
+  }
+
+  using DB::GetOptions;
+  virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const
+      override {
+    return db_->GetOptions(column_family);
+  }
+
+  using DB::Flush;
+  virtual Status Flush(const FlushOptions& fopts,
+                       ColumnFamilyHandle* column_family) override {
+    return db_->Flush(fopts, column_family);
+  }
+
+  virtual Status DisableFileDeletions() override {
+    return db_->DisableFileDeletions();
+  }
+
+  virtual Status EnableFileDeletions(bool force) override {
+    return db_->EnableFileDeletions(force);
+  }
+
+  virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
+                              bool flush_memtable = true) override {
+      return db_->GetLiveFiles(vec, mfs, flush_memtable);
+  }
+
+  virtual SequenceNumber GetLatestSequenceNumber() const override {
+    return db_->GetLatestSequenceNumber();
+  }
+
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
+    return db_->GetSortedWalFiles(files);
+  }
+
+  virtual Status DeleteFile(std::string name) override {
+    return db_->DeleteFile(name);
+  }
+
+  virtual Status GetDbIdentity(std::string& identity) {
+    return db_->GetDbIdentity(identity);
+  }
+
+  using DB::GetPropertiesOfAllTables;
+  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+                                          TablePropertiesCollection* props) {
+    return db_->GetPropertiesOfAllTables(column_family, props);
+  }
+
+  virtual Status GetUpdatesSince(
+      SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions& read_options) override {
+    return db_->GetUpdatesSince(seq_number, iter, read_options);
+  }
+
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
+    return db_->DefaultColumnFamily();
+  }
+
+ protected:
+  DB* db_;
+};
+
+} //  namespace rocksdb
diff --git a/include/utilities/utility_db.h b/include/utilities/utility_db.h
new file mode 100644 (file)
index 0000000..f2b99ce
--- /dev/null
@@ -0,0 +1,30 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <vector>
+#include <string>
+
+#include "utilities/stackable_db.h"
+#include "utilities/db_ttl.h"
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+// Please don't use this class. It's deprecated
+class UtilityDB {
+ public:
+  // This function is here only for backwards compatibility. Please use the
+  // functions defined in DBWithTTL (utilities/db_ttl.h)
+  // (deprecated)
+  __attribute__((deprecated)) static Status OpenTtlDB(const Options& options,
+                                                      const std::string& name,
+                                                      StackableDB** dbptr,
+                                                      int32_t ttl = 0,
+                                                      bool read_only = false);
+};
+
+} //  namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/java/Makefile b/java/Makefile
new file mode 100644 (file)
index 0000000..9d21b57
--- /dev/null
@@ -0,0 +1,31 @@
+NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.Iterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter
+NATIVE_INCLUDE = ./include
+ROCKSDB_JAR = rocksdbjni.jar
+
+clean:
+       -find . -name "*.class" -exec rm {} \;
+       -find . -name "hs*.log" -exec rm {} \;
+       rm -f $(ROCKSDB_JAR)
+
+java:
+       javac org/rocksdb/util/*.java org/rocksdb/*.java
+       jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
+       javah -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES)
+
+sample: java
+       javac -cp $(ROCKSDB_JAR) RocksDBSample.java
+       @rm -rf /tmp/rocksdbjni
+       @rm -rf /tmp/rocksdbjni_not_found
+       java -ea -Djava.library.path=.:../ -cp ".:./*" -Xcheck:jni RocksDBSample /tmp/rocksdbjni
+       @rm -rf /tmp/rocksdbjni
+       @rm -rf /tmp/rocksdbjni_not_found
+
+test: java
+       javac org/rocksdb/test/*.java
+       java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.WriteBatchTest
+       java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.BackupableDBTest
+       java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest
+       java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOptionsTest
+
+db_bench: java
+       javac org/rocksdb/benchmark/*.java
diff --git a/java/RocksDBSample.java b/java/RocksDBSample.java
new file mode 100644 (file)
index 0000000..5d11b1a
--- /dev/null
@@ -0,0 +1,253 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.ArrayList;
+import org.rocksdb.*;
+import org.rocksdb.util.SizeUnit;
+import java.io.IOException;
+
+public class RocksDBSample {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  public static void main(String[] args) {
+    if (args.length < 1) {
+      System.out.println("usage: RocksDBSample db_path");
+      return;
+    }
+    String db_path = args[0];
+    String db_path_not_found = db_path + "_not_found";
+
+    System.out.println("RocksDBSample");
+    RocksDB db = null;
+    Options options = new Options();
+    try {
+      db = RocksDB.open(options, db_path_not_found);
+      assert(false);
+    } catch (RocksDBException e) {
+      System.out.format("caught the expceted exception -- %s\n", e);
+      assert(db == null);
+    }
+
+    Filter filter = new BloomFilter(10);
+    options.setCreateIfMissing(true)
+        .createStatistics()
+        .setWriteBufferSize(8 * SizeUnit.KB)
+        .setMaxWriteBufferNumber(3)
+        .setDisableSeekCompaction(true)
+        .setBlockSize(64 * SizeUnit.KB)
+        .setMaxBackgroundCompactions(10)
+        .setFilter(filter);
+    Statistics stats = options.statisticsPtr();
+
+    assert(options.createIfMissing() == true);
+    assert(options.writeBufferSize() == 8 * SizeUnit.KB);
+    assert(options.maxWriteBufferNumber() == 3);
+    assert(options.disableSeekCompaction() == true);
+    assert(options.blockSize() == 64 * SizeUnit.KB);
+    assert(options.maxBackgroundCompactions() == 10);
+
+    assert(options.memTableFactoryName().equals("SkipListFactory"));
+    options.setMemTableConfig(
+        new HashSkipListMemTableConfig()
+            .setHeight(4)
+            .setBranchingFactor(4)
+            .setBucketCount(2000000));
+    assert(options.memTableFactoryName().equals("HashSkipListRepFactory"));
+
+    options.setMemTableConfig(
+        new HashLinkedListMemTableConfig()
+            .setBucketCount(100000));
+    assert(options.memTableFactoryName().equals("HashLinkedListRepFactory"));
+
+    options.setMemTableConfig(
+        new VectorMemTableConfig().setReservedSize(10000));
+    assert(options.memTableFactoryName().equals("VectorRepFactory"));
+
+    options.setMemTableConfig(new SkipListMemTableConfig());
+    assert(options.memTableFactoryName().equals("SkipListFactory"));
+
+    options.setTableFormatConfig(new PlainTableConfig());
+    assert(options.tableFactoryName().equals("PlainTable"));
+
+    try {
+      db = RocksDB.open(options, db_path_not_found);
+      db.put("hello".getBytes(), "world".getBytes());
+      byte[] value = db.get("hello".getBytes());
+      assert("world".equals(new String(value)));
+    } catch (RocksDBException e) {
+      System.out.format("[ERROR] caught the unexpceted exception -- %s\n", e);
+      assert(db == null);
+      assert(false);
+    }
+    // be sure to release the c++ pointer
+    db.close();
+
+    ReadOptions readOptions = new ReadOptions();
+    readOptions.setFillCache(false);
+
+    try {
+      db = RocksDB.open(options, db_path);
+      db.put("hello".getBytes(), "world".getBytes());
+      byte[] value = db.get("hello".getBytes());
+      System.out.format("Get('hello') = %s\n",
+          new String(value));
+
+      for (int i = 1; i <= 9; ++i) {
+        for (int j = 1; j <= 9; ++j) {
+          db.put(String.format("%dx%d", i, j).getBytes(),
+                 String.format("%d", i * j).getBytes());
+        }
+      }
+
+      for (int i = 1; i <= 9; ++i) {
+        for (int j = 1; j <= 9; ++j) {
+          System.out.format("%s ", new String(db.get(
+              String.format("%dx%d", i, j).getBytes())));
+        }
+        System.out.println("");
+      }
+
+      value = db.get("1x1".getBytes());
+      assert(value != null);
+      value = db.get("world".getBytes());
+      assert(value == null);
+      value = db.get(readOptions, "world".getBytes());
+      assert(value == null);
+
+      byte[] testKey = "asdf".getBytes();
+      byte[] testValue =
+          "asdfghjkl;'?><MNBVCXZQWERTYUIOP{+_)(*&^%$#@".getBytes();
+      db.put(testKey, testValue);
+      byte[] testResult = db.get(testKey);
+      assert(testResult != null);
+      assert(Arrays.equals(testValue, testResult));
+      assert(new String(testValue).equals(new String(testResult)));
+      testResult = db.get(readOptions, testKey);
+      assert(testResult != null);
+      assert(Arrays.equals(testValue, testResult));
+      assert(new String(testValue).equals(new String(testResult)));
+
+      byte[] insufficientArray = new byte[10];
+      byte[] enoughArray = new byte[50];
+      int len;
+      len = db.get(testKey, insufficientArray);
+      assert(len > insufficientArray.length);
+      len = db.get("asdfjkl;".getBytes(), enoughArray);
+      assert(len == RocksDB.NOT_FOUND);
+      len = db.get(testKey, enoughArray);
+      assert(len == testValue.length);
+
+      len = db.get(readOptions, testKey, insufficientArray);
+      assert(len > insufficientArray.length);
+      len = db.get(readOptions, "asdfjkl;".getBytes(), enoughArray);
+      assert(len == RocksDB.NOT_FOUND);
+      len = db.get(readOptions, testKey, enoughArray);
+      assert(len == testValue.length);
+
+      db.remove(testKey);
+      len = db.get(testKey, enoughArray);
+      assert(len == RocksDB.NOT_FOUND);
+
+      // repeat the test with WriteOptions
+      WriteOptions writeOpts = new WriteOptions();
+      writeOpts.setSync(true);
+      writeOpts.setDisableWAL(true);
+      db.put(writeOpts, testKey, testValue);
+      len = db.get(testKey, enoughArray);
+      assert(len == testValue.length);
+      assert(new String(testValue).equals(
+          new String(enoughArray, 0, len)));
+      writeOpts.dispose();
+
+      try {
+        for (TickerType statsType : TickerType.values()) {
+          stats.getTickerCount(statsType);
+        }
+        System.out.println("getTickerCount() passed.");
+      } catch (Exception e) {
+        System.out.println("Failed in call to getTickerCount()");
+        assert(false); //Should never reach here.
+      }
+
+      try {
+        for (HistogramType histogramType : HistogramType.values()) {
+          HistogramData data = stats.geHistogramData(histogramType);
+        }
+        System.out.println("geHistogramData() passed.");
+      } catch (Exception e) {
+        System.out.println("Failed in call to geHistogramData()");
+        assert(false); //Should never reach here.
+      }
+
+      Iterator iterator = db.newIterator();
+
+      boolean seekToFirstPassed = false;
+      for (iterator.seekToFirst(); iterator.isValid(); iterator.next()) {
+        iterator.status();
+        assert(iterator.key() != null);
+        assert(iterator.value() != null);
+        seekToFirstPassed = true;
+      }
+      if(seekToFirstPassed) {
+        System.out.println("iterator seekToFirst tests passed.");
+      }
+
+      boolean seekToLastPassed = false;
+      for (iterator.seekToLast(); iterator.isValid(); iterator.prev()) {
+        iterator.status();
+        assert(iterator.key() != null);
+        assert(iterator.value() != null);
+        seekToLastPassed = true;
+      }
+
+      if(seekToLastPassed) {
+        System.out.println("iterator seekToLastPassed tests passed.");
+      }
+
+      iterator.seekToFirst();
+      iterator.seek(iterator.key());
+      assert(iterator.key() != null);
+      assert(iterator.value() != null);
+
+      System.out.println("iterator seek test passed.");
+
+      iterator.dispose();
+      System.out.println("iterator tests passed.");
+
+      iterator = db.newIterator();
+      List<byte[]> keys = new ArrayList<byte[]>();
+      for (iterator.seekToLast(); iterator.isValid(); iterator.prev()) {
+        keys.add(iterator.key());
+      }
+      iterator.dispose();
+
+      Map<byte[], byte[]> values = db.multiGet(keys);
+      assert(values.size() == keys.size());
+      for(byte[] value1 : values.values()) {
+        assert(value1 != null);
+      }
+
+      values = db.multiGet(new ReadOptions(), keys);
+      assert(values.size() == keys.size());
+      for(byte[] value1 : values.values()) {
+        assert(value1 != null);
+      }
+    } catch (RocksDBException e) {
+      System.err.println(e);
+    }
+    if (db != null) {
+      db.close();
+    }
+    // be sure to dispose c++ pointers
+    options.dispose();
+    readOptions.dispose();
+    filter.dispose();
+  }
+}
diff --git a/java/jdb_bench.sh b/java/jdb_bench.sh
new file mode 100755 (executable)
index 0000000..dba7dbd
--- /dev/null
@@ -0,0 +1 @@
+java -server -d64 -XX:NewSize=4m -XX:+AggressiveOpts -Djava.library.path=.:../ -cp "rocksdbjni.jar:.:./*" org.rocksdb.benchmark.DbBenchmark $@
diff --git a/java/org/rocksdb/BackupableDB.java b/java/org/rocksdb/BackupableDB.java
new file mode 100644 (file)
index 0000000..91607d4
--- /dev/null
@@ -0,0 +1,80 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * A subclass of RocksDB which supports backup-related operations.
+ *
+ * @see BackupableDBOptions
+ */
+public class BackupableDB extends RocksDB {
+  /**
+   * Open a BackupableDB under the specified path.
+   * Note that the backup path should be set properly in the
+   * input BackupableDBOptions.
+   *
+   * @param opt options for db.
+   * @param bopt backup related options.
+   * @param db_path the db path for storing data.  The path for storing
+   *     backup should be specified in the BackupableDBOptions.
+   * @return reference to the opened BackupableDB.
+   */
+  public static BackupableDB open(
+      Options opt, BackupableDBOptions bopt, String db_path)
+      throws RocksDBException {
+    // since BackupableDB c++ will handle the life cycle of
+    // the returned RocksDB of RocksDB.open(), here we store
+    // it as a BackupableDB member variable to avoid GC.
+    BackupableDB bdb = new BackupableDB(RocksDB.open(opt, db_path));
+    bdb.open(bdb.db_.nativeHandle_, bopt.nativeHandle_);
+
+    return bdb;
+  }
+
+  /**
+   * Captures the state of the database in the latest backup.
+   * Note that this function is not thread-safe.
+   *
+   * @param flushBeforeBackup if true, then all data will be flushed
+   *     before creating backup.
+   */
+  public void createNewBackup(boolean flushBeforeBackup) {
+    createNewBackup(nativeHandle_, flushBeforeBackup);
+  }
+
+
+  /**
+   * Close the BackupableDB instance and release resource.
+   *
+   * Internally, BackupableDB owns the rocksdb::DB pointer to its
+   * associated RocksDB.  The release of that RocksDB pointer is
+   * handled in the destructor of the c++ rocksdb::BackupableDB and
+   * should be transparent to Java developers.
+   */
+  @Override public synchronized void close() {
+    if (isInitialized()) {
+      super.close();
+    }
+  }
+
+  /**
+   * A protected constructor that will be used in the static factory
+   * method BackupableDB.open().
+   */
+  protected BackupableDB(RocksDB db) {
+    super();
+    db_ = db;
+  }
+
+  @Override protected void finalize() {
+    close();
+  }
+
+  protected native void open(long rocksDBHandle, long backupDBOptionsHandle);
+  protected native void createNewBackup(long handle, boolean flag);
+
+  private final RocksDB db_;
+}
diff --git a/java/org/rocksdb/BackupableDBOptions.java b/java/org/rocksdb/BackupableDBOptions.java
new file mode 100644 (file)
index 0000000..2c64b60
--- /dev/null
@@ -0,0 +1,44 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * BackupableDBOptions to control the behavior of a backupable database.
+ * It will be used during the creation of a BackupableDB.
+ *
+ * Note that dispose() must be called before an Options instance
+ * becomes out-of-scope to release the allocated memory in c++.
+ */
+public class BackupableDBOptions extends RocksObject {
+  public BackupableDBOptions(String path) {
+    super();
+    newBackupableDBOptions(path);
+  }
+
+  /**
+   * Returns the path to the BackupableDB directory.
+   *
+   * @return the path to the BackupableDB directory.
+   */
+  public String backupDir() {
+    assert(isInitialized());
+    return backupDir(nativeHandle_);
+  }
+
+  /**
+   * Release the memory allocated for the current instance
+   * in the c++ side.
+   */
+  @Override public synchronized void dispose() {
+    if (isInitialized()) {
+      dispose(nativeHandle_);
+    }
+  }
+
+  private native void newBackupableDBOptions(String path);
+  private native String backupDir(long handle);
+  private native void dispose(long handle);
+}
diff --git a/java/org/rocksdb/BloomFilter.java b/java/org/rocksdb/BloomFilter.java
new file mode 100644 (file)
index 0000000..9c4913a
--- /dev/null
@@ -0,0 +1,37 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * This class creates a new filter policy that uses a bloom filter
+ * with approximately the specified number of bits per key.
+ * A good value for bitsPerKey is 10, which yields a filter
+ * with ~ 1% false positive rate.
+ *
+ * Default value of bits per key is 10.
+ */
+public class BloomFilter extends Filter {
+  private static final int DEFAULT_BITS_PER_KEY = 10;
+  private final int bitsPerKey_;
+
+  public BloomFilter() {
+    this(DEFAULT_BITS_PER_KEY);
+  }
+
+  public BloomFilter(int bitsPerKey) {
+    super();
+    bitsPerKey_ = bitsPerKey;
+
+    createNewFilter();
+  }
+
+  @Override
+  protected void createNewFilter() {
+    createNewFilter0(bitsPerKey_);
+  }
+
+  private native void createNewFilter0(int bitsKeyKey);
+}
diff --git a/java/org/rocksdb/Filter.java b/java/org/rocksdb/Filter.java
new file mode 100644 (file)
index 0000000..3a01ad4
--- /dev/null
@@ -0,0 +1,32 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Filters are stored in rocksdb and are consulted automatically
+ * by rocksdb to decide whether or not to read some
+ * information from disk. In many cases, a filter can cut down the
+ * number of disk seeks form a handful to a single disk seek per
+ * DB::Get() call.
+ */
+public abstract class Filter extends RocksObject {
+  protected abstract void createNewFilter();
+
+  /**
+   * Deletes underlying C++ filter pointer.
+   *
+   * Note that this function should be called only after all
+   * RocksDB instances referencing the filter are closed.
+   * Otherwise an undefined behavior will occur.
+   */
+  @Override public synchronized void dispose() {
+    if (isInitialized()) {
+      dispose0(nativeHandle_);
+    }
+  }
+
+  private native void dispose0(long handle);
+}
diff --git a/java/org/rocksdb/HashLinkedListMemTableConfig.java b/java/org/rocksdb/HashLinkedListMemTableConfig.java
new file mode 100644 (file)
index 0000000..24fcd8b
--- /dev/null
@@ -0,0 +1,52 @@
+package org.rocksdb;
+
+/**
+ * The config for hash linked list memtable representation
+ * Such memtable contains a fixed-size array of buckets, where
+ * each bucket points to a sorted singly-linked
+ * list (or null if the bucket is empty).
+ *
+ * Note that since this mem-table representation relies on the
+ * key prefix, it is required to invoke one of the usePrefixExtractor
+ * functions to specify how to extract key prefix given a key.
+ * If proper prefix-extractor is not set, then RocksDB will
+ * use the default memtable representation (SkipList) instead
+ * and post a warning in the LOG.
+ */
+public class HashLinkedListMemTableConfig extends MemTableConfig {
+  public static final long DEFAULT_BUCKET_COUNT = 50000;
+
+  public HashLinkedListMemTableConfig() {
+    bucketCount_ = DEFAULT_BUCKET_COUNT;
+  }
+
+  /**
+   * Set the number of buckets in the fixed-size array used
+   * in the hash linked-list mem-table.
+   *
+   * @param count the number of hash buckets.
+   * @return the reference to the current HashLinkedListMemTableConfig.
+   */
+  public HashLinkedListMemTableConfig setBucketCount(long count) {
+    bucketCount_ = count;
+    return this;
+  }
+
+  /**
+   * Returns the number of buckets that will be used in the memtable
+   * created based on this config.
+   *
+   * @return the number of buckets
+   */
+  public long bucketCount() {
+    return bucketCount_;
+  }
+
+  @Override protected long newMemTableFactoryHandle() {
+    return newMemTableFactoryHandle(bucketCount_);
+  }
+
+  private native long newMemTableFactoryHandle(long bucketCount);
+
+  private long bucketCount_;
+}
diff --git a/java/org/rocksdb/HashSkipListMemTableConfig.java b/java/org/rocksdb/HashSkipListMemTableConfig.java
new file mode 100644 (file)
index 0000000..74fb0db
--- /dev/null
@@ -0,0 +1,97 @@
+package org.rocksdb;
+
+/**
+ * The config for hash skip-list mem-table representation.
+ * Such mem-table representation contains a fix-sized array of
+ * buckets, where each bucket points to a skiplist (or null if the
+ * bucket is empty).
+ *
+ * Note that since this mem-table representation relies on the
+ * key prefix, it is required to invoke one of the usePrefixExtractor
+ * functions to specify how to extract key prefix given a key.
+ * If proper prefix-extractor is not set, then RocksDB will
+ * use the default memtable representation (SkipList) instead
+ * and post a warning in the LOG.
+ */
+public class HashSkipListMemTableConfig extends MemTableConfig {
+  public static final int DEFAULT_BUCKET_COUNT = 1000000;
+  public static final int DEFAULT_BRANCHING_FACTOR = 4;
+  public static final int DEFAULT_HEIGHT = 4;
+
+  public HashSkipListMemTableConfig() {
+    bucketCount_ = DEFAULT_BUCKET_COUNT;
+    branchingFactor_ = DEFAULT_BRANCHING_FACTOR;
+    height_ = DEFAULT_HEIGHT;
+  }
+
+  /**
+   * Set the number of hash buckets used in the hash skiplist memtable.
+   * Default = 1000000.
+   *
+   * @param count the number of hash buckets used in the hash
+   *    skiplist memtable.
+   * @return the reference to the current HashSkipListMemTableConfig.
+   */
+  public HashSkipListMemTableConfig setBucketCount(long count) {
+    bucketCount_ = count;
+    return this;
+  }
+
+  /**
+   * @return the number of hash buckets
+   */
+  public long bucketCount() {
+    return bucketCount_;
+  }
+
+  /**
+   * Set the height of the skip list.  Default = 4.
+   *
+   * @return the reference to the current HashSkipListMemTableConfig.
+   */
+  public HashSkipListMemTableConfig setHeight(int height) {
+    height_ = height;
+    return this;
+  }
+
+  /**
+   * @return the height of the skip list.
+   */
+  public int height() {
+    return height_;
+  }
+
+  /**
+   * Set the branching factor used in the hash skip-list memtable.
+   * This factor controls the probabilistic size ratio between adjacent
+   * links in the skip list.
+   *
+   * @param bf the probabilistic size ratio between adjacent link
+   *     lists in the skip list.
+   * @return the reference to the current HashSkipListMemTableConfig.
+   */
+  public HashSkipListMemTableConfig setBranchingFactor(int bf) {
+    branchingFactor_ = bf;
+    return this;
+  }
+
+  /**
+   * @return branching factor, the probabilistic size ratio between
+   *     adjacent links in the skip list.
+   */
+  public int branchingFactor() {
+    return branchingFactor_;
+  }
+
+  @Override protected long newMemTableFactoryHandle() {
+    return newMemTableFactoryHandle(
+        bucketCount_, height_, branchingFactor_);
+  }
+
+  private native long newMemTableFactoryHandle(
+      long bucketCount, int height, int branchingFactor);
+
+  private long bucketCount_;
+  private int branchingFactor_;
+  private int height_;
+}
diff --git a/java/org/rocksdb/HistogramData.java b/java/org/rocksdb/HistogramData.java
new file mode 100644 (file)
index 0000000..3b2e295
--- /dev/null
@@ -0,0 +1,43 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
/**
 * Immutable value object holding the statistics of a single RocksDB
 * histogram: median, 95th/99th percentiles, arithmetic mean and
 * standard deviation.
 */
public class HistogramData {
  private final double median_;
  private final double percentile95_;
  private final double percentile99_;
  private final double average_;
  private final double standardDeviation_;

  public HistogramData(final double median, final double percentile95,
      final double percentile99, final double average,
      final double standardDeviation) {
    this.median_ = median;
    this.percentile95_ = percentile95;
    this.percentile99_ = percentile99;
    this.average_ = average;
    this.standardDeviation_ = standardDeviation;
  }

  /** @return the median of the distribution. */
  public double getMedian() {
    return median_;
  }

  /** @return the 95th percentile. */
  public double getPercentile95() {
    return percentile95_;
  }

  /** @return the 99th percentile. */
  public double getPercentile99() {
    return percentile99_;
  }

  /** @return the arithmetic mean. */
  public double getAverage() {
    return average_;
  }

  /** @return the standard deviation. */
  public double getStandardDeviation() {
    return standardDeviation_;
  }
}
diff --git a/java/org/rocksdb/HistogramType.java b/java/org/rocksdb/HistogramType.java
new file mode 100644 (file)
index 0000000..751c03a
--- /dev/null
@@ -0,0 +1,39 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
/**
 * The histograms exposed by the RocksDB statistics object.  Each
 * constant carries the integer id used to identify the histogram in
 * the native layer; the ids must stay in sync with the C++ enum.
 */
public enum HistogramType {
  DB_GET(0),
  DB_WRITE(1),
  COMPACTION_TIME(2),
  TABLE_SYNC_MICROS(3),
  COMPACTION_OUTFILE_SYNC_MICROS(4),
  WAL_FILE_SYNC_MICROS(5),
  MANIFEST_FILE_SYNC_MICROS(6),
  // Time spent in IO during table open.
  TABLE_OPEN_IO_MICROS(7),
  DB_MULTIGET(8),
  READ_BLOCK_COMPACTION_MICROS(9),
  READ_BLOCK_GET_MICROS(10),
  WRITE_RAW_BLOCK_MICROS(11),

  STALL_L0_SLOWDOWN_COUNT(12),
  STALL_MEMTABLE_COMPACTION_COUNT(13),
  STALL_L0_NUM_FILES_COUNT(14),
  HARD_RATE_LIMIT_DELAY_COUNT(15),
  SOFT_RATE_LIMIT_DELAY_COUNT(16),
  NUM_FILES_IN_SINGLE_COMPACTION(17);

  private final int value_;

  HistogramType(final int value) {
    value_ = value;
  }

  /**
   * @return the native id of this histogram.
   */
  public int getValue() {
    return value_;
  }
}
diff --git a/java/org/rocksdb/Iterator.java b/java/org/rocksdb/Iterator.java
new file mode 100644 (file)
index 0000000..3c745a4
--- /dev/null
@@ -0,0 +1,138 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * An iterator yields a sequence of key/value pairs from a source.
+ * The following class defines the interface.  Multiple implementations
+ * are provided by this library.  In particular, iterators are provided
+ * to access the contents of a Table or a DB.
+ *
+ * Multiple threads can invoke const methods on an Iterator without
+ * external synchronization, but if any of the threads may call a
+ * non-const method, all threads accessing the same Iterator must use
+ * external synchronization.
+ */
+public class Iterator extends RocksObject {
+  public Iterator(long nativeHandle) {
+    super();
+    nativeHandle_ = nativeHandle;
+  }
+
+  /**
+   * An iterator is either positioned at a key/value pair, or
+   * not valid.  This method returns true iff the iterator is valid.
+   * @return true if iterator is valid.
+   */
+  public boolean isValid() {
+    assert(isInitialized());
+    return isValid0(nativeHandle_);
+  }
+
+  /**
+   * Position at the first key in the source.  The iterator is Valid()
+   * after this call iff the source is not empty.
+   */
+  public void seekToFirst() {
+    assert(isInitialized());
+    seekToFirst0(nativeHandle_);
+  }
+
+  /**
+   * Position at the last key in the source.  The iterator is
+   * Valid() after this call iff the source is not empty.
+   */
+  public void seekToLast() {
+    assert(isInitialized());
+    seekToLast0(nativeHandle_);
+  }
+
+  /**
+   * Moves to the next entry in the source.  After this call, Valid() is
+   * true iff the iterator was not positioned at the last entry in the source.
+   * REQUIRES: Valid()
+   */
+  public void next() {
+    assert(isInitialized());
+    next0(nativeHandle_);
+  }
+
+  /**
+   * Moves to the previous entry in the source.  After this call, Valid() is
+   * true iff the iterator was not positioned at the first entry in source.
+   * REQUIRES: Valid()
+   */
+  public void prev() {
+    assert(isInitialized());
+    prev0(nativeHandle_);
+  }
+
+  /**
+   * Return the key for the current entry.  The underlying storage for
+   * the returned slice is valid only until the next modification of
+   * the iterator.
+   * REQUIRES: Valid()
+   * @return key for the current entry.
+   */
+  public byte[] key() {
+    assert(isInitialized());
+    return key0(nativeHandle_);
+  }
+
+  /**
+   * Return the value for the current entry.  The underlying storage for
+   * the returned slice is valid only until the next modification of
+   * the iterator.
+   * REQUIRES: !AtEnd() && !AtStart()
+   * @return value for the current entry.
+   */
+  public byte[] value() {
+    assert(isInitialized());
+    return value0(nativeHandle_);
+  }
+
+  /**
+   * Position at the first key in the source that at or past target
+   * The iterator is Valid() after this call iff the source contains
+   * an entry that comes at or past target.
+   */
+  public void seek(byte[] target) {
+    assert(isInitialized());
+    seek0(nativeHandle_, target, target.length);
+  }
+
+  /**
+   * If an error has occurred, return it.  Else return an ok status.
+   * If non-blocking IO is requested and this operation cannot be
+   * satisfied without doing some IO, then this returns Status::Incomplete().
+   *
+   */
+  public void status() throws RocksDBException {
+    assert(isInitialized());
+    status0(nativeHandle_);
+  }
+
+  /**
+   * Deletes underlying C++ iterator pointer.
+   */
+  @Override public synchronized void dispose() {
+    if(isInitialized()) {
+      dispose(nativeHandle_);
+      nativeHandle_ = 0;
+    }
+  }
+
+  private native boolean isValid0(long handle);
+  private native void dispose(long handle);
+  private native void seekToFirst0(long handle);
+  private native void seekToLast0(long handle);
+  private native void next0(long handle);
+  private native void prev0(long handle);
+  private native byte[] key0(long handle);
+  private native byte[] value0(long handle);
+  private native void seek0(long handle, byte[] target, int targetLen);
+  private native void status0(long handle);
+}
diff --git a/java/org/rocksdb/MemTableConfig.java b/java/org/rocksdb/MemTableConfig.java
new file mode 100644 (file)
index 0000000..a473c25
--- /dev/null
@@ -0,0 +1,27 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
/**
 * Base class for the per-memtable-representation configuration
 * objects of RocksDB's Java binding.  Each memtable format has one
 * such subclass so that it can be configured from Java.
 *
 * To have a RocksDB instance use a specific memtable format, set
 * the matching MemTableConfig on an Options object via
 * Options.setMemTableFactory() and open the database with that
 * Options.
 *
 * @see Options
 */
public abstract class MemTableConfig {
  /**
   * Creates the c++ shared-pointer to the c++ MemTableRepFactory
   * associated with this Java MemTableConfig.  This function should
   * only be called by Options.setMemTableConfig().
   *
   * @see Options.setMemTableFactory()
   */
  protected abstract long newMemTableFactoryHandle();
}
diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java
new file mode 100644 (file)
index 0000000..02d3e20
--- /dev/null
@@ -0,0 +1,2355 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Options to control the behavior of a database.  It will be used
+ * during the creation of a RocksDB (i.e., RocksDB.open()).
+ *
+ * Note that dispose() must be called before an Options instance
+ * becomes out-of-scope to release the allocated memory in c++.
+ */
+public class Options extends RocksObject {
+  static final long DEFAULT_CACHE_SIZE = 8 << 20;
+  /**
+   * Construct options for opening a RocksDB.
+   *
+   * This constructor will create (by allocating a block of memory)
+   * an rocksdb::Options in the c++ side.
+   */
+  public Options() {
+    super();
+    cacheSize_ = DEFAULT_CACHE_SIZE;
+    newOptions();
+  }
+
+  /**
+   * If this value is set to true, then the database will be created
+   * if it is missing during RocksDB.open().
+   * Default: false
+   *
+   * @param flag a flag indicating whether to create a database the
+   *     specified database in RocksDB.open() operation is missing.
+   * @return the instance of the current Options.
+   * @see RocksDB.open()
+   */
+  public Options setCreateIfMissing(boolean flag) {
+    assert(isInitialized());
+    setCreateIfMissing(nativeHandle_, flag);
+    return this;
+  }
+
+  /**
+   * Return true if the create_if_missing flag is set to true.
+   * If true, the database will be created if it is missing.
+   *
+   * @return true if the createIfMissing option is set to true.
+   * @see setCreateIfMissing()
+   */
+  public boolean createIfMissing() {
+    assert(isInitialized());
+    return createIfMissing(nativeHandle_);
+  }
+
+  /**
+   * Amount of data to build up in memory (backed by an unsorted log
+   * on disk) before converting to a sorted on-disk file.
+   *
+   * Larger values increase performance, especially during bulk loads.
+   * Up to max_write_buffer_number write buffers may be held in memory
+   * at the same time, so you may wish to adjust this parameter
+   * to control memory usage.
+   *
+   * Also, a larger write buffer will result in a longer recovery time
+   * the next time the database is opened.
+   *
+   * Default: 4MB
+   * @param writeBufferSize the size of write buffer.
+   * @return the instance of the current Options.
+   * @see RocksDB.open()
+   */
+  public Options setWriteBufferSize(long writeBufferSize) {
+    assert(isInitialized());
+    setWriteBufferSize(nativeHandle_, writeBufferSize);
+    return this;
+  }
+
+  /**
+   * Return size of write buffer size.
+   *
+   * @return size of write buffer.
+   * @see setWriteBufferSize()
+   */
+  public long writeBufferSize()  {
+    assert(isInitialized());
+    return writeBufferSize(nativeHandle_);
+  }
+
+  /**
+   * The maximum number of write buffers that are built up in memory.
+   * The default is 2, so that when 1 write buffer is being flushed to
+   * storage, new writes can continue to the other write buffer.
+   * Default: 2
+   *
+   * @param maxWriteBufferNumber maximum number of write buffers.
+   * @return the instance of the current Options.
+   * @see RocksDB.open()
+   */
+  public Options setMaxWriteBufferNumber(int maxWriteBufferNumber) {
+    assert(isInitialized());
+    setMaxWriteBufferNumber(nativeHandle_, maxWriteBufferNumber);
+    return this;
+  }
+
+  /**
+   * Returns maximum number of write buffers.
+   *
+   * @return maximum number of write buffers.
+   * @see setMaxWriteBufferNumber()
+   */
+  public int maxWriteBufferNumber() {
+    assert(isInitialized());
+    return maxWriteBufferNumber(nativeHandle_);
+  }
+
+  /**
+   * Approximate size of user data packed per block.  Note that the
+   * block size specified here corresponds to uncompressed data.  The
+   * actual size of the unit read from disk may be smaller if
+   * compression is enabled.  This parameter can be changed dynamically.
+   *
+   * Default: 4K
+   *
+   * @param blockSize the size of each block in bytes.
+   * @return the instance of the current Options.
+   * @see RocksDB.open()
+   */
+  public Options setBlockSize(long blockSize) {
+    assert(isInitialized());
+    setBlockSize(nativeHandle_, blockSize);
+    return this;
+  }
+
+  /**
+   * Returns the size of a block in bytes.
+   *
+   * @return block size.
+   * @see setBlockSize()
+   */
+  public long blockSize() {
+    assert(isInitialized());
+    return blockSize(nativeHandle_);
+  }
+
+  /**
+   * Use the specified filter policy to reduce disk reads.
+   *
+   * Note that the caller should not dispose the input filter as
+   * Options.dispose() will dispose this filter.
+   *
+   * @param filter the filter policy java instance.
+   * @return the instance of the current Options.
+   * @see RocksDB.open()
+   */
+  public Options setFilter(Filter filter) {
+    assert(isInitialized());
+    setFilterHandle(nativeHandle_, filter.nativeHandle_);
+    filter_ = filter;
+    return this;
+  }
+  private native void setFilterHandle(long optHandle, long filterHandle);
+
+  /**
+   * Disable compaction triggered by seek.
+   * With bloomfilter and fast storage, a miss on one level
+   * is very cheap if the file handle is cached in table cache
+   * (which is true if max_open_files is large).
+   * Default: true
+   *
+   * @param disableSeekCompaction a boolean value to specify whether
+   *     to disable seek compaction.
+   * @return the instance of the current Options.
+   * @see RocksDB.open()
+   */
+  public Options setDisableSeekCompaction(boolean disableSeekCompaction) {
+    assert(isInitialized());
+    setDisableSeekCompaction(nativeHandle_, disableSeekCompaction);
+    return this;
+  }
+
+  /**
+   * Returns true if disable seek compaction is set to true.
+   *
+   * @return true if disable seek compaction is set to true.
+   * @see setDisableSeekCompaction()
+   */
+  public boolean disableSeekCompaction() {
+    assert(isInitialized());
+    return disableSeekCompaction(nativeHandle_);
+  }
+
+  /**
+   * Set the amount of cache in bytes that will be used by RocksDB.
+   * If cacheSize is non-positive, then cache will not be used.
+   *
+   * DEFAULT: 8M
+   */
+  public Options setCacheSize(long cacheSize) {
+    cacheSize_ = cacheSize;
+    return this;
+  }
+
+  /**
+   * @return the amount of cache in bytes that will be used by RocksDB.
+   */
+  public long cacheSize() {
+    return cacheSize_;
+  }
+
+  /**
+   * If true, an error will be thrown during RocksDB.open() if the
+   * database already exists.
+   *
+   * @return if true, an error is raised when the specified database
+   *    already exists before open.
+   */
+  public boolean errorIfExists() {
+    assert(isInitialized());
+    return errorIfExists(nativeHandle_);
+  }
+  private native boolean errorIfExists(long handle);
+
+  /**
+   * If true, an error will be thrown during RocksDB.open() if the
+   * database already exists.
+   * Default: false
+   *
+   * @param errorIfExists if true, an exception will be thrown
+   *     during RocksDB.open() if the database already exists.
+   * @return the reference to the current option.
+   * @see RocksDB.open()
+   */
+  public Options setErrorIfExists(boolean errorIfExists) {
+    assert(isInitialized());
+    setErrorIfExists(nativeHandle_, errorIfExists);
+    return this;
+  }
+  private native void setErrorIfExists(long handle, boolean errorIfExists);
+
+  /**
+   * If true, the implementation will do aggressive checking of the
+   * data it is processing and will stop early if it detects any
+   * errors.  This may have unforeseen ramifications: for example, a
+   * corruption of one DB entry may cause a large number of entries to
+   * become unreadable or for the entire DB to become unopenable.
+   * If any of the  writes to the database fails (Put, Delete, Merge, Write),
+   * the database will switch to read-only mode and fail all other
+   * Write operations.
+   *
+   * @return a boolean indicating whether paranoid-check is on.
+   */
+  public boolean paranoidChecks() {
+    assert(isInitialized());
+    return paranoidChecks(nativeHandle_);
+  }
+  private native boolean paranoidChecks(long handle);
+
+  /**
+   * If true, the implementation will do aggressive checking of the
+   * data it is processing and will stop early if it detects any
+   * errors.  This may have unforeseen ramifications: for example, a
+   * corruption of one DB entry may cause a large number of entries to
+   * become unreadable or for the entire DB to become unopenable.
+   * If any of the  writes to the database fails (Put, Delete, Merge, Write),
+   * the database will switch to read-only mode and fail all other
+   * Write operations.
+   * Default: true
+   *
+   * @param paranoidChecks a flag to indicate whether paranoid-check
+   *     is on.
+   * @return the reference to the current option.
+   */
+  public Options setParanoidChecks(boolean paranoidChecks) {
+    assert(isInitialized());
+    setParanoidChecks(nativeHandle_, paranoidChecks);
+    return this;
+  }
+  private native void setParanoidChecks(
+      long handle, boolean paranoidChecks);
+
+  /**
+   * Number of open files that can be used by the DB.  You may need to
+   * increase this if your database has a large working set. Value -1 means
+   * files opened are always kept open. You can estimate number of files based
+   * on target_file_size_base and target_file_size_multiplier for level-based
+   * compaction. For universal-style compaction, you can usually set it to -1.
+   *
+   * @return the maximum number of open files.
+   */
+  public int maxOpenFiles() {
+    assert(isInitialized());
+    return maxOpenFiles(nativeHandle_);
+  }
+  private native int maxOpenFiles(long handle);
+
+  /**
+   * Number of open files that can be used by the DB.  You may need to
+   * increase this if your database has a large working set. Value -1 means
+   * files opened are always kept open. You can estimate number of files based
+   * on target_file_size_base and target_file_size_multiplier for level-based
+   * compaction. For universal-style compaction, you can usually set it to -1.
+   * Default: 5000
+   *
+   * @param maxOpenFiles the maximum number of open files.
+   * @return the reference to the current option.
+   */
+  public Options setMaxOpenFiles(int maxOpenFiles) {
+    assert(isInitialized());
+    setMaxOpenFiles(nativeHandle_, maxOpenFiles);
+    return this;
+  }
+  private native void setMaxOpenFiles(long handle, int maxOpenFiles);
+
+  /**
+   * If true, then the contents of data files are not synced
+   * to stable storage. Their contents remain in the OS buffers till the
+   * OS decides to flush them. This option is good for bulk-loading
+   * of data. Once the bulk-loading is complete, please issue a
+   * sync to the OS to flush all dirty buffers to stable storage.
+   *
+   * @return if true, then data-sync is disabled.
+   */
+  public boolean disableDataSync() {
+    assert(isInitialized());
+    return disableDataSync(nativeHandle_);
+  }
+  private native boolean disableDataSync(long handle);
+
+  /**
+   * If true, then the contents of data files are not synced
+   * to stable storage. Their contents remain in the OS buffers till the
+   * OS decides to flush them. This option is good for bulk-loading
+   * of data. Once the bulk-loading is complete, please issue a
+   * sync to the OS to flush all dirty buffers to stable storage.
+   * Default: false
+   *
+   * @param disableDataSync a boolean flag to specify whether to
+   *     disable data sync.
+   * @return the reference to the current option.
+   */
+  public Options setDisableDataSync(boolean disableDataSync) {
+    assert(isInitialized());
+    setDisableDataSync(nativeHandle_, disableDataSync);
+    return this;
+  }
+  private native void setDisableDataSync(long handle, boolean disableDataSync);
+
+  /**
+   * If true, then every store to stable storage will issue a fsync.
+   * If false, then every store to stable storage will issue a fdatasync.
+   * This parameter should be set to true while storing data to
+   * filesystem like ext3 that can lose files after a reboot.
+   *
+   * @return true if fsync is used.
+   */
+  public boolean useFsync() {
+    assert(isInitialized());
+    return useFsync(nativeHandle_);
+  }
+  private native boolean useFsync(long handle);
+
+  /**
+   * If true, then every store to stable storage will issue a fsync.
+   * If false, then every store to stable storage will issue a fdatasync.
+   * This parameter should be set to true while storing data to
+   * filesystem like ext3 that can lose files after a reboot.
+   * Default: false
+   *
+   * @param useFsync a boolean flag to specify whether to use fsync
+   * @return the reference to the current option.
+   */
+  public Options setUseFsync(boolean useFsync) {
+    assert(isInitialized());
+    setUseFsync(nativeHandle_, useFsync);
+    return this;
+  }
+  private native void setUseFsync(long handle, boolean useFsync);
+
+  /**
+   * The time interval in seconds between each two consecutive stats logs.
+   * This number controls how often a new scribe log about
+   * db deploy stats is written out.
+   * -1 indicates no logging at all.
+   *
+   * @return the time interval in seconds between each two consecutive
+   *     stats logs.
+   */
+  public int dbStatsLogInterval() {
+    assert(isInitialized());
+    return dbStatsLogInterval(nativeHandle_);
+  }
+  private native int dbStatsLogInterval(long handle);
+
+  /**
+   * The time interval in seconds between each two consecutive stats logs.
+   * This number controls how often a new scribe log about
+   * db deploy stats is written out.
+   * -1 indicates no logging at all.
+   * Default value is 1800 (half an hour).
+   *
+   * @param dbStatsLogInterval the time interval in seconds between each
+   *     two consecutive stats logs.
+   * @return the reference to the current option.
+   */
+  public Options setDbStatsLogInterval(int dbStatsLogInterval) {
+    assert(isInitialized());
+    setDbStatsLogInterval(nativeHandle_, dbStatsLogInterval);
+    return this;
+  }
+  private native void setDbStatsLogInterval(
+      long handle, int dbStatsLogInterval);
+
+  /**
+   * Returns the directory of info log.
+   *
+   * If it is empty, the log files will be in the same dir as data.
+   * If it is non empty, the log files will be in the specified dir,
+   * and the db data dir's absolute path will be used as the log file
+   * name's prefix.
+   *
+   * @return the path to the info log directory
+   */
+  public String dbLogDir() {
+    assert(isInitialized());
+    return dbLogDir(nativeHandle_);
+  }
+  private native String dbLogDir(long handle);
+
+  /**
+   * This specifies the info LOG dir.
+   * If it is empty, the log files will be in the same dir as data.
+   * If it is non empty, the log files will be in the specified dir,
+   * and the db data dir's absolute path will be used as the log file
+   * name's prefix.
+   *
+   * @param dbLogDir the path to the info log directory
+   * @return the reference to the current option.
+   */
+  public Options setDbLogDir(String dbLogDir) {
+    assert(isInitialized());
+    setDbLogDir(nativeHandle_, dbLogDir);
+    return this;
+  }
+  private native void setDbLogDir(long handle, String dbLogDir);
+
+  /**
+   * Returns the path to the write-ahead-logs (WAL) directory.
+   *
+   * If it is empty, the log files will be in the same dir as data,
+   *   dbname is used as the data dir by default
+   * If it is non empty, the log files will be in kept the specified dir.
+   * When destroying the db,
+   *   all log files in wal_dir and the dir itself is deleted
+   *
+   * @return the path to the write-ahead-logs (WAL) directory.
+   */
+  public String walDir() {
+    assert(isInitialized());
+    return walDir(nativeHandle_);
+  }
+  private native String walDir(long handle);
+
+  /**
+   * This specifies the absolute dir path for write-ahead logs (WAL).
+   * If it is empty, the log files will be in the same dir as data,
+   *   dbname is used as the data dir by default
+   * If it is non empty, the log files will be in kept the specified dir.
+   * When destroying the db,
+   *   all log files in wal_dir and the dir itself is deleted
+   *
+   * @param walDir the path to the write-ahead-log directory.
+   * @return the reference to the current option.
+   */
+  public Options setWalDir(String walDir) {
+    assert(isInitialized());
+    setWalDir(nativeHandle_, walDir);
+    return this;
+  }
+  private native void setWalDir(long handle, String walDir);
+
+  /**
+   * The periodicity when obsolete files get deleted. The default
+   * value is 6 hours. The files that get out of scope by compaction
+   * process will still get automatically delete on every compaction,
+   * regardless of this setting
+   *
+   * @return the time interval in micros when obsolete files will be deleted.
+   */
+  public long deleteObsoleteFilesPeriodMicros() {
+    assert(isInitialized());
+    return deleteObsoleteFilesPeriodMicros(nativeHandle_);
+  }
+  private native long deleteObsoleteFilesPeriodMicros(long handle);
+
+  /**
+   * The periodicity when obsolete files get deleted. The default
+   * value is 6 hours. The files that get out of scope by compaction
+   * process will still get automatically delete on every compaction,
+   * regardless of this setting
+   *
+   * @param micros the time interval in micros
+   * @return the reference to the current option.
+   */
+  public Options setDeleteObsoleteFilesPeriodMicros(long micros) {
+    assert(isInitialized());
+    setDeleteObsoleteFilesPeriodMicros(nativeHandle_, micros);
+    return this;
+  }
+  private native void setDeleteObsoleteFilesPeriodMicros(
+      long handle, long micros);
+
+  /**
+   * Returns the maximum number of concurrent background compaction jobs,
+   * submitted to the default LOW priority thread pool.
+   * When increasing this number, we may also want to consider increasing
+   * number of threads in LOW priority thread pool.
+   * Default: 1
+   *
+   * @return the maximum number of concurrent background compaction jobs.
+   * @see Env.setBackgroundThreads()
+   */
+  public int maxBackgroundCompactions() {
+    assert(isInitialized());
+    return maxBackgroundCompactions(nativeHandle_);
+  }
+
+  /**
+   * Creates statistics object which collects metrics about database operations.
+     Statistics objects should not be shared between DB instances as
+     it does not use any locks to prevent concurrent updates.
+   *
+   * @return the instance of the current Options.
+   * @see RocksDB.open()
+   */
+  public Options createStatistics() {
+    assert(isInitialized());
+    createStatistics(nativeHandle_);
+    return this;
+  }
+
+  /**
+   * Returns statistics object. Calls createStatistics() if
+   * C++ returns NULL pointer for statistics.
+   *
+   * @return the instance of the statistics object.
+   * @see createStatistics()
+   */
+  public Statistics statisticsPtr() {
+    assert(isInitialized());
+
+    long statsPtr = statisticsPtr(nativeHandle_);
+    if(statsPtr == 0) {
+      createStatistics();
+      statsPtr = statisticsPtr(nativeHandle_);
+    }
+
+    return new Statistics(statsPtr);
+  }
+
+  /**
+   * Specifies the maximum number of concurrent background compaction jobs,
+   * submitted to the default LOW priority thread pool.
+   * If you're increasing this, also consider increasing number of threads in
+   * LOW priority thread pool. For more information, see Env.setBackgroundThreads().
+   * Default: 1
+   *
+   * @param maxBackgroundCompactions the maximum number of background
+   *     compaction jobs.
+   * @return the reference to the current option.
+   *
+   * @see Env.setBackgroundThreads()
+   * @see maxBackgroundFlushes()
+   */
+  public Options setMaxBackgroundCompactions(int maxBackgroundCompactions) {
+    assert(isInitialized());
+    setMaxBackgroundCompactions(nativeHandle_, maxBackgroundCompactions);
+    return this;
+  }
+
+  /**
+   * Returns the maximum number of concurrent background flush jobs.
+   * If you're increasing this, also consider increasing number of threads in
+   * HIGH priority thread pool. For more information, see Env.setBackgroundThreads().
+   * Default: 1
+   *
+   * @return the maximum number of concurrent background flush jobs.
+   * @see Env.setBackgroundThreads()
+   */
+  public int maxBackgroundFlushes() {
+    assert(isInitialized());
+    return maxBackgroundFlushes(nativeHandle_);
+  }
+  private native int maxBackgroundFlushes(long handle);
+
+  /**
+   * Specifies the maximum number of concurrent background flush jobs.
+   * If you're increasing this, also consider increasing number of threads in
+   * the HIGH priority thread pool.
+   * Default: 1
+   *
+   * @param maxBackgroundFlushes the maximum number of concurrent background
+   *     flush jobs.
+   * @return the reference to the current option.
+   *
+   * @see Env.setBackgroundThreads()
+   * @see maxBackgroundCompactions()
+   */
+  public Options setMaxBackgroundFlushes(int maxBackgroundFlushes) {
+    assert(isInitialized());
+    setMaxBackgroundFlushes(nativeHandle_, maxBackgroundFlushes);
+    return this;
+  }
+  private native void setMaxBackgroundFlushes(
+      long handle, int maxBackgroundFlushes);
+
+  /**
+   * Returns the maximum size of an info log file. If the current log file
+   * is larger than this size, a new info log file will be created.
+   * If 0, all logs will be written to one log file.
+   *
+   * @return the maximum size of the info log file.
+   * @see setMaxLogFileSize(long)
+   */
+  public long maxLogFileSize() {
+    assert(isInitialized());
+    return maxLogFileSize(nativeHandle_);
+  }
+  private native long maxLogFileSize(long handle);
+
+  /**
+   * Specifies the maximum size of an info log file. If the current log file
+   * is larger than `max_log_file_size`, a new info log file will
+   * be created.
+   * If 0, all logs will be written to one log file.
+   *
+   * @param maxLogFileSize the maximum size of an info log file.
+   * @return the reference to the current option.
+   * @see maxLogFileSize()
+   */
+  public Options setMaxLogFileSize(long maxLogFileSize) {
+    assert(isInitialized());
+    setMaxLogFileSize(nativeHandle_, maxLogFileSize);
+    return this;
+  }
+  private native void setMaxLogFileSize(long handle, long maxLogFileSize);
+
+  /**
+   * Returns the time interval for the info log file to roll (in seconds).
+   * If specified with non-zero value, log file will be rolled
+   * if it has been active longer than `log_file_time_to_roll`.
+   * Default: 0 (disabled)
+   *
+   * @return the time interval in seconds.
+   * @see setLogFileTimeToRoll(long)
+   */
+  public long logFileTimeToRoll() {
+    assert(isInitialized());
+    return logFileTimeToRoll(nativeHandle_);
+  }
+  private native long logFileTimeToRoll(long handle);
+
+  /**
+   * Specifies the time interval for the info log file to roll (in seconds).
+   * If specified with non-zero value, log file will be rolled
+   * if it has been active longer than `log_file_time_to_roll`.
+   * Default: 0 (disabled)
+   *
+   * @param logFileTimeToRoll the time interval in seconds.
+   * @return the reference to the current option.
+   * @see logFileTimeToRoll()
+   */
+  public Options setLogFileTimeToRoll(long logFileTimeToRoll) {
+    assert(isInitialized());
+    setLogFileTimeToRoll(nativeHandle_, logFileTimeToRoll);
+    return this;
+  }
+  private native void setLogFileTimeToRoll(
+      long handle, long logFileTimeToRoll);
+
+  /**
+   * Returns the maximum number of info log files to be kept.
+   * Default: 1000
+   *
+   * @return the maximum number of info log files to be kept.
+   * @see setKeepLogFileNum(long)
+   */
+  public long keepLogFileNum() {
+    assert(isInitialized());
+    return keepLogFileNum(nativeHandle_);
+  }
+  private native long keepLogFileNum(long handle);
+
+  /**
+   * Specifies the maximum number of info log files to be kept.
+   * Default: 1000
+   *
+   * @param keepLogFileNum the maximum number of info log files to be kept.
+   * @return the reference to the current option.
+   * @see keepLogFileNum()
+   */
+  public Options setKeepLogFileNum(long keepLogFileNum) {
+    assert(isInitialized());
+    setKeepLogFileNum(nativeHandle_, keepLogFileNum);
+    return this;
+  }
+  private native void setKeepLogFileNum(long handle, long keepLogFileNum);
+
+  /**
+   * Manifest file is rolled over on reaching this limit.
+   * The older manifest file will be deleted.
+   * The default value is MAX_INT so that roll-over does not take place.
+   *
+   * @return the size limit of a manifest file.
+   * @see setMaxManifestFileSize(long)
+   */
+  public long maxManifestFileSize() {
+    assert(isInitialized());
+    return maxManifestFileSize(nativeHandle_);
+  }
+  private native long maxManifestFileSize(long handle);
+
+  /**
+   * Manifest file is rolled over on reaching this limit.
+   * The older manifest file will be deleted.
+   * The default value is MAX_INT so that roll-over does not take place.
+   *
+   * @param maxManifestFileSize the size limit of a manifest file.
+   * @return the reference to the current option.
+   * @see maxManifestFileSize()
+   */
+  public Options setMaxManifestFileSize(long maxManifestFileSize) {
+    assert(isInitialized());
+    setMaxManifestFileSize(nativeHandle_, maxManifestFileSize);
+    return this;
+  }
+  private native void setMaxManifestFileSize(
+      long handle, long maxManifestFileSize);
+
+  /**
+   * Number of shards used for table cache.
+   *
+   * @return the number of shards used for table cache.
+   * @see setTableCacheNumshardbits(int)
+   */
+  public int tableCacheNumshardbits() {
+    assert(isInitialized());
+    return tableCacheNumshardbits(nativeHandle_);
+  }
+  private native int tableCacheNumshardbits(long handle);
+
+  /**
+   * Number of shards used for table cache.
+   *
+   * @param tableCacheNumshardbits the number of shards
+   * @return the reference to the current option.
+   * @see tableCacheNumshardbits()
+   */
+  public Options setTableCacheNumshardbits(int tableCacheNumshardbits) {
+    assert(isInitialized());
+    setTableCacheNumshardbits(nativeHandle_, tableCacheNumshardbits);
+    return this;
+  }
+  private native void setTableCacheNumshardbits(
+      long handle, int tableCacheNumshardbits);
+
+  /**
+   * During data eviction of table's LRU cache, it would be inefficient
+   * to strictly follow LRU because this piece of memory will not really
+   * be released unless its refcount falls to zero. Instead, make two
+   * passes: the first pass will release items with refcount = 1,
+   * and if not enough space releases after scanning the number of
+   * elements specified by this parameter, we will remove items in LRU
+   * order.
+   *
+   * @return scan count limit
+   * @see setTableCacheRemoveScanCountLimit(int)
+   */
+  public int tableCacheRemoveScanCountLimit() {
+    assert(isInitialized());
+    return tableCacheRemoveScanCountLimit(nativeHandle_);
+  }
+  private native int tableCacheRemoveScanCountLimit(long handle);
+
+  /**
+   * During data eviction of table's LRU cache, it would be inefficient
+   * to strictly follow LRU because this piece of memory will not really
+   * be released unless its refcount falls to zero. Instead, make two
+   * passes: the first pass will release items with refcount = 1,
+   * and if not enough space releases after scanning the number of
+   * elements specified by this parameter, we will remove items in LRU
+   * order.
+   *
+   * @param limit scan count limit
+   * @return the reference to the current option.
+   * @see tableCacheRemoveScanCountLimit()
+   */
+  public Options setTableCacheRemoveScanCountLimit(int limit) {
+    assert(isInitialized());
+    setTableCacheRemoveScanCountLimit(nativeHandle_, limit);
+    return this;
+  }
+  private native void setTableCacheRemoveScanCountLimit(
+      long handle, int limit);
+
+  /**
+   * WalTtlSeconds() and walSizeLimitMB() affect how archived logs
+   * will be deleted.
+   * 1. If both set to 0, logs will be deleted asap and will not get into
+   *    the archive.
+   * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+   *    WAL files will be checked every 10 min and if total size is greater
+   *    than WAL_size_limit_MB, they will be deleted starting with the
+   *    earliest until size_limit is met. All empty files will be deleted.
+   * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+   *    WAL files will be checked every WAL_ttl_seconds / 2 and those that
+   *    are older than WAL_ttl_seconds will be deleted.
+   * 4. If both are not 0, WAL files will be checked every 10 min and both
+   *    checks will be performed with ttl being first.
+   *
+   * @return the wal-ttl seconds
+   * @see walSizeLimitMB()
+   */
+  public long walTtlSeconds() {
+    assert(isInitialized());
+    return walTtlSeconds(nativeHandle_);
+  }
+  private native long walTtlSeconds(long handle);
+
+  /**
+   * WalTtlSeconds() and walSizeLimitMB() affect how archived logs
+   * will be deleted.
+   * 1. If both set to 0, logs will be deleted asap and will not get into
+   *    the archive.
+   * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+   *    WAL files will be checked every 10 min and if total size is greater
+   *    than WAL_size_limit_MB, they will be deleted starting with the
+   *    earliest until size_limit is met. All empty files will be deleted.
+   * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+   *    WAL files will be checked every WAL_ttl_seconds / 2 and those that
+   *    are older than WAL_ttl_seconds will be deleted.
+   * 4. If both are not 0, WAL files will be checked every 10 min and both
+   *    checks will be performed with ttl being first.
+   *
+   * @param walTtlSeconds the ttl seconds
+   * @return the reference to the current option.
+   * @see setWalSizeLimitMB()
+   */
+  public Options setWalTtlSeconds(long walTtlSeconds) {
+    assert(isInitialized());
+    setWalTtlSeconds(nativeHandle_, walTtlSeconds);
+    return this;
+  }
+  private native void setWalTtlSeconds(long handle, long walTtlSeconds);
+
+  /**
+   * WalTtlSeconds() and walSizeLimitMB() affect how archived logs
+   * will be deleted.
+   * 1. If both set to 0, logs will be deleted asap and will not get into
+   *    the archive.
+   * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+   *    WAL files will be checked every 10 min and if total size is greater
+   *    than WAL_size_limit_MB, they will be deleted starting with the
+   *    earliest until size_limit is met. All empty files will be deleted.
+   * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+   *    WAL files will be checked every WAL_ttl_seconds / 2 and those that
+   *    are older than WAL_ttl_seconds will be deleted.
+   * 4. If both are not 0, WAL files will be checked every 10 min and both
+   *    checks will be performed with ttl being first.
+   *
+   * @return size limit in mega-bytes.
+   * @see walTtlSeconds()
+   */
+  public long walSizeLimitMB() {
+    assert(isInitialized());
+    return walSizeLimitMB(nativeHandle_);
+  }
+  private native long walSizeLimitMB(long handle);
+
+  /**
+   * WalTtlSeconds() and walSizeLimitMB() affect how archived logs
+   * will be deleted.
+   * 1. If both set to 0, logs will be deleted asap and will not get into
+   *    the archive.
+   * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+   *    WAL files will be checked every 10 min and if total size is greater
+   *    than WAL_size_limit_MB, they will be deleted starting with the
+   *    earliest until size_limit is met. All empty files will be deleted.
+   * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+   *    WAL files will be checked every WAL_ttl_seconds / 2 and those that
+   *    are older than WAL_ttl_seconds will be deleted.
+   * 4. If both are not 0, WAL files will be checked every 10 min and both
+   *    checks will be performed with ttl being first.
+   *
+   * @param sizeLimitMB size limit in mega-bytes.
+   * @return the reference to the current option.
+   * @see setWalTtlSeconds()
+   */
+  public Options setWalSizeLimitMB(long sizeLimitMB) {
+    assert(isInitialized());
+    setWalSizeLimitMB(nativeHandle_, sizeLimitMB);
+    return this;
+  }
+  private native void setWalSizeLimitMB(long handle, long sizeLimitMB);
+
+  /**
+   * Number of bytes to preallocate (via fallocate) the manifest
+   * files.  Default is 4mb, which is reasonable to reduce random IO
+   * as well as prevent overallocation for mounts that preallocate
+   * large amounts of data (such as xfs's allocsize option).
+   *
+   * @return size in bytes.
+   * @see setManifestPreallocationSize(long)
+   */
+  public long manifestPreallocationSize() {
+    assert(isInitialized());
+    return manifestPreallocationSize(nativeHandle_);
+  }
+  private native long manifestPreallocationSize(long handle);
+
+  /**
+   * Number of bytes to preallocate (via fallocate) the manifest
+   * files.  Default is 4mb, which is reasonable to reduce random IO
+   * as well as prevent overallocation for mounts that preallocate
+   * large amounts of data (such as xfs's allocsize option).
+   *
+   * @param size the size in bytes
+   * @return the reference to the current option.
+   * @see manifestPreallocationSize()
+   */
+  public Options setManifestPreallocationSize(long size) {
+    assert(isInitialized());
+    setManifestPreallocationSize(nativeHandle_, size);
+    return this;
+  }
+  private native void setManifestPreallocationSize(
+      long handle, long size);
+
+  /**
+   * Data being read from file storage may be buffered in the OS
+   * Default: true
+   *
+   * @return if true, then OS buffering is allowed.
+   * @see setAllowOsBuffer(boolean)
+   */
+  public boolean allowOsBuffer() {
+    assert(isInitialized());
+    return allowOsBuffer(nativeHandle_);
+  }
+  private native boolean allowOsBuffer(long handle);
+
+  /**
+   * Data being read from file storage may be buffered in the OS
+   * Default: true
+   *
+   * @param allowOsBuffer if true, then OS buffering is allowed.
+   * @return the reference to the current option.
+   * @see allowOsBuffer()
+   */
+  public Options setAllowOsBuffer(boolean allowOsBuffer) {
+    assert(isInitialized());
+    setAllowOsBuffer(nativeHandle_, allowOsBuffer);
+    return this;
+  }
+  private native void setAllowOsBuffer(
+      long handle, boolean allowOsBuffer);
+
+  /**
+   * Allow the OS to mmap file for reading sst tables.
+   * Default: false
+   *
+   * @return true if mmap reads are allowed.
+   * @see setAllowMmapReads(boolean)
+   */
+  public boolean allowMmapReads() {
+    assert(isInitialized());
+    return allowMmapReads(nativeHandle_);
+  }
+  private native boolean allowMmapReads(long handle);
+
+  /**
+   * Allow the OS to mmap file for reading sst tables.
+   * Default: false
+   *
+   * @param allowMmapReads true if mmap reads are allowed.
+   * @return the reference to the current option.
+   * @see allowMmapReads()
+   */
+  public Options setAllowMmapReads(boolean allowMmapReads) {
+    assert(isInitialized());
+    setAllowMmapReads(nativeHandle_, allowMmapReads);
+    return this;
+  }
+  private native void setAllowMmapReads(
+      long handle, boolean allowMmapReads);
+
+  /**
+   * Allow the OS to mmap file for writing. Default: false
+   *
+   * @return true if mmap writes are allowed.
+   * @see setAllowMmapWrites(boolean)
+   */
+  public boolean allowMmapWrites() {
+    assert(isInitialized());
+    return allowMmapWrites(nativeHandle_);
+  }
+  private native boolean allowMmapWrites(long handle);
+
+  /**
+   * Allow the OS to mmap file for writing. Default: false
+   *
+   * @param allowMmapWrites true if mmap writes are allowed.
+   * @return the reference to the current option.
+   * @see allowMmapWrites()
+   */
+  public Options setAllowMmapWrites(boolean allowMmapWrites) {
+    assert(isInitialized());
+    setAllowMmapWrites(nativeHandle_, allowMmapWrites);
+    return this;
+  }
+  private native void setAllowMmapWrites(
+      long handle, boolean allowMmapWrites);
+
+  /**
+   * Disable child process inherit open files. Default: true
+   *
+   * @return true if child process inheriting open files is disabled.
+   * @see setIsFdCloseOnExec(boolean)
+   */
+  public boolean isFdCloseOnExec() {
+    assert(isInitialized());
+    return isFdCloseOnExec(nativeHandle_);
+  }
+  private native boolean isFdCloseOnExec(long handle);
+
+  /**
+   * Disable child process inherit open files. Default: true
+   *
+   * @param isFdCloseOnExec true if child process inheriting open
+   *     files is disabled.
+   * @return the reference to the current option.
+   * @see isFdCloseOnExec()
+   */
+  public Options setIsFdCloseOnExec(boolean isFdCloseOnExec) {
+    assert(isInitialized());
+    setIsFdCloseOnExec(nativeHandle_, isFdCloseOnExec);
+    return this;
+  }
+  private native void setIsFdCloseOnExec(
+      long handle, boolean isFdCloseOnExec);
+
+  /**
+   * Skip log corruption error on recovery (If client is ok with
+   * losing most recent changes)
+   * Default: false
+   *
+   * @return true if log corruption errors are skipped during recovery.
+   * @see setSkipLogErrorOnRecovery(boolean)
+   */
+  public boolean skipLogErrorOnRecovery() {
+    assert(isInitialized());
+    return skipLogErrorOnRecovery(nativeHandle_);
+  }
+  private native boolean skipLogErrorOnRecovery(long handle);
+
+  /**
+   * Skip log corruption error on recovery (If client is ok with
+   * losing most recent changes)
+   * Default: false
+   *
+   * @param skip true if log corruption errors are skipped during recovery.
+   * @return the reference to the current option.
+   * @see skipLogErrorOnRecovery()
+   */
+  public Options setSkipLogErrorOnRecovery(boolean skip) {
+    assert(isInitialized());
+    setSkipLogErrorOnRecovery(nativeHandle_, skip);
+    return this;
+  }
+  private native void setSkipLogErrorOnRecovery(
+      long handle, boolean skip);
+
+  /**
+   * If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
+   * Default: 3600 (1 hour)
+   *
+   * @return time interval in seconds.
+   * @see setStatsDumpPeriodSec(int)
+   */
+  public int statsDumpPeriodSec() {
+    assert(isInitialized());
+    return statsDumpPeriodSec(nativeHandle_);
+  }
+  private native int statsDumpPeriodSec(long handle);
+
+  /**
+   * If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
+   * Default: 3600 (1 hour)
+   *
+   * @param statsDumpPeriodSec time interval in seconds.
+   * @return the reference to the current option.
+   * @see statsDumpPeriodSec()
+   */
+  public Options setStatsDumpPeriodSec(int statsDumpPeriodSec) {
+    assert(isInitialized());
+    setStatsDumpPeriodSec(nativeHandle_, statsDumpPeriodSec);
+    return this;
+  }
+  private native void setStatsDumpPeriodSec(
+      long handle, int statsDumpPeriodSec);
+
+  /**
+   * If set true, will hint the underlying file system that the file
+   * access pattern is random, when a sst file is opened.
+   * Default: true
+   *
+   * @return true if hinting random access is on.
+   */
+  public boolean adviseRandomOnOpen() {
+    return adviseRandomOnOpen(nativeHandle_);
+  }
+  private native boolean adviseRandomOnOpen(long handle);
+
+  /**
+   * If set true, will hint the underlying file system that the file
+   * access pattern is random, when a sst file is opened.
+   * Default: true
+   *
+   * @param adviseRandomOnOpen true if hinting random access is on.
+   * @return the reference to the current option.
+   * @see adviseRandomOnOpen()
+   */
+  public Options setAdviseRandomOnOpen(boolean adviseRandomOnOpen) {
+    assert(isInitialized());
+    setAdviseRandomOnOpen(nativeHandle_, adviseRandomOnOpen);
+    return this;
+  }
+  private native void setAdviseRandomOnOpen(
+      long handle, boolean adviseRandomOnOpen);
+
+  /**
+   * Use adaptive mutex, which spins in the user space before resorting
+   * to kernel. This could reduce context switch when the mutex is not
+   * heavily contended. However, if the mutex is hot, we could end up
+   * wasting spin time.
+   * Default: false
+   *
+   * @return true if adaptive mutex is used.
+   * @see setUseAdaptiveMutex(boolean)
+   */
+  public boolean useAdaptiveMutex() {
+    assert(isInitialized());
+    return useAdaptiveMutex(nativeHandle_);
+  }
+  private native boolean useAdaptiveMutex(long handle);
+
+  /**
+   * Use adaptive mutex, which spins in the user space before resorting
+   * to kernel. This could reduce context switch when the mutex is not
+   * heavily contended. However, if the mutex is hot, we could end up
+   * wasting spin time.
+   * Default: false
+   *
+   * @param useAdaptiveMutex true if adaptive mutex is used.
+   * @return the reference to the current option.
+   * @see useAdaptiveMutex()
+   */
+  public Options setUseAdaptiveMutex(boolean useAdaptiveMutex) {
+    assert(isInitialized());
+    setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex);
+    return this;
+  }
+  private native void setUseAdaptiveMutex(
+      long handle, boolean useAdaptiveMutex);
+
+  /**
+   * Allows OS to incrementally sync files to disk while they are being
+   * written, asynchronously, in the background.
+   * Issue one request for every bytes_per_sync written. 0 turns it off.
+   * Default: 0
+   *
+   * @return size in bytes
+   */
+  public long bytesPerSync() {
+    return bytesPerSync(nativeHandle_);
+  }
+  private native long bytesPerSync(long handle);
+
+  /**
+   * Allows OS to incrementally sync files to disk while they are being
+   * written, asynchronously, in the background.
+   * Issue one request for every bytes_per_sync written. 0 turns it off.
+   * Default: 0
+   *
+   * @param bytesPerSync size in bytes
+   * @return the reference to the current option.
+   * @see bytesPerSync()
+   */
+  public Options setBytesPerSync(long bytesPerSync) {
+    assert(isInitialized());
+    setBytesPerSync(nativeHandle_, bytesPerSync);
+    return this;
+  }
+  private native void setBytesPerSync(
+      long handle, long bytesPerSync);
+
+  /**
+   * Allow RocksDB to use thread local storage to optimize performance.
+   * Default: true
+   *
+   * @return true if thread-local storage is allowed
+   * @see setAllowThreadLocal(boolean)
+   */
+  public boolean allowThreadLocal() {
+    assert(isInitialized());
+    return allowThreadLocal(nativeHandle_);
+  }
+  private native boolean allowThreadLocal(long handle);
+
+  /**
+   * Allow RocksDB to use thread local storage to optimize performance.
+   * Default: true
+   *
+   * @param allowThreadLocal true if thread-local storage is allowed.
+   * @return the reference to the current option.
+   * @see allowThreadLocal()
+   */
+  public Options setAllowThreadLocal(boolean allowThreadLocal) {
+    assert(isInitialized());
+    setAllowThreadLocal(nativeHandle_, allowThreadLocal);
+    return this;
+  }
+  private native void setAllowThreadLocal(
+      long handle, boolean allowThreadLocal);
+
+  /**
+   * Set the config for mem-table.
+   *
+   * @param config the mem-table config.
+   * @return the instance of the current Options.
+   */
+  public Options setMemTableConfig(MemTableConfig config) {
+    setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle());
+    return this;
+  }
+
+  /**
+   * Returns the name of the current mem table representation.
+   * Memtable format can be set using setMemTableConfig.
+   *
+   * @return the name of the currently-used memtable factory.
+   * @see setMemTableConfig()
+   */
+  public String memTableFactoryName() {
+    assert(isInitialized());
+    return memTableFactoryName(nativeHandle_);
+  }
+
+  /**
+   * Set the config for table format.
+   *
+   * @param config the table format config.
+   * @return the reference of the current Options.
+   */
+  public Options setTableFormatConfig(TableFormatConfig config) {
+    setTableFactory(nativeHandle_, config.newTableFactoryHandle());
+    return this;
+  }
+
+  /**
+   * @return the name of the currently used table factory.
+   * @see setTableFormatConfig()
+   */
+  public String tableFactoryName() {
+    assert(isInitialized());
+    return tableFactoryName(nativeHandle_);
+  }
+
+  /**
+   * This prefix-extractor uses the first n bytes of a key as its prefix.
+   *
+   * In some hash-based memtable representation such as HashLinkedList
+   * and HashSkipList, prefixes are used to partition the keys into
+   * several buckets.  Prefix extractor is used to specify how to
+   * extract the prefix given a key.
+   *
+   * @param n use the first n bytes of a key as its prefix.
+   * @return the reference to the current option.
+   */
+  public Options useFixedLengthPrefixExtractor(int n) {
+    assert(isInitialized());
+    useFixedLengthPrefixExtractor(nativeHandle_, n);
+    return this;
+  }
+
+///////////////////////////////////////////////////////////////////////
+  /**
+   * Number of keys between restart points for delta encoding of keys.
+   * This parameter can be changed dynamically.  Most clients should
+   * leave this parameter alone.
+   * Default: 16
+   *
+   * @return the number of keys between restart points.
+   * @see setBlockRestartInterval(int)
+   */
+  public int blockRestartInterval() {
+    return blockRestartInterval(nativeHandle_);
+  }
+  private native int blockRestartInterval(long handle);
+
+  /**
+   * Number of keys between restart points for delta encoding of keys.
+   * This parameter can be changed dynamically.  Most clients should
+   * leave this parameter alone.
+   * Default: 16
+   *
+   * @param blockRestartInterval the number of keys between restart points.
+   * @return the reference to the current option.
+   * @see blockRestartInterval()
+   */
+  public Options setBlockRestartInterval(int blockRestartInterval) {
+    setBlockRestartInterval(nativeHandle_, blockRestartInterval);
+    return this;
+  }
+  private native void setBlockRestartInterval(
+      long handle, int blockRestartInterval);
+
+  /**
+   * If true, place whole keys in the filter (not just prefixes).
+   * This must generally be true for gets to be efficient.
+   * Default: true
+   *
+   * @return if true, then whole-key-filtering is on.
+   * @see setWholeKeyFiltering(boolean)
+   */
+  public boolean wholeKeyFiltering() {
+    return wholeKeyFiltering(nativeHandle_);
+  }
+  private native boolean wholeKeyFiltering(long handle);
+
+  /**
+   * If true, place whole keys in the filter (not just prefixes).
+   * This must generally be true for gets to be efficient.
+   * Default: true
+   *
+   * @param wholeKeyFiltering if true, then whole-key-filtering is on.
+   * @return the reference to the current option.
+   * @see wholeKeyFiltering()
+   */
+  public Options setWholeKeyFiltering(boolean wholeKeyFiltering) {
+    setWholeKeyFiltering(nativeHandle_, wholeKeyFiltering);
+    return this;
+  }
+  private native void setWholeKeyFiltering(
+      long handle, boolean wholeKeyFiltering);
+
+  /**
+   * If level-styled compaction is used, then this number determines
+   * the total number of levels.
+   *
+   * @return the number of levels.
+   * @see setNumLevels(int)
+   */
+  public int numLevels() {
+    return numLevels(nativeHandle_);
+  }
+  private native int numLevels(long handle);
+
+  /**
+   * Set the number of levels for this database
+   * If level-styled compaction is used, then this number determines
+   * the total number of levels.
+   *
+   * @param numLevels the number of levels.
+   * @return the reference to the current option.
+   * @see numLevels()
+   */
+  public Options setNumLevels(int numLevels) {
+    setNumLevels(nativeHandle_, numLevels);
+    return this;
+  }
+  private native void setNumLevels(
+      long handle, int numLevels);
+
+  /**
+   * The number of files in level 0 to trigger compaction from level-0 to
+   * level-1.  A value < 0 means that level-0 compaction will not be
+   * triggered by number of files at all.
+   * Default: 4
+   *
+   * @return the number of files in level 0 to trigger compaction.
+   * @see setLevelZeroFileNumCompactionTrigger(int)
+   */
+  public int levelZeroFileNumCompactionTrigger() {
+    return levelZeroFileNumCompactionTrigger(nativeHandle_);
+  }
+  private native int levelZeroFileNumCompactionTrigger(long handle);
+
+  /**
+   * Number of files to trigger level-0 compaction. A value <0 means that
+   * level-0 compaction will not be triggered by number of files at all.
+   * Default: 4
+   *
+   * @param numFiles the number of files in level-0 to trigger compaction.
+   * @return the reference to the current option.
+   * @see levelZeroFileNumCompactionTrigger()
+   */
+  public Options setLevelZeroFileNumCompactionTrigger(
+      int numFiles) {
+    setLevelZeroFileNumCompactionTrigger(
+        nativeHandle_, numFiles);
+    return this;
+  }
+  private native void setLevelZeroFileNumCompactionTrigger(
+      long handle, int numFiles);
+
+  /**
+   * Soft limit on the number of level-0 files. We start slowing down writes
+   * at this point. A value < 0 means that no writing slow down will be
+   * triggered by number of files in level-0.
+   *
+   * @return the soft limit on the number of level-0 files.
+   * @see setLevelZeroSlowdownWritesTrigger(int)
+   */
+  public int levelZeroSlowdownWritesTrigger() {
+    return levelZeroSlowdownWritesTrigger(nativeHandle_);
+  }
+  private native int levelZeroSlowdownWritesTrigger(long handle);
+
+  /**
+   * Soft limit on number of level-0 files. We start slowing down writes at this
+   * point. A value <0 means that no writing slow down will be triggered by
+   * number of files in level-0.
+   *
+   * @param numFiles soft limit on number of level-0 files.
+   * @return the reference to the current option.
+   * @see levelZeroSlowdownWritesTrigger()
+   */
+  public Options setLevelZeroSlowdownWritesTrigger(
+      int numFiles) {
+    setLevelZeroSlowdownWritesTrigger(nativeHandle_, numFiles);
+    return this;
+  }
+  private native void setLevelZeroSlowdownWritesTrigger(
+      long handle, int numFiles);
+
+  /**
+   * Maximum number of level-0 files.  We stop writes at this point.
+   *
+   * @return the hard limit of the number of level-0 file.
+   * @see setLevelZeroStopWritesTrigger(int)
+   */
+  public int levelZeroStopWritesTrigger() {
+    return levelZeroStopWritesTrigger(nativeHandle_);
+  }
+  private native int levelZeroStopWritesTrigger(long handle);
+
+  /**
+   * Maximum number of level-0 files.  We stop writes at this point.
+   *
+   * @param numFiles the hard limit of the number of level-0 files.
+   * @return the reference to the current option.
+   * @see levelZeroStopWritesTrigger()
+   */
+  public Options setLevelZeroStopWritesTrigger(int numFiles) {
+    setLevelZeroStopWritesTrigger(nativeHandle_, numFiles);
+    return this;
+  }
+  private native void setLevelZeroStopWritesTrigger(
+      long handle, int numFiles);
+
+  /**
+   * The highest level to which a new compacted memtable is pushed if it
+   * does not create overlap.  We try to push to level 2 to avoid the
+   * relatively expensive level 0=>1 compactions and to avoid some
+   * expensive manifest file operations.  We do not push all the way to
+   * the largest level since that can generate a lot of wasted disk
+   * space if the same key space is being repeatedly overwritten.
+   *
+   * @return the highest level where a new compacted memtable will be pushed.
+   * @see setMaxMemCompactionLevel(int)
+   */
+  public int maxMemCompactionLevel() {
+    return maxMemCompactionLevel(nativeHandle_);
+  }
+  private native int maxMemCompactionLevel(long handle);
+
+  /**
+   * The highest level to which a new compacted memtable is pushed if it
+   * does not create overlap.  We try to push to level 2 to avoid the
+   * relatively expensive level 0=>1 compactions and to avoid some
+   * expensive manifest file operations.  We do not push all the way to
+   * the largest level since that can generate a lot of wasted disk
+   * space if the same key space is being repeatedly overwritten.
+   *
+   * @param maxMemCompactionLevel the highest level to which a new compacted
+   *     mem-table will be pushed.
+   * @return the reference to the current option.
+   * @see maxMemCompactionLevel()
+   */
+  public Options setMaxMemCompactionLevel(int maxMemCompactionLevel) {
+    setMaxMemCompactionLevel(nativeHandle_, maxMemCompactionLevel);
+    return this;
+  }
+  private native void setMaxMemCompactionLevel(
+      long handle, int maxMemCompactionLevel);
+
+  /**
+   * The target file size for compaction.
+   * This targetFileSizeBase determines a level-1 file size.
+   * Target file size for level L can be calculated by
+   * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1))
+   * For example, if targetFileSizeBase is 2MB and
+   * target_file_size_multiplier is 10, then each file on level-1 will
+   * be 2MB, and each file on level 2 will be 20MB,
+   * and each file on level-3 will be 200MB.
+   * by default targetFileSizeBase is 2MB.
+   *
+   * @return the target size of a level-1 file.
+   *
+   * @see targetFileSizeMultiplier()
+   */
+  public int targetFileSizeBase() {
+    return targetFileSizeBase(nativeHandle_);
+  }
+  private native int targetFileSizeBase(long handle);
+
+  /**
+   * The target file size for compaction.
+   * This targetFileSizeBase determines a level-1 file size.
+   * Target file size for level L can be calculated by
+   * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1))
+   * For example, if targetFileSizeBase is 2MB and
+   * target_file_size_multiplier is 10, then each file on level-1 will
+   * be 2MB, and each file on level 2 will be 20MB,
+   * and each file on level-3 will be 200MB.
+   * by default targetFileSizeBase is 2MB.
+   *
+   * @param targetFileSizeBase the target size of a level-1 file.
+   * @return the reference to the current option.
+   *
+   * @see setTargetFileSizeMultiplier()
+   */
+  public Options setTargetFileSizeBase(int targetFileSizeBase) {
+    setTargetFileSizeBase(nativeHandle_, targetFileSizeBase);
+    return this;
+  }
+  private native void setTargetFileSizeBase(
+      long handle, int targetFileSizeBase);
+
+  /**
+   * targetFileSizeMultiplier defines the size ratio between a
+   * level-(L+1) file and level-L file.
+   * By default targetFileSizeMultiplier is 1, meaning
+   * files in different levels have the same target.
+   *
+   * @return the size ratio between a level-(L+1) file and level-L file.
+   */
+  public int targetFileSizeMultiplier() {
+    return targetFileSizeMultiplier(nativeHandle_);
+  }
+  private native int targetFileSizeMultiplier(long handle);
+
+  /**
+   * targetFileSizeMultiplier defines the size ratio between a
+   * level-L file and level-(L+1) file.
+   * By default target_file_size_multiplier is 1, meaning
+   * files in different levels have the same target.
+   *
+   * @param multiplier the size ratio between a level-(L+1) file
+   *     and level-L file.
+   * @return the reference to the current option.
+   */
+  public Options setTargetFileSizeMultiplier(int multiplier) {
+    setTargetFileSizeMultiplier(nativeHandle_, multiplier);
+    return this;
+  }
+  private native void setTargetFileSizeMultiplier(
+      long handle, int multiplier);
+
+  /**
+   * The upper-bound of the total size of level-1 files in bytes.
+   * Maximum number of bytes for level L can be calculated as
+   * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1))
+   * For example, if maxBytesForLevelBase is 20MB, and if
+   * max_bytes_for_level_multiplier is 10, total data size for level-1
+   * will be 20MB, total file size for level-2 will be 200MB,
+   * and total file size for level-3 will be 2GB.
+   * by default 'maxBytesForLevelBase' is 10MB.
+   *
+   * @return the upper-bound of the total size of level-1 files in bytes.
+   * @see maxBytesForLevelMultiplier()
+   */
+  public long maxBytesForLevelBase() {
+    return maxBytesForLevelBase(nativeHandle_);
+  }
+  private native long maxBytesForLevelBase(long handle);
+
+  /**
+   * The upper-bound of the total size of level-1 files in bytes.
+   * Maximum number of bytes for level L can be calculated as
+   * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1))
+   * For example, if maxBytesForLevelBase is 20MB, and if
+   * max_bytes_for_level_multiplier is 10, total data size for level-1
+   * will be 20MB, total file size for level-2 will be 200MB,
+   * and total file size for level-3 will be 2GB.
+   * by default 'maxBytesForLevelBase' is 10MB.
+   *
+   * @param maxBytesForLevelBase the upper-bound of the total size of
+   *     level-1 files in bytes.
+   * @return the reference to the current option.
+   * @see setMaxBytesForLevelMultiplier()
+   */
+  public Options setMaxBytesForLevelBase(long maxBytesForLevelBase) {
+    setMaxBytesForLevelBase(nativeHandle_, maxBytesForLevelBase);
+    return this;
+  }
+  private native void setMaxBytesForLevelBase(
+      long handle, long maxBytesForLevelBase);
+
+  /**
+   * The ratio between the total size of level-(L+1) files and the total
+   * size of level-L files for all L.
+   * DEFAULT: 10
+   *
+   * @return the ratio between the total size of level-(L+1) files and
+   *     the total size of level-L files for all L.
+   * @see maxBytesForLevelBase()
+   */
+  public int maxBytesForLevelMultiplier() {
+    return maxBytesForLevelMultiplier(nativeHandle_);
+  }
+  private native int maxBytesForLevelMultiplier(long handle);
+
+  /**
+   * The ratio between the total size of level-(L+1) files and the total
+   * size of level-L files for all L.
+   * DEFAULT: 10
+   *
+   * @param multiplier the ratio between the total size of level-(L+1)
+   *     files and the total size of level-L files for all L.
+   * @return the reference to the current option.
+   * @see setMaxBytesForLevelBase()
+   */
+  public Options setMaxBytesForLevelMultiplier(int multiplier) {
+    setMaxBytesForLevelMultiplier(nativeHandle_, multiplier);
+    return this;
+  }
+  private native void setMaxBytesForLevelMultiplier(
+      long handle, int multiplier);
+
+  /**
+   * Maximum number of bytes in all compacted files.  We avoid expanding
+   * the lower level file set of a compaction if it would make the
+   * total compaction cover more than
+   * (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
+   *
+   * @return the maximum number of bytes in all compacted files.
+   * @see sourceCompactionFactor()
+   */
+  public int expandedCompactionFactor() {
+    return expandedCompactionFactor(nativeHandle_);
+  }
+  private native int expandedCompactionFactor(long handle);
+
+  /**
+   * Maximum number of bytes in all compacted files.  We avoid expanding
+   * the lower level file set of a compaction if it would make the
+   * total compaction cover more than
+   * (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
+   *
+   * @param expandedCompactionFactor the maximum number of bytes in all
+   *     compacted files.
+   * @return the reference to the current option.
+   * @see setSourceCompactionFactor()
+   */
+  public Options setExpandedCompactionFactor(int expandedCompactionFactor) {
+    setExpandedCompactionFactor(nativeHandle_, expandedCompactionFactor);
+    return this;
+  }
+  private native void setExpandedCompactionFactor(
+      long handle, int expandedCompactionFactor);
+
+  /**
+   * Maximum number of bytes in all source files to be compacted in a
+   * single compaction run. We avoid picking too many files in the
+   * source level so that we do not exceed the total source bytes
+   * for compaction to exceed
+   * (source_compaction_factor * targetFileSizeLevel()) many bytes.
+   * Default:1, i.e. pick maxfilesize amount of data as the source of
+   * a compaction.
+   *
+   * @return the maximum number of bytes in all source files to be compacted.
+   * @see expandedCompactionFactor()
+   */
+  public int sourceCompactionFactor() {
+    return sourceCompactionFactor(nativeHandle_);
+  }
+  private native int sourceCompactionFactor(long handle);
+
+  /**
+   * Maximum number of bytes in all source files to be compacted in a
+   * single compaction run. We avoid picking too many files in the
+   * source level so that we do not exceed the total source bytes
+   * for compaction to exceed
+   * (source_compaction_factor * targetFileSizeLevel()) many bytes.
+   * Default:1, i.e. pick maxfilesize amount of data as the source of
+   * a compaction.
+   *
+   * @param sourceCompactionFactor the maximum number of bytes in all
+   *     source files to be compacted in a single compaction run.
+   * @return the reference to the current option.
+   * @see setExpandedCompactionFactor()
+   */
+  public Options setSourceCompactionFactor(int sourceCompactionFactor) {
+    setSourceCompactionFactor(nativeHandle_, sourceCompactionFactor);
+    return this;
+  }
+  private native void setSourceCompactionFactor(
+      long handle, int sourceCompactionFactor);
+
+  /**
+   * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
+   * stop building a single file in a level->level+1 compaction.
+   *
+   * @return maximum bytes of overlaps in "grandparent" level.
+   */
+  public int maxGrandparentOverlapFactor() {
+    return maxGrandparentOverlapFactor(nativeHandle_);
+  }
+  private native int maxGrandparentOverlapFactor(long handle);
+
+  /**
+   * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
+   * stop building a single file in a level->level+1 compaction.
+   *
+   * @param maxGrandparentOverlapFactor maximum bytes of overlaps in
+   *     "grandparent" level.
+   * @return the reference to the current option.
+   */
+  public Options setMaxGrandparentOverlapFactor(
+      int maxGrandparentOverlapFactor) {
+    setMaxGrandparentOverlapFactor(nativeHandle_, maxGrandparentOverlapFactor);
+    return this;
+  }
+  private native void setMaxGrandparentOverlapFactor(
+      long handle, int maxGrandparentOverlapFactor);
+
+  /**
+   * Puts are delayed 0-1 ms when any level has a compaction score that exceeds
+   * soft_rate_limit. This is ignored when == 0.0.
+   * CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
+   * hold, RocksDB will set soft_rate_limit = hard_rate_limit
+   * Default: 0 (disabled)
+   *
+   * @return soft-rate-limit for put delay.
+   */
+  public double softRateLimit() {
+    return softRateLimit(nativeHandle_);
+  }
+  private native double softRateLimit(long handle);
+
+  /**
+   * Puts are delayed 0-1 ms when any level has a compaction score that exceeds
+   * soft_rate_limit. This is ignored when == 0.0.
+   * CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
+   * hold, RocksDB will set soft_rate_limit = hard_rate_limit
+   * Default: 0 (disabled)
+   *
+   * @param softRateLimit the soft-rate-limit of a compaction score
+   *     for put delay.
+   * @return the reference to the current option.
+   */
+  public Options setSoftRateLimit(double softRateLimit) {
+    setSoftRateLimit(nativeHandle_, softRateLimit);
+    return this;
+  }
+  private native void setSoftRateLimit(
+      long handle, double softRateLimit);
+
+  /**
+   * Puts are delayed 1ms at a time when any level has a compaction score that
+   * exceeds hard_rate_limit. This is ignored when <= 1.0.
+   * Default: 0 (disabled)
+   *
+   * @return the hard-rate-limit of a compaction score for put delay.
+   */
+  public double hardRateLimit() {
+    return hardRateLimit(nativeHandle_);
+  }
+  private native double hardRateLimit(long handle);
+
+  /**
+   * Puts are delayed 1ms at a time when any level has a compaction score that
+   * exceeds hard_rate_limit. This is ignored when <= 1.0.
+   * Default: 0 (disabled)
+   *
+   * @param hardRateLimit the hard-rate-limit of a compaction score for put
+   *     delay.
+   * @return the reference to the current option.
+   */
+  public Options setHardRateLimit(double hardRateLimit) {
+    setHardRateLimit(nativeHandle_, hardRateLimit);
+    return this;
+  }
+  private native void setHardRateLimit(
+      long handle, double hardRateLimit);
+
+  /**
+   * The maximum time interval a put will be stalled when hard_rate_limit
+   * is enforced.  If 0, then there is no limit.
+   * Default: 1000
+   *
+   * @return the maximum time interval a put will be stalled when
+   *     hard_rate_limit is enforced.
+   */
+  public int rateLimitDelayMaxMilliseconds() {
+    return rateLimitDelayMaxMilliseconds(nativeHandle_);
+  }
+  private native int rateLimitDelayMaxMilliseconds(long handle);
+
+  /**
+   * The maximum time interval a put will be stalled when hard_rate_limit
+   * is enforced. If 0, then there is no limit.
+   * Default: 1000
+   *
+   * @param rateLimitDelayMaxMilliseconds the maximum time interval a put
+   *     will be stalled.
+   * @return the reference to the current option.
+   */
+  public Options setRateLimitDelayMaxMilliseconds(
+      int rateLimitDelayMaxMilliseconds) {
+    setRateLimitDelayMaxMilliseconds(
+        nativeHandle_, rateLimitDelayMaxMilliseconds);
+    return this;
+  }
+  private native void setRateLimitDelayMaxMilliseconds(
+      long handle, int rateLimitDelayMaxMilliseconds);
+
+  /**
+   * Disable block cache. If this is set to true,
+   * then no block cache should be used, and the block_cache should
+   * point to a nullptr object.
+   * Default: false
+   *
+   * @return true if block cache is disabled.
+   */
+  public boolean noBlockCache() {
+    return noBlockCache(nativeHandle_);
+  }
+  private native boolean noBlockCache(long handle);
+
+  /**
+   * Disable block cache. If this is set to true,
+   * then no block cache should be used, and the block_cache should
+   * point to a nullptr object.
+   * Default: false
+   *
+   * @param noBlockCache true if block-cache is disabled.
+   * @return the reference to the current option.
+   */
+  public Options setNoBlockCache(boolean noBlockCache) {
+    setNoBlockCache(nativeHandle_, noBlockCache);
+    return this;
+  }
+  private native void setNoBlockCache(
+      long handle, boolean noBlockCache);
+
+  /**
+   * The size of one block in arena memory allocation.
+   * If <= 0, a proper value is automatically calculated (usually 1/10 of
+   * write_buffer_size).
+   *
+   * There are two additional restrictions on the specified size:
+   * (1) size should be in the range of [4096, 2 << 30] and
+   * (2) be the multiple of the CPU word (which helps with the memory
+   * alignment).
+   *
+   * We'll automatically check and adjust the size number to make sure it
+   * conforms to the restrictions.
+   * Default: 0
+   *
+   * @return the size of an arena block
+   */
+  public long arenaBlockSize() {
+    return arenaBlockSize(nativeHandle_);
+  }
+  private native long arenaBlockSize(long handle);
+
+  /**
+   * The size of one block in arena memory allocation.
+   * If <= 0, a proper value is automatically calculated (usually 1/10 of
+   * write_buffer_size).
+   *
+   * There are two additional restrictions on the specified size:
+   * (1) size should be in the range of [4096, 2 << 30] and
+   * (2) be the multiple of the CPU word (which helps with the memory
+   * alignment).
+   *
+   * We'll automatically check and adjust the size number to make sure it
+   * conforms to the restrictions.
+   * Default: 0
+   *
+   * @param arenaBlockSize the size of an arena block
+   * @return the reference to the current option.
+   */
+  public Options setArenaBlockSize(long arenaBlockSize) {
+    setArenaBlockSize(nativeHandle_, arenaBlockSize);
+    return this;
+  }
+  private native void setArenaBlockSize(
+      long handle, long arenaBlockSize);
+
+  /**
+   * Disable automatic compactions. Manual compactions can still
+   * be issued on this column family
+   *
+   * @return true if auto-compactions are disabled.
+   */
+  public boolean disableAutoCompactions() {
+    return disableAutoCompactions(nativeHandle_);
+  }
+  private native boolean disableAutoCompactions(long handle);
+
+  /**
+   * Disable automatic compactions. Manual compactions can still
+   * be issued on this column family
+   *
+   * @param disableAutoCompactions true if auto-compactions are disabled.
+   * @return the reference to the current option.
+   */
+  public Options setDisableAutoCompactions(boolean disableAutoCompactions) {
+    setDisableAutoCompactions(nativeHandle_, disableAutoCompactions);
+    return this;
+  }
+  private native void setDisableAutoCompactions(
+      long handle, boolean disableAutoCompactions);
+
+  /**
+   * Purge duplicate/deleted keys when a memtable is flushed to storage.
+   * Default: true
+   *
+   * @return true if purging of duplicate/deleted keys during flush is enabled.
+   */
+  public boolean purgeRedundantKvsWhileFlush() {
+    return purgeRedundantKvsWhileFlush(nativeHandle_);
+  }
+  private native boolean purgeRedundantKvsWhileFlush(long handle);
+
+  /**
+   * Purge duplicate/deleted keys when a memtable is flushed to storage.
+   * Default: true
+   *
+   * @param purgeRedundantKvsWhileFlush true if purging of duplicate/deleted
+   *     keys during flush is enabled.
+   * @return the reference to the current option.
+   */
+  public Options setPurgeRedundantKvsWhileFlush(
+      boolean purgeRedundantKvsWhileFlush) {
+    setPurgeRedundantKvsWhileFlush(
+        nativeHandle_, purgeRedundantKvsWhileFlush);
+    return this;
+  }
+  private native void setPurgeRedundantKvsWhileFlush(
+      long handle, boolean purgeRedundantKvsWhileFlush);
+
+  /**
+   * This is used to close a block before it reaches the configured
+   * 'block_size'. If the percentage of free space in the current block is less
+   * than this specified number and adding a new record to the block will
+   * exceed the configured block size, then this block will be closed and the
+   * new record will be written to the next block.
+   * Default is 10.
+   *
+   * @return the block-size deviation, as a percentage of free space.
+   */
+  public int blockSizeDeviation() {
+    return blockSizeDeviation(nativeHandle_);
+  }
+  private native int blockSizeDeviation(long handle);
+
+  /**
+   * This is used to close a block before it reaches the configured
+   * 'block_size'. If the percentage of free space in the current block is less
+   * than this specified number and adding a new record to the block will
+   * exceed the configured block size, then this block will be closed and the
+   * new record will be written to the next block.
+   * Default is 10.
+   *
+   * @param blockSizeDeviation the block-size deviation, as a percentage
+   *     of free space.
+   * @return the reference to the current option.
+   */
+  public Options setBlockSizeDeviation(int blockSizeDeviation) {
+    setBlockSizeDeviation(nativeHandle_, blockSizeDeviation);
+    return this;
+  }
+  private native void setBlockSizeDeviation(
+      long handle, int blockSizeDeviation);
+
+  /**
+   * If true, compaction will verify checksum on every read that happens
+   * as part of compaction
+   * Default: true
+   *
+   * @return true if compaction verifies checksum on every read.
+   */
+  public boolean verifyChecksumsInCompaction() {
+    return verifyChecksumsInCompaction(nativeHandle_);
+  }
+  private native boolean verifyChecksumsInCompaction(long handle);
+
+  /**
+   * If true, compaction will verify checksum on every read that happens
+   * as part of compaction
+   * Default: true
+   *
+   * @param verifyChecksumsInCompaction true if compaction verifies
+   *     checksum on every read.
+   * @return the reference to the current option.
+   */
+  public Options setVerifyChecksumsInCompaction(
+      boolean verifyChecksumsInCompaction) {
+    setVerifyChecksumsInCompaction(
+        nativeHandle_, verifyChecksumsInCompaction);
+    return this;
+  }
+  private native void setVerifyChecksumsInCompaction(
+      long handle, boolean verifyChecksumsInCompaction);
+
+  /**
+   * Use KeyMayExist API to filter deletes when this is true.
+   * If KeyMayExist returns false, i.e. the key definitely does not exist, then
+   * the delete is a noop. KeyMayExist only incurs in-memory look up.
+   * This optimization avoids writing the delete to storage when appropriate.
+   * Default: false
+   *
+   * @return true if filter-deletes behavior is on.
+   */
+  public boolean filterDeletes() {
+    return filterDeletes(nativeHandle_);
+  }
+  private native boolean filterDeletes(long handle);
+
+  /**
+   * Use KeyMayExist API to filter deletes when this is true.
+   * If KeyMayExist returns false, i.e. the key definitely does not exist, then
+   * the delete is a noop. KeyMayExist only incurs in-memory look up.
+   * This optimization avoids writing the delete to storage when appropriate.
+   * Default: false
+   *
+   * @param filterDeletes true if filter-deletes behavior is on.
+   * @return the reference to the current option.
+   */
+  public Options setFilterDeletes(boolean filterDeletes) {
+    setFilterDeletes(nativeHandle_, filterDeletes);
+    return this;
+  }
+  private native void setFilterDeletes(
+      long handle, boolean filterDeletes);
+
+  /**
+   * An iteration->Next() sequentially skips over keys with the same
+   * user-key unless this option is set. This number specifies the number
+   * of keys (with the same userkey) that will be sequentially
+   * skipped before a reseek is issued.
+   * Default: 8
+   *
+   * @return the number of keys that could be skipped in an iteration.
+   */
+  public long maxSequentialSkipInIterations() {
+    return maxSequentialSkipInIterations(nativeHandle_);
+  }
+  private native long maxSequentialSkipInIterations(long handle);
+
+  /**
+   * An iteration->Next() sequentially skips over keys with the same
+   * user-key unless this option is set. This number specifies the number
+   * of keys (with the same userkey) that will be sequentially
+   * skipped before a reseek is issued.
+   * Default: 8
+   *
+   * @param maxSequentialSkipInIterations the number of keys that could
+   *     be skipped in an iteration.
+   * @return the reference to the current option.
+   */
+  public Options setMaxSequentialSkipInIterations(long maxSequentialSkipInIterations) {
+    setMaxSequentialSkipInIterations(nativeHandle_, maxSequentialSkipInIterations);
+    return this;
+  }
+  private native void setMaxSequentialSkipInIterations(
+      long handle, long maxSequentialSkipInIterations);
+
+  /**
+   * Allows thread-safe inplace updates.
+   * If inplace_callback function is not set,
+   *   Put(key, new_value) will update inplace the existing_value iff
+   *   * key exists in current memtable
+   *   * new sizeof(new_value) <= sizeof(existing_value)
+   *   * existing_value for that key is a put i.e. kTypeValue
+   * If inplace_callback function is set, check doc for inplace_callback.
+   * Default: false.
+   *
+   * @return true if thread-safe inplace updates are allowed.
+   */
+  public boolean inplaceUpdateSupport() {
+    return inplaceUpdateSupport(nativeHandle_);
+  }
+  private native boolean inplaceUpdateSupport(long handle);
+
+  /**
+   * Allows thread-safe inplace updates.
+   * If inplace_callback function is not set,
+   *   Put(key, new_value) will update inplace the existing_value iff
+   *   * key exists in current memtable
+   *   * new sizeof(new_value) <= sizeof(existing_value)
+   *   * existing_value for that key is a put i.e. kTypeValue
+   * If inplace_callback function is set, check doc for inplace_callback.
+   * Default: false.
+   *
+   * @param inplaceUpdateSupport true if thread-safe inplace updates
+   *     are allowed.
+   * @return the reference to the current option.
+   */
+  public Options setInplaceUpdateSupport(boolean inplaceUpdateSupport) {
+    setInplaceUpdateSupport(nativeHandle_, inplaceUpdateSupport);
+    return this;
+  }
+  private native void setInplaceUpdateSupport(
+      long handle, boolean inplaceUpdateSupport);
+
+  /**
+   * Number of locks used for inplace update
+   * Default: 10000, if inplace_update_support = true, else 0.
+   *
+   * @return the number of locks used for inplace update.
+   */
+  public long inplaceUpdateNumLocks() {
+    return inplaceUpdateNumLocks(nativeHandle_);
+  }
+  private native long inplaceUpdateNumLocks(long handle);
+
+  /**
+   * Number of locks used for inplace update
+   * Default: 10000, if inplace_update_support = true, else 0.
+   *
+   * @param inplaceUpdateNumLocks the number of locks used for
+   *     inplace updates.
+   * @return the reference to the current option.
+   */
+  public Options setInplaceUpdateNumLocks(long inplaceUpdateNumLocks) {
+    setInplaceUpdateNumLocks(nativeHandle_, inplaceUpdateNumLocks);
+    return this;
+  }
+  private native void setInplaceUpdateNumLocks(
+      long handle, long inplaceUpdateNumLocks);
+
+  /**
+   * Returns the number of bits used in the prefix bloom filter.
+   *
+   * This value will be used only when a prefix-extractor is specified.
+   *
+   * @return the number of bloom-bits.
+   * @see useFixedLengthPrefixExtractor()
+   */
+  public int memtablePrefixBloomBits() {
+    return memtablePrefixBloomBits(nativeHandle_);
+  }
+  private native int memtablePrefixBloomBits(long handle);
+
+  /**
+   * Sets the number of bits used in the prefix bloom filter.
+   *
+   * This value will be used only when a prefix-extractor is specified.
+   *
+   * @param memtablePrefixBloomBits the number of bits used in the
+   *     prefix bloom filter.
+   * @return the reference to the current option.
+   */
+  public Options setMemtablePrefixBloomBits(int memtablePrefixBloomBits) {
+    setMemtablePrefixBloomBits(nativeHandle_, memtablePrefixBloomBits);
+    return this;
+  }
+  private native void setMemtablePrefixBloomBits(
+      long handle, int memtablePrefixBloomBits);
+
+  /**
+   * The number of hash probes per key used in the mem-table.
+   *
+   * @return the number of hash probes per key.
+   */
+  public int memtablePrefixBloomProbes() {
+    return memtablePrefixBloomProbes(nativeHandle_);
+  }
+  private native int memtablePrefixBloomProbes(long handle);
+
+  /**
+   * The number of hash probes per key used in the mem-table.
+   *
+   * @param memtablePrefixBloomProbes the number of hash probes per key.
+   * @return the reference to the current option.
+   */
+  public Options setMemtablePrefixBloomProbes(int memtablePrefixBloomProbes) {
+    setMemtablePrefixBloomProbes(nativeHandle_, memtablePrefixBloomProbes);
+    return this;
+  }
+  private native void setMemtablePrefixBloomProbes(
+      long handle, int memtablePrefixBloomProbes);
+
+  /**
+   * Control locality of bloom filter probes to improve cache miss rate.
+   * This option only applies to memtable prefix bloom and plaintable
+   * prefix bloom. It essentially limits the max number of cache lines each
+   * bloom filter check can touch.
+   * This optimization is turned off when set to 0. The number should never
+   * be greater than number of probes. This option can boost performance
+   * for in-memory workload but should use with care since it can cause
+   * higher false positive rate.
+   * Default: 0
+   *
+   * @return the level of locality of bloom-filter probes.
+   * @see setMemtablePrefixBloomProbes
+   */
+  public int bloomLocality() {
+    return bloomLocality(nativeHandle_);
+  }
+  private native int bloomLocality(long handle);
+
+  /**
+   * Control locality of bloom filter probes to improve cache miss rate.
+   * This option only applies to memtable prefix bloom and plaintable
+   * prefix bloom. It essentially limits the max number of cache lines each
+   * bloom filter check can touch.
+   * This optimization is turned off when set to 0. The number should never
+   * be greater than number of probes. This option can boost performance
+   * for in-memory workload but should use with care since it can cause
+   * higher false positive rate.
+   * Default: 0
+   *
+   * @param bloomLocality the level of locality of bloom-filter probes.
+   * @return the reference to the current option.
+   */
+  public Options setBloomLocality(int bloomLocality) {
+    setBloomLocality(nativeHandle_, bloomLocality);
+    return this;
+  }
+  private native void setBloomLocality(
+      long handle, int bloomLocality);
+
+  /**
+   * Maximum number of successive merge operations on a key in the memtable.
+   *
+   * When a merge operation is added to the memtable and the maximum number of
+   * successive merges is reached, the value of the key will be calculated and
+   * inserted into the memtable instead of the merge operation. This will
+   * ensure that there are never more than max_successive_merges merge
+   * operations in the memtable.
+   *
+   * Default: 0 (disabled)
+   *
+   * @return the maximum number of successive merges.
+   */
+  public long maxSuccessiveMerges() {
+    return maxSuccessiveMerges(nativeHandle_);
+  }
+  private native long maxSuccessiveMerges(long handle);
+
+  /**
+   * Maximum number of successive merge operations on a key in the memtable.
+   *
+   * When a merge operation is added to the memtable and the maximum number of
+   * successive merges is reached, the value of the key will be calculated and
+   * inserted into the memtable instead of the merge operation. This will
+   * ensure that there are never more than max_successive_merges merge
+   * operations in the memtable.
+   *
+   * Default: 0 (disabled)
+   *
+   * @param maxSuccessiveMerges the maximum number of successive merges.
+   * @return the reference to the current option.
+   */
+  public Options setMaxSuccessiveMerges(long maxSuccessiveMerges) {
+    setMaxSuccessiveMerges(nativeHandle_, maxSuccessiveMerges);
+    return this;
+  }
+  private native void setMaxSuccessiveMerges(
+      long handle, long maxSuccessiveMerges);
+
+  /**
+   * The minimum number of write buffers that will be merged together
+   * before writing to storage.  If set to 1, then
+   * all write buffers are flushed to L0 as individual files and this increases
+   * read amplification because a get request has to check in all of these
+   * files. Also, an in-memory merge may result in writing lesser
+   * data to storage if there are duplicate records in each of these
+   * individual write buffers.  Default: 1
+   *
+   * @return the minimum number of write buffers that will be merged together.
+   */
+  public int minWriteBufferNumberToMerge() {
+    return minWriteBufferNumberToMerge(nativeHandle_);
+  }
+  private native int minWriteBufferNumberToMerge(long handle);
+
+  /**
+   * The minimum number of write buffers that will be merged together
+   * before writing to storage.  If set to 1, then
+   * all write buffers are flushed to L0 as individual files and this increases
+   * read amplification because a get request has to check in all of these
+   * files. Also, an in-memory merge may result in writing lesser
+   * data to storage if there are duplicate records in each of these
+   * individual write buffers.  Default: 1
+   *
+   * @param minWriteBufferNumberToMerge the minimum number of write buffers
+   *     that will be merged together.
+   * @return the reference to the current option.
+   */
+  public Options setMinWriteBufferNumberToMerge(int minWriteBufferNumberToMerge) {
+    setMinWriteBufferNumberToMerge(nativeHandle_, minWriteBufferNumberToMerge);
+    return this;
+  }
+  private native void setMinWriteBufferNumberToMerge(
+      long handle, int minWriteBufferNumberToMerge);
+
+  /**
+   * The number of partial merge operands to accumulate before partial
+   * merge will be performed. Partial merge will not be called
+   * if the list of values to merge is less than min_partial_merge_operands.
+   *
+   * If min_partial_merge_operands < 2, then it will be treated as 2.
+   *
+   * Default: 2
+   *
+   * @return the minimum number of partial merge operands to accumulate.
+   */
+  public int minPartialMergeOperands() {
+    return minPartialMergeOperands(nativeHandle_);
+  }
+  private native int minPartialMergeOperands(long handle);
+
+  /**
+   * The number of partial merge operands to accumulate before partial
+   * merge will be performed. Partial merge will not be called
+   * if the list of values to merge is less than min_partial_merge_operands.
+   *
+   * If min_partial_merge_operands < 2, then it will be treated as 2.
+   *
+   * Default: 2
+   *
+   * @param minPartialMergeOperands the minimum number of partial merge
+   *     operands to accumulate.
+   * @return the reference to the current option.
+   */
+  public Options setMinPartialMergeOperands(int minPartialMergeOperands) {
+    setMinPartialMergeOperands(nativeHandle_, minPartialMergeOperands);
+    return this;
+  }
+  private native void setMinPartialMergeOperands(
+      long handle, int minPartialMergeOperands);
+
+  /**
+   * Release the memory allocated for the current instance
+   * in the c++ side.
+   */
+  @Override public synchronized void dispose() {
+    if (isInitialized()) {
+      dispose0();
+    }
+  }
+
+  static final int DEFAULT_PLAIN_TABLE_BLOOM_BITS_PER_KEY = 10;
+  static final double DEFAULT_PLAIN_TABLE_HASH_TABLE_RATIO = 0.75;
+  static final int DEFAULT_PLAIN_TABLE_INDEX_SPARSENESS = 16;
+
+  private native void newOptions();
+  private native void dispose0();
+  private native void setCreateIfMissing(long handle, boolean flag);
+  private native boolean createIfMissing(long handle);
+  private native void setWriteBufferSize(long handle, long writeBufferSize);
+  private native long writeBufferSize(long handle);
+  private native void setMaxWriteBufferNumber(
+      long handle, int maxWriteBufferNumber);
+  private native int maxWriteBufferNumber(long handle);
+  private native void setBlockSize(long handle, long blockSize);
+  private native long blockSize(long handle);
+  private native void setDisableSeekCompaction(
+      long handle, boolean disableSeekCompaction);
+  private native boolean disableSeekCompaction(long handle);
+  private native void setMaxBackgroundCompactions(
+      long handle, int maxBackgroundCompactions);
+  private native int maxBackgroundCompactions(long handle);
+  private native void createStatistics(long optHandle);
+  private native long statisticsPtr(long optHandle);
+
+  private native void setMemTableFactory(long handle, long factoryHandle);
+  private native String memTableFactoryName(long handle);
+
+  private native void setTableFactory(long handle, long factoryHandle);
+  private native String tableFactoryName(long handle);
+
+  private native void useFixedLengthPrefixExtractor(
+      long handle, int prefixLength);
+
+  long cacheSize_;
+  Filter filter_;
+}
diff --git a/java/org/rocksdb/PlainTableConfig.java b/java/org/rocksdb/PlainTableConfig.java
new file mode 100644 (file)
index 0000000..554ce38
--- /dev/null
@@ -0,0 +1,123 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+/**
+ * The config for the plain table sst format.
+ *
+ * PlainTable is RocksDB's SST file format optimized for low query latency
+ * on pure-memory or really low-latency media.  It also supports the
+ * prefix-hash feature.
+ */
+public class PlainTableConfig extends TableFormatConfig {
+  public static final int VARIABLE_LENGTH = 0;
+  public static final int DEFAULT_BLOOM_BITS_PER_KEY = 10;
+  public static final double DEFAULT_HASH_TABLE_RATIO = 0.75;
+  public static final int DEFAULT_INDEX_SPARSENESS = 16;
+
+  public PlainTableConfig() {
+    keySize_ = VARIABLE_LENGTH;
+    bloomBitsPerKey_ = DEFAULT_BLOOM_BITS_PER_KEY;
+    hashTableRatio_ = DEFAULT_HASH_TABLE_RATIO;
+    indexSparseness_ = DEFAULT_INDEX_SPARSENESS;
+  }
+
+  /**
+   * Set the length of the user key. If it is set to be VARIABLE_LENGTH,
+   * then it indicates the user keys are variable-lengthed.  Otherwise,
+   * all the keys need to have the same length in byte.
+   * DEFAULT: VARIABLE_LENGTH
+   *
+   * @param keySize the length of the user key.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setKeySize(int keySize) {
+    keySize_ = keySize;
+    return this;
+  }
+
+  /**
+   * @return the specified size of the user key.  If VARIABLE_LENGTH,
+   *     then it indicates variable-length key.
+   */
+  public int keySize() {
+    return keySize_;
+  }
+
+  /**
+   * Set the number of bits per key used by the internal bloom filter
+   * in the plain table sst format.
+   *
+   * @param bitsPerKey the number of bits per key for bloom filer.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setBloomBitsPerKey(int bitsPerKey) {
+    bloomBitsPerKey_ = bitsPerKey;
+    return this;
+  }
+
+  /**
+   * @return the number of bits per key used for the bloom filter.
+   */
+  public int bloomBitsPerKey() {
+    return bloomBitsPerKey_;
+  }
+
+  /**
+   * hashTableRatio is the desired utilization of the hash table used
+   * for prefix hashing.  The ideal ratio would be the number of
+   * prefixes / the number of hash buckets.  If this value is set to
+   * zero, then hash table will not be used.
+   *
+   * @param ratio the hash table ratio.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setHashTableRatio(double ratio) {
+    hashTableRatio_ = ratio;
+    return this;
+  }
+
+  /**
+   * @return the hash table ratio.
+   */
+  public double hashTableRatio() {
+    return hashTableRatio_;
+  }
+
+  /**
+   * Index sparseness determines the index interval for keys inside the
+   * same prefix.  This number is equal to the maximum number of linear
+   * search required after hash and binary search.  If it's set to 0,
+   * then each key will be indexed.
+   *
+   * @param sparseness the index sparseness.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setIndexSparseness(int sparseness) {
+    indexSparseness_ = sparseness;
+    return this;
+  }
+
+  /**
+   * @return the index sparseness.
+   */
+  public int indexSparseness() {
+    return indexSparseness_;
+  }
+
+  @Override protected long newTableFactoryHandle() {
+    return newTableFactoryHandle(keySize_, bloomBitsPerKey_,
+        hashTableRatio_, indexSparseness_);
+  }
+
+  private native long newTableFactoryHandle(
+      int keySize, int bloomBitsPerKey,
+      double hashTableRatio, int indexSparseness);
+
+  private int keySize_;
+  private int bloomBitsPerKey_;
+  private double hashTableRatio_;
+  private int indexSparseness_;
+}
diff --git a/java/org/rocksdb/ReadOptions.java b/java/org/rocksdb/ReadOptions.java
new file mode 100644 (file)
index 0000000..23250fc
--- /dev/null
@@ -0,0 +1,130 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * The class that controls the get behavior.
+ *
+ * Note that dispose() must be called before a ReadOptions instance
+ * goes out of scope in order to release the memory allocated in c++.
+ */
+public class ReadOptions extends RocksObject {
+  public ReadOptions() {
+    super();
+    newReadOptions();
+  }
+  private native void newReadOptions();
+
+  /**
+   * Release the memory allocated for the current instance
+   * in the c++ side.
+   *
+   * Calling other methods after dispose() leads to undefined behavior.
+   */
+  @Override public synchronized void dispose() {
+    if (isInitialized()) {
+      dispose(nativeHandle_);
+    }
+  }
+  private native void dispose(long handle);
+
+  /**
+   * If true, all data read from underlying storage will be
+   * verified against corresponding checksums.
+   * Default: true
+   *
+   * @return true if checksum verification is on.
+   */
+  public boolean verifyChecksums() {
+    assert(isInitialized());
+    return verifyChecksums(nativeHandle_);
+  }
+  private native boolean verifyChecksums(long handle);
+
+  /**
+   * If true, all data read from underlying storage will be
+   * verified against corresponding checksums.
+   * Default: true
+   *
+   * @param verifyChecksums if true, then checksum verification
+   *     will be performed on every read.
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setVerifyChecksums(boolean verifyChecksums) {
+    assert(isInitialized());
+    setVerifyChecksums(nativeHandle_, verifyChecksums);
+    return this;
+  }
+  private native void setVerifyChecksums(
+      long handle, boolean verifyChecksums);
+
+  // TODO(yhchiang): this option seems to be block-based table only.
+  //                 move this to a better place?
+  /**
+   * Fill the cache when loading the block-based sst formated db.
+   * Callers may wish to set this field to false for bulk scans.
+   * Default: true
+   *
+   * @return true if the fill-cache behavior is on.
+   */
+  public boolean fillCache() {
+    assert(isInitialized());
+    return fillCache(nativeHandle_);
+  }
+  private native boolean fillCache(long handle);
+
+  /**
+   * Fill the cache when loading the block-based sst formated db.
+   * Callers may wish to set this field to false for bulk scans.
+   * Default: true
+   *
+   * @param fillCache if true, then fill-cache behavior will be
+   *     performed.
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setFillCache(boolean fillCache) {
+    assert(isInitialized());
+    setFillCache(nativeHandle_, fillCache);
+    return this;
+  }
+  private native void setFillCache(
+      long handle, boolean fillCache);
+
+  /**
+   * Specify to create a tailing iterator -- a special iterator that has a
+   * view of the complete database (i.e. it can also be used to read newly
+   * added data) and is optimized for sequential reads. It will return records
+   * that were inserted into the database after the creation of the iterator.
+   * Default: false
+   * Not supported in ROCKSDB_LITE mode!
+   *
+   * @return true if tailing iterator is enabled.
+   */
+  public boolean tailing() {
+    assert(isInitialized());
+    return tailing(nativeHandle_);
+  }
+  private native boolean tailing(long handle);
+
+  /**
+   * Specify to create a tailing iterator -- a special iterator that has a
+   * view of the complete database (i.e. it can also be used to read newly
+   * added data) and is optimized for sequential reads. It will return records
+   * that were inserted into the database after the creation of the iterator.
+   * Default: false
+   * Not supported in ROCKSDB_LITE mode!
+   *
+   * @param tailing if true, then tailing iterator will be enabled.
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setTailing(boolean tailing) {
+    assert(isInitialized());
+    setTailing(nativeHandle_, tailing);
+    return this;
+  }
+  private native void setTailing(
+      long handle, boolean tailing);
+}
diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java
new file mode 100644 (file)
index 0000000..e92acea
--- /dev/null
@@ -0,0 +1,376 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.List;
+import java.util.Map;
+import java.util.HashMap;
+import java.io.Closeable;
+import java.io.IOException;
+import org.rocksdb.util.Environment;
+
+/**
+ * A RocksDB is a persistent ordered map from keys to values.  It is safe for
+ * concurrent access from multiple threads without any external synchronization.
+ * All methods of this class could potentially throw RocksDBException, which
+ * indicates that something went wrong on the rocksdb library side and the
+ * call failed.
+ */
+public class RocksDB extends RocksObject {
+  public static final int NOT_FOUND = -1;
+  private static final String[] compressionLibs_ = {
+      "snappy", "zlib", "bzip2", "lz4", "lz4hc"};
+
+  /**
+   * Loads the necessary library files.
+   * Calling this method twice will have no effect.
+   */
+  public static synchronized void loadLibrary() {
+    // loading possibly necessary libraries.
+    for (String lib : compressionLibs_) {
+      try {
+      System.loadLibrary(lib);
+      } catch (UnsatisfiedLinkError e) {
+        // since it may be optional, we ignore its loading failure here.
+      }
+    }
+    // However, if any of them is required.  We will see error here.
+    System.loadLibrary("rocksdbjni");
+  }
+
+  /**
+   * Tries to load the necessary library files from the given list of
+   * directories.
+   *
+   * @param paths a list of strings where each describes a directory
+   *     of a library.
+   */
+  public static synchronized void loadLibrary(List<String> paths) {
+    for (String lib : compressionLibs_) {
+      for (String path : paths) {
+        try {
+          System.load(path + "/" + Environment.getSharedLibraryName(lib));
+          break;
+        } catch (UnsatisfiedLinkError e) {
+          // since they are optional, we ignore loading fails.
+        }
+      }
+    }
+    boolean success = false;
+    UnsatisfiedLinkError err = null;
+    for (String path : paths) {
+      try {
+        System.load(path + "/" + Environment.getJniLibraryName("rocksdbjni"));
+        success = true;
+        break;
+      } catch (UnsatisfiedLinkError e) {
+        err = e;
+      }
+    }
+    if (success == false) {
+      throw err;
+    }
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance given
+   * the path to the database using the default options w/ createIfMissing
+   * set to true.
+   *
+   * @param path the path to the rocksdb.
+   * @param status an out value indicating the status of the Open().
+   * @return a rocksdb instance on success, null if the specified rocksdb can
+   *     not be opened.
+   *
+   * @see Options.setCreateIfMissing()
+   * @see Options.createIfMissing()
+   */
+  public static RocksDB open(String path) throws RocksDBException {
+    RocksDB db = new RocksDB();
+
+    // This allows to use the rocksjni default Options instead of
+    // the c++ one.
+    Options options = new Options();
+    db.open(options.nativeHandle_, options.cacheSize_, path);
+    db.transferCppRawPointersOwnershipFrom(options);
+    options.dispose();
+    return db;
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance given
+   * the path to the database using the specified options and db path.
+   */
+  public static RocksDB open(Options options, String path)
+      throws RocksDBException {
+    // when non-default Options is used, keeping an Options reference
+    // in RocksDB can prevent Java to GC during the life-time of
+    // the currently-created RocksDB.
+    RocksDB db = new RocksDB();
+    db.open(options.nativeHandle_, options.cacheSize_, path);
+    db.transferCppRawPointersOwnershipFrom(options);
+    return db;
+  }
+
+  @Override public synchronized void dispose() {
+    if (isInitialized()) {
+      dispose(nativeHandle_);
+      nativeHandle_ = 0;
+    }
+  }
+
+  /**
+   * Close the RocksDB instance.
+   * This function is equivalent to dispose().
+   */
+  public void close() {
+    dispose();
+  }
+
+  /**
+   * Set the database entry for "key" to "value".
+   *
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   */
+  public void put(byte[] key, byte[] value) throws RocksDBException {
+    put(nativeHandle_, key, key.length, value, value.length);
+  }
+
+  /**
+   * Set the database entry for "key" to "value".
+   *
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   */
+  public void put(WriteOptions writeOpts, byte[] key, byte[] value)
+      throws RocksDBException {
+    put(nativeHandle_, writeOpts.nativeHandle_,
+        key, key.length, value, value.length);
+  }
+
+  /**
+   * Apply the specified updates to the database.
+   */
+  public void write(WriteOptions writeOpts, WriteBatch updates)
+      throws RocksDBException {
+    write(writeOpts.nativeHandle_, updates.nativeHandle_);
+  }
+
+  /**
+   * Get the value associated with the specified key.
+   *
+   * @param key the key to retrieve the value.
+   * @param value the out-value to receive the retrieved value.
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in byte.  If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and partial result will
+   *     be returned.  RocksDB.NOT_FOUND will be returned if the value not
+   *     found.
+   */
+  public int get(byte[] key, byte[] value) throws RocksDBException {
+    return get(nativeHandle_, key, key.length, value, value.length);
+  }
+
+  /**
+   * Get the value associated with the specified key.
+   *
+   * @param key the key to retrieve the value.
+   * @param value the out-value to receive the retrieved value.
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in byte.  If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and partial result will
+   *     be returned.  RocksDB.NOT_FOUND will be returned if the value not
+   *     found.
+   */
+  public int get(ReadOptions opt, byte[] key, byte[] value)
+      throws RocksDBException {
+    return get(nativeHandle_, opt.nativeHandle_,
+               key, key.length, value, value.length);
+  }
+
+  /**
+   * The simplified version of get which returns a new byte array storing
+   * the value associated with the specified input key if any.  null will be
+   * returned if the specified key is not found.
+   *
+   * @param key the key retrieve the value.
+   * @return a byte array storing the value associated with the input key if
+   *     any.  null if it does not find the specified key.
+   *
+   * @see RocksDBException
+   */
+  public byte[] get(byte[] key) throws RocksDBException {
+    return get(nativeHandle_, key, key.length);
+  }
+
+  /**
+   * The simplified version of get which returns a new byte array storing
+   * the value associated with the specified input key if any.  null will be
+   * returned if the specified key is not found.
+   *
+   * @param key the key retrieve the value.
+   * @param opt Read options.
+   * @return a byte array storing the value associated with the input key if
+   *     any.  null if it does not find the specified key.
+   *
+   * @see RocksDBException
+   */
+  public byte[] get(ReadOptions opt, byte[] key) throws RocksDBException {
+    return get(nativeHandle_, opt.nativeHandle_, key, key.length);
+  }
+
+  /**
+   * Returns a map of keys for which values were found in DB.
+   *
+   * @param keys List of keys for which values need to be retrieved.
+   * @return Map where key of map is the key passed by user and value for map
+   * entry is the corresponding value in DB.
+   *
+   * @see RocksDBException
+   */
+  public Map<byte[], byte[]> multiGet(List<byte[]> keys)
+      throws RocksDBException {
+    assert(keys.size() != 0);
+
+    List<byte[]> values = multiGet(
+        nativeHandle_, keys, keys.size());
+
+    Map<byte[], byte[]> keyValueMap = new HashMap<byte[], byte[]>();
+    for(int i = 0; i < values.size(); i++) {
+      if(values.get(i) == null) {
+        continue;
+      }
+
+      keyValueMap.put(keys.get(i), values.get(i));
+    }
+
+    return keyValueMap;
+  }
+
+
+  /**
+   * Returns a map of keys for which values were found in DB.
+   *
+   * @param List of keys for which values need to be retrieved.
+   * @param opt Read options.
+   * @return Map where key of map is the key passed by user and value for map
+   * entry is the corresponding value in DB.
+   *
+   * @see RocksDBException
+   */
+  public Map<byte[], byte[]> multiGet(ReadOptions opt, List<byte[]> keys)
+      throws RocksDBException {
+    assert(keys.size() != 0);
+
+    List<byte[]> values = multiGet(
+        nativeHandle_, opt.nativeHandle_, keys, keys.size());
+
+    Map<byte[], byte[]> keyValueMap = new HashMap<byte[], byte[]>();
+    for(int i = 0; i < values.size(); i++) {
+      if(values.get(i) == null) {
+        continue;
+      }
+
+      keyValueMap.put(keys.get(i), values.get(i));
+    }
+
+    return keyValueMap;
+  }
+
+  /**
+   * Remove the database entry (if any) for "key".  Returns OK on
+   * success, and a non-OK status on error.  It is not an error if "key"
+   * did not exist in the database.
+   */
+  public void remove(byte[] key) throws RocksDBException {
+    remove(nativeHandle_, key, key.length);
+  }
+
+  /**
+   * Remove the database entry (if any) for "key".  Returns OK on
+   * success, and a non-OK status on error.  It is not an error if "key"
+   * did not exist in the database.
+   */
+  public void remove(WriteOptions writeOpt, byte[] key)
+      throws RocksDBException {
+    remove(nativeHandle_, writeOpt.nativeHandle_, key, key.length);
+  }
+
+  /**
+   * Return a heap-allocated iterator over the contents of the database.
+   * The result of newIterator() is initially invalid (caller must
+   * call one of the Seek methods on the iterator before using it).
+   *
+   * Caller should close the iterator when it is no longer needed.
+   * The returned iterator should be closed before this db is closed.
+   *
+   * @return instance of iterator object.
+   */
+  public Iterator newIterator() {
+    return new Iterator(iterator0(nativeHandle_));
+  }
+
+  @Override protected void finalize() {
+    close();
+  }
+
+  /**
+   * Private constructor.
+   */
+  protected RocksDB() {
+    super();
+  }
+
+  /**
+   * Transfer the ownership of all c++ raw-pointers from Options
+   * to RocksDB to ensure the life-time of those raw-pointers
+   * will be at least as long as the life-time of any RocksDB
+   * that uses these raw-pointers.
+   */
+  protected void transferCppRawPointersOwnershipFrom(Options opt) {
+    filter_ = opt.filter_;
+    opt.filter_ = null;
+  }
+
+  // native methods
+  protected native void open(
+      long optionsHandle, long cacheSize, String path) throws RocksDBException;
+  protected native void put(
+      long handle, byte[] key, int keyLen,
+      byte[] value, int valueLen) throws RocksDBException;
+  protected native void put(
+      long handle, long writeOptHandle,
+      byte[] key, int keyLen,
+      byte[] value, int valueLen) throws RocksDBException;
+  protected native void write(
+      long writeOptHandle, long batchHandle) throws RocksDBException;
+  protected native int get(
+      long handle, byte[] key, int keyLen,
+      byte[] value, int valueLen) throws RocksDBException;
+  protected native int get(
+      long handle, long readOptHandle, byte[] key, int keyLen,
+      byte[] value, int valueLen) throws RocksDBException;
+  protected native List<byte[]> multiGet(
+      long dbHandle, List<byte[]> keys, int keysCount);
+  protected native List<byte[]> multiGet(
+      long dbHandle, long rOptHandle, List<byte[]> keys, int keysCount);
+  protected native byte[] get(
+      long handle, byte[] key, int keyLen) throws RocksDBException;
+  protected native byte[] get(
+      long handle, long readOptHandle,
+      byte[] key, int keyLen) throws RocksDBException;
+  protected native void remove(
+      long handle, byte[] key, int keyLen) throws RocksDBException;
+  protected native void remove(
+      long handle, long writeOptHandle,
+      byte[] key, int keyLen) throws RocksDBException;
+  protected native long iterator0(long optHandle);
+  protected native void dispose(long handle);
+
+  protected Filter filter_;
+}
diff --git a/java/org/rocksdb/RocksDBException.java b/java/org/rocksdb/RocksDBException.java
new file mode 100644 (file)
index 0000000..acc9366
--- /dev/null
@@ -0,0 +1,23 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.*;
+
+/**
+ * A RocksDBException encapsulates the error of an operation.  This exception
+ * type is used to describe an internal error from the c++ rocksdb library.
+ */
+public class RocksDBException extends Exception {
+  /**
+   * The private construct used by a set of public static factory method.
+   *
+   * @param msg the specified error message.
+   */
+  public RocksDBException(String msg) {
+    super(msg);
+  }
+}
diff --git a/java/org/rocksdb/RocksObject.java b/java/org/rocksdb/RocksObject.java
new file mode 100644 (file)
index 0000000..6e36cba
--- /dev/null
@@ -0,0 +1,35 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * RocksObject is the base-class of all RocksDB related class that has
+ * a pointer to some c++ rocksdb object.  Although RocksObject
+ * will release its c++ resource on its finalize() once it has been
+ * garbage-collected, it is suggested to call dispose() manually to
+ * release its c++ resource once an instance of RocksObject is no
+ * longer used.
+ */
+public abstract class RocksObject {
+  protected RocksObject() {
+    nativeHandle_ = 0;
+  }
+
+  /**
+   * Release the c++ object pointed by the native handle.
+   */
+  public abstract void dispose();
+
+  protected boolean isInitialized() {
+    return (nativeHandle_ != 0);
+  }
+
+  @Override protected void finalize() {
+    dispose();
+  }
+
+  protected long nativeHandle_;
+}
diff --git a/java/org/rocksdb/SkipListMemTableConfig.java b/java/org/rocksdb/SkipListMemTableConfig.java
new file mode 100644 (file)
index 0000000..7f9f5cb
--- /dev/null
@@ -0,0 +1,15 @@
+package org.rocksdb;
+
+/**
+ * The config for skip-list memtable representation.
+ */
+public class SkipListMemTableConfig extends MemTableConfig {
+  public SkipListMemTableConfig() {
+  }
+
+  // The skip-list memtable takes no tuning parameters, so the native
+  // factory method below needs no arguments.
+  @Override protected long newMemTableFactoryHandle() {
+    return newMemTableFactoryHandle0();
+  }
+
+  // Returns a handle to a newly-created native memtable factory
+  // (presumably the c++ skip-list factory — see the rocksjni binding).
+  private native long newMemTableFactoryHandle0();
+}
diff --git a/java/org/rocksdb/Statistics.java b/java/org/rocksdb/Statistics.java
new file mode 100644 (file)
index 0000000..bed2b88
--- /dev/null
@@ -0,0 +1,38 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Statistics to analyze the performance of a db. Pointer for statistics object
+ * is managed by Options class.
+ */
+public class Statistics {
+
+  private final long statsHandle_;
+
+  public Statistics(long statsHandle) {
+    statsHandle_ = statsHandle;
+  }
+
+  public long getTickerCount(TickerType tickerType) {
+    assert(isInitialized());
+    return getTickerCount0(tickerType.getValue(), statsHandle_);
+  }
+
+  public HistogramData geHistogramData(HistogramType histogramType) {
+    assert(isInitialized());
+    HistogramData hist = geHistogramData0(
+        histogramType.getValue(), statsHandle_);
+    return hist;
+  }
+
+  private boolean isInitialized() {
+    return (statsHandle_ != 0);
+  }
+
+  private native long getTickerCount0(int tickerType, long handle);
+  private native HistogramData geHistogramData0(int histogramType, long handle);
+}
diff --git a/java/org/rocksdb/TableFormatConfig.java b/java/org/rocksdb/TableFormatConfig.java
new file mode 100644 (file)
index 0000000..e5c6341
--- /dev/null
@@ -0,0 +1,20 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+/**
+ * TableFormatConfig is used to config the internal Table format of a RocksDB.
+ * To make a RocksDB to use a specific Table format, its associated
+ * TableFormatConfig should be properly set and passed into Options via
+ * Options.setTableFormatConfig() and open the db using that Options.
+ */
+public abstract class TableFormatConfig {
+  /**
+   * This function should only be called by Options.setTableFormatConfig(),
+   * which will create a c++ shared-pointer to the c++ TableFactory
+   * that associated with the Java TableFormatConfig.
+   */
+  abstract protected long newTableFactoryHandle();
+}
diff --git a/java/org/rocksdb/TickerType.java b/java/org/rocksdb/TickerType.java
new file mode 100644 (file)
index 0000000..5ad714d
--- /dev/null
@@ -0,0 +1,123 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+public enum TickerType {
+  // total block cache misses
+  // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
+  //                               BLOCK_CACHE_FILTER_MISS +
+  //                               BLOCK_CACHE_DATA_MISS;
+  BLOCK_CACHE_MISS(0),
+  // total block cache hit
+  // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
+  //                              BLOCK_CACHE_FILTER_HIT +
+  //                              BLOCK_CACHE_DATA_HIT;
+  BLOCK_CACHE_HIT(1),
+  // # of blocks added to block cache.
+  BLOCK_CACHE_ADD(2),
+  // # of times cache miss when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_MISS(3),
+  // # of times cache hit when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_HIT(4),
+  // # of times cache miss when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_MISS(5),
+  // # of times cache hit when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_HIT(6),
+  // # of times cache miss when accessing data block from block cache.
+  BLOCK_CACHE_DATA_MISS(7),
+  // # of times cache hit when accessing data block from block cache.
+  BLOCK_CACHE_DATA_HIT(8),
+  // # of times bloom filter has avoided file reads.
+  BLOOM_FILTER_USEFUL(9),
+
+  // # of memtable hits.
+  MEMTABLE_HIT(10),
+  // # of memtable misses.
+  MEMTABLE_MISS(11),
+
+  /**
+   * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
+   * There are 3 reasons currently.
+   */
+  COMPACTION_KEY_DROP_NEWER_ENTRY(12),  // key was written with a newer value.
+  COMPACTION_KEY_DROP_OBSOLETE(13),     // The key is obsolete.
+  COMPACTION_KEY_DROP_USER(14),  // user compaction function has dropped the key.
+
+  // Number of keys written to the database via the Put and Write calls
+  NUMBER_KEYS_WRITTEN(15),
+  // Number of keys read
+  NUMBER_KEYS_READ(16),
+  // Number of keys updated, if in-place update is enabled
+  NUMBER_KEYS_UPDATED(17),
+  // Bytes written / read
+  BYTES_WRITTEN(18),
+  BYTES_READ(19),
+  NO_FILE_CLOSES(20),
+  NO_FILE_OPENS(21),
+  NO_FILE_ERRORS(22),
+  // Time system had to wait to do L0-L1 compactions
+  STALL_L0_SLOWDOWN_MICROS(23),
+  // Time system had to wait to move memtable to L1.
+  STALL_MEMTABLE_COMPACTION_MICROS(24),
+  // write throttle because of too many files in L0
+  STALL_L0_NUM_FILES_MICROS(25),
+  RATE_LIMIT_DELAY_MILLIS(26),
+  NO_ITERATORS(27),  // number of iterators currently open
+
+  // Number of MultiGet calls, keys read, and bytes read
+  NUMBER_MULTIGET_CALLS(28),
+  NUMBER_MULTIGET_KEYS_READ(29),
+  NUMBER_MULTIGET_BYTES_READ(30),
+
+  // Number of delete records that were not required to be
+  // written to storage because key does not exist
+  NUMBER_FILTERED_DELETES(31),
+  NUMBER_MERGE_FAILURES(32),
+  SEQUENCE_NUMBER(33),
+
+  // number of times bloom was checked before creating iterator on a
+  // file, and the number of times the check was useful in avoiding
+  // iterator creation (and thus likely IOPs).
+  BLOOM_FILTER_PREFIX_CHECKED(34),
+  BLOOM_FILTER_PREFIX_USEFUL(35),
+
+  // Number of times we had to reseek inside an iteration to skip
+  // over large number of keys with same userkey.
+  NUMBER_OF_RESEEKS_IN_ITERATION(36),
+
+  // Record the number of calls to GetUpdatesSince. Useful to keep track of
+  // transaction log iterator refreshes
+  GET_UPDATES_SINCE_CALLS(37),
+  BLOCK_CACHE_COMPRESSED_MISS(38),  // miss in the compressed block cache
+  BLOCK_CACHE_COMPRESSED_HIT(39),   // hit in the compressed block cache
+  WAL_FILE_SYNCED(40),              // Number of times WAL sync is done
+  WAL_FILE_BYTES(41),               // Number of bytes written to WAL
+
+  // Writes can be processed by requesting thread or by the thread at the
+  // head of the writers queue.
+  WRITE_DONE_BY_SELF(42),
+  WRITE_DONE_BY_OTHER(43),
+  WRITE_WITH_WAL(44),       // Number of Write calls that request WAL
+  COMPACT_READ_BYTES(45),   // Bytes read during compaction
+  COMPACT_WRITE_BYTES(46),  // Bytes written during compaction
+
+  // Number of table's properties loaded directly from file, without creating
+  // table reader object.
+  NUMBER_DIRECT_LOAD_TABLE_PROPERTIES(47),
+  NUMBER_SUPERVERSION_ACQUIRES(48),
+  NUMBER_SUPERVERSION_RELEASES(49),
+  NUMBER_SUPERVERSION_CLEANUPS(50);
+
+  // Integer value matching the corresponding native ticker id; passed
+  // across JNI (see Statistics.getTickerCount).
+  private final int value_;
+
+  private TickerType(int value) {
+    value_ = value;
+  }
+
+  /**
+   * @return the integer id of this ticker, as understood by the
+   *     native statistics object.
+   */
+  public int getValue() {
+    return value_;
+  }
+}
diff --git a/java/org/rocksdb/VectorMemTableConfig.java b/java/org/rocksdb/VectorMemTableConfig.java
new file mode 100644 (file)
index 0000000..b7a413f
--- /dev/null
@@ -0,0 +1,40 @@
+package org.rocksdb;
+
+/**
+ * The config for vector memtable representation.
+ */
+public class VectorMemTableConfig extends MemTableConfig {
+  public static final int DEFAULT_RESERVED_SIZE = 0;
+  public VectorMemTableConfig() {
+    reservedSize_ = DEFAULT_RESERVED_SIZE;
+  }
+
+  /**
+   * Set the initial size of the vector that will be used
+   * by the memtable created based on this config.
+   *
+   * @param size the initial size of the vector.
+   * @return the reference to the current config.
+   */
+  public VectorMemTableConfig setReservedSize(int size) {
+    reservedSize_ = size;
+    return this;
+  }
+
+  /**
+   * Returns the initial size of the vector used by the memtable
+   * created based on this config.
+   *
+   * @return the initial size of the vector.
+   */
+  public int reservedSize() {
+    return reservedSize_;
+  }
+
+  @Override protected long newMemTableFactoryHandle() {
+    return newMemTableFactoryHandle(reservedSize_);
+  }
+
+  private native long newMemTableFactoryHandle(long reservedSize);
+  private int reservedSize_;
+}
diff --git a/java/org/rocksdb/WriteBatch.java b/java/org/rocksdb/WriteBatch.java
new file mode 100644 (file)
index 0000000..1ddbd44
--- /dev/null
@@ -0,0 +1,113 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.*;
+
+/**
+ * WriteBatch holds a collection of updates to apply atomically to a DB.
+ *
+ * The updates are applied in the order in which they are added
+ * to the WriteBatch.  For example, the value of "key" will be "v3"
+ * after the following batch is written:
+ *
+ *    batch.put("key", "v1");
+ *    batch.remove("key");
+ *    batch.put("key", "v2");
+ *    batch.put("key", "v3");
+ *
+ * Multiple threads can invoke const methods on a WriteBatch without
+ * external synchronization, but if any of the threads may call a
+ * non-const method, all threads accessing the same WriteBatch must use
+ * external synchronization.
+ */
+public class WriteBatch extends RocksObject {
+  public WriteBatch() {
+    super();
+    newWriteBatch(0);
+  }
+
+  public WriteBatch(int reserved_bytes) {
+    nativeHandle_ = 0;
+    newWriteBatch(reserved_bytes);
+  }
+
+  /**
+   * Returns the number of updates in the batch.
+   */
+  public native int count();
+
+  /**
+   * Store the mapping "key->value" in the database.
+   */
+  public void put(byte[] key, byte[] value) {
+    put(key, key.length, value, value.length);
+  }
+
+  /**
+   * Merge "value" with the existing value of "key" in the database.
+   * "key->merge(existing, value)"
+   */
+  public void merge(byte[] key, byte[] value) {
+    merge(key, key.length, value, value.length);
+  }
+
+  /**
+   * If the database contains a mapping for "key", erase it.  Else do nothing.
+   */
+  public void remove(byte[] key) {
+    remove(key, key.length);
+  }
+
+  /**
+   * Append a blob of arbitrary size to the records in this batch. The blob will
+   * be stored in the transaction log but not in any other file. In particular,
+   * it will not be persisted to the SST files. When iterating over this
+   * WriteBatch, WriteBatch::Handler::LogData will be called with the contents
+   * of the blob as it is encountered. Blobs, puts, deletes, and merges will be
+   * encountered in the same order in which they were inserted. The blob will
+   * NOT consume sequence number(s) and will NOT increase the count of the batch
+   *
+   * Example application: add timestamps to the transaction log for use in
+   * replication.
+   */
+  public void putLogData(byte[] blob) {
+    putLogData(blob, blob.length);
+  }
+
+  /**
+   * Clear all updates buffered in this batch
+   */
+  public native void clear();
+
+  /**
+   * Delete the c++ side pointer.
+   */
+  @Override public synchronized void dispose() {
+    if (isInitialized()) {
+      dispose0();
+    }
+  }
+
+  private native void newWriteBatch(int reserved_bytes);
+  private native void put(byte[] key, int keyLen,
+                          byte[] value, int valueLen);
+  private native void merge(byte[] key, int keyLen,
+                            byte[] value, int valueLen);
+  private native void remove(byte[] key, int keyLen);
+  private native void putLogData(byte[] blob, int blobLen);
+  private native void dispose0();
+}
+
+/**
+ * Package-private class which provides java api to access
+ * c++ WriteBatchInternal.
+ */
+class WriteBatchInternal {
+  static native void setSequence(WriteBatch batch, long sn);
+  static native long sequence(WriteBatch batch);
+  static native void append(WriteBatch b1, WriteBatch b2);
+}
diff --git a/java/org/rocksdb/WriteBatchTest.java b/java/org/rocksdb/WriteBatchTest.java
new file mode 100644 (file)
index 0000000..03a8663
--- /dev/null
@@ -0,0 +1,124 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+package org.rocksdb;
+
+import java.util.*;
+import java.io.UnsupportedEncodingException;
+
+/**
+ * This class mimics the db/write_batch_test.cc in the c++ rocksdb library.
+ */
+public class WriteBatchTest {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  public static void main(String args[]) {
+    System.out.println("Testing WriteBatchTest.Empty ===");
+    Empty();
+
+    System.out.println("Testing WriteBatchTest.Multiple ===");
+    Multiple();
+
+    System.out.println("Testing WriteBatchTest.Append ===");
+    Append();
+
+    System.out.println("Testing WriteBatchTest.Blob ===");
+    Blob();
+
+    // The following tests have not yet been ported.
+    // Continue();
+    // PutGatherSlices();
+
+    System.out.println("Passed all WriteBatchTest!");
+  }
+
+  static void Empty() {
+    WriteBatch batch = new WriteBatch();
+    assert(batch.count() == 0);
+  }
+
+  static void Multiple() {
+    try {
+      WriteBatch batch =  new WriteBatch();
+      batch.put("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII"));
+      batch.remove("box".getBytes("US-ASCII"));
+      batch.put("baz".getBytes("US-ASCII"), "boo".getBytes("US-ASCII"));
+      WriteBatchInternal.setSequence(batch, 100);
+      assert(100 == WriteBatchInternal.sequence(batch));
+      assert(3 == batch.count());
+      assert(new String("Put(baz, boo)@102" +
+                        "Delete(box)@101" +
+                        "Put(foo, bar)@100")
+                .equals(new String(getContents(batch), "US-ASCII")));
+    } catch (UnsupportedEncodingException e) {
+      System.err.println(e);
+      assert(false);
+    }
+  }
+
+  static void Append() {
+    WriteBatch b1 = new WriteBatch();
+    WriteBatch b2 = new WriteBatch();
+    WriteBatchInternal.setSequence(b1, 200);
+    WriteBatchInternal.setSequence(b2, 300);
+    WriteBatchInternal.append(b1, b2);
+    assert(getContents(b1).length == 0);
+    assert(b1.count() == 0);
+    try {
+      b2.put("a".getBytes("US-ASCII"), "va".getBytes("US-ASCII"));
+      WriteBatchInternal.append(b1, b2);
+      assert("Put(a, va)@200".equals(new String(getContents(b1), "US-ASCII")));
+      assert(1 == b1.count());
+      b2.clear();
+      b2.put("b".getBytes("US-ASCII"), "vb".getBytes("US-ASCII"));
+      WriteBatchInternal.append(b1, b2);
+      assert(new String("Put(a, va)@200" +
+                        "Put(b, vb)@201")
+                .equals(new String(getContents(b1), "US-ASCII")));
+      assert(2 == b1.count());
+      b2.remove("foo".getBytes("US-ASCII"));
+      WriteBatchInternal.append(b1, b2);
+      assert(new String("Put(a, va)@200" +
+                        "Put(b, vb)@202" +
+                        "Put(b, vb)@201" +
+                        "Delete(foo)@203")
+                 .equals(new String(getContents(b1), "US-ASCII")));
+      assert(4 == b1.count());
+    } catch (UnsupportedEncodingException e) {
+      System.err.println(e);
+      assert(false);
+    }
+  }
+
+  static void Blob() {
+    WriteBatch batch = new WriteBatch();
+    try {
+      batch.put("k1".getBytes("US-ASCII"), "v1".getBytes("US-ASCII"));
+      batch.put("k2".getBytes("US-ASCII"), "v2".getBytes("US-ASCII"));
+      batch.put("k3".getBytes("US-ASCII"), "v3".getBytes("US-ASCII"));
+      batch.putLogData("blob1".getBytes("US-ASCII"));
+      batch.remove("k2".getBytes("US-ASCII"));
+      batch.putLogData("blob2".getBytes("US-ASCII"));
+      batch.merge("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII"));
+      assert(5 == batch.count());
+      assert(new String("Merge(foo, bar)@4" +
+                        "Put(k1, v1)@0" +
+                        "Delete(k2)@3" +
+                        "Put(k2, v2)@1" +
+                        "Put(k3, v3)@2")
+                .equals(new String(getContents(batch), "US-ASCII")));
+    } catch (UnsupportedEncodingException e) {
+      System.err.println(e);
+      assert(false);
+    }
+  }
+
+  static native byte[] getContents(WriteBatch batch);
+}
diff --git a/java/org/rocksdb/WriteOptions.java b/java/org/rocksdb/WriteOptions.java
new file mode 100644 (file)
index 0000000..f4a1d6a
--- /dev/null
@@ -0,0 +1,100 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Options that control write operations.
+ *
+ * Note that developers should call WriteOptions.dispose() to release the
+ * c++ side memory before a WriteOptions instance runs out of scope.
+ */
+public class WriteOptions extends RocksObject {
+  public WriteOptions() {
+    super();
+    newWriteOptions();
+  }
+
+  @Override public synchronized void dispose() {
+    if (isInitialized()) {
+      dispose0(nativeHandle_);
+    }
+  }
+
+  /**
+   * If true, the write will be flushed from the operating system
+   * buffer cache (by calling WritableFile::Sync()) before the write
+   * is considered complete.  If this flag is true, writes will be
+   * slower.
+   *
+   * If this flag is false, and the machine crashes, some recent
+   * writes may be lost.  Note that if it is just the process that
+   * crashes (i.e., the machine does not reboot), no writes will be
+   * lost even if sync==false.
+   *
+   * In other words, a DB write with sync==false has similar
+   * crash semantics as the "write()" system call.  A DB write
+   * with sync==true has similar crash semantics to a "write()"
+   * system call followed by "fdatasync()".
+   *
+   * Default: false
+   *
+   * @param flag a boolean flag to indicate whether a write
+   *     should be synchronized.
+   * @return the instance of the current WriteOptions.
+   */
+  public WriteOptions setSync(boolean flag) {
+    setSync(nativeHandle_, flag);
+    return this;
+  }
+
+  /**
+   * If true, the write will be flushed from the operating system
+   * buffer cache (by calling WritableFile::Sync()) before the write
+   * is considered complete.  If this flag is true, writes will be
+   * slower.
+   *
+   * If this flag is false, and the machine crashes, some recent
+   * writes may be lost.  Note that if it is just the process that
+   * crashes (i.e., the machine does not reboot), no writes will be
+   * lost even if sync==false.
+   *
+   * In other words, a DB write with sync==false has similar
+   * crash semantics as the "write()" system call.  A DB write
+   * with sync==true has similar crash semantics to a "write()"
+   * system call followed by "fdatasync()".
+   */
+  public boolean sync() {
+    return sync(nativeHandle_);
+  }
+
+  /**
+   * If true, writes will not first go to the write ahead log,
+   * and the write may get lost after a crash.
+   *
+   * @param flag a boolean flag to specify whether to disable
+   *     write-ahead-log on writes.
+   * @return the instance of the current WriteOptions.
+   */
+  public WriteOptions setDisableWAL(boolean flag) {
+    setDisableWAL(nativeHandle_, flag);
+    return this;
+  }
+
+  /**
+   * If true, writes will not first go to the write ahead log,
+   * and the write may get lost after a crash.
+   */
+  public boolean disableWAL() {
+    return disableWAL(nativeHandle_);
+  }
+
+  private native void newWriteOptions();
+  private native void setSync(long handle, boolean flag);
+  private native boolean sync(long handle);
+  private native void setDisableWAL(long handle, boolean flag);
+  private native boolean disableWAL(long handle);
+  private native void dispose0(long handle);
+}
diff --git a/java/org/rocksdb/benchmark/DbBenchmark.java b/java/org/rocksdb/benchmark/DbBenchmark.java
new file mode 100644 (file)
index 0000000..5404b72
--- /dev/null
@@ -0,0 +1,1577 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+/**
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.rocksdb.benchmark;
+
+import java.lang.Runnable;
+import java.io.File;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.Date;
+import java.util.EnumMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.concurrent.Callable;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import org.rocksdb.*;
+import org.rocksdb.util.SizeUnit;
+
+class Stats {
+  int id_;
+  long start_;
+  long finish_;
+  double seconds_;
+  long done_;
+  long found_;
+  long lastOpTime_;
+  long nextReport_;
+  long bytes_;
+  StringBuilder message_;
+  boolean excludeFromMerge_;
+
+  // TODO(yhchiang): use the following arguments:
+  //   (Long)Flag.stats_interval
+  //   (Integer)Flag.stats_per_interval
+
+  Stats(int id) {
+    id_ = id;
+    nextReport_ = 100;
+    done_ = 0;
+    bytes_ = 0;
+    seconds_ = 0;
+    start_ = System.nanoTime();
+    lastOpTime_ = start_;
+    finish_ = start_;
+    found_ = 0;
+    message_ = new StringBuilder("");
+    excludeFromMerge_ = false;
+  }
+
+  void merge(final Stats other) {
+    if (other.excludeFromMerge_) {
+      return;
+    }
+
+    done_ += other.done_;
+    found_ += other.found_;
+    bytes_ += other.bytes_;
+    seconds_ += other.seconds_;
+    if (other.start_ < start_) start_ = other.start_;
+    if (other.finish_ > finish_) finish_ = other.finish_;
+
+    // Just keep the messages from one thread
+    if (message_.length() == 0) {
+      message_ = other.message_;
+    }
+  }
+
+  void stop() {
+    finish_ = System.nanoTime();
+    seconds_ = (double) (finish_ - start_) / 1000000;
+  }
+
+  void addMessage(String msg) {
+    if (message_.length() > 0) {
+      message_.append(" ");
+    }
+    message_.append(msg);
+  }
+
+  void setId(int id) { id_ = id; }
+  void setExcludeFromMerge() { excludeFromMerge_ = true; }
+
+  void finishedSingleOp(int bytes) {
+    done_++;
+    lastOpTime_ = System.nanoTime();
+    bytes_ += bytes;
+    if (done_ >= nextReport_) {
+      if (nextReport_ < 1000) {
+        nextReport_ += 100;
+      } else if (nextReport_ < 5000) {
+        nextReport_ += 500;
+      } else if (nextReport_ < 10000) {
+        nextReport_ += 1000;
+      } else if (nextReport_ < 50000) {
+        nextReport_ += 5000;
+      } else if (nextReport_ < 100000) {
+        nextReport_ += 10000;
+      } else if (nextReport_ < 500000) {
+        nextReport_ += 50000;
+      } else {
+        nextReport_ += 100000;
+      }
+      System.err.printf("... Task %s finished %d ops%30s\r", id_, done_, "");
+    }
+  }
+
+  void report(String name) {
+    // Pretend at least one op was done in case we are running a benchmark
+    // that does not call FinishedSingleOp().
+    if (done_ < 1) done_ = 1;
+
+    StringBuilder extra = new StringBuilder("");
+    if (bytes_ > 0) {
+      // Rate is computed on actual elapsed time, not the sum of per-thread
+      // elapsed times.
+      double elapsed = (finish_ - start_) * 1e-6;
+      extra.append(String.format("%6.1f MB/s", (bytes_ / 1048576.0) / elapsed));
+    }
+    extra.append(message_.toString());
+    double elapsed = (finish_ - start_) * 1e-6;
+    double throughput = (double) done_ / elapsed;
+
+    System.out.format("%-12s : %11.3f micros/op %d ops/sec;%s%s\n",
+            name, elapsed * 1e6 / done_,
+            (long) throughput, (extra.length() == 0 ? "" : " "), extra.toString());
+  }
+}
+
+public class DbBenchmark {
+  enum Order {
+    SEQUENTIAL,
+    RANDOM
+  }
+
+  enum DBState {
+    FRESH,
+    EXISTING
+  }
+
+  enum CompressionType {
+    NONE,
+    SNAPPY,
+    ZLIB,
+    BZIP2,
+    LZ4,
+    LZ4HC
+  }
+
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  abstract class BenchmarkTask implements Callable<Stats> {
+    // TODO(yhchiang): use (Integer)Flag.perf_level.
+    public BenchmarkTask(
+        int tid, long randSeed, long numEntries, long keyRange) {
+      tid_ = tid;
+      rand_ = new Random(randSeed + tid * 1000);
+      numEntries_ = numEntries;
+      keyRange_ = keyRange;
+      stats_ = new Stats(tid);
+    }
+
+    @Override public Stats call() throws RocksDBException {
+      stats_.start_ = System.nanoTime();
+      runTask();
+      stats_.finish_ = System.nanoTime();
+      return stats_;
+    }
+
+    abstract protected void runTask() throws RocksDBException;
+
+    protected int tid_;
+    protected Random rand_;
+    protected long numEntries_;
+    protected long keyRange_;
+    protected Stats stats_;
+
+    protected void getFixedKey(byte[] key, long sn) {
+      generateKeyFromLong(key, sn);
+    }
+
+    protected void getRandomKey(byte[] key, long range) {
+      generateKeyFromLong(key, Math.abs(rand_.nextLong() % range));
+    }
+  }
+
+  abstract class WriteTask extends BenchmarkTask {
+    public WriteTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch) {
+      super(tid, randSeed, numEntries, keyRange);
+      writeOpt_ = writeOpt;
+      entriesPerBatch_ = entriesPerBatch;
+      maxWritesPerSecond_ = -1;
+    }
+
+    public WriteTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch, long maxWritesPerSecond) {
+      super(tid, randSeed, numEntries, keyRange);
+      writeOpt_ = writeOpt;
+      entriesPerBatch_ = entriesPerBatch;
+      maxWritesPerSecond_ = maxWritesPerSecond;
+    }
+
+    @Override public void runTask() throws RocksDBException {
+      if (numEntries_ != DbBenchmark.this.num_) {
+        stats_.message_.append(String.format(" (%d ops)", numEntries_));
+      }
+      byte[] key = new byte[keySize_];
+      byte[] value = new byte[valueSize_];
+
+      try {
+        if (entriesPerBatch_ == 1) {
+          for (long i = 0; i < numEntries_; ++i) {
+            getKey(key, i, keyRange_);
+            db_.put(writeOpt_, key, DbBenchmark.this.gen_.generate(valueSize_));
+            stats_.finishedSingleOp(keySize_ + valueSize_);
+            writeRateControl(i);
+            if (isFinished()) {
+              return;
+            }
+          }
+        } else {
+          for (long i = 0; i < numEntries_; i += entriesPerBatch_) {
+            WriteBatch batch = new WriteBatch();
+            for (long j = 0; j < entriesPerBatch_; j++) {
+              getKey(key, i + j, keyRange_);
+              batch.put(key, DbBenchmark.this.gen_.generate(valueSize_));
+              stats_.finishedSingleOp(keySize_ + valueSize_);
+            }
+            db_.write(writeOpt_, batch);
+            batch.dispose();
+            writeRateControl(i);
+            if (isFinished()) {
+              return;
+            }
+          }
+        }
+      } catch (InterruptedException e) {
+        // thread has been terminated.
+      }
+    }
+
+    protected void writeRateControl(long writeCount)
+        throws InterruptedException {
+      if (maxWritesPerSecond_ <= 0) return;
+      long minInterval =
+          writeCount * TimeUnit.SECONDS.toNanos(1) / maxWritesPerSecond_;
+      long interval = System.nanoTime() - stats_.start_;
+      if (minInterval - interval > TimeUnit.MILLISECONDS.toNanos(1)) {
+        TimeUnit.NANOSECONDS.sleep(minInterval - interval);
+      }
+    }
+
+    abstract protected void getKey(byte[] key, long id, long range);
+    protected WriteOptions writeOpt_;
+    protected long entriesPerBatch_;
+    protected long maxWritesPerSecond_;
+  }
+
+  class WriteSequentialTask extends WriteTask {
+    public WriteSequentialTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch);
+    }
+    public WriteSequentialTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch,
+        long maxWritesPerSecond) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch,
+            maxWritesPerSecond);
+    }
+    @Override protected void getKey(byte[] key, long id, long range) {
+      getFixedKey(key, id);
+    }
+  }
+
+  class WriteRandomTask extends WriteTask {
+    public WriteRandomTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch);
+    }
+    public WriteRandomTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch,
+        long maxWritesPerSecond) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch,
+            maxWritesPerSecond);
+    }
+    @Override protected void getKey(byte[] key, long id, long range) {
+      getRandomKey(key, range);
+    }
+  }
+
+  class WriteUniqueRandomTask extends WriteTask {
+    static final int MAX_BUFFER_SIZE = 10000000;
+    public WriteUniqueRandomTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch);
+      initRandomKeySequence();
+    }
+    public WriteUniqueRandomTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch,
+        long maxWritesPerSecond) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch,
+            maxWritesPerSecond);
+      initRandomKeySequence();
+    }
+    @Override protected void getKey(byte[] key, long id, long range) {
+      generateKeyFromLong(key, nextUniqueRandom());
+    }
+
+    protected void initRandomKeySequence() {
+      bufferSize_ = MAX_BUFFER_SIZE;
+      if (bufferSize_ > keyRange_) {
+        bufferSize_ = (int) keyRange_;
+      }
+      currentKeyCount_ = bufferSize_;
+      keyBuffer_ = new long[MAX_BUFFER_SIZE];
+      for (int k = 0; k < bufferSize_; ++k) {
+        keyBuffer_[k] = k;
+      }
+    }
+
+    /**
+     * Semi-randomly return the next unique key.  It is guaranteed to be
+     * fully random if keyRange_ <= MAX_BUFFER_SIZE.
+     */
+    long nextUniqueRandom() {
+      if (bufferSize_ == 0) {
+        System.err.println("bufferSize_ == 0.");
+        return 0;
+      }
+      int r = rand_.nextInt(bufferSize_);
+      // randomly pick one from the keyBuffer
+      long randKey = keyBuffer_[r];
+      if (currentKeyCount_ < keyRange_) {
+        // if we have not yet inserted all keys, insert next new key to [r].
+        keyBuffer_[r] = currentKeyCount_++;
+      } else {
+        // move the last element to [r] and decrease the size by 1.
+        keyBuffer_[r] = keyBuffer_[--bufferSize_];
+      }
+      return randKey;
+    }
+
+    int bufferSize_;
+    long currentKeyCount_;
+    long[] keyBuffer_;
+  }
+
+  class ReadRandomTask extends BenchmarkTask {
+    public ReadRandomTask(
+        int tid, long randSeed, long numEntries, long keyRange) {
+      super(tid, randSeed, numEntries, keyRange);
+    }
+    @Override public void runTask() throws RocksDBException {
+      byte[] key = new byte[keySize_];
+      byte[] value = new byte[valueSize_];
+      for (long i = 0; i < numEntries_; i++) {
+        getRandomKey(key, numEntries_);
+        int len = db_.get(key, value);
+        if (len != RocksDB.NOT_FOUND) {
+          stats_.found_++;
+          stats_.finishedSingleOp(keySize_ + valueSize_);
+        } else {
+          stats_.finishedSingleOp(keySize_);
+        }
+        if (isFinished()) {
+          return;
+        }
+      }
+    }
+  }
+
+  class ReadSequentialTask extends BenchmarkTask {
+    public ReadSequentialTask(
+        int tid, long randSeed, long numEntries, long keyRange) {
+      super(tid, randSeed, numEntries, keyRange);
+    }
+    @Override public void runTask() throws RocksDBException {
+      org.rocksdb.Iterator iter = db_.newIterator();
+      long i;
+      for (iter.seekToFirst(), i = 0;
+           iter.isValid() && i < numEntries_;
+           iter.next(), ++i) {
+        stats_.found_++;
+        stats_.finishedSingleOp(iter.key().length + iter.value().length);
+        if (isFinished()) {
+          return;
+        }
+      }
+    }
+  }
+
+  public DbBenchmark(Map<Flag, Object> flags) throws Exception {
+    benchmarks_ = (List<String>) flags.get(Flag.benchmarks);
+    num_ = (Integer) flags.get(Flag.num);
+    threadNum_ = (Integer) flags.get(Flag.threads);
+    reads_ = (Integer) (flags.get(Flag.reads) == null ?
+        flags.get(Flag.num) : flags.get(Flag.reads));
+    keySize_ = (Integer) flags.get(Flag.key_size);
+    valueSize_ = (Integer) flags.get(Flag.value_size);
+    compressionRatio_ = (Double) flags.get(Flag.compression_ratio);
+    useExisting_ = (Boolean) flags.get(Flag.use_existing_db);
+    randSeed_ = (Long) flags.get(Flag.seed);
+    databaseDir_ = (String) flags.get(Flag.db);
+    writesPerSeconds_ = (Integer) flags.get(Flag.writes_per_second);
+    cacheSize_ = (Long) flags.get(Flag.cache_size);
+    memtable_ = (String) flags.get(Flag.memtablerep);
+    maxWriteBufferNumber_ = (Integer) flags.get(Flag.max_write_buffer_number);
+    prefixSize_ = (Integer) flags.get(Flag.prefix_size);
+    keysPerPrefix_ = (Integer) flags.get(Flag.keys_per_prefix);
+    hashBucketCount_ = (Long) flags.get(Flag.hash_bucket_count);
+    usePlainTable_ = (Boolean) flags.get(Flag.use_plain_table);
+    flags_ = flags;
+    finishLock_ = new Object();
+    // options.setPrefixSize((Integer)flags_.get(Flag.prefix_size));
+    // options.setKeysPerPrefix((Long)flags_.get(Flag.keys_per_prefix));
+    compressionType_ = (String) flags.get(Flag.compression_type);
+    compression_ = CompressionType.NONE;
+    try {
+      if (compressionType_.equals("snappy")) {
+        System.loadLibrary("snappy");
+      } else if (compressionType_.equals("zlib")) {
+        System.loadLibrary("zlib");
+      } else if (compressionType_.equals("bzip2")) {
+        System.loadLibrary("bzip2");
+      } else if (compressionType_.equals("lz4")) {
+        System.loadLibrary("lz4");
+      } else if (compressionType_.equals("lz4hc")) {
+        System.loadLibrary("lz4hc");
+      }
+    } catch (UnsatisfiedLinkError e) {
+      System.err.format("Unable to load %s library:%s%n" +
+                        "No compression is used.%n",
+          compressionType_, e.toString());
+      compressionType_ = "none";
+      compressionRatio_ = 1.0;
+    }
+    gen_ = new RandomGenerator(randSeed_, compressionRatio_);
+  }
+
+  private void prepareReadOptions(ReadOptions options) {
+    options.setVerifyChecksums((Boolean)flags_.get(Flag.verify_checksum));
+    options.setTailing((Boolean)flags_.get(Flag.use_tailing_iterator));
+  }
+
+  private void prepareWriteOptions(WriteOptions options) {
+    options.setSync((Boolean)flags_.get(Flag.sync));
+    options.setDisableWAL((Boolean)flags_.get(Flag.disable_wal));
+  }
+
+  private void prepareOptions(Options options) {
+    options.setCacheSize(cacheSize_);
+    if (!useExisting_) {
+      options.setCreateIfMissing(true);
+    } else {
+      options.setCreateIfMissing(false);
+    }
+    if (memtable_.equals("skip_list")) {
+      options.setMemTableConfig(new SkipListMemTableConfig());
+    } else if (memtable_.equals("vector")) {
+      options.setMemTableConfig(new VectorMemTableConfig());
+    } else if (memtable_.equals("hash_linkedlist")) {
+      options.setMemTableConfig(
+          new HashLinkedListMemTableConfig()
+              .setBucketCount(hashBucketCount_));
+      options.useFixedLengthPrefixExtractor(prefixSize_);
+    } else if (memtable_.equals("hash_skiplist") ||
+               memtable_.equals("prefix_hash")) {
+      options.setMemTableConfig(
+          new HashSkipListMemTableConfig()
+              .setBucketCount(hashBucketCount_));
+      options.useFixedLengthPrefixExtractor(prefixSize_);
+    } else {
+      System.err.format(
+          "unable to detect the specified memtable, " +
+          "use the default memtable factory %s%n",
+          options.memTableFactoryName());
+    }
+    if (usePlainTable_) {
+      options.setTableFormatConfig(
+          new PlainTableConfig().setKeySize(keySize_));
+    }
+    options.setWriteBufferSize(
+        (Long)flags_.get(Flag.write_buffer_size));
+    options.setMaxWriteBufferNumber(
+        (Integer)flags_.get(Flag.max_write_buffer_number));
+    options.setMaxBackgroundCompactions(
+        (Integer)flags_.get(Flag.max_background_compactions));
+    options.setMaxBackgroundFlushes(
+        (Integer)flags_.get(Flag.max_background_flushes));
+    options.setCacheSize(
+        (Long)flags_.get(Flag.cache_size));
+    options.setBlockSize(
+        (Long)flags_.get(Flag.block_size));
+    options.setMaxOpenFiles(
+        (Integer)flags_.get(Flag.open_files));
+    options.setCreateIfMissing(
+        !(Boolean)flags_.get(Flag.use_existing_db));
+    options.setTableCacheRemoveScanCountLimit(
+        (Integer)flags_.get(Flag.cache_remove_scan_count_limit));
+    options.setDisableDataSync(
+        (Boolean)flags_.get(Flag.disable_data_sync));
+    options.setUseFsync(
+        (Boolean)flags_.get(Flag.use_fsync));
+    options.setWalDir(
+        (String)flags_.get(Flag.wal_dir));
+    options.setDisableSeekCompaction(
+        (Boolean)flags_.get(Flag.disable_seek_compaction));
+    options.setDeleteObsoleteFilesPeriodMicros(
+        (Integer)flags_.get(Flag.delete_obsolete_files_period_micros));
+    options.setTableCacheNumshardbits(
+        (Integer)flags_.get(Flag.table_cache_numshardbits));
+    options.setAllowMmapReads(
+        (Boolean)flags_.get(Flag.mmap_read));
+    options.setAllowMmapWrites(
+        (Boolean)flags_.get(Flag.mmap_write));
+    options.setAdviseRandomOnOpen(
+        (Boolean)flags_.get(Flag.advise_random_on_open));
+    options.setUseAdaptiveMutex(
+        (Boolean)flags_.get(Flag.use_adaptive_mutex));
+    options.setBytesPerSync(
+        (Long)flags_.get(Flag.bytes_per_sync));
+    options.setBloomLocality(
+        (Integer)flags_.get(Flag.bloom_locality));
+    options.setMinWriteBufferNumberToMerge(
+        (Integer)flags_.get(Flag.min_write_buffer_number_to_merge));
+    options.setMemtablePrefixBloomBits(
+        (Integer)flags_.get(Flag.memtable_bloom_bits));
+    options.setNumLevels(
+        (Integer)flags_.get(Flag.num_levels));
+    options.setTargetFileSizeBase(
+        (Integer)flags_.get(Flag.target_file_size_base));
+    options.setTargetFileSizeMultiplier(
+        (Integer)flags_.get(Flag.target_file_size_multiplier));
+    options.setMaxBytesForLevelBase(
+        (Integer)flags_.get(Flag.max_bytes_for_level_base));
+    options.setMaxBytesForLevelMultiplier(
+        (Integer)flags_.get(Flag.max_bytes_for_level_multiplier));
+    options.setLevelZeroStopWritesTrigger(
+        (Integer)flags_.get(Flag.level0_stop_writes_trigger));
+    options.setLevelZeroSlowdownWritesTrigger(
+        (Integer)flags_.get(Flag.level0_slowdown_writes_trigger));
+    options.setLevelZeroFileNumCompactionTrigger(
+        (Integer)flags_.get(Flag.level0_file_num_compaction_trigger));
+    options.setSoftRateLimit(
+        (Double)flags_.get(Flag.soft_rate_limit));
+    options.setHardRateLimit(
+        (Double)flags_.get(Flag.hard_rate_limit));
+    options.setRateLimitDelayMaxMilliseconds(
+        (Integer)flags_.get(Flag.rate_limit_delay_max_milliseconds));
+    options.setMaxGrandparentOverlapFactor(
+        (Integer)flags_.get(Flag.max_grandparent_overlap_factor));
+    options.setDisableAutoCompactions(
+        (Boolean)flags_.get(Flag.disable_auto_compactions));
+    options.setSourceCompactionFactor(
+        (Integer)flags_.get(Flag.source_compaction_factor));
+    options.setFilterDeletes(
+        (Boolean)flags_.get(Flag.filter_deletes));
+    options.setMaxSuccessiveMerges(
+        (Integer)flags_.get(Flag.max_successive_merges));
+    options.setWalTtlSeconds((Long)flags_.get(Flag.wal_ttl_seconds));
+    options.setWalSizeLimitMB((Long)flags_.get(Flag.wal_size_limit_MB));
+    int bloomBits = (Integer)flags_.get(Flag.bloom_bits);
+    if (bloomBits > 0) {
+      // Internally, options will keep a reference to this BloomFilter.
+      // This will disallow Java to GC this BloomFilter.  In addition,
+      // options.dispose() will release the c++ object of this BloomFilter.
+      // As a result, the caller should not directly call
+      // BloomFilter.dispose().
+      options.setFilter(new BloomFilter(bloomBits));
+    }
+    /* TODO(yhchiang): enable the following parameters
+    options.setCompressionType((String)flags_.get(Flag.compression_type));
+    options.setCompressionLevel((Integer)flags_.get(Flag.compression_level));
+    options.setMinLevelToCompress((Integer)flags_.get(Flag.min_level_to_compress));
+    options.setHdfs((String)flags_.get(Flag.hdfs)); // env
+    options.setCacheNumshardbits((Integer)flags_.get(Flag.cache_numshardbits));
+    options.setStatistics((Boolean)flags_.get(Flag.statistics));
+    options.setUniversalSizeRatio(
+        (Integer)flags_.get(Flag.universal_size_ratio));
+    options.setUniversalMinMergeWidth(
+        (Integer)flags_.get(Flag.universal_min_merge_width));
+    options.setUniversalMaxMergeWidth(
+        (Integer)flags_.get(Flag.universal_max_merge_width));
+    options.setUniversalMaxSizeAmplificationPercent(
+        (Integer)flags_.get(Flag.universal_max_size_amplification_percent));
+    options.setUniversalCompressionSizePercent(
+        (Integer)flags_.get(Flag.universal_compression_size_percent));
+    // TODO(yhchiang): add RocksDB.openForReadOnly() to enable Flag.readonly
+    // TODO(yhchiang): enable Flag.merge_operator by switch
+    options.setAccessHintOnCompactionStart(
+        (String)flags_.get(Flag.compaction_fadvice));
+    // available values of fadvice are "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" for fadvice
+    */
+  }
+
+  // Runs the configured benchmark session end to end:
+  //   1. destroy any previous database unless --use_existing_db,
+  //   2. build Options, open the database, print the config banner,
+  //   3. for each name in benchmarks_, assemble its tasks, run them on
+  //      thread pools, and report per-benchmark timing via stop().
+  private void run() throws RocksDBException {
+    if (!useExisting_) {
+      destroyDb();
+    }
+    Options options = new Options();
+    prepareOptions(options);
+    open(options);
+
+    printHeader(options);
+
+    for (String benchmark : benchmarks_) {
+      // Foreground (measured) tasks vs. background tasks (e.g. the writer
+      // used by readwhilewriting, which is excluded from merged stats).
+      List<Callable<Stats>> tasks = new ArrayList<Callable<Stats>>();
+      List<Callable<Stats>> bgTasks = new ArrayList<Callable<Stats>>();
+      WriteOptions writeOpt = new WriteOptions();
+      prepareWriteOptions(writeOpt);
+      ReadOptions readOpt = new ReadOptions();
+      prepareReadOptions(readOpt);
+      int currentTaskId = 0;
+      boolean known = true;
+
+      if (benchmark.equals("fillseq")) {
+        tasks.add(new WriteSequentialTask(
+            currentTaskId++, randSeed_, num_, num_, writeOpt, 1));
+      } else if (benchmark.equals("fillbatch")) {
+        // num_ / 1000 batches of 1000 entries each.
+        tasks.add(new WriteRandomTask(
+            currentTaskId++, randSeed_, num_ / 1000, num_, writeOpt, 1000));
+      } else if (benchmark.equals("fillrandom")) {
+        tasks.add(new WriteRandomTask(
+            currentTaskId++, randSeed_, num_, num_, writeOpt, 1));
+      } else if (benchmark.equals("filluniquerandom")) {
+        tasks.add(new WriteUniqueRandomTask(
+            currentTaskId++, randSeed_, num_, num_, writeOpt, 1));
+      } else if (benchmark.equals("fillsync")) {
+        // Synced writes are slow, so only num_ / 1000 entries are written.
+        writeOpt.setSync(true);
+        tasks.add(new WriteRandomTask(
+            currentTaskId++, randSeed_, num_ / 1000, num_ / 1000,
+            writeOpt, 1));
+      } else if (benchmark.equals("readseq")) {
+        for (int t = 0; t < threadNum_; ++t) {
+          tasks.add(new ReadSequentialTask(
+              currentTaskId++, randSeed_, reads_ / threadNum_, num_));
+        }
+      } else if (benchmark.equals("readrandom")) {
+        for (int t = 0; t < threadNum_; ++t) {
+          tasks.add(new ReadRandomTask(
+              currentTaskId++, randSeed_, reads_ / threadNum_, num_));
+        }
+      } else if (benchmark.equals("readwhilewriting")) {
+        // One background writer (task id -1, capped by writesPerSeconds_)
+        // whose stats are marked excluded from the merged result, plus
+        // threadNum_ measured readers.
+        WriteTask writeTask = new WriteRandomTask(
+            -1, randSeed_, Long.MAX_VALUE, num_, writeOpt, 1, writesPerSeconds_);
+        writeTask.stats_.setExcludeFromMerge();
+        bgTasks.add(writeTask);
+        for (int t = 0; t < threadNum_; ++t) {
+          tasks.add(new ReadRandomTask(
+              currentTaskId++, randSeed_, reads_ / threadNum_, num_));
+        }
+      } else if (benchmark.equals("readhot")) {
+        // Reads restricted to a 1% "hot" slice of the key space (num_ / 100).
+        for (int t = 0; t < threadNum_; ++t) {
+          tasks.add(new ReadRandomTask(
+              currentTaskId++, randSeed_, reads_ / threadNum_, num_ / 100));
+        }
+      } else if (benchmark.equals("delete")) {
+        destroyDb();
+        open(options);
+      } else {
+        known = false;
+        System.err.println("Unknown benchmark: " + benchmark);
+      }
+      if (known) {
+        ExecutorService executor = Executors.newCachedThreadPool();
+        ExecutorService bgExecutor = Executors.newCachedThreadPool();
+        try {
+          // measure only the main executor time
+          List<Future<Stats>> bgResults = new ArrayList<Future<Stats>>();
+          for (Callable bgTask : bgTasks) {
+            bgResults.add(bgExecutor.submit(bgTask));
+          }
+          // start() records startTime_ after bg tasks are already running.
+          start();
+          List<Future<Stats>> results = executor.invokeAll(tasks);
+          executor.shutdown();
+          boolean finished = executor.awaitTermination(10, TimeUnit.SECONDS);
+          if (!finished) {
+            System.out.format(
+                "Benchmark %s was not finished before timeout.",
+                benchmark);
+            executor.shutdownNow();
+          }
+          // Mark the run finished; presumably the unbounded background
+          // writer observes this flag and stops -- see WriteTask.
+          setFinished(true);
+          bgExecutor.shutdown();
+          finished = bgExecutor.awaitTermination(10, TimeUnit.SECONDS);
+          if (!finished) {
+            System.out.format(
+                "Benchmark %s was not finished before timeout.",
+                benchmark);
+            bgExecutor.shutdownNow();
+          }
+
+          stop(benchmark, results, currentTaskId);
+        } catch (InterruptedException e) {
+          System.err.println(e);
+        }
+      }
+      // Per-benchmark native option objects must be disposed explicitly.
+      writeOpt.dispose();
+      readOpt.dispose();
+    }
+    options.dispose();
+    db_.close();
+  }
+
+  private void printHeader(Options options) {
+    int kKeySize = 16;
+    System.out.printf("Keys:     %d bytes each\n", kKeySize);
+    System.out.printf("Values:   %d bytes each (%d bytes after compression)\n",
+        valueSize_,
+        (int) (valueSize_ * compressionRatio_ + 0.5));
+    System.out.printf("Entries:  %d\n", num_);
+    System.out.printf("RawSize:  %.1f MB (estimated)\n",
+        ((double)(kKeySize + valueSize_) * num_) / SizeUnit.MB);
+    System.out.printf("FileSize:   %.1f MB (estimated)\n",
+        (((kKeySize + valueSize_ * compressionRatio_) * num_) / SizeUnit.MB));
+    System.out.format("Memtable Factory: %s%n", options.memTableFactoryName());
+    System.out.format("Prefix:   %d bytes%n", prefixSize_);
+    System.out.format("Compression: %s%n", compressionType_);
+    printWarnings();
+    System.out.printf("------------------------------------------------\n");
+  }
+
+  /**
+   * Warns when JVM assertions are enabled, since they make the benchmark
+   * unnecessarily slow.
+   */
+  void printWarnings() {
+    boolean assertsEnabled = false;
+    // The assignment inside the assert only runs when -ea is active.
+    assert assertsEnabled = true; // Intentional side effect!!!
+    if (!assertsEnabled) {
+      return;
+    }
+    System.out.printf(
+        "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+  }
+
+  // Opens (or, per options, creates) the RocksDB instance at databaseDir_
+  // and stores the handle in db_.  Throws RocksDBException on failure.
+  private void open(Options options) throws RocksDBException {
+    db_ = RocksDB.open(options, databaseDir_);
+  }
+
+  // Marks the beginning of a timed benchmark section: clears the finished
+  // flag and records the start timestamp used by stop().
+  private void start() {
+    setFinished(false);
+    startTime_ = System.nanoTime();
+  }
+
+  // Finalizes one benchmark run: computes elapsed wall-clock seconds since
+  // start(), merges Stats from each completed foreground future, and
+  // prints the one-line summary (micros/op, MB/s, tasks finished).
+  //   benchmark         - name printed in the report line.
+  //   results           - futures of the foreground tasks.
+  //   concurrentThreads - number of tasks that were launched.
+  private void stop(
+      String benchmark, List<Future<Stats>> results, int concurrentThreads) {
+    long endTime = System.nanoTime();
+    double elapsedSeconds =
+        1.0d * (endTime - startTime_) / TimeUnit.SECONDS.toNanos(1);
+
+    // Accumulator for merged per-task stats; id -1 marks it as synthetic.
+    Stats stats = new Stats(-1);
+    int taskFinishedCount = 0;
+    for (Future<Stats> result : results) {
+      if (result.isDone()) {
+        try {
+          // The future is already done, so this get() should not block long.
+          Stats taskStats = result.get(3, TimeUnit.SECONDS);
+          if (!result.isCancelled()) {
+            taskFinishedCount++;
+          }
+          stats.merge(taskStats);
+        } catch (Exception e) {
+          // then it's not successful, the output will indicate this
+        }
+      }
+    }
+
+    System.out.printf(
+        "%-16s : %11.5f micros/op; %6.1f MB/s; %d / %d task(s) finished.\n",
+        benchmark, (double) elapsedSeconds / stats.done_ * 1e6,
+        (stats.bytes_ / 1048576.0) / elapsedSeconds,
+        taskFinishedCount, concurrentThreads);
+  }
+
+  public void generateKeyFromLong(byte[] slice, long n) {
+    assert(n >= 0);
+    int startPos = 0;
+
+    if (keysPerPrefix_ > 0) {
+      long numPrefix = (num_ + keysPerPrefix_ - 1) / keysPerPrefix_;
+      long prefix = n % numPrefix;
+      int bytesToFill = Math.min(prefixSize_, 8);
+      for (int i = 0; i < bytesToFill; ++i) {
+        slice[i] = (byte) (prefix % 256);
+        prefix /= 256;
+      }
+      for (int i = 8; i < bytesToFill; ++i) {
+        slice[i] = '0';
+      }
+      startPos = bytesToFill;
+    }
+
+    for (int i = slice.length - 1; i >= startPos; --i) {
+      slice[i] = (byte) ('0' + (n % 10));
+      n /= 10;
+    }
+  }
+
+  /**
+   * Closes the open database handle (if any) so that benchmarks requiring
+   * a fresh database start from a clean state.  Deleting the on-disk files
+   * is still a TODO.
+   */
+  private void destroyDb() {
+    if (db_ != null) {
+      db_.close();
+      // Drop the reference so a repeated destroyDb() cannot close the same
+      // native handle twice; open() reassigns db_ afterwards.
+      db_ = null;
+    }
+    // TODO(yhchiang): develop our own FileUtil
+    // FileUtil.deleteDir(databaseDir_);
+  }
+
+  // Placeholder for end-of-run statistics reporting; intentionally a no-op
+  // for now (nothing in this file populates or prints DB statistics).
+  private void printStats() {
+  }
+
+  /**
+   * Prints the usage banner: every flag name, its description, and -- when
+   * one is defined -- its default value.
+   */
+  static void printHelp() {
+    System.out.println("usage:");
+    for (Flag flag : Flag.values()) {
+      System.out.printf("  --%s%n\t%s%n", flag.name(), flag.desc());
+      Object defaultValue = flag.getDefaultValue();
+      if (defaultValue != null) {
+        System.out.printf("\tDEFAULT: %s%n", defaultValue.toString());
+      }
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    Map<Flag, Object> flags = new EnumMap<Flag, Object>(Flag.class);
+    for (Flag flag : Flag.values()) {
+      if (flag.getDefaultValue() != null) {
+        flags.put(flag, flag.getDefaultValue());
+      }
+    }
+    for (String arg : args) {
+      boolean valid = false;
+      if (arg.equals("--help") || arg.equals("-h")) {
+        printHelp();
+        System.exit(0);
+      }
+      if (arg.startsWith("--")) {
+        try {
+          String[] parts = arg.substring(2).split("=");
+          if (parts.length >= 1) {
+            Flag key = Flag.valueOf(parts[0]);
+            if (key != null) {
+              Object value = null;
+              if (parts.length >= 2) {
+                value = key.parseValue(parts[1]);
+              }
+              flags.put(key, value);
+              valid = true;
+            }
+          }
+        }
+        catch (Exception e) {
+        }
+      }
+      if (!valid) {
+        System.err.println("Invalid argument " + arg);
+        System.exit(1);
+      }
+    }
+    new DbBenchmark(flags).run();
+  }
+
+  private enum Flag {
+    benchmarks(
+        Arrays.asList(
+            "fillseq",
+            "readrandom",
+            "fillrandom"),
+        "Comma-separated list of operations to run in the specified order\n" +
+        "\tActual benchmarks:\n" +
+        "\t\tfillseq          -- write N values in sequential key order in async mode.\n" +
+        "\t\tfillrandom       -- write N values in random key order in async mode.\n" +
+        "\t\tfillbatch        -- write N/1000 batch where each batch has 1000 values\n" +
+        "\t\t                   in random key order in sync mode.\n" +
+        "\t\tfillsync         -- write N/100 values in random key order in sync mode.\n" +
+        "\t\tfill100K         -- write N/1000 100K values in random order in async mode.\n" +
+        "\t\treadseq          -- read N times sequentially.\n" +
+        "\t\treadrandom       -- read N times in random order.\n" +
+        "\t\treadhot          -- read N times in random order from 1% section of DB.\n" +
+        "\t\treadwhilewriting -- measure the read performance of multiple readers\n" +
+        "\t\t                   with a bg single writer.  The write rate of the bg\n" +
+        "\t\t                   is capped by --writes_per_second.\n" +
+        "\tMeta Operations:\n" +
+        "\t\tdelete            -- delete DB") {
+      @Override public Object parseValue(String value) {
+        return new ArrayList<String>(Arrays.asList(value.split(",")));
+      }
+    },
+    compression_ratio(0.5d,
+        "Arrange to generate values that shrink to this fraction of\n" +
+        "\ttheir original size after compression.") {
+      @Override public Object parseValue(String value) {
+        return Double.parseDouble(value);
+      }
+    },
+    use_existing_db(false,
+        "If true, do not destroy the existing database.  If you set this\n" +
+        "\tflag and also specify a benchmark that wants a fresh database,\n" +
+        "\tthat benchmark will fail.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    num(1000000,
+        "Number of key/values to place in database.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    threads(1,
+        "Number of concurrent threads to run.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    reads(null,
+        "Number of read operations to do.  If negative, do --nums reads.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    key_size(16,
+        "The size of each key in bytes.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    value_size(100,
+        "The size of each value in bytes.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    write_buffer_size(4 * SizeUnit.MB,
+        "Number of bytes to buffer in memtable before compacting\n" +
+        "\t(initialized to default value by 'main'.)") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    max_write_buffer_number(2,
+             "The number of in-memory memtables. Each memtable is of size\n" +
+             "\twrite_buffer_size.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    prefix_size(0, "Controls the prefix size for HashSkipList, HashLinkedList,\n" +
+                   "\tand plain table.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    keys_per_prefix(0, "Controls the average number of keys generated\n" +
+             "\tper prefix, 0 means no special handling of the prefix,\n" +
+             "\ti.e. use the prefix comes with the generated random number.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    memtablerep("skip_list",
+        "The memtable format.  Available options are\n" +
+        "\tskip_list,\n" +
+        "\tvector,\n" +
+        "\thash_linkedlist,\n" +
+        "\thash_skiplist (prefix_hash.)") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    },
+    hash_bucket_count(SizeUnit.MB,
+        "The number of hash buckets used in the hash-bucket-based\n" +
+        "\tmemtables.  Memtables that currently support this argument are\n" +
+        "\thash_linkedlist and hash_skiplist.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    writes_per_second(10000,
+        "The write-rate of the background writer used in the\n" +
+        "\t`readwhilewriting` benchmark.  Non-positive number indicates\n" +
+        "\tusing an unbounded write-rate in `readwhilewriting` benchmark.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    use_plain_table(false,
+        "Use plain-table sst format.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    cache_size(-1L,
+        "Number of bytes to use as a cache of uncompressed data.\n" +
+        "\tNegative means use default settings.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    seed(0L,
+        "Seed base for random number generators.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    num_levels(7,
+        "The total number of levels.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    numdistinct(1000,
+        "Number of distinct keys to use. Used in RandomWithVerify to\n" +
+        "\tread/write on fewer keys so that gets are more likely to find the\n" +
+        "\tkey and puts are more likely to update the same key.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    merge_keys(-1,
+        "Number of distinct keys to use for MergeRandom and\n" +
+        "\tReadRandomMergeRandom.\n" +
+        "\tIf negative, there will be FLAGS_num keys.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    bloom_locality(0,"Control bloom filter probes locality.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    duration(0,"Time in seconds for the random-ops tests to run.\n" +
+        "\tWhen 0 then num & reads determine the test duration.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    num_multi_db(0,
+        "Number of DBs used in the benchmark. 0 means single DB.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    histogram(false,"Print histogram of operation timings.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    min_write_buffer_number_to_merge(
+        defaultOptions_.minWriteBufferNumberToMerge(),
+        "The minimum number of write buffers that will be merged together\n" +
+        "\tbefore writing to storage. This is cheap because it is an\n" +
+        "\tin-memory merge. If this feature is not enabled, then all these\n" +
+        "\twrite buffers are flushed to L0 as separate files and this\n" +
+        "\tincreases read amplification because a get request has to check\n" +
+        "\tin all of these files. Also, an in-memory merge may result in\n" +
+        "\twriting less data to storage if there are duplicate records\n" +
+        "\tin each of these individual write buffers.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_background_compactions(
+        defaultOptions_.maxBackgroundCompactions(),
+        "The maximum number of concurrent background compactions\n" +
+        "\tthat can occur in parallel.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_background_flushes(
+        defaultOptions_.maxBackgroundFlushes(),
+        "The maximum number of concurrent background flushes\n" +
+        "\tthat can occur in parallel.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    /* TODO(yhchiang): enable the following
+    compaction_style((int32_t) defaultOptions_.compactionStyle(),
+        "style of compaction: level-based vs universal.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },*/
+    universal_size_ratio(0,
+        "Percentage flexibility while comparing file size\n" +
+        "\t(for universal compaction only).") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    universal_min_merge_width(0,"The minimum number of files in a\n" +
+        "\tsingle compaction run (for universal compaction only).") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    universal_max_merge_width(0,"The max number of files to compact\n" +
+        "\tin universal style compaction.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    universal_max_size_amplification_percent(0,
+        "The max size amplification for universal style compaction.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    universal_compression_size_percent(-1,
+        "The percentage of the database to compress for universal\n" +
+        "\tcompaction. -1 means compress everything.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    block_size(defaultOptions_.blockSize(),
+        "Number of bytes in a block.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    compressed_cache_size(-1,
+        "Number of bytes to use as a cache of compressed data.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    open_files(defaultOptions_.maxOpenFiles(),
+        "Maximum number of files to keep open at the same time\n" +
+        "\t(use default if == 0)") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    bloom_bits(-1,"Bloom filter bits per key. Negative means\n" +
+        "\tuse default settings.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    memtable_bloom_bits(0,"Bloom filter bits per key for memtable.\n" +
+        "\tNegative means no bloom filter.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    cache_numshardbits(-1,"Number of shards for the block cache\n" +
+        "\tis 2 ** cache_numshardbits. Negative means use default settings.\n" +
+        "\tThis is applied only if FLAGS_cache_size is non-negative.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    cache_remove_scan_count_limit(32,"") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    verify_checksum(false,"Verify checksum for every block read\n" +
+        "\tfrom storage.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    statistics(false,"Database statistics.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    writes(-1,"Number of write operations to do. If negative, do\n" +
+        "\t--num reads.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    sync(false,"Sync all writes to disk.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    disable_data_sync(false,"If true, do not wait until data is\n" +
+        "\tsynced to disk.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    use_fsync(false,"If true, issue fsync instead of fdatasync.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    disable_wal(false,"If true, do not write WAL for write.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    wal_dir("", "If not empty, use the given dir for WAL.") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    },
+    target_file_size_base(2 * 1048576,"Target file size at level-1") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    target_file_size_multiplier(1,
+        "A multiplier to compute target level-N file size (N >= 2)") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_bytes_for_level_base(10 * 1048576,
+      "Max bytes for level-1") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_bytes_for_level_multiplier(10,
+        "A multiplier to compute max bytes for level-N (N >= 2)") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    level0_stop_writes_trigger(12,"Number of files in level-0\n" +
+        "\tthat will trigger put stop.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    level0_slowdown_writes_trigger(8,"Number of files in level-0\n" +
+        "\tthat will slow down writes.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    level0_file_num_compaction_trigger(4,"Number of files in level-0\n" +
+        "\twhen compactions start.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    readwritepercent(90,"Ratio of reads to reads/writes (expressed\n" +
+        "\tas percentage) for the ReadRandomWriteRandom workload. The\n" +
+        "\tdefault value 90 means 90% operations out of all reads and writes\n" +
+        "\toperations are reads. In other words, 9 gets for every 1 put.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    mergereadpercent(70,"Ratio of merges to merges&reads (expressed\n" +
+        "\tas percentage) for the ReadRandomMergeRandom workload. The\n" +
+        "\tdefault value 70 means 70% out of all read and merge operations\n" +
+        "\tare merges. In other words, 7 merges for every 3 gets.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    deletepercent(2,"Percentage of deletes out of reads/writes/\n" +
+        "\tdeletes (used in RandomWithVerify only). RandomWithVerify\n" +
+        "\tcalculates writepercent as (100 - FLAGS_readwritepercent -\n" +
+        "\tdeletepercent), so deletepercent must be smaller than (100 -\n" +
+        "\tFLAGS_readwritepercent)") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    disable_seek_compaction(false,"Option to disable compaction\n" +
+        "\ttriggered by read.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    delete_obsolete_files_period_micros(0,"Option to delete\n" +
+        "\tobsolete files periodically. 0 means that obsolete files are\n" +
+        "\tdeleted after every compaction run.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    compression_type("snappy",
+        "Algorithm used to compress the database.") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    },
+    compression_level(-1,
+        "Compression level. For zlib this should be -1 for the\n" +
+        "\tdefault level, or between 0 and 9.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    min_level_to_compress(-1,"If non-negative, compression starts\n" +
+        "\tfrom this level. Levels with number < min_level_to_compress are\n" +
+        "\tnot compressed. Otherwise, apply compression_type to\n" +
+        "\tall levels.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    table_cache_numshardbits(4,"") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    stats_interval(0,"Stats are reported every N operations when\n" +
+        "\tthis is greater than zero. When 0 the interval grows over time.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    stats_per_interval(0,"Reports additional stats per interval when\n" +
+        "\tthis is greater than 0.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    perf_level(0,"Level of perf collection.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    soft_rate_limit(0.0,"") {
+      @Override public Object parseValue(String value) {
+        return Double.parseDouble(value);
+      }
+    },
+    hard_rate_limit(0.0,"When not equal to 0 this make threads\n" +
+        "\tsleep at each stats reporting interval until the compaction\n" +
+        "\tscore for all levels is less than or equal to this value.") {
+      @Override public Object parseValue(String value) {
+        return Double.parseDouble(value);
+      }
+    },
+    rate_limit_delay_max_milliseconds(1000,
+        "When hard_rate_limit is set then this is the max time a put will\n" +
+        "\tbe stalled.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_grandparent_overlap_factor(10,"Control maximum bytes of\n" +
+        "\toverlaps in grandparent (i.e., level+2) before we stop building a\n" +
+        "\tsingle file in a level->level+1 compaction.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    readonly(false,"Run read only benchmarks.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    disable_auto_compactions(false,"Do not auto trigger compactions.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    source_compaction_factor(1,"Cap the size of data in level-K for\n" +
+        "\ta compaction run that compacts Level-K with Level-(K+1) (for\n" +
+        "\tK >= 1)") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    wal_ttl_seconds(0L,"Set the TTL for the WAL Files in seconds.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    wal_size_limit_MB(0L,"Set the size limit for the WAL Files\n" +
+        "\tin MB.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    /* TODO(yhchiang): enable the following
+    bufferedio(rocksdb::EnvOptions().use_os_buffer,
+        "Allow buffered io using OS buffers.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    */
+    mmap_read(false,
+        "Allow reads to occur via mmap-ing files.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    mmap_write(false,
+        "Allow writes to occur via mmap-ing files.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    advise_random_on_open(defaultOptions_.adviseRandomOnOpen(),
+        "Advise random access on table file open.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    compaction_fadvice("NORMAL",
+      "Access pattern advice when a file is compacted.") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    },
+    use_tailing_iterator(false,
+        "Use tailing iterator to access a series of keys instead of get.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    use_adaptive_mutex(defaultOptions_.useAdaptiveMutex(),
+        "Use adaptive mutex.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    bytes_per_sync(defaultOptions_.bytesPerSync(),
+        "Allows OS to incrementally sync files to disk while they are\n" +
+        "\tbeing written, in the background. Issue one request for every\n" +
+        "\tbytes_per_sync written. 0 turns it off.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    filter_deletes(false," On true, deletes use bloom-filter and drop\n" +
+        "\tthe delete if key not present.") {
+      @Override public Object parseValue(String value) {
+        return Boolean.parseBoolean(value);
+      }
+    },
+    max_successive_merges(0,"Maximum number of successive merge\n" +
+        "\toperations on a key in the memtable.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    db("/tmp/rocksdbjni-bench",
+       "Use the db with the following name.") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    };
+
    // Each enum constant above overrides parseValue() to convert its raw
    // command-line string into the proper boxed type (Integer, Long, Double,
    // Boolean or String).
    private Flag(Object defaultValue, String desc) {
      defaultValue_ = defaultValue;
      desc_ = desc;
    }

    // Parses the raw command-line string into this flag's value type.
    protected abstract Object parseValue(String value);

    // Value used when the flag is not supplied on the command line.
    public Object getDefaultValue() {
      return defaultValue_;
    }

    // Human-readable help text shown in the usage message.
    public String desc() {
      return desc_;
    }

    private final Object defaultValue_;
    private final String desc_;
  }
+
  // Produces pseudo-random byte payloads for benchmark values.  A single
  // ~1MB buffer of printable ASCII is filled once from the seeded RNG, and
  // random slices of it are handed out, avoiding per-value generation cost.
  private static class RandomGenerator {
    private final byte[] data_;
    private int dataLength_;
    private int position_;
    Random rand_;

    // NOTE(review): compressionRatio is currently unused (see TODO below).
    private RandomGenerator(long seed, double compressionRatio) {
      // We use a limited amount of data over and over again and ensure
      // that it is larger than the compression window (32KB), and also
      // large enough to serve all typical value sizes we want to write.
      rand_ = new Random(seed);
      dataLength_ = 1048576 + 100;
      data_ = new byte[dataLength_];
      // TODO(yhchiang): mimic test::CompressibleString?
      for (int i = 0; i < dataLength_; ++i) {
        // Printable ASCII range: ' ' (0x20) + [0, 95) covers ' ' .. '~'.
        data_[i] = (byte) (' ' + rand_.nextInt(95));
      }
    }

    // Returns a fresh random slice of the shared buffer.
    // NOTE(review): assumes 0 < length < data_.length — Random.nextInt
    // throws IllegalArgumentException when its bound is not positive.
    private byte[] generate(int length) {
      position_ = rand_.nextInt(data_.length - length);
      return Arrays.copyOfRange(data_, position_, position_ + length);
    }
  }
+
  // Thread-safe read of the benchmark-finished flag (guarded by finishLock_).
  boolean isFinished() {
    synchronized(finishLock_) {
      return isFinished_;
    }
  }
+
  // Thread-safe write of the benchmark-finished flag (guarded by finishLock_).
  void setFinished(boolean flag) {
    synchronized(finishLock_) {
      isFinished_ = flag;
    }
  }
+
  // Runtime state and configuration parsed from the command-line flags.
  RocksDB db_;
  final List<String> benchmarks_;
  final int num_;
  final int reads_;
  final int keySize_;
  final int valueSize_;
  final int threadNum_;
  final int writesPerSeconds_;
  final long randSeed_;
  final long cacheSize_;
  final boolean useExisting_;
  final String databaseDir_;
  double compressionRatio_;
  RandomGenerator gen_;
  long startTime_;

  // memtable related
  final int maxWriteBufferNumber_;
  final int prefixSize_;
  final int keysPerPrefix_;
  final String memtable_;
  final long hashBucketCount_;

  // sst format related
  boolean usePlainTable_;

  // finishLock_ guards isFinished_; see isFinished()/setFinished().
  Object finishLock_;
  boolean isFinished_;
  Map<Flag, Object> flags_;
  // as the scope of a static member equals to the scope of the problem,
  // we let its c++ pointer to be disposed in its finalizer.
  static Options defaultOptions_ = new Options();
  String compressionType_;
  CompressionType compression_;
}
diff --git a/java/org/rocksdb/test/BackupableDBTest.java b/java/org/rocksdb/test/BackupableDBTest.java
new file mode 100644 (file)
index 0000000..f0fc3d5
--- /dev/null
@@ -0,0 +1,41 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb.test;
+
+import org.rocksdb.*;
+
+public class BackupableDBTest {
+  static final String db_path = "/tmp/backupablejni_db";
+  static final String backup_path = "/tmp/backupablejni_db_backup";
+  static {
+    RocksDB.loadLibrary();
+  }
+  public static void main(String[] args) {
+
+    Options opt = new Options();
+    opt.setCreateIfMissing(true);
+
+    BackupableDBOptions bopt = new BackupableDBOptions(backup_path);
+    BackupableDB bdb = null;
+
+    try {
+      bdb = BackupableDB.open(opt, bopt, db_path);
+      bdb.put("hello".getBytes(), "BackupableDB".getBytes());
+      bdb.createNewBackup(true);
+      byte[] value = bdb.get("hello".getBytes());
+      assert(new String(value).equals("BackupableDB"));
+    } catch (RocksDBException e) {
+      System.err.format("[ERROR]: %s%n", e);
+      e.printStackTrace();
+    } finally {
+      opt.dispose();
+      bopt.dispose();
+      if (bdb != null) {
+        bdb.close();
+      }
+    }
+  }
+}
diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java
new file mode 100644 (file)
index 0000000..e1e0e05
--- /dev/null
@@ -0,0 +1,424 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb.test;
+
+import java.util.Random;
+import org.rocksdb.RocksDB;
+import org.rocksdb.Options;
+
/**
 * Smoke test for {@link org.rocksdb.Options}: verifies that every exposed
 * setter/getter pair round-trips a randomly chosen value through the
 * underlying native object.
 *
 * NOTE(review): the checks use the Java {@code assert} statement, so this
 * test only detects failures when the JVM runs with assertions enabled
 * ({@code java -ea}).
 */
public class OptionsTest {
  static {
    // Load the native rocksdb JNI library before any RocksDB class is used.
    RocksDB.loadLibrary();
  }
  public static void main(String[] args) {
    Options opt = new Options();
    Random rand = new Random();
    { // CreateIfMissing test
      boolean boolValue = rand.nextBoolean();
      opt.setCreateIfMissing(boolValue);
      assert(opt.createIfMissing() == boolValue);
    }

    { // ErrorIfExists test
      boolean boolValue = rand.nextBoolean();
      opt.setErrorIfExists(boolValue);
      assert(opt.errorIfExists() == boolValue);
    }

    { // ParanoidChecks test
      boolean boolValue = rand.nextBoolean();
      opt.setParanoidChecks(boolValue);
      assert(opt.paranoidChecks() == boolValue);
    }

    { // MaxOpenFiles test
      int intValue = rand.nextInt();
      opt.setMaxOpenFiles(intValue);
      assert(opt.maxOpenFiles() == intValue);
    }

    { // DisableDataSync test
      boolean boolValue = rand.nextBoolean();
      opt.setDisableDataSync(boolValue);
      assert(opt.disableDataSync() == boolValue);
    }

    { // UseFsync test
      boolean boolValue = rand.nextBoolean();
      opt.setUseFsync(boolValue);
      assert(opt.useFsync() == boolValue);
    }

    { // DbStatsLogInterval test
      int intValue = rand.nextInt();
      opt.setDbStatsLogInterval(intValue);
      assert(opt.dbStatsLogInterval() == intValue);
    }

    { // DbLogDir test
      String str = "path/to/DbLogDir";
      opt.setDbLogDir(str);
      assert(opt.dbLogDir().equals(str));
    }

    { // WalDir test
      String str = "path/to/WalDir";
      opt.setWalDir(str);
      assert(opt.walDir().equals(str));
    }

    { // DeleteObsoleteFilesPeriodMicros test
      long longValue = rand.nextLong();
      opt.setDeleteObsoleteFilesPeriodMicros(longValue);
      assert(opt.deleteObsoleteFilesPeriodMicros() == longValue);
    }

    { // MaxBackgroundCompactions test
      int intValue = rand.nextInt();
      opt.setMaxBackgroundCompactions(intValue);
      assert(opt.maxBackgroundCompactions() == intValue);
    }

    { // MaxBackgroundFlushes test
      int intValue = rand.nextInt();
      opt.setMaxBackgroundFlushes(intValue);
      assert(opt.maxBackgroundFlushes() == intValue);
    }

    { // MaxLogFileSize test
      long longValue = rand.nextLong();
      opt.setMaxLogFileSize(longValue);
      assert(opt.maxLogFileSize() == longValue);
    }

    { // LogFileTimeToRoll test
      long longValue = rand.nextLong();
      opt.setLogFileTimeToRoll(longValue);
      assert(opt.logFileTimeToRoll() == longValue);
    }

    { // KeepLogFileNum test
      long longValue = rand.nextLong();
      opt.setKeepLogFileNum(longValue);
      assert(opt.keepLogFileNum() == longValue);
    }

    { // MaxManifestFileSize test
      long longValue = rand.nextLong();
      opt.setMaxManifestFileSize(longValue);
      assert(opt.maxManifestFileSize() == longValue);
    }

    { // TableCacheNumshardbits test
      int intValue = rand.nextInt();
      opt.setTableCacheNumshardbits(intValue);
      assert(opt.tableCacheNumshardbits() == intValue);
    }

    { // TableCacheRemoveScanCountLimit test
      int intValue = rand.nextInt();
      opt.setTableCacheRemoveScanCountLimit(intValue);
      assert(opt.tableCacheRemoveScanCountLimit() == intValue);
    }

    { // WalTtlSeconds test
      long longValue = rand.nextLong();
      opt.setWalTtlSeconds(longValue);
      assert(opt.walTtlSeconds() == longValue);
    }

    { // ManifestPreallocationSize test
      long longValue = rand.nextLong();
      opt.setManifestPreallocationSize(longValue);
      assert(opt.manifestPreallocationSize() == longValue);
    }

    { // AllowOsBuffer test
      boolean boolValue = rand.nextBoolean();
      opt.setAllowOsBuffer(boolValue);
      assert(opt.allowOsBuffer() == boolValue);
    }

    { // AllowMmapReads test
      boolean boolValue = rand.nextBoolean();
      opt.setAllowMmapReads(boolValue);
      assert(opt.allowMmapReads() == boolValue);
    }

    { // AllowMmapWrites test
      boolean boolValue = rand.nextBoolean();
      opt.setAllowMmapWrites(boolValue);
      assert(opt.allowMmapWrites() == boolValue);
    }

    { // IsFdCloseOnExec test
      boolean boolValue = rand.nextBoolean();
      opt.setIsFdCloseOnExec(boolValue);
      assert(opt.isFdCloseOnExec() == boolValue);
    }

    { // SkipLogErrorOnRecovery test
      boolean boolValue = rand.nextBoolean();
      opt.setSkipLogErrorOnRecovery(boolValue);
      assert(opt.skipLogErrorOnRecovery() == boolValue);
    }

    { // StatsDumpPeriodSec test
      int intValue = rand.nextInt();
      opt.setStatsDumpPeriodSec(intValue);
      assert(opt.statsDumpPeriodSec() == intValue);
    }

    { // AdviseRandomOnOpen test
      boolean boolValue = rand.nextBoolean();
      opt.setAdviseRandomOnOpen(boolValue);
      assert(opt.adviseRandomOnOpen() == boolValue);
    }

    { // UseAdaptiveMutex test
      boolean boolValue = rand.nextBoolean();
      opt.setUseAdaptiveMutex(boolValue);
      assert(opt.useAdaptiveMutex() == boolValue);
    }

    { // BytesPerSync test
      long longValue = rand.nextLong();
      opt.setBytesPerSync(longValue);
      assert(opt.bytesPerSync() == longValue);
    }

    { // AllowThreadLocal test
      boolean boolValue = rand.nextBoolean();
      opt.setAllowThreadLocal(boolValue);
      assert(opt.allowThreadLocal() == boolValue);
    }

    { // WriteBufferSize test
      long longValue = rand.nextLong();
      opt.setWriteBufferSize(longValue);
      assert(opt.writeBufferSize() == longValue);
    }

    { // MaxWriteBufferNumber test
      int intValue = rand.nextInt();
      opt.setMaxWriteBufferNumber(intValue);
      assert(opt.maxWriteBufferNumber() == intValue);
    }

    { // MinWriteBufferNumberToMerge test
      int intValue = rand.nextInt();
      opt.setMinWriteBufferNumberToMerge(intValue);
      assert(opt.minWriteBufferNumberToMerge() == intValue);
    }

    { // BlockSize test
      long longValue = rand.nextLong();
      opt.setBlockSize(longValue);
      assert(opt.blockSize() == longValue);
    }

    { // BlockRestartInterval test
      int intValue = rand.nextInt();
      opt.setBlockRestartInterval(intValue);
      assert(opt.blockRestartInterval() == intValue);
    }

    { // WholeKeyFiltering test
      boolean boolValue = rand.nextBoolean();
      opt.setWholeKeyFiltering(boolValue);
      assert(opt.wholeKeyFiltering() == boolValue);
    }

    { // NumLevels test
      int intValue = rand.nextInt();
      opt.setNumLevels(intValue);
      assert(opt.numLevels() == intValue);
    }

    { // LevelFileNumCompactionTrigger test
      int intValue = rand.nextInt();
      opt.setLevelZeroFileNumCompactionTrigger(intValue);
      assert(opt.levelZeroFileNumCompactionTrigger() == intValue);
    }

    { // LevelSlowdownWritesTrigger test
      int intValue = rand.nextInt();
      opt.setLevelZeroSlowdownWritesTrigger(intValue);
      assert(opt.levelZeroSlowdownWritesTrigger() == intValue);
    }

    { // LevelStopWritesTrigger test
      int intValue = rand.nextInt();
      opt.setLevelZeroStopWritesTrigger(intValue);
      assert(opt.levelZeroStopWritesTrigger() == intValue);
    }

    { // MaxMemCompactionLevel test
      int intValue = rand.nextInt();
      opt.setMaxMemCompactionLevel(intValue);
      assert(opt.maxMemCompactionLevel() == intValue);
    }

    { // TargetFileSizeBase test
      int intValue = rand.nextInt();
      opt.setTargetFileSizeBase(intValue);
      assert(opt.targetFileSizeBase() == intValue);
    }

    { // TargetFileSizeMultiplier test
      int intValue = rand.nextInt();
      opt.setTargetFileSizeMultiplier(intValue);
      assert(opt.targetFileSizeMultiplier() == intValue);
    }

    { // MaxBytesForLevelBase test
      long longValue = rand.nextLong();
      opt.setMaxBytesForLevelBase(longValue);
      assert(opt.maxBytesForLevelBase() == longValue);
    }

    { // MaxBytesForLevelMultiplier test
      int intValue = rand.nextInt();
      opt.setMaxBytesForLevelMultiplier(intValue);
      assert(opt.maxBytesForLevelMultiplier() == intValue);
    }

    { // ExpandedCompactionFactor test
      int intValue = rand.nextInt();
      opt.setExpandedCompactionFactor(intValue);
      assert(opt.expandedCompactionFactor() == intValue);
    }

    { // SourceCompactionFactor test
      int intValue = rand.nextInt();
      opt.setSourceCompactionFactor(intValue);
      assert(opt.sourceCompactionFactor() == intValue);
    }

    { // MaxGrandparentOverlapFactor test
      int intValue = rand.nextInt();
      opt.setMaxGrandparentOverlapFactor(intValue);
      assert(opt.maxGrandparentOverlapFactor() == intValue);
    }

    { // DisableSeekCompaction test
      boolean boolValue = rand.nextBoolean();
      opt.setDisableSeekCompaction(boolValue);
      assert(opt.disableSeekCompaction() == boolValue);
    }

    { // SoftRateLimit test
      double doubleValue = rand.nextDouble();
      opt.setSoftRateLimit(doubleValue);
      assert(opt.softRateLimit() == doubleValue);
    }

    { // HardRateLimit test
      double doubleValue = rand.nextDouble();
      opt.setHardRateLimit(doubleValue);
      assert(opt.hardRateLimit() == doubleValue);
    }

    { // RateLimitDelayMaxMilliseconds test
      int intValue = rand.nextInt();
      opt.setRateLimitDelayMaxMilliseconds(intValue);
      assert(opt.rateLimitDelayMaxMilliseconds() == intValue);
    }

    { // NoBlockCache test
      boolean boolValue = rand.nextBoolean();
      opt.setNoBlockCache(boolValue);
      assert(opt.noBlockCache() == boolValue);
    }

    { // ArenaBlockSize test
      long longValue = rand.nextLong();
      opt.setArenaBlockSize(longValue);
      assert(opt.arenaBlockSize() == longValue);
    }

    { // DisableAutoCompactions test
      boolean boolValue = rand.nextBoolean();
      opt.setDisableAutoCompactions(boolValue);
      assert(opt.disableAutoCompactions() == boolValue);
    }

    { // PurgeRedundantKvsWhileFlush test
      boolean boolValue = rand.nextBoolean();
      opt.setPurgeRedundantKvsWhileFlush(boolValue);
      assert(opt.purgeRedundantKvsWhileFlush() == boolValue);
    }

    { // BlockSizeDeviation test
      int intValue = rand.nextInt();
      opt.setBlockSizeDeviation(intValue);
      assert(opt.blockSizeDeviation() == intValue);
    }

    { // VerifyChecksumsInCompaction test
      boolean boolValue = rand.nextBoolean();
      opt.setVerifyChecksumsInCompaction(boolValue);
      assert(opt.verifyChecksumsInCompaction() == boolValue);
    }

    { // FilterDeletes test
      boolean boolValue = rand.nextBoolean();
      opt.setFilterDeletes(boolValue);
      assert(opt.filterDeletes() == boolValue);
    }

    { // MaxSequentialSkipInIterations test
      long longValue = rand.nextLong();
      opt.setMaxSequentialSkipInIterations(longValue);
      assert(opt.maxSequentialSkipInIterations() == longValue);
    }

    { // InplaceUpdateSupport test
      boolean boolValue = rand.nextBoolean();
      opt.setInplaceUpdateSupport(boolValue);
      assert(opt.inplaceUpdateSupport() == boolValue);
    }

    { // InplaceUpdateNumLocks test
      long longValue = rand.nextLong();
      opt.setInplaceUpdateNumLocks(longValue);
      assert(opt.inplaceUpdateNumLocks() == longValue);
    }

    { // MemtablePrefixBloomBits test
      int intValue = rand.nextInt();
      opt.setMemtablePrefixBloomBits(intValue);
      assert(opt.memtablePrefixBloomBits() == intValue);
    }

    { // MemtablePrefixBloomProbes test
      int intValue = rand.nextInt();
      opt.setMemtablePrefixBloomProbes(intValue);
      assert(opt.memtablePrefixBloomProbes() == intValue);
    }

    { // BloomLocality test
      int intValue = rand.nextInt();
      opt.setBloomLocality(intValue);
      assert(opt.bloomLocality() == intValue);
    }

    { // MaxSuccessiveMerges test
      long longValue = rand.nextLong();
      opt.setMaxSuccessiveMerges(longValue);
      assert(opt.maxSuccessiveMerges() == longValue);
    }

    { // MinPartialMergeOperands test
      int intValue = rand.nextInt();
      opt.setMinPartialMergeOperands(intValue);
      assert(opt.minPartialMergeOperands() == intValue);
    }

    // Release the native Options object explicitly.
    opt.dispose();
    System.out.println("Passed OptionsTest");
  }
}
diff --git a/java/org/rocksdb/test/ReadOptionsTest.java b/java/org/rocksdb/test/ReadOptionsTest.java
new file mode 100644 (file)
index 0000000..b3b5b26
--- /dev/null
@@ -0,0 +1,40 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb.test;
+
+import java.util.Random;
+import org.rocksdb.RocksDB;
+import org.rocksdb.ReadOptions;
+
/**
 * Smoke test for {@link org.rocksdb.ReadOptions}: verifies that each
 * setter/getter pair round-trips a randomly chosen value.
 *
 * NOTE(review): the checks use the Java {@code assert} statement, so this
 * test only detects failures when the JVM runs with assertions enabled
 * ({@code java -ea}).
 */
public class ReadOptionsTest {
  static {
    // Load the native rocksdb JNI library before any RocksDB class is used.
    RocksDB.loadLibrary();
  }
  public static void main(String[] args) {
    ReadOptions opt = new ReadOptions();
    Random rand = new Random();
    { // VerifyChecksums test
      boolean boolValue = rand.nextBoolean();
      opt.setVerifyChecksums(boolValue);
      assert(opt.verifyChecksums() == boolValue);
    }

    { // FillCache test
      boolean boolValue = rand.nextBoolean();
      opt.setFillCache(boolValue);
      assert(opt.fillCache() == boolValue);
    }

    { // Tailing test
      boolean boolValue = rand.nextBoolean();
      opt.setTailing(boolValue);
      assert(opt.tailing() == boolValue);
    }

    // Release the native ReadOptions object explicitly.
    opt.dispose();
    System.out.println("Passed ReadOptionsTest");
  }
}
diff --git a/java/org/rocksdb/util/Environment.java b/java/org/rocksdb/util/Environment.java
new file mode 100644 (file)
index 0000000..c2e3bc0
--- /dev/null
@@ -0,0 +1,37 @@
+package org.rocksdb.util;
+
/**
 * Platform detection helpers used to pick the correct native-library file
 * name for the current operating system.
 */
public class Environment {
  // Lower-cased "os.name" system property, read once at class-load time.
  // final: never reassigned after initialization.
  private static final String OS = System.getProperty("os.name").toLowerCase();

  /** @return true when running on a Windows variant. */
  public static boolean isWindows() {
    return (OS.indexOf("win") >= 0);
  }

  /** @return true when running on Mac OS X. */
  public static boolean isMac() {
    return (OS.indexOf("mac") >= 0);
  }

  /** @return true when running on Linux, Unix or AIX. */
  public static boolean isUnix() {
    return (OS.indexOf("nix") >= 0 ||
            OS.indexOf("nux") >= 0 ||
            OS.indexOf("aix") >= 0);
  }

  /**
   * Maps a bare library name to the platform's shared-library file name,
   * e.g. "rocksdb" becomes "librocksdb.so" on Unix.
   *
   * @param name library base name.
   * @return platform-specific shared-library file name.
   * @throws UnsupportedOperationException on platforms without a native
   *     build (including Windows); the message names the offending OS.
   */
  public static String getSharedLibraryName(String name) {
    if (isUnix()) {
      return String.format("lib%s.so", name);
    } else if (isMac()) {
      return String.format("lib%s.dylib", name);
    }
    throw new UnsupportedOperationException(
        String.format("Cannot determine shared-library name for OS '%s'.", OS));
  }

  /**
   * Maps a bare library name to the platform's JNI library file name,
   * e.g. "rocksdbjni" becomes "librocksdbjni.jnilib" on Mac.
   *
   * @param name library base name.
   * @return platform-specific JNI library file name.
   * @throws UnsupportedOperationException on platforms without a native
   *     build (including Windows); the message names the offending OS.
   */
  public static String getJniLibraryName(String name) {
    if (isUnix()) {
      return String.format("lib%s.so", name);
    } else if (isMac()) {
      return String.format("lib%s.jnilib", name);
    }
    throw new UnsupportedOperationException(
        String.format("Cannot determine JNI library name for OS '%s'.", OS));
  }
}
diff --git a/java/org/rocksdb/util/SizeUnit.java b/java/org/rocksdb/util/SizeUnit.java
new file mode 100644 (file)
index 0000000..8d50cd1
--- /dev/null
@@ -0,0 +1,16 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb.util;
+
/**
 * Binary (power-of-two) size multipliers, expressed as bit shifts:
 * KB = 2^10 bytes, MB = 2^20, GB = 2^30, TB = 2^40, PB = 2^50.
 */
public class SizeUnit {
  public static final long KB = 1L << 10;
  public static final long MB = 1L << 20;
  public static final long GB = 1L << 30;
  public static final long TB = 1L << 40;
  public static final long PB = 1L << 50;

  // Pure constant holder; never instantiated.
  private SizeUnit() {}
}
diff --git a/java/rocksjni/backupablejni.cc b/java/rocksjni/backupablejni.cc
new file mode 100644 (file)
index 0000000..8b57a0c
--- /dev/null
@@ -0,0 +1,85 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ rocksdb::DB methods from Java side.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+#include <string>
+
+#include "include/org_rocksdb_BackupableDB.h"
+#include "include/org_rocksdb_BackupableDBOptions.h"
+#include "rocksjni/portal.h"
+#include "utilities/backupable_db.h"
+
/*
 * Class:     org_rocksdb_BackupableDB
 * Method:    open
 * Signature: (JJ)V
 */
void Java_org_rocksdb_BackupableDB_open(
    JNIEnv* env, jobject jbdb, jlong jdb_handle, jlong jopt_handle) {
  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
  auto opt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jopt_handle);
  // Wrap the already-open DB in a BackupableDB.  NOTE(review): presumably
  // BackupableDB takes ownership of `db` and only copies *opt (the Java
  // BackupableDBOptions keeps its own handle) — confirm against
  // utilities/backupable_db.h.
  auto bdb = new rocksdb::BackupableDB(db, *opt);

  // as BackupableDB extends RocksDB on the java side, we can reuse
  // the RocksDB portal here.
  rocksdb::RocksDBJni::setHandle(env, jbdb, bdb);
}
+
/*
 * Class:     org_rocksdb_BackupableDB
 * Method:    createNewBackup
 * Signature: (JZ)V
 */
void Java_org_rocksdb_BackupableDB_createNewBackup(
    JNIEnv* env, jobject jbdb, jlong jhandle, jboolean jflag) {
  // jflag is forwarded as CreateNewBackup's only argument (presumably
  // "flush before backup" — confirm against BackupableDB's declaration).
  // NOTE(review): the returned Status is discarded; a failed backup is
  // silent on the Java side.
  reinterpret_cast<rocksdb::BackupableDB*>(jhandle)->CreateNewBackup(jflag);
}
+
+///////////////////////////////////////////////////////////////////////////
+// BackupDBOptions
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    newBackupableDBOptions
+ * Signature: (Ljava/lang/String;)V
+ */
+void Java_org_rocksdb_BackupableDBOptions_newBackupableDBOptions(
+    JNIEnv* env, jobject jobj, jstring jpath) {
+  const char* cpath = env->GetStringUTFChars(jpath, 0);
+  auto bopt = new rocksdb::BackupableDBOptions(cpath);
+  env->ReleaseStringUTFChars(jpath, cpath);
+
+  rocksdb::BackupableDBOptionsJni::setHandle(env, jobj, bopt);
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    backupDir
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_BackupableDBOptions_backupDir(
+    JNIEnv* env, jobject jopt, jlong jhandle, jstring jpath) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  return env->NewStringUTF(bopt->backup_dir.c_str());
+}
+
/*
 * Class:     org_rocksdb_BackupableDBOptions
 * Method:    dispose
 * Signature: (J)V
 */
void Java_org_rocksdb_BackupableDBOptions_dispose(
    JNIEnv* env, jobject jopt, jlong jhandle) {
  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
  // A zero handle here would indicate a double dispose from the Java side.
  assert(bopt);
  delete bopt;

  // Clear the Java-side handle so later native calls cannot reach the
  // freed object.
  rocksdb::BackupableDBOptionsJni::setHandle(env, jopt, nullptr);
}
diff --git a/java/rocksjni/filter.cc b/java/rocksjni/filter.cc
new file mode 100644 (file)
index 0000000..7ef9598
--- /dev/null
@@ -0,0 +1,41 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ for
+// rocksdb::FilterPolicy.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+#include <string>
+
+#include "include/org_rocksdb_Filter.h"
+#include "include/org_rocksdb_BloomFilter.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/filter_policy.h"
+
/*
 * Class:     org_rocksdb_BloomFilter
 * Method:    createNewFilter0
 * Signature: (I)V
 */
void Java_org_rocksdb_BloomFilter_createNewFilter0(
    JNIEnv* env, jobject jobj, jint bits_per_key) {
  // Heap-allocate the policy; the Java Filter object holds the pointer and
  // it is released in Java_org_rocksdb_Filter_dispose0.
  const rocksdb::FilterPolicy* fp = rocksdb::NewBloomFilterPolicy(bits_per_key);
  rocksdb::FilterJni::setHandle(env, jobj, fp);
}
+
/*
 * Class:     org_rocksdb_Filter
 * Method:    dispose0
 * Signature: (J)V
 */
void Java_org_rocksdb_Filter_dispose0(
    JNIEnv* env, jobject jobj, jlong handle) {
  auto fp = reinterpret_cast<rocksdb::FilterPolicy*>(handle);
  delete fp;

  // Zero the Java-side handle so a repeated dispose0() deletes nullptr
  // (a no-op) instead of double-freeing the policy.
  rocksdb::FilterJni::setHandle(env, jobj, nullptr);
}
diff --git a/java/rocksjni/iterator.cc b/java/rocksjni/iterator.cc
new file mode 100644 (file)
index 0000000..a7ea97d
--- /dev/null
@@ -0,0 +1,145 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ rocksdb::Iterator methods from Java side.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+
+#include "include/org_rocksdb_Iterator.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/iterator.h"
+
+/*
+ * Class:     org_rocksdb_Iterator
+ * Method:    isValid0
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Iterator_isValid0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  return reinterpret_cast<rocksdb::Iterator*>(handle)->Valid();
+}
+
+/*
+ * Class:     org_rocksdb_Iterator
+ * Method:    seekToFirst0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Iterator_seekToFirst0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::Iterator*>(handle)->SeekToFirst();
+}
+
+/*
+ * Class:     org_rocksdb_Iterator
+ * Method:    seekToLast0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Iterator_seekToLast0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::Iterator*>(handle)->SeekToLast();
+}
+
+/*
+ * Class:     org_rocksdb_Iterator
+ * Method:    next0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Iterator_next0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::Iterator*>(handle)->Next();
+}
+
+/*
+ * Class:     org_rocksdb_Iterator
+ * Method:    prev0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Iterator_prev0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::Iterator*>(handle)->Prev();
+}
+
+/*
+ * Class:     org_rocksdb_Iterator
+ * Method:    key0
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_Iterator_key0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
+  rocksdb::Slice key_slice = it->key();
+
+  jbyteArray jkey = env->NewByteArray(key_slice.size());
+  env->SetByteArrayRegion(
+      jkey, 0, key_slice.size(),
+      reinterpret_cast<const jbyte*>(key_slice.data()));
+  return jkey;
+}
+
+/*
+ * Class:     org_rocksdb_Iterator
+ * Method:    value0
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_Iterator_value0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
+  rocksdb::Slice value_slice = it->value();
+
+  jbyteArray jvalue = env->NewByteArray(value_slice.size());
+  env->SetByteArrayRegion(
+      jvalue, 0, value_slice.size(),
+      reinterpret_cast<const jbyte*>(value_slice.data()));
+  return jvalue;
+}
+
+/*
+ * Class:     org_rocksdb_Iterator
+ * Method:    seek0
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_Iterator_seek0(
+    JNIEnv* env, jobject jobj, jlong handle,
+    jbyteArray jtarget, jint jtarget_len) {
+  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
+  jbyte* target = env->GetByteArrayElements(jtarget, 0);
+  rocksdb::Slice target_slice(
+      reinterpret_cast<char*>(target), jtarget_len);
+
+  it->Seek(target_slice);
+
+  env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_Iterator
+ * Method:    status0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Iterator_status0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
+  rocksdb::Status s = it->status();
+
+  if (s.ok()) {
+    return;
+  }
+
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_Iterator
+ * Method:    dispose
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Iterator_dispose(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
+  delete it;
+}
diff --git a/java/rocksjni/memtablejni.cc b/java/rocksjni/memtablejni.cc
new file mode 100644 (file)
index 0000000..a0d50f5
--- /dev/null
@@ -0,0 +1,58 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ for MemTables.
+
+#include "include/org_rocksdb_HashSkipListMemTableConfig.h"
+#include "include/org_rocksdb_HashLinkedListMemTableConfig.h"
+#include "include/org_rocksdb_VectorMemTableConfig.h"
+#include "include/org_rocksdb_SkipListMemTableConfig.h"
+#include "rocksdb/memtablerep.h"
+
+/*
+ * Class:     org_rocksdb_HashSkipListMemTableConfig
+ * Method:    newMemTableFactoryHandle
+ * Signature: (JII)J
+ */
+jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle(
+    JNIEnv* env, jobject jobj, jlong jbucket_count,
+    jint jheight, jint jbranching_factor) {
+  return reinterpret_cast<jlong>(rocksdb::NewHashSkipListRepFactory(
+      static_cast<size_t>(jbucket_count),
+      static_cast<int32_t>(jheight),
+      static_cast<int32_t>(jbranching_factor)));
+}
+
+/*
+ * Class:     org_rocksdb_HashLinkedListMemTableConfig
+ * Method:    newMemTableFactoryHandle
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle(
+    JNIEnv* env, jobject jobj, jlong jbucket_count) {
+  return reinterpret_cast<jlong>(rocksdb::NewHashLinkListRepFactory(
+       static_cast<size_t>(jbucket_count)));
+}
+
+/*
+ * Class:     org_rocksdb_VectorMemTableConfig
+ * Method:    newMemTableFactoryHandle
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle(
+    JNIEnv* env, jobject jobj, jlong jreserved_size) {
+  return reinterpret_cast<jlong>(new rocksdb::VectorRepFactory(
+      static_cast<size_t>(jreserved_size)));
+}
+
+/*
+ * Class:     org_rocksdb_SkipListMemTableConfig
+ * Method:    newMemTableFactoryHandle0
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_SkipListMemTableConfig_newMemTableFactoryHandle0(
+    JNIEnv* env, jobject jobj) {
+  return reinterpret_cast<jlong>(new rocksdb::SkipListFactory());
+}
diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc
new file mode 100644 (file)
index 0000000..c5849ce
--- /dev/null
@@ -0,0 +1,1807 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ for rocksdb::Options.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+#include <string>
+#include <memory>
+
+#include "include/org_rocksdb_Options.h"
+#include "include/org_rocksdb_WriteOptions.h"
+#include "include/org_rocksdb_ReadOptions.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/table.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/filter_policy.h"
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    newOptions
+ * Signature: ()V
+ */
+void Java_org_rocksdb_Options_newOptions(JNIEnv* env, jobject jobj) {
+  rocksdb::Options* op = new rocksdb::Options();
+  rocksdb::OptionsJni::setHandle(env, jobj, op);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    dispose0
+ * Signature: ()V
+ */
+void Java_org_rocksdb_Options_dispose0(JNIEnv* env, jobject jobj) {
+  rocksdb::Options* op = rocksdb::OptionsJni::getHandle(env, jobj);
+  delete op;
+
+  rocksdb::OptionsJni::setHandle(env, jobj, nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCreateIfMissing
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setCreateIfMissing(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->create_if_missing = flag;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    createIfMissing
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_createIfMissing(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->create_if_missing;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWriteBufferSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWriteBufferSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->write_buffer_size =
+          static_cast<size_t>(jwrite_buffer_size);
+}
+
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    writeBufferSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_writeBufferSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->write_buffer_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxWriteBufferNumber
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxWriteBufferNumber(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jmax_write_buffer_number) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->max_write_buffer_number =
+          jmax_write_buffer_number;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    createStatistics
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Options_createStatistics(
+    JNIEnv* env, jobject jobj, jlong jOptHandle) {
+  reinterpret_cast<rocksdb::Options*>(jOptHandle)->statistics =
+      rocksdb::CreateDBStatistics();
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    statisticsPtr
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_statisticsPtr(
+    JNIEnv* env, jobject jobj, jlong jOptHandle) {
+  auto st = reinterpret_cast<rocksdb::Options*>(jOptHandle)->statistics.get();
+  return reinterpret_cast<jlong>(st);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setFilterHandle
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setFilterHandle(
+    JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jfilter_handle) {
+  reinterpret_cast<rocksdb::Options*>(jopt_handle)->filter_policy =
+      reinterpret_cast<rocksdb::FilterPolicy*>(jfilter_handle);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxWriteBufferNumber
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxWriteBufferNumber(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_write_buffer_number;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBlockSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setBlockSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jblock_size) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->block_size =
+          static_cast<size_t>(jblock_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    blockSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_blockSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->block_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDisableSeekCompaction
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setDisableSeekCompaction(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jdisable_seek_compaction) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->disable_seek_compaction =
+         jdisable_seek_compaction;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    disableSeekCompaction
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_disableSeekCompaction(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->disable_seek_compaction;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    errorIfExists
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_errorIfExists(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->error_if_exists;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setErrorIfExists
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setErrorIfExists(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean error_if_exists) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->error_if_exists =
+      static_cast<bool>(error_if_exists);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    paranoidChecks
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_paranoidChecks(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->paranoid_checks;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setParanoidChecks
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setParanoidChecks(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean paranoid_checks) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->paranoid_checks =
+      static_cast<bool>(paranoid_checks);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxOpenFiles
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxOpenFiles(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_open_files;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxOpenFiles
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxOpenFiles(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint max_open_files) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->max_open_files =
+      static_cast<int>(max_open_files);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    disableDataSync
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_disableDataSync(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->disableDataSync;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDisableDataSync
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setDisableDataSync(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean disableDataSync) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->disableDataSync =
+      static_cast<bool>(disableDataSync);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    useFsync
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_useFsync(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->use_fsync;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setUseFsync
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setUseFsync(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_fsync) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->use_fsync =
+      static_cast<bool>(use_fsync);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    dbStatsLogInterval
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_dbStatsLogInterval(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->db_stats_log_interval;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDbStatsLogInterval
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setDbStatsLogInterval(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint db_stats_log_interval) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->db_stats_log_interval =
+      static_cast<int>(db_stats_log_interval);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    dbLogDir
+ * Signature: (J)Ljava/lang/String
+ */
+jstring Java_org_rocksdb_Options_dbLogDir(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return env->NewStringUTF(
+      reinterpret_cast<rocksdb::Options*>(jhandle)->db_log_dir.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDbLogDir
+ * Signature: (JLjava/lang/String)V
+ */
+void Java_org_rocksdb_Options_setDbLogDir(
+    JNIEnv* env, jobject jobj, jlong jhandle, jstring jdb_log_dir) {
+  const char* log_dir = env->GetStringUTFChars(jdb_log_dir, 0);
+  reinterpret_cast<rocksdb::Options*>(jhandle)->db_log_dir.assign(log_dir);
+  env->ReleaseStringUTFChars(jdb_log_dir, log_dir);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    walDir
+ * Signature: (J)Ljava/lang/String
+ */
+jstring Java_org_rocksdb_Options_walDir(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return env->NewStringUTF(
+      reinterpret_cast<rocksdb::Options*>(jhandle)->wal_dir.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWalDir
+ * Signature: (JLjava/lang/String)V
+ */
+void Java_org_rocksdb_Options_setWalDir(
+    JNIEnv* env, jobject jobj, jlong jhandle, jstring jwal_dir) {
+  const char* wal_dir = env->GetStringUTFChars(jwal_dir, 0);
+  reinterpret_cast<rocksdb::Options*>(jhandle)->wal_dir.assign(wal_dir);
+  env->ReleaseStringUTFChars(jwal_dir, wal_dir);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    deleteObsoleteFilesPeriodMicros
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_deleteObsoleteFilesPeriodMicros(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)
+      ->delete_obsolete_files_period_micros;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDeleteObsoleteFilesPeriodMicros
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setDeleteObsoleteFilesPeriodMicros(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong micros) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)
+      ->delete_obsolete_files_period_micros =
+          static_cast<int64_t>(micros);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBackgroundCompactions
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxBackgroundCompactions(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_background_compactions;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxBackgroundCompactions
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxBackgroundCompactions(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint max) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)
+      ->max_background_compactions = static_cast<int>(max);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBackgroundFlushes
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxBackgroundFlushes(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_background_flushes;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxBackgroundFlushes
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxBackgroundFlushes(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint max_background_flushes) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->max_background_flushes =
+      static_cast<int>(max_background_flushes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxLogFileSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxLogFileSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_log_file_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxLogFileSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxLogFileSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->max_log_file_size =
+      static_cast<size_t>(max_log_file_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    logFileTimeToRoll
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_logFileTimeToRoll(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->log_file_time_to_roll;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLogFileTimeToRoll
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setLogFileTimeToRoll(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong log_file_time_to_roll) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->log_file_time_to_roll =
+      static_cast<size_t>(log_file_time_to_roll);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    keepLogFileNum
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_keepLogFileNum(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->keep_log_file_num;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setKeepLogFileNum
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setKeepLogFileNum(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->keep_log_file_num =
+      static_cast<size_t>(keep_log_file_num);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxManifestFileSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxManifestFileSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_manifest_file_size;
+}
+
+/*
+ * Method:    memTableFactoryName
+ * Signature: (J)Ljava/lang/String
+ */
+jstring Java_org_rocksdb_Options_memTableFactoryName(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto opt = reinterpret_cast<rocksdb::Options*>(jhandle);
+  rocksdb::MemTableRepFactory* tf = opt->memtable_factory.get();
+
+  // Should never be nullptr.
+  // Default memtable factory is SkipListFactory
+  assert(tf);
+
+  // temporarly fix for the historical typo
+  if (strcmp(tf->Name(), "HashLinkListRepFactory") == 0) {
+    return env->NewStringUTF("HashLinkedListRepFactory");
+  }
+
+  return env->NewStringUTF(tf->Name());
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxManifestFileSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxManifestFileSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong max_manifest_file_size) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->max_manifest_file_size =
+      static_cast<int64_t>(max_manifest_file_size);
+}
+
+/*
+ * Method:    setMemTableFactory
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMemTableFactory(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->memtable_factory.reset(
+      reinterpret_cast<rocksdb::MemTableRepFactory*>(jfactory_handle));
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    tableCacheNumshardbits
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_tableCacheNumshardbits(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->table_cache_numshardbits;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTableCacheNumshardbits
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setTableCacheNumshardbits(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint table_cache_numshardbits) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->table_cache_numshardbits =
+      static_cast<int>(table_cache_numshardbits);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    tableCacheRemoveScanCountLimit
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_tableCacheRemoveScanCountLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->table_cache_remove_scan_count_limit;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTableCacheRemoveScanCountLimit
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setTableCacheRemoveScanCountLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint limit) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->table_cache_remove_scan_count_limit = static_cast<int>(limit);
+}
+
+/*
+ * Method:    useFixedLengthPrefixExtractor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->prefix_extractor.reset(
+      rocksdb::NewFixedPrefixTransform(static_cast<size_t>(jprefix_length)));
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    walTtlSeconds
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_walTtlSeconds(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->WAL_ttl_seconds;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWalTtlSeconds
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWalTtlSeconds(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_ttl_seconds) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->WAL_ttl_seconds =
+      static_cast<int64_t>(WAL_ttl_seconds);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    walSizeLimitMB
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_walSizeLimitMB(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->WAL_size_limit_MB;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWalSizeLimitMB
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWalSizeLimitMB(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_size_limit_MB) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->WAL_size_limit_MB =
+      static_cast<int64_t>(WAL_size_limit_MB);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    manifestPreallocationSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_manifestPreallocationSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)
+      ->manifest_preallocation_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setManifestPreallocationSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setManifestPreallocationSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->manifest_preallocation_size =
+      static_cast<size_t>(preallocation_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    allowOsBuffer
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_allowOsBuffer(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->allow_os_buffer;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAllowOsBuffer
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAllowOsBuffer(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_os_buffer) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->allow_os_buffer =
+      static_cast<bool>(allow_os_buffer);
+}
+
+/*
+ * Method:    setTableFactory
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setTableFactory(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->table_factory.reset(
+      reinterpret_cast<rocksdb::TableFactory*>(jfactory_handle));
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    allowMmapReads
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_allowMmapReads(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->allow_mmap_reads;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAllowMmapReads
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAllowMmapReads(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_reads) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->allow_mmap_reads =
+      static_cast<bool>(allow_mmap_reads);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    allowMmapWrites
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_allowMmapWrites(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->allow_mmap_writes;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAllowMmapWrites
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAllowMmapWrites(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_writes) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->allow_mmap_writes =
+      static_cast<bool>(allow_mmap_writes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    isFdCloseOnExec
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_isFdCloseOnExec(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->is_fd_close_on_exec;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setIsFdCloseOnExec
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setIsFdCloseOnExec(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean is_fd_close_on_exec) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->is_fd_close_on_exec =
+      static_cast<bool>(is_fd_close_on_exec);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    skipLogErrorOnRecovery
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_skipLogErrorOnRecovery(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)
+      ->skip_log_error_on_recovery;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setSkipLogErrorOnRecovery
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setSkipLogErrorOnRecovery(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean skip) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->skip_log_error_on_recovery =
+      static_cast<bool>(skip);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    statsDumpPeriodSec
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_statsDumpPeriodSec(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->stats_dump_period_sec;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setStatsDumpPeriodSec
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setStatsDumpPeriodSec(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint stats_dump_period_sec) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->stats_dump_period_sec =
+      static_cast<int>(stats_dump_period_sec);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    adviseRandomOnOpen
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_adviseRandomOnOpen(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->advise_random_on_open;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAdviseRandomOnOpen
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAdviseRandomOnOpen(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean advise_random_on_open) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->advise_random_on_open =
+      static_cast<bool>(advise_random_on_open);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    useAdaptiveMutex
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_useAdaptiveMutex(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->use_adaptive_mutex;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setUseAdaptiveMutex
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setUseAdaptiveMutex(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_adaptive_mutex) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->use_adaptive_mutex =
+      static_cast<bool>(use_adaptive_mutex);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    bytesPerSync
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_bytesPerSync(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->bytes_per_sync;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBytesPerSync
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setBytesPerSync(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong bytes_per_sync) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->bytes_per_sync =
+      static_cast<int64_t>(bytes_per_sync);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    allowThreadLocal
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_allowThreadLocal(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->allow_thread_local;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAllowThreadLocal
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAllowThreadLocal(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_thread_local) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->allow_thread_local =
+      static_cast<bool>(allow_thread_local);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    tableFactoryName
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_Options_tableFactoryName(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto opt = reinterpret_cast<rocksdb::Options*>(jhandle);
+  rocksdb::TableFactory* tf = opt->table_factory.get();
+
+  // Should never be nullptr: rocksdb::Options installs a table factory by
+  // default.  NOTE(review): the original comment mentioned SkipListFactory,
+  // but that is the default *memtable* factory; this reads table_factory.
+  assert(tf);
+
+  return env->NewStringUTF(tf->Name());
+}
+
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    minWriteBufferNumberToMerge
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_minWriteBufferNumberToMerge(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->min_write_buffer_number_to_merge;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMinWriteBufferNumberToMerge
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMinWriteBufferNumberToMerge(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmin_write_buffer_number_to_merge) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->min_write_buffer_number_to_merge =
+          static_cast<int>(jmin_write_buffer_number_to_merge);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    blockRestartInterval
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_blockRestartInterval(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->block_restart_interval;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBlockRestartInterval
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setBlockRestartInterval(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jblock_restart_interval) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->block_restart_interval =
+      static_cast<int>(jblock_restart_interval);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    wholeKeyFiltering
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_wholeKeyFiltering(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->whole_key_filtering;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWholeKeyFiltering
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setWholeKeyFiltering(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jwhole_key_filtering) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->whole_key_filtering =
+      static_cast<bool>(jwhole_key_filtering);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    numLevels
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_numLevels(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->num_levels;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setNumLevels
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setNumLevels(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jnum_levels) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->num_levels =
+      static_cast<int>(jnum_levels);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    levelZeroFileNumCompactionTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_levelZeroFileNumCompactionTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level0_file_num_compaction_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevelZeroFileNumCompactionTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setLevelZeroFileNumCompactionTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jlevel0_file_num_compaction_trigger) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level0_file_num_compaction_trigger =
+          static_cast<int>(jlevel0_file_num_compaction_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    levelZeroSlowdownWritesTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_levelZeroSlowdownWritesTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level0_slowdown_writes_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevelSlowdownWritesTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setLevelZeroSlowdownWritesTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jlevel0_slowdown_writes_trigger) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level0_slowdown_writes_trigger =
+          static_cast<int>(jlevel0_slowdown_writes_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    levelZeroStopWritesTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_levelZeroStopWritesTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level0_stop_writes_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevelStopWritesTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setLevelZeroStopWritesTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jlevel0_stop_writes_trigger) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->level0_stop_writes_trigger =
+      static_cast<int>(jlevel0_stop_writes_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxMemCompactionLevel
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxMemCompactionLevel(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_mem_compaction_level;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxMemCompactionLevel
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxMemCompactionLevel(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmax_mem_compaction_level) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->max_mem_compaction_level =
+      static_cast<int>(jmax_mem_compaction_level);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    targetFileSizeBase
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_targetFileSizeBase(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->target_file_size_base;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTargetFileSizeBase
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setTargetFileSizeBase(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jtarget_file_size_base) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->target_file_size_base =
+      static_cast<int>(jtarget_file_size_base);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    targetFileSizeMultiplier
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_targetFileSizeMultiplier(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->target_file_size_multiplier;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTargetFileSizeMultiplier
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setTargetFileSizeMultiplier(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jtarget_file_size_multiplier) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->target_file_size_multiplier =
+          static_cast<int>(jtarget_file_size_multiplier);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBytesForLevelBase
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxBytesForLevelBase(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_bytes_for_level_base;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxBytesForLevelBase
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxBytesForLevelBase(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jmax_bytes_for_level_base) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_bytes_for_level_base =
+          static_cast<int64_t>(jmax_bytes_for_level_base);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBytesForLevelMultiplier
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxBytesForLevelMultiplier(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_bytes_for_level_multiplier;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxBytesForLevelMultiplier
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplier(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmax_bytes_for_level_multiplier) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_bytes_for_level_multiplier =
+          static_cast<int>(jmax_bytes_for_level_multiplier);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    expandedCompactionFactor
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_expandedCompactionFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->expanded_compaction_factor;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setExpandedCompactionFactor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setExpandedCompactionFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jexpanded_compaction_factor) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->expanded_compaction_factor =
+          static_cast<int>(jexpanded_compaction_factor);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    sourceCompactionFactor
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_sourceCompactionFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->source_compaction_factor;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setSourceCompactionFactor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setSourceCompactionFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+        jint jsource_compaction_factor) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->source_compaction_factor =
+          static_cast<int>(jsource_compaction_factor);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxGrandparentOverlapFactor
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxGrandparentOverlapFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_grandparent_overlap_factor;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxGrandparentOverlapFactor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxGrandparentOverlapFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmax_grandparent_overlap_factor) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_grandparent_overlap_factor =
+          static_cast<int>(jmax_grandparent_overlap_factor);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    softRateLimit
+ * Signature: (J)D
+ */
+jdouble Java_org_rocksdb_Options_softRateLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->soft_rate_limit;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setSoftRateLimit
+ * Signature: (JD)V
+ */
+void Java_org_rocksdb_Options_setSoftRateLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle, jdouble jsoft_rate_limit) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->soft_rate_limit =
+      static_cast<double>(jsoft_rate_limit);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    hardRateLimit
+ * Signature: (J)D
+ */
+jdouble Java_org_rocksdb_Options_hardRateLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->hard_rate_limit;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setHardRateLimit
+ * Signature: (JD)V
+ */
+void Java_org_rocksdb_Options_setHardRateLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle, jdouble jhard_rate_limit) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->hard_rate_limit =
+      static_cast<double>(jhard_rate_limit);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    rateLimitDelayMaxMilliseconds
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_rateLimitDelayMaxMilliseconds(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->rate_limit_delay_max_milliseconds;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setRateLimitDelayMaxMilliseconds
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setRateLimitDelayMaxMilliseconds(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jrate_limit_delay_max_milliseconds) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->rate_limit_delay_max_milliseconds =
+          static_cast<int>(jrate_limit_delay_max_milliseconds);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    noBlockCache
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_noBlockCache(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->no_block_cache;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setNoBlockCache
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setNoBlockCache(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jno_block_cache) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->no_block_cache =
+      static_cast<bool>(jno_block_cache);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    arenaBlockSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_arenaBlockSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->arena_block_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setArenaBlockSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setArenaBlockSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->arena_block_size =
+      static_cast<size_t>(jarena_block_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    disableAutoCompactions
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_disableAutoCompactions(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->disable_auto_compactions;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDisableAutoCompactions
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setDisableAutoCompactions(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jdisable_auto_compactions) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->disable_auto_compactions =
+          static_cast<bool>(jdisable_auto_compactions);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    purgeRedundantKvsWhileFlush
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_purgeRedundantKvsWhileFlush(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->purge_redundant_kvs_while_flush;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setPurgeRedundantKvsWhileFlush
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setPurgeRedundantKvsWhileFlush(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jpurge_redundant_kvs_while_flush) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->purge_redundant_kvs_while_flush =
+          static_cast<bool>(jpurge_redundant_kvs_while_flush);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    blockSizeDeviation
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_blockSizeDeviation(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->block_size_deviation;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBlockSizeDeviation
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setBlockSizeDeviation(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jblock_size_deviation) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->block_size_deviation =
+      static_cast<int>(jblock_size_deviation);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    verifyChecksumsInCompaction
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_verifyChecksumsInCompaction(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->verify_checksums_in_compaction;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setVerifyChecksumsInCompaction
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setVerifyChecksumsInCompaction(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jverify_checksums_in_compaction) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->verify_checksums_in_compaction =
+          static_cast<bool>(jverify_checksums_in_compaction);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    filterDeletes
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_filterDeletes(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->filter_deletes;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setFilterDeletes
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setFilterDeletes(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfilter_deletes) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->filter_deletes =
+      static_cast<bool>(jfilter_deletes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxSequentialSkipInIterations
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxSequentialSkipInIterations(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_sequential_skip_in_iterations;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxSequentialSkipInIterations
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxSequentialSkipInIterations(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jmax_sequential_skip_in_iterations) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_sequential_skip_in_iterations =
+          static_cast<int64_t>(jmax_sequential_skip_in_iterations);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    inplaceUpdateSupport
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_inplaceUpdateSupport(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->inplace_update_support;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setInplaceUpdateSupport
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setInplaceUpdateSupport(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jinplace_update_support) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->inplace_update_support =
+          static_cast<bool>(jinplace_update_support);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    inplaceUpdateNumLocks
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_inplaceUpdateNumLocks(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->inplace_update_num_locks;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setInplaceUpdateNumLocks
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setInplaceUpdateNumLocks(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jinplace_update_num_locks) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->inplace_update_num_locks =
+          static_cast<size_t>(jinplace_update_num_locks);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    memtablePrefixBloomBits
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_memtablePrefixBloomBits(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->memtable_prefix_bloom_bits;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMemtablePrefixBloomBits
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMemtablePrefixBloomBits(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmemtable_prefix_bloom_bits) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->memtable_prefix_bloom_bits =
+          static_cast<int32_t>(jmemtable_prefix_bloom_bits);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    memtablePrefixBloomProbes
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_memtablePrefixBloomProbes(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->memtable_prefix_bloom_probes;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMemtablePrefixBloomProbes
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMemtablePrefixBloomProbes(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmemtable_prefix_bloom_probes) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->memtable_prefix_bloom_probes =
+          static_cast<int32_t>(jmemtable_prefix_bloom_probes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    bloomLocality
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_bloomLocality(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->bloom_locality;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBloomLocality
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setBloomLocality(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jbloom_locality) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->bloom_locality =
+      static_cast<int32_t>(jbloom_locality);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxSuccessiveMerges
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxSuccessiveMerges(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_successive_merges;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxSuccessiveMerges
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxSuccessiveMerges(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jmax_successive_merges) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->max_successive_merges =
+      static_cast<size_t>(jmax_successive_merges);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    minPartialMergeOperands
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_minPartialMergeOperands(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->min_partial_merge_operands;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMinPartialMergeOperands
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMinPartialMergeOperands(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmin_partial_merge_operands) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->min_partial_merge_operands =
+          static_cast<int32_t>(jmin_partial_merge_operands);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// WriteOptions
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    newWriteOptions
+ * Signature: ()V
+ */
+void Java_org_rocksdb_WriteOptions_newWriteOptions(
+    JNIEnv* env, jobject jwrite_options) {
+  rocksdb::WriteOptions* op = new rocksdb::WriteOptions();
+  rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, op);
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    dispose0
+ * Signature: ()V
+ */
+void Java_org_rocksdb_WriteOptions_dispose0(
+    JNIEnv* env, jobject jwrite_options, jlong jhandle) {
+  auto write_options = reinterpret_cast<rocksdb::WriteOptions*>(jhandle);
+  delete write_options;
+
+  rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    setSync
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_WriteOptions_setSync(
+  JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) {
+  reinterpret_cast<rocksdb::WriteOptions*>(jhandle)->sync = jflag;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    sync
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_WriteOptions_sync(
+    JNIEnv* env, jobject jwrite_options, jlong jhandle) {
+  return reinterpret_cast<rocksdb::WriteOptions*>(jhandle)->sync;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    setDisableWAL
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_WriteOptions_setDisableWAL(
+    JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) {
+  reinterpret_cast<rocksdb::WriteOptions*>(jhandle)->disableWAL = jflag;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    disableWAL
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_WriteOptions_disableWAL(
+    JNIEnv* env, jobject jwrite_options, jlong jhandle) {
+  return reinterpret_cast<rocksdb::WriteOptions*>(jhandle)->disableWAL;
+}
+
+/////////////////////////////////////////////////////////////////////
+// rocksdb::ReadOptions
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    newReadOptions
+ * Signature: ()V
+ */
+void Java_org_rocksdb_ReadOptions_newReadOptions(
+    JNIEnv* env, jobject jobj) {
+  auto read_opt = new rocksdb::ReadOptions();
+  rocksdb::ReadOptionsJni::setHandle(env, jobj, read_opt);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    dispose
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_ReadOptions_dispose(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  delete reinterpret_cast<rocksdb::ReadOptions*>(jhandle);
+  rocksdb::ReadOptionsJni::setHandle(env, jobj, nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    verifyChecksums
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_verifyChecksums(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ReadOptions*>(
+      jhandle)->verify_checksums;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setVerifyChecksums
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setVerifyChecksums(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jverify_checksums) {
+  reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->verify_checksums =
+      static_cast<bool>(jverify_checksums);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    fillCache
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_fillCache(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->fill_cache;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setFillCache
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setFillCache(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfill_cache) {
+  reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->fill_cache =
+      static_cast<bool>(jfill_cache);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    tailing
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_tailing(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->tailing;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setTailing
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setTailing(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jtailing) {
+  reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->tailing =
+      static_cast<bool>(jtailing);
+}
diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h
new file mode 100644 (file)
index 0000000..7d70eec
--- /dev/null
@@ -0,0 +1,383 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+// This file is designed for caching those frequently used IDs and provide
+// efficient portal (i.e, a set of static functions) to access java code
+// from c++.
+
+#ifndef JAVA_ROCKSJNI_PORTAL_H_
+#define JAVA_ROCKSJNI_PORTAL_H_
+
+#include <jni.h>
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "utilities/backupable_db.h"
+
+namespace rocksdb {
+
+// The portal class for org.rocksdb.RocksDB
+class RocksDBJni {
+ public:
+  // Get the java class id of org.rocksdb.RocksDB.
+  static jclass getJClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("org/rocksdb/RocksDB");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the field id of the member variable of org.rocksdb.RocksDB
+  // that stores the pointer to rocksdb::DB.
+  static jfieldID getHandleFieldID(JNIEnv* env) {
+    static jfieldID fid = env->GetFieldID(
+        getJClass(env), "nativeHandle_", "J");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  // Get the pointer to rocksdb::DB of the specified org.rocksdb.RocksDB.
+  static rocksdb::DB* getHandle(JNIEnv* env, jobject jdb) {
+    return reinterpret_cast<rocksdb::DB*>(
+        env->GetLongField(jdb, getHandleFieldID(env)));
+  }
+
+  // Pass the rocksdb::DB pointer to the java side.
+  static void setHandle(JNIEnv* env, jobject jdb, rocksdb::DB* db) {
+    env->SetLongField(
+        jdb, getHandleFieldID(env),
+        reinterpret_cast<jlong>(db));
+  }
+};
+
+// The portal class for org.rocksdb.RocksDBException
+class RocksDBExceptionJni {
+ public:
+  // Get the jclass of org.rocksdb.RocksDBException
+  static jclass getJClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("org/rocksdb/RocksDBException");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Create and throw a java exception by converting the input
+  // Status to an RocksDBException.
+  //
+  // In case s.ok() is true, then this function will not throw any
+  // exception.
+  static void ThrowNew(JNIEnv* env, Status s) {
+    if (s.ok()) {
+      return;
+    }
+    jstring msg = env->NewStringUTF(s.ToString().c_str());
+    // get the constructor id of org.rocksdb.RocksDBException
+    static jmethodID mid = env->GetMethodID(
+        getJClass(env), "<init>", "(Ljava/lang/String;)V");
+    assert(mid != nullptr);
+
+    env->Throw((jthrowable)env->NewObject(getJClass(env), mid, msg));
+  }
+};
+
+class OptionsJni {
+ public:
+  // Get the java class id of org.rocksdb.Options.
+  static jclass getJClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("org/rocksdb/Options");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the field id of the member variable of org.rocksdb.Options
+  // that stores the pointer to rocksdb::Options
+  static jfieldID getHandleFieldID(JNIEnv* env) {
+    static jfieldID fid = env->GetFieldID(
+        getJClass(env), "nativeHandle_", "J");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  // Get the pointer to rocksdb::Options
+  static rocksdb::Options* getHandle(JNIEnv* env, jobject jobj) {
+    return reinterpret_cast<rocksdb::Options*>(
+        env->GetLongField(jobj, getHandleFieldID(env)));
+  }
+
+  // Pass the rocksdb::Options pointer to the java side.
+  static void setHandle(JNIEnv* env, jobject jobj, rocksdb::Options* op) {
+    env->SetLongField(
+        jobj, getHandleFieldID(env),
+        reinterpret_cast<jlong>(op));
+  }
+};
+
+class WriteOptionsJni {
+ public:
+  // Get the java class id of org.rocksdb.WriteOptions.
+  static jclass getJClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("org/rocksdb/WriteOptions");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the field id of the member variable of org.rocksdb.WriteOptions
+  // that stores the pointer to rocksdb::WriteOptions
+  static jfieldID getHandleFieldID(JNIEnv* env) {
+    static jfieldID fid = env->GetFieldID(
+        getJClass(env), "nativeHandle_", "J");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  // Get the pointer to rocksdb::WriteOptions
+  static rocksdb::WriteOptions* getHandle(JNIEnv* env, jobject jobj) {
+    return reinterpret_cast<rocksdb::WriteOptions*>(
+        env->GetLongField(jobj, getHandleFieldID(env)));
+  }
+
+  // Pass the rocksdb::WriteOptions pointer to the java side.
+  static void setHandle(JNIEnv* env, jobject jobj, rocksdb::WriteOptions* op) {
+    env->SetLongField(
+        jobj, getHandleFieldID(env),
+        reinterpret_cast<jlong>(op));
+  }
+};
+
+
+class ReadOptionsJni {
+ public:
+  // Get the java class id of org.rocksdb.ReadOptions.
+  static jclass getJClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("org/rocksdb/ReadOptions");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the field id of the member variable of org.rocksdb.ReadOptions
+  // that stores the pointer to rocksdb::ReadOptions
+  static jfieldID getHandleFieldID(JNIEnv* env) {
+    static jfieldID fid = env->GetFieldID(
+        getJClass(env), "nativeHandle_", "J");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  // Get the pointer to rocksdb::ReadOptions
+  static rocksdb::ReadOptions* getHandle(JNIEnv* env, jobject jobj) {
+    return reinterpret_cast<rocksdb::ReadOptions*>(
+        env->GetLongField(jobj, getHandleFieldID(env)));
+  }
+
+  // Pass the rocksdb::ReadOptions pointer to the java side.
+  static void setHandle(JNIEnv* env, jobject jobj,
+                        rocksdb::ReadOptions* op) {
+    env->SetLongField(
+        jobj, getHandleFieldID(env),
+        reinterpret_cast<jlong>(op));
+  }
+};
+
+
+class WriteBatchJni {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("org/rocksdb/WriteBatch");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  static jfieldID getHandleFieldID(JNIEnv* env) {
+    static jfieldID fid = env->GetFieldID(
+        getJClass(env), "nativeHandle_", "J");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  // Get the pointer to rocksdb::WriteBatch of the specified
+  // org.rocksdb.WriteBatch.
+  static rocksdb::WriteBatch* getHandle(JNIEnv* env, jobject jwb) {
+    return reinterpret_cast<rocksdb::WriteBatch*>(
+        env->GetLongField(jwb, getHandleFieldID(env)));
+  }
+
+  // Pass the rocksdb::WriteBatch pointer to the java side.
+  static void setHandle(JNIEnv* env, jobject jwb, rocksdb::WriteBatch* wb) {
+    env->SetLongField(
+        jwb, getHandleFieldID(env),
+        reinterpret_cast<jlong>(wb));
+  }
+};
+
+// The portal class for org.rocksdb.HistogramData
+class HistogramDataJni {
+ public:
+  // Method id of the HistogramData(double, double, double, double, double)
+  // constructor.  jmethodID values stay valid for the lifetime of the class,
+  // so the first lookup may safely be cached in a function-local static.
+  static jmethodID getConstructorMethodId(JNIEnv* env, jclass jclazz) {
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "<init>", "(DDDDD)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+class BackupableDBOptionsJni {
+ public:
+  // Get the java class id of org.rocksdb.BackupableDBOptions.
+  static jclass getJClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("org/rocksdb/BackupableDBOptions");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the field id of the member variable of org.rocksdb.BackupableDBOptions
+  // that stores the pointer to rocksdb::BackupableDBOptions
+  static jfieldID getHandleFieldID(JNIEnv* env) {
+    static jfieldID fid = env->GetFieldID(
+        getJClass(env), "nativeHandle_", "J");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  // Get the pointer to rocksdb::BackupableDBOptions
+  static rocksdb::BackupableDBOptions* getHandle(JNIEnv* env, jobject jobj) {
+    return reinterpret_cast<rocksdb::BackupableDBOptions*>(
+        env->GetLongField(jobj, getHandleFieldID(env)));
+  }
+
+  // Pass the rocksdb::BackupableDBOptions pointer to the java side.
+  static void setHandle(
+      JNIEnv* env, jobject jobj, rocksdb::BackupableDBOptions* op) {
+    env->SetLongField(
+        jobj, getHandleFieldID(env),
+        reinterpret_cast<jlong>(op));
+  }
+};
+
+class IteratorJni {
+ public:
+  // Get the java class id of org.rocksdb.Iteartor.
+  static jclass getJClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("org/rocksdb/Iterator");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the field id of the member variable of org.rocksdb.Iterator
+  // that stores the pointer to rocksdb::Iterator.
+  static jfieldID getHandleFieldID(JNIEnv* env) {
+    static jfieldID fid = env->GetFieldID(
+        getJClass(env), "nativeHandle_", "J");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  // Get the pointer to rocksdb::Iterator.
+  static rocksdb::Iterator* getHandle(JNIEnv* env, jobject jobj) {
+    return reinterpret_cast<rocksdb::Iterator*>(
+        env->GetLongField(jobj, getHandleFieldID(env)));
+  }
+
+  // Pass the rocksdb::Iterator pointer to the java side.
+  static void setHandle(
+      JNIEnv* env, jobject jobj, rocksdb::Iterator* op) {
+    env->SetLongField(
+        jobj, getHandleFieldID(env),
+        reinterpret_cast<jlong>(op));
+  }
+};
+
+class FilterJni {
+ public:
+  // Get the java class id of org.rocksdb.FilterPolicy.
+  static jclass getJClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("org/rocksdb/Filter");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the field id of the member variable of org.rocksdb.Filter
+  // that stores the pointer to rocksdb::FilterPolicy.
+  static jfieldID getHandleFieldID(JNIEnv* env) {
+    static jfieldID fid = env->GetFieldID(
+        getJClass(env), "nativeHandle_", "J");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  // Get the pointer to rocksdb::FilterPolicy.
+  static rocksdb::FilterPolicy* getHandle(JNIEnv* env, jobject jobj) {
+    return reinterpret_cast<rocksdb::FilterPolicy*>(
+        env->GetLongField(jobj, getHandleFieldID(env)));
+  }
+
+  // Pass the rocksdb::FilterPolicy pointer to the java side.
+  static void setHandle(
+      JNIEnv* env, jobject jobj, const rocksdb::FilterPolicy* op) {
+    env->SetLongField(
+        jobj, getHandleFieldID(env),
+        reinterpret_cast<jlong>(op));
+  }
+};
+
+class ListJni {
+ public:
+  // Get the java class id of java.util.List.
+  static jclass getListClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("java/util/List");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the java class id of java.util.ArrayList.
+  static jclass getArrayListClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("java/util/ArrayList");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the java class id of java.util.Iterator.
+  static jclass getIteratorClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("java/util/Iterator");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the java method id of java.util.List.iterator().
+  static jmethodID getIteratorMethod(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getListClass(env), "iterator", "()Ljava/util/Iterator;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method id of java.util.Iterator.hasNext().
+  static jmethodID getHasNextMethod(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getIteratorClass(env), "hasNext", "()Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method id of java.util.Iterator.next().
+  static jmethodID getNextMethod(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getIteratorClass(env), "next", "()Ljava/lang/Object;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method id of arrayList constructor.
+  static jmethodID getArrayListConstructorMethodId(JNIEnv* env, jclass jclazz) {
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "<init>", "(I)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method id of java.util.List.add().
+  static jmethodID getListAddMethodId(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getListClass(env), "add", "(Ljava/lang/Object;)Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+}  // namespace rocksdb
+#endif  // JAVA_ROCKSJNI_PORTAL_H_
diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc
new file mode 100644 (file)
index 0000000..4595f3f
--- /dev/null
@@ -0,0 +1,438 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ rocksdb::DB methods from Java side.
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "include/org_rocksdb_RocksDB.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/cache.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::Open
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    open
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_RocksDB_open(
+    JNIEnv* env, jobject jdb, jlong jopt_handle,
+    jlong jcache_size, jstring jdb_path) {
+  auto opt = reinterpret_cast<rocksdb::Options*>(jopt_handle);
+  if (jcache_size > 0) {
+    opt->no_block_cache = false;
+    opt->block_cache = rocksdb::NewLRUCache(jcache_size);
+  } else {
+    opt->no_block_cache = true;
+    opt->block_cache = nullptr;
+  }
+
+  rocksdb::DB* db = nullptr;
+  const char* db_path = env->GetStringUTFChars(jdb_path, 0);
+  rocksdb::Status s = rocksdb::DB::Open(*opt, db_path, &db);
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+
+  if (s.ok()) {
+    rocksdb::RocksDBJni::setHandle(env, jdb, db);
+    return;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::Put
+
+void rocksdb_put_helper(
+    JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jvalue, jint jvalue_len) {
+
+  jbyte* key = env->GetByteArrayElements(jkey, 0);
+  jbyte* value = env->GetByteArrayElements(jvalue, 0);
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+  rocksdb::Slice value_slice(reinterpret_cast<char*>(value), jvalue_len);
+
+  rocksdb::Status s = db->Put(write_options, key_slice, value_slice);
+
+  // trigger java unref on key and value.
+  // by passing JNI_ABORT, it will simply release the reference without
+  // copying the result back to the java byte array.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+  env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT);
+
+  if (s.ok()) {
+    return;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    put
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_RocksDB_put__J_3BI_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jvalue, jint jvalue_len) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  static const rocksdb::WriteOptions default_write_options =
+      rocksdb::WriteOptions();
+
+  rocksdb_put_helper(env, db, default_write_options,
+                     jkey, jkey_len,
+                     jvalue, jvalue_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    put
+ * Signature: (JJ[BI[BI)V
+ */
+void Java_org_rocksdb_RocksDB_put__JJ_3BI_3BI(
+    JNIEnv* env, jobject jdb,
+    jlong jdb_handle, jlong jwrite_options_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jvalue, jint jvalue_len) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto write_options = reinterpret_cast<rocksdb::WriteOptions*>(
+      jwrite_options_handle);
+
+  rocksdb_put_helper(env, db, *write_options,
+                     jkey, jkey_len,
+                     jvalue, jvalue_len);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::Write
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    write
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_RocksDB_write(
+    JNIEnv* env, jobject jdb,
+    jlong jwrite_options_handle, jlong jbatch_handle) {
+  // This entry point does not receive the DB handle as an argument, so it
+  // is recovered from the Java object's nativeHandle_ field.
+  rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb);
+  auto write_options =
+      reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle);
+  auto batch = reinterpret_cast<rocksdb::WriteBatch*>(jbatch_handle);
+
+  rocksdb::Status s = db->Write(*write_options, batch);
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::Get
+
+jbyteArray rocksdb_get_helper(
+    JNIEnv* env, rocksdb::DB* db, const rocksdb::ReadOptions& read_opt,
+    jbyteArray jkey, jint jkey_len) {
+  jboolean isCopy;
+  jbyte* key = env->GetByteArrayElements(jkey, &isCopy);
+  rocksdb::Slice key_slice(
+      reinterpret_cast<char*>(key), jkey_len);
+
+  std::string value;
+  rocksdb::Status s = db->Get(
+      read_opt, key_slice, &value);
+
+  // trigger java unref on key.
+  // by passing JNI_ABORT, it will simply release the reference without
+  // copying the result back to the java byte array.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+
+  if (s.IsNotFound()) {
+    return nullptr;
+  }
+
+  if (s.ok()) {
+    jbyteArray jvalue = env->NewByteArray(value.size());
+    env->SetByteArrayRegion(
+        jvalue, 0, value.size(),
+        reinterpret_cast<const jbyte*>(value.c_str()));
+    return jvalue;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+
+  return nullptr;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (J[BI)[B
+ */
+jbyteArray Java_org_rocksdb_RocksDB_get__J_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len) {
+  // Overload without explicit ReadOptions: use defaults.
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  return rocksdb_get_helper(
+      env, db, rocksdb::ReadOptions(), jkey, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (JJ[BI)[B
+ */
+jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle,
+    jbyteArray jkey, jint jkey_len) {
+  // Overload taking caller-supplied ReadOptions by native handle.
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto read_opt = reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle);
+  return rocksdb_get_helper(env, db, *read_opt, jkey, jkey_len);
+}
+
+// Shared implementation for the int-returning RocksDB.get() overloads that
+// copy the value into a caller-supplied byte[].
+//
+// Return contract:
+//   kNotFound (-1)    - the key does not exist.
+//   kStatusError (-2) - Get failed; a RocksDBException has been queued on
+//                       the JVM (note the native frame keeps executing).
+//   otherwise         - the FULL length of the stored value, even when it
+//                       exceeds jvalue_len and only a truncated prefix was
+//                       copied into jvalue.
+jint rocksdb_get_helper(
+    JNIEnv* env, rocksdb::DB* db, const rocksdb::ReadOptions& read_options,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jvalue, jint jvalue_len) {
+  static const int kNotFound = -1;
+  static const int kStatusError = -2;
+
+  jbyte* key = env->GetByteArrayElements(jkey, 0);
+  rocksdb::Slice key_slice(
+      reinterpret_cast<char*>(key), jkey_len);
+
+  // TODO(yhchiang): we might save one memory allocation here by adding
+  // a DB::Get() function which takes preallocated jbyte* as input.
+  std::string cvalue;
+  rocksdb::Status s = db->Get(
+      read_options, key_slice, &cvalue);
+
+  // trigger java unref on key.
+  // by passing JNI_ABORT, it will simply release the reference without
+  // copying the result back to the java byte array.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+
+  if (s.IsNotFound()) {
+    return kNotFound;
+  } else if (!s.ok()) {
+    // Here since we are throwing a Java exception from c++ side.
+    // As a result, c++ does not know calling this function will in fact
+    // throwing an exception.  As a result, the execution flow will
+    // not stop here, and codes after this throw will still be
+    // executed.
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+
+    // Return a dummy const value to avoid compilation error, although
+    // java side might not have a chance to get the return value :)
+    return kStatusError;
+  }
+
+  // Copy at most jvalue_len bytes, but report the full value length so the
+  // caller can detect truncation.
+  int cvalue_len = static_cast<int>(cvalue.size());
+  int length = std::min(jvalue_len, cvalue_len);
+
+  env->SetByteArrayRegion(
+      jvalue, 0, length,
+      reinterpret_cast<const jbyte*>(cvalue.c_str()));
+  return cvalue_len;
+}
+
+jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db,
+    const rocksdb::ReadOptions& rOpt, jobject jkey_list, jint jkeys_count) {
+  std::vector<rocksdb::Slice> keys;
+  std::vector<jbyte*> keys_to_free;
+
+  // get iterator
+  jobject iteratorObj = env->CallObjectMethod(
+      jkey_list, rocksdb::ListJni::getIteratorMethod(env));
+
+  // iterate over keys and convert java byte array to slice
+  while(env->CallBooleanMethod(
+      iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) {
+    jbyteArray jkey = (jbyteArray) env->CallObjectMethod(
+       iteratorObj, rocksdb::ListJni::getNextMethod(env));
+    jint key_length = env->GetArrayLength(jkey);
+
+    jbyte* key = new jbyte[key_length];
+    env->GetByteArrayRegion(jkey, 0, key_length, key);
+    // store allocated jbyte to free it after multiGet call
+    keys_to_free.push_back(key);
+
+    rocksdb::Slice key_slice(
+      reinterpret_cast<char*>(key), key_length);
+    keys.push_back(key_slice);
+  }
+
+  std::vector<std::string> values;
+  std::vector<rocksdb::Status> s = db->MultiGet(rOpt, keys, &values);
+
+  // Don't reuse class pointer
+  jclass jclazz = env->FindClass("java/util/ArrayList");
+  jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId(
+      env, jclazz);
+  jobject jvalue_list = env->NewObject(jclazz, mid, jkeys_count);
+
+  // insert in java list
+  for(std::vector<rocksdb::Status>::size_type i = 0; i != s.size(); i++) {
+    if(s[i].ok()) {
+      jbyteArray jvalue = env->NewByteArray(values[i].size());
+      env->SetByteArrayRegion(
+          jvalue, 0, values[i].size(),
+          reinterpret_cast<const jbyte*>(values[i].c_str()));
+      env->CallBooleanMethod(
+          jvalue_list, rocksdb::ListJni::getListAddMethodId(env), jvalue);
+    }
+    else {
+      env->CallBooleanMethod(
+          jvalue_list, rocksdb::ListJni::getListAddMethodId(env), nullptr);
+    }
+  }
+
+  // free up allocated byte arrays
+  for(std::vector<jbyte*>::size_type i = 0; i != keys_to_free.size(); i++) {
+    delete[] keys_to_free[i];
+  }
+  keys_to_free.clear();
+
+  return jvalue_list;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    multiGet
+ * Signature: (JLjava/util/List;I)Ljava/util/List;
+ */
+jobject Java_org_rocksdb_RocksDB_multiGet__JLjava_util_List_2I(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jobject jkey_list, jint jkeys_count) {
+  // Overload without explicit ReadOptions: use defaults.
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  return multi_get_helper(
+      env, jdb, db, rocksdb::ReadOptions(), jkey_list, jkeys_count);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    multiGet
+ * Signature: (JJLjava/util/List;I)Ljava/util/List;
+ */
+jobject Java_org_rocksdb_RocksDB_multiGet__JJLjava_util_List_2I(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jlong jropt_handle, jobject jkey_list, jint jkeys_count) {
+  // Overload taking caller-supplied ReadOptions by native handle.
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto read_opt = reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle);
+  return multi_get_helper(env, jdb, db, *read_opt, jkey_list, jkeys_count);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (J[BI[BI)I
+ */
+jint Java_org_rocksdb_RocksDB_get__J_3BI_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jvalue, jint jvalue_len) {
+  // Overload without explicit ReadOptions: use defaults.
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  return rocksdb_get_helper(
+      env, db, rocksdb::ReadOptions(), jkey, jkey_len, jvalue, jvalue_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (JJ[BI[BI)I
+ */
+jint Java_org_rocksdb_RocksDB_get__JJ_3BI_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jvalue, jint jvalue_len) {
+  // Overload taking caller-supplied ReadOptions by native handle.
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto read_opt = reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle);
+  return rocksdb_get_helper(
+      env, db, *read_opt, jkey, jkey_len, jvalue, jvalue_len);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::Delete()
+// Shared implementation for the RocksDB.remove() JNI entry points: deletes
+// the given key and queues a RocksDBException on failure.
+void rocksdb_remove_helper(
+    JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options,
+    jbyteArray jkey, jint jkey_len) {
+  jbyte* key = env->GetByteArrayElements(jkey, 0);
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+
+  rocksdb::Status s = db->Delete(write_options, key_slice);
+
+  // trigger java unref on key.  JNI_ABORT releases the reference without
+  // copying the (unmodified) buffer back to the java byte array.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    remove
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_RocksDB_remove__J_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len) {
+  // Overload without explicit WriteOptions: reuse one shared default.
+  static const rocksdb::WriteOptions default_write_options =
+      rocksdb::WriteOptions();
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+
+  rocksdb_remove_helper(env, db, default_write_options, jkey, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    remove
+ * Signature: (JJ[BI)V
+ */
+void Java_org_rocksdb_RocksDB_remove__JJ_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jlong jwrite_options, jbyteArray jkey, jint jkey_len) {
+  // Overload taking caller-supplied WriteOptions by native handle.
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto write_options =
+      reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options);
+
+  rocksdb_remove_helper(env, db, *write_options, jkey, jkey_len);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::~DB()
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    dispose
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_dispose(
+    JNIEnv* env, jobject java_db, jlong jhandle) {
+  // Destroy the native DB owned by the Java object.
+  auto* db = reinterpret_cast<rocksdb::DB*>(jhandle);
+  assert(db != nullptr);
+  delete db;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    iterator0
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RocksDB_iterator0(
+    JNIEnv* env, jobject jdb, jlong db_handle) {
+  // Create a new iterator with default ReadOptions; ownership of the
+  // native iterator passes to the Java side via the returned handle.
+  auto* db = reinterpret_cast<rocksdb::DB*>(db_handle);
+  return reinterpret_cast<jlong>(db->NewIterator(rocksdb::ReadOptions()));
+}
diff --git a/java/rocksjni/statistics.cc b/java/rocksjni/statistics.cc
new file mode 100644 (file)
index 0000000..bf170c6
--- /dev/null
@@ -0,0 +1,50 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ rocksdb::Statistics methods from Java side.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+
+#include "include/org_rocksdb_Statistics.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/statistics.h"
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    getTickerCount0
+ * Signature: (IJ)J
+ */
+jlong Java_org_rocksdb_Statistics_getTickerCount0(
+    JNIEnv* env, jobject jobj, int tickerType, jlong handle) {
+  // Fetch a single ticker counter from the native Statistics object.
+  auto* stats = reinterpret_cast<rocksdb::Statistics*>(handle);
+  assert(stats != nullptr);
+  return stats->getTickerCount(static_cast<rocksdb::Tickers>(tickerType));
+}
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    geHistogramData0
+ * Signature: (IJ)Lorg/rocksdb/HistogramData;
+ */
+jobject Java_org_rocksdb_Statistics_geHistogramData0(
+  JNIEnv* env, jobject jobj, int histogramType, jlong handle) {
+  // NOTE: the "geHistogramData0" spelling matches the native method name
+  // declared on the Java side and therefore cannot be corrected here alone.
+  auto* stats = reinterpret_cast<rocksdb::Statistics*>(handle);
+  assert(stats != nullptr);
+
+  rocksdb::HistogramData data;
+  stats->histogramData(
+      static_cast<rocksdb::Histograms>(histogramType), &data);
+
+  // Don't reuse class pointer
+  jclass jclazz = env->FindClass("org/rocksdb/HistogramData");
+  jmethodID mid = rocksdb::HistogramDataJni::getConstructorMethodId(
+      env, jclazz);
+  // Wrap the five histogram statistics in a fresh HistogramData object.
+  return env->NewObject(jclazz, mid, data.median, data.percentile95,
+      data.percentile99, data.average, data.standard_deviation);
+}
diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc
new file mode 100644 (file)
index 0000000..c21501b
--- /dev/null
@@ -0,0 +1,25 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ for rocksdb::Options.
+
+#include <jni.h>
+#include "include/org_rocksdb_PlainTableConfig.h"
+#include "rocksdb/table.h"
+
+/*
+ * Class:     org_rocksdb_PlainTableConfig
+ * Method:    newTableFactoryHandle
+ * Signature: (IIDI)J
+ */
+jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle(
+    JNIEnv* env, jobject jobj, jint jkey_size, jint jbloom_bits_per_key,
+    jdouble jhash_table_ratio, jint jindex_sparseness) {
+  // Ownership of the new factory passes to the Java side via the handle.
+  rocksdb::TableFactory* factory = rocksdb::NewPlainTableFactory(
+      static_cast<uint32_t>(jkey_size),
+      static_cast<int>(jbloom_bits_per_key),
+      static_cast<double>(jhash_table_ratio),
+      static_cast<size_t>(jindex_sparseness));
+  return reinterpret_cast<jlong>(factory);
+}
diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc
new file mode 100644 (file)
index 0000000..035b35f
--- /dev/null
@@ -0,0 +1,264 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ rocksdb::WriteBatch methods from Java side.
+#include <memory>
+
+#include "include/org_rocksdb_WriteBatch.h"
+#include "include/org_rocksdb_WriteBatchInternal.h"
+#include "include/org_rocksdb_WriteBatchTest.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/db.h"
+#include "db/memtable.h"
+#include "rocksdb/write_batch.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    newWriteBatch
+ * Signature: (I)V
+ */
+void Java_org_rocksdb_WriteBatch_newWriteBatch(
+    JNIEnv* env, jobject jobj, jint jreserved_bytes) {
+  rocksdb::WriteBatch* wb = new rocksdb::WriteBatch(
+      static_cast<size_t>(jreserved_bytes));
+
+  rocksdb::WriteBatchJni::setHandle(env, jobj, wb);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    count
+ * Signature: ()I
+ */
+jint Java_org_rocksdb_WriteBatch_count(JNIEnv* env, jobject jobj) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  return static_cast<jint>(wb->Count());
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    clear
+ * Signature: ()V
+ */
+void Java_org_rocksdb_WriteBatch_clear(JNIEnv* env, jobject jobj) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  wb->Clear();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    put
+ * Signature: ([BI[BI)V
+ */
+void Java_org_rocksdb_WriteBatch_put(
+    JNIEnv* env, jobject jobj,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jvalue, jint jvalue_len) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  jbyte* value = env->GetByteArrayElements(jvalue, nullptr);
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+  rocksdb::Slice value_slice(reinterpret_cast<char*>(value), jvalue_len);
+  wb->Put(key_slice, value_slice);
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+  env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    merge
+ * Signature: ([BI[BI)V
+ */
+JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_merge(
+    JNIEnv* env, jobject jobj,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jvalue, jint jvalue_len) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  jbyte* value = env->GetByteArrayElements(jvalue, nullptr);
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+  rocksdb::Slice value_slice(reinterpret_cast<char*>(value), jvalue_len);
+  wb->Merge(key_slice, value_slice);
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+  env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    remove
+ * Signature: ([BI)V
+ */
+JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_remove(
+    JNIEnv* env, jobject jobj,
+    jbyteArray jkey, jint jkey_len) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+  wb->Delete(key_slice);
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    putLogData
+ * Signature: ([BI)V
+ */
+void Java_org_rocksdb_WriteBatch_putLogData(
+    JNIEnv* env, jobject jobj, jbyteArray jblob, jint jblob_len) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  jbyte* blob = env->GetByteArrayElements(jblob, nullptr);
+  rocksdb::Slice blob_slice(reinterpret_cast<char*>(blob), jblob_len);
+  wb->PutLogData(blob_slice);
+  env->ReleaseByteArrayElements(jblob, blob, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    dispose0
+ * Signature: ()V
+ */
+void Java_org_rocksdb_WriteBatch_dispose0(JNIEnv* env, jobject jobj) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+  delete wb;
+
+  rocksdb::WriteBatchJni::setHandle(env, jobj, nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchInternal
+ * Method:    setSequence
+ * Signature: (Lorg/rocksdb/WriteBatch;J)V
+ */
+void Java_org_rocksdb_WriteBatchInternal_setSequence(
+    JNIEnv* env, jclass jclazz, jobject jobj, jlong jsn) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  rocksdb::WriteBatchInternal::SetSequence(
+      wb, static_cast<rocksdb::SequenceNumber>(jsn));
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchInternal
+ * Method:    sequence
+ * Signature: (Lorg/rocksdb/WriteBatch;)J
+ */
+jlong Java_org_rocksdb_WriteBatchInternal_sequence(
+    JNIEnv* env, jclass jclazz, jobject jobj) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  return static_cast<jlong>(rocksdb::WriteBatchInternal::Sequence(wb));
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchInternal
+ * Method:    append
+ * Signature: (Lorg/rocksdb/WriteBatch;Lorg/rocksdb/WriteBatch;)V
+ */
+void Java_org_rocksdb_WriteBatchInternal_append(
+    JNIEnv* env, jclass jclazz, jobject jwb1, jobject jwb2) {
+  rocksdb::WriteBatch* wb1 = rocksdb::WriteBatchJni::getHandle(env, jwb1);
+  assert(wb1 != nullptr);
+  rocksdb::WriteBatch* wb2 = rocksdb::WriteBatchJni::getHandle(env, jwb2);
+  assert(wb2 != nullptr);
+
+  rocksdb::WriteBatchInternal::Append(wb1, wb2);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchTest
+ * Method:    getContents
+ * Signature: (Lorg/rocksdb/WriteBatch;)[B
+ */
+jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(
+    JNIEnv* env, jclass jclazz, jobject jobj) {
+  rocksdb::WriteBatch* b = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(b != nullptr);
+
+  // todo: Currently the following code is directly copied from
+  // db/write_batch_test.cc.  It could be implemented in java once
+  // all the necessary components can be accessed via jni api.
+
+  rocksdb::InternalKeyComparator cmp(rocksdb::BytewiseComparator());
+  auto factory = std::make_shared<rocksdb::SkipListFactory>();
+  rocksdb::Options options;
+  options.memtable_factory = factory;
+  rocksdb::MemTable* mem = new rocksdb::MemTable(cmp, options);
+  mem->Ref();
+  std::string state;
+  rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options);
+  rocksdb::Status s =
+      rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default);
+  int count = 0;
+  rocksdb::Iterator* iter = mem->NewIterator(rocksdb::ReadOptions());
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    rocksdb::ParsedInternalKey ikey;
+    memset(reinterpret_cast<void*>(&ikey), 0, sizeof(ikey));
+    ASSERT_TRUE(rocksdb::ParseInternalKey(iter->key(), &ikey));
+    switch (ikey.type) {
+      case rocksdb::kTypeValue:
+        state.append("Put(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case rocksdb::kTypeMerge:
+        state.append("Merge(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case rocksdb::kTypeDeletion:
+        state.append("Delete(");
+        state.append(ikey.user_key.ToString());
+        state.append(")");
+        count++;
+        break;
+      default:
+        assert(false);
+        break;
+    }
+    state.append("@");
+    state.append(rocksdb::NumberToString(ikey.sequence));
+  }
+  delete iter;
+  if (!s.ok()) {
+    state.append(s.ToString());
+  } else if (count != rocksdb::WriteBatchInternal::Count(b)) {
+    state.append("CountMismatch()");
+  }
+  delete mem->Unref();
+
+  jbyteArray jstate = env->NewByteArray(state.size());
+  env->SetByteArrayRegion(
+      jstate, 0, state.size(),
+      reinterpret_cast<const jbyte*>(state.c_str()));
+
+  return jstate;
+}
diff --git a/linters/__phutil_library_init__.php b/linters/__phutil_library_init__.php
new file mode 100644 (file)
index 0000000..4b8d3d1
--- /dev/null
@@ -0,0 +1,3 @@
+<?php
+
+phutil_register_library('linters', __FILE__);
diff --git a/linters/__phutil_library_map__.php b/linters/__phutil_library_map__.php
new file mode 100644 (file)
index 0000000..7808dc1
--- /dev/null
@@ -0,0 +1,27 @@
+<?php
+
+/**
+ * This file is automatically generated. Use 'arc liberate' to rebuild it.
+ * @generated
+ * @phutil-library-version 2
+ */
+
+phutil_register_library_map(array(
+  '__library_version__' => 2,
+  'class' =>
+  array(
+    'FacebookFbcodeLintEngine' => 'lint_engine/FacebookFbcodeLintEngine.php',
+    'FbcodeCppLinter' => 'cpp_linter/FbcodeCppLinter.php',
+    'PfffCppLinter' => 'cpp_linter/PfffCppLinter.php',
+    'ArcanistCpplintLinter' => 'cpp_linter/ArcanistCpplintLinter.php',
+  ),
+  'function' =>
+  array(
+  ),
+  'xmap' =>
+  array(
+    'FacebookFbcodeLintEngine' => 'ArcanistLintEngine',
+    'FbcodeCppLinter' => 'ArcanistLinter',
+    'PfffCppLinter' => 'ArcanistLinter',
+  ),
+));
diff --git a/linters/cpp_linter/ArcanistCpplintLinter.php b/linters/cpp_linter/ArcanistCpplintLinter.php
new file mode 100644 (file)
index 0000000..b9c4137
--- /dev/null
@@ -0,0 +1,88 @@
+<?php
+
+/**
+ * Uses google's cpplint.py to check code. RocksDB team forked this file from
+ * phabricator's /src/lint/linter/ArcanistCpplintLinter.php, and customized it
+ * for its own use.
+ *
+ * You can get it here:
+ * http://google-styleguide.googlecode.com/svn/trunk/cpplint/cpplint.py
+ * @group linter
+ */
+final class ArcanistCpplintLinter extends ArcanistLinter {
+
+  public function willLintPaths(array $paths) {
+    return;
+  }
+
+  public function getLinterName() {
+    return 'cpplint.py';
+  }
+
+  public function getLintPath() {
+    $bin = 'cpplint.py';
+    // Search under current dir
+    list($err) = exec_manual('which %s/%s', $this->linterDir(), $bin);
+    if (!$err) {
+      return $this->linterDir().'/'.$bin;
+    }
+
+    // Look for globally installed cpplint.py
+    list($err) = exec_manual('which %s', $bin);
+    if ($err) {
+      throw new ArcanistUsageException(
+        "cpplint.py does not appear to be installed on this system. Install ".
+        "it (e.g., with 'wget \"http://google-styleguide.googlecode.com/".
+        "svn/trunk/cpplint/cpplint.py\"') ".
+        "in your .arcconfig to point to the directory where it resides. ".
+        "Also don't forget to chmod a+x cpplint.py!");
+    }
+
+    return $bin;
+  }
+
+  public function lintPath($path) {
+    $bin = $this->getLintPath();
+    $path = $this->rocksdbDir().'/'.$path;
+
+    $f = new ExecFuture("%C $path", $bin);
+
+    list($err, $stdout, $stderr) = $f->resolve();
+
+    if ($err === 2) {
+      throw new Exception("cpplint failed to run correctly:\n".$stderr);
+    }
+
+    $lines = explode("\n", $stderr);
+    $messages = array();
+    foreach ($lines as $line) {
+      $line = trim($line);
+      $matches = null;
+      $regex = '/^[^:]+:(\d+):\s*(.*)\s*\[(.*)\] \[(\d+)\]$/';
+      if (!preg_match($regex, $line, $matches)) {
+        continue;
+      }
+      foreach ($matches as $key => $match) {
+        $matches[$key] = trim($match);
+      }
+      $message = new ArcanistLintMessage();
+      $message->setPath($path);
+      $message->setLine($matches[1]);
+      $message->setCode($matches[3]);
+      $message->setName($matches[3]);
+      $message->setDescription($matches[2]);
+      $message->setSeverity(ArcanistLintSeverity::SEVERITY_WARNING);
+      $this->addLintMessage($message);
+    }
+  }
+
+  // The path of this linter
+  private function linterDir() {
+    return dirname(__FILE__);
+  }
+
+  // TODO(kaili) a quick and dirty way to figure out rocksdb's root dir.
+  private function rocksdbDir() {
+    return $this->linterDir()."/../..";
+  }
+}
diff --git a/linters/cpp_linter/FbcodeCppLinter.php b/linters/cpp_linter/FbcodeCppLinter.php
new file mode 100644 (file)
index 0000000..e62d3bb
--- /dev/null
@@ -0,0 +1,99 @@
+<?php
+
+class FbcodeCppLinter extends ArcanistLinter {
+  const CPPLINT      = "/home/engshare/tools/cpplint";
+  const LINT_ERROR   = 1;
+  const LINT_WARNING = 2;
+  const C_FLAG = "--c_mode=true";
+  private $rawLintOutput = array();
+
+  public function willLintPaths(array $paths) {
+    $futures = array();
+    $ret_value = 0;
+    $last_line = system("which cpplint", $ret_value);
+    $CPP_LINT = false;
+    if ($ret_value == 0) {
+      $CPP_LINT = $last_line;
+    } else if (file_exists(self::CPPLINT)) {
+      $CPP_LINT = self::CPPLINT;
+    }
+
+    if ($CPP_LINT) {
+      foreach ($paths as $p) {
+        $lpath = $this->getEngine()->getFilePathOnDisk($p);
+        $lpath_file = file($lpath);
+        if (preg_match('/\.(c)$/', $lpath) ||
+            preg_match('/-\*-.*Mode: C[; ].*-\*-/', $lpath_file[0]) ||
+            preg_match('/vim(:.*)*:\s*(set\s+)?filetype=c\s*:/', $lpath_file[0])
+            ) {
+          $futures[$p] = new ExecFuture("%s %s %s 2>&1",
+                             $CPP_LINT, self::C_FLAG,
+                             $this->getEngine()->getFilePathOnDisk($p));
+        } else {
+          $futures[$p] = new ExecFuture("%s %s 2>&1",
+            self::CPPLINT, $this->getEngine()->getFilePathOnDisk($p));
+        }
+      }
+
+      foreach (Futures($futures)->limit(8) as $p => $f) {
+        $this->rawLintOutput[$p] = $f->resolvex();
+      }
+    }
+    return;
+  }
+
+  public function getLinterName() {
+    return "FBCPP";
+  }
+
+  public function lintPath($path) {
+    $msgs = $this->getCppLintOutput($path);
+    foreach ($msgs as $m) {
+      $this->raiseLintAtLine($m['line'], 0, $m['severity'], $m['msg']);
+    }
+  }
+
+  public function getLintSeverityMap() {
+    return array(
+      self::LINT_WARNING => ArcanistLintSeverity::SEVERITY_WARNING,
+      self::LINT_ERROR   => ArcanistLintSeverity::SEVERITY_ERROR
+    );
+  }
+
+  public function getLintNameMap() {
+    return array(
+      self::LINT_WARNING => "CppLint Warning",
+      self::LINT_ERROR   => "CppLint Error"
+    );
+  }
+
+  private function getCppLintOutput($path) {
+    list($output) = $this->rawLintOutput[$path];
+
+    $msgs = array();
+    $current = null;
+    foreach (explode("\n", $output) as $line) {
+      if (preg_match('/[^:]*\((\d+)\):(.*)$/', $line, $matches)) {
+        if ($current) {
+          $msgs[] = $current;
+        }
+        $line = $matches[1];
+        $text = $matches[2];
+        $sev  = preg_match('/.*Warning.*/', $text)
+                  ? self::LINT_WARNING
+                  : self::LINT_ERROR;
+        $current = array('line'     => $line,
+                         'msg'      => $text,
+                         'severity' => $sev);
+      } else if ($current) {
+        $current['msg'] .= ' ' . $line;
+      }
+    }
+    if ($current) {
+      $msgs[] = $current;
+    }
+
+    return $msgs;
+  }
+}
+
diff --git a/linters/cpp_linter/PfffCppLinter.php b/linters/cpp_linter/PfffCppLinter.php
new file mode 100644 (file)
index 0000000..6736614
--- /dev/null
@@ -0,0 +1,68 @@
+<?php
+// Copyright 2004-present Facebook.  All rights reserved.
+
+class PfffCppLinter extends ArcanistLinter {
+  const PROGRAM      = "/home/engshare/tools/checkCpp";
+
+  public function getLinterName() {
+    return "checkCpp";
+  }
+  public function getLintNameMap() {
+    return array(
+    );
+  }
+
+  public function getLintSeverityMap() {
+    return array(
+    );
+  }
+
+  public function willLintPaths(array $paths) {
+    $program = false;
+    $ret_value = 0;
+    $last_line = system("which checkCpp", $ret_value);
+    if ($ret_value == 0) {
+      $program = $last_line;
+    } else if (file_exists(self::PROGRAM)) {
+      $program = self::PROGRAM;
+    }
+    if ($program) {
+      $futures = array();
+      foreach ($paths as $p) {
+        $futures[$p] = new ExecFuture("%s --lint %s 2>&1",
+          $program, $this->getEngine()->getFilePathOnDisk($p));
+      }
+      foreach (Futures($futures)->limit(8) as $p => $f) {
+
+        list($stdout, $stderr) = $f->resolvex();
+        $raw = json_decode($stdout, true);
+        if (!is_array($raw)) {
+          throw new Exception(
+            "checkCpp returned invalid JSON!".
+            "Stdout: {$stdout} Stderr: {$stderr}"
+          );
+        }
+        foreach($raw as $err) {
+          $this->addLintMessage(
+            ArcanistLintMessage::newFromDictionary(
+              array(
+                'path' => $err['file'],
+                'line' => $err['line'],
+                'char' => 0,
+                'name' => $err['name'],
+                'description' => $err['info'],
+                'code' => $this->getLinterName(),
+                'severity' => ArcanistLintSeverity::SEVERITY_WARNING,
+              )
+            )
+          );
+        }
+      }
+    }
+    return;
+  }
+
+  public function lintPath($path) {
+    return;
+  }
+}
diff --git a/linters/cpp_linter/cpplint.py b/linters/cpp_linter/cpplint.py
new file mode 100755 (executable)
index 0000000..d264b00
--- /dev/null
@@ -0,0 +1,4767 @@
+#!/usr/bin/python
+# Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file. See the AUTHORS file for names of contributors.
+#
+# Copyright (c) 2009 Google Inc. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#    * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#    * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""Does google-lint on c++ files.
+
+The goal of this script is to identify places in the code that *may*
+be in non-compliance with google style.  It does not attempt to fix
+up these problems -- the point is to educate.  It also does not
+attempt to find all problems, or to ensure that everything it does
+find is legitimately a problem.
+
+In particular, we can get very confused by /* and // inside strings!
+We do a small hack, which is to ignore //'s with "'s after them on the
+same line, but it is far from perfect (in either direction).
+"""
+
+import codecs
+import copy
+import getopt
+import math  # for log
+import os
+import re
+import sre_compile
+import string
+import sys
+import unicodedata
+
+
+_USAGE = """
+Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
+                   [--counting=total|toplevel|detailed] [--root=subdir]
+                   [--linelength=digits]
+        <file> [file] ...
+
+  The style guidelines this tries to follow are those in
+    http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
+
+  Every problem is given a confidence score from 1-5, with 5 meaning we are
+  certain of the problem, and 1 meaning it could be a legitimate construct.
+  This will miss some errors, and is not a substitute for a code review.
+
+  To suppress false-positive errors of a certain category, add a
+  'NOLINT(category)' comment to the line.  NOLINT or NOLINT(*)
+  suppresses errors of all categories on that line.
+
+  The files passed in will be linted; at least one file must be provided.
+  Default linted extensions are .cc, .cpp, .cu, .cuh and .h.  Change the
+  extensions with the --extensions flag.
+
+  Flags:
+
+    output=vs7
+      By default, the output is formatted to ease emacs parsing.  Visual Studio
+      compatible output (vs7) may also be used.  Other formats are unsupported.
+
+    verbose=#
+      Specify a number 0-5 to restrict errors to certain verbosity levels.
+
+    filter=-x,+y,...
+      Specify a comma-separated list of category-filters to apply: only
+      error messages whose category names pass the filters will be printed.
+      (Category names are printed with the message and look like
+      "[whitespace/indent]".)  Filters are evaluated left to right.
+      "-FOO" and "FOO" means "do not print categories that start with FOO".
+      "+FOO" means "do print categories that start with FOO".
+
+      Examples: --filter=-whitespace,+whitespace/braces
+                --filter=whitespace,runtime/printf,+runtime/printf_format
+                --filter=-,+build/include_what_you_use
+
+      To see a list of all the categories used in cpplint, pass no arg:
+         --filter=
+
+    counting=total|toplevel|detailed
+      The total number of errors found is always printed. If
+      'toplevel' is provided, then the count of errors in each of
+      the top-level categories like 'build' and 'whitespace' will
+      also be printed. If 'detailed' is provided, then a count
+      is provided for each category like 'build/class'.
+
+    root=subdir
+      The root directory used for deriving header guard CPP variable.
+      By default, the header guard CPP variable is calculated as the relative
+      path to the directory that contains .git, .hg, or .svn.  When this flag
+      is specified, the relative path is calculated from the specified
+      directory. If the specified directory does not exist, this flag is
+      ignored.
+
+      Examples:
+        Assuming that src/.git exists, the header guard CPP variables for
+        src/chrome/browser/ui/browser.h are:
+
+        No flag => CHROME_BROWSER_UI_BROWSER_H_
+        --root=chrome => BROWSER_UI_BROWSER_H_
+        --root=chrome/browser => UI_BROWSER_H_
+
+    linelength=digits
+      This is the allowed line length for the project. The default value is
+      80 characters.
+
+      Examples:
+        --linelength=120
+
+    extensions=extension,extension,...
+      The allowed file extensions that cpplint will check
+
+      Examples:
+        --extensions=hpp,cpp
+"""
+
+# We categorize each error message we print.  Here are the categories.
+# We want an explicit list so we can list them all in cpplint --filter=.
+# If you add a new error message with a new category, add it to the list
+# here!  cpplint_unittest.py should tell you if you forget to do this.
+_ERROR_CATEGORIES = [
+  'build/class',
+  'build/deprecated',
+  'build/endif_comment',
+  'build/explicit_make_pair',
+  'build/forward_decl',
+  'build/header_guard',
+  'build/include',
+  'build/include_alpha',
+  'build/include_order',
+  'build/include_what_you_use',
+  'build/namespaces',
+  'build/printf_format',
+  'build/storage_class',
+  'legal/copyright',
+  'readability/alt_tokens',
+  'readability/braces',
+  'readability/casting',
+  'readability/check',
+  'readability/constructors',
+  'readability/fn_size',
+  'readability/function',
+  'readability/multiline_comment',
+  'readability/multiline_string',
+  'readability/namespace',
+  'readability/nolint',
+  'readability/nul',
+  'readability/streams',
+  'readability/todo',
+  'readability/utf8',
+  'runtime/arrays',
+  'runtime/casting',
+  'runtime/explicit',
+  'runtime/int',
+  'runtime/init',
+  'runtime/invalid_increment',
+  'runtime/member_string_references',
+  'runtime/memset',
+  'runtime/operator',
+  'runtime/printf',
+  'runtime/printf_format',
+  'runtime/references',
+  'runtime/string',
+  'runtime/threadsafe_fn',
+  'runtime/vlog',
+  'whitespace/blank_line',
+  'whitespace/braces',
+  'whitespace/comma',
+  'whitespace/comments',
+  'whitespace/empty_conditional_body',
+  'whitespace/empty_loop_body',
+  'whitespace/end_of_line',
+  'whitespace/ending_newline',
+  'whitespace/forcolon',
+  'whitespace/indent',
+  'whitespace/line_length',
+  'whitespace/newline',
+  'whitespace/operators',
+  'whitespace/parens',
+  'whitespace/semicolon',
+  'whitespace/tab',
+  'whitespace/todo'
+  ]
+
+# The default state of the category filter. This is overridden by the --filter=
+# flag. By default all errors are on, so only add here categories that should be
+# off by default (i.e., categories that must be enabled by the --filter= flags).
+# All entries here should start with a '-' or '+', as in the --filter= flag.
+_DEFAULT_FILTERS = ['-build/include_alpha']
+
+# We used to check for high-bit characters, but after much discussion we
+# decided those were OK, as long as they were in UTF-8 and didn't represent
+# hard-coded international strings, which belong in a separate i18n file.
+
+
+# C++ headers
+_CPP_HEADERS = frozenset([
+    # Legacy
+    'algobase.h',
+    'algo.h',
+    'alloc.h',
+    'builtinbuf.h',
+    'bvector.h',
+    'complex.h',
+    'defalloc.h',
+    'deque.h',
+    'editbuf.h',
+    'fstream.h',
+    'function.h',
+    'hash_map',
+    'hash_map.h',
+    'hash_set',
+    'hash_set.h',
+    'hashtable.h',
+    'heap.h',
+    'indstream.h',
+    'iomanip.h',
+    'iostream.h',
+    'istream.h',
+    'iterator.h',
+    'list.h',
+    'map.h',
+    'multimap.h',
+    'multiset.h',
+    'ostream.h',
+    'pair.h',
+    'parsestream.h',
+    'pfstream.h',
+    'procbuf.h',
+    'pthread_alloc',
+    'pthread_alloc.h',
+    'rope',
+    'rope.h',
+    'ropeimpl.h',
+    'set.h',
+    'slist',
+    'slist.h',
+    'stack.h',
+    'stdiostream.h',
+    'stl_alloc.h',
+    'stl_relops.h',
+    'streambuf.h',
+    'stream.h',
+    'strfile.h',
+    'strstream.h',
+    'tempbuf.h',
+    'tree.h',
+    'type_traits.h',
+    'vector.h',
+    # 17.6.1.2 C++ library headers
+    'algorithm',
+    'array',
+    'atomic',
+    'bitset',
+    'chrono',
+    'codecvt',
+    'complex',
+    'condition_variable',
+    'deque',
+    'exception',
+    'forward_list',
+    'fstream',
+    'functional',
+    'future',
+    'initializer_list',
+    'iomanip',
+    'ios',
+    'iosfwd',
+    'iostream',
+    'istream',
+    'iterator',
+    'limits',
+    'list',
+    'locale',
+    'map',
+    'memory',
+    'mutex',
+    'new',
+    'numeric',
+    'ostream',
+    'queue',
+    'random',
+    'ratio',
+    'regex',
+    'set',
+    'sstream',
+    'stack',
+    'stdexcept',
+    'streambuf',
+    'string',
+    'strstream',
+    'system_error',
+    'thread',
+    'tuple',
+    'typeindex',
+    'typeinfo',
+    'type_traits',
+    'unordered_map',
+    'unordered_set',
+    'utility',
+    'valarray',
+    'vector',
+    # 17.6.1.2 C++ headers for C library facilities
+    'cassert',
+    'ccomplex',
+    'cctype',
+    'cerrno',
+    'cfenv',
+    'cfloat',
+    'cinttypes',
+    'ciso646',
+    'climits',
+    'clocale',
+    'cmath',
+    'csetjmp',
+    'csignal',
+    'cstdalign',
+    'cstdarg',
+    'cstdbool',
+    'cstddef',
+    'cstdint',
+    'cstdio',
+    'cstdlib',
+    'cstring',
+    'ctgmath',
+    'ctime',
+    'cuchar',
+    'cwchar',
+    'cwctype',
+    ])
+
+# Assertion macros.  These are defined in base/logging.h and
+# testing/base/gunit.h.  Note that the _M versions need to come first
+# for substring matching to work.
+_CHECK_MACROS = [
+    'DCHECK', 'CHECK',
+    'EXPECT_TRUE_M', 'EXPECT_TRUE',
+    'ASSERT_TRUE_M', 'ASSERT_TRUE',
+    'EXPECT_FALSE_M', 'EXPECT_FALSE',
+    'ASSERT_FALSE_M', 'ASSERT_FALSE',
+    ]
+
+# Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE
+_CHECK_REPLACEMENT = dict([(m, {}) for m in _CHECK_MACROS])
+
+for op, replacement in [('==', 'EQ'), ('!=', 'NE'),
+                        ('>=', 'GE'), ('>', 'GT'),
+                        ('<=', 'LE'), ('<', 'LT')]:
+  _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement
+  _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement
+  _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement
+  _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement
+  _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement
+  _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement
+
+for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'),
+                            ('>=', 'LT'), ('>', 'LE'),
+                            ('<=', 'GT'), ('<', 'GE')]:
+  _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement
+  _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement
+  _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement
+  _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement
+
# Alternative tokens and their replacements.  For full list, see section 2.5
# Alternative tokens [lex.digraph] in the C++ standard.
#
# Digraphs (such as '%:') are not included here since it's a mess to
# match those on a word boundary.
_ALT_TOKEN_REPLACEMENT = {
    'and': '&&',
    'bitor': '|',
    'or': '||',
    'xor': '^',
    'compl': '~',
    'bitand': '&',
    'and_eq': '&=',
    'or_eq': '|=',
    'xor_eq': '^=',
    'not': '!',
    'not_eq': '!='
    }

# Compile regular expression that matches all the above keywords.  The "[ =()]"
# bit is meant to avoid matching these keywords outside of boolean expressions.
#
# False positives include C-style multi-line comments and multi-line strings
# but those have always been troublesome for cpplint.
_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile(
    r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)')


# These constants define types of headers for use with
# _IncludeState.CheckNextIncludeOrder().
_C_SYS_HEADER = 1
_CPP_SYS_HEADER = 2
_LIKELY_MY_HEADER = 3
_POSSIBLE_MY_HEADER = 4
_OTHER_HEADER = 5

# These constants define the current inline assembly state
_NO_ASM = 0       # Outside of inline assembly block
_INSIDE_ASM = 1   # Inside inline assembly block
_END_ASM = 2      # Last line of inline assembly block
_BLOCK_ASM = 3    # The whole block is an inline assembly block

# Match start of assembly blocks
_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)'
                        r'(?:\s+(volatile|__volatile__))?'
                        r'\s*[{(]')


# Cache of compiled regexps, keyed by pattern string.  Shared by the
# Match/Search/ReplaceAll helpers defined below.
_regexp_compile_cache = {}

# Finds occurrences of NOLINT or NOLINT(...).
_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?')

# {str, set(int)}: a map from error categories to sets of linenumbers
# on which those errors are expected and should be suppressed.
# A key of None means "suppress every category" on those lines.
_error_suppressions = {}

# The root directory used for deriving header guard CPP variable.
# This is set by --root flag.
_root = None

# The allowed line length of files.
# This is set by --linelength flag.
_line_length = 80

# The allowed extensions for file names
# This is set by --extensions flag.
_valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh'])
+
def ParseNolintSuppressions(filename, raw_line, linenum, error):
  """Updates the global list of error-suppressions.

  Parses any NOLINT comments on the current line, updating the global
  error_suppressions store.  Reports an error if the NOLINT comment
  was malformed.

  Args:
    filename: str, the name of the input file.
    raw_line: str, the line of input text, with comments.
    linenum: int, the number of the current line.
    error: function, an error handler.
  """
  # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*).
  match = _RE_SUPPRESSION.search(raw_line)
  if not match:
    return
  category = match.group(1)
  if category is None or category == '(*)':
    # Bare NOLINT, or NOLINT(*): suppress every category on this line.
    _error_suppressions.setdefault(None, set()).add(linenum)
  elif category.startswith('(') and category.endswith(')'):
    category = category[1:-1]
    if category in _ERROR_CATEGORIES:
      _error_suppressions.setdefault(category, set()).add(linenum)
    else:
      error(filename, linenum, 'readability/nolint', 5,
            'Unknown NOLINT error category: %s' % category)
+
+
def ResetNolintSuppressions():
  """Clears all NOLINT suppressions recorded by ParseNolintSuppressions."""
  _error_suppressions.clear()
+
+
def IsErrorSuppressedByNolint(category, linenum):
  """Returns true if the specified error category is suppressed on this line.

  Consults the global error_suppressions map populated by
  ParseNolintSuppressions/ResetNolintSuppressions.

  Args:
    category: str, the category of the error.
    linenum: int, the current line number.
  Returns:
    bool, True iff the error should be suppressed due to a NOLINT comment.
  """
  # A line is suppressed either for this specific category, or globally
  # (the None key holds lines with a bare NOLINT / NOLINT(*)).
  category_lines = _error_suppressions.get(category, set())
  global_lines = _error_suppressions.get(None, set())
  return linenum in category_lines or linenum in global_lines
+
def Match(pattern, s):
  """Matches the string with the pattern, caching the compiled regexp.

  Args:
    pattern: regex pattern string.
    s: string to match against (anchored at the start).

  Returns:
    A match object if the beginning of s matches pattern, else None.
  """
  # The regexp compilation caching is inlined in both Match and Search for
  # performance reasons; factoring it out into a separate function turns out
  # to be noticeably expensive.
  # Use the public re.compile API rather than the internal sre_compile
  # module (sre_compile is an implementation detail and is deprecated in
  # modern Python); the compiled pattern objects behave identically.
  if pattern not in _regexp_compile_cache:
    _regexp_compile_cache[pattern] = re.compile(pattern)
  return _regexp_compile_cache[pattern].match(s)
+
+
def ReplaceAll(pattern, rep, s):
  """Replaces instances of pattern in a string with a replacement.

  The compiled regex is kept in a cache shared by Match and Search.

  Args:
    pattern: regex pattern
    rep: replacement text
    s: search string

  Returns:
    string with replacements made (or original string if no replacements)
  """
  # Use the public re.compile API rather than the internal sre_compile
  # module (deprecated in modern Python); behavior is identical.
  if pattern not in _regexp_compile_cache:
    _regexp_compile_cache[pattern] = re.compile(pattern)
  return _regexp_compile_cache[pattern].sub(rep, s)
+
+
def Search(pattern, s):
  """Searches the string for the pattern, caching the compiled regexp.

  Args:
    pattern: regex pattern string.
    s: string to scan.

  Returns:
    A match object for the first occurrence of pattern in s, else None.
  """
  # Use the public re.compile API rather than the internal sre_compile
  # module (deprecated in modern Python); behavior is identical.
  if pattern not in _regexp_compile_cache:
    _regexp_compile_cache[pattern] = re.compile(pattern)
  return _regexp_compile_cache[pattern].search(s)
+
+
class _IncludeState(dict):
  """Tracks line numbers for includes, and the order in which includes appear.

  As a dict, an _IncludeState object serves as a mapping between include
  filename and line number on which that file was included.

  Call CheckNextIncludeOrder() once for each header in the file, passing
  in the type constants defined above. Calls in an illegal order will
  return a non-empty error message describing the problem.

  """
  # self._section will move monotonically through this set. If it ever
  # needs to move backwards, CheckNextIncludeOrder will raise an error.
  _INITIAL_SECTION = 0
  _MY_H_SECTION = 1
  _C_SECTION = 2
  _CPP_SECTION = 3
  _OTHER_H_SECTION = 4

  # Human-readable names for the _XXX_HEADER type constants, used when
  # composing error messages.
  _TYPE_NAMES = {
      _C_SYS_HEADER: 'C system header',
      _CPP_SYS_HEADER: 'C++ system header',
      _LIKELY_MY_HEADER: 'header this file implements',
      _POSSIBLE_MY_HEADER: 'header this file may implement',
      _OTHER_HEADER: 'other header',
      }
  # Human-readable names for the _XXX_SECTION constants, used when
  # composing error messages.
  _SECTION_NAMES = {
      _INITIAL_SECTION: "... nothing. (This can't be an error.)",
      _MY_H_SECTION: 'a header this file implements',
      _C_SECTION: 'C system header',
      _CPP_SECTION: 'C++ system header',
      _OTHER_H_SECTION: 'other header',
      }

  def __init__(self):
    dict.__init__(self)
    self.ResetSection()

  def ResetSection(self):
    # The name of the current section.
    self._section = self._INITIAL_SECTION
    # The path of last found header.
    self._last_header = ''

  def SetLastHeader(self, header_path):
    """Records the most recently seen header path."""
    self._last_header = header_path

  def CanonicalizeAlphabeticalOrder(self, header_path):
    """Returns a path canonicalized for alphabetical comparison.

    - replaces "-" with "_" so they both cmp the same.
    - removes '-inl' since we don't require them to be after the main header.
    - lowercase everything, just in case.

    Args:
      header_path: Path to be canonicalized.

    Returns:
      Canonicalized path.
    """
    return header_path.replace('-inl.h', '.h').replace('-', '_').lower()

  def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path):
    """Check if a header is in alphabetical order with the previous header.

    Args:
      clean_lines: A CleansedLines instance containing the file.
      linenum: The number of the line to check.
      header_path: Canonicalized header to be checked.

    Returns:
      Returns true if the header is in alphabetical order.
    """
    # If previous section is different from current section, _last_header will
    # be reset to empty string, so it's always less than current header.
    #
    # If previous line was a blank line, assume that the headers are
    # intentionally sorted the way they are.
    if (self._last_header > header_path and
        not Match(r'^\s*$', clean_lines.elided[linenum - 1])):
      return False
    return True

  def CheckNextIncludeOrder(self, header_type):
    """Returns a non-empty error message if the next header is out of order.

    This function also updates the internal state to be ready to check
    the next include.

    Args:
      header_type: One of the _XXX_HEADER constants defined above.

    Returns:
      The empty string if the header is in the right order, or an
      error message describing what's wrong.

    """
    error_message = ('Found %s after %s' %
                     (self._TYPE_NAMES[header_type],
                      self._SECTION_NAMES[self._section]))

    # Remember where we were so we can detect a section change below.
    last_section = self._section

    if header_type == _C_SYS_HEADER:
      if self._section <= self._C_SECTION:
        self._section = self._C_SECTION
      else:
        # Moving backwards (e.g. a C header after a C++ header) is an error.
        self._last_header = ''
        return error_message
    elif header_type == _CPP_SYS_HEADER:
      if self._section <= self._CPP_SECTION:
        self._section = self._CPP_SECTION
      else:
        self._last_header = ''
        return error_message
    elif header_type == _LIKELY_MY_HEADER:
      if self._section <= self._MY_H_SECTION:
        self._section = self._MY_H_SECTION
      else:
        self._section = self._OTHER_H_SECTION
    elif header_type == _POSSIBLE_MY_HEADER:
      if self._section <= self._MY_H_SECTION:
        self._section = self._MY_H_SECTION
      else:
        # This will always be the fallback because we're not sure
        # enough that the header is associated with this file.
        self._section = self._OTHER_H_SECTION
    else:
      assert header_type == _OTHER_HEADER
      self._section = self._OTHER_H_SECTION

    # Entering a new section: restart the alphabetical-order tracking.
    if last_section != self._section:
      self._last_header = ''

    return ''
+
+
+class _CppLintState(object):
+  """Maintains module-wide state.."""
+
+  def __init__(self):
+    self.verbose_level = 1  # global setting.
+    self.error_count = 0    # global count of reported errors
+    # filters to apply when emitting error messages
+    self.filters = _DEFAULT_FILTERS[:]
+    self.counting = 'total'  # In what way are we counting errors?
+    self.errors_by_category = {}  # string to int dict storing error counts
+
+    # output format:
+    # "emacs" - format that emacs can parse (default)
+    # "vs7" - format that Microsoft Visual Studio 7 can parse
+    self.output_format = 'emacs'
+
+  def SetOutputFormat(self, output_format):
+    """Sets the output format for errors."""
+    self.output_format = output_format
+
+  def SetVerboseLevel(self, level):
+    """Sets the module's verbosity, and returns the previous setting."""
+    last_verbose_level = self.verbose_level
+    self.verbose_level = level
+    return last_verbose_level
+
+  def SetCountingStyle(self, counting_style):
+    """Sets the module's counting options."""
+    self.counting = counting_style
+
+  def SetFilters(self, filters):
+    """Sets the error-message filters.
+
+    These filters are applied when deciding whether to emit a given
+    error message.
+
+    Args:
+      filters: A string of comma-separated filters (eg "+whitespace/indent").
+               Each filter should start with + or -; else we die.
+
+    Raises:
+      ValueError: The comma-separated filters did not all start with '+' or '-'.
+                  E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter"
+    """
+    # Default filters always have less priority than the flag ones.
+    self.filters = _DEFAULT_FILTERS[:]
+    for filt in filters.split(','):
+      clean_filt = filt.strip()
+      if clean_filt:
+        self.filters.append(clean_filt)
+    for filt in self.filters:
+      if not (filt.startswith('+') or filt.startswith('-')):
+        raise ValueError('Every filter in --filters must start with + or -'
+                         ' (%s does not)' % filt)
+
+  def ResetErrorCounts(self):
+    """Sets the module's error statistic back to zero."""
+    self.error_count = 0
+    self.errors_by_category = {}
+
+  def IncrementErrorCount(self, category):
+    """Bumps the module's error statistic."""
+    self.error_count += 1
+    if self.counting in ('toplevel', 'detailed'):
+      if self.counting != 'detailed':
+        category = category.split('/')[0]
+      if category not in self.errors_by_category:
+        self.errors_by_category[category] = 0
+      self.errors_by_category[category] += 1
+
+  def PrintErrorCounts(self):
+    """Print a summary of errors by category, and the total."""
+    for category, count in self.errors_by_category.iteritems():
+      sys.stderr.write('Category \'%s\' errors found: %d\n' %
+                       (category, count))
+    sys.stderr.write('Total errors found: %d\n' % self.error_count)
+
# The single shared _CppLintState instance, accessed and mutated through
# the module-level helper functions below.
_cpplint_state = _CppLintState()
+
+
def _OutputFormat():
  """Gets the module's output format ('emacs' by default; see Error())."""
  return _cpplint_state.output_format
+
+
def _SetOutputFormat(output_format):
  """Sets the module's output format ('vs7' and 'eclipse' get special
  treatment in Error(); anything else uses the emacs-style layout)."""
  _cpplint_state.SetOutputFormat(output_format)
+
+
def _VerboseLevel():
  """Returns the module's verbosity setting (the minimum confidence an
  error needs in order to be reported; see _ShouldPrintError)."""
  return _cpplint_state.verbose_level
+
+
def _SetVerboseLevel(level):
  """Sets the module's verbosity, and returns the previous setting."""
  # Delegates to the shared _cpplint_state singleton.
  return _cpplint_state.SetVerboseLevel(level)
+
+
def _SetCountingStyle(level):
  """Sets the module's counting options ('total', 'toplevel' or 'detailed';
  see _CppLintState.IncrementErrorCount)."""
  _cpplint_state.SetCountingStyle(level)
+
+
def _Filters():
  """Returns the module's list of output filters, as a list."""
  # Each entry starts with '+' or '-' (validated in SetFilters).
  return _cpplint_state.filters
+
+
def _SetFilters(filters):
  """Sets the module's error-message filters.

  These filters are applied when deciding whether to emit a given
  error message.

  Args:
    filters: A string of comma-separated filters (eg "whitespace/indent").
             Each filter should start with + or -; else we die.

  Raises:
    ValueError: propagated from _CppLintState.SetFilters for a filter
      that does not start with '+' or '-'.
  """
  _cpplint_state.SetFilters(filters)
+
+
+class _FunctionState(object):
+  """Tracks current function name and the number of lines in its body."""
+
+  _NORMAL_TRIGGER = 250  # for --v=0, 500 for --v=1, etc.
+  _TEST_TRIGGER = 400    # about 50% more than _NORMAL_TRIGGER.
+
+  def __init__(self):
+    self.in_a_function = False
+    self.lines_in_function = 0
+    self.current_function = ''
+
+  def Begin(self, function_name):
+    """Start analyzing function body.
+
+    Args:
+      function_name: The name of the function being tracked.
+    """
+    self.in_a_function = True
+    self.lines_in_function = 0
+    self.current_function = function_name
+
+  def Count(self):
+    """Count line in current function body."""
+    if self.in_a_function:
+      self.lines_in_function += 1
+
+  def Check(self, error, filename, linenum):
+    """Report if too many lines in function body.
+
+    Args:
+      error: The function to call with any errors found.
+      filename: The name of the current file.
+      linenum: The number of the line to check.
+    """
+    if Match(r'T(EST|est)', self.current_function):
+      base_trigger = self._TEST_TRIGGER
+    else:
+      base_trigger = self._NORMAL_TRIGGER
+    trigger = base_trigger * 2**_VerboseLevel()
+
+    if self.lines_in_function > trigger:
+      error_level = int(math.log(self.lines_in_function / base_trigger, 2))
+      # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ...
+      if error_level > 5:
+        error_level = 5
+      error(filename, linenum, 'readability/fn_size', error_level,
+            'Small and focused functions are preferred:'
+            ' %s has %d non-comment lines'
+            ' (error triggered by exceeding %d lines).'  % (
+                self.current_function, self.lines_in_function, trigger))
+
+  def End(self):
+    """Stop analyzing function body."""
+    self.in_a_function = False
+
+
class _IncludeError(Exception):
  """Indicates a problem with the include order in a file."""
  # NOTE(review): nothing in this chunk raises this exception; presumably
  # the include-order checks elsewhere in the file do — confirm.
  pass
+
+
class FileInfo:
  """Provides utility functions for filenames.

  FileInfo provides easy access to the components of a file's path
  relative to the project root.
  """

  def __init__(self, filename):
    # May be relative; FullName() resolves it against the current directory.
    self._filename = filename

  def FullName(self):
    """Make Windows paths like Unix."""
    return os.path.abspath(self._filename).replace('\\', '/')

  def RepositoryName(self):
    """FullName after removing the local path to the repository.

    If we have a real absolute path name here we can try to do something smart:
    detecting the root of the checkout and truncating /path/to/checkout from
    the name so that we get header guards that don't include things like
    "C:\Documents and Settings\..." or "/home/username/..." in them and thus
    people on different computers who have checked the source out to different
    locations won't see bogus errors.
    """
    fullname = self.FullName()

    if os.path.exists(fullname):
      project_dir = os.path.dirname(fullname)

      if os.path.exists(os.path.join(project_dir, ".svn")):
        # If there's a .svn file in the current directory, we recursively look
        # up the directory tree for the top of the SVN checkout
        root_dir = project_dir
        one_up_dir = os.path.dirname(root_dir)
        while os.path.exists(os.path.join(one_up_dir, ".svn")):
          root_dir = os.path.dirname(root_dir)
          one_up_dir = os.path.dirname(one_up_dir)

        prefix = os.path.commonprefix([root_dir, project_dir])
        return fullname[len(prefix) + 1:]

      # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by
      # searching up from the current path.
      root_dir = os.path.dirname(fullname)
      # The walk stops at the filesystem root, where dirname(d) == d.
      while (root_dir != os.path.dirname(root_dir) and
             not os.path.exists(os.path.join(root_dir, ".git")) and
             not os.path.exists(os.path.join(root_dir, ".hg")) and
             not os.path.exists(os.path.join(root_dir, ".svn"))):
        root_dir = os.path.dirname(root_dir)

      if (os.path.exists(os.path.join(root_dir, ".git")) or
          os.path.exists(os.path.join(root_dir, ".hg")) or
          os.path.exists(os.path.join(root_dir, ".svn"))):
        prefix = os.path.commonprefix([root_dir, project_dir])
        return fullname[len(prefix) + 1:]

    # Don't know what to do; header guard warnings may be wrong...
    return fullname

  def Split(self):
    """Splits the file into the directory, basename, and extension.

    For 'chrome/browser/browser.cc', Split() would
    return ('chrome/browser', 'browser', '.cc')

    Returns:
      A tuple of (directory, basename, extension).
    """

    googlename = self.RepositoryName()
    project, rest = os.path.split(googlename)
    return (project,) + os.path.splitext(rest)

  def BaseName(self):
    """File base name - text after the final slash, before the final period."""
    return self.Split()[1]

  def Extension(self):
    """File extension - text following the final period."""
    return self.Split()[2]

  def NoExtension(self):
    """File has no source file extension."""
    return '/'.join(self.Split()[0:2])

  def IsSource(self):
    """File has a source file extension."""
    return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx')
+
+
def _ShouldPrintError(category, confidence, linenum):
  """If confidence >= verbose, category passes filter and is not suppressed."""

  # Three reasons to stay silent: a "NOLINT(category)" comment on the line,
  # a confidence below the current verbosity threshold, or a matching '-'
  # filter that is not overridden by a later '+' filter.
  if IsErrorSuppressedByNolint(category, linenum):
    return False
  if confidence < _cpplint_state.verbose_level:
    return False

  is_filtered = False
  for one_filter in _Filters():
    if one_filter.startswith('-'):
      if category.startswith(one_filter[1:]):
        is_filtered = True
    elif one_filter.startswith('+'):
      if category.startswith(one_filter[1:]):
        is_filtered = False
    else:
      # Filters are validated in SetFilters, so this should be unreachable.
      assert False  # should have been checked for in SetFilter.
  return not is_filtered
+
+
def Error(filename, linenum, category, confidence, message):
  """Reports a lint error, subject to filters and NOLINT suppressions.

  The error is only written (to stderr) if _ShouldPrintError allows it;
  in that case the module-wide error count is bumped as well.  The line
  layout depends on the configured output format.

  Args:
    filename: The name of the file containing the error.
    linenum: The number of the line containing the error.
    category: A string used to describe the "category" this bug
      falls under: "whitespace", say, or "runtime".  Categories
      may have a hierarchy separated by slashes: "whitespace/indent".
    confidence: A number from 1-5 representing a confidence score for
      the error, with 5 meaning that we are certain of the problem,
      and 1 meaning that it could be a legitimate construct.
    message: The error message.
  """
  if not _ShouldPrintError(category, confidence, linenum):
    return
  _cpplint_state.IncrementErrorCount(category)
  output_format = _cpplint_state.output_format
  if output_format == 'vs7':
    template = '%s(%s):  %s  [%s] [%d]\n'
  elif output_format == 'eclipse':
    template = '%s:%s: warning: %s  [%s] [%d]\n'
  else:
    template = '%s:%s:  %s  [%s] [%d]\n'
  sys.stderr.write(
      template % (filename, linenum, message, category, confidence))
+
+
# Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard.
_RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile(
    r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)')
# Matches strings.  Escape codes should already be removed by ESCAPES.
_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"')
# Matches characters.  Escape codes should already be removed by ESCAPES.
# (Only single-character literals like 'x' match here.)
_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'")
# Matches multi-line C++ comments.
# This RE is a little bit more complicated than one might expect, because we
# have to take care of space removals tools so we can handle comments inside
# statements better.
# The current rule is: We only clear spaces from both sides when we're at the
# end of the line. Otherwise, we try to remove spaces from the right side,
# if this doesn't work we try on left side but only if there's a non-character
# on the right.
# (These patterns are applied by CleanseComments and
# CleansedLines._CollapseStrings below.)
_RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile(
    r"""(\s*/\*.*\*/\s*$|
            /\*.*\*/\s+|
         \s+/\*.*\*/(?=\W)|
            /\*.*\*/)""", re.VERBOSE)
+
+
def IsCppString(line):
  """Does line terminate so, that the next symbol is in string constant.

  This function does not consider single-line nor multi-line comments.

  Args:
    line: is a partial line of code starting from the 0..n.

  Returns:
    True, if next character appended to 'line' is inside a
    string constant.
  """

  # Neutralize escaped backslashes so that \\" is not mistaken for \".
  line = line.replace(r'\\', 'XX')
  # Count the quotes that actually open or close a string: all double
  # quotes, minus escaped ones, minus character literals '"'.  An odd
  # count means the line ends inside a string.
  opening_quotes = (line.count('"') - line.count(r'\"') -
                    line.count("'\"'"))
  return opening_quotes % 2 == 1
+
+
def CleanseRawStrings(raw_lines):
  """Removes C++11 raw strings from lines.

    Before:
      static const char kData[] = R"(
          multi-line string
          )";

    After:
      static const char kData[] = ""
          (replaced by blank line)
          "";

  Args:
    raw_lines: list of raw lines.

  Returns:
    list of lines with C++11 raw strings replaced by empty strings.
  """

  # delimiter is non-None while we are inside a multi-line raw string; it
  # holds the exact closing sequence we are scanning for.
  delimiter = None
  lines_without_raw_strings = []
  for line in raw_lines:
    if delimiter:
      # Inside a raw string, look for the end
      end = line.find(delimiter)
      if end >= 0:
        # Found the end of the string, match leading space for this
        # line and resume copying the original lines, and also insert
        # a "" on the last line.
        leading_space = Match(r'^(\s*)\S', line)
        line = leading_space.group(1) + '""' + line[end + len(delimiter):]
        delimiter = None
      else:
        # Haven't found the end yet, append a blank line.
        line = ''

    else:
      # Look for beginning of a raw string.
      # See 2.14.15 [lex.string] for syntax.
      matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
      if matched:
        # The closing sequence is ')' + the delimiter tag + '"'.
        delimiter = ')' + matched.group(2) + '"'

        end = matched.group(3).find(delimiter)
        if end >= 0:
          # Raw string ended on same line
          line = (matched.group(1) + '""' +
                  matched.group(3)[end + len(delimiter):])
          delimiter = None
        else:
          # Start of a multi-line raw string
          line = matched.group(1) + '""'

    lines_without_raw_strings.append(line)

  # TODO(unknown): if delimiter is not None here, we might want to
  # emit a warning for unterminated string.
  return lines_without_raw_strings
+
+
def FindNextMultiLineCommentStart(lines, lineix):
  """Find the beginning marker for a multiline comment."""
  for ix in range(lineix, len(lines)):
    stripped = lines[ix].strip()
    # Only report a '/*' that is not closed on the same line.
    if stripped.startswith('/*') and stripped.find('*/', 2) < 0:
      return ix
  return len(lines)
+
+
def FindNextMultiLineCommentEnd(lines, lineix):
  """We are inside a comment, find the end marker."""
  for ix in range(lineix, len(lines)):
    if lines[ix].strip().endswith('*/'):
      return ix
  return len(lines)
+
+
def RemoveMultiLineCommentsFromRange(lines, begin, end):
  """Clears a range of lines for multi-line comments."""
  # Having // dummy comments makes the lines non-empty, so we will not get
  # unnecessary blank line warnings later in the code.
  lines[begin:end] = ['// dummy'] * (end - begin)
+
+
def RemoveMultiLineComments(filename, lines, error):
  """Removes multiline (c-style) comments from lines."""
  lineix = 0
  while lineix < len(lines):
    begin = FindNextMultiLineCommentStart(lines, lineix)
    if begin >= len(lines):
      return
    end = FindNextMultiLineCommentEnd(lines, begin)
    if end >= len(lines):
      # Comment opened but never closed before EOF.
      error(filename, begin + 1, 'readability/multiline_comment', 5,
            'Could not find end of multi-line comment')
      return
    RemoveMultiLineCommentsFromRange(lines, begin, end + 1)
    lineix = end + 1
+
+
def CleanseComments(line):
  """Removes //-comments and single-line C-style /* */ comments.

  Args:
    line: A line of C++ source.

  Returns:
    The line with single-line comments removed.
  """
  slash_pos = line.find('//')
  # Keep a '//' that sits inside a string literal.
  if slash_pos != -1 and not IsCppString(line[:slash_pos]):
    line = line[:slash_pos].rstrip()
  # get rid of /* ... */
  return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line)
+
+
class CleansedLines(object):
  """Holds 3 copies of all lines with different preprocessing applied to them.

  1) elided member contains lines without strings and comments,
  2) lines member contains lines without comments, and
  3) raw_lines member contains all the lines without processing.
  All these three members are of <type 'list'>, and of the same length.
  """

  def __init__(self, lines):
    # The member lists are parallel: index i always refers to raw line i.
    self.elided = []
    self.lines = []
    self.raw_lines = lines
    self.num_lines = len(lines)
    self.lines_without_raw_strings = CleanseRawStrings(lines)
    for linenum in range(len(self.lines_without_raw_strings)):
      self.lines.append(CleanseComments(
          self.lines_without_raw_strings[linenum]))
      elided = self._CollapseStrings(self.lines_without_raw_strings[linenum])
      self.elided.append(CleanseComments(elided))

  def NumLines(self):
    """Returns the number of lines represented."""
    return self.num_lines

  @staticmethod
  def _CollapseStrings(elided):
    """Collapses strings and chars on a line to simple "" or '' blocks.

    We nix strings first so we're not fooled by text like '"http://"'

    Args:
      elided: The line being processed.

    Returns:
      The line with collapsed strings.
    """
    # Lines matching _RE_PATTERN_INCLUDE (defined earlier in this file;
    # presumably #include directives — confirm) are left untouched so the
    # quoted header name survives.
    if not _RE_PATTERN_INCLUDE.match(elided):
      # Remove escaped characters first to make quote/single quote collapsing
      # basic.  Things that look like escaped characters shouldn't occur
      # outside of strings and chars.
      elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
      elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided)
      elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided)
    return elided
+
+
def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar):
  """Find the position just after the matching endchar.

  Args:
    line: a CleansedLines line.
    startpos: start searching at this position.
    depth: nesting level at startpos.
    startchar: expression opening character.
    endchar: expression closing character.

  Returns:
    On finding matching endchar: (index just after matching endchar, 0)
    Otherwise: (-1, new depth at end of this line)
  """
  # range rather than xrange: consistent with CleansedLines.__init__ above
  # and keeps the function working under Python 3 (same behavior on 2.x).
  for i in range(startpos, len(line)):
    if line[i] == startchar:
      depth += 1
    elif line[i] == endchar:
      depth -= 1
      if depth == 0:
        # Return the index just past the matching close character.
        return (i + 1, 0)
  return (-1, depth)
+
+
def CloseExpression(clean_lines, linenum, pos):
  """If input points to ( or { or [ or <, finds the position that closes it.

  If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the
  linenum/pos that correspond to the closing of the expression.

  Args:
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    pos: A position on the line.

  Returns:
    A tuple (line, linenum, pos) pointer *past* the closing brace, or
    (line, len(lines), -1) if we never find a close.  Note we ignore
    strings and comments when matching; and the line we return is the
    'cleansed' line at linenum.
  """

  line = clean_lines.elided[linenum]
  startchar = line[pos]
  matching = {'(': ')', '[': ']', '{': '}', '<': '>'}
  if startchar not in matching:
    return (line, clean_lines.NumLines(), -1)
  endchar = matching[startchar]

  # Try to close the expression on the starting line first.
  (end_pos, num_open) = FindEndOfExpressionInLine(
      line, pos, 0, startchar, endchar)
  if end_pos > -1:
    return (line, linenum, end_pos)

  # Otherwise keep scanning subsequent lines, carrying the open depth.
  while linenum < clean_lines.NumLines() - 1:
    linenum += 1
    line = clean_lines.elided[linenum]
    (end_pos, num_open) = FindEndOfExpressionInLine(
        line, 0, num_open, startchar, endchar)
    if end_pos > -1:
      return (line, linenum, end_pos)

  # Ran out of lines without finding the close character.
  return (line, clean_lines.NumLines(), -1)
+
+
def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar):
  """Find position at the matching startchar.

  This is almost the reverse of FindEndOfExpressionInLine, but note
  that the input position and returned position differs by 1.

  Args:
    line: a CleansedLines line.
    endpos: start searching at this position.
    depth: nesting level at endpos.
    startchar: expression opening character.
    endchar: expression closing character.

  Returns:
    On finding matching startchar: (index at matching startchar, 0)
    Otherwise: (-1, new depth at beginning of this line)
  """
  # range rather than xrange: consistent with CleansedLines.__init__ above
  # and keeps the function working under Python 3 (same behavior on 2.x).
  for i in range(endpos, -1, -1):
    if line[i] == endchar:
      depth += 1
    elif line[i] == startchar:
      depth -= 1
      if depth == 0:
        # Unlike FindEndOfExpressionInLine, return the index *at* the match.
        return (i, 0)
  return (-1, depth)
+
+
def ReverseCloseExpression(clean_lines, linenum, pos):
  """If input points to ) or } or ] or >, finds the position that opens it.

  If lines[linenum][pos] points to a ')' or '}' or ']' or '>', finds the
  linenum/pos that correspond to the opening of the expression.

  Args:
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    pos: A position on the line.

  Returns:
    A tuple (line, linenum, pos) pointer *at* the opening brace, or
    (line, 0, -1) if we never find the matching opening brace.  Note
    we ignore strings and comments when matching; and the line we
    return is the 'cleansed' line at linenum.
  """
  line = clean_lines.elided[linenum]
  endchar = line[pos]
  matching = {')': '(', ']': '[', '}': '{', '>': '<'}
  if endchar not in matching:
    return (line, 0, -1)
  startchar = matching[endchar]

  # Try to open the expression on the same line first.
  (start_pos, num_open) = FindStartOfExpressionInLine(
      line, pos, 0, startchar, endchar)
  if start_pos > -1:
    return (line, linenum, start_pos)

  # Otherwise keep scanning earlier lines, carrying the open depth.
  while linenum > 0:
    linenum -= 1
    line = clean_lines.elided[linenum]
    (start_pos, num_open) = FindStartOfExpressionInLine(
        line, len(line) - 1, num_open, startchar, endchar)
    if start_pos > -1:
      return (line, linenum, start_pos)

  # Reached the top of the file without finding the open character.
  return (line, 0, -1)
+
+
def CheckForCopyright(filename, lines, error):
  """Logs an error if no Copyright message appears at the top of the file.

  Args:
    filename: The name of the current file.
    lines: An array of strings, each representing a line of the file
           (lines[0] is a dummy entry).
    error: The function to call with any errors found.
  """

  # We'll say it should occur by line 10. Don't forget there's a
  # dummy line at the front.  `range` (rather than the Python-2-only
  # `xrange`) keeps this working under both Python 2 and 3; the bound
  # is tiny, so materializing the list under Python 2 is harmless.
  for line in range(1, min(len(lines), 11)):
    if re.search(r'Copyright', lines[line], re.I): break
  else:                       # means no copyright line was found
    error(filename, 0, 'legal/copyright', 5,
          'No copyright message found.  '
          'You should have a line: "Copyright [year] <Copyright Owner>"')
+
+
def GetHeaderGuardCPPVariable(filename):
  """Returns the CPP variable that should be used as a header guard.

  Args:
    filename: The name of a C++ header file.

  Returns:
    The CPP variable that should be used as a header guard in the
    named file.

  """

  # Restores original filename in case that cpplint is invoked from Emacs's
  # flymake.
  filename = re.sub(r'_flymake\.h$', '.h', filename)
  filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename)

  fileinfo = FileInfo(filename)
  file_path_from_root = fileinfo.RepositoryName()
  if _root:
    # re.escape is required: _root may contain regex metacharacters, and
    # os.sep is '\\' on Windows, which would otherwise corrupt the pattern.
    file_path_from_root = re.sub('^' + re.escape(_root + os.sep), '',
                                 file_path_from_root)
  return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
+
+
def CheckForHeaderGuard(filename, lines, error):
  """Checks that the file contains a header guard.

  Logs an error if no #ifndef header guard is present.  For other
  headers, checks that the full pathname is used.

  Args:
    filename: The name of the C++ header file.
    lines: An array of strings, each representing a line of the file.
    error: The function to call with any errors found.
  """

  cppvar = GetHeaderGuardCPPVariable(filename)

  # First #ifndef arg, its line, first #define arg, and the full text and
  # line number of the *last* #endif.
  ifndef = None
  ifndef_linenum = 0
  define = None
  endif = None
  endif_linenum = 0
  for linenum, line in enumerate(lines):
    # Already been well guarded, no need for further checking.
    if line.strip() == "#pragma once":
        return
    linesplit = line.split()
    if len(linesplit) >= 2:
      # find the first occurrence of #ifndef and #define, save arg
      if not ifndef and linesplit[0] == '#ifndef':
        # set ifndef to the header guard presented on the #ifndef line.
        ifndef = linesplit[1]
        ifndef_linenum = linenum
      if not define and linesplit[0] == '#define':
        define = linesplit[1]
    # find the last occurrence of #endif, save entire line
    if line.startswith('#endif'):
      endif = line
      endif_linenum = linenum

  if not ifndef:
    error(filename, 0, 'build/header_guard', 5,
          'No #ifndef header guard found, suggested CPP variable is: %s' %
          cppvar)
    return

  if not define:
    error(filename, 0, 'build/header_guard', 5,
          'No #define header guard found, suggested CPP variable is: %s' %
          cppvar)
    return

  # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__
  # for backward compatibility.  An exact-style miss is severity 5; the
  # legacy double-underscore variant is reported at level 0.
  if ifndef != cppvar:
    error_level = 0
    if ifndef != cppvar + '_':
      error_level = 5

    # Re-scan the guard line for NOLINT markers so a suppression there can
    # silence the report below (presumably -- ParseNolintSuppressions is
    # defined elsewhere in this file).
    ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum,
                            error)
    error(filename, ifndef_linenum, 'build/header_guard', error_level,
          '#ifndef header guard has wrong style, please use: %s' % cppvar)

  if define != ifndef:
    error(filename, 0, 'build/header_guard', 5,
          '#ifndef and #define don\'t match, suggested CPP variable is: %s' %
          cppvar)
    return

  # Same two-tier severity scheme for the trailing #endif comment.
  if endif != ('#endif  // %s' % cppvar):
    error_level = 0
    if endif != ('#endif  // %s' % (cppvar + '_')):
      error_level = 5

    ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum,
                            error)
    error(filename, endif_linenum, 'build/header_guard', error_level,
          '#endif line should be "#endif  // %s"' % cppvar)
+
+
def CheckForBadCharacters(filename, lines, error):
  """Logs an error for each line containing bad characters.

  Two kinds of bad characters are detected:

  1. Unicode replacement characters: these mean the file held invalid
  UTF-8 (likely) or literal replacement characters (which it shouldn't).
  Note that invalid UTF-8 adjacent to a newline can throw off the line
  numbering.

  2. NUL bytes, which are problematic for some tools.

  Args:
    filename: The name of the current file.
    lines: An array of strings, each representing a line of the file.
    error: The function to call with any errors found.
  """
  for line_number, text in enumerate(lines):
    if u'\ufffd' in text:
      error(filename, line_number, 'readability/utf8', 5,
            'Line contains invalid UTF-8 (or Unicode replacement character).')
    if '\0' in text:
      error(filename, line_number, 'readability/nul', 5, 'Line contains NUL byte.')
+
+
def CheckForNewlineAtEOF(filename, lines, error):
  """Logs an error if there is no newline char at the end of the file.

  Args:
    filename: The name of the current file.
    lines: An array of strings, each representing a line of the file.
    error: The function to call with any errors found.
  """

  # lines was built by appending two newlines to the file text and then
  # splitting on '\n'; a file that properly ends in '\n' therefore yields
  # an empty second-to-last element.  Anything else means the final line
  # lacked its newline.
  missing_final_newline = len(lines) < 3 or lines[-2]
  if missing_final_newline:
    error(filename, len(lines) - 2, 'whitespace/ending_newline', 5,
          'Could not find a newline character at the end of the file.')
+
+
def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error):
  """Logs an error if we see /* ... */ or "..." that extend past one line.

  /* ... */ comments are legit inside macros, for one line.  Otherwise we
  prefer // comments, so it is fine to warn about the former.  Likewise,
  strings may span lines when each line ends in a backslash continuation;
  that is legal but ugly, and this lint program handles neither construct
  well, so both are flagged.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  text = clean_lines.elided[linenum]

  # Drop escaped backslashes up front: '\\\\' is harmless in itself but
  # would make the '\"' counting below misfire.
  text = text.replace('\\\\', '')

  has_unclosed_comment = text.count('/*') > text.count('*/')
  if has_unclosed_comment:
    error(filename, linenum, 'readability/multiline_comment', 5,
          'Complex multi-line /*...*/-style comment found. '
          'Lint may give bogus warnings.  '
          'Consider replacing these with //-style comments, '
          'with #if 0...#endif, '
          'or with more clearly structured multi-line comments.')

  has_unpaired_quote = (text.count('"') - text.count('\\"')) % 2
  if has_unpaired_quote:
    error(filename, linenum, 'readability/multiline_string', 5,
          'Multi-line string ("...") found.  This lint script doesn\'t '
          'do well with such strings, and may give bogus warnings.  '
          'Use C++11 raw strings or concatenation instead.')
+
+
# Pairs of (thread-unsafe posix function, suggested *_r replacement).  Each
# entry includes the opening parenthesis so that only call-like occurrences
# are matched by CheckPosixThreading below.
threading_list = (
    ('asctime(', 'asctime_r('),
    ('ctime(', 'ctime_r('),
    ('getgrgid(', 'getgrgid_r('),
    ('getgrnam(', 'getgrnam_r('),
    ('getlogin(', 'getlogin_r('),
    ('getpwnam(', 'getpwnam_r('),
    ('getpwuid(', 'getpwuid_r('),
    ('gmtime(', 'gmtime_r('),
    ('localtime(', 'localtime_r('),
    ('rand(', 'rand_r('),
    ('strtok(', 'strtok_r('),
    ('ttyname(', 'ttyname_r('),
    )
+
+
def CheckPosixThreading(filename, clean_lines, linenum, error):
  """Checks for calls to thread-unsafe functions.

  Much code was originally written without multi-threading in mind, and
  engineers often rely on pre-threading posix habits.  These checks steer
  them toward the thread-safe *_r variants when using posix directly.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  text = clean_lines.elided[linenum]
  for unsafe_call, safe_call in threading_list:
    found_at = text.find(unsafe_call)
    if found_at < 0:
      continue
    # Only flag genuine call sites: the match may not be preceded by an
    # identifier character, '_', '.', or '>' (i.e. not part of a longer
    # name or a member/pointer access).
    # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
    preceding_ok = (found_at == 0 or
                    (not text[found_at - 1].isalnum() and
                     text[found_at - 1] not in ('_', '.', '>')))
    if preceding_ok:
      error(filename, linenum, 'runtime/threadsafe_fn', 2,
            'Consider using ' + safe_call +
            '...) instead of ' + unsafe_call +
            '...) for improved thread safety.')
+
+
def CheckVlogArguments(filename, clean_lines, linenum, error):
  """Checks that VLOG() is only used for defining a logging level.

  VLOG(2) is correct; VLOG(INFO), VLOG(WARNING), VLOG(ERROR), and
  VLOG(FATAL) are not.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  text = clean_lines.elided[linenum]
  severity_in_vlog = Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)',
                            text)
  if severity_in_vlog:
    error(filename, linenum, 'runtime/vlog', 5,
          'VLOG() should be used with numeric verbosity level.  '
          'Use LOG() if you want symbolic severity levels.')
+
+
# Matches invalid increment: *count++, which moves pointer instead of
# incrementing a value.  The statement must start the line (modulo leading
# whitespace) and end with ';', e.g. "  *count++;".
_RE_PATTERN_INVALID_INCREMENT = re.compile(
    r'^\s*\*\w+(\+\+|--);')
+
+
def CheckInvalidIncrement(filename, clean_lines, linenum, error):
  """Flags statements like *count++ that move a pointer instead of a value.

  A function such as:
    void increment_counter(int* count) {
      *count++;
    }
  is broken: *count++ parses as *(count++), so the pointer advances and the
  pointed-to value is untouched.  Correct spellings are ++*count,
  (*count)++ or *count += 1.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  text = clean_lines.elided[linenum]
  if _RE_PATTERN_INVALID_INCREMENT.match(text):
    error(filename, linenum, 'runtime/invalid_increment', 5,
          'Changing pointer instead of value (or unused value of operator*).')
+
+
class _BlockInfo(object):
  """Stores information about a generic block of code."""

  def __init__(self, seen_open_brace):
    # True once the opening '{' of this block has been consumed.
    self.seen_open_brace = seen_open_brace
    # Running count of '(' minus ')' seen inside this block.
    self.open_parentheses = 0
    # Inline-assembly state; one of the _NO_ASM/_INSIDE_ASM/_END_ASM/
    # _BLOCK_ASM values used by _NestingState.Update.
    self.inline_asm = _NO_ASM

  def CheckBegin(self, filename, clean_lines, linenum, error):
    """Run checks that applies to text up to the opening brace.

    This is mostly for checking the text after the class identifier
    and the "{", usually where the base class is specified.  For other
    blocks, there isn't much to check, so we always pass.

    Args:
      filename: The name of the current file.
      clean_lines: A CleansedLines instance containing the file.
      linenum: The number of the line to check.
      error: The function to call with any errors found.
    """
    pass

  def CheckEnd(self, filename, clean_lines, linenum, error):
    """Run checks that applies to text after the closing brace.

    This is mostly used for checking end of namespace comments.

    Args:
      filename: The name of the current file.
      clean_lines: A CleansedLines instance containing the file.
      linenum: The number of the line to check.
      error: The function to call with any errors found.
    """
    pass
+
+
class _ClassInfo(_BlockInfo):
  """Stores information about a class."""

  def __init__(self, name, class_or_struct, clean_lines, linenum):
    _BlockInfo.__init__(self, False)
    self.name = name
    self.starting_linenum = linenum
    self.is_derived = False
    # Structs default to public access, classes to private.
    self.is_struct = (class_or_struct == 'struct')
    self.access = 'public' if self.is_struct else 'private'

    # Remember initial indentation level for this class.  Using raw_lines here
    # instead of elided to account for leading comments.
    initial_indent = Match(r'^( *)\S', clean_lines.raw_lines[linenum])
    self.class_indent = len(initial_indent.group(1)) if initial_indent else 0

    # Scan forward for the line where the brace balance first returns to
    # zero; that is our guess at the class's closing line.  Constructs like
    #   class A {
    #   } *x = { ...
    # fool this, but it is still good enough for CheckSectionSpacing.
    self.last_line = 0
    brace_depth = 0
    for candidate in range(linenum, clean_lines.NumLines()):
      text = clean_lines.elided[candidate]
      brace_depth += text.count('{') - text.count('}')
      if not brace_depth:
        self.last_line = candidate
        break

  def CheckBegin(self, filename, clean_lines, linenum, error):
    # A bare ':' (not part of '::') after the class name signals a
    # base-class list.
    if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]):
      self.is_derived = True

  def CheckEnd(self, filename, clean_lines, linenum, error):
    # Warn when the closing brace's indentation differs from the class's
    # own.  Only braces preceded solely by spaces are considered, which
    # naturally skips single-line class definitions.
    indent = Match(r'^( *)\}', clean_lines.elided[linenum])
    if indent and len(indent.group(1)) != self.class_indent:
      parent = ('struct ' if self.is_struct else 'class ') + self.name
      error(filename, linenum, 'whitespace/indent', 3,
            'Closing brace should be aligned with beginning of %s' % parent)
+
+
class _NamespaceInfo(_BlockInfo):
  """Stores information about a namespace."""

  def __init__(self, name, linenum):
    _BlockInfo.__init__(self, False)
    self.name = name or ''          # empty string for anonymous namespaces
    self.starting_linenum = linenum

  def CheckEnd(self, filename, clean_lines, linenum, error):
    """Check end of namespace comments."""
    line = clean_lines.raw_lines[linenum]

    # Short namespaces (fewer than ten enclosed lines) are exempt from
    # *requiring* a terminating comment, but if one is already present we
    # still validate it.
    #
    # TODO(unknown): Size is a crude trigger; ideally a short namespace
    # holding nontrivial content (anything beyond forward declarations)
    # would also be checked, but there is currently no logic to classify
    # the content.
    if (linenum - self.starting_linenum < 10
        and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)):
      return

    # Both "//" and "/* */" terminators are accepted (the C style form lets
    # namespaces closed inside preprocessor macros stay cpplint clean), and
    # a trailing period like "// end of namespace foo." is tolerated.
    # Nothing looser is accepted, so that an existing comment that is merely
    # a substring of the expected namespace cannot slip through.
    if self.name:
      # Named namespace.
      expected = (r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) +
                  r'[\*/\.\\\s]*$')
      if not Match(expected, line):
        error(filename, linenum, 'readability/namespace', 5,
              'Namespace should be terminated with "// namespace %s"' %
              self.name)
    else:
      # Anonymous namespace.
      if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
        error(filename, linenum, 'readability/namespace', 5,
              'Namespace should be terminated with "// namespace"')
+
+
class _PreprocessorInfo(object):
  """Stores checkpoints of nesting stacks when #if/#else is seen.

  _NestingState pushes one of these for every open preprocessor
  conditional so the brace-nesting stack can be restored at #else/#endif.
  """

  def __init__(self, stack_before_if):
    # The entire nesting stack before #if
    self.stack_before_if = stack_before_if

    # The entire nesting stack up to #else
    self.stack_before_else = []

    # Whether we have already seen #else or #elif
    self.seen_else = False
+
+
class _NestingState(object):
  """Holds states related to parsing braces.

  Maintains a stack of open blocks (classes, namespaces, generic braces)
  plus a parallel stack of preprocessor-conditional checkpoints, updated
  one line at a time via Update().
  """

  def __init__(self):
    # Stack for tracking all braces.  An object is pushed whenever we
    # see a "{", and popped when we see a "}".  Only 3 types of
    # objects are possible:
    # - _ClassInfo: a class or struct.
    # - _NamespaceInfo: a namespace.
    # - _BlockInfo: some other type of block.
    self.stack = []

    # Stack of _PreprocessorInfo objects.
    self.pp_stack = []

  def SeenOpenBrace(self):
    """Check if we have seen the opening brace for the innermost block.

    Returns:
      True if we have seen the opening brace, False if the innermost
      block is still expecting an opening brace.
    """
    return (not self.stack) or self.stack[-1].seen_open_brace

  def InNamespaceBody(self):
    """Check if we are currently one level inside a namespace body.

    Returns:
      True if top of the stack is a namespace block, False otherwise.
    """
    return self.stack and isinstance(self.stack[-1], _NamespaceInfo)

  def UpdatePreprocessor(self, line):
    """Update preprocessor stack.

    We need to handle preprocessors due to classes like this:
      #ifdef SWIG
      struct ResultDetailsPageElementExtensionPoint {
      #else
      struct ResultDetailsPageElementExtensionPoint : public Extension {
      #endif

    We make the following assumptions (good enough for most files):
    - Preprocessor condition evaluates to true from #if up to first
      #else/#elif/#endif.

    - Preprocessor condition evaluates to false from #else/#elif up
      to #endif.  We still perform lint checks on these lines, but
      these do not affect nesting stack.

    Args:
      line: current line to check.
    """
    if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line):
      # Beginning of #if block, save the nesting stack here.  The saved
      # stack will allow us to restore the parsing state in the #else case.
      self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack)))
    elif Match(r'^\s*#\s*(else|elif)\b', line):
      # Beginning of #else block
      if self.pp_stack:
        if not self.pp_stack[-1].seen_else:
          # This is the first #else or #elif block.  Remember the
          # whole nesting stack up to this point.  This is what we
          # keep after the #endif.
          self.pp_stack[-1].seen_else = True
          self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack)

        # Restore the stack to how it was before the #if
        self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if)
      else:
        # TODO(unknown): unexpected #else, issue warning?
        pass
    elif Match(r'^\s*#\s*endif\b', line):
      # End of #if or #else blocks.
      if self.pp_stack:
        # If we saw an #else, we will need to restore the nesting
        # stack to its former state before the #else, otherwise we
        # will just continue from where we left off.
        if self.pp_stack[-1].seen_else:
          # Here we can just use a shallow copy since we are the last
          # reference to it.
          self.stack = self.pp_stack[-1].stack_before_else
        # Drop the corresponding #if
        self.pp_stack.pop()
      else:
        # TODO(unknown): unexpected #endif, issue warning?
        pass

  def Update(self, filename, clean_lines, linenum, error):
    """Update nesting state with current line.

    Args:
      filename: The name of the current file.
      clean_lines: A CleansedLines instance containing the file.
      linenum: The number of the line to check.
      error: The function to call with any errors found.
    """
    line = clean_lines.elided[linenum]

    # Update pp_stack first
    self.UpdatePreprocessor(line)

    # Count parentheses.  This is to avoid adding struct arguments to
    # the nesting stack.
    if self.stack:
      inner_block = self.stack[-1]
      depth_change = line.count('(') - line.count(')')
      inner_block.open_parentheses += depth_change

      # Also check if we are starting or ending an inline assembly block.
      if inner_block.inline_asm in (_NO_ASM, _END_ASM):
        if (depth_change != 0 and
            inner_block.open_parentheses == 1 and
            _MATCH_ASM.match(line)):
          # Enter assembly block
          inner_block.inline_asm = _INSIDE_ASM
        else:
          # Not entering assembly block.  If previous line was _END_ASM,
          # we will now shift to _NO_ASM state.
          inner_block.inline_asm = _NO_ASM
      elif (inner_block.inline_asm == _INSIDE_ASM and
            inner_block.open_parentheses == 0):
        # Exit assembly block
        inner_block.inline_asm = _END_ASM

    # Consume namespace declaration at the beginning of the line.  Do
    # this in a loop so that we catch same line declarations like this:
    #   namespace proto2 { namespace bridge { class MessageSet; } }
    while True:
      # Match start of namespace.  The "\b\s*" below catches namespace
      # declarations even if it weren't followed by a whitespace, this
      # is so that we don't confuse our namespace checker.  The
      # missing spaces will be flagged by CheckSpacing.
      namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line)
      if not namespace_decl_match:
        break

      new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum)
      self.stack.append(new_namespace)

      line = namespace_decl_match.group(2)
      if line.find('{') != -1:
        new_namespace.seen_open_brace = True
        line = line[line.find('{') + 1:]

    # Look for a class declaration in whatever is left of the line
    # after parsing namespaces.  The regexp accounts for decorated classes
    # such as in:
    #   class LOCKABLE API Object {
    #   };
    #
    # Templates with class arguments may confuse the parser, for example:
    #   template <class T
    #             class Comparator = less<T>,
    #             class Vector = vector<T> >
    #   class HeapQueue {
    #
    # Because this parser has no nesting state about templates, by the
    # time it saw "class Comparator", it may think that it's a new class.
    # Nested templates have a similar problem:
    #   template <
    #       typename ExportedType,
    #       typename TupleType,
    #       template <typename, typename> class ImplTemplate>
    #
    # To avoid these cases, we ignore classes that are followed by '=' or '>'
    class_decl_match = Match(
        r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
        r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)'
        r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line)
    if (class_decl_match and
        (not self.stack or self.stack[-1].open_parentheses == 0)):
      self.stack.append(_ClassInfo(
          class_decl_match.group(4), class_decl_match.group(2),
          clean_lines, linenum))
      line = class_decl_match.group(5)

    # If we have not yet seen the opening brace for the innermost block,
    # run checks here.
    if not self.SeenOpenBrace():
      self.stack[-1].CheckBegin(filename, clean_lines, linenum, error)

    # Update access control if we are inside a class/struct
    if self.stack and isinstance(self.stack[-1], _ClassInfo):
      classinfo = self.stack[-1]
      access_match = Match(
          r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?'
          r':(?:[^:]|$)',
          line)
      if access_match:
        classinfo.access = access_match.group(2)

        # Check that access keywords are indented +1 space.  Skip this
        # check if the keywords are not preceded by whitespaces.
        indent = access_match.group(1)
        if (len(indent) != classinfo.class_indent + 1 and
            Match(r'^\s*$', indent)):
          if classinfo.is_struct:
            parent = 'struct ' + classinfo.name
          else:
            parent = 'class ' + classinfo.name
          slots = ''
          if access_match.group(3):
            slots = access_match.group(3)
          error(filename, linenum, 'whitespace/indent', 3,
                '%s%s: should be indented +1 space inside %s' % (
                    access_match.group(2), slots, parent))

    # Consume braces or semicolons from what's left of the line
    while True:
      # Match first brace, semicolon, or closed parenthesis.
      matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line)
      if not matched:
        break

      token = matched.group(1)
      if token == '{':
        # If namespace or class hasn't seen a opening brace yet, mark
        # namespace/class head as complete.  Push a new block onto the
        # stack otherwise.
        if not self.SeenOpenBrace():
          self.stack[-1].seen_open_brace = True
        else:
          self.stack.append(_BlockInfo(True))
          if _MATCH_ASM.match(line):
            self.stack[-1].inline_asm = _BLOCK_ASM
      elif token == ';' or token == ')':
        # If we haven't seen an opening brace yet, but we already saw
        # a semicolon, this is probably a forward declaration.  Pop
        # the stack for these.
        #
        # Similarly, if we haven't seen an opening brace yet, but we
        # already saw a closing parenthesis, then these are probably
        # function arguments with extra "class" or "struct" keywords.
        # Also pop these stack for these.
        if not self.SeenOpenBrace():
          self.stack.pop()
      else:  # token == '}'
        # Perform end of block checks and pop the stack.
        if self.stack:
          self.stack[-1].CheckEnd(filename, clean_lines, linenum, error)
          self.stack.pop()
      # Keep scanning whatever follows the consumed token.
      line = matched.group(2)

  def InnermostClass(self):
    """Get class info on the top of the stack.

    Returns:
      A _ClassInfo object if we are inside a class, or None otherwise.
    """
    for i in range(len(self.stack), 0, -1):
      classinfo = self.stack[i - 1]
      if isinstance(classinfo, _ClassInfo):
        return classinfo
    return None

  def CheckCompletedBlocks(self, filename, error):
    """Checks that all classes and namespaces have been completely parsed.

    Call this when all lines in a file have been processed.
    Args:
      filename: The name of the current file.
      error: The function to call with any errors found.
    """
    # Note: This test can result in false positives if #ifdef constructs
    # get in the way of brace matching. See the testBuildClass test in
    # cpplint_unittest.py for an example of this.
    for obj in self.stack:
      if isinstance(obj, _ClassInfo):
        error(filename, obj.starting_linenum, 'build/class', 5,
              'Failed to find complete declaration of class %s' %
              obj.name)
      elif isinstance(obj, _NamespaceInfo):
        error(filename, obj.starting_linenum, 'build/namespaces', 5,
              'Failed to find complete declaration of namespace %s' %
              obj.name)
+
+
def CheckForNonStandardConstructs(filename, clean_lines, linenum,
                                  nesting_state, error):
  r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2.

  Complain about several constructs which gcc-2 accepts, but which are
  not standard C++.  Warning about these in lint is one way to ease the
  transition to new compilers.
  - put storage class first (e.g. "static const" instead of "const static").
  - "%lld" instead of "%qd" in printf-type functions.
  - "%1$d" is non-standard in printf-type functions.
  - "\%" is an undefined character escape sequence.
  - text after #endif is not allowed.
  - invalid inner-style forward declaration.
  - >? and <? operators, and their >?= and <?= cousins.

  Additionally, check for constructor/destructor style violations and reference
  members, as it is very convenient to do so while checking for
  gcc-2 compliance.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    nesting_state: A _NestingState instance which maintains information about
                   the current stack of nested blocks being parsed.
    error: A callable to which errors are reported, which takes 5 arguments:
           filename, line number, error category, confidence level, and message
  """

  # Remove comments from the line, but leave in strings for now.
  line = clean_lines.lines[linenum]

  if Search(r'printf\s*\(.*".*%[-+ ]?\d*q', line):
    error(filename, linenum, 'runtime/printf_format', 3,
          '%q in format strings is deprecated.  Use %ll instead.')

  if Search(r'printf\s*\(.*".*%\d+\$', line):
    error(filename, linenum, 'runtime/printf_format', 2,
          '%N$ formats are unconventional.  Try rewriting to avoid them.')

  # Remove escaped backslashes before looking for undefined escapes.
  line = line.replace('\\\\', '')

  if Search(r'("|\').*\\(%|\[|\(|{)', line):
    error(filename, linenum, 'build/printf_format', 3,
          '%, [, (, and { are undefined character escapes.  Unescape them.')

  # For the rest, work with both comments and strings removed.
  line = clean_lines.elided[linenum]

  if Search(r'\b(const|volatile|void|char|short|int|long'
            r'|float|double|signed|unsigned'
            r'|schar|u?int8|u?int16|u?int32|u?int64)'
            r'\s+(register|static|extern|typedef)\b',
            line):
    error(filename, linenum, 'build/storage_class', 5,
          'Storage class (static, extern, typedef, etc) should be first.')

  if Match(r'\s*#\s*endif\s*[^/\s]+', line):
    error(filename, linenum, 'build/endif_comment', 5,
          'Uncommented text after #endif is non-standard.  Use a comment.')

  if Match(r'\s*class\s+(\w+\s*::\s*)+\w+\s*;', line):
    error(filename, linenum, 'build/forward_decl', 5,
          'Inner-style forward declarations are invalid.  Remove this line.')

  if Search(r'(\w+|[+-]?\d+(\.\d*)?)\s*(<|>)\?=?\s*(\w+|[+-]?\d+)(\.\d*)?',
            line):
    error(filename, linenum, 'build/deprecated', 3,
          '>? and <? (max and min) operators are non-standard and deprecated.')

  if Search(r'^\s*const\s*string\s*&\s*\w+\s*;', line):
    # TODO(unknown): Could it be expanded safely to arbitrary references,
    # without triggering too many false positives? The first
    # attempt triggered 5 warnings for mostly benign code in the regtest, hence
    # the restriction.
    # Here's the original regexp, for the reference:
    # type_name = r'\w+((\s*::\s*\w+)|(\s*<\s*\w+?\s*>))?'
    # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;'
    error(filename, linenum, 'runtime/member_string_references', 2,
          'const string& members are dangerous. It is much better to use '
          'alternatives, such as pointers or simple constants.')

  # Everything else in this function operates on class declarations.
  # Return early if the top of the nesting stack is not a class, or if
  # the class head is not completed yet.
  classinfo = nesting_state.InnermostClass()
  if not classinfo or not classinfo.seen_open_brace:
    return

  # The class may have been declared with namespace or classname qualifiers.
  # The constructor and destructor will not have those qualifiers.
  base_classname = classinfo.name.split('::')[-1]

  # Look for single-argument constructors that aren't marked explicit.
  # Technically a valid construct, but against style.  The second Match
  # below excludes the copy constructor (a reference to the class itself),
  # which must stay implicit.
  args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)'
               % re.escape(base_classname),
               line)
  if (args and
      args.group(1) != 'void' and
      not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&'
                % re.escape(base_classname), args.group(1).strip())):
    error(filename, linenum, 'runtime/explicit', 5,
          'Single-argument constructors should be marked explicit.')
+
+
+def CheckSpacingForFunctionCall(filename, line, linenum, error):
+  """Checks for the correctness of various spacing around function calls.
+
+  NOTE(review): the visible caller (CheckSpacing) passes the *elided* line
+  (comments and strings already removed) -- confirm other callers do too.
+
+  Args:
+    filename: The name of the current file.
+    line: The text of the line to check.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  # Since function calls often occur inside if/for/while/switch
+  # expressions - which have their own, more liberal conventions - we
+  # first see if we should be looking inside such an expression for a
+  # function call, to which we can apply more strict standards.
+  fncall = line    # if there's no control flow construct, look at whole line
+  for pattern in (r'\bif\s*\((.*)\)\s*{',
+                  r'\bfor\s*\((.*)\)\s*{',
+                  r'\bwhile\s*\((.*)\)\s*[{;]',
+                  r'\bswitch\s*\((.*)\)\s*{'):
+    match = Search(pattern, line)
+    if match:
+      fncall = match.group(1)    # look inside the parens for function calls
+      break
+
+  # Except in if/for/while/switch, there should never be space
+  # immediately inside parens (eg "f( 3, 4 )").  We make an exception
+  # for nested parens ( (a+b) + c ).  Likewise, there should never be
+  # a space before a ( when it's a function argument.  I assume it's a
+  # function argument when the char before the whitespace is legal in
+  # a function name (alnum + _) and we're not starting a macro. Also ignore
+  # pointers and references to arrays and functions coz they're too tricky:
+  # we use a very simple way to recognize these:
+  # " (something)(maybe-something)" or
+  # " (something)(maybe-something," or
+  # " (something)[something]"
+  # Note that we assume the contents of [] to be short enough that
+  # they'll never need to wrap.
+  if (  # Ignore control structures.
+      not Search(r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b',
+                 fncall) and
+      # Ignore pointers/references to functions.
+      not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and
+      # Ignore pointers/references to arrays.
+      not Search(r' \([^)]+\)\[[^\]]+\]', fncall)):
+    if Search(r'\w\s*\(\s(?!\s*\\$)', fncall):      # a ( used for a fn call
+      error(filename, linenum, 'whitespace/parens', 4,
+            'Extra space after ( in function call')
+    # "( " not followed by a line-continuation backslash or a nested "(".
+    elif Search(r'\(\s+(?!(\s*\\)|\()', fncall):
+      error(filename, linenum, 'whitespace/parens', 2,
+            'Extra space after (')
+    # "name (" is a call with a stray space -- unless it is a #define,
+    # a typedef, or a pointer-to-member-function declaration.
+    if (Search(r'\w\s+\(', fncall) and
+        not Search(r'#\s*define|typedef', fncall) and
+        not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)):
+      error(filename, linenum, 'whitespace/parens', 4,
+            'Extra space before ( in function call')
+    # If the ) is followed only by a newline or a { + newline, assume it's
+    # part of a control statement (if/while/etc), and don't complain
+    if Search(r'[^)]\s+\)\s*[^{\s]', fncall):
+      # If the closing parenthesis is preceded by only whitespaces,
+      # try to give a more descriptive error message.
+      if Search(r'^\s+\)', fncall):
+        error(filename, linenum, 'whitespace/parens', 2,
+              'Closing ) should be moved to the previous line')
+      else:
+        error(filename, linenum, 'whitespace/parens', 2,
+              'Extra space before )')
+
+
+def IsBlankLine(line):
+  """Returns true if the given line is blank.
+
+  We consider a line to be blank if the line is empty or consists of
+  only white spaces.
+
+  Args:
+    line: A line of a string.
+
+  Returns:
+    True, if the given line is blank.
+  """
+  # ''.isspace() is False, so the empty string must be handled explicitly.
+  return not line or line.isspace()
+
+
+def CheckForFunctionLengths(filename, clean_lines, linenum,
+                            function_state, error):
+  """Reports for long function bodies.
+
+  For an overview why this is done, see:
+  http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions
+
+  Uses a simplistic algorithm assuming other style guidelines
+  (especially spacing) are followed.
+  Only checks unindented functions, so class members are unchecked.
+  Trivial bodies are unchecked, so constructors with huge initializer lists
+  may be missed.
+  Blank/comment lines are not counted so as to avoid encouraging the removal
+  of vertical space and comments just to get through a lint check.
+  NOLINT *on the last line of a function* disables this check.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    function_state: Current function name and lines in body so far.
+    error: The function to call with any errors found.
+  """
+  lines = clean_lines.lines
+  line = lines[linenum]
+  raw = clean_lines.raw_lines
+  raw_line = raw[linenum]
+  joined_line = ''
+
+  starting_func = False
+  regexp = r'(\w(\w|::|\*|\&|\s)*)\('  # decls * & space::name( ...
+  match_result = Match(regexp, line)
+  if match_result:
+    # If the name is all caps and underscores, figure it's a macro and
+    # ignore it, unless it's TEST or TEST_F.
+    function_name = match_result.group(1).split()[-1]
+    if function_name == 'TEST' or function_name == 'TEST_F' or (
+        not Match(r'[A-Z_]+$', function_name)):
+      starting_func = True
+
+  if starting_func:
+    body_found = False
+    # Scan forward from the declaration for the opening brace or a
+    # terminating ';'/'}'.  (xrange: this file targets Python 2.)
+    for start_linenum in xrange(linenum, clean_lines.NumLines()):
+      start_line = lines[start_linenum]
+      joined_line += ' ' + start_line.lstrip()
+      if Search(r'(;|})', start_line):  # Declarations and trivial functions
+        body_found = True
+        break                              # ... ignore
+      elif Search(r'{', start_line):
+        body_found = True
+        function = Search(r'((\w|:)*)\(', line).group(1)
+        if Match(r'TEST', function):    # Handle TEST... macros
+          parameter_regexp = Search(r'(\(.*\))', joined_line)
+          if parameter_regexp:             # Ignore bad syntax
+            function += parameter_regexp.group(1)
+        else:
+          function += '()'
+        function_state.Begin(function)
+        break
+    if not body_found:
+      # No body for the function (or evidence of a non-function) was found.
+      error(filename, linenum, 'readability/fn_size', 5,
+            'Lint failed to find start of function body.')
+  elif Match(r'^\}\s*$', line):  # function end
+    function_state.Check(error, filename, linenum)
+    function_state.End()
+  elif not Match(r'^\s*$', line):
+    function_state.Count()  # Count non-blank/non-comment lines.
+
+
+_RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?')
+
+
+def CheckComment(comment, filename, linenum, error):
+  """Checks for common mistakes in TODO comments.
+
+  Args:
+    comment: The text of the comment from the line in question.
+    filename: The name of the current file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  match = _RE_PATTERN_TODO.match(comment)
+  if match:
+    # One whitespace is correct; zero whitespace is handled elsewhere.
+    leading_whitespace = match.group(1)
+    if len(leading_whitespace) > 1:
+      error(filename, linenum, 'whitespace/todo', 2,
+            'Too many spaces before TODO')
+
+    # Group 2 is the "(username)" part, including the parentheses.
+    username = match.group(2)
+    if not username:
+      error(filename, linenum, 'readability/todo', 2,
+            'Missing username in TODO; it should look like '
+            '"// TODO(my_username): Stuff."')
+
+    # Group 3 is whatever follows the optional colon: '' means end of
+    # line (OK), ' ' is the expected single space, anything else is wrong.
+    middle_whitespace = match.group(3)
+    # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison
+    if middle_whitespace != ' ' and middle_whitespace != '':
+      error(filename, linenum, 'whitespace/todo', 2,
+            'TODO(my_username) should be followed by a space')
+
+def CheckAccess(filename, clean_lines, linenum, nesting_state, error):
+  """Checks for improper use of DISALLOW* macros.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]  # get rid of comments and strings
+
+  # Group 1 captures which DISALLOW_* macro was used, for the message below.
+  matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|'
+                   r'DISALLOW_EVIL_CONSTRUCTORS|'
+                   r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line)
+  if not matched:
+    return
+  # Only warn when the innermost enclosing scope is a class whose current
+  # access section is something other than "private:".
+  if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo):
+    if nesting_state.stack[-1].access != 'private':
+      error(filename, linenum, 'readability/constructors', 3,
+            '%s must be in the private: section' % matched.group(1))
+
+  else:
+    # Found DISALLOW* macro outside a class declaration, or perhaps it
+    # was used inside a function when it should have been part of the
+    # class declaration.  We could issue a warning here, but it
+    # probably resulted in a compiler error already.
+    pass
+
+
+def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix):
+  """Find the corresponding > to close a template.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: Current line number.
+    init_suffix: Remainder of the current line after the initial <.
+
+  Returns:
+    True if a matching bracket exists.
+  """
+  line = init_suffix
+  nesting_stack = ['<']
+  while True:
+    # Find the next operator that can tell us whether < is used as an
+    # opening bracket or as a less-than operator.  We only want to
+    # warn on the latter case.
+    #
+    # We could also check all other operators and terminate the search
+    # early, e.g. if we got something like this "a<b+c", the "<" is
+    # most likely a less-than operator, but then we will get false
+    # positives for default arguments and other template expressions.
+    match = Search(r'^[^<>(),;\[\]]*([<>(),;\[\]])(.*)$', line)
+    if match:
+      # Found an operator, update nesting stack
+      operator = match.group(1)
+      line = match.group(2)
+
+      if nesting_stack[-1] == '<':
+        # Expecting closing angle bracket
+        if operator in ('<', '(', '['):
+          nesting_stack.append(operator)
+        elif operator == '>':
+          nesting_stack.pop()
+          if not nesting_stack:
+            # Found matching angle bracket
+            return True
+        elif operator == ',':
+          # Got a comma after a bracket, this is most likely a template
+          # argument.  We have not seen a closing angle bracket yet, but
+          # it's probably a few lines later if we look for it, so just
+          # return early here.
+          return True
+        else:
+          # Got some other operator.
+          return False
+
+      else:
+        # Expecting closing parenthesis or closing bracket
+        if operator in ('<', '(', '['):
+          nesting_stack.append(operator)
+        elif operator in (')', ']'):
+          # We don't bother checking for matching () or [].  If we got
+          # something like (] or [), it would have been a syntax error.
+          nesting_stack.pop()
+
+    else:
+      # Scan the next line
+      linenum += 1
+      if linenum >= len(clean_lines.elided):
+        break
+      line = clean_lines.elided[linenum]
+
+  # Exhausted all remaining lines and still no matching angle bracket.
+  # Most likely the input was incomplete, otherwise we should have
+  # seen a semicolon and returned early.
+  # (Note: deliberately True here, unlike FindPreviousMatchingAngleBracket,
+  # to avoid warning on truncated input.)
+  return True
+
+
+def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix):
+  """Find the corresponding < that started a template.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: Current line number.
+    init_prefix: Part of the current line before the initial >.
+
+  Returns:
+    True if a matching bracket exists.
+  """
+  line = init_prefix
+  nesting_stack = ['>']
+  while True:
+    # Find the previous operator.  Group 2 is the right-most operator
+    # character; group 1 is everything before it (scanned next iteration).
+    match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line)
+    if match:
+      # Found an operator, update nesting stack
+      operator = match.group(2)
+      line = match.group(1)
+
+      if nesting_stack[-1] == '>':
+        # Expecting opening angle bracket
+        if operator in ('>', ')', ']'):
+          nesting_stack.append(operator)
+        elif operator == '<':
+          nesting_stack.pop()
+          if not nesting_stack:
+            # Found matching angle bracket
+            return True
+        elif operator == ',':
+          # Got a comma before a bracket, this is most likely a
+          # template argument.  The opening angle bracket is probably
+          # there if we look for it, so just return early here.
+          return True
+        else:
+          # Got some other operator.
+          return False
+
+      else:
+        # Expecting opening parenthesis or opening bracket
+        if operator in ('>', ')', ']'):
+          nesting_stack.append(operator)
+        elif operator in ('(', '['):
+          nesting_stack.pop()
+
+    else:
+      # Scan the previous line
+      linenum -= 1
+      if linenum < 0:
+        break
+      line = clean_lines.elided[linenum]
+
+  # Exhausted all earlier lines and still no matching angle bracket.
+  return False
+
+
+def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
+  """Checks for the correctness of various spacing issues in the code.
+
+  Things we check for: spaces around operators, spaces after
+  if/for/while/switch, no spaces around parens in function calls, two
+  spaces between code and comment, don't start a block with a blank
+  line, don't end a function with a blank line, don't add a blank line
+  after public/protected/private, don't have too many blank lines in a row.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+
+  # Don't use "elided" lines here, otherwise we can't check commented lines.
+  # Don't want to use "raw" either, because we don't want to check inside C++11
+  # raw strings,
+  raw = clean_lines.lines_without_raw_strings
+  line = raw[linenum]
+
+  # Before nixing comments, check if the line is blank for no good
+  # reason.  This includes the first line after a block is opened, and
+  # blank lines at the end of a function (ie, right before a line like '}'
+  #
+  # Skip all the blank line checks if we are immediately inside a
+  # namespace body.  In other words, don't issue blank line warnings
+  # for this block:
+  #   namespace {
+  #
+  #   }
+  #
+  # A warning about missing end of namespace comments will be issued instead.
+  if IsBlankLine(line) and not nesting_state.InNamespaceBody():
+    elided = clean_lines.elided
+    prev_line = elided[linenum - 1]
+    prevbrace = prev_line.rfind('{')
+    # TODO(unknown): Don't complain if line before blank line, and line after,
+    #                both start with alnums and are indented the same amount.
+    #                This ignores whitespace at the start of a namespace block
+    #                because those are not usually indented.
+    if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1:
+      # OK, we have a blank line at the start of a code block.  Before we
+      # complain, we check if it is an exception to the rule: The previous
+      # non-empty line has the parameters of a function header that are indented
+      # 4 spaces (because they did not fit in a 80 column line when placed on
+      # the same line as the function name).  We also check for the case where
+      # the previous line is indented 6 spaces, which may happen when the
+      # initializers of a constructor do not fit into a 80 column line.
+      exception = False
+      if Match(r' {6}\w', prev_line):  # Initializer list?
+        # We are looking for the opening column of initializer list, which
+        # should be indented 4 spaces to cause 6 space indentation afterwards.
+        search_position = linenum-2
+        while (search_position >= 0
+               and Match(r' {6}\w', elided[search_position])):
+          search_position -= 1
+        exception = (search_position >= 0
+                     and elided[search_position][:5] == '    :')
+      else:
+        # Search for the function arguments or an initializer list.  We use a
+        # simple heuristic here: If the line is indented 4 spaces; and we have a
+        # closing paren, without the opening paren, followed by an opening brace
+        # or colon (for initializer lists) we assume that it is the last line of
+        # a function header.  If we have a colon indented 4 spaces, it is an
+        # initializer list.
+        exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)',
+                           prev_line)
+                     or Match(r' {4}:', prev_line))
+
+      if not exception:
+        error(filename, linenum, 'whitespace/blank_line', 2,
+              'Redundant blank line at the start of a code block '
+              'should be deleted.')
+    # Ignore blank lines at the end of a block in a long if-else
+    # chain, like this:
+    #   if (condition1) {
+    #     // Something followed by a blank line
+    #
+    #   } else if (condition2) {
+    #     // Something else
+    #   }
+    if linenum + 1 < clean_lines.NumLines():
+      next_line = raw[linenum + 1]
+      if (next_line
+          and Match(r'\s*}', next_line)
+          and next_line.find('} else ') == -1):
+        error(filename, linenum, 'whitespace/blank_line', 3,
+              'Redundant blank line at the end of a code block '
+              'should be deleted.')
+
+    matched = Match(r'\s*(public|protected|private):', prev_line)
+    if matched:
+      error(filename, linenum, 'whitespace/blank_line', 3,
+            'Do not leave a blank line after "%s:"' % matched.group(1))
+
+  # Next, we complain if there's a comment too near the text
+  commentpos = line.find('//')
+  if commentpos != -1:
+    # Check if the // may be in quotes.  If so, ignore it
+    # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
+    if (line.count('"', 0, commentpos) -
+        line.count('\\"', 0, commentpos)) % 2 == 0:   # not in quotes
+      # Allow one space for new scopes, two spaces otherwise:
+      if (not Match(r'^\s*{ //', line) and
+          ((commentpos >= 1 and
+            line[commentpos-1] not in string.whitespace) or
+           (commentpos >= 2 and
+            line[commentpos-2] not in string.whitespace))):
+        error(filename, linenum, 'whitespace/comments', 2,
+              'At least two spaces is best between code and comments')
+      # There should always be a space between the // and the comment
+      commentend = commentpos + 2
+      if commentend < len(line) and not line[commentend] == ' ':
+        # but some lines are exceptions -- e.g. if they're big
+        # comment delimiters like:
+        # //----------------------------------------------------------
+        # or are an empty C++ style Doxygen comment, like:
+        # ///
+        # or C++ style Doxygen comments placed after the variable:
+        # ///<  Header comment
+        # //!<  Header comment
+        # or they begin with multiple slashes followed by a space:
+        # //////// Header comment
+        match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or
+                 Search(r'^/$', line[commentend:]) or
+                 Search(r'^!< ', line[commentend:]) or
+                 Search(r'^/< ', line[commentend:]) or
+                 Search(r'^/+ ', line[commentend:]))
+        if not match:
+          error(filename, linenum, 'whitespace/comments', 4,
+                'Should have a space between // and comment')
+      CheckComment(line[commentpos:], filename, linenum, error)
+
+  line = clean_lines.elided[linenum]  # get rid of comments and strings
+
+  # Don't try to do spacing checks for operator methods.  Rewrite e.g.
+  # "operator<<(" so the operator token is not mistaken for a comparison
+  # by the checks below.
+  line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line)
+
+  # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )".
+  # Otherwise not.  Note we only check for non-spaces on *both* sides;
+  # sometimes people put non-spaces on one side when aligning ='s among
+  # many lines (not that this is behavior that I approve of...)
+  if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line):
+    error(filename, linenum, 'whitespace/operators', 4,
+          'Missing spaces around =')
+
+  # It's ok not to have spaces around binary operators like + - * /, but if
+  # there's too little whitespace, we get concerned.  It's hard to tell,
+  # though, so we punt on this one for now.  TODO.
+
+  # You should always have whitespace around binary operators.
+  #
+  # Check <= and >= first to avoid false positives with < and >, then
+  # check non-include lines for spacing around < and >.
+  match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line)
+  if match:
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around %s' % match.group(1))
+  # We allow no-spaces around << when used like this: 10<<20, but
+  # not otherwise (particularly, not when used as streams)
+  # Also ignore using ns::operator<<;
+  match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
+  if (match and
+      not (match.group(1).isdigit() and match.group(2).isdigit()) and
+      not (match.group(1) == 'operator' and match.group(2) == ';')):
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around <<')
+  elif not Match(r'#.*include', line):
+    # Avoid false positives on ->
+    reduced_line = line.replace('->', '')
+
+    # Look for < that is not surrounded by spaces.  This is only
+    # triggered if both sides are missing spaces, even though
+    # technically we should flag if at least one side is missing a
+    # space.  This is done to avoid some false positives with shifts.
+    match = Search(r'[^\s<]<([^\s=<].*)', reduced_line)
+    if (match and
+        not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))):
+      error(filename, linenum, 'whitespace/operators', 3,
+            'Missing spaces around <')
+
+    # Look for > that is not surrounded by spaces.  Similar to the
+    # above, we only trigger if both sides are missing spaces to avoid
+    # false positives with shifts.
+    match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line)
+    if (match and
+        not FindPreviousMatchingAngleBracket(clean_lines, linenum,
+                                             match.group(1))):
+      error(filename, linenum, 'whitespace/operators', 3,
+            'Missing spaces around >')
+
+  # We allow no-spaces around >> for almost anything.  This is because
+  # C++11 allows ">>" to close nested templates, which accounts for
+  # most cases when ">>" is not followed by a space.
+  #
+  # We still warn on ">>" followed by alpha character, because that is
+  # likely due to ">>" being used for right shifts, e.g.:
+  #   value >> alpha
+  #
+  # When ">>" is used to close templates, the alphanumeric letter that
+  # follows would be part of an identifier, and there should still be
+  # a space separating the template type and the identifier.
+  #   type<type<type>> alpha
+  match = Search(r'>>[a-zA-Z_]', line)
+  if match:
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around >>')
+
+  # There shouldn't be space around unary operators
+  match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line)
+  if match:
+    error(filename, linenum, 'whitespace/operators', 4,
+          'Extra space for operator %s' % match.group(1))
+
+  # A pet peeve of mine: no spaces after an if, while, switch, or for
+  match = Search(r' (if\(|for\(|while\(|switch\()', line)
+  if match:
+    error(filename, linenum, 'whitespace/parens', 5,
+          'Missing space before ( in %s' % match.group(1))
+
+  # For if/for/while/switch, the left and right parens should be
+  # consistent about how many spaces are inside the parens, and
+  # there should either be zero or one spaces inside the parens.
+  # We don't want: "if ( foo)" or "if ( foo   )".
+  # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed.
+  # Group 2/4 capture the spaces just inside ( and ); group 3 is the
+  # first non-space character after the open paren.
+  match = Search(r'\b(if|for|while|switch)\s*'
+                 r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$',
+                 line)
+  if match:
+    if len(match.group(2)) != len(match.group(4)):
+      if not (match.group(3) == ';' and
+              len(match.group(2)) == 1 + len(match.group(4)) or
+              not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)):
+        error(filename, linenum, 'whitespace/parens', 5,
+              'Mismatching spaces inside () in %s' % match.group(1))
+    if len(match.group(2)) not in [0, 1]:
+      error(filename, linenum, 'whitespace/parens', 5,
+            'Should have zero or one spaces inside ( and ) in %s' %
+            match.group(1))
+
+  # You should always have a space after a comma (either as fn arg or operator)
+  #
+  # This does not apply when the non-space character following the
+  # comma is another comma, since the only time when that happens is
+  # for empty macro arguments.
+  #
+  # We run this check in two passes: first pass on elided lines to
+  # verify that lines contain missing whitespaces, second pass on raw
+  # lines to confirm that those missing whitespaces are not due to
+  # elided comments.
+  if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]):
+    error(filename, linenum, 'whitespace/comma', 3,
+          'Missing space after ,')
+
+  # You should always have a space after a semicolon
+  # except for few corner cases
+  # TODO(unknown): clarify if 'if (1) { return 1;}' is requires one more
+  # space after ;
+  if Search(r';[^\s};\\)/]', line):
+    error(filename, linenum, 'whitespace/semicolon', 3,
+          'Missing space after ;')
+
+  # Next we will look for issues with function calls.
+  CheckSpacingForFunctionCall(filename, line, linenum, error)
+
+  # Except after an opening paren, or after another opening brace (in case of
+  # an initializer list, for instance), you should have spaces before your
+  # braces. And since you should never have braces at the beginning of a line,
+  # this is an easy test.
+  match = Match(r'^(.*[^ ({]){', line)
+  if match:
+    # Try a bit harder to check for brace initialization.  This
+    # happens in one of the following forms:
+    #   Constructor() : initializer_list_{} { ... }
+    #   Constructor{}.MemberFunction()
+    #   Type variable{};
+    #   FunctionCall(type{}, ...);
+    #   LastArgument(..., type{});
+    #   LOG(INFO) << type{} << " ...";
+    #   map_of_type[{...}] = ...;
+    #
+    # We check for the character following the closing brace, and
+    # silence the warning if it's one of those listed above, i.e.
+    # "{.;,)<]".
+    #
+    # To account for nested initializer list, we allow any number of
+    # closing braces up to "{;,)<".  We can't simply silence the
+    # warning on first sight of closing brace, because that would
+    # cause false negatives for things that are not initializer lists.
+    #   Silence this:         But not this:
+    #     Outer{                if (...) {
+    #       Inner{...}            if (...){  // Missing space before {
+    #     };                    }
+    #
+    # There is a false negative with this approach if people inserted
+    # spurious semicolons, e.g. "if (cond){};", but we will catch the
+    # spurious semicolon with a separate check.
+    (endline, endlinenum, endpos) = CloseExpression(
+        clean_lines, linenum, len(match.group(1)))
+    trailing_text = ''
+    if endpos > -1:
+      trailing_text = endline[endpos:]
+    # Also look a couple of lines ahead in case the expression continues.
+    for offset in xrange(endlinenum + 1,
+                         min(endlinenum + 3, clean_lines.NumLines() - 1)):
+      trailing_text += clean_lines.elided[offset]
+    if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text):
+      error(filename, linenum, 'whitespace/braces', 5,
+            'Missing space before {')
+
+  # Make sure '} else {' has spaces.
+  if Search(r'}else', line):
+    error(filename, linenum, 'whitespace/braces', 5,
+          'Missing space before else')
+
+  # You shouldn't have spaces before your brackets, except maybe after
+  # 'delete []' or 'new char * []'.
+  if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line):
+    error(filename, linenum, 'whitespace/braces', 5,
+          'Extra space before [')
+
+  # You shouldn't have a space before a semicolon at the end of the line.
+  # There's a special case for "for" since the style guide allows space before
+  # the semicolon there.
+  if Search(r':\s*;\s*$', line):
+    error(filename, linenum, 'whitespace/semicolon', 5,
+          'Semicolon defining empty statement. Use {} instead.')
+  elif Search(r'^\s*;\s*$', line):
+    error(filename, linenum, 'whitespace/semicolon', 5,
+          'Line contains only semicolon. If this should be an empty statement, '
+          'use {} instead.')
+  elif (Search(r'\s+;\s*$', line) and
+        not Search(r'\bfor\b', line)):
+    error(filename, linenum, 'whitespace/semicolon', 5,
+          'Extra space before last semicolon. If this should be an empty '
+          'statement, use {} instead.')
+
+  # In range-based for, we wanted spaces before and after the colon, but
+  # not around "::" tokens that might appear.
+  if (Search('for *\(.*[^:]:[^: ]', line) or
+      Search('for *\(.*[^: ]:[^:]', line)):
+    error(filename, linenum, 'whitespace/forcolon', 2,
+          'Missing space around colon in range-based for loop')
+
+
+def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error):
+  """Checks for additional blank line issues related to sections.
+
+  Currently the only thing checked here is blank line before protected/private.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    class_info: A _ClassInfo objects.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  # Skip checks if the class is small, where small means 25 lines or less.
+  # 25 lines seems like a good cutoff since that's the usual height of
+  # terminals, and any class that can't fit in one screen can't really
+  # be considered "small".
+  #
+  # Also skip checks if we are on the first line.  This accounts for
+  # classes that look like
+  #   class Foo { public: ... };
+  #
+  # If we didn't find the end of the class, last_line would be zero,
+  # and the check will be skipped by the first condition.
+  if (class_info.last_line - class_info.starting_linenum <= 24 or
+      linenum <= class_info.starting_linenum):
+    return
+
+  matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum])
+  if matched:
+    # Issue warning if the line before public/protected/private was
+    # not a blank line, but don't do this if the previous line contains
+    # "class" or "struct".  This can happen two ways:
+    #  - We are at the beginning of the class.
+    #  - We are forward-declaring an inner class that is semantically
+    #    private, but needed to be public for implementation reasons.
+    # Also ignores cases where the previous line ends with a backslash as can be
+    # common when defining classes in C macros.
+    prev_line = clean_lines.lines[linenum - 1]
+    if (not IsBlankLine(prev_line) and
+        not Search(r'\b(class|struct)\b', prev_line) and
+        not Search(r'\\$', prev_line)):
+      # Try a bit harder to find the beginning of the class.  This is to
+      # account for multi-line base-specifier lists, e.g.:
+      #   class Derived
+      #       : public Base {
+      end_class_head = class_info.starting_linenum
+      for i in range(class_info.starting_linenum, linenum):
+        if Search(r'\{\s*$', clean_lines.lines[i]):
+          end_class_head = i
+          break
+      # Only warn if the section label is not immediately after the
+      # class head (i.e. there is real content in between).
+      if end_class_head < linenum - 1:
+        error(filename, linenum, 'whitespace/blank_line', 3,
+              '"%s:" should be preceded by a blank line' % matched.group(1))
+
+
+def GetPreviousNonBlankLine(clean_lines, linenum):
+  """Return the most recent non-blank line and its line number.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file contents.
+    linenum: The number of the line to check.
+
+  Returns:
+    A tuple with two elements.  The first element is the contents of the last
+    non-blank line before the current line, or the empty string if this is the
+    first non-blank line.  The second is the line number of that line, or -1
+    if this is the first non-blank line.
+  """
+
+  prevlinenum = linenum - 1
+  while prevlinenum >= 0:
+    prevline = clean_lines.elided[prevlinenum]
+    if not IsBlankLine(prevline):     # if not a blank line...
+      return (prevline, prevlinenum)
+    prevlinenum -= 1
+  return ('', -1)
+
+
+def CheckBraces(filename, clean_lines, linenum, error):
+  """Looks for misplaced braces (e.g. at the end of line).
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  line = clean_lines.elided[linenum]        # get rid of comments and strings
+
+  if Match(r'\s*{\s*$', line):
+    # We allow an open brace to start a line in the case where someone is using
+    # braces in a block to explicitly create a new scope, which is commonly used
+    # to control the lifetime of stack-allocated variables.  Braces are also
+    # used for brace initializers inside function calls.  We don't detect this
+    # perfectly: we just don't complain if the last non-whitespace character on
+    # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the
+    # previous line starts a preprocessor block.
+    prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
+    if (not Search(r'[,;:}{(]\s*$', prevline) and
+        not Match(r'\s*#', prevline)):
+      error(filename, linenum, 'whitespace/braces', 4,
+            '{ should almost always be at the end of the previous line')
+
+  # An else clause should be on the same line as the preceding closing brace.
+  if Match(r'\s*else\s*', line):
+    prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
+    if Match(r'\s*}\s*$', prevline):
+      error(filename, linenum, 'whitespace/newline', 4,
+            'An else should appear on the same line as the preceding }')
+
+  # If braces come on one side of an else, they should be on both.
+  # However, we have to worry about "else if" that spans multiple lines!
+  if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line):
+    if Search(r'}\s*else if([^{]*)$', line):       # could be multi-line if
+      # find the ( after the if
+      pos = line.find('else if')
+      pos = line.find('(', pos)
+      if pos > 0:
+        (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos)
+        if endline[endpos:].find('{') == -1:    # must be brace after if
+          error(filename, linenum, 'readability/braces', 5,
+                'If an else has a brace on one side, it should have it on both')
+    else:            # common case: else not followed by a multi-line if
+      error(filename, linenum, 'readability/braces', 5,
+            'If an else has a brace on one side, it should have it on both')
+
+  # Likewise, an else should never have the else clause on the same line
+  if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line):
+    error(filename, linenum, 'whitespace/newline', 4,
+          'Else clause should never be on same line as else (use 2 lines)')
+
+  # In the same way, a do/while should never be on one line
+  if Match(r'\s*do [^\s{]', line):
+    error(filename, linenum, 'whitespace/newline', 4,
+          'do/while clauses should not be on a single line')
+
+  # Block bodies should not be followed by a semicolon.  Due to C++11
+  # brace initialization, there are more places where semicolons are
+  # required than not, so we use a whitelist approach to check these
+  # rather than a blacklist.  These are the places where "};" should
+  # be replaced by just "}":
+  # 1. Some flavor of block following closing parenthesis:
+  #    for (;;) {};
+  #    while (...) {};
+  #    switch (...) {};
+  #    Function(...) {};
+  #    if (...) {};
+  #    if (...) else if (...) {};
+  #
+  # 2. else block:
+  #    if (...) else {};
+  #
+  # 3. const member function:
+  #    Function(...) const {};
+  #
+  # 4. Block following some statement:
+  #    x = 42;
+  #    {};
+  #
+  # 5. Block at the beginning of a function:
+  #    Function(...) {
+  #      {};
+  #    }
+  #
+  #    Note that naively checking for the preceding "{" will also match
+  #    braces inside multi-dimensional arrays, but this is fine since
+  #    that expression will not contain semicolons.
+  #
+  # 6. Block following another block:
+  #    while (true) {}
+  #    {};
+  #
+  # 7. End of namespaces:
+  #    namespace {};
+  #
+  #    These semicolons seems far more common than other kinds of
+  #    redundant semicolons, possibly due to people converting classes
+  #    to namespaces.  For now we do not warn for this case.
+  #
+  # Try matching case 1 first.
+  match = Match(r'^(.*\)\s*)\{', line)
+  if match:
+    # Matched closing parenthesis (case 1).  Check the token before the
+    # matching opening parenthesis, and don't warn if it looks like a
+    # macro.  This avoids these false positives:
+    #  - macro that defines a base class
+    #  - multi-line macro that defines a base class
+    #  - macro that defines the whole class-head
+    #
+    # But we still issue warnings for macros that we know are safe to
+    # warn, specifically:
+    #  - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P
+    #  - TYPED_TEST
+    #  - INTERFACE_DEF
+    #  - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED:
+    #
+    # We implement a whitelist of safe macros instead of a blacklist of
+    # unsafe macros, even though the latter appears less frequently in
+    # google code and would have been easier to implement.  This is because
+    # the downside for getting the whitelist wrong means some extra
+    # semicolons, while the downside for getting the blacklist wrong
+    # would result in compile errors.
+    #
+    # In addition to macros, we also don't want to warn on compound
+    # literals.
+    closing_brace_pos = match.group(1).rfind(')')
+    opening_parenthesis = ReverseCloseExpression(
+        clean_lines, linenum, closing_brace_pos)
+    if opening_parenthesis[2] > -1:
+      line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]]
+      macro = Search(r'\b([A-Z_]+)\s*$', line_prefix)
+      if ((macro and
+           macro.group(1) not in (
+               'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST',
+               'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED',
+               'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or
+          Search(r'\s+=\s*$', line_prefix)):
+        match = None
+    # Whitelist lambda function definitions, which also require a ";" after
+    # the closing brace.
+    if match and Match(r'^.*\[.*\]\s*(.*\)\s*)\{', line):
+      match = None
+
+  else:
+    # Try matching cases 2-3.
+    match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line)
+    if not match:
+      # Try matching cases 4-6.  These are always matched on separate lines.
+      #
+      # Note that we can't simply concatenate the previous line to the
+      # current line and do a single match, otherwise we may output
+      # duplicate warnings for the blank line case:
+      #   if (cond) {
+      #     // blank line
+      #   }
+      prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
+      if prevline and Search(r'[;{}]\s*$', prevline):
+        match = Match(r'^(\s*)\{', line)
+
+  # Check matching closing brace
+  if match:
+    (endline, endlinenum, endpos) = CloseExpression(
+        clean_lines, linenum, len(match.group(1)))
+    if endpos > -1 and Match(r'^\s*;', endline[endpos:]):
+      # Current {} pair is eligible for semicolon check, and we have found
+      # the redundant semicolon, output warning here.
+      #
+      # Note: because we are scanning forward for opening braces, and
+      # outputting warnings for the matching closing brace, if there are
+      # nested blocks with trailing semicolons, we will get the error
+      # messages in reversed order.
+      error(filename, endlinenum, 'readability/braces', 4,
+            "You don't need a ; after a }")
+
+
+def CheckEmptyBlockBody(filename, clean_lines, linenum, error):
+  """Look for empty loop/conditional body with only a single semicolon.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  # Only lines beginning (after whitespace) with for/while/if are examined.
+  # Because nothing but whitespace may precede the keyword, most
+  # do-while-loops are ignored as well, since those lines should start with
+  # a closing brace.  "if" is included because an empty conditional body is
+  # likely an error.
+  line = clean_lines.elided[linenum]
+  keyword = Match(r'\s*(for|while|if)\s*\(', line)
+  if not keyword:
+    return
+
+  # Locate the closing parenthesis of the condition, which may fall on a
+  # later line.
+  (tail_line, tail_linenum, tail_pos) = CloseExpression(
+      clean_lines, linenum, line.find('('))
+
+  # A semicolon immediately after the condition means the body is empty.
+  # Anything else (including whitespace or a newline) is left alone here,
+  # since a separate check handles semicolons preceded by whitespace.
+  if tail_pos < 0 or not Match(r';', tail_line[tail_pos:]):
+    return
+  if keyword.group(1) == 'if':
+    error(filename, tail_linenum, 'whitespace/empty_conditional_body', 5,
+          'Empty conditional bodies should use {}')
+  else:
+    error(filename, tail_linenum, 'whitespace/empty_loop_body', 5,
+          'Empty loop bodies should use {} or continue')
+
+
+def CheckCheck(filename, clean_lines, linenum, error):
+  """Checks the use of CHECK and EXPECT macros.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  # Decide the set of replacement macros that should be suggested
+  lines = clean_lines.elided
+  check_macro = None
+  start_pos = -1
+  for macro in _CHECK_MACROS:
+    i = lines[linenum].find(macro)
+    if i >= 0:
+      check_macro = macro
+
+      # Find opening parenthesis.  Do a regular expression match here
+      # to make sure that we are matching the expected CHECK macro, as
+      # opposed to some other macro that happens to contain the CHECK
+      # substring.
+      matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum])
+      if not matched:
+        continue
+      start_pos = len(matched.group(1))
+      break
+  if not check_macro or start_pos < 0:
+    # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT'
+    return
+
+  # Find end of the boolean expression by matching parentheses.
+  # Note: CloseExpression returns (line_text, line_number, position), so
+  # despite its name, 'end_line' below is a line *number* while 'last_line'
+  # holds the text of that line.
+  (last_line, end_line, end_pos) = CloseExpression(
+      clean_lines, linenum, start_pos)
+  if end_pos < 0:
+    return
+  if linenum == end_line:
+    expression = lines[linenum][start_pos + 1:end_pos - 1]
+  else:
+    # Expression spans multiple lines: stitch the pieces back together.
+    expression = lines[linenum][start_pos + 1:]
+    for i in xrange(linenum + 1, end_line):
+      expression += lines[i]
+    expression += last_line[0:end_pos - 1]
+
+  # Parse expression so that we can take parentheses into account.
+  # This avoids false positives for inputs like "CHECK((a < 4) == b)",
+  # which is not replaceable by CHECK_LE.
+  lhs = ''
+  rhs = ''
+  operator = None
+  while expression:
+    matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||'
+                    r'==|!=|>=|>|<=|<|\()(.*)$', expression)
+    if matched:
+      token = matched.group(1)
+      if token == '(':
+        # Parenthesized operand
+        expression = matched.group(2)
+        (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')')
+        if end < 0:
+          return  # Unmatched parenthesis
+        lhs += '(' + expression[0:end]
+        expression = expression[end:]
+      elif token in ('&&', '||'):
+        # Logical and/or operators.  This means the expression
+        # contains more than one term, for example:
+        #   CHECK(42 < a && a < b);
+        #
+        # These are not replaceable with CHECK_LE, so bail out early.
+        return
+      elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'):
+        # Non-relational operator
+        lhs += token
+        expression = matched.group(2)
+      else:
+        # Relational operator
+        operator = token
+        rhs = matched.group(2)
+        break
+    else:
+      # Unparenthesized operand.  Instead of appending to lhs one character
+      # at a time, we do another regular expression match to consume several
+      # characters at once if possible.  Trivial benchmark shows that this
+      # is more efficient when the operands are longer than a single
+      # character, which is generally the case.
+      matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression)
+      if not matched:
+        matched = Match(r'^(\s*\S)(.*)$', expression)
+        if not matched:
+          break
+      lhs += matched.group(1)
+      expression = matched.group(2)
+
+  # Only apply checks if we got all parts of the boolean expression
+  if not (lhs and operator and rhs):
+    return
+
+  # Check that rhs do not contain logical operators.  We already know
+  # that lhs is fine since the loop above parses out && and ||.
+  if rhs.find('&&') > -1 or rhs.find('||') > -1:
+    return
+
+  # At least one of the operands must be a constant literal.  This is
+  # to avoid suggesting replacements for unprintable things like
+  # CHECK(variable != iterator)
+  #
+  # The following pattern matches decimal, hex integers, strings, and
+  # characters (in that order).
+  lhs = lhs.strip()
+  rhs = rhs.strip()
+  match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$'
+  if Match(match_constant, lhs) or Match(match_constant, rhs):
+    # Note: since we know both lhs and rhs, we can provide a more
+    # descriptive error message like:
+    #   Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42)
+    # Instead of:
+    #   Consider using CHECK_EQ instead of CHECK(a == b)
+    #
+    # We are still keeping the less descriptive message because if lhs
+    # or rhs gets long, the error message might become unreadable.
+    error(filename, linenum, 'readability/check', 2,
+          'Consider using %s instead of %s(a %s b)' % (
+              _CHECK_REPLACEMENT[check_macro][operator],
+              check_macro, operator))
+
+
+def CheckAltTokens(filename, clean_lines, linenum, error):
+  """Check alternative keywords being used in boolean expressions.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Preprocessor directives are exempt.
+  if Match(r'^\s*#', line):
+    return
+
+  # Best-effort filter for multi-line comments.  It cannot help when the
+  # comment opened before this line or closes after it, but it removes most
+  # of the false positives, and it gives people who use multi-line comments
+  # in preprocessor macros a way to work around this warning.
+  #
+  # TODO(unknown): remove this once cpplint has better support for
+  # multi-line comments.
+  if '/*' in line or '*/' in line:
+    return
+
+  for hit in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line):
+    token = hit.group(1)
+    error(filename, linenum, 'readability/alt_tokens', 2,
+          'Use operator %s instead of %s' % (
+              _ALT_TOKEN_REPLACEMENT[token], token))
+
+
+def GetLineWidth(line):
+  """Determines the width of the line in column positions.
+
+  Args:
+    line: A string, which may be a Unicode string.
+
+  Returns:
+    The width of the line in column positions, accounting for Unicode
+    combining characters and wide characters.
+  """
+  # Note: 'unicode' is the Python 2 text type; byte strings fall through
+  # to the plain len() below.
+  if isinstance(line, unicode):
+    width = 0
+    for uc in unicodedata.normalize('NFC', line):
+      # Wide ('W') and Fullwidth ('F') East Asian characters take two
+      # columns; combining marks take none; everything else takes one.
+      if unicodedata.east_asian_width(uc) in ('W', 'F'):
+        width += 2
+      elif not unicodedata.combining(uc):
+        width += 1
+    return width
+  else:
+    return len(line)
+
+
+def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
+               error):
+  """Checks rules from the 'C++ style rules' section of cppguide.html.
+
+  Most of these rules are hard to test (naming, comment style), but we
+  do what we can.  In particular we check for 2-space indents, line lengths,
+  tab usage, spaces inside code, etc.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    file_extension: The extension (without the dot) of the filename.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+
+  # Don't use "elided" lines here, otherwise we can't check commented lines.
+  # Don't want to use "raw" either, because we don't want to check inside C++11
+  # raw strings,
+  raw_lines = clean_lines.lines_without_raw_strings
+  line = raw_lines[linenum]
+
+  if line.find('\t') != -1:
+    error(filename, linenum, 'whitespace/tab', 1,
+          'Tab found; better to use spaces')
+
+  # One or three blank spaces at the beginning of the line is weird; it's
+  # hard to reconcile that with 2-space indents.
+  # NOTE: here are the conditions rob pike used for his tests.  Mine aren't
+  # as sophisticated, but it may be worth becoming so:  RLENGTH==initial_spaces
+  # if(RLENGTH > 20) complain = 0;
+  # if(match($0, " +(error|private|public|protected):")) complain = 0;
+  # if(match(prev, "&& *$")) complain = 0;
+  # if(match(prev, "\\|\\| *$")) complain = 0;
+  # if(match(prev, "[\",=><] *$")) complain = 0;
+  # if(match($0, " <<")) complain = 0;
+  # if(match(prev, " +for \\(")) complain = 0;
+  # if(prevodd && match(prevprev, " +for \\(")) complain = 0;
+  initial_spaces = 0
+  cleansed_line = clean_lines.elided[linenum]
+  while initial_spaces < len(line) and line[initial_spaces] == ' ':
+    initial_spaces += 1
+  if line and line[-1].isspace():
+    error(filename, linenum, 'whitespace/end_of_line', 4,
+          'Line ends in whitespace.  Consider deleting these extra spaces.')
+  # There are certain situations we allow one space, notably for section labels
+  elif ((initial_spaces == 1 or initial_spaces == 3) and
+        not Match(r'\s*\w+\s*:\s*$', cleansed_line)):
+    error(filename, linenum, 'whitespace/indent', 3,
+          'Weird number of spaces at line-start.  '
+          'Are you using a 2-space indent?')
+
+  # Check if the line is a header guard.
+  is_header_guard = False
+  if file_extension == 'h':
+    cppvar = GetHeaderGuardCPPVariable(filename)
+    if (line.startswith('#ifndef %s' % cppvar) or
+        line.startswith('#define %s' % cppvar) or
+        line.startswith('#endif  // %s' % cppvar)):
+      is_header_guard = True
+  # #include lines and header guards can be long, since there's no clean way to
+  # split them.
+  #
+  # URLs can be long too.  It's possible to split these, but it makes them
+  # harder to cut&paste.
+  #
+  # The "$Id:...$" comment may also get very long without it being the
+  # developers fault.
+  if (not line.startswith('#include') and not is_header_guard and
+      not Match(r'^\s*//.*http(s?)://\S*$', line) and
+      not Match(r'^// \$Id:.*#[0-9]+ \$$', line)):
+    line_width = GetLineWidth(line)
+    # Allow up to 25% overrun before escalating to the stronger warning.
+    extended_length = int((_line_length * 1.25))
+    if line_width > extended_length:
+      error(filename, linenum, 'whitespace/line_length', 4,
+            'Lines should very rarely be longer than %i characters' %
+            extended_length)
+    elif line_width > _line_length:
+      error(filename, linenum, 'whitespace/line_length', 2,
+            'Lines should be <= %i characters long' % _line_length)
+
+  # Flag multiple statements on one line (two or more ';'), with carve-outs
+  # for 'for' loops and single-line switch cases below.
+  if (cleansed_line.count(';') > 1 and
+      # for loops are allowed two ;'s (and may run over two lines).
+      cleansed_line.find('for') == -1 and
+      (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or
+       GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and
+      # It's ok to have many commands in a switch case that fits in 1 line
+      not ((cleansed_line.find('case ') != -1 or
+            cleansed_line.find('default:') != -1) and
+           cleansed_line.find('break;') != -1)):
+    error(filename, linenum, 'whitespace/newline', 0,
+          'More than one command on the same line')
+
+  # Some more style checks
+  CheckBraces(filename, clean_lines, linenum, error)
+  CheckEmptyBlockBody(filename, clean_lines, linenum, error)
+  CheckAccess(filename, clean_lines, linenum, nesting_state, error)
+  CheckSpacing(filename, clean_lines, linenum, nesting_state, error)
+  CheckCheck(filename, clean_lines, linenum, error)
+  CheckAltTokens(filename, clean_lines, linenum, error)
+  classinfo = nesting_state.InnermostClass()
+  if classinfo:
+    CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error)
+
+
+# Matches a quoted #include whose path has no directory component, e.g.
+# '#include "bar.h"'; such includes should name the directory as well.
+_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"')
+# Captures the opening delimiter (group 1: '<' or '"') and the header path
+# (group 2) of any #include line.
+_RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$')
+# Matches the first component of a filename delimited by -s and _s. That is:
+#  _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo'
+#  _RE_FIRST_COMPONENT.match('foo.cc').group(0) == 'foo'
+#  _RE_FIRST_COMPONENT.match('foo-bar_baz.cc').group(0) == 'foo'
+#  _RE_FIRST_COMPONENT.match('foo_bar-baz.cc').group(0) == 'foo'
+_RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+')
+
+
+def _DropCommonSuffixes(filename):
+  """Drops common suffixes like _test.cc or -inl.h from filename.
+
+  For example:
+    >>> _DropCommonSuffixes('foo/foo-inl.h')
+    'foo/foo'
+    >>> _DropCommonSuffixes('foo/bar/foo.cc')
+    'foo/bar/foo'
+    >>> _DropCommonSuffixes('foo/foo_internal.h')
+    'foo/foo'
+    >>> _DropCommonSuffixes('foo/foo_unusualinternal.h')
+    'foo/foo_unusualinternal'
+
+  Args:
+    filename: The input filename.
+
+  Returns:
+    The filename with the common suffix removed.
+  """
+  for suffix in ('test.cc', 'regtest.cc', 'unittest.cc',
+                 'inl.h', 'impl.h', 'internal.h'):
+    if (filename.endswith(suffix) and len(filename) > len(suffix) and
+        filename[-len(suffix) - 1] in ('-', '_')):
+      return filename[:-len(suffix) - 1]
+  return os.path.splitext(filename)[0]
+
+
+def _IsTestFilename(filename):
+  """Determines if the given filename has a suffix that identifies it as a test.
+
+  Args:
+    filename: The input filename.
+
+  Returns:
+    True if 'filename' looks like a test, False otherwise.
+  """
+  if (filename.endswith('_test.cc') or
+      filename.endswith('_unittest.cc') or
+      filename.endswith('_regtest.cc')):
+    return True
+  else:
+    return False
+
+
+def _ClassifyInclude(fileinfo, include, is_system):
+  """Figures out what kind of header 'include' is.
+
+  Args:
+    fileinfo: The current file cpplint is running over. A FileInfo instance.
+    include: The path to a #included file.
+    is_system: True if the #include used <> rather than "".
+
+  Returns:
+    One of the _XXX_HEADER constants.
+
+  For example:
+    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'stdio.h', True)
+    _C_SYS_HEADER
+    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'string', True)
+    _CPP_SYS_HEADER
+    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', False)
+    _LIKELY_MY_HEADER
+    >>> _ClassifyInclude(FileInfo('foo/foo_unknown_extension.cc'),
+    ...                  'bar/foo_other_ext.h', False)
+    _POSSIBLE_MY_HEADER
+    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False)
+    _OTHER_HEADER
+  """
+  # This is a list of all standard c++ header files, except
+  # those already checked for above.
+  is_cpp_h = include in _CPP_HEADERS
+
+  # System headers (<...>) are classified purely by whether the name is a
+  # known C++ standard header.
+  if is_system:
+    if is_cpp_h:
+      return _CPP_SYS_HEADER
+    else:
+      return _C_SYS_HEADER
+
+  # If the target file and the include we're checking share a
+  # basename when we drop common extensions, and the include
+  # lives in . , then it's likely to be owned by the target file.
+  target_dir, target_base = (
+      os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName())))
+  include_dir, include_base = os.path.split(_DropCommonSuffixes(include))
+  if target_base == include_base and (
+      include_dir == target_dir or
+      include_dir == os.path.normpath(target_dir + '/../public')):
+    return _LIKELY_MY_HEADER
+
+  # If the target and include share some initial basename
+  # component, it's possible the target is implementing the
+  # include, so it's allowed to be first, but we'll never
+  # complain if it's not there.
+  target_first_component = _RE_FIRST_COMPONENT.match(target_base)
+  include_first_component = _RE_FIRST_COMPONENT.match(include_base)
+  if (target_first_component and include_first_component and
+      target_first_component.group(0) ==
+      include_first_component.group(0)):
+    return _POSSIBLE_MY_HEADER
+
+  # Anything else: some unrelated project or library header.
+  return _OTHER_HEADER
+
+
+
+def CheckIncludeLine(filename, clean_lines, linenum, include_state, error):
+  """Check rules that are applicable to #include lines.
+
+  Strings on #include lines are NOT removed from elided line, to make
+  certain tasks easier. However, to prevent false positives, checks
+  applicable to #include lines in CheckLanguage must be put here.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    include_state: An _IncludeState instance in which the headers are inserted.
+    error: The function to call with any errors found.
+  """
+  fileinfo = FileInfo(filename)
+
+  line = clean_lines.lines[linenum]
+
+  # "include" should use the new style "foo/bar.h" instead of just "bar.h"
+  if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line):
+    error(filename, linenum, 'build/include', 4,
+          'Include the directory when naming .h files')
+
+  # we shouldn't include a file more than once. actually, there are a
+  # handful of instances where doing so is okay, but in general it's
+  # not.
+  match = _RE_PATTERN_INCLUDE.search(line)
+  if match:
+    include = match.group(2)
+    is_system = (match.group(1) == '<')
+    if include in include_state:
+      error(filename, linenum, 'build/include', 4,
+            '"%s" already included at %s:%s' %
+            (include, filename, include_state[include]))
+    else:
+      # First sighting of this header: record the line number for future
+      # duplicate-include reports.
+      include_state[include] = linenum
+
+      # We want to ensure that headers appear in the right order:
+      # 1) for foo.cc, foo.h  (preferred location)
+      # 2) c system files
+      # 3) cpp system files
+      # 4) for foo.cc, foo.h  (deprecated location)
+      # 5) other google headers
+      #
+      # We classify each include statement as one of those 5 types
+      # using a number of techniques. The include_state object keeps
+      # track of the highest type seen, and complains if we see a
+      # lower type after that.
+      error_message = include_state.CheckNextIncludeOrder(
+          _ClassifyInclude(fileinfo, include, is_system))
+      if error_message:
+        error(filename, linenum, 'build/include_order', 4,
+              '%s. Should be: %s.h, c system, c++ system, other.' %
+              (error_message, fileinfo.BaseName()))
+      canonical_include = include_state.CanonicalizeAlphabeticalOrder(include)
+      if not include_state.IsInAlphabeticalOrder(
+          clean_lines, linenum, canonical_include):
+        error(filename, linenum, 'build/include_alpha', 4,
+              'Include "%s" not in alphabetical order' % include)
+      include_state.SetLastHeader(canonical_include)
+
+  # Look for any of the stream classes that are part of standard C++.
+  # NOTE(review): this re-applies the same pattern matched above (with
+  # .match instead of .search); kept as-is to preserve behavior.
+  match = _RE_PATTERN_INCLUDE.match(line)
+  if match:
+    include = match.group(2)
+    # Matches standard stream headers such as iostream, fstream, sstream,
+    # iostream-family variants, and bare 'stream'.
+    if Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include):
+      # Many unit tests use cout, so we exempt them.
+      if not _IsTestFilename(filename):
+        error(filename, linenum, 'readability/streams', 3,
+              'Streams are highly discouraged.')
+
+
+def _GetTextInside(text, start_pattern):
+  r"""Retrieves all the text between matching open and close parentheses.
+
+  Given a string of lines and a regular expression string, retrieve all the text
+  following the expression and between opening punctuation symbols like
+  (, [, or {, and the matching close-punctuation symbol. This properly nested
+  occurrences of the punctuations, so for the text like
+    printf(a(), b(c()));
+  a call to _GetTextInside(text, r'printf\(') will return 'a(), b(c())'.
+  start_pattern must match string having an open punctuation symbol at the end.
+
+  Args:
+    text: The lines to extract text. Its comments and strings must be elided.
+           It can be single line and can span multiple lines.
+    start_pattern: The regexp string indicating where to start extracting
+                   the text.
+  Returns:
+    The extracted text.
+    None if either the opening string or ending punctuation could not be found.
+  """
+  # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably
+  # rewritten to use _GetTextInside (and use inferior regexp matching today).
+
+  # Give opening punctuations to get the matching close-punctuations.
+  matching_punctuation = {'(': ')', '{': '}', '[': ']'}
+  # Note: itervalues() is Python 2; the resulting set holds ')', '}', ']'.
+  closing_punctuation = set(matching_punctuation.itervalues())
+
+  # Find the position to start extracting text.
+  match = re.search(start_pattern, text, re.M)
+  if not match:  # start_pattern not found in text.
+    return None
+  start_position = match.end(0)
+
+  assert start_position > 0, (
+      'start_pattern must ends with an opening punctuation.')
+  assert text[start_position - 1] in matching_punctuation, (
+      'start_pattern must ends with an opening punctuation.')
+  # Stack of closing punctuations we expect to have in text after position.
+  punctuation_stack = [matching_punctuation[text[start_position - 1]]]
+  position = start_position
+  # Scan forward one character at a time, pushing expected closers for each
+  # opener and popping when the matching closer is reached.
+  while punctuation_stack and position < len(text):
+    if text[position] == punctuation_stack[-1]:
+      punctuation_stack.pop()
+    elif text[position] in closing_punctuation:
+      # A closing punctuation without matching opening punctuations.
+      return None
+    elif text[position] in matching_punctuation:
+      punctuation_stack.append(matching_punctuation[text[position]])
+    position += 1
+  if punctuation_stack:
+    # Opening punctuations left without matching close-punctuations.
+    return None
+  # punctuations match.
+  return text[start_position:position - 1]
+
+
+# Patterns for matching call-by-reference parameters.
+#
+# Supports nested templates up to 2 levels deep using this messy pattern:
+#   < (?: < (?: < [^<>]*
+#               >
+#           |   [^<>] )*
+#         >
+#     |   [^<>] )*
+#   >
+# Regexp fragment matching a single C/C++ identifier.
+_RE_PATTERN_IDENT = r'[_a-zA-Z]\w*'  # =~ [[:alpha:]][[:alnum:]]*
+# Matches a (possibly const / elaborated / templated / scoped) type name.
+_RE_PATTERN_TYPE = (
+    r'(?:const\s+)?(?:typename\s+|class\s+|struct\s+|union\s+|enum\s+)?'
+    r'(?:\w|'
+    r'\s*<(?:<(?:<[^<>]*>|[^<>])*>|[^<>])*>|'
+    r'::)+')
+# A call-by-reference parameter ends with '& identifier'.
+_RE_PATTERN_REF_PARAM = re.compile(
+    r'(' + _RE_PATTERN_TYPE + r'(?:\s*(?:\bconst\b|[*]))*\s*'
+    r'&\s*' + _RE_PATTERN_IDENT + r')\s*(?:=[^,()]+)?[,)]')
+# A call-by-const-reference parameter either ends with 'const& identifier'
+# or looks like 'const type& identifier' when 'type' is atomic.
+_RE_PATTERN_CONST_REF_PARAM = (
+    r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT +
+    r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')')
+
+
+def CheckLanguage(filename, clean_lines, linenum, file_extension,
+                  include_state, nesting_state, error):
+  """Checks rules from the 'C++ language rules' section of cppguide.html.
+
+  Some of these rules are hard to test (function overloading, using
+  uint32 inappropriately), but we do the best we can.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    file_extension: The extension (without the dot) of the filename.
+    include_state: An _IncludeState instance in which the headers are inserted.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  # If the line is empty or consists of entirely a comment, no need to
+  # check it.
+  line = clean_lines.elided[linenum]
+  if not line:
+    return
+
+  match = _RE_PATTERN_INCLUDE.search(line)
+  if match:
+    CheckIncludeLine(filename, clean_lines, linenum, include_state, error)
+    return
+
+  # Reset include state across preprocessor directives.  This is meant
+  # to silence warnings for conditional includes.
+  if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line):
+    include_state.ResetSection()
+
+  # Make Windows paths like Unix.
+  # NOTE(review): fullname is never used later in this function; it may be
+  # vestigial -- confirm against upstream before removing.
+  fullname = os.path.abspath(filename).replace('\\', '/')
+
+  # TODO(unknown): figure out if they're using default arguments in fn proto.
+
+  # Check to see if they're using an conversion function cast.
+  # I just try to capture the most common basic types, though there are more.
+  # Parameterless conversion functions, such as bool(), are allowed as they are
+  # probably a member operator declaration or default constructor.
+  match = Search(
+      r'(\bnew\s+)?\b'  # Grab 'new' operator, if it's there
+      r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
+      r'(\([^)].*)', line)
+  if match:
+    matched_new = match.group(1)
+    matched_type = match.group(2)
+    matched_funcptr = match.group(3)
+
+    # gMock methods are defined using some variant of MOCK_METHODx(name, type)
+    # where type may be float(), int(string), etc.  Without context they are
+    # virtually indistinguishable from int(x) casts. Likewise, gMock's
+    # MockCallback takes a template parameter of the form return_type(arg_type),
+    # which looks much like the cast we're trying to detect.
+    #
+    # std::function<> wrapper has a similar problem.
+    #
+    # Return types for function pointers also look like casts if they
+    # don't have an extra space.
+    if (matched_new is None and  # If new operator, then this isn't a cast
+        not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
+             Search(r'\bMockCallback<.*>', line) or
+             Search(r'\bstd::function<.*>', line)) and
+        not (matched_funcptr and
+             Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(',
+                   matched_funcptr))):
+      # Try a bit harder to catch gmock lines: the only place where
+      # something looks like an old-style cast is where we declare the
+      # return type of the mocked method, and the only time when we
+      # are missing context is if MOCK_METHOD was split across
+      # multiple lines.  The missing MOCK_METHOD is usually one or two
+      # lines back, so scan back one or two lines.
+      #
+      # It's not possible for gmock macros to appear in the first 2
+      # lines, since the class head + section name takes up 2 lines.
+      if (linenum < 2 or
+          not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
+                     clean_lines.elided[linenum - 1]) or
+               Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
+                     clean_lines.elided[linenum - 2]))):
+        error(filename, linenum, 'readability/casting', 4,
+              'Using deprecated casting style.  '
+              'Use static_cast<%s>(...) instead' %
+              matched_type)
+
+  CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
+                  'static_cast',
+                  r'\((int|float|double|bool|char|u?int(16|32|64))\)', error)
+
+  # This doesn't catch all cases. Consider (const char * const)"hello".
+  #
+  # (char *) "foo" should always be a const_cast (reinterpret_cast won't
+  # compile).
+  if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
+                     'const_cast', r'\((char\s?\*+\s?)\)\s*"', error):
+    pass
+  else:
+    # Check pointer casts for other than string constants
+    CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
+                    'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error)
+
+  # In addition, we look for people taking the address of a cast.  This
+  # is dangerous -- casts can assign to temporaries, so the pointer doesn't
+  # point where you think.
+  match = Search(
+      r'(?:&\(([^)]+)\)[\w(])|'
+      r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line)
+  if match and match.group(1) != '*':
+    error(filename, linenum, 'runtime/casting', 4,
+          ('Are you taking an address of a cast?  '
+           'This is dangerous: could be a temp var.  '
+           'Take the address before doing the cast, rather than after'))
+
+  # Create an extended_line, which is the concatenation of the current and
+  # next lines, for more effective checking of code that may span more than one
+  # line.
+  if linenum + 1 < clean_lines.NumLines():
+    extended_line = line + clean_lines.elided[linenum + 1]
+  else:
+    extended_line = line
+
+  # Check for people declaring static/global STL strings at the top level.
+  # This is dangerous because the C++ language does not guarantee that
+  # globals with constructors are initialized before the first access.
+  match = Match(
+      r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)',
+      line)
+  # Make sure it's not a function.
+  # Function template specialization looks like: "string foo<Type>(...".
+  # Class template definitions look like: "string Foo<Type>::Method(...".
+  #
+  # Also ignore things that look like operators.  These are matched separately
+  # because operator names cross non-word boundaries.  If we change the pattern
+  # above, we would decrease the accuracy of matching identifiers.
+  if (match and
+      not Search(r'\boperator\W', line) and
+      not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))):
+    error(filename, linenum, 'runtime/string', 4,
+          'For a static/global string constant, use a C style string instead: '
+          '"%schar %s[]".' %
+          (match.group(1), match.group(2)))
+
+  # Catch 'member_(member_)' style self-initialization in initializer lists.
+  if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line):
+    error(filename, linenum, 'runtime/init', 4,
+          'You seem to be initializing a member variable with itself.')
+
+  if file_extension == 'h':
+    # TODO(unknown): check that 1-arg constructors are explicit.
+    #                How to tell it's a constructor?
+    #                (handled in CheckForNonStandardConstructs for now)
+    # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS
+    #                (level 1 error)
+    pass
+
+  # Check if people are using the verboten C basic types.  The only exception
+  # we regularly allow is "unsigned short port" for port.
+  if Search(r'\bshort port\b', line):
+    if not Search(r'\bunsigned short port\b', line):
+      error(filename, linenum, 'runtime/int', 4,
+            'Use "unsigned short" for ports, not "short"')
+  else:
+    match = Search(r'\b(short|long(?! +double)|long long)\b', line)
+    if match:
+      error(filename, linenum, 'runtime/int', 4,
+            'Use int16/int64/etc, rather than the C type %s' % match.group(1))
+
+  # When snprintf is used, the second argument shouldn't be a literal.
+  match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line)
+  if match and match.group(2) != '0':
+    # If 2nd arg is zero, snprintf is used to calculate size.
+    error(filename, linenum, 'runtime/printf', 3,
+          'If you can, use sizeof(%s) instead of %s as the 2nd arg '
+          'to snprintf.' % (match.group(1), match.group(2)))
+
+  # Check if some verboten C functions are being used.
+  if Search(r'\bsprintf\b', line):
+    error(filename, linenum, 'runtime/printf', 5,
+          'Never use sprintf.  Use snprintf instead.')
+  match = Search(r'\b(strcpy|strcat)\b', line)
+  if match:
+    error(filename, linenum, 'runtime/printf', 4,
+          'Almost always, snprintf is better than %s' % match.group(1))
+
+  # Check if some verboten operator overloading is going on
+  # TODO(unknown): catch out-of-line unary operator&:
+  #   class X {};
+  #   int operator&(const X& x) { return 42; }  // unary operator&
+  # The trick is it's hard to tell apart from binary operator&:
+  #   class Y { int operator&(const Y& x) { return 23; } }; // binary operator&
+  if Search(r'\boperator\s*&\s*\(\s*\)', line):
+    error(filename, linenum, 'runtime/operator', 4,
+          'Unary operator& is dangerous.  Do not use it.')
+
+  # Check for suspicious usage of "if" like
+  # } if (a == b) {
+  if Search(r'\}\s*if\s*\(', line):
+    error(filename, linenum, 'readability/braces', 4,
+          'Did you mean "else if"? If not, start a new line for "if".')
+
+  # Check for potential format string bugs like printf(foo).
+  # We constrain the pattern not to pick things like DocidForPrintf(foo).
+  # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str())
+  # TODO(sugawarayu): Catch the following case. Need to change the calling
+  # convention of the whole function to process multiple line to handle it.
+  #   printf(
+  #       boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line);
+  printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(')
+  if printf_args:
+    match = Match(r'([\w.\->()]+)$', printf_args)
+    if match and match.group(1) != '__VA_ARGS__':
+      function_name = re.search(r'\b((?:string)?printf)\s*\(',
+                                line, re.I).group(1)
+      error(filename, linenum, 'runtime/printf', 4,
+            'Potential format string bug. Do %s("%%s", %s) instead.'
+            % (function_name, match.group(1)))
+
+  # Check for potential memset bugs like memset(buf, sizeof(buf), 0).
+  match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line)
+  # NOTE(review): in the pattern below, '^' binds only to the '' alternative
+  # and '$' only to the hex alternative, and the hex alternative matches a
+  # single digit; the intent was presumably to anchor each alternative and
+  # allow multi-digit hex -- confirm against upstream cpplint.
+  if match and not Match(r"^''|-?[0-9]+|0x[0-9A-Fa-f]$", match.group(2)):
+    error(filename, linenum, 'runtime/memset', 4,
+          'Did you mean "memset(%s, 0, %s)"?'
+          % (match.group(1), match.group(2)))
+
+  if Search(r'\busing namespace\b', line):
+    error(filename, linenum, 'build/namespaces', 5,
+          'Do not use namespace using-directives.  '
+          'Use using-declarations instead.')
+
+  # Detect variable-length arrays.
+  match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line)
+  if (match and match.group(2) != 'return' and match.group(2) != 'delete' and
+      match.group(3).find(']') == -1):
+    # Split the size using space and arithmetic operators as delimiters.
+    # If any of the resulting tokens are not compile time constants then
+    # report the error.
+    # NOTE(review): the trailing ']' makes the last alternative the literal
+    # three-character token '>>]' rather than '>>' -- looks like a typo;
+    # confirm against upstream cpplint before relying on '>>' splitting.
+    tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>]', match.group(3))
+    is_const = True
+    skip_next = False
+    for tok in tokens:
+      if skip_next:
+        skip_next = False
+        continue
+
+      if Search(r'sizeof\(.+\)', tok): continue
+      if Search(r'arraysize\(\w+\)', tok): continue
+
+      tok = tok.lstrip('(')
+      tok = tok.rstrip(')')
+      if not tok: continue
+      if Match(r'\d+', tok): continue
+      if Match(r'0[xX][0-9a-fA-F]+', tok): continue
+      if Match(r'k[A-Z0-9]\w*', tok): continue
+      if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue
+      if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue
+      # A catch all for tricky sizeof cases, including 'sizeof expression',
+      # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)'
+      # requires skipping the next token because we split on ' ' and '*'.
+      if tok.startswith('sizeof'):
+        skip_next = True
+        continue
+      is_const = False
+      break
+    if not is_const:
+      error(filename, linenum, 'runtime/arrays', 1,
+            'Do not use variable-length arrays.  Use an appropriately named '
+            "('k' followed by CamelCase) compile-time constant for the size.")
+
+  # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or
+  # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing
+  # in the class declaration.
+  match = Match(
+      (r'\s*'
+       r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))'
+       r'\(.*\);$'),
+      line)
+  if match and linenum + 1 < clean_lines.NumLines():
+    next_line = clean_lines.elided[linenum + 1]
+    # We allow some, but not all, declarations of variables to be present
+    # in the statement that defines the class.  The [\w\*,\s]* fragment of
+    # the regular expression below allows users to declare instances of
+    # the class or pointers to instances, but not less common types such
+    # as function pointers or arrays.  It's a tradeoff between allowing
+    # reasonable code and avoiding trying to parse more C++ using regexps.
+    if not Search(r'^\s*}[\w\*,\s]*;', next_line):
+      error(filename, linenum, 'readability/constructors', 3,
+            match.group(1) + ' should be the last thing in the class')
+
+  # Check for use of unnamed namespaces in header files.  Registration
+  # macros are typically OK, so we allow use of "namespace {" on lines
+  # that end with backslashes.
+  if (file_extension == 'h'
+      and Search(r'\bnamespace\s*{', line)
+      and line[-1] != '\\'):
+    error(filename, linenum, 'build/namespaces', 4,
+          'Do not use unnamed namespaces in header files.  See '
+          'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
+          ' for more information.')
+
+def CheckForNonConstReference(filename, clean_lines, linenum,
+                              nesting_state, error):
+  """Check for non-const references.
+
+  Separate from CheckLanguage since it scans backwards from current
+  line, instead of scanning forward.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  # Do nothing if there is no '&' on current line.
+  line = clean_lines.elided[linenum]
+  if '&' not in line:
+    return
+
+  # Long type names may be broken across multiple lines, usually in one
+  # of these forms:
+  #   LongType
+  #       ::LongTypeContinued &identifier
+  #   LongType::
+  #       LongTypeContinued &identifier
+  #   LongType<
+  #       ...>::LongTypeContinued &identifier
+  #
+  # If we detected a type split across two lines, join the previous
+  # line to current line so that we can match const references
+  # accordingly.
+  #
+  # Note that this only scans back one line, since scanning back
+  # arbitrary number of lines would be expensive.  If you have a type
+  # that spans more than 2 lines, please use a typedef.
+  if linenum > 1:
+    previous = None
+    if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line):
+      # previous_line\n + ::current_line
+      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$',
+                        clean_lines.elided[linenum - 1])
+    elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line):
+      # previous_line::\n + current_line
+      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$',
+                        clean_lines.elided[linenum - 1])
+    if previous:
+      line = previous.group(1) + line.lstrip()
+    else:
+      # Check for templated parameter that is split across multiple lines
+      endpos = line.rfind('>')
+      if endpos > -1:
+        (_, startline, startpos) = ReverseCloseExpression(
+            clean_lines, linenum, endpos)
+        if startpos > -1 and startline < linenum:
+          # Found the matching < on an earlier line, collect all
+          # pieces up to current line.
+          line = ''
+          # NOTE(review): xrange is Python 2 only (range in Python 3).
+          for i in xrange(startline, linenum + 1):
+            line += clean_lines.elided[i].strip()
+
+  # Check for non-const references in function parameters.  A single '&' may
+  # found in the following places:
+  #   inside expression: binary & for bitwise AND
+  #   inside expression: unary & for taking the address of something
+  #   inside declarators: reference parameter
+  # We will exclude the first two cases by checking that we are not inside a
+  # function body, including one that was just introduced by a trailing '{'.
+  # TODO(unknwon): Doesn't account for preprocessor directives.
+  # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare].
+  check_params = False
+  if not nesting_state.stack:
+    check_params = True  # top level
+  elif (isinstance(nesting_state.stack[-1], _ClassInfo) or
+        isinstance(nesting_state.stack[-1], _NamespaceInfo)):
+    check_params = True  # within class or namespace
+  elif Match(r'.*{\s*$', line):
+    if (len(nesting_state.stack) == 1 or
+        isinstance(nesting_state.stack[-2], _ClassInfo) or
+        isinstance(nesting_state.stack[-2], _NamespaceInfo)):
+      check_params = True  # just opened global/class/namespace block
+  # We allow non-const references in a few standard places, like functions
+  # called "swap()" or iostream operators like "<<" or ">>".  Do not check
+  # those function parameters.
+  #
+  # We also accept & in static_assert, which looks like a function but
+  # it's actually a declaration expression.
+  # NOTE(review): '<\w:+>' in the swap alternative matches one word char
+  # followed by one or more colons inside '<>'; presumably '<\w+>' (a
+  # template argument) was intended -- confirm against upstream cpplint.
+  whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|'
+                           r'operator\s*[<>][<>]|'
+                           r'static_assert|COMPILE_ASSERT'
+                           r')\s*\(')
+  if Search(whitelisted_functions, line):
+    check_params = False
+  elif not Search(r'\S+\([^)]*$', line):
+    # Don't see a whitelisted function on this line.  Actually we
+    # didn't see any function name on this line, so this is likely a
+    # multi-line parameter list.  Try a bit harder to catch this case.
+    for i in xrange(2):
+      if (linenum > i and
+          Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])):
+        check_params = False
+        break
+
+  if check_params:
+    decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
+    for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
+      if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter):
+        error(filename, linenum, 'runtime/references', 2,
+              'Is this a non-const reference? '
+              'If so, make const or use a pointer: ' +
+              ReplaceAll(' *<', '<', parameter))
+
+
+def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
+                    error):
+  """Checks for a C-style cast by looking for the pattern.
+
+  Args:
+    filename: The name of the current file.
+    linenum: The number of the line to check.
+    line: The line of code to check.
+    raw_line: The raw line of code to check, with comments.
+    cast_type: The string for the C++ cast to recommend.  This is either
+      reinterpret_cast, static_cast, or const_cast, depending.
+    pattern: The regular expression used to find C-style casts.
+    error: The function to call with any errors found.
+
+  Returns:
+    True if an error was emitted.
+    False otherwise.
+  """
+  match = Search(pattern, line)
+  if not match:
+    return False
+
+  # Exclude lines with sizeof, since sizeof looks like a cast.
+  sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1])
+  if sizeof_match:
+    return False
+
+  # operator++(int) and operator--(int)
+  if (line[0:match.start(1) - 1].endswith(' operator++') or
+      line[0:match.start(1) - 1].endswith(' operator--')):
+    return False
+
+  # A single unnamed argument for a function tends to look like old
+  # style cast.  If we see those, don't issue warnings for deprecated
+  # casts, instead issue warnings for unnamed arguments where
+  # appropriate.
+  #
+  # These are things that we want warnings for, since the style guide
+  # explicitly require all parameters to be named:
+  #   Function(int);
+  #   Function(int) {
+  #   ConstMember(int) const;
+  #   ConstMember(int) const {
+  #   ExceptionMember(int) throw (...);
+  #   ExceptionMember(int) throw (...) {
+  #   PureVirtual(int) = 0;
+  #
+  # These are functions of some sort, where the compiler would be fine
+  # if they had named parameters, but people often omit those
+  # identifiers to reduce clutter:
+  #   (FunctionPointer)(int);
+  #   (FunctionPointer)(int) = value;
+  #   Function((function_pointer_arg)(int))
+  #   <TemplateArgument(int)>;
+  #   <(FunctionPointerTemplateArgument)(int)>;
+  remainder = line[match.end(0):]
+  if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder):
+    # Looks like an unnamed parameter.
+
+    # Don't warn on any kind of template arguments.
+    if Match(r'^\s*>', remainder):
+      return False
+
+    # Don't warn on assignments to function pointers, but keep warnings for
+    # unnamed parameters to pure virtual functions.  Note that this pattern
+    # will also pass on assignments of "0" to function pointers, but the
+    # preferred values for those would be "nullptr" or "NULL".
+    # NOTE(review): '^\s=' requires exactly one whitespace before '=';
+    # presumably '^\s*=' was intended -- confirm against upstream cpplint.
+    matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder)
+    if matched_zero and matched_zero.group(1) != '0':
+      return False
+
+    # Don't warn on function pointer declarations.  For this we need
+    # to check what came before the "(type)" string.
+    if Match(r'.*\)\s*$', line[0:match.start(0)]):
+      return False
+
+    # Don't warn if the parameter is named with block comments, e.g.:
+    #  Function(int /*unused_param*/);
+    if '/*' in raw_line:
+      return False
+
+    # Passed all filters, issue warning here.
+    error(filename, linenum, 'readability/function', 3,
+          'All parameters should be named in a function')
+    return True
+
+  # At this point, all that should be left is actual casts.
+  error(filename, linenum, 'readability/casting', 4,
+        'Using C-style cast.  Use %s<%s>(...) instead' %
+        (cast_type, match.group(1)))
+
+  return True
+
+
+# Maps an STL header to the template names it provides, used by the
+# include-what-you-use check to suggest a header for an unincluded template.
+_HEADERS_CONTAINING_TEMPLATES = (
+    ('<deque>', ('deque',)),
+    ('<functional>', ('unary_function', 'binary_function',
+                      'plus', 'minus', 'multiplies', 'divides', 'modulus',
+                      'negate',
+                      'equal_to', 'not_equal_to', 'greater', 'less',
+                      'greater_equal', 'less_equal',
+                      'logical_and', 'logical_or', 'logical_not',
+                      'unary_negate', 'not1', 'binary_negate', 'not2',
+                      'bind1st', 'bind2nd',
+                      'pointer_to_unary_function',
+                      'pointer_to_binary_function',
+                      'ptr_fun',
+                      'mem_fun_t', 'mem_fun', 'mem_fun1_t', 'mem_fun1_ref_t',
+                      'mem_fun_ref_t',
+                      'const_mem_fun_t', 'const_mem_fun1_t',
+                      'const_mem_fun_ref_t', 'const_mem_fun1_ref_t',
+                      'mem_fun_ref',
+                     )),
+    ('<limits>', ('numeric_limits',)),
+    ('<list>', ('list',)),
+    ('<map>', ('map', 'multimap',)),
+    ('<memory>', ('allocator',)),
+    ('<queue>', ('queue', 'priority_queue',)),
+    ('<set>', ('set', 'multiset',)),
+    ('<stack>', ('stack',)),
+    ('<string>', ('char_traits', 'basic_string',)),
+    ('<utility>', ('pair',)),
+    ('<vector>', ('vector',)),
+
+    # gcc extensions.
+    # Note: std::hash is their hash, ::hash is our hash
+    ('<hash_map>', ('hash_map', 'hash_multimap',)),
+    ('<hash_set>', ('hash_set', 'hash_multiset',)),
+    ('<slist>', ('slist',)),
+    )
+
+# Matches uses of the non-template 'string' type (handled specially below).
+_RE_PATTERN_STRING = re.compile(r'\bstring\b')
+
+# (pattern, template name, header) triples for <algorithm> functions.
+_re_pattern_algorithm_header = []
+for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap',
+                  'transform'):
+  # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
+  # type::max().
+  _re_pattern_algorithm_header.append(
+      (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
+       _template,
+       '<algorithm>'))
+
+# (pattern, 'name<>', header) triples built from _HEADERS_CONTAINING_TEMPLATES.
+_re_pattern_templates = []
+for _header, _templates in _HEADERS_CONTAINING_TEMPLATES:
+  for _template in _templates:
+    _re_pattern_templates.append(
+        (re.compile(r'(\<|\b)' + _template + r'\s*\<'),
+         _template + '<>',
+         _header))
+
+
+def FilesBelongToSameModule(filename_cc, filename_h):
+  """Check if these two filenames belong to the same module.
+
+  The concept of a 'module' here is a as follows:
+  foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc belong to the
+  same 'module' if they are in the same directory.
+  some/path/public/xyzzy and some/path/internal/xyzzy are also considered
+  to belong to the same module here.
+
+  If the filename_cc contains a longer path than the filename_h, for example,
+  '/absolute/path/to/base/sysinfo.cc', and this file would include
+  'base/sysinfo.h', this function also produces the prefix needed to open the
+  header. This is used by the caller of this function to more robustly open the
+  header file. We don't have access to the real include paths in this context,
+  so we need this guesswork here.
+
+  Known bugs: tools/base/bar.cc and base/bar.h belong to the same module
+  according to this implementation. Because of this, this function gives
+  some false positives. This should be sufficiently rare in practice.
+
+  Args:
+    filename_cc: is the path for the .cc file
+    filename_h: is the path for the header path
+
+  Returns:
+    Tuple with a bool and a string:
+    bool: True if filename_cc and filename_h belong to the same module.
+    string: the additional prefix needed to open the header file.
+  """
+
+  # Strip the extension and any test suffix so 'foo_test.cc' maps to 'foo'.
+  if not filename_cc.endswith('.cc'):
+    return (False, '')
+  filename_cc = filename_cc[:-len('.cc')]
+  if filename_cc.endswith('_unittest'):
+    filename_cc = filename_cc[:-len('_unittest')]
+  elif filename_cc.endswith('_test'):
+    filename_cc = filename_cc[:-len('_test')]
+  # public/ and internal/ directory components are treated as equivalent.
+  filename_cc = filename_cc.replace('/public/', '/')
+  filename_cc = filename_cc.replace('/internal/', '/')
+
+  if not filename_h.endswith('.h'):
+    return (False, '')
+  filename_h = filename_h[:-len('.h')]
+  if filename_h.endswith('-inl'):
+    filename_h = filename_h[:-len('-inl')]
+  filename_h = filename_h.replace('/public/', '/')
+  filename_h = filename_h.replace('/internal/', '/')
+
+  # Same module iff the stripped .cc path ends with the stripped header path;
+  # the remainder is the directory prefix needed to open the header.
+  files_belong_to_same_module = filename_cc.endswith(filename_h)
+  common_path = ''
+  if files_belong_to_same_module:
+    common_path = filename_cc[:-len(filename_h)]
+  return files_belong_to_same_module, common_path
+
+
+def UpdateIncludeState(filename, include_state, io=codecs):
+  """Fill up the include_state with new includes found from the file.
+
+  Args:
+    filename: the name of the header to read.
+    include_state: an _IncludeState instance in which the headers are inserted.
+    io: The io factory to use to read the file. Provided for testability.
+
+  Returns:
+    True if a header was successfully added. False otherwise.
+  """
+  headerfile = None
+  try:
+    # NOTE(review): with codecs.open the positional args here are
+    # (encoding='utf8', errors='replace'), so decode errors are replaced
+    # rather than raised.
+    headerfile = io.open(filename, 'r', 'utf8', 'replace')
+  except IOError:
+    return False
+  linenum = 0
+  for line in headerfile:
+    linenum += 1
+    clean_line = CleanseComments(line)
+    match = _RE_PATTERN_INCLUDE.search(clean_line)
+    if match:
+      include = match.group(2)
+      # The value formatting is cute, but not really used right now.
+      # What matters here is that the key is in include_state.
+      include_state.setdefault(include, '%s:%d' % (filename, linenum))
+  return True
+
+
+def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
+                              io=codecs):
+  """Reports for missing stl includes.
+
+  This function will output warnings to make sure you are including the headers
+  necessary for the stl containers and functions that you use. We only give one
+  reason to include a header. For example, if you use both equal_to<> and
+  less<> in a .h file, only one (the latter in the file) of these will be
+  reported as a reason to include the <functional>.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    include_state: An _IncludeState instance.
+    error: The function to call with any errors found.
+    io: The IO factory to use to read the header file. Provided for unittest
+        injection.
+  """
+  required = {}  # A map of header name to linenumber and the template entity.
+                 # Example of required: { '<functional>': (1219, 'less<>') }
+
+  for linenum in xrange(clean_lines.NumLines()):
+    line = clean_lines.elided[linenum]
+    if not line or line[0] == '#':
+      continue
+
+    # String is special -- it is a non-templatized type in STL.
+    matched = _RE_PATTERN_STRING.search(line)
+    if matched:
+      # Don't warn about strings in non-STL namespaces:
+      # (We check only the first match per line; good enough.)
+      prefix = line[:matched.start()]
+      # Warn only for 'std::string' or unqualified 'string'; any other
+      # 'foo::string' qualification is assumed to be a non-STL type.
+      if prefix.endswith('std::') or not prefix.endswith('::'):
+        required['<string>'] = (linenum, 'string')
+
+    for pattern, template, header in _re_pattern_algorithm_header:
+      if pattern.search(line):
+        required[header] = (linenum, template)
+
+    # The following function is just a speed up, no semantics are changed.
+    if not '<' in line:  # Reduces the cpu time usage by skipping lines.
+      continue
+
+    for pattern, template, header in _re_pattern_templates:
+      if pattern.search(line):
+        required[header] = (linenum, template)
+
+  # The policy is that if you #include something in foo.h you don't need to
+  # include it again in foo.cc. Here, we will look at possible includes.
+  # Let's copy the include_state so it is only messed up within this function.
+  include_state = include_state.copy()
+
+  # Did we find the header for this file (if any) and successfully load it?
+  header_found = False
+
+  # Use the absolute path so that matching works properly.
+  abs_filename = FileInfo(filename).FullName()
+
+  # For Emacs's flymake.
+  # If cpplint is invoked from Emacs's flymake, a temporary file is generated
+  # by flymake and that file name might end with '_flymake.cc'. In that case,
+  # restore original file name here so that the corresponding header file can be
+  # found.
+  # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h'
+  # instead of 'foo_flymake.h'
+  abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)
+
+  # include_state is modified during iteration, so we iterate over a copy of
+  # the keys.
+  # NOTE(review): on Python 2, keys() returns a fresh list (a true copy);
+  # under Python 3 it would be a live view -- confirm target interpreter.
+  header_keys = include_state.keys()
+  for header in header_keys:
+    (same_module, common_path) = FilesBelongToSameModule(abs_filename, header)
+    fullpath = common_path + header
+    if same_module and UpdateIncludeState(fullpath, include_state, io):
+      header_found = True
+
+  # If we can't find the header file for a .cc, assume it's because we don't
+  # know where to look. In that case we'll give up as we're not sure they
+  # didn't include it in the .h file.
+  # TODO(unknown): Do a better job of finding .h files so we are confident that
+  # not having the .h file means there isn't one.
+  if filename.endswith('.cc') and not header_found:
+    return
+
+  # All the lines have been processed, report the errors found.
+  # Anything still in 'required' that never appeared in include_state
+  # (directly or via the file's own header) is reported as missing.
+  for required_header_unstripped in required:
+    template = required[required_header_unstripped][1]
+    if required_header_unstripped.strip('<>"') not in include_state:
+      error(filename, required[required_header_unstripped][0],
+            'build/include_what_you_use', 4,
+            'Add #include ' + required_header_unstripped + ' for ' + template)
+
+
+# Matches 'make_pair<' -- an explicit template-argument list on make_pair.
+_RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<')
+
+
+def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
+  """Check that make_pair's template arguments are deduced.
+
+  G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are
+  specified explicitly, and such use isn't intended in any case.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+  match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line)
+  if match:
+    error(filename, linenum, 'build/explicit_make_pair',
+          4,  # 4 = high confidence
+          'For C++11-compatibility, omit template arguments from make_pair'
+          ' OR use pair directly OR if appropriate, construct a pair directly')
+
+
+def ProcessLine(filename, file_extension, clean_lines, line,
+                include_state, function_state, nesting_state, error,
+                extra_check_functions=[]):
+  """Processes a single line in the file.
+
+  Args:
+    filename: Filename of the file that is being processed.
+    file_extension: The extension (dot not included) of the file.
+    clean_lines: An array of strings, each representing a line of the file,
+                 with comments stripped.
+    line: Number of line being processed.
+    include_state: An _IncludeState instance in which the headers are inserted.
+    function_state: A _FunctionState instance which counts function lines, etc.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: A callable to which errors are reported, which takes 4 arguments:
+           filename, line number, error level, and message
+    extra_check_functions: An array of additional check functions that will be
+                           run on each source line. Each function takes 4
+                           arguments: filename, clean_lines, line, error
+  """
+  raw_lines = clean_lines.raw_lines
+  # Suppressions are parsed from the raw line so NOLINT comments survive
+  # comment stripping.
+  ParseNolintSuppressions(filename, raw_lines[line], line, error)
+  nesting_state.Update(filename, clean_lines, line, error)
+  # Skip all remaining checks while inside an inline assembly block.
+  if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM:
+    return
+  CheckForFunctionLengths(filename, clean_lines, line, function_state, error)
+  CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error)
+  CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error)
+  CheckLanguage(filename, clean_lines, line, file_extension, include_state,
+                nesting_state, error)
+  CheckForNonConstReference(filename, clean_lines, line, nesting_state, error)
+  CheckForNonStandardConstructs(filename, clean_lines, line,
+                                nesting_state, error)
+  CheckVlogArguments(filename, clean_lines, line, error)
+  CheckPosixThreading(filename, clean_lines, line, error)
+  CheckInvalidIncrement(filename, clean_lines, line, error)
+  CheckMakePairUsesDeduction(filename, clean_lines, line, error)
+  # NOTE(review): 'extra_check_functions=[]' is a shared mutable default;
+  # harmless here since it is only iterated, never mutated.
+  for check_fn in extra_check_functions:
+    check_fn(filename, clean_lines, line, error)
+
+def ProcessFileData(filename, file_extension, lines, error,
+                    extra_check_functions=[]):
+  """Performs lint checks and reports any errors to the given error function.
+
+  Args:
+    filename: Filename of the file that is being processed.
+    file_extension: The extension (dot not included) of the file.
+    lines: An array of strings, each representing a line of the file, with the
+           last element being empty if the file is terminated with a newline.
+    error: A callable to which errors are reported, which takes 4 arguments:
+           filename, line number, error level, and message
+    extra_check_functions: An array of additional check functions that will be
+                           run on each source line. Each function takes 4
+                           arguments: filename, clean_lines, line, error
+  """
+  # Pad with sentinel lines so line numbers and list indices both start at 1
+  # and per-line checks never run off either end of the list.
+  lines = (['// marker so line numbers and indices both start at 1'] + lines +
+           ['// marker so line numbers end in a known way'])
+
+  include_state = _IncludeState()
+  function_state = _FunctionState()
+  nesting_state = _NestingState()
+
+  ResetNolintSuppressions()
+
+  CheckForCopyright(filename, lines, error)
+
+  # Header files additionally require an include guard.
+  if file_extension == 'h':
+    CheckForHeaderGuard(filename, lines, error)
+
+  RemoveMultiLineComments(filename, lines, error)
+  clean_lines = CleansedLines(lines)
+  for line in xrange(clean_lines.NumLines()):
+    ProcessLine(filename, file_extension, clean_lines, line,
+                include_state, function_state, nesting_state, error,
+                extra_check_functions)
+  nesting_state.CheckCompletedBlocks(filename, error)
+
+  CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error)
+
+  # We check here rather than inside ProcessLine so that we see raw
+  # lines rather than "cleaned" lines.
+  CheckForBadCharacters(filename, lines, error)
+
+  CheckForNewlineAtEOF(filename, lines, error)
+
+def ProcessFile(filename, vlevel, extra_check_functions=[]):
+  """Does google-lint on a single file.
+
+  Args:
+    filename: The name of the file to parse.
+
+    vlevel: The level of errors to report.  Every error of confidence
+    >= verbose_level will be reported.  0 is a good default.
+
+    extra_check_functions: An array of additional check functions that will be
+                           run on each source line. Each function takes 4
+                           arguments: filename, clean_lines, line, error
+  """
+
+  _SetVerboseLevel(vlevel)
+
+  try:
+    # Support the UNIX convention of using "-" for stdin.  Note that
+    # we are not opening the file with universal newline support
+    # (which codecs doesn't support anyway), so the resulting lines do
+    # contain trailing '\r' characters if we are reading a file that
+    # has CRLF endings.
+    # If after the split a trailing '\r' is present, it is removed
+    # below. If it is not expected to be present (i.e. os.linesep !=
+    # '\r\n' as in Windows), a warning is issued below if this file
+    # is processed.
+
+    if filename == '-':
+      lines = codecs.StreamReaderWriter(sys.stdin,
+                                        codecs.getreader('utf8'),
+                                        codecs.getwriter('utf8'),
+                                        'replace').read().split('\n')
+    else:
+      lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n')
+
+    carriage_return_found = False
+    # Remove trailing '\r'.
+    for linenum in range(len(lines)):
+      if lines[linenum].endswith('\r'):
+        lines[linenum] = lines[linenum].rstrip('\r')
+        carriage_return_found = True
+
+  except IOError:
+    # 'lines' and 'carriage_return_found' are only bound when the read
+    # succeeded; this early return keeps the code below from touching them.
+    sys.stderr.write(
+        "Skipping input '%s': Can't open for reading\n" % filename)
+    return
+
+  # Note, if no dot is found, this will give the entire filename as the ext.
+  file_extension = filename[filename.rfind('.') + 1:]
+
+  # When reading from stdin, the extension is unknown, so no cpplint tests
+  # should rely on the extension.
+  if filename != '-' and file_extension not in _valid_extensions:
+    sys.stderr.write('Ignoring %s; not a valid file name '
+                     '(%s)\n' % (filename, ', '.join(_valid_extensions)))
+  else:
+    ProcessFileData(filename, file_extension, lines, Error,
+                    extra_check_functions)
+    if carriage_return_found and os.linesep != '\r\n':
+      # Use 0 for linenum since outputting only one error for potentially
+      # several lines.
+      Error(filename, 0, 'whitespace/newline', 1,
+            'One or more unexpected \\r (^M) found;'
+            'better to use only a \\n')
+
+  # Always emitted, even for skipped/invalid names, so callers see progress.
+  sys.stderr.write('Done processing %s\n' % filename)
+
+
+def PrintUsage(message):
+  """Prints a brief usage string and exits, optionally with an error message.
+
+  Never returns: always terminates the process via sys.exit.
+
+  Args:
+    message: The optional error message.
+  """
+  sys.stderr.write(_USAGE)
+  if message:
+    # sys.exit with a string prints it to stderr and exits with status 1.
+    sys.exit('\nFATAL ERROR: ' + message)
+  else:
+    sys.exit(1)
+
+
+def PrintCategories():
+  """Prints a list of all the error-categories used by error messages.
+
+  These are the categories used to filter messages via --filter.
+  Never returns: exits with status 0 (informational exit).
+  """
+  sys.stderr.write(''.join('  %s\n' % cat for cat in _ERROR_CATEGORIES))
+  sys.exit(0)
+
+
+def ParseArguments(args):
+  """Parses the command line arguments.
+
+  This may set the output format and verbosity level as side-effects.
+
+  Args:
+    args: The command line arguments:
+
+  Returns:
+    The list of filenames to lint.
+  """
+  try:
+    (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=',
+                                                 'counting=',
+                                                 'filter=',
+                                                 'root=',
+                                                 'linelength=',
+                                                 'extensions='])
+  except getopt.GetoptError:
+    # PrintUsage calls sys.exit, so 'opts'/'filenames' are always bound below.
+    PrintUsage('Invalid arguments.')
+
+  verbosity = _VerboseLevel()
+  output_format = _OutputFormat()
+  filters = ''
+  counting_style = ''
+
+  for (opt, val) in opts:
+    if opt == '--help':
+      PrintUsage(None)
+    elif opt == '--output':
+      if val not in ('emacs', 'vs7', 'eclipse'):
+        PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.')
+      output_format = val
+    elif opt == '--verbose':
+      verbosity = int(val)
+    elif opt == '--filter':
+      filters = val
+      # A bare '--filter=' lists all categories and exits.
+      if not filters:
+        PrintCategories()
+    elif opt == '--counting':
+      if val not in ('total', 'toplevel', 'detailed'):
+        PrintUsage('Valid counting options are total, toplevel, and detailed')
+      counting_style = val
+    elif opt == '--root':
+      global _root
+      _root = val
+    elif opt == '--linelength':
+      global _line_length
+      try:
+          _line_length = int(val)
+      except ValueError:
+          PrintUsage('Line length must be digits.')
+    elif opt == '--extensions':
+      global _valid_extensions
+      try:
+          _valid_extensions = set(val.split(','))
+      except ValueError:
+          PrintUsage('Extensions must be comma separated list.')
+
+  if not filenames:
+    PrintUsage('No files were specified.')
+
+  # Publish the parsed settings to the module-level lint state.
+  _SetOutputFormat(output_format)
+  _SetVerboseLevel(verbosity)
+  _SetFilters(filters)
+  _SetCountingStyle(counting_style)
+
+  return filenames
+
+
+def main():
+  """Parses arguments, lints each file, and exits 1 if any error was found."""
+  filenames = ParseArguments(sys.argv[1:])
+
+  # Change stderr to write with replacement characters so we don't die
+  # if we try to print something containing non-ASCII characters.
+  sys.stderr = codecs.StreamReaderWriter(sys.stderr,
+                                         codecs.getreader('utf8'),
+                                         codecs.getwriter('utf8'),
+                                         'replace')
+
+  _cpplint_state.ResetErrorCounts()
+  for filename in filenames:
+    ProcessFile(filename, _cpplint_state.verbose_level)
+  _cpplint_state.PrintErrorCounts()
+
+  # bool is an int: status 1 when any error was reported, else 0.
+  sys.exit(_cpplint_state.error_count > 0)
diff --git a/linters/lint_engine/FacebookFbcodeLintEngine.php b/linters/lint_engine/FacebookFbcodeLintEngine.php
new file mode 100644 (file)
index 0000000..cb9cf9b
--- /dev/null
@@ -0,0 +1,147 @@
+<?php
+// Copyright 2004-present Facebook.  All rights reserved.
+
+// Arcanist lint engine for fbcode-style projects: builds the set of
+// linters (generated-file, nolint, text, PEP8, cpplint, spelling,
+// filename) and routes each changed path to the linters that apply
+// to its file type.
+class FacebookFbcodeLintEngine extends ArcanistLintEngine {
+
+  public function buildLinters() {
+    $linters = array();
+    $paths = $this->getPaths();
+
+    // Remove all deleted files, which are not checked by the
+    // following linters.
+    foreach ($paths as $key => $path) {
+      if (!Filesystem::pathExists($this->getFilePathOnDisk($path))) {
+        unset($paths[$key]);
+      }
+    }
+
+    $generated_linter = new ArcanistGeneratedLinter();
+    $linters[] = $generated_linter;
+
+    $nolint_linter = new ArcanistNoLintLinter();
+    $linters[] = $nolint_linter;
+
+    // Plain-text checks; line-wrap violations are downgraded to advice.
+    $text_linter = new ArcanistTextLinter();
+    $text_linter->setCustomSeverityMap(array(
+      ArcanistTextLinter::LINT_LINE_WRAP
+        => ArcanistLintSeverity::SEVERITY_ADVICE,
+    ));
+    $linters[] = $text_linter;
+
+    // Java sources get a wider 100-column limit.
+    $java_text_linter = new ArcanistTextLinter();
+    $java_text_linter->setMaxLineLength(100);
+    $java_text_linter->setCustomSeverityMap(array(
+      ArcanistTextLinter::LINT_LINE_WRAP
+        => ArcanistLintSeverity::SEVERITY_ADVICE,
+    ));
+    $linters[] = $java_text_linter;
+
+    // NOTE(review): presumably appends E302 to the PEP8 option string
+    // returned by getPEP8WithTextOptions() -- confirm its format.
+    $pep8_options = $this->getPEP8WithTextOptions().',E302';
+
+    $python_linter = new ArcanistPEP8Linter();
+    $python_linter->setConfig(array('options' => $pep8_options));
+    $linters[] = $python_linter;
+
+    // Variant for 2-space-indented Python trees (adds E111).
+    $python_2space_linter = new ArcanistPEP8Linter();
+    $python_2space_linter->setConfig(array('options' => $pep8_options.',E111'));
+    $linters[] = $python_2space_linter;
+
+    // Currently we can't run cpplint in commit hook mode, because it
+    // depends on having access to the working directory.
+    if (!$this->getCommitHookMode()) {
+      $cpp_linters = array();
+      $google_linter = new ArcanistCpplintLinter();
+      $google_linter->setConfig(array(
+        'lint.cpplint.prefix' => '',
+        'lint.cpplint.bin' => 'cpplint',
+      ));
+      $cpp_linters[] = $linters[] = $google_linter;
+      $cpp_linters[] = $linters[] = new FbcodeCppLinter();
+      $cpp_linters[] = $linters[] = new PfffCppLinter();
+    }
+
+    $spelling_linter = new ArcanistSpellingLinter();
+    $linters[] = $spelling_linter;
+
+    foreach ($paths as $path) {
+      $is_text = false;
+
+      $text_extensions = (
+        '/\.('.
+        'cpp|cxx|c|cc|h|hpp|hxx|tcc|'.
+        'py|rb|hs|pl|pm|tw|'.
+        'php|phpt|css|js|'.
+        'java|'.
+        'thrift|'.
+        'lua|'.
+        'siv|'.
+        'txt'.
+        ')$/'
+      );
+      if (preg_match($text_extensions, $path)) {
+        $is_text = true;
+      }
+      if ($is_text) {
+        $nolint_linter->addPath($path);
+
+        $generated_linter->addPath($path);
+        $generated_linter->addData($path, $this->loadData($path));
+
+        if (preg_match('/\.java$/', $path)) {
+          $java_text_linter->addPath($path);
+          $java_text_linter->addData($path, $this->loadData($path));
+        } else {
+          $text_linter->addPath($path);
+          $text_linter->addData($path, $this->loadData($path));
+        }
+
+        $spelling_linter->addPath($path);
+        $spelling_linter->addData($path, $this->loadData($path));
+      }
+      // NOTE(review): $cpp_linters is only initialized above when NOT in
+      // commit-hook mode; in commit-hook mode this foreach would reference
+      // an undefined variable if a C/C++ path is present -- confirm intended.
+      if (preg_match('/\.(cpp|c|cc|cxx|h|hh|hpp|hxx|tcc)$/', $path)) {
+        foreach ($cpp_linters as &$linter) {
+          $linter->addPath($path);
+          $linter->addData($path, $this->loadData($path));
+        }
+      }
+
+      // Match *.py and contbuild config files
+      if (preg_match('/(\.(py|tw|smcprops)|^contbuild\/configs\/[^\/]*)$/',
+                    $path)) {
+        // Default to the 4-space PEP8 linter; a '.python2space' marker
+        // anywhere up the directory tree selects the 2-space variant.
+        $space_count = 4;
+        $real_path = $this->getFilePathOnDisk($path);
+        $dir = dirname($real_path);
+        do {
+          if (file_exists($dir.'/.python2space')) {
+            $space_count = 2;
+            break;
+          }
+          $dir = dirname($dir);
+        } while ($dir != '/' && $dir != '.');
+
+        if ($space_count == 4) {
+          $cur_path_linter = $python_linter;
+        } else {
+          $cur_path_linter = $python_2space_linter;
+        }
+        $cur_path_linter->addPath($path);
+        $cur_path_linter->addData($path, $this->loadData($path));
+
+        if (preg_match('/\.tw$/', $path)) {
+          $cur_path_linter->setCustomSeverityMap(array(
+            'E251' => ArcanistLintSeverity::SEVERITY_DISABLED,
+          ));
+        }
+      }
+    }
+
+    // Every surviving path additionally goes through the filename linter.
+    $name_linter = new ArcanistFilenameLinter();
+    $linters[] = $name_linter;
+    foreach ($paths as $path) {
+      $name_linter->addPath($path);
+    }
+
+    return $linters;
+  }
+
+}
diff --git a/port/README b/port/README
new file mode 100644 (file)
index 0000000..422563e
--- /dev/null
@@ -0,0 +1,10 @@
+This directory contains interfaces and implementations that isolate the
+rest of the package from platform details.
+
+Code in the rest of the package includes "port.h" from this directory.
+"port.h" in turn includes a platform specific "port_<platform>.h" file
+that provides the platform specific implementation.
+
+See port_posix.h for an example of what must be provided in a platform
+specific header file.
+
diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h
new file mode 100644 (file)
index 0000000..db3580b
--- /dev/null
@@ -0,0 +1,157 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// AtomicPointer provides storage for a lock-free pointer.
+// Platform-dependent implementation of AtomicPointer:
+// - If the platform provides a cheap barrier, we use it with raw pointers
+// - If cstdatomic is present (on newer versions of gcc, it is), we use
+//   a cstdatomic-based AtomicPointer.  However we prefer the memory
+//   barrier based version, because at least on a gcc 4.4 32-bit build
+//   on linux, we have encountered a buggy <cstdatomic>
+//   implementation.  Also, some <cstdatomic> implementations are much
+//   slower than a memory-barrier based implementation (~16ns for
+//   <cstdatomic> based acquire-load vs. ~1ns for a barrier based
+//   acquire-load).
+// This code is based on atomicops-internals-* in Google's perftools:
+// http://code.google.com/p/google-perftools/source/browse/#svn%2Ftrunk%2Fsrc%2Fbase
+
+#ifndef PORT_ATOMIC_POINTER_H_
+#define PORT_ATOMIC_POINTER_H_
+
+#include <stdint.h>
+#ifdef ROCKSDB_ATOMIC_PRESENT
+#include <atomic>
+#endif
+#ifdef OS_WIN
+#include <windows.h>
+#endif
+#ifdef OS_MACOSX
+#include <libkern/OSAtomic.h>
+#endif
+
+#if defined(_M_X64) || defined(__x86_64__)
+#define ARCH_CPU_X86_FAMILY 1
+#elif defined(_M_IX86) || defined(__i386__) || defined(__i386)
+#define ARCH_CPU_X86_FAMILY 1
+#elif defined(__ARMEL__)
+#define ARCH_CPU_ARM_FAMILY 1
+#endif
+
+namespace rocksdb {
+namespace port {
+
+// Define MemoryBarrier() if available
+// Windows on x86
+#if defined(OS_WIN) && defined(COMPILER_MSVC) && defined(ARCH_CPU_X86_FAMILY)
+// windows.h already provides a MemoryBarrier(void) macro
+// http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx
+#define ROCKSDB_HAVE_MEMORY_BARRIER
+
+// Gcc on x86
+#elif defined(ARCH_CPU_X86_FAMILY) && defined(__GNUC__)
+inline void MemoryBarrier() {
+  // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on
+  // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering.
+  __asm__ __volatile__("" : : : "memory");
+}
+#define ROCKSDB_HAVE_MEMORY_BARRIER
+
+// Sun Studio
+#elif defined(ARCH_CPU_X86_FAMILY) && defined(__SUNPRO_CC)
+inline void MemoryBarrier() {
+  // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on
+  // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering.
+  asm volatile("" : : : "memory");
+}
+#define ROCKSDB_HAVE_MEMORY_BARRIER
+
+// Mac OS
+#elif defined(OS_MACOSX)
+inline void MemoryBarrier() {
+  OSMemoryBarrier();
+}
+#define ROCKSDB_HAVE_MEMORY_BARRIER
+
+// ARM Linux
+#elif defined(ARCH_CPU_ARM_FAMILY) && defined(__linux__)
+typedef void (*LinuxKernelMemoryBarrierFunc)(void);
+// The Linux ARM kernel provides a highly optimized device-specific memory
+// barrier function at a fixed memory address that is mapped in every
+// user-level process.
+//
+// This beats using CPU-specific instructions which are, on single-core
+// devices, un-necessary and very costly (e.g. ARMv7-A "dmb" takes more
+// than 180ns on a Cortex-A8 like the one on a Nexus One). Benchmarking
+// shows that the extra function call cost is completely negligible on
+// multi-core devices.
+//
+inline void MemoryBarrier() {
+  (*(LinuxKernelMemoryBarrierFunc)0xffff0fa0)();
+}
+#define ROCKSDB_HAVE_MEMORY_BARRIER
+
+#endif
+
+// AtomicPointer built using platform-specific MemoryBarrier()
+#if defined(ROCKSDB_HAVE_MEMORY_BARRIER)
+class AtomicPointer {
+ private:
+  void* rep_;  // raw pointer; ordering enforced with explicit MemoryBarrier()
+ public:
+  // Default-constructed value is intentionally left uninitialized.
+  AtomicPointer() { }
+  explicit AtomicPointer(void* p) : rep_(p) {}
+  // Plain load/store with no ordering guarantees.
+  inline void* NoBarrier_Load() const { return rep_; }
+  inline void NoBarrier_Store(void* v) { rep_ = v; }
+  // Load, then fence: acquire semantics per the file header comment.
+  inline void* Acquire_Load() const {
+    void* result = rep_;
+    MemoryBarrier();
+    return result;
+  }
+  // Fence, then store: release semantics per the file header comment.
+  inline void Release_Store(void* v) {
+    MemoryBarrier();
+    rep_ = v;
+  }
+};
+
+// AtomicPointer based on <atomic>
+#elif defined(ROCKSDB_ATOMIC_PRESENT)
+class AtomicPointer {
+ private:
+  std::atomic<void*> rep_;
+ public:
+  AtomicPointer() { }
+  explicit AtomicPointer(void* v) : rep_(v) { }
+  // Acquire load: pairs with a Release_Store from another thread.
+  inline void* Acquire_Load() const {
+    return rep_.load(std::memory_order_acquire);
+  }
+  inline void Release_Store(void* v) {
+    rep_.store(v, std::memory_order_release);
+  }
+  // Relaxed: atomicity (no word-tearing) only, no ordering guarantees.
+  inline void* NoBarrier_Load() const {
+    return rep_.load(std::memory_order_relaxed);
+  }
+  inline void NoBarrier_Store(void* v) {
+    rep_.store(v, std::memory_order_relaxed);
+  }
+};
+
+// We have neither MemoryBarrier(), nor <cstdatomic>
+#else
+#error Please implement AtomicPointer for this platform.
+
+#endif
+
+#undef ROCKSDB_HAVE_MEMORY_BARRIER
+#undef ARCH_CPU_X86_FAMILY
+#undef ARCH_CPU_ARM_FAMILY
+
+}  // namespace port
+}  // namespace rocksdb
+
+#endif  // PORT_ATOMIC_POINTER_H_
diff --git a/port/likely.h b/port/likely.h
new file mode 100644 (file)
index 0000000..ede0df5
--- /dev/null
@@ -0,0 +1,21 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef PORT_LIKELY_H_
+#define PORT_LIKELY_H_
+
+#if defined(__GNUC__) && __GNUC__ >= 4
+#define LIKELY(x)   (__builtin_expect((x), 1))
+#define UNLIKELY(x) (__builtin_expect((x), 0))
+#else
+#define LIKELY(x)   (x)
+#define UNLIKELY(x) (x)
+#endif
+
+#endif  // PORT_LIKELY_H_
diff --git a/port/port.h b/port/port.h
new file mode 100644 (file)
index 0000000..2dc9a0f
--- /dev/null
@@ -0,0 +1,22 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_H_
+#define STORAGE_LEVELDB_PORT_PORT_H_
+
+#include <string.h>
+
+// Include the appropriate platform specific file below.  If you are
+// porting to a new platform, see "port_example.h" for documentation
+// of what the new port_<platform>.h file must provide.
+#if defined(ROCKSDB_PLATFORM_POSIX)
+#  include "port/port_posix.h"
+#endif
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_H_
diff --git a/port/port_example.h b/port/port_example.h
new file mode 100644 (file)
index 0000000..f124abb
--- /dev/null
@@ -0,0 +1,133 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// This file contains the specification, but not the implementations,
+// of the types/operations/etc. that should be defined by a platform
+// specific port_<platform>.h file.  Use this file as a reference for
+// how to port this package to a new platform.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
+#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
+
+namespace rocksdb {
+namespace port {
+
+// TODO(jorlow): Many of these belong more in the environment class rather than
+//               here. We should try moving them and see if it affects perf.
+
+// The following boolean constant must be true on a little-endian machine
+// and false otherwise.
+static const bool kLittleEndian = true /* or some other expression */;
+
+// ------------------ Threading -------------------
+
+// A Mutex represents an exclusive lock.
+// Not recursive: per Lock()'s contract, re-locking from the owning
+// thread deadlocks.
+class Mutex {
+ public:
+  Mutex();
+  ~Mutex();
+
+  // Lock the mutex.  Waits until other lockers have exited.
+  // Will deadlock if the mutex is already locked by this thread.
+  void Lock();
+
+  // Unlock the mutex.
+  // REQUIRES: This mutex was locked by this thread.
+  void Unlock();
+
+  // Optionally crash if this thread does not hold this mutex.
+  // The implementation must be fast, especially if NDEBUG is
+  // defined.  The implementation is allowed to skip all checks.
+  void AssertHeld();
+};
+
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+
+  // Atomically release *mu and block on this condition variable until
+  // either a call to SignalAll(), or a call to Signal() that picks
+  // this thread to wakeup.
+  // REQUIRES: this thread holds *mu
+  void Wait();
+
+  // If there are some threads waiting, wake up at least one of them.
+  void Signal();
+
+  // Wake up all waiting threads.
+  // NOTE(review): looks like a typo for SignalAll() -- the Wait() comment
+  // above refers to SignalAll(); confirm against the actual ports.
+  void SignallAll();
+};
+
+// Thread-safe initialization.
+// Used as follows:
+//      static port::OnceType init_control = LEVELDB_ONCE_INIT;
+//      static void Initializer() { ... do something ...; }
+//      ...
+//      port::InitOnce(&init_control, &Initializer);
+typedef intptr_t OnceType;
+#define LEVELDB_ONCE_INIT 0
+extern void InitOnce(port::OnceType*, void (*initializer)());
+
+// A type that holds a pointer that can be read or written atomically
+// (i.e., without word-tearing.)
+class AtomicPointer {
+ private:
+  intptr_t rep_;
+ public:
+  // Initialize to arbitrary value
+  AtomicPointer();
+
+  // Initialize to hold v
+  // NOTE(review): initializing intptr_t rep_ from a void* would not
+  // compile as-is; this header is a porting specification, not built code.
+  explicit AtomicPointer(void* v) : rep_(v) { }
+
+  // Read and return the stored pointer with the guarantee that no
+  // later memory access (read or write) by this thread can be
+  // reordered ahead of this read.
+  void* Acquire_Load() const;
+
+  // Set v as the stored pointer with the guarantee that no earlier
+  // memory access (read or write) by this thread can be reordered
+  // after this store.
+  void Release_Store(void* v);
+
+  // Read the stored pointer with no ordering guarantees.
+  void* NoBarrier_Load() const;
+
+  // Set va as the stored pointer with no ordering guarantees.
+  void NoBarrier_Store(void* v);
+};
+
+// ------------------ Compression -------------------
+
+// Store the snappy compression of "input[0,input_length-1]" in *output.
+// Returns false if snappy is not supported by this port.
+extern bool Snappy_Compress(const char* input, size_t input_length,
+                            std::string* output);
+
+// If input[0,input_length-1] looks like a valid snappy compressed
+// buffer, store the size of the uncompressed data in *result and
+// return true.  Else return false.
+extern bool Snappy_GetUncompressedLength(const char* input, size_t length,
+                                         size_t* result);
+
+// Attempt to snappy uncompress input[0,input_length-1] into *output.
+// Returns true if successful, false if the input is invalid lightweight
+// compressed data.
+//
+// REQUIRES: at least the first "n" bytes of output[] must be writable
+// where "n" is the result of a successful call to
+// Snappy_GetUncompressedLength.
+extern bool Snappy_Uncompress(const char* input_data, size_t input_length,
+                              char* output);
+
+}  // namespace port
+}  // namespace rocksdb
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
diff --git a/port/port_posix.cc b/port/port_posix.cc
new file mode 100644 (file)
index 0000000..911cebd
--- /dev/null
@@ -0,0 +1,109 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "port/port_posix.h"
+
+#include <cstdlib>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include "util/logging.h"
+
+namespace rocksdb {
+namespace port {
+
+static void PthreadCall(const char* label, int result) {
+  if (result != 0) {
+    fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
+    abort();
+  }
+}
+
+Mutex::Mutex(bool adaptive) {
+#ifdef OS_LINUX
+  if (!adaptive) {
+    PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL));
+  } else {
+    pthread_mutexattr_t mutex_attr;
+    PthreadCall("init mutex attr", pthread_mutexattr_init(&mutex_attr));
+    PthreadCall("set mutex attr",
+                pthread_mutexattr_settype(&mutex_attr,
+                                          PTHREAD_MUTEX_ADAPTIVE_NP));
+    PthreadCall("init mutex", pthread_mutex_init(&mu_, &mutex_attr));
+    PthreadCall("destroy mutex attr",
+                pthread_mutexattr_destroy(&mutex_attr));
+  }
+#else // ignore adaptive for non-linux platform
+  PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL));
+#endif // OS_LINUX
+}
+
+Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }
+
+void Mutex::Lock() {
+  PthreadCall("lock", pthread_mutex_lock(&mu_));
+#ifndef NDEBUG
+  locked_ = true;
+#endif
+}
+
+void Mutex::Unlock() {
+#ifndef NDEBUG
+  locked_ = false;
+#endif
+  PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+}
+
+void Mutex::AssertHeld() {
+#ifndef NDEBUG
+  assert(locked_);
+#endif
+}
+
+CondVar::CondVar(Mutex* mu)
+    : mu_(mu) {
+    PthreadCall("init cv", pthread_cond_init(&cv_, NULL));
+}
+
+CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); }
+
+void CondVar::Wait() {
+#ifndef NDEBUG
+  mu_->locked_ = false;
+#endif
+  PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
+#ifndef NDEBUG
+  mu_->locked_ = true;
+#endif
+}
+
+void CondVar::Signal() {
+  PthreadCall("signal", pthread_cond_signal(&cv_));
+}
+
+void CondVar::SignalAll() {
+  PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
+}
+
+RWMutex::RWMutex() { PthreadCall("init mutex", pthread_rwlock_init(&mu_, NULL)); }
+
+RWMutex::~RWMutex() { PthreadCall("destroy mutex", pthread_rwlock_destroy(&mu_)); }
+
+void RWMutex::ReadLock() { PthreadCall("read lock", pthread_rwlock_rdlock(&mu_)); }
+
+void RWMutex::WriteLock() { PthreadCall("write lock", pthread_rwlock_wrlock(&mu_)); }
+
+void RWMutex::Unlock() { PthreadCall("unlock", pthread_rwlock_unlock(&mu_)); }
+
+void InitOnce(OnceType* once, void (*initializer)()) {
+  PthreadCall("once", pthread_once(once, initializer));
+}
+
+}  // namespace port
+}  // namespace rocksdb
diff --git a/port/port_posix.h b/port/port_posix.h
new file mode 100644 (file)
index 0000000..b2d1624
--- /dev/null
@@ -0,0 +1,484 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_
+#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_
+
+#undef PLATFORM_IS_LITTLE_ENDIAN
+#if defined(OS_MACOSX)
+  #include <machine/endian.h>
+  #if defined(__DARWIN_LITTLE_ENDIAN) && defined(__DARWIN_BYTE_ORDER)
+    #define PLATFORM_IS_LITTLE_ENDIAN \
+        (__DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN)
+  #endif
+#elif defined(OS_SOLARIS)
+  #include <sys/isa_defs.h>
+  #ifdef _LITTLE_ENDIAN
+    #define PLATFORM_IS_LITTLE_ENDIAN true
+  #else
+    #define PLATFORM_IS_LITTLE_ENDIAN false
+  #endif
+#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) ||\
+      defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID)
+  #include <sys/types.h>
+  #include <sys/endian.h>
+#else
+  #include <endian.h>
+#endif
+#include <pthread.h>
+#ifdef SNAPPY
+#include <snappy.h>
+#endif
+
+#ifdef ZLIB
+#include <zlib.h>
+#endif
+
+#ifdef BZIP2
+#include <bzlib.h>
+#endif
+
+#if defined(LZ4)
+#include <lz4.h>
+#include <lz4hc.h>
+#endif
+
+#include <stdint.h>
+#include <string>
+#include <string.h>
+#include "rocksdb/options.h"
+#include "port/atomic_pointer.h"
+
+#ifndef PLATFORM_IS_LITTLE_ENDIAN
+#define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN)
+#endif
+
+#if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) ||\
+    defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) ||\
+    defined(OS_ANDROID)
+// Use fread/fwrite/fflush on platforms without _unlocked variants
+#define fread_unlocked fread
+#define fwrite_unlocked fwrite
+#define fflush_unlocked fflush
+#endif
+
+#if defined(OS_MACOSX) || defined(OS_FREEBSD) ||\
+    defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD)
+// Use fsync() on platforms without fdatasync()
+#define fdatasync fsync
+#endif
+
+#if defined(OS_ANDROID) && __ANDROID_API__ < 9
+// fdatasync() was only introduced in API level 9 on Android. Use fsync()
+// when targeting older platforms.
+#define fdatasync fsync
+#endif
+
+namespace rocksdb {
+namespace port {
+
+static const bool kLittleEndian = PLATFORM_IS_LITTLE_ENDIAN;
+#undef PLATFORM_IS_LITTLE_ENDIAN
+
+class CondVar;
+
+class Mutex {
+ public:
+  /* implicit */ Mutex(bool adaptive = false);
+  ~Mutex();
+
+  void Lock();
+  void Unlock();
+  // this will assert if the mutex is not locked
+  // it does NOT verify that mutex is held by a calling thread
+  void AssertHeld();
+
+ private:
+  friend class CondVar;
+  pthread_mutex_t mu_;
+#ifndef NDEBUG
+  bool locked_;
+#endif
+
+  // No copying
+  Mutex(const Mutex&);
+  void operator=(const Mutex&);
+};
+
+class RWMutex {
+ public:
+  RWMutex();
+  ~RWMutex();
+
+  void ReadLock();
+  void WriteLock();
+  void Unlock();
+  void AssertHeld() { }
+
+ private:
+  pthread_rwlock_t mu_; // the underlying platform mutex
+
+  // No copying allowed
+  RWMutex(const RWMutex&);
+  void operator=(const RWMutex&);
+};
+
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+  void Wait();
+  void Signal();
+  void SignalAll();
+ private:
+  pthread_cond_t cv_;
+  Mutex* mu_;
+};
+
+typedef pthread_once_t OnceType;
+#define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT
+extern void InitOnce(OnceType* once, void (*initializer)());
+
+inline bool Snappy_Compress(const CompressionOptions& opts, const char* input,
+                            size_t length, ::std::string* output) {
+#ifdef SNAPPY
+  output->resize(snappy::MaxCompressedLength(length));
+  size_t outlen;
+  snappy::RawCompress(input, length, &(*output)[0], &outlen);
+  output->resize(outlen);
+  return true;
+#endif
+
+  return false;
+}
+
+inline bool Snappy_GetUncompressedLength(const char* input, size_t length,
+                                         size_t* result) {
+#ifdef SNAPPY
+  return snappy::GetUncompressedLength(input, length, result);
+#else
+  return false;
+#endif
+}
+
+inline bool Snappy_Uncompress(const char* input, size_t length,
+                              char* output) {
+#ifdef SNAPPY
+  return snappy::RawUncompress(input, length, output);
+#else
+  return false;
+#endif
+}
+
+inline bool Zlib_Compress(const CompressionOptions& opts, const char* input,
+                          size_t length, ::std::string* output) {
+#ifdef ZLIB
+  // The memLevel parameter specifies how much memory should be allocated for
+  // the internal compression state.
+  // memLevel=1 uses minimum memory but is slow and reduces compression ratio.
+  // memLevel=9 uses maximum memory for optimal speed.
+  // The default value is 8. See zconf.h for more details.
+  static const int memLevel = 8;
+  z_stream _stream;
+  memset(&_stream, 0, sizeof(z_stream));
+  int st = deflateInit2(&_stream, opts.level, Z_DEFLATED, opts.window_bits,
+                        memLevel, opts.strategy);
+  if (st != Z_OK) {
+    return false;
+  }
+
+  // Resize output to be the plain data length.
+  // This may not be big enough if the compression actually expands data.
+  output->resize(length);
+
+  // Compress the input, and put compressed data in output.
+  _stream.next_in = (Bytef *)input;
+  _stream.avail_in = length;
+
+  // Initialize the output size.
+  _stream.avail_out = length;
+  _stream.next_out = (Bytef *)&(*output)[0];
+
+  int old_sz =0, new_sz =0, new_sz_delta =0;
+  bool done = false;
+  while (!done) {
+    int st = deflate(&_stream, Z_FINISH);
+    switch (st) {
+      case Z_STREAM_END:
+        done = true;
+        break;
+      case Z_OK:
+        // No output space. Increase the output space by 20%.
+        // (Should we fail the compression since it expands the size?)
+        old_sz = output->size();
+        new_sz_delta = (int)(output->size() * 0.2);
+        new_sz = output->size() + (new_sz_delta < 10 ? 10 : new_sz_delta);
+        output->resize(new_sz);
+        // Set more output.
+        _stream.next_out = (Bytef *)&(*output)[old_sz];
+        _stream.avail_out = new_sz - old_sz;
+        break;
+      case Z_BUF_ERROR:
+      default:
+        deflateEnd(&_stream);
+        return false;
+    }
+  }
+
+  output->resize(output->size() - _stream.avail_out);
+  deflateEnd(&_stream);
+  return true;
+#endif
+  return false;
+}
+
+inline char* Zlib_Uncompress(const char* input_data, size_t input_length,
+    int* decompress_size, int windowBits = -14) {
+#ifdef ZLIB
+  z_stream _stream;
+  memset(&_stream, 0, sizeof(z_stream));
+
+  // For raw inflate, the windowBits should be -8..-15.
+  // If windowBits is bigger than zero, it will use either zlib
+  // header or gzip header. Adding 32 to it will do automatic detection.
+  int st = inflateInit2(&_stream,
+      windowBits > 0 ? windowBits + 32 : windowBits);
+  if (st != Z_OK) {
+    return nullptr;
+  }
+
+  _stream.next_in = (Bytef *)input_data;
+  _stream.avail_in = input_length;
+
+  // Assume the decompressed data size will be 5x of compressed size.
+  int output_len = input_length * 5;
+  char* output = new char[output_len];
+  int old_sz = output_len;
+
+  _stream.next_out = (Bytef *)output;
+  _stream.avail_out = output_len;
+
+  char* tmp = nullptr;
+  int output_len_delta;
+  bool done = false;
+
+  //while(_stream.next_in != nullptr && _stream.avail_in != 0) {
+  while (!done) {
+    int st = inflate(&_stream, Z_SYNC_FLUSH);
+    switch (st) {
+      case Z_STREAM_END:
+        done = true;
+        break;
+      case Z_OK:
+        // No output space. Increase the output space by 20%.
+        old_sz = output_len;
+        output_len_delta = (int)(output_len * 0.2);
+        output_len += output_len_delta < 10 ? 10 : output_len_delta;
+        tmp = new char[output_len];
+        memcpy(tmp, output, old_sz);
+        delete[] output;
+        output = tmp;
+
+        // Set more output.
+        _stream.next_out = (Bytef *)(output + old_sz);
+        _stream.avail_out = output_len - old_sz;
+        break;
+      case Z_BUF_ERROR:
+      default:
+        delete[] output;
+        inflateEnd(&_stream);
+        return nullptr;
+    }
+  }
+
+  *decompress_size = output_len - _stream.avail_out;
+  inflateEnd(&_stream);
+  return output;
+#endif
+
+  return nullptr;
+}
+
+inline bool BZip2_Compress(const CompressionOptions& opts, const char* input,
+                           size_t length, ::std::string* output) {
+#ifdef BZIP2
+  bz_stream _stream;
+  memset(&_stream, 0, sizeof(bz_stream));
+
+  // Block size 1 is 100K.
+  // 0 is for silent.
+  // 30 is the default workFactor
+  int st = BZ2_bzCompressInit(&_stream, 1, 0, 30);
+  if (st != BZ_OK) {
+    return false;
+  }
+
+  // Resize output to be the plain data length.
+  // This may not be big enough if the compression actually expands data.
+  output->resize(length);
+
+  // Compress the input, and put compressed data in output.
+  _stream.next_in = (char *)input;
+  _stream.avail_in = length;
+
+  // Initialize the output size.
+  _stream.next_out = (char *)&(*output)[0];
+  _stream.avail_out = length;
+
+  int old_sz =0, new_sz =0;
+  while(_stream.next_in != nullptr && _stream.avail_in != 0) {
+    int st = BZ2_bzCompress(&_stream, BZ_FINISH);
+    switch (st) {
+      case BZ_STREAM_END:
+        break;
+      case BZ_FINISH_OK:
+        // No output space. Increase the output space by 20%.
+        // (Should we fail the compression since it expands the size?)
+        old_sz = output->size();
+        new_sz = (int)(output->size() * 1.2);
+        output->resize(new_sz);
+        // Set more output.
+        _stream.next_out = (char *)&(*output)[old_sz];
+        _stream.avail_out = new_sz - old_sz;
+        break;
+      case BZ_SEQUENCE_ERROR:
+      default:
+        BZ2_bzCompressEnd(&_stream);
+        return false;
+    }
+  }
+
+  output->resize(output->size() - _stream.avail_out);
+  BZ2_bzCompressEnd(&_stream);
+  return true;
+#endif
+  return false;
+}
+
+inline char* BZip2_Uncompress(const char* input_data, size_t input_length,
+                              int* decompress_size) {
+#ifdef BZIP2
+  bz_stream _stream;
+  memset(&_stream, 0, sizeof(bz_stream));
+
+  int st = BZ2_bzDecompressInit(&_stream, 0, 0);
+  if (st != BZ_OK) {
+    return nullptr;
+  }
+
+  _stream.next_in = (char *)input_data;
+  _stream.avail_in = input_length;
+
+  // Assume the decompressed data size will be 5x of compressed size.
+  int output_len = input_length * 5;
+  char* output = new char[output_len];
+  int old_sz = output_len;
+
+  _stream.next_out = (char *)output;
+  _stream.avail_out = output_len;
+
+  char* tmp = nullptr;
+
+  while(_stream.next_in != nullptr && _stream.avail_in != 0) {
+    int st = BZ2_bzDecompress(&_stream);
+    switch (st) {
+      case BZ_STREAM_END:
+        break;
+      case BZ_OK:
+        // No output space. Increase the output space by 20%.
+        old_sz = output_len;
+        output_len = (int)(output_len * 1.2);
+        tmp = new char[output_len];
+        memcpy(tmp, output, old_sz);
+        delete[] output;
+        output = tmp;
+
+        // Set more output.
+        _stream.next_out = (char *)(output + old_sz);
+        _stream.avail_out = output_len - old_sz;
+        break;
+      default:
+        delete[] output;
+        BZ2_bzDecompressEnd(&_stream);
+        return nullptr;
+    }
+  }
+
+  *decompress_size = output_len - _stream.avail_out;
+  BZ2_bzDecompressEnd(&_stream);
+  return output;
+#endif
+  return nullptr;
+}
+
+inline bool LZ4_Compress(const CompressionOptions &opts, const char *input,
+                         size_t length, ::std::string* output) {
+#ifdef LZ4
+  int compressBound = LZ4_compressBound(length);
+  output->resize(8 + compressBound);
+  char *p = const_cast<char *>(output->c_str());
+  memcpy(p, &length, sizeof(length));
+  size_t outlen;
+  outlen = LZ4_compress_limitedOutput(input, p + 8, length, compressBound);
+  if (outlen == 0) {
+    return false;
+  }
+  output->resize(8 + outlen);
+  return true;
+#endif
+  return false;
+}
+
+inline char* LZ4_Uncompress(const char* input_data, size_t input_length,
+                            int* decompress_size) {
+#ifdef LZ4
+  if (input_length < 8) {
+    return nullptr;
+  }
+  int output_len;
+  memcpy(&output_len, input_data, sizeof(output_len));
+  char *output = new char[output_len];
+  *decompress_size = LZ4_decompress_safe_partial(
+      input_data + 8, output, input_length - 8, output_len, output_len);
+  if (*decompress_size < 0) {
+    delete[] output;
+    return nullptr;
+  }
+  return output;
+#endif
+  return nullptr;
+}
+
+inline bool LZ4HC_Compress(const CompressionOptions &opts, const char* input,
+                           size_t length, ::std::string* output) {
+#ifdef LZ4
+  int compressBound = LZ4_compressBound(length);
+  output->resize(8 + compressBound);
+  char *p = const_cast<char *>(output->c_str());
+  memcpy(p, &length, sizeof(length));
+  size_t outlen;
+  outlen = LZ4_compressHC2_limitedOutput(input, p + 8, length, compressBound,
+                                         opts.level);
+  if (outlen == 0) {
+    return false;
+  }
+  output->resize(8 + outlen);
+  return true;
+#endif
+  return false;
+}
+
+#define CACHE_LINE_SIZE 64U
+
+} // namespace port
+} // namespace rocksdb
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_POSIX_H_
diff --git a/port/stack_trace.cc b/port/stack_trace.cc
new file mode 100644 (file)
index 0000000..76866e6
--- /dev/null
@@ -0,0 +1,132 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "port/stack_trace.h"
+
+namespace rocksdb {
+namespace port {
+
+#if defined(ROCKSDB_LITE) || !(defined(OS_LINUX) || defined(OS_MACOSX))
+
+// noop
+
+void InstallStackTraceHandler() {}
+void PrintStack(int first_frames_to_skip) {}
+
+#else
+
+#include <execinfo.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <cxxabi.h>
+
+namespace {
+
+#ifdef OS_LINUX
+const char* GetExecutableName() {
+  static char name[1024];
+
+  char link[1024];
+  snprintf(link, sizeof(link), "/proc/%d/exe", getpid());
+  auto read = readlink(link, name, sizeof(name));
+  if (-1 == read) {
+    return nullptr;
+  } else {
+    name[read] = 0;
+    return name;
+  }
+}
+
+void PrintStackTraceLine(const char* symbol, void* frame) {
+  static const char* executable = GetExecutableName();
+  if (symbol) {
+    fprintf(stderr, "%s ", symbol);
+  }
+  if (executable) {
+    // outsource to addr2line for the address translation
+    const int kLineMax = 256;
+    char cmd[kLineMax];
+    snprintf(cmd, kLineMax, "addr2line %p -e %s -f -C 2>&1", frame, executable);
+    auto f = popen(cmd, "r");
+    if (f) {
+      char line[kLineMax];
+      while (fgets(line, sizeof(line), f)) {
+        line[strlen(line) - 1] = 0;  // remove newline
+        fprintf(stderr, "%s\t", line);
+      }
+      pclose(f);
+    }
+  } else {
+    fprintf(stderr, " %p", frame);
+  }
+
+  fprintf(stderr, "\n");
+}
+#elif OS_MACOSX
+
+void PrintStackTraceLine(const char* symbol, void* frame) {
+  static int pid = getpid();
+  // outsource to atos for the address translation
+  const int kLineMax = 256;
+  char cmd[kLineMax];
+  snprintf(cmd, kLineMax, "xcrun atos %p -p %d  2>&1", frame, pid);
+  auto f = popen(cmd, "r");
+  if (f) {
+    char line[kLineMax];
+    while (fgets(line, sizeof(line), f)) {
+      line[strlen(line) - 1] = 0;  // remove newline
+      fprintf(stderr, "%s\t", line);
+    }
+    pclose(f);
+  } else if (symbol) {
+    fprintf(stderr, "%s ", symbol);
+  }
+
+  fprintf(stderr, "\n");
+}
+
+#endif
+
+}  // namespace
+
+void PrintStack(int first_frames_to_skip) {
+  const int kMaxFrames = 100;
+  void* frames[kMaxFrames];
+
+  auto num_frames = backtrace(frames, kMaxFrames);
+  auto symbols = backtrace_symbols(frames, num_frames);
+
+  for (int i = first_frames_to_skip; i < num_frames; ++i) {
+    fprintf(stderr, "#%-2d  ", i - first_frames_to_skip);
+    PrintStackTraceLine((symbols != nullptr) ? symbols[i] : nullptr, frames[i]);
+  }
+}
+
+static void StackTraceHandler(int sig) {
+  // reset to default handler
+  signal(sig, SIG_DFL);
+  fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig));
+  // skip the top three signal handler related frames
+  PrintStack(3);
+  // re-signal to default handler (so we still get core dump if needed...)
+  raise(sig);
+}
+
+void InstallStackTraceHandler() {
+  // just use the plain old signal as it's simple and sufficient
+  // for this use case
+  signal(SIGILL, StackTraceHandler);
+  signal(SIGSEGV, StackTraceHandler);
+  signal(SIGBUS, StackTraceHandler);
+  signal(SIGABRT, StackTraceHandler);
+}
+
+#endif
+
+}  // namespace port
+}  // namespace rocksdb
diff --git a/port/stack_trace.h b/port/stack_trace.h
new file mode 100644 (file)
index 0000000..8bc6c7d
--- /dev/null
@@ -0,0 +1,19 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+namespace rocksdb {
+namespace port {
+
+// Install a signal handler to print callstack on the following signals:
+// SIGILL SIGSEGV SIGBUS SIGABRT
+// Currently supports linux only. No-op otherwise.
+void InstallStackTraceHandler();
+
+// Prints stack, skips skip_first_frames frames
+void PrintStack(int first_frames_to_skip = 0);
+
+}  // namespace port
+}  // namespace rocksdb
diff --git a/port/win/stdint.h b/port/win/stdint.h
new file mode 100644 (file)
index 0000000..39edd0d
--- /dev/null
@@ -0,0 +1,24 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// MSVC didn't ship with this file until the 2010 version.
+
+#ifndef STORAGE_LEVELDB_PORT_WIN_STDINT_H_
+#define STORAGE_LEVELDB_PORT_WIN_STDINT_H_
+
+#if !defined(_MSC_VER)
+#error This file should only be included when compiling with MSVC.
+#endif
+
+// Define C99 equivalent types.
+typedef signed char           int8_t;
+typedef signed short          int16_t;
+typedef signed int            int32_t;
+typedef signed long long      int64_t;
+typedef unsigned char         uint8_t;
+typedef unsigned short        uint16_t;
+typedef unsigned int          uint32_t;
+typedef unsigned long long    uint64_t;
+
+#endif  // STORAGE_LEVELDB_PORT_WIN_STDINT_H_
diff --git a/table/block.cc b/table/block.cc
new file mode 100644 (file)
index 0000000..6a6751c
--- /dev/null
@@ -0,0 +1,307 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Decodes the blocks generated by block_builder.cc.
+
+#include "table/block.h"
+
+#include <algorithm>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "table/block_hash_index.h"
+#include "table/format.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+uint32_t Block::NumRestarts() const {
+  assert(size_ >= 2*sizeof(uint32_t));
+  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+}
+
+Block::Block(const BlockContents& contents)
+    : data_(contents.data.data()),
+      size_(contents.data.size()),
+      owned_(contents.heap_allocated),
+      cachable_(contents.cachable),
+      compression_type_(contents.compression_type) {
+  if (size_ < sizeof(uint32_t)) {
+    size_ = 0;  // Error marker
+  } else {
+    restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t);
+    if (restart_offset_ > size_ - sizeof(uint32_t)) {
+      // The size is too small for NumRestarts() and therefore
+      // restart_offset_ wrapped around.
+      size_ = 0;
+    }
+  }
+}
+
+Block::~Block() {
+  if (owned_) {
+    delete[] data_;
+  }
+}
+
+// Helper routine: decode the next block entry starting at "p",
+// storing the number of shared key bytes, non_shared key bytes,
+// and the length of the value in "*shared", "*non_shared", and
+// "*value_length", respectively.  Will not dereference past "limit".
+//
+// If any errors are detected, returns nullptr.  Otherwise, returns a
+// pointer to the key delta (just past the three decoded values).
+static inline const char* DecodeEntry(const char* p, const char* limit,
+                                      uint32_t* shared,
+                                      uint32_t* non_shared,
+                                      uint32_t* value_length) {
+  if (limit - p < 3) return nullptr;
+  *shared = reinterpret_cast<const unsigned char*>(p)[0];
+  *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+  *value_length = reinterpret_cast<const unsigned char*>(p)[2];
+  if ((*shared | *non_shared | *value_length) < 128) {
+    // Fast path: all three values are encoded in one byte each
+    p += 3;
+  } else {
+    if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+    if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+    if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr;
+  }
+
+  if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
+    return nullptr;
+  }
+  return p;
+}
+
+class Block::Iter : public Iterator {
+ private:
+  const Comparator* const comparator_;
+  const char* const data_;      // underlying block contents
+  uint32_t const restarts_;     // Offset of restart array (list of fixed32)
+  uint32_t const num_restarts_; // Number of uint32_t entries in restart array
+
+  // current_ is offset in data_ of current entry.  >= restarts_ if !Valid
+  uint32_t current_;
+  uint32_t restart_index_;  // Index of restart block in which current_ falls
+  std::string key_;
+  Slice value_;
+  Status status_;
+  BlockHashIndex* hash_index_;
+
+  inline int Compare(const Slice& a, const Slice& b) const {
+    return comparator_->Compare(a, b);
+  }
+
+  // Return the offset in data_ just past the end of the current entry.
+  inline uint32_t NextEntryOffset() const {
+    return (value_.data() + value_.size()) - data_;
+  }
+
+  uint32_t GetRestartPoint(uint32_t index) {
+    assert(index < num_restarts_);
+    return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
+  }
+
+  void SeekToRestartPoint(uint32_t index) {
+    key_.clear();
+    restart_index_ = index;
+    // current_ will be fixed by ParseNextKey();
+
+    // ParseNextKey() starts at the end of value_, so set value_ accordingly
+    uint32_t offset = GetRestartPoint(index);
+    value_ = Slice(data_ + offset, 0);
+  }
+
+ public:
+  Iter(const Comparator* comparator, const char* data, uint32_t restarts,
+       uint32_t num_restarts, BlockHashIndex* hash_index)
+      : comparator_(comparator),
+        data_(data),
+        restarts_(restarts),
+        num_restarts_(num_restarts),
+        current_(restarts_),
+        restart_index_(num_restarts_),
+        hash_index_(hash_index) {
+    assert(num_restarts_ > 0);
+  }
+
+  virtual bool Valid() const { return current_ < restarts_; }
+  virtual Status status() const { return status_; }
+  virtual Slice key() const {
+    assert(Valid());
+    return key_;
+  }
+  virtual Slice value() const {
+    assert(Valid());
+    return value_;
+  }
+
+  virtual void Next() {
+    assert(Valid());
+    ParseNextKey();
+  }
+
+  virtual void Prev() {
+    assert(Valid());
+
+    // Scan backwards to a restart point before current_
+    const uint32_t original = current_;
+    while (GetRestartPoint(restart_index_) >= original) {
+      if (restart_index_ == 0) {
+        // No more entries
+        current_ = restarts_;
+        restart_index_ = num_restarts_;
+        return;
+      }
+      restart_index_--;
+    }
+
+    SeekToRestartPoint(restart_index_);
+    do {
+      // Loop until end of current entry hits the start of original entry
+    } while (ParseNextKey() && NextEntryOffset() < original);
+  }
+
+  virtual void Seek(const Slice& target) {
+    uint32_t index = 0;
+    bool ok = hash_index_ ? HashSeek(target, &index)
+                          : BinarySeek(target, 0, num_restarts_ - 1, &index);
+
+    if (!ok) {
+      return;
+    }
+    SeekToRestartPoint(index);
+    // Linear search (within restart block) for first key >= target
+
+    while (true) {
+      if (!ParseNextKey() || Compare(key_, target) >= 0) {
+        return;
+      }
+    }
+  }
+  virtual void SeekToFirst() {
+    SeekToRestartPoint(0);
+    ParseNextKey();
+  }
+
+  virtual void SeekToLast() {
+    SeekToRestartPoint(num_restarts_ - 1);
+    while (ParseNextKey() && NextEntryOffset() < restarts_) {
+      // Keep skipping
+    }
+  }
+
+ private:
+  void CorruptionError() {
+    current_ = restarts_;
+    restart_index_ = num_restarts_;
+    status_ = Status::Corruption("bad entry in block");
+    key_.clear();
+    value_.clear();
+  }
+
+  bool ParseNextKey() {
+    current_ = NextEntryOffset();
+    const char* p = data_ + current_;
+    const char* limit = data_ + restarts_;  // Restarts come right after data
+    if (p >= limit) {
+      // No more entries to return.  Mark as invalid.
+      current_ = restarts_;
+      restart_index_ = num_restarts_;
+      return false;
+    }
+
+    // Decode next entry
+    uint32_t shared, non_shared, value_length;
+    p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
+    if (p == nullptr || key_.size() < shared) {
+      CorruptionError();
+      return false;
+    } else {
+      key_.resize(shared);
+      key_.append(p, non_shared);
+      value_ = Slice(p + non_shared, value_length);
+      while (restart_index_ + 1 < num_restarts_ &&
+             GetRestartPoint(restart_index_ + 1) < current_) {
+        ++restart_index_;
+      }
+      return true;
+    }
+  }
+  // Binary search in restart array to find the first restart point
+  // with a key >= target
+  bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
+                  uint32_t* index) {
+    assert(left <= right);
+
+    while (left < right) {
+      uint32_t mid = (left + right + 1) / 2;
+      uint32_t region_offset = GetRestartPoint(mid);
+      uint32_t shared, non_shared, value_length;
+      const char* key_ptr =
+          DecodeEntry(data_ + region_offset, data_ + restarts_, &shared,
+                      &non_shared, &value_length);
+      if (key_ptr == nullptr || (shared != 0)) {
+        CorruptionError();
+        return false;
+      }
+      Slice mid_key(key_ptr, non_shared);
+      if (Compare(mid_key, target) < 0) {
+        // Key at "mid" is smaller than "target". Therefore all
+        // blocks before "mid" are uninteresting.
+        left = mid;
+      } else {
+        // Key at "mid" is >= "target". Therefore all blocks at or
+        // after "mid" are uninteresting.
+        right = mid - 1;
+      }
+    }
+
+    *index = left;
+    return true;
+  }
+
+  bool HashSeek(const Slice& target, uint32_t* index) {
+    assert(hash_index_);
+    auto restart_index = hash_index_->GetRestartIndex(target);
+    if (restart_index == nullptr) {
+      current_ = restarts_;
+      return 0;
+    }
+
+    // the elements in restart_array[index : index + num_blocks]
+    // are all with same prefix. We'll do binary search in that small range.
+    auto left = restart_index->first_index;
+    auto right = restart_index->first_index + restart_index->num_blocks - 1;
+    return BinarySeek(target, left, right, index);
+  }
+};
+
+Iterator* Block::NewIterator(const Comparator* cmp) {
+  if (size_ < 2*sizeof(uint32_t)) {
+    return NewErrorIterator(Status::Corruption("bad block contents"));
+  }
+  const uint32_t num_restarts = NumRestarts();
+  if (num_restarts == 0) {
+    return NewEmptyIterator();
+  } else {
+    return new Iter(cmp, data_, restart_offset_, num_restarts,
+                    hash_index_.get());
+  }
+}
+
+void Block::SetBlockHashIndex(BlockHashIndex* hash_index) {
+  hash_index_.reset(hash_index);
+}
+
+}  // namespace rocksdb
diff --git a/table/block.h b/table/block.h
new file mode 100644 (file)
index 0000000..b363d62
--- /dev/null
@@ -0,0 +1,61 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+struct BlockContents;
+class Comparator;
+class BlockHashIndex;
+
+class Block {
+ public:
+  // Initialize the block with the specified contents.
+  explicit Block(const BlockContents& contents);
+
+  ~Block();
+
+  size_t size() const { return size_; }
+  const char* data() const { return data_; }
+  bool cachable() const { return cachable_; }
+  uint32_t NumRestarts() const;
+  CompressionType compression_type() const { return compression_type_; }
+
+  // If hash index lookup is enabled and `use_hash_index` is true. This block
+  // will do hash lookup for the key prefix.
+  //
+  // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
+  // the iterator will simply be set as "invalid", rather than returning
+  // the key that is just past the target key.
+  Iterator* NewIterator(const Comparator* comparator);
+  // Takes ownership of hash_index.
+  void SetBlockHashIndex(BlockHashIndex* hash_index);
+
+ private:
+  const char* data_;
+  size_t size_;
+  uint32_t restart_offset_;     // Offset in data_ of restart array
+  bool owned_;                  // Block owns data_[]
+  bool cachable_;
+  CompressionType compression_type_;
+  std::unique_ptr<BlockHashIndex> hash_index_;  // Optional prefix hash index
+
+  // No copying allowed
+  Block(const Block&);
+  void operator=(const Block&);
+
+  class Iter;
+};
+
+}  // namespace rocksdb
diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc
new file mode 100644 (file)
index 0000000..c6469a2
--- /dev/null
@@ -0,0 +1,689 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based_table_builder.h"
+
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#include <map>
+#include <memory>
+
+#include "db/dbformat.h"
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+#include "table/block.h"
+#include "table/block_based_table_reader.h"
+#include "table/block_builder.h"
+#include "table/filter_block.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/table_builder.h"
+
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+#include "util/xxhash.h"
+
+namespace rocksdb {
+
+namespace {
+
+typedef BlockBasedTableOptions::IndexType IndexType;
+
+// The interface for building index.
+// Instruction for adding a new concrete IndexBuilder:
+//  1. Create a subclass instantiated from IndexBuilder.
+//  2. Add a new entry associated with that subclass in TableOptions::IndexType.
+//  3. Add a create function for the new subclass in CreateIndexBuilder.
+// Note: we can devise more advanced design to simplify the process for adding
+// new subclass, which will, on the other hand, increase the code complexity and
+// catch unwanted attention from readers. Given that we won't add/change
+// indexes frequently, it makes sense to just embrace a more straightforward
+// design that just works.
+class IndexBuilder {
+ public:
+  explicit IndexBuilder(const Comparator* comparator)
+      : comparator_(comparator) {}
+
+  virtual ~IndexBuilder() {}
+
+  // Add a new index entry to index block.
+  // To allow further optimization, we provide `last_key_in_current_block` and
+  // `first_key_in_next_block`, based on which the specific implementation can
+  // determine the best index key to be used for the index block.
+  // @last_key_in_current_block: this parameter may be overwritten in place
+  //                             with the "substitute key".
+  // @first_key_in_next_block: it will be nullptr if the entry being added is
+  //                           the last one in the table
+  //
+  // REQUIRES: Finish() has not yet been called.
+  virtual void AddEntry(std::string* last_key_in_current_block,
+                        const Slice* first_key_in_next_block,
+                        const BlockHandle& block_handle) = 0;
+
+  // Inform the index builder that all entries have been written. Block builder
+  // may therefore perform any operation required for block finalization.
+  //
+  // REQUIRES: Finish() has not yet been called.
+  virtual Slice Finish() = 0;
+
+  // Get the estimated size for index block.
+  virtual size_t EstimatedSize() const = 0;
+
+ protected:
+  // Not owned; must outlive this builder.
+  const Comparator* comparator_;
+};
+
+// This index builder builds space-efficient index block.
+//
+// Optimizations:
+//  1. Made block's `block_restart_interval` to be 1, which will avoid linear
+//     search when doing index lookup.
+//  2. Shorten the key length for index block. Other than honestly using the
+//     last key in the data block as the index key, we instead find a shortest
+//     substitute key that serves the same function.
+class ShortenedIndexBuilder : public IndexBuilder {
+ public:
+  explicit ShortenedIndexBuilder(const Comparator* comparator)
+      : IndexBuilder(comparator),
+        index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
+
+  virtual void AddEntry(std::string* last_key_in_current_block,
+                        const Slice* first_key_in_next_block,
+                        const BlockHandle& block_handle) override {
+    if (first_key_in_next_block != nullptr) {
+      comparator_->FindShortestSeparator(last_key_in_current_block,
+                                         *first_key_in_next_block);
+    } else {
+      comparator_->FindShortSuccessor(last_key_in_current_block);
+    }
+
+    std::string handle_encoding;
+    block_handle.EncodeTo(&handle_encoding);
+    index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
+  }
+
+  virtual Slice Finish() override { return index_block_builder_.Finish(); }
+
+  virtual size_t EstimatedSize() const {
+    return index_block_builder_.CurrentSizeEstimate();
+  }
+
+ private:
+  BlockBuilder index_block_builder_;
+};
+
+// FullKeyIndexBuilder is also based on BlockBuilder. It works pretty much like
+// ShortenedIndexBuilder, but preserves the full key instead the substitude key.
+class FullKeyIndexBuilder : public IndexBuilder {
+ public:
+  explicit FullKeyIndexBuilder(const Comparator* comparator)
+      : IndexBuilder(comparator),
+        index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
+
+  virtual void AddEntry(std::string* last_key_in_current_block,
+                        const Slice* first_key_in_next_block,
+                        const BlockHandle& block_handle) override {
+    std::string handle_encoding;
+    block_handle.EncodeTo(&handle_encoding);
+    index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
+  }
+
+  virtual Slice Finish() override { return index_block_builder_.Finish(); }
+
+  virtual size_t EstimatedSize() const {
+    return index_block_builder_.CurrentSizeEstimate();
+  }
+
+ private:
+  BlockBuilder index_block_builder_;
+};
+
+// Create an index builder based on its type.
+// Asserts (debug builds) and returns nullptr for unrecognized types.
+IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator) {
+  switch (type) {
+    case BlockBasedTableOptions::kBinarySearch: {
+      return new ShortenedIndexBuilder(comparator);
+    }
+    default: {
+      assert(!"Do not recognize the index type ");
+      return nullptr;
+    }
+  }
+  // impossible.
+  assert(false);
+  return nullptr;
+}
+
+// Returns true iff compression saved at least 12.5% of the raw size, i.e.
+// compressed_size < raw_size * 7/8 (written so the arithmetic cannot
+// overflow).
+bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
+  return compressed_size < raw_size - (raw_size / 8u);
+}
+
+// Compress `raw` according to *type into *compressed_output and return a
+// slice over the compressed bytes.  If the requested compression library is
+// unavailable on this platform, or the result fails GoodCompressionRatio(),
+// *type is reset to kNoCompression and `raw` is returned unchanged.  The
+// returned slice aliases either `raw` or *compressed_output, so the caller
+// must keep both alive while using it.
+Slice CompressBlock(const Slice& raw,
+                    const CompressionOptions& compression_options,
+                    CompressionType* type, std::string* compressed_output) {
+  if (*type == kNoCompression) {
+    return raw;
+  }
+
+  // Will return compressed block contents if (1) the compression method is
+  // supported in this platform and (2) the compression rate is "good enough".
+  switch (*type) {
+    case kSnappyCompression:
+      if (port::Snappy_Compress(compression_options, raw.data(), raw.size(),
+                                compressed_output) &&
+          GoodCompressionRatio(compressed_output->size(), raw.size())) {
+        return *compressed_output;
+      }
+      break;  // fall back to no compression.
+    case kZlibCompression:
+      if (port::Zlib_Compress(compression_options, raw.data(), raw.size(),
+                              compressed_output) &&
+          GoodCompressionRatio(compressed_output->size(), raw.size())) {
+        return *compressed_output;
+      }
+      break;  // fall back to no compression.
+    case kBZip2Compression:
+      if (port::BZip2_Compress(compression_options, raw.data(), raw.size(),
+                               compressed_output) &&
+          GoodCompressionRatio(compressed_output->size(), raw.size())) {
+        return *compressed_output;
+      }
+      break;  // fall back to no compression.
+    case kLZ4Compression:
+      if (port::LZ4_Compress(compression_options, raw.data(), raw.size(),
+                             compressed_output) &&
+          GoodCompressionRatio(compressed_output->size(), raw.size())) {
+        return *compressed_output;
+      }
+      break;  // fall back to no compression.
+    case kLZ4HCCompression:
+      if (port::LZ4HC_Compress(compression_options, raw.data(), raw.size(),
+                               compressed_output) &&
+          GoodCompressionRatio(compressed_output->size(), raw.size())) {
+        return *compressed_output;
+      }
+      break;     // fall back to no compression.
+    default: {}  // Do not recognize this compression type
+  }
+
+  // Compression method is not supported, or not good compression ratio, so just
+  // fall back to uncompressed form.
+  *type = kNoCompression;
+  return raw;
+}
+
+}  // anonymous namespace
+
+// kBlockBasedTableMagicNumber was picked by running
+//    echo rocksdb.table.block_based | sha1sum
+// and taking the leading 64 bits.
+// Please note that kBlockBasedTableMagicNumber may also be accessed by
+// other .cc files so it have to be explicitly declared with "extern".
+extern const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull;
+// We also support reading and writing legacy block based table format (for
+// backwards compatibility)
+extern const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
+
+// A collector that collects properties of interest to block-based table.
+// For now this class looks heavy-weight since we only write one additional
+// property.
+// But in the forseeable future, we will add more and more properties that are
+// specific to block-based table.
+class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
+    : public TablePropertiesCollector {
+ public:
+  BlockBasedTablePropertiesCollector(
+      BlockBasedTableOptions::IndexType index_type)
+      : index_type_(index_type) {}
+
+  virtual Status Add(const Slice& key, const Slice& value) {
+    // Intentionally left blank. Have no interest in collecting stats for
+    // individual key/value pairs.
+    return Status::OK();
+  }
+
+  virtual Status Finish(UserCollectedProperties* properties) {
+    std::string val;
+    PutFixed32(&val, static_cast<uint32_t>(index_type_));
+    properties->insert({BlockBasedTablePropertyNames::kIndexType, val});
+
+    return Status::OK();
+  }
+
+  // The name of the properties collector can be used for debugging purpose.
+  virtual const char* Name() const {
+    return "BlockBasedTablePropertiesCollector";
+  }
+
+  virtual UserCollectedProperties GetReadableProperties() const {
+    // Intentionally left blank.
+    return UserCollectedProperties();
+  }
+
+ private:
+  BlockBasedTableOptions::IndexType index_type_;
+};
+
+struct BlockBasedTableBuilder::Rep {
+  Options options;
+  const InternalKeyComparator& internal_comparator;
+  WritableFile* file;
+  uint64_t offset = 0;
+  Status status;
+  BlockBuilder data_block;
+  std::unique_ptr<IndexBuilder> index_builder;
+
+  std::string last_key;
+  CompressionType compression_type;
+  ChecksumType checksum_type;
+  TableProperties props;
+
+  bool closed = false;  // Either Finish() or Abandon() has been called.
+  FilterBlockBuilder* filter_block;
+  char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize];
+  size_t compressed_cache_key_prefix_size;
+
+  BlockHandle pending_handle;  // Handle to add to index block
+
+  std::string compressed_output;
+  std::unique_ptr<FlushBlockPolicy> flush_block_policy;
+
+  Rep(const Options& opt, const InternalKeyComparator& icomparator,
+      WritableFile* f, FlushBlockPolicyFactory* flush_block_policy_factory,
+      CompressionType compression_type, IndexType index_block_type,
+      ChecksumType checksum_type)
+      : options(opt),
+        internal_comparator(icomparator),
+        file(f),
+        data_block(options, &internal_comparator),
+        index_builder(
+            CreateIndexBuilder(index_block_type, &internal_comparator)),
+        compression_type(compression_type),
+        checksum_type(checksum_type),
+        filter_block(opt.filter_policy == nullptr
+                         ? nullptr
+                         : new FilterBlockBuilder(opt, &internal_comparator)),
+        flush_block_policy(flush_block_policy_factory->NewFlushBlockPolicy(
+            options, data_block)) {
+    options.table_properties_collectors.push_back(
+        std::make_shared<BlockBasedTablePropertiesCollector>(index_block_type));
+  }
+};
+
+// TODO(sdong): Currently only write out binary search index. In
+// BlockBasedTableReader, Hash index will be built using binary search index.
+BlockBasedTableBuilder::BlockBasedTableBuilder(
+    const Options& options, const BlockBasedTableOptions& table_options,
+    const InternalKeyComparator& internal_comparator, WritableFile* file,
+    CompressionType compression_type)
+    : rep_(new Rep(options, internal_comparator, file,
+                   table_options.flush_block_policy_factory.get(),
+                   compression_type,
+                   BlockBasedTableOptions::IndexType::kBinarySearch,
+                   table_options.checksum)) {
+  if (rep_->filter_block != nullptr) {
+    // Open the filter scope for the first data block (at offset 0).
+    rep_->filter_block->StartBlock(0);
+  }
+  if (options.block_cache_compressed.get() != nullptr) {
+    // Precompute the cache key prefix used by InsertBlockInCache().
+    BlockBasedTable::GenerateCachePrefix(
+        options.block_cache_compressed.get(), file,
+        &rep_->compressed_cache_key_prefix[0],
+        &rep_->compressed_cache_key_prefix_size);
+  }
+}
+
+BlockBasedTableBuilder::~BlockBasedTableBuilder() {
+  assert(rep_->closed);  // Catch errors where caller forgot to call Finish()
+  // NOTE(review): filter_block and rep_ are raw owning pointers; consider
+  // std::unique_ptr members to make ownership explicit.
+  delete rep_->filter_block;
+  delete rep_;
+}
+
+// Append a key/value pair to the table being built.  Keys must arrive in
+// strictly increasing order per the internal comparator (checked by assert
+// in debug builds).  May first flush the buffered data block, in which case
+// the completed block's index entry is emitted using a shortened separator
+// key derived from `key`.
+void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
+  Rep* r = rep_;
+  assert(!r->closed);
+  if (!ok()) return;
+  if (r->props.num_entries > 0) {
+    assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
+  }
+
+  auto should_flush = r->flush_block_policy->Update(key, value);
+  if (should_flush) {
+    assert(!r->data_block.empty());
+    Flush();
+
+    // Add item to index block.
+    // We do not emit the index entry for a block until we have seen the
+    // first key for the next data block.  This allows us to use shorter
+    // keys in the index block.  For example, consider a block boundary
+    // between the keys "the quick brown fox" and "the who".  We can use
+    // "the r" as the key for the index block entry since it is >= all
+    // entries in the first block and < all entries in subsequent
+    // blocks.
+    if (ok()) {
+      r->index_builder->AddEntry(&r->last_key, &key, r->pending_handle);
+    }
+  }
+
+  if (r->filter_block != nullptr) {
+    r->filter_block->AddKey(key);
+  }
+
+  r->last_key.assign(key.data(), key.size());
+  r->data_block.Add(key, value);
+  r->props.num_entries++;
+  r->props.raw_key_size += key.size();
+  r->props.raw_value_size += value.size();
+
+  // Let user-supplied property collectors observe the pair as well.
+  NotifyCollectTableCollectorsOnAdd(
+      key,
+      value,
+      r->options.table_properties_collectors,
+      r->options.info_log.get()
+  );
+}
+
+// Write out the buffered data block (no-op if it is empty or a previous
+// error occurred), flush the file, and open a new filter scope at the new
+// file offset.  The block's location is left in r->pending_handle for the
+// index entry emitted later by Add()/Finish().
+void BlockBasedTableBuilder::Flush() {
+  Rep* r = rep_;
+  assert(!r->closed);
+  if (!ok()) return;
+  if (r->data_block.empty()) return;
+  WriteBlock(&r->data_block, &r->pending_handle);
+  if (ok()) {
+    r->status = r->file->Flush();
+  }
+  if (r->filter_block != nullptr) {
+    r->filter_block->StartBlock(r->offset);
+  }
+  r->props.data_size = r->offset;
+  ++r->props.num_data_blocks;
+}
+
+// Finalize *block, write its (possibly compressed) contents to the file,
+// record the location in *handle, and reset the builder for reuse.
+void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block,
+                                        BlockHandle* handle) {
+  WriteBlock(block->Finish(), handle);
+  block->Reset();
+}
+
+void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents,
+                                        BlockHandle* handle) {
+  // File format contains a sequence of blocks where each block has:
+  //    block_data: uint8[n]
+  //    type: uint8
+  //    crc: uint32
+  assert(ok());
+  Rep* r = rep_;
+
+  // CompressBlock may reset `type` to kNoCompression when compression is
+  // unavailable or does not pay off; the effective type is what gets written
+  // into the block trailer.
+  auto type = r->compression_type;
+  auto block_contents =
+      CompressBlock(raw_block_contents, r->options.compression_opts, &type,
+                    &r->compressed_output);
+  WriteRawBlock(block_contents, type, handle);
+  r->compressed_output.clear();
+}
+
+// Append `block_contents` followed by the kBlockTrailerSize trailer (1 byte
+// compression type + 4 byte checksum covering contents and type byte) to the
+// file.  Sets *handle to the block's offset/size and advances r->offset on
+// success; any failure is recorded in r->status.
+void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents,
+                                           CompressionType type,
+                                           BlockHandle* handle) {
+  Rep* r = rep_;
+  StopWatch sw(r->options.env, r->options.statistics.get(),
+               WRITE_RAW_BLOCK_MICROS);
+  handle->set_offset(r->offset);
+  handle->set_size(block_contents.size());
+  r->status = r->file->Append(block_contents);
+  if (r->status.ok()) {
+    char trailer[kBlockTrailerSize];
+    trailer[0] = type;
+    char* trailer_without_type = trailer + 1;
+    switch (r->checksum_type) {
+      case kNoChecksum:
+        // we don't support no checksum yet
+        assert(false);
+        // intentional fallthrough in release binary
+      case kCRC32c: {
+        auto crc = crc32c::Value(block_contents.data(), block_contents.size());
+        crc = crc32c::Extend(crc, trailer, 1);  // Extend to cover block type
+        EncodeFixed32(trailer_without_type, crc32c::Mask(crc));
+        break;
+      }
+      case kxxHash: {
+        void* xxh = XXH32_init(0);
+        XXH32_update(xxh, block_contents.data(), block_contents.size());
+        XXH32_update(xxh, trailer, 1);  // Extend  to cover block type
+        EncodeFixed32(trailer_without_type, XXH32_digest(xxh));
+        break;
+      }
+    }
+
+    r->status = r->file->Append(Slice(trailer, kBlockTrailerSize));
+    if (r->status.ok()) {
+      r->status = InsertBlockInCache(block_contents, type, handle);
+    }
+    if (r->status.ok()) {
+      r->offset += block_contents.size() + kBlockTrailerSize;
+    }
+  }
+}
+
+// Returns the first error recorded by any write so far, or OK.
+Status BlockBasedTableBuilder::status() const {
+  return rep_->status;
+}
+
+// Cache deleter callback: frees the heap-allocated Block copy owned by the
+// compressed block cache (registered via Cache::Insert below).
+static void DeleteCachedBlock(const Slice& key, void* value) {
+  Block* block = reinterpret_cast<Block*>(value);
+  delete block;
+}
+
+//
+// Make a copy of the block contents and insert into compressed block cache
+//
+Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
+                                 const CompressionType type,
+                                 const BlockHandle* handle) {
+  Rep* r = rep_;
+  Cache* block_cache_compressed = r->options.block_cache_compressed.get();
+
+  // Only compressed blocks are cached here; uncompressed output simply
+  // skips this step and the function returns OK.
+  if (type != kNoCompression && block_cache_compressed != nullptr) {
+
+    Cache::Handle* cache_handle = nullptr;
+    size_t size = block_contents.size();
+
+    char* ubuf = new char[size];             // make a new copy
+    memcpy(ubuf, block_contents.data(), size);
+
+    // heap_allocated = true: presumably the Block takes ownership of ubuf
+    // and frees it in its destructor -- verify against Block's ctor.
+    BlockContents results;
+    Slice sl(ubuf, size);
+    results.data = sl;
+    results.cachable = true; // XXX
+    results.heap_allocated = true;
+    results.compression_type = type;
+
+    Block* block = new Block(results);
+
+    // make cache key by appending the file offset to the cache prefix id
+    char* end = EncodeVarint64(
+                  r->compressed_cache_key_prefix +
+                  r->compressed_cache_key_prefix_size,
+                  handle->offset());
+    Slice key(r->compressed_cache_key_prefix, static_cast<size_t>
+              (end - r->compressed_cache_key_prefix));
+
+    // Insert into compressed block cache.
+    // NOTE(review): assumes Insert() always returns a releasable handle;
+    // confirm the Cache contract before relying on this.
+    cache_handle = block_cache_compressed->Insert(key, block, block->size(),
+                                                  &DeleteCachedBlock);
+    block_cache_compressed->Release(cache_handle);
+
+    // Invalidate OS cache. (Best effort: the returned status is ignored.)
+    r->file->InvalidateCache(r->offset, size);
+  }
+  return Status::OK();
+}
+
+Status BlockBasedTableBuilder::Finish() {
+  Rep* r = rep_;
+  bool empty_data_block = r->data_block.empty();
+  Flush();
+  assert(!r->closed);
+  r->closed = true;
+
+  BlockHandle filter_block_handle,
+              metaindex_block_handle,
+              index_block_handle;
+
+  // Write filter block
+  if (ok() && r->filter_block != nullptr) {
+    auto filter_contents = r->filter_block->Finish();
+    r->props.filter_size = filter_contents.size();
+    WriteRawBlock(filter_contents, kNoCompression, &filter_block_handle);
+  }
+
+  // To make sure properties block is able to keep the accurate size of index
+  // block, we will finish writing all index entries here and flush them
+  // to storage after metaindex block is written.
+  if (ok() && !empty_data_block) {
+    r->index_builder->AddEntry(&r->last_key, nullptr /* no next data block */,
+                               r->pending_handle);
+  }
+
+  // Write meta blocks and metaindex block with the following order.
+  //    1. [meta block: filter]
+  //    2. [meta block: properties]
+  //    3. [metaindex block]
+  if (ok()) {
+    MetaIndexBuilder meta_index_builer;
+
+    // Write filter block.
+    if (r->filter_block != nullptr) {
+      // Add mapping from "<filter_block_prefix>.Name" to location
+      // of filter data.
+      std::string key = BlockBasedTable::kFilterBlockPrefix;
+      key.append(r->options.filter_policy->Name());
+      meta_index_builer.Add(key, filter_block_handle);
+    }
+
+    // Write properties block.
+    {
+      PropertyBlockBuilder property_block_builder;
+      std::vector<std::string> failed_user_prop_collectors;
+      r->props.filter_policy_name = r->options.filter_policy != nullptr ?
+          r->options.filter_policy->Name() : "";
+      r->props.index_size =
+          r->index_builder->EstimatedSize() + kBlockTrailerSize;
+
+      // Add basic properties
+      property_block_builder.AddTableProperty(r->props);
+
+      // Add use collected properties
+      NotifyCollectTableCollectorsOnFinish(
+          r->options.table_properties_collectors,
+          r->options.info_log.get(),
+          &property_block_builder
+      );
+
+      BlockHandle properties_block_handle;
+      WriteRawBlock(
+          property_block_builder.Finish(),
+          kNoCompression,
+          &properties_block_handle
+      );
+
+      meta_index_builer.Add(kPropertiesBlock,
+                            properties_block_handle);
+    }  // end of properties block writing
+
+    WriteRawBlock(
+        meta_index_builer.Finish(),
+        kNoCompression,
+        &metaindex_block_handle
+    );
+  }  // meta blocks and metaindex block.
+
+  // Write index block
+  if (ok()) {
+    WriteBlock(r->index_builder->Finish(), &index_block_handle);
+  }
+
+  // Write footer
+  if (ok()) {
+    // No need to write out new footer if we're using default checksum.
+    // We're writing legacy magic number because we want old versions of RocksDB
+    // be able to read files generated with new release (just in case if
+    // somebody wants to roll back after an upgrade)
+    // TODO(icanadi) at some point in the future, when we're absolutely sure
+    // nobody will roll back to RocksDB 2.x versions, retire the legacy magic
+    // number and always write new table files with new magic number
+    bool legacy = (r->checksum_type == kCRC32c);
+    Footer footer(legacy ? kLegacyBlockBasedTableMagicNumber
+                         : kBlockBasedTableMagicNumber);
+    footer.set_metaindex_handle(metaindex_block_handle);
+    footer.set_index_handle(index_block_handle);
+    footer.set_checksum(r->checksum_type);
+    std::string footer_encoding;
+    footer.EncodeTo(&footer_encoding);
+    r->status = r->file->Append(footer_encoding);
+    if (r->status.ok()) {
+      r->offset += footer_encoding.size();
+    }
+  }
+
+  // Print out the table stats
+  if (ok()) {
+    // user collected properties
+    std::string user_collected;
+    user_collected.reserve(1024);
+    for (auto collector : r->options.table_properties_collectors) {
+      for (const auto& prop : collector->GetReadableProperties()) {
+        user_collected.append(prop.first);
+        user_collected.append("=");
+        user_collected.append(prop.second);
+        user_collected.append("; ");
+      }
+    }
+
+    Log(
+        r->options.info_log,
+        "Table was constructed:\n"
+        "  [basic properties]: %s\n"
+        "  [user collected properties]: %s",
+        r->props.ToString().c_str(),
+        user_collected.c_str()
+    );
+  }
+
+  return r->status;
+}
+
+// Mark the builder closed without writing any further data; whatever was
+// already written to the file is left for the caller to discard.
+void BlockBasedTableBuilder::Abandon() {
+  Rep* r = rep_;
+  assert(!r->closed);
+  r->closed = true;
+}
+
+// Number of successful calls to Add() so far.
+uint64_t BlockBasedTableBuilder::NumEntries() const {
+  return rep_->props.num_entries;
+}
+
+// Bytes written to the file so far (the final file size after Finish()).
+uint64_t BlockBasedTableBuilder::FileSize() const {
+  return rep_->offset;
+}
+
+// Prefix prepended to the filter policy name to form the metaindex key for
+// the filter block (see BlockBasedTableBuilder::Finish).
+const std::string BlockBasedTable::kFilterBlockPrefix =
+    "filter.";
+
+}  // namespace rocksdb
diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h
new file mode 100644 (file)
index 0000000..5871427
--- /dev/null
@@ -0,0 +1,91 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "table/table_builder.h"
+
+namespace rocksdb {
+
+class BlockBuilder;
+class BlockHandle;
+class WritableFile;
+struct BlockBasedTableOptions;
+
+// TableBuilder implementation that produces block-based table files
+// (data blocks + filter/properties meta blocks + index block + footer).
+class BlockBasedTableBuilder : public TableBuilder {
+ public:
+  // Create a builder that will store the contents of the table it is
+  // building in *file.  Does not close the file.  It is up to the
+  // caller to close the file after calling Finish().
+  BlockBasedTableBuilder(const Options& options,
+                         const BlockBasedTableOptions& table_options,
+                         const InternalKeyComparator& internal_comparator,
+                         WritableFile* file, CompressionType compression_type);
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~BlockBasedTableBuilder();
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override;
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override;
+
+  // Finish building the table.  Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish() override;
+
+  // Indicate that the contents of this builder should be abandoned.  Stops
+  // using the file passed to the constructor after this function returns.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Abandon() override;
+
+  // Number of calls to Add() so far.
+  uint64_t NumEntries() const override;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  uint64_t FileSize() const override;
+
+ private:
+  bool ok() const { return status().ok(); }
+  // Call block's Finish() method and then write the finalize block contents to
+  // file.
+  void WriteBlock(BlockBuilder* block, BlockHandle* handle);
+  // Directly write block content to the file.
+  void WriteBlock(const Slice& block_contents, BlockHandle* handle);
+  // Write block contents plus the type/checksum trailer, uncompressed.
+  void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle);
+  // Copy the block into the compressed block cache, if one is configured.
+  Status InsertBlockInCache(const Slice& block_contents,
+                            const CompressionType type,
+                            const BlockHandle* handle);
+  struct Rep;
+  class BlockBasedTablePropertiesCollector;
+  Rep* rep_;  // Owned; deleted in the destructor.
+
+  // Advanced operation: flush any buffered key/value pairs to file.
+  // Can be used to ensure that two adjacent entries never live in
+  // the same data block.  Most clients should not need to use this method.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Flush();
+
+  // No copying allowed
+  BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete;
+  void operator=(const BlockBasedTableBuilder&) = delete;
+};
+
+}  // namespace rocksdb
diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc
new file mode 100644 (file)
index 0000000..822adee
--- /dev/null
@@ -0,0 +1,60 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+
+#include "table/block_based_table_factory.h"
+
+#include <memory>
+#include <string>
+#include <stdint.h>
+
+#include "rocksdb/flush_block_policy.h"
+#include "table/block_based_table_builder.h"
+#include "table/block_based_table_reader.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+// Construct a factory that creates block-based table readers/builders with
+// the supplied options.  If the caller did not provide a flush-block policy
+// factory, install the default size-based policy so the member is never null.
+BlockBasedTableFactory::BlockBasedTableFactory(
+    const BlockBasedTableOptions& table_options)
+    : table_options_(table_options) {
+  if (table_options_.flush_block_policy_factory == nullptr) {
+    table_options_.flush_block_policy_factory.reset(
+        new FlushBlockBySizePolicyFactory());
+  }
+}
+
+// Open an existing block-based table file for reading.  Ownership of `file`
+// is transferred to BlockBasedTable::Open; on success *table_reader holds
+// the new reader, otherwise the returned status describes the failure.
+Status BlockBasedTableFactory::NewTableReader(
+    const Options& options, const EnvOptions& soptions,
+    const InternalKeyComparator& internal_comparator,
+    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+    unique_ptr<TableReader>* table_reader) const {
+  return BlockBasedTable::Open(options, soptions, table_options_,
+                               internal_comparator, std::move(file), file_size,
+                               table_reader);
+}
+
+// Create a builder that writes a new block-based table to `file` using this
+// factory's table options.  The caller owns (and must delete) the returned
+// builder; `file` is borrowed, not owned.
+TableBuilder* BlockBasedTableFactory::NewTableBuilder(
+    const Options& options, const InternalKeyComparator& internal_comparator,
+    WritableFile* file, CompressionType compression_type) const {
+  return new BlockBasedTableBuilder(options, table_options_,
+                                    internal_comparator, file,
+                                    compression_type);
+}
+
+// Public factory function: heap-allocates a BlockBasedTableFactory with the
+// given options.  The caller owns the returned pointer.
+TableFactory* NewBlockBasedTableFactory(
+    const BlockBasedTableOptions& table_options) {
+  return new BlockBasedTableFactory(table_options);
+}
+
+// Property-block key under which the table's index type is stored.
+const std::string BlockBasedTablePropertyNames::kIndexType =
+    "rocksdb.block.based.table.index.type";
+
+}  // namespace rocksdb
diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h
new file mode 100644 (file)
index 0000000..492349c
--- /dev/null
@@ -0,0 +1,48 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+
+struct Options;
+struct EnvOptions;
+
+using std::unique_ptr;
+class BlockBasedTableBuilder;
+
+// TableFactory implementation for the block-based table format.  Produces
+// BlockBasedTable readers and BlockBasedTableBuilder writers configured with
+// a single shared BlockBasedTableOptions instance.
+class BlockBasedTableFactory : public TableFactory {
+ public:
+  // The constructor fills in a default flush_block_policy_factory if the
+  // caller left it null (see the .cc file).
+  explicit BlockBasedTableFactory(
+      const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
+
+  ~BlockBasedTableFactory() {}
+
+  const char* Name() const override { return "BlockBasedTable"; }
+
+  // Opens an existing table file; transfers ownership of `file` on success.
+  Status NewTableReader(const Options& options, const EnvOptions& soptions,
+                        const InternalKeyComparator& internal_comparator,
+                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                        unique_ptr<TableReader>* table_reader) const override;
+
+  // Creates a builder for a new table file; caller owns the result.
+  TableBuilder* NewTableBuilder(
+      const Options& options, const InternalKeyComparator& internal_comparator,
+      WritableFile* file, CompressionType compression_type) const override;
+
+ private:
+  BlockBasedTableOptions table_options_;
+};
+
+}  // namespace rocksdb
diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc
new file mode 100644 (file)
index 0000000..9538007
--- /dev/null
@@ -0,0 +1,1129 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based_table_reader.h"
+
+#include <string>
+#include <utility>
+
+#include "db/dbformat.h"
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+
+#include "table/block.h"
+#include "table/filter_block.h"
+#include "table/block_hash_index.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/two_level_iterator.h"
+
+#include "util/coding.h"
+#include "util/perf_context_imp.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+extern const uint64_t kBlockBasedTableMagicNumber;
+using std::unique_ptr;
+
+typedef BlockBasedTable::IndexReader IndexReader;
+
+namespace {
+
+// The longest the prefix of the cache key used to identify blocks can be.
+// We are using the fact that we know for Posix files the unique ID is three
+// varints.
+// For some reason, compiling for iOS complains that this variable is unused
+const size_t kMaxCacheKeyPrefixSize __attribute__((unused)) =
+    kMaxVarint64Length * 3 + 1;
+
+// Read the block identified by "handle" from "file".
+// The only relevant option is options.verify_checksums for now.
+// Set *didIO to true if didIO is not null.
+// On failure return non-OK.
+// On success fill *result and return OK - caller owns *result
+// NOTE(review): *didIO is set to true even when ReadBlockContents fails,
+// since an I/O attempt was still made.
+Status ReadBlockFromFile(RandomAccessFile* file, const Footer& footer,
+                         const ReadOptions& options, const BlockHandle& handle,
+                         Block** result, Env* env, bool* didIO = nullptr,
+                         bool do_uncompress = true) {
+  BlockContents contents;
+  Status s = ReadBlockContents(file, footer, options, handle, &contents, env,
+                               do_uncompress);
+  if (s.ok()) {
+    // Block takes ownership of the contents buffer.
+    *result = new Block(contents);
+  }
+
+  if (didIO != nullptr) {
+    *didIO = true;
+  }
+  return s;
+}
+
+// Iterator cleanup callback: interpret `arg` as a ResourceType* and free it.
+// The second parameter is required by the cleanup signature but unused.
+template <class ResourceType>
+void DeleteHeldResource(void* arg, void* ignored) {
+  auto* resource = reinterpret_cast<ResourceType*>(arg);
+  delete resource;
+}
+
+// Cache deleter callback: frees the Entry stored under `key` when the cache
+// evicts it.  The key itself is not examined.
+template <class Entry>
+void DeleteCachedEntry(const Slice& key, void* value) {
+  delete reinterpret_cast<Entry*>(value);
+}
+
+// Iterator cleanup callback: drop one reference on a cached entry.
+// `arg` is the Cache*, `h` the Cache::Handle* being released.
+void ReleaseCachedEntry(void* arg, void* h) {
+  auto* cache = reinterpret_cast<Cache*>(arg);
+  cache->Release(reinterpret_cast<Cache::Handle*>(h));
+}
+
+// Build a block-cache key into the caller-supplied `cache_key` buffer:
+// the table's key prefix followed by the varint-encoded block offset.
+// Returns a Slice over `cache_key` (the caller keeps the buffer alive).
+Slice GetCacheKey(const char* cache_key_prefix, size_t cache_key_prefix_size,
+                  const BlockHandle& handle, char* cache_key) {
+  assert(cache_key != nullptr);
+  assert(cache_key_prefix_size != 0);
+  assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
+  memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
+  char* end =
+      EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset());
+  return Slice(cache_key, static_cast<size_t>(end - cache_key));
+}
+
+Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
+                                 Tickers block_cache_miss_ticker,
+                                 Tickers block_cache_hit_ticker,
+                                 Statistics* statistics) {
+  auto cache_handle = block_cache->Lookup(key);
+  if (cache_handle != nullptr) {
+    PERF_COUNTER_ADD(block_cache_hit_count, 1);
+    // overall cache hit
+    RecordTick(statistics, BLOCK_CACHE_HIT);
+    // block-type specific cache hit
+    RecordTick(statistics, block_cache_hit_ticker);
+  } else {
+    // overall cache miss
+    RecordTick(statistics, BLOCK_CACHE_MISS);
+    // block-type specific cache miss
+    RecordTick(statistics, block_cache_miss_ticker);
+  }
+
+  return cache_handle;
+}
+
+}  // namespace
+
+// -- IndexReader and its subclasses
+// IndexReader is the interface that provides the functionality for index
+// access.  Concrete subclasses (binary-search, hash) own the index data and
+// hand out iterators over it.
+class BlockBasedTable::IndexReader {
+ public:
+  explicit IndexReader(const Comparator* comparator)
+      : comparator_(comparator) {}
+
+  virtual ~IndexReader() {}
+
+  // Create an iterator for index access.
+  virtual Iterator* NewIterator() = 0;
+
+  // The size of the index (used as the cache charge when the index is
+  // stored in the block cache).
+  virtual size_t size() const = 0;
+
+ protected:
+  // Comparator used to order index entries; not owned.
+  const Comparator* comparator_;
+};
+
+// Index that allows binary search lookup for the first key of each block.
+// This class can be viewed as a thin wrapper for `Block` class which already
+// supports binary search.
+class BinarySearchIndexReader : public IndexReader {
+ public:
+  // Read index from the file and create an instance of
+  // `BinarySearchIndexReader`.
+  // On success, index_reader will be populated; otherwise it will remain
+  // unmodified.
+  static Status Create(RandomAccessFile* file, const Footer& footer,
+                       const BlockHandle& index_handle, Env* env,
+                       const Comparator* comparator,
+                       IndexReader** index_reader) {
+    Block* index_block = nullptr;
+    auto s = ReadBlockFromFile(file, footer, ReadOptions(), index_handle,
+                               &index_block, env);
+
+    if (s.ok()) {
+      // The reader takes ownership of index_block.
+      *index_reader = new BinarySearchIndexReader(comparator, index_block);
+    }
+
+    return s;
+  }
+
+  virtual Iterator* NewIterator() override {
+    return index_block_->NewIterator(comparator_);
+  }
+
+  virtual size_t size() const override { return index_block_->size(); }
+
+ private:
+  // Private: instances are created only through Create() above.
+  BinarySearchIndexReader(const Comparator* comparator, Block* index_block)
+      : IndexReader(comparator), index_block_(index_block) {
+    assert(index_block_ != nullptr);
+  }
+  std::unique_ptr<Block> index_block_;
+};
+
+// Index that leverages an internal hash table to quicken the lookup for a
+// given key.
+// @param data_iter_gen, equivalent to BlockBasedTable::NewIterator(). But that
+// function requires the index to be initialized. To avoid this problem the
+// external caller will pass a function that can create the iterator over the
+// entries without the table being fully initialized.
+class HashIndexReader : public IndexReader {
+ public:
+  static Status Create(RandomAccessFile* file, const Footer& footer,
+                       const BlockHandle& index_handle, Env* env,
+                       const Comparator* comparator,
+                       std::function<Iterator*(Iterator*)> data_iter_gen,
+                       const SliceTransform* prefix_extractor,
+                       IndexReader** index_reader) {
+    assert(prefix_extractor);
+    Block* index_block = nullptr;
+    auto s = ReadBlockFromFile(file, footer, ReadOptions(), index_handle,
+                               &index_block, env);
+
+    if (!s.ok()) {
+      return s;
+    }
+
+    // The reader (and its unique_ptr member) takes ownership of index_block;
+    // the raw pointer below is then used only to attach the hash index.
+    *index_reader = new HashIndexReader(comparator, index_block);
+    std::unique_ptr<Iterator> index_iter(index_block->NewIterator(nullptr));
+    std::unique_ptr<Iterator> data_iter(
+        data_iter_gen(index_block->NewIterator(nullptr)));
+    auto hash_index = CreateBlockHashIndex(index_iter.get(), data_iter.get(),
+                                           index_block->NumRestarts(),
+                                           comparator, prefix_extractor);
+    index_block->SetBlockHashIndex(hash_index);
+    return s;
+  }
+
+  virtual Iterator* NewIterator() override {
+    return index_block_->NewIterator(comparator_);
+  }
+
+  virtual size_t size() const override { return index_block_->size(); }
+
+ private:
+  // Private: instances are created only through Create() above.
+  HashIndexReader(const Comparator* comparator, Block* index_block)
+      : IndexReader(comparator), index_block_(index_block) {
+    assert(index_block_ != nullptr);
+  }
+  std::unique_ptr<Block> index_block_;
+};
+
+
+// Internal representation of an open block-based table.  Owns the file,
+// the footer, and (optionally) pre-loaded index/filter readers.
+struct BlockBasedTable::Rep {
+  Rep(const EnvOptions& storage_options,
+      const InternalKeyComparator& internal_comparator)
+      : soptions(storage_options), internal_comparator(internal_comparator) {}
+
+  Options options;
+  const EnvOptions& soptions;
+  const InternalKeyComparator& internal_comparator;
+  Status status;
+  unique_ptr<RandomAccessFile> file;
+  // Prefixes used to build block-cache keys; filled by SetupCacheKeyPrefix.
+  char cache_key_prefix[kMaxCacheKeyPrefixSize];
+  size_t cache_key_prefix_size = 0;
+  char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
+  size_t compressed_cache_key_prefix_size = 0;
+
+  // Footer contains the fixed table information
+  Footer footer;
+  // index_reader and filter will be populated and used only when
+  // options.block_cache is nullptr; otherwise we will get the index block via
+  // the block cache.
+  unique_ptr<IndexReader> index_reader;
+  unique_ptr<FilterBlockReader> filter;
+
+  std::shared_ptr<const TableProperties> table_properties;
+  BlockBasedTableOptions::IndexType index_type;
+  // TODO(kailiu) It is very ugly to use internal key in table, since table
+  // module should not be relying on db module. However to make things easier
+  // and compatible with existing code, we introduce a wrapper that allows
+  // block to extract prefix without knowing if a key is internal or not.
+  unique_ptr<SliceTransform> internal_prefix_transform;
+};
+
+// rep_ is a raw owning pointer; free the whole internal representation here.
+BlockBasedTable::~BlockBasedTable() {
+  delete rep_;
+}
+
+// CachableEntry represents the entries that *may* be fetched from block cache.
+//  field `value` is the item we want to get.
+//  field `cache_handle` is the cache handle to the block cache. If the value
+//    was not read from cache, `cache_handle` will be nullptr.
+template <class TValue>
+struct BlockBasedTable::CachableEntry {
+  CachableEntry(TValue* value, Cache::Handle* cache_handle)
+    : value(value)
+    , cache_handle(cache_handle) {
+  }
+  // Default: an empty entry (no value, no cache handle).
+  CachableEntry(): CachableEntry(nullptr, nullptr) { }
+  // Drop the cache reference (if any) and clear both fields.  The value
+  // itself is reclaimed by the cache's deleter, not here.
+  void Release(Cache* cache) {
+    if (cache_handle) {
+      cache->Release(cache_handle);
+      value = nullptr;
+      cache_handle = nullptr;
+    }
+  }
+
+  TValue* value = nullptr;
+  // if the entry is from the cache, cache_handle will be populated.
+  Cache::Handle* cache_handle = nullptr;
+};
+
+// Helper function to setup the cache key's prefix for the Table.
+// Fills rep->cache_key_prefix (for the uncompressed block cache) and
+// rep->compressed_cache_key_prefix (for the compressed block cache), each
+// only when the corresponding cache is configured.
+void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
+  assert(kMaxCacheKeyPrefixSize >= 10);
+  rep->cache_key_prefix_size = 0;
+  rep->compressed_cache_key_prefix_size = 0;
+  if (rep->options.block_cache != nullptr) {
+    GenerateCachePrefix(rep->options.block_cache.get(), rep->file.get(),
+                        &rep->cache_key_prefix[0],
+                        &rep->cache_key_prefix_size);
+  }
+  if (rep->options.block_cache_compressed != nullptr) {
+    GenerateCachePrefix(rep->options.block_cache_compressed.get(),
+                        rep->file.get(), &rep->compressed_cache_key_prefix[0],
+                        &rep->compressed_cache_key_prefix_size);
+  }
+}
+
+// Build a cache-key prefix for blocks of `file`.  Prefer the file's own
+// unique ID; if the file cannot supply one (GetUniqueId returns 0),
+// synthesize a prefix from a fresh ID minted by the cache.
+void BlockBasedTable::GenerateCachePrefix(Cache* cc,
+    RandomAccessFile* file, char* buffer, size_t* size) {
+  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);
+  if (*size != 0) {
+    return;
+  }
+  // No usable file ID; fall back to a cache-generated one.
+  char* end = EncodeVarint64(buffer, cc->NewId());
+  *size = static_cast<size_t>(end - buffer);
+}
+
+// WritableFile overload: same logic as the RandomAccessFile version.
+// Prefer the file's own unique ID; otherwise mint one from the cache.
+void BlockBasedTable::GenerateCachePrefix(Cache* cc,
+    WritableFile* file, char* buffer, size_t* size) {
+  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);
+  if (*size != 0) {
+    return;
+  }
+  // No usable file ID; fall back to a cache-generated one.
+  char* end = EncodeVarint64(buffer, cc->NewId());
+  *size = static_cast<size_t>(end - buffer);
+}
+
+// Open a block-based table file: read the footer, build the internal Rep,
+// load the meta-index and properties blocks, and either warm the block cache
+// with index/filter or pre-load them into the Rep.  On success, ownership of
+// `file` has moved into the new reader stored in *table_reader.
+Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions,
+                             const BlockBasedTableOptions& table_options,
+                             const InternalKeyComparator& internal_comparator,
+                             unique_ptr<RandomAccessFile>&& file,
+                             uint64_t file_size,
+                             unique_ptr<TableReader>* table_reader) {
+  table_reader->reset();
+
+  Footer footer(kBlockBasedTableMagicNumber);
+  auto s = ReadFooterFromFile(file.get(), file_size, &footer);
+  if (!s.ok()) return s;
+
+  // We've successfully read the footer and the index block: we're
+  // ready to serve requests.
+  Rep* rep = new BlockBasedTable::Rep(soptions, internal_comparator);
+  rep->options = options;
+  rep->file = std::move(file);
+  rep->footer = footer;
+  rep->index_type = table_options.index_type;
+  SetupCacheKeyPrefix(rep);
+  // new_table owns rep from here on (freed in ~BlockBasedTable).
+  unique_ptr<BlockBasedTable> new_table(new BlockBasedTable(rep));
+
+  // Read meta index
+  std::unique_ptr<Block> meta;
+  std::unique_ptr<Iterator> meta_iter;
+  s = ReadMetaBlock(rep, &meta, &meta_iter);
+
+  // Read the properties
+  // NOTE(review): the status from ReadMetaBlock above is overwritten here
+  // without being checked; if it failed, meta_iter may be null — confirm
+  // SeekToPropertiesBlock tolerates a null iterator.
+  bool found_properties_block = true;
+  s = SeekToPropertiesBlock(meta_iter.get(), &found_properties_block);
+
+  if (found_properties_block) {
+    s = meta_iter->status();
+    TableProperties* table_properties = nullptr;
+    if (s.ok()) {
+      s = ReadProperties(meta_iter->value(), rep->file.get(), rep->footer,
+                         rep->options.env, rep->options.info_log.get(),
+                         &table_properties);
+    }
+
+    if (!s.ok()) {
+      // A bad properties block is logged but does not fail the open.
+      auto err_msg =
+        "[Warning] Encountered error while reading data from properties "
+        "block " + s.ToString();
+      Log(rep->options.info_log, "%s", err_msg.c_str());
+    } else {
+      rep->table_properties.reset(table_properties);
+    }
+  } else {
+    Log(WARN_LEVEL, rep->options.info_log,
+        "Cannot find Properties block from file.");
+  }
+
+  // Will use block cache for index/filter blocks access?
+  if (options.block_cache && table_options.cache_index_and_filter_blocks) {
+    // Hack: Call NewIndexIterator() to implicitly add index to the block_cache
+    unique_ptr<Iterator> iter(new_table->NewIndexIterator(ReadOptions()));
+    s = iter->status();
+
+    if (s.ok()) {
+      // Hack: Call GetFilter() to implicitly add filter to the block_cache
+      auto filter_entry = new_table->GetFilter();
+      filter_entry.Release(options.block_cache.get());
+    }
+  } else {
+    // If we don't use block cache for index/filter blocks access, we'll
+    // pre-load these blocks, which will kept in member variables in Rep
+    // and with a same life-time as this table object.
+    IndexReader* index_reader = nullptr;
+    // TODO: we never really verify check sum for index block
+    s = new_table->CreateIndexReader(&index_reader);
+
+    if (s.ok()) {
+      rep->index_reader.reset(index_reader);
+
+      // Set filter block
+      if (rep->options.filter_policy) {
+        // The filter block is stored in the meta-index under
+        // kFilterBlockPrefix + policy name.
+        std::string key = kFilterBlockPrefix;
+        key.append(rep->options.filter_policy->Name());
+        meta_iter->Seek(key);
+
+        if (meta_iter->Valid() && meta_iter->key() == Slice(key)) {
+          rep->filter.reset(ReadFilter(meta_iter->value(), rep));
+        }
+      }
+    } else {
+      delete index_reader;
+    }
+  }
+
+  if (s.ok()) {
+    *table_reader = std::move(new_table);
+  }
+
+  return s;
+}
+
+// Apply the configured access-pattern hint to the underlying file before a
+// compaction reads through this table, then remember that the table has been
+// tuned for compaction.
+void BlockBasedTable::SetupForCompaction() {
+  switch (rep_->options.access_hint_on_compaction_start) {
+    case Options::NONE:
+      break;
+    case Options::NORMAL:
+      rep_->file->Hint(RandomAccessFile::NORMAL);
+      break;
+    case Options::SEQUENTIAL:
+      rep_->file->Hint(RandomAccessFile::SEQUENTIAL);
+      break;
+    case Options::WILLNEED:
+      rep_->file->Hint(RandomAccessFile::WILLNEED);
+      break;
+    default:
+      // Unknown hint value indicates a programming error.
+      assert(false);
+  }
+  compaction_optimized_ = true;
+}
+
+// Return the properties loaded from the table's properties block at Open()
+// time (may be null if the block was missing or unreadable).
+std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
+    const {
+  return rep_->table_properties;
+}
+
+// Load the meta-block from the file. On success, return the loaded meta block
+// and its iterator.
+Status BlockBasedTable::ReadMetaBlock(
+    Rep* rep,
+    std::unique_ptr<Block>* meta_block,
+    std::unique_ptr<Iterator>* iter) {
+  // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
+  // it is an empty block.
+  //  TODO: we never really verify check sum for meta index block
+  Block* meta = nullptr;
+  Status s = ReadBlockFromFile(
+      rep->file.get(),
+      rep->footer,
+      ReadOptions(),
+      rep->footer.metaindex_handle(),
+      &meta,
+      rep->options.env);
+
+    if (!s.ok()) {
+      auto err_msg =
+        "[Warning] Encountered error while reading data from properties"
+        "block " + s.ToString();
+      Log(rep->options.info_log, "%s", err_msg.c_str());
+    }
+  if (!s.ok()) {
+    delete meta;
+    return s;
+  }
+
+  meta_block->reset(meta);
+  // meta block uses bytewise comparator.
+  iter->reset(meta->NewIterator(BytewiseComparator()));
+  return Status::OK();
+}
+
+// Try to fetch a data block, first from the uncompressed block cache, then
+// from the compressed block cache (uncompressing and re-inserting into the
+// uncompressed cache on a compressed hit).  On a miss, `block` is left empty
+// with an OK status; the caller reads from the file instead.
+Status BlockBasedTable::GetDataBlockFromCache(
+    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+    Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics,
+    const ReadOptions& read_options,
+    BlockBasedTable::CachableEntry<Block>* block) {
+  Status s;
+  Block* compressed_block = nullptr;
+  Cache::Handle* block_cache_compressed_handle = nullptr;
+
+  // Lookup uncompressed cache first
+  if (block_cache != nullptr) {
+    block->cache_handle =
+        GetEntryFromCache(block_cache, block_cache_key, BLOCK_CACHE_DATA_MISS,
+                          BLOCK_CACHE_DATA_HIT, statistics);
+    if (block->cache_handle != nullptr) {
+      block->value =
+          reinterpret_cast<Block*>(block_cache->Value(block->cache_handle));
+      return s;
+    }
+  }
+
+  // If not found, search from the compressed block cache.
+  assert(block->cache_handle == nullptr && block->value == nullptr);
+
+  if (block_cache_compressed == nullptr) {
+    return s;
+  }
+
+  assert(!compressed_block_cache_key.empty());
+  block_cache_compressed_handle =
+      block_cache_compressed->Lookup(compressed_block_cache_key);
+  // if we found in the compressed cache, then uncompress and insert into
+  // uncompressed cache
+  if (block_cache_compressed_handle == nullptr) {
+    RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
+    return s;
+  }
+
+  // found compressed block
+  RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
+  compressed_block = reinterpret_cast<Block*>(
+      block_cache_compressed->Value(block_cache_compressed_handle));
+  assert(compressed_block->compression_type() != kNoCompression);
+
+  // Retrieve the uncompressed contents into a new buffer
+  BlockContents contents;
+  s = UncompressBlockContents(compressed_block->data(),
+                              compressed_block->size(), &contents);
+
+  // Insert uncompressed block into block cache
+  if (s.ok()) {
+    block->value = new Block(contents);  // uncompressed block
+    assert(block->value->compression_type() == kNoCompression);
+    if (block_cache != nullptr && block->value->cachable() &&
+        read_options.fill_cache) {
+      block->cache_handle =
+          block_cache->Insert(block_cache_key, block->value,
+                              block->value->size(), &DeleteCachedEntry<Block>);
+      assert(reinterpret_cast<Block*>(
+                 block_cache->Value(block->cache_handle)) == block->value);
+    }
+  }
+
+  // Release hold on compressed cache entry
+  block_cache_compressed->Release(block_cache_compressed_handle);
+  return s;
+}
+
+// Insert a freshly read block into the caches: the raw (possibly compressed)
+// form goes to the compressed block cache, and an uncompressed copy goes to
+// the uncompressed block cache.  Ownership of `raw_block` is taken in all
+// paths (it is either cached, handed to `block`, or deleted).  On success,
+// `block` holds the uncompressed block the caller can iterate.
+Status BlockBasedTable::PutDataBlockToCache(
+    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+    Cache* block_cache, Cache* block_cache_compressed,
+    const ReadOptions& read_options, Statistics* statistics,
+    CachableEntry<Block>* block, Block* raw_block) {
+  assert(raw_block->compression_type() == kNoCompression ||
+         block_cache_compressed != nullptr);
+
+  Status s;
+  // Retrieve the uncompressed contents into a new buffer
+  BlockContents contents;
+  if (raw_block->compression_type() != kNoCompression) {
+    s = UncompressBlockContents(raw_block->data(), raw_block->size(),
+                                &contents);
+  }
+  if (!s.ok()) {
+    delete raw_block;
+    return s;
+  }
+
+  if (raw_block->compression_type() != kNoCompression) {
+    block->value = new Block(contents);  // uncompressed block
+  } else {
+    // Already uncompressed: hand the raw block itself to the caller.
+    block->value = raw_block;
+    raw_block = nullptr;
+  }
+
+  // Insert compressed block into compressed block cache.
+  // Release the hold on the compressed cache entry immediately.
+  if (block_cache_compressed != nullptr && raw_block != nullptr &&
+      raw_block->cachable()) {
+    auto cache_handle = block_cache_compressed->Insert(
+        compressed_block_cache_key, raw_block, raw_block->size(),
+        &DeleteCachedEntry<Block>);
+    block_cache_compressed->Release(cache_handle);
+    // NOTE(review): recording a COMPRESSED_MISS ticker on the insert path
+    // looks odd (an "add"-style ticker seems intended) — confirm.
+    RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
+    // Avoid the following code to delete this cached block.
+    raw_block = nullptr;
+  }
+  delete raw_block;
+
+  // insert into uncompressed block cache
+  assert((block->value->compression_type() == kNoCompression));
+  if (block_cache != nullptr && block->value->cachable()) {
+    block->cache_handle =
+        block_cache->Insert(block_cache_key, block->value, block->value->size(),
+                            &DeleteCachedEntry<Block>);
+    RecordTick(statistics, BLOCK_CACHE_ADD);
+    assert(reinterpret_cast<Block*>(block_cache->Value(block->cache_handle)) ==
+           block->value);
+  }
+
+  return s;
+}
+
+// Read the filter block referenced by the encoded handle in
+// `filter_handle_value`.  Returns a new FilterBlockReader (caller owns it),
+// or nullptr on any decode/read failure.  If `filter_size` is non-null it
+// receives the size of the raw filter data (used as the cache charge).
+FilterBlockReader* BlockBasedTable::ReadFilter (
+    const Slice& filter_handle_value,
+    BlockBasedTable::Rep* rep,
+    size_t* filter_size) {
+  Slice v = filter_handle_value;
+  BlockHandle filter_handle;
+  if (!filter_handle.DecodeFrom(&v).ok()) {
+    return nullptr;
+  }
+
+  // TODO: We might want to unify with ReadBlockFromFile() if we start
+  // requiring checksum verification in Table::Open.
+  ReadOptions opt;
+  BlockContents block;
+  // Filter data is read without attempting decompression (last arg false).
+  if (!ReadBlockContents(rep->file.get(), rep->footer, opt, filter_handle,
+                         &block, rep->options.env, false).ok()) {
+    return nullptr;
+  }
+
+  if (filter_size) {
+    *filter_size = block.data.size();
+  }
+
+  return new FilterBlockReader(
+       rep->options, block.data, block.heap_allocated);
+}
+
+// Return the table's filter reader: from the pre-populated Rep if present,
+// otherwise from the block cache (reading and caching it on a miss).  When
+// `no_io` is set, a cache miss returns an empty entry instead of reading.
+// The caller must Release() the returned entry when a cache handle is held.
+BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
+    bool no_io) const {
+  // filter pre-populated
+  if (rep_->filter != nullptr) {
+    return {rep_->filter.get(), nullptr /* cache handle */};
+  }
+
+  if (rep_->options.filter_policy == nullptr /* do not use filter at all */ ||
+      rep_->options.block_cache == nullptr /* no block cache at all */) {
+    return {nullptr /* filter */, nullptr /* cache handle */};
+  }
+
+  // Fetching from the cache
+  Cache* block_cache = rep_->options.block_cache.get();
+  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+  // The filter's cache key is derived from the metaindex handle.
+  auto key = GetCacheKey(
+      rep_->cache_key_prefix,
+      rep_->cache_key_prefix_size,
+      rep_->footer.metaindex_handle(),
+      cache_key
+  );
+
+  Statistics* statistics = rep_->options.statistics.get();
+  auto cache_handle =
+      GetEntryFromCache(block_cache, key, BLOCK_CACHE_FILTER_MISS,
+                        BLOCK_CACHE_FILTER_HIT, statistics);
+
+  FilterBlockReader* filter = nullptr;
+  if (cache_handle != nullptr) {
+     filter = reinterpret_cast<FilterBlockReader*>(
+         block_cache->Value(cache_handle));
+  } else if (no_io) {
+    // Do not invoke any io.
+    return CachableEntry<FilterBlockReader>();
+  } else {
+    // Cache miss: locate the filter block via the meta-index, read it, and
+    // insert it into the block cache.
+    size_t filter_size = 0;
+    std::unique_ptr<Block> meta;
+    std::unique_ptr<Iterator> iter;
+    auto s = ReadMetaBlock(rep_, &meta, &iter);
+
+    if (s.ok()) {
+      std::string filter_block_key = kFilterBlockPrefix;
+      filter_block_key.append(rep_->options.filter_policy->Name());
+      iter->Seek(filter_block_key);
+
+      if (iter->Valid() && iter->key() == Slice(filter_block_key)) {
+        filter = ReadFilter(iter->value(), rep_, &filter_size);
+        assert(filter);
+        assert(filter_size > 0);
+
+        cache_handle = block_cache->Insert(
+            key, filter, filter_size, &DeleteCachedEntry<FilterBlockReader>);
+        RecordTick(statistics, BLOCK_CACHE_ADD);
+      }
+    }
+  }
+
+  return { filter, cache_handle };
+}
+
+// Return an iterator over the table's index: from the pre-populated index
+// reader when available, otherwise via the block cache (creating and caching
+// the IndexReader on a miss).  A cleanup hook on the iterator releases the
+// cache handle when the iterator is destroyed.
+Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) {
+  // index reader has already been pre-populated.
+  if (rep_->index_reader) {
+    return rep_->index_reader->NewIterator();
+  }
+
+  bool no_io = read_options.read_tier == kBlockCacheTier;
+  Cache* block_cache = rep_->options.block_cache.get();
+  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+  auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
+                         rep_->footer.index_handle(), cache_key);
+  Statistics* statistics = rep_->options.statistics.get();
+  auto cache_handle =
+      GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS,
+                        BLOCK_CACHE_INDEX_HIT, statistics);
+
+  if (cache_handle == nullptr && no_io) {
+    // Cache-only read tier and the index is not cached: report Incomplete.
+    return NewErrorIterator(Status::Incomplete("no blocking io"));
+  }
+
+  IndexReader* index_reader = nullptr;
+  if (cache_handle != nullptr) {
+    index_reader =
+        reinterpret_cast<IndexReader*>(block_cache->Value(cache_handle));
+  } else {
+    // Create index reader and put it in the cache.
+    Status s;
+    s = CreateIndexReader(&index_reader);
+
+    if (!s.ok()) {
+      // make sure if something goes wrong, index_reader shall remain intact.
+      assert(index_reader == nullptr);
+      return NewErrorIterator(s);
+    }
+
+    cache_handle = block_cache->Insert(key, index_reader, index_reader->size(),
+                                       &DeleteCachedEntry<IndexReader>);
+    RecordTick(statistics, BLOCK_CACHE_ADD);
+  }
+
+  assert(cache_handle);
+  auto iter = index_reader->NewIterator();
+  // Drop the cache reference when the iterator is destroyed.
+  iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle);
+
+  return iter;
+}
+
+// Convert an index iterator value (i.e., an encoded BlockHandle)
+// into an iterator over the contents of the corresponding block.
+Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
+    const ReadOptions& ro, bool* didIO, const Slice& index_value) {
+  // kBlockCacheTier means the caller forbids file I/O; we may only serve
+  // the block out of the block cache(s).
+  const bool no_io = (ro.read_tier == kBlockCacheTier);
+  Cache* block_cache = rep->options.block_cache.get();
+  Cache* block_cache_compressed = rep->options.
+                                    block_cache_compressed.get();
+  CachableEntry<Block> block;
+
+  BlockHandle handle;
+  Slice input = index_value;
+  // We intentionally allow extra stuff in index_value so that we
+  // can add more features in the future.
+  Status s = handle.DecodeFrom(&input);
+
+  if (!s.ok()) {
+    return NewErrorIterator(s);
+  }
+
+  // If either block cache is enabled, we'll try to read from it.
+  if (block_cache != nullptr || block_cache_compressed != nullptr) {
+    Statistics* statistics = rep->options.statistics.get();
+    char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+    char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+    Slice key, /* key to the block cache */
+        ckey /* key to the compressed block cache */;
+
+    // create key for block cache
+    if (block_cache != nullptr) {
+      key = GetCacheKey(rep->cache_key_prefix,
+                        rep->cache_key_prefix_size, handle, cache_key);
+    }
+
+    if (block_cache_compressed != nullptr) {
+      ckey = GetCacheKey(rep->compressed_cache_key_prefix,
+                         rep->compressed_cache_key_prefix_size, handle,
+                         compressed_cache_key);
+    }
+
+    s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
+                      statistics, ro, &block);
+
+    // Cache miss: read the block from the file (unless I/O is forbidden or
+    // the caller opted out of cache fill), then insert it for future reads.
+    if (block.value == nullptr && !no_io && ro.fill_cache) {
+      Histograms histogram = READ_BLOCK_GET_MICROS;
+      Block* raw_block = nullptr;
+      {
+        StopWatch sw(rep->options.env, statistics, histogram);
+        // The last argument asks for decompression only when there is no
+        // compressed cache; otherwise we keep the raw (possibly compressed)
+        // bytes so they can be inserted into block_cache_compressed as-is.
+        s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle,
+                              &raw_block, rep->options.env, didIO,
+                              block_cache_compressed == nullptr);
+      }
+
+      if (s.ok()) {
+        // PutDataBlockToCache takes ownership of raw_block (frees it on
+        // error) and fills `block` with the uncompressed, cached entry.
+        s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed,
+                                ro, statistics, &block, raw_block);
+      }
+    }
+  }
+
+  // Didn't get any data from block caches.
+  if (block.value == nullptr) {
+    if (no_io) {
+      // Could not read from block_cache and can't do IO
+      return NewErrorIterator(Status::Incomplete("no blocking io"));
+    }
+    s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle,
+                          &block.value, rep->options.env, didIO);
+  }
+
+  Iterator* iter;
+  if (block.value != nullptr) {
+    iter = block.value->NewIterator(&rep->internal_comparator);
+    if (block.cache_handle != nullptr) {
+      // Block lives in the cache: drop the cache handle when the iterator
+      // is destroyed.
+      iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
+                            block.cache_handle);
+    } else {
+      // Block was read directly from the file: the iterator owns it.
+      iter->RegisterCleanup(&DeleteHeldResource<Block>, block.value, nullptr);
+    }
+  } else {
+    iter = NewErrorIterator(s);
+  }
+  return iter;
+}
+
+// Glue state for NewTwoLevelIterator(): converts an index-block entry
+// (an encoded BlockHandle) into an iterator over the data block it names.
+class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
+ public:
+  BlockEntryIteratorState(BlockBasedTable* table,
+      const ReadOptions& read_options, bool* did_io)
+    : TwoLevelIteratorState(table->rep_->options.prefix_extractor != nullptr),
+      table_(table), read_options_(read_options), did_io_(did_io) {}
+
+  // index_value comes straight from the index iterator's value().
+  Iterator* NewSecondaryIterator(const Slice& index_value) override {
+    return NewDataBlockIterator(table_->rep_, read_options_, did_io_,
+                                index_value);
+  }
+
+  bool PrefixMayMatch(const Slice& internal_key) override {
+    return table_->PrefixMayMatch(internal_key);
+  }
+
+ private:
+  // Don't own table_
+  BlockBasedTable* table_;
+  const ReadOptions read_options_;
+  // Don't own did_io_
+  bool* did_io_;
+};
+
+// This will be broken if the user specifies an unusual implementation
+// of Options.comparator, or if the user specifies an unusual
+// definition of prefixes in Options.filter_policy.  In particular, we
+// require the following three properties:
+//
+// 1) key.starts_with(prefix(key))
+// 2) Compare(prefix(key), key) <= 0.
+// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
+//
+// Otherwise, this method guarantees no I/O will be incurred.
+//
+// REQUIRES: this method shouldn't be called while the DB lock is held.
+bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) {
+  assert(rep_->options.prefix_extractor != nullptr);
+  auto prefix = rep_->options.prefix_extractor->Transform(
+      ExtractUserKey(internal_key));
+  InternalKey internal_key_prefix(prefix, 0, kTypeValue);
+  auto internal_prefix = internal_key_prefix.Encode();
+
+  bool may_match = true;
+  Status s;
+
+  if (!rep_->options.filter_policy) {
+    return true;
+  }
+
+  // To prevent any io operation in this method, we set `read_tier` to make
+  // sure we always read index or filter only when they have already been
+  // loaded to memory.
+  ReadOptions no_io_read_options;
+  no_io_read_options.read_tier = kBlockCacheTier;
+  unique_ptr<Iterator> iiter(NewIndexIterator(no_io_read_options));
+  iiter->Seek(internal_prefix);
+
+  if (!iiter->Valid()) {
+    // we're past end of file
+    // if it's incomplete, it means that we avoided I/O
+    // and we're not really sure that we're past the end
+    // of the file
+    may_match = iiter->status().IsIncomplete();
+  } else if (ExtractUserKey(iiter->key()).starts_with(
+              ExtractUserKey(internal_prefix))) {
+    // we need to check for this subtle case because our only
+    // guarantee is that "the key is a string >= last key in that data
+    // block" according to the doc/table_format.txt spec.
+    //
+    // Suppose iiter->key() starts with the desired prefix; it is not
+    // necessarily the case that the corresponding data block will
+    // contain the prefix, since iiter->key() need not be in the
+    // block.  However, the next data block may contain the prefix, so
+    // we return true to play it safe.
+    may_match = true;
+  } else {
+    // iiter->key() does NOT start with the desired prefix.  Because
+    // Seek() finds the first key that is >= the seek target, this
+    // means that iiter->key() > prefix.  Thus, any data blocks coming
+    // after the data block corresponding to iiter->key() cannot
+    // possibly contain the key.  Thus, the corresponding data block
+    // is the only one which could potentially contain the prefix.
+    Slice handle_value = iiter->value();
+    BlockHandle handle;
+    s = handle.DecodeFrom(&handle_value);
+    assert(s.ok());
+    auto filter_entry = GetFilter(true /* no io */);
+    may_match =
+      filter_entry.value == nullptr ||
+      filter_entry.value->PrefixMayMatch(handle.offset(), internal_prefix);
+    filter_entry.Release(rep_->options.block_cache.get());
+  }
+
+  Statistics* statistics = rep_->options.statistics.get();
+  RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
+  if (!may_match) {
+    RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
+  }
+
+  return may_match;
+}
+
+Iterator* BlockBasedTable::NewIterator(const ReadOptions& read_options) {
+  return NewTwoLevelIterator(new BlockEntryIteratorState(this, read_options,
+                                                         nullptr),
+                             NewIndexIterator(read_options));
+}
+
+Status BlockBasedTable::Get(
+    const ReadOptions& read_options, const Slice& key, void* handle_context,
+    bool (*result_handler)(void* handle_context, const ParsedInternalKey& k,
+                           const Slice& v, bool didIO),
+    void (*mark_key_may_exist_handler)(void* handle_context)) {
+  Status s;
+  Iterator* iiter = NewIndexIterator(read_options);
+  auto filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier);
+  FilterBlockReader* filter = filter_entry.value;
+  bool done = false;
+  for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
+    Slice handle_value = iiter->value();
+
+    BlockHandle handle;
+    bool may_not_exist_in_filter =
+      filter != nullptr &&
+      handle.DecodeFrom(&handle_value).ok() &&
+      !filter->KeyMayMatch(handle.offset(), key);
+
+    if (may_not_exist_in_filter) {
+      // Not found
+      // TODO: think about interaction with Merge. If a user key cannot
+      // cross one data block, we should be fine.
+      RecordTick(rep_->options.statistics.get(), BLOOM_FILTER_USEFUL);
+      break;
+    } else {
+      bool didIO = false;
+      unique_ptr<Iterator> block_iter(
+          NewDataBlockIterator(rep_, read_options, &didIO, iiter->value()));
+
+      if (read_options.read_tier && block_iter->status().IsIncomplete()) {
+        // couldn't get block from block_cache
+        // Update Saver.state to Found because we are only looking for whether
+        // we can guarantee the key is not there when "no_io" is set
+        (*mark_key_may_exist_handler)(handle_context);
+        break;
+      }
+
+      // Call the *saver function on each entry/block until it returns false
+      for (block_iter->Seek(key); block_iter->Valid(); block_iter->Next()) {
+        ParsedInternalKey parsed_key;
+        if (!ParseInternalKey(block_iter->key(), &parsed_key)) {
+          s = Status::Corruption(Slice());
+        }
+
+        if (!(*result_handler)(handle_context, parsed_key, block_iter->value(),
+                               didIO)) {
+          done = true;
+          break;
+        }
+      }
+      s = block_iter->status();
+    }
+  }
+
+  filter_entry.Release(rep_->options.block_cache.get());
+  if (s.ok()) {
+    s = iiter->status();
+  }
+  delete iiter;
+  return s;
+}
+
+namespace {
+// Saver callback for TEST_KeyInCache(): records whether serving the entry
+// required file I/O, then stops the scan after the first entry.
+bool SaveDidIO(void* arg, const ParsedInternalKey& key, const Slice& value,
+               bool didIO) {
+  // static_cast is the correct named cast for restoring a pointer that was
+  // erased through void*; reinterpret_cast is unnecessarily strong here.
+  *static_cast<bool*>(arg) = didIO;
+  return false;  // one entry is enough
+}
+}  // namespace
+
+bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
+                                      const Slice& key) {
+  // Piggy-back on Get(): its saver callback (SaveDidIO) records whether
+  // serving the key had to touch the file.  No file read means the block
+  // was already cached.
+  bool required_io = false;
+  const Status status = Get(options, key, &required_io, SaveDidIO);
+  assert(status.ok());
+  return !required_io;
+}
+
+// REQUIRES: The following fields of rep_ should have already been populated:
+//  1. file
+//  2. index_handle,
+//  3. options
+//  4. internal_comparator
+//  5. index_type
+Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader) {
+  // Some old version of block-based tables don't have index type present in
+  // table properties. If that's the case we can safely use the kBinarySearch.
+  auto index_type_on_file = BlockBasedTableOptions::kBinarySearch;
+  if (rep_->table_properties) {
+    auto& props = rep_->table_properties->user_collected_properties;
+    auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
+    if (pos != props.end()) {
+      index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
+          DecodeFixed32(pos->second.c_str()));
+    }
+  }
+
+  // TODO(sdong): Currently binary index is the only index type we support in
+  // files. Hash index is built on top of binary index too.
+  if (index_type_on_file != BlockBasedTableOptions::kBinarySearch) {
+    return Status::NotSupported("File Contains not supported index type: ",
+                                std::to_string(index_type_on_file));
+  }
+
+  auto file = rep_->file.get();
+  auto env = rep_->options.env;
+  auto comparator = &rep_->internal_comparator;
+  const Footer& footer = rep_->footer;
+
+  // Note: the switch dispatches on rep_->index_type (the *requested* index
+  // type); index_type_on_file above only gates file-format support.
+  switch (rep_->index_type) {
+    case BlockBasedTableOptions::kBinarySearch: {
+      return BinarySearchIndexReader::Create(
+          file, footer, footer.index_handle(), env, comparator, index_reader);
+    }
+    case BlockBasedTableOptions::kHashSearch: {
+      // We need to wrap data with internal_prefix_transform to make sure it can
+      // handle prefix correctly.
+      rep_->internal_prefix_transform.reset(
+          new InternalKeySliceTransform(rep_->options.prefix_extractor.get()));
+      // NOTE(review): the lambda below captures by reference; this is safe
+      // only if HashIndexReader::Create invokes it synchronously and does not
+      // store it -- confirm against its implementation.
+      return HashIndexReader::Create(
+          file, footer, footer.index_handle(), env, comparator,
+          [&](Iterator* index_iter) {
+            return NewTwoLevelIterator(new BlockEntryIteratorState(this,
+                ReadOptions(), nullptr), index_iter);
+          },
+          rep_->internal_prefix_transform.get(), index_reader);
+    }
+    default: {
+      std::string error_message =
+          "Unrecognized index type: " + std::to_string(rep_->index_type);
+      // equivalent to assert(false), but more informative.
+      assert(!error_message.c_str());
+      return Status::InvalidArgument(error_message.c_str());
+    }
+  }
+}
+
+uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) {
+  unique_ptr<Iterator> index_iter(NewIndexIterator(ReadOptions()));
+
+  index_iter->Seek(key);
+  uint64_t result;
+  if (index_iter->Valid()) {
+    BlockHandle handle;
+    Slice input = index_iter->value();
+    Status s = handle.DecodeFrom(&input);
+    if (s.ok()) {
+      result = handle.offset();
+    } else {
+      // Strange: we can't decode the block handle in the index block.
+      // We'll just return the offset of the metaindex block, which is
+      // close to the whole file size for this case.
+      result = rep_->footer.metaindex_handle().offset();
+    }
+  } else {
+    // key is past the last key in the file. If table_properties is not
+    // available, approximate the offset by returning the offset of the
+    // metaindex block (which is right near the end of the file).
+    result = 0;
+    if (rep_->table_properties) {
+      result = rep_->table_properties->data_size;
+    }
+    // table_properties is not present in the table.
+    if (result == 0) {
+      result = rep_->footer.metaindex_handle().offset();
+    }
+  }
+  return result;
+}
+
+// Test hook: the filter is "preloaded" when rep_ holds it directly rather
+// than fetching it through the block cache on demand.
+bool BlockBasedTable::TEST_filter_block_preloaded() const {
+  return rep_->filter != nullptr;
+}
+
+// Test hook: the index reader is "preloaded" when rep_ holds it directly
+// rather than fetching it through the block cache on demand.
+bool BlockBasedTable::TEST_index_reader_preloaded() const {
+  return rep_->index_reader != nullptr;
+}
+
+}  // namespace rocksdb
diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h
new file mode 100644 (file)
index 0000000..f68d642
--- /dev/null
@@ -0,0 +1,198 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdint.h>
+#include <memory>
+#include <utility>
+#include <string>
+
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/table_reader.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+class Block;
+class BlockHandle;
+class Cache;
+class FilterBlockReader;
+class Footer;
+class InternalKeyComparator;
+class Iterator;
+class RandomAccessFile;
+class TableCache;
+class TableReader;
+class WritableFile;
+struct BlockBasedTableOptions;
+struct EnvOptions;
+struct Options;
+struct ReadOptions;
+
+using std::unique_ptr;
+
+// A Table is a sorted map from strings to strings.  Tables are
+// immutable and persistent.  A Table may be safely accessed from
+// multiple threads without external synchronization.
+class BlockBasedTable : public TableReader {
+ public:
+  static const std::string kFilterBlockPrefix;
+
+  // Attempt to open the table that is stored in bytes [0..file_size)
+  // of "file", and read the metadata entries necessary to allow
+  // retrieving data from the table.
+  //
+  // If successful, returns ok and sets "*table_reader" to the newly opened
+  // table.  The client should delete "*table_reader" when no longer needed.
+  // If there was an error while initializing the table, sets "*table_reader"
+  // to nullptr and returns a non-ok status.
+  //
+  // *file must remain live while this Table is in use.
+  static Status Open(const Options& db_options, const EnvOptions& env_options,
+                     const BlockBasedTableOptions& table_options,
+                     const InternalKeyComparator& internal_key_comparator,
+                     unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                     unique_ptr<TableReader>* table_reader);
+
+  bool PrefixMayMatch(const Slice& internal_key);
+
+  // Returns a new iterator over the table contents.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  Iterator* NewIterator(const ReadOptions&) override;
+
+  Status Get(const ReadOptions& readOptions, const Slice& key,
+             void* handle_context,
+             bool (*result_handler)(void* handle_context,
+                                    const ParsedInternalKey& k, const Slice& v,
+                                    bool didIO),
+             void (*mark_key_may_exist_handler)(void* handle_context) =
+                 nullptr) override;
+
+  // Given a key, return an approximate byte offset in the file where
+  // the data for that key begins (or would begin if the key were
+  // present in the file).  The returned value is in terms of file
+  // bytes, and so includes effects like compression of the underlying data.
+  // E.g., the approximate offset of the last key in the table will
+  // be close to the file length.
+  uint64_t ApproximateOffsetOf(const Slice& key) override;
+
+  // Returns true if the block for the specified key is in cache.
+  // REQUIRES: key is in this table.
+  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
+
+  // Set up the table for Compaction. Might change some parameters with
+  // posix_fadvise
+  void SetupForCompaction() override;
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const override;
+
+  ~BlockBasedTable();
+
+  bool TEST_filter_block_preloaded() const;
+  bool TEST_index_reader_preloaded() const;
+  // Implementation of IndexReader will be exposed to internal cc file only.
+  class IndexReader;
+
+ private:
+  template <class TValue>
+  struct CachableEntry;
+
+  struct Rep;
+  Rep* rep_;
+  bool compaction_optimized_;
+
+  class BlockEntryIteratorState;
+  static Iterator* NewDataBlockIterator(Rep* rep, const ReadOptions& ro,
+      bool* didIO, const Slice& index_value);
+
+  // For the following two functions:
+  // if `no_io == true`, we will not try to read filter/index from sst file
+  // were they not present in cache yet.
+  CachableEntry<FilterBlockReader> GetFilter(bool no_io = false) const;
+
+  // Get the iterator from the index reader.
+  //
+  // Note: ErrorIterator with Status::Incomplete shall be returned if all the
+  // following conditions are met:
+  //  1. We enabled table_options.cache_index_and_filter_blocks.
+  //  2. index is not present in block cache.
+  //  3. We disallowed any io to be performed, that is, read_options ==
+  //     kBlockCacheTier
+  Iterator* NewIndexIterator(const ReadOptions& read_options);
+
+  // Read block cache from block caches (if set): block_cache and
+  // block_cache_compressed.
+  // On success, Status::OK with be returned and @block will be populated with
+  // pointer to the block as well as its block handle.
+  static Status GetDataBlockFromCache(
+      const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+      Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics,
+      const ReadOptions& read_options,
+      BlockBasedTable::CachableEntry<Block>* block);
+  // Put a raw block (maybe compressed) to the corresponding block caches.
+  // This method will perform decompression against raw_block if needed and then
+  // populate the block caches.
+  // On success, Status::OK will be returned; also @block will be populated with
+  // uncompressed block and its cache handle.
+  //
+  // REQUIRES: raw_block is heap-allocated. PutDataBlockToCache() will be
+  // responsible for releasing its memory if error occurs.
+  static Status PutDataBlockToCache(
+      const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+      Cache* block_cache, Cache* block_cache_compressed,
+      const ReadOptions& read_options, Statistics* statistics,
+      CachableEntry<Block>* block, Block* raw_block);
+
+  // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
+  // after a call to Seek(key), until handle_result returns false.
+  // May not make such a call if filter policy says that key is not present.
+  friend class TableCache;
+  friend class BlockBasedTableBuilder;
+
+  void ReadMeta(const Footer& footer);
+  void ReadFilter(const Slice& filter_handle_value);
+  Status CreateIndexReader(IndexReader** index_reader);
+
+  // Read the meta block from sst.
+  static Status ReadMetaBlock(
+      Rep* rep,
+      std::unique_ptr<Block>* meta_block,
+      std::unique_ptr<Iterator>* iter);
+
+  // Create the filter from the filter block.
+  static FilterBlockReader* ReadFilter(
+      const Slice& filter_handle_value,
+      Rep* rep,
+      size_t* filter_size = nullptr);
+
+  static void SetupCacheKeyPrefix(Rep* rep);
+
+  explicit BlockBasedTable(Rep* rep)
+      : rep_(rep), compaction_optimized_(false) {}
+
+  // Generate a cache key prefix from the file
+  static void GenerateCachePrefix(Cache* cc,
+    RandomAccessFile* file, char* buffer, size_t* size);
+  static void GenerateCachePrefix(Cache* cc,
+    WritableFile* file, char* buffer, size_t* size);
+
+  // The longest prefix of the cache key used to identify blocks.
+  // For Posix files the unique ID is three varints.
+  static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1;
+
+  // No copying allowed
+  explicit BlockBasedTable(const TableReader&) = delete;
+  void operator=(const TableReader&) = delete;
+};
+
+}  // namespace rocksdb
diff --git a/table/block_builder.cc b/table/block_builder.cc
new file mode 100644 (file)
index 0000000..f812dba
--- /dev/null
@@ -0,0 +1,134 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// BlockBuilder generates blocks where keys are prefix-compressed:
+//
+// When we store a key, we drop the prefix shared with the previous
+// string.  This helps reduce the space requirement significantly.
+// Furthermore, once every K keys, we do not apply the prefix
+// compression and store the entire key.  We call this a "restart
+// point".  The tail end of the block stores the offsets of all of the
+// restart points, and can be used to do a binary search when looking
+// for a particular key.  Values are stored as-is (without compression)
+// immediately following the corresponding key.
+//
+// An entry for a particular key-value pair has the form:
+//     shared_bytes: varint32
+//     unshared_bytes: varint32
+//     value_length: varint32
+//     key_delta: char[unshared_bytes]
+//     value: char[value_length]
+// shared_bytes == 0 for restart points.
+//
+// The trailer of the block has the form:
+//     restarts: uint32[num_restarts]
+//     num_restarts: uint32
+// restarts[i] contains the offset within the block of the ith restart point.
+
+#include "table/block_builder.h"
+
+#include <algorithm>
+#include <assert.h>
+#include "rocksdb/comparator.h"
+#include "db/dbformat.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+// Constructs a builder that emits a restart point (full, uncompressed key)
+// every block_restart_interval entries.
+BlockBuilder::BlockBuilder(int block_restart_interval,
+                           const Comparator* comparator)
+    : block_restart_interval_(block_restart_interval),
+      comparator_(comparator),
+      restarts_(),
+      counter_(0),
+      finished_(false) {
+  assert(block_restart_interval_ >= 1);
+  restarts_.push_back(0);       // First restart point is at offset 0
+}
+
+// Convenience overload: takes the restart interval from Options.
+BlockBuilder::BlockBuilder(const Options& options, const Comparator* comparator)
+    : BlockBuilder(options.block_restart_interval, comparator) {}
+
+// Restores the builder to its freshly-constructed state (the configured
+// restart interval and comparator are kept).
+void BlockBuilder::Reset() {
+  buffer_.clear();
+  last_key_.clear();
+  restarts_.clear();
+  restarts_.push_back(0);  // the first restart point is always at offset 0
+  counter_ = 0;
+  finished_ = false;
+}
+
+// Current uncompressed block size: the raw entries plus the restart-array
+// trailer (one fixed32 per restart point, plus the fixed32 count).
+size_t BlockBuilder::CurrentSizeEstimate() const {
+  const size_t restart_array_bytes = restarts_.size() * sizeof(uint32_t);
+  return buffer_.size() + restart_array_bytes + sizeof(uint32_t);
+}
+
+// Estimates the block size if (key, value) were appended next.  This is an
+// upper-bound estimate: it assumes no prefix is shared with the previous key
+// and sizes the shared-length varint at a full 4 bytes.
+size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value)
+  const {
+  size_t estimate = CurrentSizeEstimate();
+  estimate += key.size() + value.size();
+  if (counter_ >= block_restart_interval_) {
+    estimate += sizeof(uint32_t); // a new restart entry.
+  }
+
+  estimate += sizeof(int32_t); // varint for shared prefix length.
+  estimate += VarintLength(key.size()); // varint for key length.
+  estimate += VarintLength(value.size()); // varint for value length.
+
+  return estimate;
+}
+
+// Appends the restart-array trailer and returns the finished block contents.
+// The returned Slice stays valid until Reset() or the builder's destruction.
+Slice BlockBuilder::Finish() {
+  // Trailer layout: restarts[0..n-1] as fixed32, then the restart count.
+  for (const uint32_t restart_offset : restarts_) {
+    PutFixed32(&buffer_, restart_offset);
+  }
+  PutFixed32(&buffer_, static_cast<uint32_t>(restarts_.size()));
+  finished_ = true;
+  return Slice(buffer_);
+}
+
+// Appends (key, value), prefix-compressing the key against the previous one.
+// REQUIRES: keys arrive in strictly increasing order and Finish() has not
+// been called since the last Reset().
+void BlockBuilder::Add(const Slice& key, const Slice& value) {
+  Slice last_key_piece(last_key_);
+  assert(!finished_);
+  assert(counter_ <= block_restart_interval_);
+  assert(buffer_.empty() // No values yet?
+         || comparator_->Compare(key, last_key_piece) > 0);
+  size_t shared = 0;
+  if (counter_ < block_restart_interval_) {
+    // See how much sharing to do with previous string
+    const size_t min_length = std::min(last_key_piece.size(), key.size());
+    while ((shared < min_length) && (last_key_piece[shared] == key[shared])) {
+      shared++;
+    }
+  } else {
+    // Restart compression: record a restart point and store the full key.
+    restarts_.push_back(buffer_.size());
+    counter_ = 0;
+  }
+  const size_t non_shared = key.size() - shared;
+
+  // Add "<shared><non_shared><value_size>" to buffer_
+  PutVarint32(&buffer_, shared);
+  PutVarint32(&buffer_, non_shared);
+  PutVarint32(&buffer_, value.size());
+
+  // Add string delta to buffer_ followed by value
+  buffer_.append(key.data() + shared, non_shared);
+  buffer_.append(value.data(), value.size());
+
+  // Update state: rebuild last_key_ in place (keep the shared prefix,
+  // append the new suffix) instead of copying the whole key.
+  last_key_.resize(shared);
+  last_key_.append(key.data() + shared, non_shared);
+  assert(Slice(last_key_) == key);
+  counter_++;
+}
+
+}  // namespace rocksdb
diff --git a/table/block_builder.h b/table/block_builder.h
new file mode 100644 (file)
index 0000000..ed2f290
--- /dev/null
@@ -0,0 +1,65 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <vector>
+
+#include <stdint.h>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+struct Options;
+class Comparator;
+
+class BlockBuilder {
+ public:
+  BlockBuilder(int block_builder, const Comparator* comparator);
+  explicit BlockBuilder(const Options& options, const Comparator* comparator);
+
+  // Reset the contents as if the BlockBuilder was just constructed.
+  void Reset();
+
+  // REQUIRES: Finish() has not been callled since the last call to Reset().
+  // REQUIRES: key is larger than any previously added key
+  void Add(const Slice& key, const Slice& value);
+
+  // Finish building the block and return a slice that refers to the
+  // block contents.  The returned slice will remain valid for the
+  // lifetime of this builder or until Reset() is called.
+  Slice Finish();
+
+  // Returns an estimate of the current (uncompressed) size of the block
+  // we are building.
+  size_t CurrentSizeEstimate() const;
+
+  // Returns an estimated block size after appending key and value.
+  size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const;
+
+  // Return true iff no entries have been added since the last Reset()
+  bool empty() const {
+    return buffer_.empty();
+  }
+
+ private:
+  const int          block_restart_interval_;
+  const Comparator*  comparator_;
+
+  std::string           buffer_;    // Destination buffer
+  std::vector<uint32_t> restarts_;  // Restart points
+  int                   counter_;   // Number of entries emitted since restart
+  bool                  finished_;  // Has Finish() been called?
+  std::string           last_key_;
+
+  // No copying allowed
+  BlockBuilder(const BlockBuilder&);
+  void operator=(const BlockBuilder&);
+};
+
+}  // namespace rocksdb
diff --git a/table/block_hash_index.cc b/table/block_hash_index.cc
new file mode 100644 (file)
index 0000000..0c9674c
--- /dev/null
@@ -0,0 +1,112 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include <algorithm>
+
+#include "table/block_hash_index.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+
+namespace rocksdb {
+
+// Walks the index block and the full data set in lockstep and builds a
+// prefix -> restart-index table.  Returns nullptr (after freeing the
+// partially built index) if any prefix would be inserted twice.
+BlockHashIndex* CreateBlockHashIndex(Iterator* index_iter, Iterator* data_iter,
+                                     const uint32_t num_restarts,
+                                     const Comparator* comparator,
+                                     const SliceTransform* hash_key_extractor) {
+  assert(hash_key_extractor);
+  auto hash_index = new BlockHashIndex(hash_key_extractor);
+  uint64_t current_restart_index = 0;
+
+  std::string pending_entry_prefix;
+  // pending_block_num == 0 also implies there is no entry inserted at all.
+  uint32_t pending_block_num = 0;
+  uint32_t pending_entry_index = 0;
+
+  // scan all the entries and create a hash index based on their prefixes.
+  data_iter->SeekToFirst();
+  for (index_iter->SeekToFirst();
+       index_iter->Valid() && current_restart_index < num_restarts;
+       index_iter->Next()) {
+    Slice last_key_in_block = index_iter->key();
+    assert(data_iter->Valid() && data_iter->status().ok());
+
+    // scan through all entries within a data block.
+    while (data_iter->Valid() &&
+           comparator->Compare(data_iter->key(), last_key_in_block) <= 0) {
+      auto key_prefix = hash_key_extractor->Transform(data_iter->key());
+      bool is_first_entry = pending_block_num == 0;
+
+      // Keys may share the prefix
+      if (is_first_entry || pending_entry_prefix != key_prefix) {
+        // A new prefix begins: flush the previous prefix's entry first.
+        if (!is_first_entry) {
+          bool succeeded = hash_index->Add(
+              pending_entry_prefix, pending_entry_index, pending_block_num);
+          if (!succeeded) {
+            delete hash_index;
+            return nullptr;
+          }
+        }
+
+        // update the status.
+        // needs a hard copy otherwise the underlying data changes all the time.
+        pending_entry_prefix = key_prefix.ToString();
+        pending_block_num = 1;
+        // NOTE(review): narrowing uint64_t -> uint32_t here; assumes fewer
+        // than 2^32 restart points -- confirm.
+        pending_entry_index = current_restart_index;
+      } else {
+        // entry number increments when keys sharing the prefix reside in
+        // different data blocks.
+        auto last_restart_index = pending_entry_index + pending_block_num - 1;
+        assert(last_restart_index <= current_restart_index);
+        if (last_restart_index != current_restart_index) {
+          ++pending_block_num;
+        }
+      }
+      data_iter->Next();
+    }
+
+    ++current_restart_index;
+  }
+
+  // make sure all entries have been scanned.
+  assert(!index_iter->Valid());
+  assert(!data_iter->Valid());
+
+  // Flush the final pending prefix (if any entry was seen at all).
+  if (pending_block_num > 0) {
+    auto succeeded = hash_index->Add(pending_entry_prefix, pending_entry_index,
+                                     pending_block_num);
+    if (!succeeded) {
+      delete hash_index;
+      return nullptr;
+    }
+  }
+
+  return hash_index;
+}
+
+bool BlockHashIndex::Add(const Slice& prefix, uint32_t restart_index,
+                         uint32_t num_blocks) {
+  auto prefix_ptr = arena_.Allocate(prefix.size());
+  std::copy(prefix.data() /* begin */, prefix.data() + prefix.size() /* end */,
+            prefix_ptr /* destination */);
+  auto result =
+      restart_indices_.insert({Slice(prefix_ptr, prefix.size()),
+                               RestartIndex(restart_index, num_blocks)});
+  return result.second;
+}
+
+// Looks up the restart info for `key`'s prefix; nullptr when the prefix was
+// never added.
+const BlockHashIndex::RestartIndex* BlockHashIndex::GetRestartIndex(
+    const Slice& key) {
+  const auto prefix = hash_key_extractor_->Transform(key);
+  const auto entry = restart_indices_.find(prefix);
+  return (entry == restart_indices_.end()) ? nullptr : &entry->second;
+}
+
+}  // namespace rocksdb
diff --git a/table/block_hash_index.h b/table/block_hash_index.h
new file mode 100644 (file)
index 0000000..0ff65b4
--- /dev/null
@@ -0,0 +1,72 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "util/arena.h"
+#include "util/murmurhash.h"
+
+namespace rocksdb {
+
+class Comparator;
+class Iterator;
+class Slice;
+class SliceTransform;
+
+// Build a hash-based index to speed up the lookup for "index block".
+// BlockHashIndex accepts a key and, if found, returns its restart index within
+// that index block.
+class BlockHashIndex {
+ public:
+  // Represents a restart index in the index block's restart array.
+  struct RestartIndex {
+    explicit RestartIndex(uint32_t first_index, uint32_t num_blocks = 1)
+        : first_index(first_index), num_blocks(num_blocks) {}
+
+    // For a given prefix, what is the restart index for the first data block
+    // that contains it.
+    uint32_t first_index = 0;
+
+    // How many data blocks contain this prefix?
+    uint32_t num_blocks = 1;
+  };
+
+  explicit BlockHashIndex(const SliceTransform* hash_key_extractor)
+      : hash_key_extractor_(hash_key_extractor) {}
+
+  // Maps a key to its restart first_index.
+  // Returns nullptr if the restart first_index is not found.
+  const RestartIndex* GetRestartIndex(const Slice& key);
+
+  bool Add(const Slice& key_prefix, uint32_t restart_index,
+           uint32_t num_blocks);
+
+  size_t ApproximateMemoryUsage() const {
+    return arena_.ApproximateMemoryUsage();
+  }
+
+ private:
+  const SliceTransform* hash_key_extractor_;
+  std::unordered_map<Slice, RestartIndex, murmur_hash> restart_indices_;
+  Arena arena_;
+};
+
+// Create hash index by scanning the entries in index as well as the whole
+// dataset.
+// @params index_iter: an iterator with the pointer to the first entry in a
+//                     block.
+// @params data_iter: an iterator that can scan all the entries reside in a
+//                     table.
+// @params num_restarts: used for correctness verification.
+// @params hash_key_extractor: extract the hashable part of a given key.
+// On error, nullptr will be returned.
+BlockHashIndex* CreateBlockHashIndex(Iterator* index_iter, Iterator* data_iter,
+                                     const uint32_t num_restarts,
+                                     const Comparator* comparator,
+                                     const SliceTransform* hash_key_extractor);
+
+}  // namespace rocksdb
diff --git a/table/block_hash_index_test.cc b/table/block_hash_index_test.cc
new file mode 100644 (file)
index 0000000..f4c0ac4
--- /dev/null
@@ -0,0 +1,117 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include <map>
+#include <memory>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "table/block_hash_index.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+typedef std::map<std::string, std::string> Data;
+
+class MapIterator : public Iterator {
+ public:
+  explicit MapIterator(const Data& data) : data_(data), pos_(data_.end()) {}
+
+  virtual bool Valid() const { return pos_ != data_.end(); }
+
+  virtual void SeekToFirst() { pos_ = data_.begin(); }
+
+  virtual void SeekToLast() {
+    pos_ = data_.end();
+    --pos_;
+  }
+
+  virtual void Seek(const Slice& target) {
+    pos_ = data_.find(target.ToString());
+  }
+
+  virtual void Next() { ++pos_; }
+
+  virtual void Prev() { --pos_; }
+
+  virtual Slice key() const { return pos_->first; }
+
+  virtual Slice value() const { return pos_->second; }
+
+  virtual Status status() const { return Status::OK(); }
+
+ private:
+  const Data& data_;
+  Data::const_iterator pos_;
+};
+
+class BlockTest {};
+
+TEST(BlockTest, BasicTest) {
+  const size_t keys_per_block = 4;
+  const size_t prefix_size = 2;
+  std::vector<std::string> keys = {/* block 1 */
+                                   "0101", "0102", "0103", "0201",
+                                   /* block 2 */
+                                   "0202", "0203", "0301", "0401",
+                                   /* block 3 */
+                                   "0501", "0601", "0701", "0801",
+                                   /* block 4 */
+                                   "0802", "0803", "0804", "0805",
+                                   /* block 5 */
+                                   "0806", "0807", "0808", "0809", };
+
+  Data data_entries;
+  for (const auto key : keys) {
+    data_entries.insert({key, key});
+  }
+
+  Data index_entries;
+  for (size_t i = 3; i < keys.size(); i += keys_per_block) {
+    // simply ignore the value part
+    index_entries.insert({keys[i], ""});
+  }
+
+  MapIterator data_iter(data_entries);
+  MapIterator index_iter(index_entries);
+
+  auto prefix_extractor = NewFixedPrefixTransform(prefix_size);
+  std::unique_ptr<BlockHashIndex> block_hash_index(
+      CreateBlockHashIndex(&index_iter, &data_iter, index_entries.size(),
+                           BytewiseComparator(), prefix_extractor));
+
+  std::map<std::string, BlockHashIndex::RestartIndex> expected = {
+      {"01xx", BlockHashIndex::RestartIndex(0, 1)},
+      {"02yy", BlockHashIndex::RestartIndex(0, 2)},
+      {"03zz", BlockHashIndex::RestartIndex(1, 1)},
+      {"04pp", BlockHashIndex::RestartIndex(1, 1)},
+      {"05ww", BlockHashIndex::RestartIndex(2, 1)},
+      {"06xx", BlockHashIndex::RestartIndex(2, 1)},
+      {"07pp", BlockHashIndex::RestartIndex(2, 1)},
+      {"08xz", BlockHashIndex::RestartIndex(2, 3)}, };
+
+  const BlockHashIndex::RestartIndex* index = nullptr;
+  // search existing prefixes
+  for (const auto& item : expected) {
+    index = block_hash_index->GetRestartIndex(item.first);
+    ASSERT_TRUE(index != nullptr);
+    ASSERT_EQ(item.second.first_index, index->first_index);
+    ASSERT_EQ(item.second.num_blocks, index->num_blocks);
+  }
+
+  // search non-existent prefixes
+  ASSERT_TRUE(!block_hash_index->GetRestartIndex("00xx"));
+  ASSERT_TRUE(!block_hash_index->GetRestartIndex("10yy"));
+  ASSERT_TRUE(!block_hash_index->GetRestartIndex("20zz"));
+
+  delete prefix_extractor;
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }
diff --git a/table/block_test.cc b/table/block_test.cc
new file mode 100644 (file)
index 0000000..fdba8e9
--- /dev/null
@@ -0,0 +1,242 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/table.h"
+#include "rocksdb/slice_transform.h"
+#include "table/block.h"
+#include "table/block_builder.h"
+#include "table/format.h"
+#include "table/block_hash_index.h"
+#include "util/random.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
+                        Random *rnd) {
+  char buf[50];
+  char *p = &buf[0];
+  snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
+  std::string k(p);
+  if (padding_size) {
+    k += RandomString(rnd, padding_size);
+  }
+
+  return k;
+}
+
+// Generate random key value pairs.
+// The generated key will be sorted. You can tune the parameters to generated
+// different kinds of test key/value pairs for different scenario.
+void GenerateRandomKVs(std::vector<std::string> *keys,
+                       std::vector<std::string> *values, const int from,
+                       const int len, const int step = 1,
+                       const int padding_size = 0,
+                       const int keys_share_prefix = 1) {
+  Random rnd(302);
+
+  // generate different prefix
+  for (int i = from; i < from + len; i += step) {
+    // generating keys that shares the prefix
+    for (int j = 0; j < keys_share_prefix; ++j) {
+      keys->emplace_back(GenerateKey(i, j, padding_size, &rnd));
+
+      // 100 bytes values
+      values->emplace_back(RandomString(&rnd, 100));
+    }
+  }
+}
+
+class BlockTest {};
+
+// block test
+TEST(BlockTest, SimpleTest) {
+  Random rnd(301);
+  Options options = Options();
+  std::unique_ptr<InternalKeyComparator> ic;
+  ic.reset(new test::PlainInternalKeyComparator(options.comparator));
+
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  BlockBuilder builder(options, ic.get());
+  int num_records = 100000;
+
+  GenerateRandomKVs(&keys, &values, 0, num_records);
+  // add a bunch of records to a block
+  for (int i = 0; i < num_records; i++) {
+    builder.Add(keys[i], values[i]);
+  }
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  contents.cachable = false;
+  contents.heap_allocated = false;
+  Block reader(contents);
+
+  // read contents of block sequentially
+  int count = 0;
+  Iterator* iter = reader.NewIterator(options.comparator);
+  for (iter->SeekToFirst();iter->Valid(); count++, iter->Next()) {
+
+    // read kv from block
+    Slice k = iter->key();
+    Slice v = iter->value();
+
+    // compare with lookaside array
+    ASSERT_EQ(k.ToString().compare(keys[count]), 0);
+    ASSERT_EQ(v.ToString().compare(values[count]), 0);
+  }
+  delete iter;
+
+  // read block contents randomly
+  iter = reader.NewIterator(options.comparator);
+  for (int i = 0; i < num_records; i++) {
+
+    // find a random key in the lookaside array
+    int index = rnd.Uniform(num_records);
+    Slice k(keys[index]);
+
+    // search in block for this key
+    iter->Seek(k);
+    ASSERT_TRUE(iter->Valid());
+    Slice v = iter->value();
+    ASSERT_EQ(v.ToString().compare(values[index]), 0);
+  }
+  delete iter;
+}
+
+// return the block contents
+BlockContents GetBlockContents(std::unique_ptr<BlockBuilder> *builder,
+                               const std::vector<std::string> &keys,
+                               const std::vector<std::string> &values,
+                               const int prefix_group_size = 1) {
+  builder->reset(
+      new BlockBuilder(1 /* restart interval */, BytewiseComparator()));
+
+  // Add all of the keys
+  for (size_t i = 0; i < keys.size(); ++i) {
+    (*builder)->Add(keys[i], values[i]);
+  }
+  Slice rawblock = (*builder)->Finish();
+
+  BlockContents contents;
+  contents.data = rawblock;
+  contents.cachable = false;
+  contents.heap_allocated = false;
+
+  return contents;
+}
+
+void CheckBlockContents(BlockContents contents, const int max_key,
+                        const std::vector<std::string> &keys,
+                        const std::vector<std::string> &values) {
+  const size_t prefix_size = 6;
+  // create block reader
+  Block reader1(contents);
+  Block reader2(contents);
+
+  std::unique_ptr<const SliceTransform> prefix_extractor(
+      NewFixedPrefixTransform(prefix_size));
+
+  {
+    auto iter1 = reader1.NewIterator(nullptr);
+    auto iter2 = reader1.NewIterator(nullptr);
+    reader1.SetBlockHashIndex(CreateBlockHashIndex(iter1, iter2, keys.size(),
+                                                   BytewiseComparator(),
+                                                   prefix_extractor.get()));
+
+    delete iter1;
+    delete iter2;
+  }
+
+  std::unique_ptr<Iterator> hash_iter(
+      reader1.NewIterator(BytewiseComparator()));
+
+  std::unique_ptr<Iterator> regular_iter(
+      reader2.NewIterator(BytewiseComparator()));
+
+  // Seek existent keys
+  for (size_t i = 0; i < keys.size(); i++) {
+    hash_iter->Seek(keys[i]);
+    ASSERT_OK(hash_iter->status());
+    ASSERT_TRUE(hash_iter->Valid());
+
+    Slice v = hash_iter->value();
+    ASSERT_EQ(v.ToString().compare(values[i]), 0);
+  }
+
+  // Seek non-existent keys.
+  // For hash index, if no key with a given prefix is found, the iterator will
+  // simply be set as invalid; whereas the binary search based iterator will
+  // return the one that is closest.
+  for (int i = 1; i < max_key - 1; i += 2) {
+    auto key = GenerateKey(i, 0, 0, nullptr);
+    hash_iter->Seek(key);
+    ASSERT_TRUE(!hash_iter->Valid());
+
+    regular_iter->Seek(key);
+    ASSERT_TRUE(regular_iter->Valid());
+  }
+}
+
+// In this test case, no two key share same prefix.
+TEST(BlockTest, SimpleIndexHash) {
+  const int kMaxKey = 100000;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  GenerateRandomKVs(&keys, &values, 0 /* first key id */,
+                    kMaxKey /* last key id */, 2 /* step */,
+                    8 /* padding size (8 bytes randomly generated suffix) */);
+
+  std::unique_ptr<BlockBuilder> builder;
+  auto contents = GetBlockContents(&builder, keys, values);
+
+  CheckBlockContents(contents, kMaxKey, keys, values);
+}
+
+TEST(BlockTest, IndexHashWithSharedPrefix) {
+  const int kMaxKey = 100000;
+  // for each prefix, there will be 5 keys starting with it.
+  const int kPrefixGroup = 5;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  // Generate keys with same prefix.
+  GenerateRandomKVs(&keys, &values, 0,  // first key id
+                    kMaxKey,            // last key id
+                    2,                  // step
+                    10,                 // padding size,
+                    kPrefixGroup);
+
+  std::unique_ptr<BlockBuilder> builder;
+  auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup);
+
+  CheckBlockContents(contents, kMaxKey, keys, values);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/table/filter_block.cc b/table/filter_block.cc
new file mode 100644 (file)
index 0000000..3651a7d
--- /dev/null
@@ -0,0 +1,187 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/filter_block.h"
+
+#include "db/dbformat.h"
+#include "rocksdb/filter_policy.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+// See doc/table_format.txt for an explanation of the filter block format.
+
+// Generate new filter every 2KB of data
+static const size_t kFilterBaseLg = 11;
+static const size_t kFilterBase = 1 << kFilterBaseLg;
+
+FilterBlockBuilder::FilterBlockBuilder(const Options& opt,
+                                       const Comparator* internal_comparator)
+    : policy_(opt.filter_policy),
+      prefix_extractor_(opt.prefix_extractor.get()),
+      whole_key_filtering_(opt.whole_key_filtering),
+      comparator_(internal_comparator) {}
+
+void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
+  uint64_t filter_index = (block_offset / kFilterBase);
+  assert(filter_index >= filter_offsets_.size());
+  while (filter_index > filter_offsets_.size()) {
+    GenerateFilter();
+  }
+}
+
+bool FilterBlockBuilder::SamePrefix(const Slice &key1,
+                                    const Slice &key2) const {
+  if (!prefix_extractor_->InDomain(key1) &&
+      !prefix_extractor_->InDomain(key2)) {
+    return true;
+  } else if (!prefix_extractor_->InDomain(key1) ||
+             !prefix_extractor_->InDomain(key2)) {
+    return false;
+  } else {
+    return (prefix_extractor_->Transform(key1) ==
+            prefix_extractor_->Transform(key2));
+  }
+}
+
+void FilterBlockBuilder::AddKey(const Slice& key) {
+  // get slice for most recently added entry
+  Slice prev;
+  size_t added_to_start = 0;
+
+  // add key to filter if needed
+  if (whole_key_filtering_) {
+    start_.push_back(entries_.size());
+    ++added_to_start;
+    entries_.append(key.data(), key.size());
+  }
+
+  if (start_.size() > added_to_start) {
+    size_t prev_start = start_[start_.size() - 1 - added_to_start];
+    const char* base = entries_.data() + prev_start;
+    size_t length = entries_.size() - prev_start;
+    prev = Slice(base, length);
+  }
+
+  // add prefix to filter if needed
+  if (prefix_extractor_ && prefix_extractor_->InDomain(ExtractUserKey(key))) {
+    // If prefix_extractor_, this filter_block layer assumes we only
+    // operate on internal keys.
+    Slice user_key = ExtractUserKey(key);
+    // this assumes prefix(prefix(key)) == prefix(key), as the last
+    // entry in entries_ may be either a key or prefix, and we use
+    // prefix(last entry) to get the prefix of the last key.
+    if (prev.size() == 0 ||
+        !SamePrefix(user_key, ExtractUserKey(prev))) {
+      Slice prefix = prefix_extractor_->Transform(user_key);
+      InternalKey internal_prefix_tmp(prefix, 0, kTypeValue);
+      Slice internal_prefix = internal_prefix_tmp.Encode();
+      start_.push_back(entries_.size());
+      entries_.append(internal_prefix.data(), internal_prefix.size());
+    }
+  }
+}
+
+Slice FilterBlockBuilder::Finish() {
+  if (!start_.empty()) {
+    GenerateFilter();
+  }
+
+  // Append array of per-filter offsets
+  const uint32_t array_offset = result_.size();
+  for (size_t i = 0; i < filter_offsets_.size(); i++) {
+    PutFixed32(&result_, filter_offsets_[i]);
+  }
+
+  PutFixed32(&result_, array_offset);
+  result_.push_back(kFilterBaseLg);  // Save encoding parameter in result
+  return Slice(result_);
+}
+
+void FilterBlockBuilder::GenerateFilter() {
+  const size_t num_entries = start_.size();
+  if (num_entries == 0) {
+    // Fast path if there are no keys for this filter
+    filter_offsets_.push_back(result_.size());
+    return;
+  }
+
+  // Make list of keys from flattened key structure
+  start_.push_back(entries_.size());  // Simplify length computation
+  tmp_entries_.resize(num_entries);
+  for (size_t i = 0; i < num_entries; i++) {
+    const char* base = entries_.data() + start_[i];
+    size_t length = start_[i+1] - start_[i];
+    tmp_entries_[i] = Slice(base, length);
+  }
+
+  // Generate filter for current set of keys and append to result_.
+  filter_offsets_.push_back(result_.size());
+  policy_->CreateFilter(&tmp_entries_[0], num_entries, &result_);
+
+  tmp_entries_.clear();
+  entries_.clear();
+  start_.clear();
+}
+
+FilterBlockReader::FilterBlockReader(
+    const Options& opt, const Slice& contents, bool delete_contents_after_use)
+    : policy_(opt.filter_policy),
+      prefix_extractor_(opt.prefix_extractor.get()),
+      whole_key_filtering_(opt.whole_key_filtering),
+      data_(nullptr),
+      offset_(nullptr),
+      num_(0),
+      base_lg_(0) {
+  size_t n = contents.size();
+  if (n < 5) return;  // 1 byte for base_lg_ and 4 for start of offset array
+  base_lg_ = contents[n-1];
+  uint32_t last_word = DecodeFixed32(contents.data() + n - 5);
+  if (last_word > n - 5) return;
+  data_ = contents.data();
+  offset_ = data_ + last_word;
+  num_ = (n - 5 - last_word) / 4;
+  if (delete_contents_after_use) {
+    filter_data.reset(contents.data());
+  }
+}
+
+bool FilterBlockReader::KeyMayMatch(uint64_t block_offset,
+                                    const Slice& key) {
+  if (!whole_key_filtering_) {
+    return true;
+  }
+  return MayMatch(block_offset, key);
+}
+
+bool FilterBlockReader::PrefixMayMatch(uint64_t block_offset,
+                                       const Slice& prefix) {
+  if (!prefix_extractor_) {
+    return true;
+  }
+  return MayMatch(block_offset, prefix);
+}
+
+bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) {
+  uint64_t index = block_offset >> base_lg_;
+  if (index < num_) {
+    uint32_t start = DecodeFixed32(offset_ + index*4);
+    uint32_t limit = DecodeFixed32(offset_ + index*4 + 4);
+    if (start <= limit && limit <= (uint32_t)(offset_ - data_)) {
+      Slice filter = Slice(data_ + start, limit - start);
+      return policy_->KeyMayMatch(entry, filter);
+    } else if (start == limit) {
+      // Empty filters do not match any entries
+      return false;
+    }
+  }
+  return true;  // Errors are treated as potential matches
+}
+
+}
diff --git a/table/filter_block.h b/table/filter_block.h
new file mode 100644 (file)
index 0000000..05c2bb9
--- /dev/null
@@ -0,0 +1,92 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A filter block is stored near the end of a Table file.  It contains
+// filters (e.g., bloom filters) for all data blocks in the table combined
+// into a single filter block.
+
+#pragma once
+
+#include <memory>
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+#include <vector>
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+class FilterPolicy;
+
+// A FilterBlockBuilder is used to construct all of the filters for a
+// particular Table.  It generates a single string which is stored as
+// a special block in the Table.
+//
+// The sequence of calls to FilterBlockBuilder must match the regexp:
+//      (StartBlock AddKey*)* Finish
+class FilterBlockBuilder {
+ public:
+  explicit FilterBlockBuilder(const Options& opt,
+                              const Comparator* internal_comparator);
+
+  void StartBlock(uint64_t block_offset);
+  void AddKey(const Slice& key);
+  Slice Finish();
+
+ private:
+  bool SamePrefix(const Slice &key1, const Slice &key2) const;
+  void GenerateFilter();
+
+  // important: all of these might point to invalid addresses
+  // at the time of destruction of this filter block. destructor
+  // should NOT dereference them.
+  const FilterPolicy* policy_;
+  const SliceTransform* prefix_extractor_;
+  bool whole_key_filtering_;
+  const Comparator* comparator_;
+
+  std::string entries_;         // Flattened entry contents
+  std::vector<size_t> start_;   // Starting index in entries_ of each entry
+  std::string result_;          // Filter data computed so far
+  std::vector<Slice> tmp_entries_; // policy_->CreateFilter() argument
+  std::vector<uint32_t> filter_offsets_;
+
+  // No copying allowed
+  FilterBlockBuilder(const FilterBlockBuilder&);
+  void operator=(const FilterBlockBuilder&);
+};
+
+class FilterBlockReader {
+ public:
+ // REQUIRES: "contents" and *policy must stay live while *this is live.
+  FilterBlockReader(
+    const Options& opt,
+    const Slice& contents,
+    bool delete_contents_after_use = false);
+  bool KeyMayMatch(uint64_t block_offset, const Slice& key);
+  bool PrefixMayMatch(uint64_t block_offset, const Slice& prefix);
+
+ private:
+  const FilterPolicy* policy_;
+  const SliceTransform* prefix_extractor_;
+  bool whole_key_filtering_;
+  const char* data_;    // Pointer to filter data (at block-start)
+  const char* offset_;  // Pointer to beginning of offset array (at block-end)
+  size_t num_;          // Number of entries in offset array
+  size_t base_lg_;      // Encoding parameter (see kFilterBaseLg in .cc file)
+  std::unique_ptr<const char[]> filter_data;
+
+
+  bool MayMatch(uint64_t block_offset, const Slice& entry);
+};
+
+}
diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc
new file mode 100644 (file)
index 0000000..1703d59
--- /dev/null
@@ -0,0 +1,139 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/filter_block.h"
+
+#include "rocksdb/filter_policy.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+// For testing: emit an array with one hash value per key
+class TestHashFilter : public FilterPolicy {
+ public:
+  virtual const char* Name() const {
+    return "TestHashFilter";
+  }
+
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+    for (int i = 0; i < n; i++) {
+      uint32_t h = Hash(keys[i].data(), keys[i].size(), 1);
+      PutFixed32(dst, h);
+    }
+  }
+
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+    uint32_t h = Hash(key.data(), key.size(), 1);
+    for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) {
+      if (h == DecodeFixed32(filter.data() + i)) {
+        return true;
+      }
+    }
+    return false;
+  }
+};
+
+class FilterBlockTest {
+ public:
+  TestHashFilter policy_;
+  Options options_;
+
+  FilterBlockTest() {
+    options_ = Options();
+    options_.filter_policy = &policy_;
+  }
+};
+
+TEST(FilterBlockTest, EmptyBuilder) {
+  FilterBlockBuilder builder(options_, options_.comparator);
+  Slice block = builder.Finish();
+  ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block));
+  FilterBlockReader reader(options_, block);
+  ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
+  ASSERT_TRUE(reader.KeyMayMatch(100000, "foo"));
+}
+
+TEST(FilterBlockTest, SingleChunk) {
+  FilterBlockBuilder builder(options_, options_.comparator);
+  builder.StartBlock(100);
+  builder.AddKey("foo");
+  builder.AddKey("bar");
+  builder.AddKey("box");
+  builder.StartBlock(200);
+  builder.AddKey("box");
+  builder.StartBlock(300);
+  builder.AddKey("hello");
+  Slice block = builder.Finish();
+  FilterBlockReader reader(options_, block);
+  ASSERT_TRUE(reader.KeyMayMatch(100, "foo"));
+  ASSERT_TRUE(reader.KeyMayMatch(100, "bar"));
+  ASSERT_TRUE(reader.KeyMayMatch(100, "box"));
+  ASSERT_TRUE(reader.KeyMayMatch(100, "hello"));
+  ASSERT_TRUE(reader.KeyMayMatch(100, "foo"));
+  ASSERT_TRUE(! reader.KeyMayMatch(100, "missing"));
+  ASSERT_TRUE(! reader.KeyMayMatch(100, "other"));
+}
+
+TEST(FilterBlockTest, MultiChunk) {
+  FilterBlockBuilder builder(options_, options_.comparator);
+
+  // First filter
+  builder.StartBlock(0);
+  builder.AddKey("foo");
+  builder.StartBlock(2000);
+  builder.AddKey("bar");
+
+  // Second filter
+  builder.StartBlock(3100);
+  builder.AddKey("box");
+
+  // Third filter is empty
+
+  // Last filter
+  builder.StartBlock(9000);
+  builder.AddKey("box");
+  builder.AddKey("hello");
+
+  Slice block = builder.Finish();
+  FilterBlockReader reader(options_, block);
+
+  // Check first filter
+  ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
+  ASSERT_TRUE(reader.KeyMayMatch(2000, "bar"));
+  ASSERT_TRUE(! reader.KeyMayMatch(0, "box"));
+  ASSERT_TRUE(! reader.KeyMayMatch(0, "hello"));
+
+  // Check second filter
+  ASSERT_TRUE(reader.KeyMayMatch(3100, "box"));
+  ASSERT_TRUE(! reader.KeyMayMatch(3100, "foo"));
+  ASSERT_TRUE(! reader.KeyMayMatch(3100, "bar"));
+  ASSERT_TRUE(! reader.KeyMayMatch(3100, "hello"));
+
+  // Check third filter (empty)
+  ASSERT_TRUE(! reader.KeyMayMatch(4100, "foo"));
+  ASSERT_TRUE(! reader.KeyMayMatch(4100, "bar"));
+  ASSERT_TRUE(! reader.KeyMayMatch(4100, "box"));
+  ASSERT_TRUE(! reader.KeyMayMatch(4100, "hello"));
+
+  // Check last filter
+  ASSERT_TRUE(reader.KeyMayMatch(9000, "box"));
+  ASSERT_TRUE(reader.KeyMayMatch(9000, "hello"));
+  ASSERT_TRUE(! reader.KeyMayMatch(9000, "foo"));
+  ASSERT_TRUE(! reader.KeyMayMatch(9000, "bar"));
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/table/flush_block_policy.cc b/table/flush_block_policy.cc
new file mode 100644 (file)
index 0000000..4e22352
--- /dev/null
@@ -0,0 +1,70 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/options.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/slice.h"
+#include "table/block_builder.h"
+
+#include <cassert>
+
+namespace rocksdb {
+
+// Flush block by size
+class FlushBlockBySizePolicy : public FlushBlockPolicy {
+ public:
+  // @params block_size:           Approximate size of user data packed per
+  //                               block.
+  // @params block_size_deviation: This is used to close a block before it
+  //                               reaches the configured block_size.
+  FlushBlockBySizePolicy(const uint64_t block_size,
+                         const uint64_t block_size_deviation,
+                         const BlockBuilder& data_block_builder) :
+      block_size_(block_size),
+      block_size_deviation_(block_size_deviation),
+      data_block_builder_(data_block_builder) {
+  }
+
+  virtual bool Update(const Slice& key,
+                      const Slice& value) override {
+    // it makes no sense to flush when the data block is empty
+    if (data_block_builder_.empty()) {
+      return false;
+    }
+
+    auto curr_size = data_block_builder_.CurrentSizeEstimate();
+
+    // Do flush if one of the below two conditions is true:
+    // 1) if the current estimated size already exceeds the block size,
+    // 2) block_size_deviation is set and the estimated size after appending
+    // the kv will exceed the block size and the current size is under the
+    // deviation.
+    return curr_size >= block_size_ || BlockAlmostFull(key, value);
+  }
+
+ private:
+  bool BlockAlmostFull(const Slice& key, const Slice& value) const {
+    const auto curr_size = data_block_builder_.CurrentSizeEstimate();
+    const auto estimated_size_after =
+      data_block_builder_.EstimateSizeAfterKV(key, value);
+
+    return
+      estimated_size_after > block_size_ &&
+      block_size_deviation_ > 0 &&
+      curr_size * 100 > block_size_ * (100 - block_size_deviation_);
+  }
+
+  const uint64_t block_size_;
+  const uint64_t block_size_deviation_;
+  const BlockBuilder& data_block_builder_;
+};
+
+FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+    const Options& options, const BlockBuilder& data_block_builder) const {
+  return new FlushBlockBySizePolicy(
+      options.block_size, options.block_size_deviation, data_block_builder);
+}
+
+}  // namespace rocksdb
diff --git a/table/format.cc b/table/format.cc
new file mode 100644 (file)
index 0000000..bbf95e1
--- /dev/null
@@ -0,0 +1,364 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/format.h"
+
+#include <string>
+#include <inttypes.h>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "table/block.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/perf_context_imp.h"
+#include "util/xxhash.h"
+
+namespace rocksdb {
+
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+extern const uint64_t kLegacyPlainTableMagicNumber;
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
+
// Appends the handle to *dst as two varint64s: offset, then size.
void BlockHandle::EncodeTo(std::string* dst) const {
  // Sanity check that all fields have been set: ~0 is the "unset" sentinel
  // installed by the default constructor, so encoding it is a bug.
  assert(offset_ != ~static_cast<uint64_t>(0));
  assert(size_ != ~static_cast<uint64_t>(0));
  PutVarint64(dst, offset_);
  PutVarint64(dst, size_);
}
+
+Status BlockHandle::DecodeFrom(Slice* input) {
+  if (GetVarint64(input, &offset_) &&
+      GetVarint64(input, &size_)) {
+    return Status::OK();
+  } else {
+    return Status::Corruption("bad block handle");
+  }
+}
// Definition of the shared "null" handle: offset 0 / size 0, which
// IsNull() treats as pointing nowhere.
const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
+
+// legacy footer format:
+//    metaindex handle (varint64 offset, varint64 size)
+//    index handle     (varint64 offset, varint64 size)
+//    <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
+//    table_magic_number (8 bytes)
+// new footer format:
+//    checksum (char, 1 byte)
+//    metaindex handle (varint64 offset, varint64 size)
+//    index handle     (varint64 offset, varint64 size)
+//    <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
+//    footer version (4 bytes)
+//    table_magic_number (8 bytes)
// Serializes the footer to *dst in either the legacy (version 0) or the
// current (version 1) fixed-size layout, chosen by version().
void Footer::EncodeTo(std::string* dst) const {
  if (version() == kLegacyFooter) {
    // has to be default checksum with legacy footer
    assert(checksum_ == kCRC32c);
    const size_t original_size = dst->size();
    metaindex_handle_.EncodeTo(dst);
    index_handle_.EncodeTo(dst);
    // Pad the two varint-encoded handles out to their maximum width so the
    // footer has a fixed total size.
    dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength);  // Padding
    // Magic number is stored as two little-endian 32-bit halves, low first.
    PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
    PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
    assert(dst->size() == original_size + kVersion0EncodedLength);
  } else {
    const size_t original_size = dst->size();
    // A single leading byte records the checksum type in the new format.
    dst->push_back(static_cast<char>(checksum_));
    metaindex_handle_.EncodeTo(dst);
    index_handle_.EncodeTo(dst);
    // Pad so the version (4 bytes) and magic (8 bytes) land at a fixed
    // offset from the end; 12 == sizeof(version) + sizeof(magic).
    dst->resize(original_size + kVersion1EncodedLength - 12);  // Padding
    PutFixed32(dst, kFooterVersion);
    PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
    PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
    assert(dst->size() == original_size + kVersion1EncodedLength);
  }
}
+
+namespace {
+inline bool IsLegacyFooterFormat(uint64_t magic_number) {
+  return magic_number == kLegacyBlockBasedTableMagicNumber ||
+         magic_number == kLegacyPlainTableMagicNumber;
+}
+
+inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
+  if (magic_number == kLegacyBlockBasedTableMagicNumber) {
+    return kBlockBasedTableMagicNumber;
+  }
+  if (magic_number == kLegacyPlainTableMagicNumber) {
+    return kPlainTableMagicNumber;
+  }
+  assert(false);
+  return 0;
+}
+}  // namespace
+
+Footer::Footer(uint64_t table_magic_number)
+    : version_(IsLegacyFooterFormat(table_magic_number) ? kLegacyFooter
+                                                        : kFooterVersion),
+      checksum_(kCRC32c),
+      table_magic_number_(table_magic_number) {}
+
// Decodes the footer from the tail of *input, auto-detecting legacy vs.
// versioned layouts via the magic number, and leaves *input pointing at
// any trailing padding.  See the class comment in format.h for the
// verify-or-adopt semantics of the table magic number.
Status Footer::DecodeFrom(Slice* input) {
  assert(input != nullptr);
  assert(input->size() >= kMinEncodedLength);

  // The magic number always occupies the last 8 bytes, stored as two
  // little-endian 32-bit halves (low word first).
  const char *magic_ptr =
      input->data() + input->size() - kMagicNumberLengthByte;
  const uint32_t magic_lo = DecodeFixed32(magic_ptr);
  const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
  uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
                    (static_cast<uint64_t>(magic_lo)));

  // We check for legacy formats here and silently upconvert them
  bool legacy = IsLegacyFooterFormat(magic);
  if (legacy) {
    magic = UpconvertLegacyFooterFormat(magic);
  }
  if (HasInitializedTableMagicNumber()) {
    // The caller pinned an expected table type up front: the file's magic
    // must match it.
    if (magic != table_magic_number()) {
      char buffer[80];
      snprintf(buffer, sizeof(buffer) - 1,
               "not an sstable (bad magic number --- %lx)",
               (long)magic);
      return Status::InvalidArgument(buffer);
    }
  } else {
    // No expectation was set; adopt whatever the file declares.
    set_table_magic_number(magic);
  }

  if (legacy) {
    // The size is already asserted to be at least kMinEncodedLength
    // at the beginning of the function
    input->remove_prefix(input->size() - kVersion0EncodedLength);
    version_ = kLegacyFooter;
    checksum_ = kCRC32c;
  } else {
    // In the versioned layout the 4-byte footer version sits immediately
    // before the magic number.
    version_ = DecodeFixed32(magic_ptr - 4);
    if (version_ != kFooterVersion) {
      return Status::Corruption("bad footer version");
    }
    // Footer version 1 will always occupy exactly this many bytes.
    // It consists of the checksum type, two block handles, padding,
    // a version number, and a magic number
    if (input->size() < kVersion1EncodedLength) {
      return Status::InvalidArgument("input is too short to be an sstable");
    } else {
      input->remove_prefix(input->size() - kVersion1EncodedLength);
    }
    uint32_t checksum;
    if (!GetVarint32(input, &checksum)) {
      return Status::Corruption("bad checksum type");
    }
    checksum_ = static_cast<ChecksumType>(checksum);
  }

  // The two handles follow back-to-back (after the checksum byte in the
  // new format; at the start of the footer in the legacy one).
  Status result = metaindex_handle_.DecodeFrom(input);
  if (result.ok()) {
    result = index_handle_.DecodeFrom(input);
  }
  if (result.ok()) {
    // We skip over any leftover data (just padding for now) in "input"
    const char* end = magic_ptr + kMagicNumberLengthByte;
    *input = Slice(end, input->data() + input->size() - end);
  }
  return result;
}
+
+Status ReadFooterFromFile(RandomAccessFile* file,
+                          uint64_t file_size,
+                          Footer* footer) {
+  if (file_size < Footer::kMinEncodedLength) {
+    return Status::InvalidArgument("file is too short to be an sstable");
+  }
+
+  char footer_space[Footer::kMaxEncodedLength];
+  Slice footer_input;
+  size_t read_offset = (file_size > Footer::kMaxEncodedLength)
+                           ? (file_size - Footer::kMaxEncodedLength)
+                           : 0;
+  Status s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input,
+                        footer_space);
+  if (!s.ok()) return s;
+
+  // Check that we actually read the whole footer from the file. It may be
+  // that size isn't correct.
+  if (footer_input.size() < Footer::kMinEncodedLength) {
+    return Status::InvalidArgument("file is too short to be an sstable");
+  }
+
+  return footer->DecodeFrom(&footer_input);
+}
+
// Reads the block at "handle" plus its trailer (1-byte compression type +
// 4-byte checksum), optionally verifies the checksum, and optionally
// uncompresses the payload into *result.  On success, result->data either
// borrows the file implementation's buffer (heap_allocated == false) or
// owns a heap buffer the caller must delete[].
Status ReadBlockContents(RandomAccessFile* file,
                         const Footer& footer,
                         const ReadOptions& options,
                         const BlockHandle& handle,
                         BlockContents* result,
                         Env* env,
                         bool do_uncompress) {
  result->data = Slice();
  result->cachable = false;
  result->heap_allocated = false;

  // Read the block contents as well as the type/crc footer.
  // See table_builder.cc for the code that built this structure.
  size_t n = static_cast<size_t>(handle.size());
  char* buf = new char[n + kBlockTrailerSize];
  Slice contents;

  PERF_TIMER_AUTO(block_read_time);
  Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
  PERF_TIMER_MEASURE(block_read_time);
  PERF_COUNTER_ADD(block_read_count, 1);
  PERF_COUNTER_ADD(block_read_byte, n + kBlockTrailerSize);

  if (!s.ok()) {
    delete[] buf;
    return s;
  }
  if (contents.size() != n + kBlockTrailerSize) {
    delete[] buf;
    return Status::Corruption("truncated block read");
  }

  // Check the crc of the type and the block contents
  const char* data = contents.data();    // Pointer to where Read put the data
  if (options.verify_checksums) {
    // Stored checksum covers the n payload bytes plus the type byte and
    // sits in the last 4 bytes of the trailer.
    uint32_t value = DecodeFixed32(data + n + 1);
    uint32_t actual = 0;
    switch (footer.checksum()) {
      case kCRC32c:
        value = crc32c::Unmask(value);
        actual = crc32c::Value(data, n + 1);
        break;
      case kxxHash:
        actual = XXH32(data, n + 1, 0);
        break;
      default:
        s = Status::Corruption("unknown checksum type");
    }
    if (s.ok() && actual != value) {
      s = Status::Corruption("block checksum mismatch");
    }
    if (!s.ok()) {
      delete[] buf;
      return s;
    }
    PERF_TIMER_MEASURE(block_checksum_time);
  }

  rocksdb::CompressionType compression_type =
      static_cast<rocksdb::CompressionType>(data[n]);
  // If the caller has requested that the block not be uncompressed
  if (!do_uncompress || compression_type == kNoCompression) {
    if (data != buf) {
      // File implementation gave us pointer to some other data.
      // Use it directly under the assumption that it will be live
      // while the file is open.
      delete[] buf;
      result->data = Slice(data, n);
      result->heap_allocated = false;
      result->cachable = false;  // Do not double-cache
    } else {
      result->data = Slice(buf, n);
      result->heap_allocated = true;
      result->cachable = true;
    }
    result->compression_type = compression_type;
    s = Status::OK();
  } else {
    s = UncompressBlockContents(data, n, result);
    delete[] buf;
  }
  // NOTE(review): stops the timer declared by PERF_TIMER_AUTO above;
  // the macro name refers to decompression but appears to reuse the same
  // timer slot -- confirm against perf_context_imp.h.
  PERF_TIMER_STOP(block_decompress_time);
  return s;
}
+
+//
+// The 'data' points to the raw block contents that was read in from file.
+// This method allocates a new heap buffer and the raw block
+// contents are uncompresed into this buffer. This
+// buffer is returned via 'result' and it is upto the caller to
+// free this buffer.
+Status UncompressBlockContents(const char* data, size_t n,
+                               BlockContents* result) {
+  char* ubuf = nullptr;
+  int decompress_size = 0;
+  assert(data[n] != kNoCompression);
+  switch (data[n]) {
+    case kSnappyCompression: {
+      size_t ulength = 0;
+      static char snappy_corrupt_msg[] =
+        "Snappy not supported or corrupted Snappy compressed block contents";
+      if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) {
+        return Status::Corruption(snappy_corrupt_msg);
+      }
+      ubuf = new char[ulength];
+      if (!port::Snappy_Uncompress(data, n, ubuf)) {
+        delete[] ubuf;
+        return Status::Corruption(snappy_corrupt_msg);
+      }
+      result->data = Slice(ubuf, ulength);
+      result->heap_allocated = true;
+      result->cachable = true;
+      break;
+    }
+    case kZlibCompression:
+      ubuf = port::Zlib_Uncompress(data, n, &decompress_size);
+      static char zlib_corrupt_msg[] =
+        "Zlib not supported or corrupted Zlib compressed block contents";
+      if (!ubuf) {
+        return Status::Corruption(zlib_corrupt_msg);
+      }
+      result->data = Slice(ubuf, decompress_size);
+      result->heap_allocated = true;
+      result->cachable = true;
+      break;
+    case kBZip2Compression:
+      ubuf = port::BZip2_Uncompress(data, n, &decompress_size);
+      static char bzip2_corrupt_msg[] =
+        "Bzip2 not supported or corrupted Bzip2 compressed block contents";
+      if (!ubuf) {
+        return Status::Corruption(bzip2_corrupt_msg);
+      }
+      result->data = Slice(ubuf, decompress_size);
+      result->heap_allocated = true;
+      result->cachable = true;
+      break;
+    case kLZ4Compression:
+      ubuf = port::LZ4_Uncompress(data, n, &decompress_size);
+      static char lz4_corrupt_msg[] =
+          "LZ4 not supported or corrupted LZ4 compressed block contents";
+      if (!ubuf) {
+        return Status::Corruption(lz4_corrupt_msg);
+      }
+      result->data = Slice(ubuf, decompress_size);
+      result->heap_allocated = true;
+      result->cachable = true;
+      break;
+    case kLZ4HCCompression:
+      ubuf = port::LZ4_Uncompress(data, n, &decompress_size);
+      static char lz4hc_corrupt_msg[] =
+          "LZ4HC not supported or corrupted LZ4HC compressed block contents";
+      if (!ubuf) {
+        return Status::Corruption(lz4hc_corrupt_msg);
+      }
+      result->data = Slice(ubuf, decompress_size);
+      result->heap_allocated = true;
+      result->cachable = true;
+      break;
+    default:
+      return Status::Corruption("bad block type");
+  }
+  result->compression_type = kNoCompression;  // not compressed any more
+  return Status::OK();
+}
+
+}  // namespace rocksdb
diff --git a/table/format.h b/table/format.h
new file mode 100644 (file)
index 0000000..a971c1a
--- /dev/null
@@ -0,0 +1,198 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <string>
+#include <stdint.h>
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+
+class Block;
+class RandomAccessFile;
+struct ReadOptions;
+
+// the length of the magic number in bytes.
+const int kMagicNumberLengthByte = 8;
+
// BlockHandle is a pointer to the extent of a file that stores a data
// block or a meta block.
class BlockHandle {
 public:
  // Default construction leaves both fields at the ~0 "unset" sentinel;
  // see the inline definitions at the bottom of this header.
  BlockHandle();
  BlockHandle(uint64_t offset, uint64_t size);

  // The offset of the block in the file.
  uint64_t offset() const { return offset_; }
  void set_offset(uint64_t offset) { offset_ = offset; }

  // The size of the stored block
  uint64_t size() const { return size_; }
  void set_size(uint64_t size) { size_ = size; }

  // Serialization: two varint64s appended to *dst / parsed from *input.
  void EncodeTo(std::string* dst) const;
  Status DecodeFrom(Slice* input);

  // if the block handle's offset and size are both "0", we will view it
  // as a null block handle that points to no where.
  bool IsNull() const {
    return offset_ == 0 && size_ == 0;
  }

  // Shared singleton null handle (defined in format.cc).
  static const BlockHandle& NullBlockHandle() {
    return kNullBlockHandle;
  }

  // Maximum encoding length of a BlockHandle: two varint64s of up to
  // 10 bytes each.
  enum { kMaxEncodedLength = 10 + 10 };

 private:
  uint64_t offset_ = 0;
  uint64_t size_ = 0;

  static const BlockHandle kNullBlockHandle;
};
+
// Footer encapsulates the fixed information stored at the tail
// end of every table file.
class Footer {
 public:
  // Constructs a footer without specifying its table magic number.
  // In such case, the table magic number of such footer should be
  // initialized via @ReadFooterFromFile().
  Footer() : Footer(kInvalidTableMagicNumber) {}

  // @table_magic_number serves two purposes:
  //  1. Identify different types of the tables.
  //  2. Help us to identify if a given file is a valid sst.
  explicit Footer(uint64_t table_magic_number);

  // The version of the footer in this file
  uint32_t version() const { return version_; }

  // The checksum type used in this file
  ChecksumType checksum() const { return checksum_; }
  void set_checksum(const ChecksumType c) { checksum_ = c; }

  // The block handle for the metaindex block of the table
  const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
  void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; }

  // The block handle for the index block of the table
  const BlockHandle& index_handle() const { return index_handle_; }

  void set_index_handle(const BlockHandle& h) { index_handle_ = h; }

  uint64_t table_magic_number() const { return table_magic_number_; }

  // The version of Footer we encode
  enum {
    kLegacyFooter = 0,
    kFooterVersion = 1,
  };

  void EncodeTo(std::string* dst) const;

  // Set the current footer based on the input slice.  If table_magic_number_
  // is not set (i.e., HasInitializedTableMagicNumber() is false), then this
  // function will also initialize table_magic_number_.  Otherwise, this
  // function will verify whether the magic number specified in the input
  // slice matches table_magic_number_ and update the current footer only
  // when the test passes.
  Status DecodeFrom(Slice* input);

  // Encoded length of a Footer.  Note that the serialization of a Footer will
  // always occupy at least kMinEncodedLength bytes.  If fields are changed
  // the version number should be incremented and kMaxEncodedLength should be
  // increased accordingly.
  enum {
    // Footer version 0 (legacy) will always occupy exactly this many bytes.
    // It consists of two block handles, padding, and a magic number.
    kVersion0EncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8,
    // Footer version 1 will always occupy exactly this many bytes.
    // It consists of the checksum type, two block handles, padding,
    // a version number, and a magic number
    kVersion1EncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8,

    kMinEncodedLength = kVersion0EncodedLength,
    kMaxEncodedLength = kVersion1EncodedLength
  };

  // Sentinel meaning "magic number not yet known"; see DecodeFrom().
  static const uint64_t kInvalidTableMagicNumber = 0;

 private:
  // REQUIRES: magic number wasn't initialized.
  void set_table_magic_number(uint64_t magic_number) {
    assert(!HasInitializedTableMagicNumber());
    table_magic_number_ = magic_number;
  }

  // return true if @table_magic_number_ is set to a value different
  // from @kInvalidTableMagicNumber.
  bool HasInitializedTableMagicNumber() const {
    return (table_magic_number_ != kInvalidTableMagicNumber);
  }

  uint32_t version_;
  ChecksumType checksum_;
  BlockHandle metaindex_handle_;
  BlockHandle index_handle_;
  uint64_t table_magic_number_ = 0;
};
+
+// Read the footer from file
+Status ReadFooterFromFile(RandomAccessFile* file,
+                          uint64_t file_size,
+                          Footer* footer);
+
+// 1-byte type + 32-bit crc
+static const size_t kBlockTrailerSize = 5;
+
// Result of reading (and possibly uncompressing) one block; see
// ReadBlockContents() / UncompressBlockContents() for who owns data.
struct BlockContents {
  Slice data;           // Actual contents of data
  bool cachable;        // True iff data can be cached
  bool heap_allocated;  // True iff caller should delete[] data.data()
  CompressionType compression_type;  // kNoCompression once uncompressed
};
+
+// Read the block identified by "handle" from "file".  On failure
+// return non-OK.  On success fill *result and return OK.
+extern Status ReadBlockContents(RandomAccessFile* file,
+                                const Footer& footer,
+                                const ReadOptions& options,
+                                const BlockHandle& handle,
+                                BlockContents* result,
+                                Env* env,
+                                bool do_uncompress);
+
+// The 'data' points to the raw block contents read in from file.
+// This method allocates a new heap buffer and the raw block
+// contents are uncompresed into this buffer. This buffer is
+// returned via 'result' and it is upto the caller to
+// free this buffer.
+extern Status UncompressBlockContents(const char* data,
+                                      size_t n,
+                                      BlockContents* result);
+
// Implementation details follow.  Clients should ignore.

// Default-construct with the all-ones "unset" sentinel; EncodeTo()
// asserts that both fields were set before serialization.
inline BlockHandle::BlockHandle()
    : BlockHandle(~static_cast<uint64_t>(0),
                  ~static_cast<uint64_t>(0)) {
}

inline BlockHandle::BlockHandle(uint64_t offset, uint64_t size)
    : offset_(offset),
      size_(size) {
}
+
+}  // namespace rocksdb
diff --git a/table/iter_heap.h b/table/iter_heap.h
new file mode 100644 (file)
index 0000000..9569d36
--- /dev/null
@@ -0,0 +1,44 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#pragma once
+#include <queue>
+
+#include "rocksdb/comparator.h"
+#include "table/iterator_wrapper.h"
+
+namespace rocksdb {
+
+// Return the max of two keys.
+class MaxIteratorComparator {
+ public:
+  MaxIteratorComparator(const Comparator* comparator) :
+    comparator_(comparator) {}
+
+  bool operator()(IteratorWrapper* a, IteratorWrapper* b) {
+    return comparator_->Compare(a->key(), b->key()) <= 0;
+  }
+ private:
+  const Comparator* comparator_;
+};
+
// Heap comparator that orders IteratorWrapper*s so the iterator holding
// the smallest key surfaces at the top of a std::priority_queue
// (min-heap).  (The original comment said "max" -- copy-paste error.)
class MinIteratorComparator {
 public:
  MinIteratorComparator(const Comparator* comparator) :
    comparator_(comparator) {}

  // Returns true when a's key orders after b's, so the priority_queue's
  // notion of "largest" is actually the smallest key.
  bool operator()(IteratorWrapper* a, IteratorWrapper* b) {
    return comparator_->Compare(a->key(), b->key()) > 0;
  }
 private:
  const Comparator* comparator_;
};
+
+}  // namespace rocksdb
diff --git a/table/iterator.cc b/table/iterator.cc
new file mode 100644 (file)
index 0000000..a3d4f63
--- /dev/null
@@ -0,0 +1,72 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/iterator.h"
+
+namespace rocksdb {
+
// Start with an empty cleanup list: the inline head node stays unused
// until RegisterCleanup() is first called.
Iterator::Iterator() {
  cleanup_.function = nullptr;
  cleanup_.next = nullptr;
}
+
+Iterator::~Iterator() {
+  if (cleanup_.function != nullptr) {
+    (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2);
+    for (Cleanup* c = cleanup_.next; c != nullptr; ) {
+      (*c->function)(c->arg1, c->arg2);
+      Cleanup* next = c->next;
+      delete c;
+      c = next;
+    }
+  }
+}
+
+void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) {
+  assert(func != nullptr);
+  Cleanup* c;
+  if (cleanup_.function == nullptr) {
+    c = &cleanup_;
+  } else {
+    c = new Cleanup;
+    c->next = cleanup_.next;
+    cleanup_.next = c;
+  }
+  c->function = func;
+  c->arg1 = arg1;
+  c->arg2 = arg2;
+}
+
+namespace {
+class EmptyIterator : public Iterator {
+ public:
+  explicit EmptyIterator(const Status& s) : status_(s) { }
+  virtual bool Valid() const { return false; }
+  virtual void Seek(const Slice& target) { }
+  virtual void SeekToFirst() { }
+  virtual void SeekToLast() { }
+  virtual void Next() { assert(false); }
+  virtual void Prev() { assert(false); }
+  Slice key() const { assert(false); return Slice(); }
+  Slice value() const { assert(false); return Slice(); }
+  virtual Status status() const { return status_; }
+ private:
+  Status status_;
+};
+}  // namespace
+
// Returns a caller-owned iterator over an empty collection (OK status).
Iterator* NewEmptyIterator() {
  return new EmptyIterator(Status::OK());
}
+
// Returns a caller-owned empty iterator whose status() reports "status".
Iterator* NewErrorIterator(const Status& status) {
  return new EmptyIterator(status);
}
+
+}  // namespace rocksdb
diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h
new file mode 100644 (file)
index 0000000..cb8520b
--- /dev/null
@@ -0,0 +1,64 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+namespace rocksdb {
+
// An internal wrapper class with an interface similar to Iterator that
// caches the valid() and key() results for an underlying iterator.
// This can help avoid virtual function calls and also gives better
// cache locality.
//
// NOTE(review): the wrapper owns iter_ yet relies on the implicitly
// generated copy constructor/assignment; copying a non-empty wrapper
// would double-delete iter_ -- confirm callers only default-construct
// then Set(), as MergingIterator does.
class IteratorWrapper {
 public:
  IteratorWrapper(): iter_(nullptr), valid_(false) { }
  explicit IteratorWrapper(Iterator* iter): iter_(nullptr) {
    Set(iter);
  }
  ~IteratorWrapper() { delete iter_; }
  Iterator* iter() const { return iter_; }

  // Takes ownership of "iter" and will delete it when destroyed, or
  // when Set() is invoked again.
  void Set(Iterator* iter) {
    delete iter_;
    iter_ = iter;
    if (iter_ == nullptr) {
      valid_ = false;
    } else {
      Update();
    }
  }


  // Iterator interface methods
  bool Valid() const        { return valid_; }
  Slice key() const         { assert(Valid()); return key_; }
  Slice value() const       { assert(Valid()); return iter_->value(); }
  // Methods below require iter() != nullptr
  Status status() const     { assert(iter_); return iter_->status(); }
  void Next()               { assert(iter_); iter_->Next();        Update(); }
  void Prev()               { assert(iter_); iter_->Prev();        Update(); }
  void Seek(const Slice& k) { assert(iter_); iter_->Seek(k);       Update(); }
  void SeekToFirst()        { assert(iter_); iter_->SeekToFirst(); Update(); }
  void SeekToLast()         { assert(iter_); iter_->SeekToLast();  Update(); }

 private:
  // Refreshes the cached valid_/key_ after any repositioning call.
  void Update() {
    valid_ = iter_->Valid();
    if (valid_) {
      key_ = iter_->key();
    }
  }

  Iterator* iter_;
  bool valid_;
  Slice key_;
};
+
+}  // namespace rocksdb
diff --git a/table/merger.cc b/table/merger.cc
new file mode 100644 (file)
index 0000000..b829f71
--- /dev/null
@@ -0,0 +1,301 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/merger.h"
+
+#include <vector>
+#include <queue>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/iter_heap.h"
+#include "table/iterator_wrapper.h"
+#include "util/stop_watch.h"
+#include "util/perf_context_imp.h"
+
+namespace rocksdb {
+namespace {
+
+typedef std::priority_queue<
+          IteratorWrapper*,
+          std::vector<IteratorWrapper*>,
+          MaxIteratorComparator> MaxIterHeap;
+
+typedef std::priority_queue<
+          IteratorWrapper*,
+          std::vector<IteratorWrapper*>,
+          MinIteratorComparator> MinIterHeap;
+
// Returns a new max-heap of IteratorWrapper*s using the provided Comparator.
MaxIterHeap NewMaxIterHeap(const Comparator* comparator) {
  return MaxIterHeap(MaxIteratorComparator(comparator));
}
+
// Returns a new min-heap of IteratorWrapper*s using the provided Comparator.
MinIterHeap NewMinIterHeap(const Comparator* comparator) {
  return MinIterHeap(MinIteratorComparator(comparator));
}
+
// Merges n child iterators into one sorted stream.  current_ always points
// at the child supplying key()/value(); the min-heap (forward iteration)
// or max-heap (reverse iteration) orders the remaining valid children.
// Seek() may bypass the heap entirely (use_heap_ == false) when at most
// one child is positioned on a valid key.
class MergingIterator : public Iterator {
 public:
  // Takes ownership of the n child iterators (via IteratorWrapper::Set).
  MergingIterator(const Comparator* comparator, Iterator** children, int n)
      : comparator_(comparator),
        children_(n),
        current_(nullptr),
        use_heap_(true),
        direction_(kForward),
        maxHeap_(NewMaxIterHeap(comparator_)),
        minHeap_(NewMinIterHeap(comparator_)) {
    for (int i = 0; i < n; i++) {
      children_[i].Set(children[i]);
    }
    for (auto& child : children_) {
      if (child.Valid()) {
        minHeap_.push(&child);
      }
    }
  }

  virtual ~MergingIterator() { }

  virtual bool Valid() const {
    return (current_ != nullptr);
  }

  // Positions every child at its first entry, then picks the smallest key.
  virtual void SeekToFirst() {
    ClearHeaps();
    for (auto& child : children_) {
      child.SeekToFirst();
      if (child.Valid()) {
        minHeap_.push(&child);
      }
    }
    FindSmallest();
    direction_ = kForward;
  }

  // Positions every child at its last entry, then picks the largest key.
  virtual void SeekToLast() {
    ClearHeaps();
    for (auto& child : children_) {
      child.SeekToLast();
      if (child.Valid()) {
        maxHeap_.push(&child);
      }
    }
    FindLargest();
    direction_ = kReverse;
  }

  virtual void Seek(const Slice& target) {
    // Invalidate the heap.
    use_heap_ = false;
    IteratorWrapper* first_child = nullptr;
    PERF_TIMER_DECLARE();

    for (auto& child : children_) {
      PERF_TIMER_START(seek_child_seek_time);
      child.Seek(target);
      PERF_TIMER_STOP(seek_child_seek_time);
      PERF_COUNTER_ADD(seek_child_seek_count, 1);

      if (child.Valid()) {
        // This child has valid key
        if (!use_heap_) {
          if (first_child == nullptr) {
            // It's the first child with a valid key.  Only put it into
            // current_.  The values in the heap are invalid at this point.
            first_child = &child;
          } else {
            // We have more than one child with valid keys.  Initialize
            // the heap and put the first child into the heap.
            // (ClearHeaps re-enables use_heap_, so the branch below also
            // pushes this child.)
            PERF_TIMER_START(seek_min_heap_time);
            ClearHeaps();
            minHeap_.push(first_child);
            PERF_TIMER_STOP(seek_min_heap_time);
          }
        }
        if (use_heap_) {
          PERF_TIMER_START(seek_min_heap_time);
          minHeap_.push(&child);
          PERF_TIMER_STOP(seek_min_heap_time);
        }
      }
    }
    if (use_heap_) {
      // If the heap is valid, put the smallest key into current_.
      PERF_TIMER_START(seek_min_heap_time);
      FindSmallest();
      PERF_TIMER_STOP(seek_min_heap_time);
    } else {
      // The heap is not valid, then the current_ iterator is the first
      // one, or null if there is no first child.
      current_ = first_child;
    }
    direction_ = kForward;
  }

  virtual void Next() {
    assert(Valid());

    // Ensure that all children are positioned after key().
    // If we are moving in the forward direction, it is already
    // true for all of the non-current_ children since current_ is
    // the smallest child and key() == current_->key().  Otherwise,
    // we explicitly position the non-current_ children.
    if (direction_ != kForward) {
      ClearHeaps();
      for (auto& child : children_) {
        if (&child != current_) {
          child.Seek(key());
          if (child.Valid() &&
              comparator_->Compare(key(), child.key()) == 0) {
            child.Next();
          }
          if (child.Valid()) {
            minHeap_.push(&child);
          }
        }
      }
      direction_ = kForward;
    }

    // as the current points to the current record. move the iterator forward.
    // and if it is valid add it to the heap.
    current_->Next();
    if (use_heap_) {
      if (current_->Valid()) {
        minHeap_.push(current_);
      }
      FindSmallest();
    } else if (!current_->Valid()) {
      // Heap-less mode (set up by Seek with <=1 valid child): there is no
      // other child to fall back to.
      current_ = nullptr;
    }
  }

  virtual void Prev() {
    assert(Valid());
    // Ensure that all children are positioned before key().
    // If we are moving in the reverse direction, it is already
    // true for all of the non-current_ children since current_ is
    // the largest child and key() == current_->key().  Otherwise,
    // we explicitly position the non-current_ children.
    if (direction_ != kReverse) {
      ClearHeaps();
      for (auto& child : children_) {
        if (&child != current_) {
          child.Seek(key());
          if (child.Valid()) {
            // Child is at first entry >= key().  Step back one to be < key()
            child.Prev();
          } else {
            // Child has no entries >= key().  Position at last entry.
            child.SeekToLast();
          }
          if (child.Valid()) {
            maxHeap_.push(&child);
          }
        }
      }
      direction_ = kReverse;
    }

    current_->Prev();
    if (current_->Valid()) {
      maxHeap_.push(current_);
    }
    FindLargest();
  }

  virtual Slice key() const {
    assert(Valid());
    return current_->key();
  }

  virtual Slice value() const {
    assert(Valid());
    return current_->value();
  }

  // Reports the first non-OK child status, or OK if all children are OK.
  virtual Status status() const {
    Status status;
    for (auto& child : children_) {
      status = child.status();
      if (!status.ok()) {
        break;
      }
    }
    return status;
  }

 private:
  void FindSmallest();
  void FindLargest();
  void ClearHeaps();

  const Comparator* comparator_;
  std::vector<IteratorWrapper> children_;
  IteratorWrapper* current_;
  // If the value is true, both of iterators in the heap and current_
  // contain valid rows. If it is false, only current_ can possibly contain
  // valid rows.
  // This flag is always true for reverse direction, as we always use heap for
  // the reverse iterating case.
  bool use_heap_;
  // Which direction is the iterator moving?
  enum Direction {
    kForward,
    kReverse
  };
  Direction direction_;
  MaxIterHeap maxHeap_;
  MinIterHeap minHeap_;
};
+
+void MergingIterator::FindSmallest() {
+  assert(use_heap_);
+  if (minHeap_.empty()) {
+    current_ = nullptr;
+  } else {
+    current_ = minHeap_.top();
+    assert(current_->Valid());
+    minHeap_.pop();
+  }
+}
+
+void MergingIterator::FindLargest() {
+  assert(use_heap_);
+  if (maxHeap_.empty()) {
+    current_ = nullptr;
+  } else {
+    current_ = maxHeap_.top();
+    assert(current_->Valid());
+    maxHeap_.pop();
+  }
+}
+
+void MergingIterator::ClearHeaps() {
+  use_heap_ = true;
+  maxHeap_ = NewMaxIterHeap(comparator_);
+  minHeap_ = NewMinIterHeap(comparator_);
+}
+}  // namespace
+
+Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) {
+  assert(n >= 0);
+  if (n == 0) {
+    return NewEmptyIterator();
+  } else if (n == 1) {
+    return list[0];
+  } else {
+    return new MergingIterator(cmp, list, n);
+  }
+}
+
+}  // namespace rocksdb
diff --git a/table/merger.h b/table/merger.h
new file mode 100644 (file)
index 0000000..3a1a4fe
--- /dev/null
@@ -0,0 +1,29 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+namespace rocksdb {
+
+class Comparator;
+class Iterator;
+class Env;
+
+// Return an iterator that provides the union of the data in
+// children[0,n-1].  Takes ownership of the child iterators and
+// will delete them when the result iterator is deleted.
+//
+// The result does no duplicate suppression.  I.e., if a particular
+// key is present in K child iterators, it will be yielded K times.
+//
+// REQUIRES: n >= 0
+extern Iterator* NewMergingIterator(const Comparator* comparator,
+                                    Iterator** children, int n);
+
+}  // namespace rocksdb
diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc
new file mode 100644 (file)
index 0000000..f28b44d
--- /dev/null
@@ -0,0 +1,266 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#include "table/meta_blocks.h"
+
+#include <map>
+#include <string>
+
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/block.h"
+#include "table/format.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+// The metaindex block uses a restart interval of 1 (every entry is a restart
+// point) and bytewise key ordering.
+MetaIndexBuilder::MetaIndexBuilder()
+    : meta_index_block_(
+        new BlockBuilder(1 /* restart interval */, BytewiseComparator())) {
+}
+
+// Buffer a metablock name together with its encoded BlockHandle; the pairs
+// are kept sorted in meta_block_handles_ and emitted in Finish().
+void MetaIndexBuilder::Add(const std::string& key,
+                           const BlockHandle& handle) {
+  std::string encoded_handle;
+  handle.EncodeTo(&encoded_handle);
+  meta_block_handles_.insert({key, encoded_handle});
+}
+
+// Flush the buffered (name, handle) pairs into the block builder in sorted
+// order and return the finished block contents.
+Slice MetaIndexBuilder::Finish() {
+  for (const auto& entry : meta_block_handles_) {
+    meta_index_block_->Add(entry.first, entry.second);
+  }
+  return meta_index_block_->Finish();
+}
+
+// Like the metaindex block, the properties block uses restart interval 1 and
+// bytewise key ordering.
+PropertyBlockBuilder::PropertyBlockBuilder()
+  : properties_block_(
+      new BlockBuilder(1 /* restart interval */, BytewiseComparator())) {
+}
+
+// Buffers a raw name/value property pair; everything buffered here is
+// written out, sorted, by Finish().
+void PropertyBlockBuilder::Add(const std::string& name,
+                               const std::string& val) {
+  props_.insert({name, val});
+}
+
+// Buffers a uint64 property; the value is varint-encoded before storage.
+// The assert guards against adding the same numeric property twice.
+void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) {
+  assert(props_.find(name) == props_.end());
+
+  std::string encoded;
+  PutVarint64(&encoded, val);
+  Add(name, encoded);
+}
+
+// Buffers every entry of a user-collected property map.
+void PropertyBlockBuilder::Add(
+    const UserCollectedProperties& user_collected_properties) {
+  for (const auto& kv : user_collected_properties) {
+    Add(kv.first, kv.second);
+  }
+}
+
+// Serializes all built-in numeric table properties, plus the filter policy
+// name when one is set, into this property block.
+void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
+  Add(TablePropertiesNames::kRawKeySize, props.raw_key_size);
+  Add(TablePropertiesNames::kRawValueSize, props.raw_value_size);
+  Add(TablePropertiesNames::kDataSize, props.data_size);
+  Add(TablePropertiesNames::kIndexSize, props.index_size);
+  Add(TablePropertiesNames::kNumEntries, props.num_entries);
+  Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks);
+  Add(TablePropertiesNames::kFilterSize, props.filter_size);
+  Add(TablePropertiesNames::kFormatVersion, props.format_version);
+  Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len);
+
+  // The filter policy name is optional; omit the entry when unset.
+  if (!props.filter_policy_name.empty()) {
+    Add(TablePropertiesNames::kFilterPolicy,
+        props.filter_policy_name);
+  }
+}
+
+// Flush all buffered properties into the block builder in sorted order and
+// return the finished block contents.
+Slice PropertyBlockBuilder::Finish() {
+  for (const auto& entry : props_) {
+    properties_block_->Add(entry.first, entry.second);
+  }
+  return properties_block_->Finish();
+}
+
+// Emit a warning to the info log when a TablePropertiesCollector callback
+// ("Add" or "Finish") fails for the collector with the given name.
+void LogPropertiesCollectionError(
+    Logger* info_log, const std::string& method, const std::string& name) {
+  assert(method == "Add" || method == "Finish");
+
+  std::string msg(
+      "[Warning] encountered error when calling TablePropertiesCollector::");
+  msg += method;
+  msg += "() with collector name: ";
+  msg += name;
+  Log(info_log, "%s", msg.c_str());
+}
+
+bool NotifyCollectTableCollectorsOnAdd(
+    const Slice& key,
+    const Slice& value,
+    const Options::TablePropertiesCollectors& collectors,
+    Logger* info_log) {
+  bool all_succeeded = true;
+  for (auto collector : collectors) {
+    Status s = collector->Add(key, value);
+    all_succeeded = all_succeeded && s.ok();
+    if (!s.ok()) {
+      LogPropertiesCollectionError(info_log, "Add" /* method */,
+                                   collector->Name());
+    }
+  }
+  return all_succeeded;
+}
+
+bool NotifyCollectTableCollectorsOnFinish(
+    const Options::TablePropertiesCollectors& collectors,
+    Logger* info_log,
+    PropertyBlockBuilder* builder) {
+  bool all_succeeded = true;
+  for (auto collector : collectors) {
+    UserCollectedProperties user_collected_properties;
+    Status s = collector->Finish(&user_collected_properties);
+
+    all_succeeded = all_succeeded && s.ok();
+    if (!s.ok()) {
+      LogPropertiesCollectionError(info_log, "Finish" /* method */,
+                                   collector->Name());
+    } else {
+      builder->Add(user_collected_properties);
+    }
+  }
+
+  return all_succeeded;
+}
+
+// Decodes the properties block referenced by handle_value and parses it into
+// a freshly allocated TableProperties object.  On success *table_properties
+// takes ownership of the new object; on failure the output pointer is left
+// untouched and the temporary object is deleted.
+Status ReadProperties(const Slice &handle_value, RandomAccessFile *file,
+                      const Footer &footer, Env *env, Logger *logger,
+                      TableProperties **table_properties) {
+  assert(table_properties);
+
+  Slice v = handle_value;
+  BlockHandle handle;
+  if (!handle.DecodeFrom(&v).ok()) {
+    return Status::InvalidArgument("Failed to decode properties block handle");
+  }
+
+  BlockContents block_contents;
+  ReadOptions read_options;
+  // Checksum verification is intentionally disabled for this read.
+  read_options.verify_checksums = false;
+  Status s = ReadBlockContents(file, footer, read_options, handle,
+                               &block_contents, env, false);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  Block properties_block(block_contents);
+  std::unique_ptr<Iterator> iter(
+      properties_block.NewIterator(BytewiseComparator()));
+
+  auto new_table_properties = new TableProperties();
+  // All pre-defined properties of type uint64_t; maps each property name to
+  // the field of new_table_properties it should be decoded into.
+  std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
+      {TablePropertiesNames::kDataSize, &new_table_properties->data_size},
+      {TablePropertiesNames::kIndexSize, &new_table_properties->index_size},
+      {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
+      {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size},
+      {TablePropertiesNames::kRawValueSize,
+       &new_table_properties->raw_value_size},
+      {TablePropertiesNames::kNumDataBlocks,
+       &new_table_properties->num_data_blocks},
+      {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries},
+      {TablePropertiesNames::kFormatVersion,
+       &new_table_properties->format_version},
+      {TablePropertiesNames::kFixedKeyLen,
+       &new_table_properties->fixed_key_len}, };
+
+  // last_key is only used to assert the block's sort invariant below.
+  std::string last_key;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    s = iter->status();
+    if (!s.ok()) {
+      break;
+    }
+
+    auto key = iter->key().ToString();
+    // properties block is strictly sorted with no duplicate key.
+    assert(last_key.empty() ||
+           BytewiseComparator()->Compare(key, last_key) > 0);
+    last_key = key;
+
+    auto raw_val = iter->value();
+    auto pos = predefined_uint64_properties.find(key);
+
+    if (pos != predefined_uint64_properties.end()) {
+      // handle predefined rocksdb properties
+      uint64_t val;
+      if (!GetVarint64(&raw_val, &val)) {
+        // skip malformed value: log a warning but keep parsing the rest.
+        auto error_msg =
+          "[Warning] detect malformed value in properties meta-block:"
+          "\tkey: " + key + "\tval: " + raw_val.ToString();
+        Log(logger, "%s", error_msg.c_str());
+        continue;
+      }
+      *(pos->second) = val;
+    } else if (key == TablePropertiesNames::kFilterPolicy) {
+      new_table_properties->filter_policy_name = raw_val.ToString();
+    } else {
+      // handle user-collected properties
+      new_table_properties->user_collected_properties.insert(
+          {key, raw_val.ToString()});
+    }
+  }
+  // Only hand ownership to the caller when iteration finished cleanly.
+  if (s.ok()) {
+    *table_properties = new_table_properties;
+  } else {
+    delete new_table_properties;
+  }
+
+  return s;
+}
+
+Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size,
+                           uint64_t table_magic_number, Env* env,
+                           Logger* info_log, TableProperties** properties) {
+  // -- Read metaindex block
+  Footer footer(table_magic_number);
+  auto s = ReadFooterFromFile(file, file_size, &footer);
+  if (!s.ok()) {
+    return s;
+  }
+
+  auto metaindex_handle = footer.metaindex_handle();
+  BlockContents metaindex_contents;
+  ReadOptions read_options;
+  read_options.verify_checksums = false;
+  s = ReadBlockContents(file, footer, read_options, metaindex_handle,
+                        &metaindex_contents, env, false);
+  if (!s.ok()) {
+    return s;
+  }
+  Block metaindex_block(metaindex_contents);
+  std::unique_ptr<Iterator> meta_iter(
+      metaindex_block.NewIterator(BytewiseComparator()));
+
+  // -- Read property block
+  bool found_properties_block = true;
+  s = SeekToPropertiesBlock(meta_iter.get(), &found_properties_block);
+  if (!s.ok()) {
+    return s;
+  }
+
+  TableProperties table_properties;
+  if (found_properties_block == true) {
+    s = ReadProperties(meta_iter->value(), file, footer, env, info_log,
+                       properties);
+  } else {
+    s = Status::Corruption("Unable to read the property block.");
+    Log(WARN_LEVEL, info_log,
+        "Cannot find Properties block from file.");
+  }
+
+  return s;
+}
+
+}  // namespace rocksdb
diff --git a/table/meta_blocks.h b/table/meta_blocks.h
new file mode 100644 (file)
index 0000000..2ac8903
--- /dev/null
@@ -0,0 +1,127 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "db/builder.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_builder.h"
+
+namespace rocksdb {
+
+class BlockBuilder;
+class BlockHandle;
+class Env;
+class Footer;
+class Logger;
+class RandomAccessFile;
+struct TableProperties;
+
+// An STL-style "less than" functor that performs the bytewise comparison
+// internally.  NOTE: std::map requires a strict weak ordering, which must be
+// irreflexive (comp(a, a) == false); hence the comparison is "< 0", not
+// "<= 0" -- with "<= 0" equal keys would never compare equivalent and the
+// map's invariants (including duplicate-key detection) would break.
+struct BytewiseLessThan {
+  bool operator()(const std::string& key1, const std::string& key2) const {
+    // smaller entries will be placed in front.
+    return comparator->Compare(key1, key2) < 0;
+  }
+
+  const Comparator* comparator = BytewiseComparator();
+};
+
+// When writing to a block that requires entries to be sorted by
+// `BytewiseComparator`, we can buffer the content in `BytewiseSortedMap`
+// before writing it to the store.
+typedef std::map<std::string, std::string, BytewiseLessThan> BytewiseSortedMap;
+
+// Builds the metaindex block: a mapping from metablock name to the encoded
+// BlockHandle where that metablock lives in the file.
+class MetaIndexBuilder {
+ public:
+  MetaIndexBuilder(const MetaIndexBuilder&) = delete;
+  MetaIndexBuilder& operator=(const MetaIndexBuilder&) = delete;
+
+  MetaIndexBuilder();
+
+  // Buffer the (name, handle) pair for the metablock identified by key.
+  void Add(const std::string& key, const BlockHandle& handle);
+
+  // Write all the added key/value pairs to the block and return the contents
+  // of the block.
+  Slice Finish();
+
+ private:
+  // store the sorted key/handle of the metablocks.
+  BytewiseSortedMap meta_block_handles_;
+  std::unique_ptr<BlockBuilder> meta_index_block_;
+};
+
+// Builds the properties block: a sorted name -> value mapping that stores
+// both the built-in table properties and user-collected properties.
+class PropertyBlockBuilder {
+ public:
+  PropertyBlockBuilder(const PropertyBlockBuilder&) = delete;
+  PropertyBlockBuilder& operator=(const PropertyBlockBuilder&) = delete;
+
+  PropertyBlockBuilder();
+
+  // Buffer all built-in properties of props.
+  void AddTableProperty(const TableProperties& props);
+  // Buffer a numeric property; the value is varint-encoded when written.
+  void Add(const std::string& key, uint64_t value);
+  // Buffer a raw string property.
+  void Add(const std::string& key, const std::string& value);
+  // Buffer every entry of a user-collected property map.
+  void Add(const UserCollectedProperties& user_collected_properties);
+
+  // Write all the added entries to the block and return the block contents
+  Slice Finish();
+
+ private:
+  std::unique_ptr<BlockBuilder> properties_block_;
+  // Buffered properties, kept in bytewise-sorted order for the block.
+  BytewiseSortedMap props_;
+};
+
+// If we encounter any error during user-defined statistics collection,
+// we'll write a warning message to the info log.
+void LogPropertiesCollectionError(
+    Logger* info_log, const std::string& method, const std::string& name);
+
+// Utility functions help table builder to trigger batch events for user
+// defined property collectors.
+// Return value indicates if there is any error occurred; if error occurred,
+// the warning message will be logged.
+// NotifyCollectTableCollectorsOnAdd() triggers the `Add` event for all
+// property collectors.
+bool NotifyCollectTableCollectorsOnAdd(
+    const Slice& key,
+    const Slice& value,
+    const Options::TablePropertiesCollectors& collectors,
+    Logger* info_log);
+
+// NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all
+// property collectors. The collected properties will be added to `builder`.
+bool NotifyCollectTableCollectorsOnFinish(
+    const Options::TablePropertiesCollectors& collectors,
+    Logger* info_log,
+    PropertyBlockBuilder* builder);
+
+// Read the properties from the table.
+// @returns a status to indicate if the operation succeeded. On success,
+//          *table_properties will point to a heap-allocated TableProperties
+//          object, otherwise value of `table_properties` will not be modified.
+Status ReadProperties(const Slice &handle_value, RandomAccessFile *file,
+                      const Footer &footer, Env *env, Logger *logger,
+                      TableProperties **table_properties);
+
+// Directly read the properties from the properties block of a plain table.
+// @returns a status to indicate if the operation succeeded. On success,
+//          *table_properties will point to a heap-allocated TableProperties
+//          object, otherwise value of `table_properties` will not be modified.
+Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size,
+                           uint64_t table_magic_number, Env* env,
+                           Logger* info_log, TableProperties** properties);
+
+// Seek to the properties block.
+// If it successfully seeks to the properties block, "is_found" will be
+// set to true.
+extern Status SeekToPropertiesBlock(Iterator* meta_iter, bool* is_found);
+
+}  // namespace rocksdb
diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc
new file mode 100644 (file)
index 0000000..d76f0b2
--- /dev/null
@@ -0,0 +1,211 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#include "table/plain_table_builder.h"
+
+#include <assert.h>
+#include <map>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "table/plain_table_factory.h"
+#include "db/dbformat.h"
+#include "table/block_builder.h"
+#include "table/filter_block.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+namespace {
+
+// A utility that appends @block_contents to @file and records where the
+// block landed:
+//   @offset is advanced only if the append succeeded.
+//   @block_handle receives the offset and size of the written block.
+Status WriteBlock(
+    const Slice& block_contents,
+    WritableFile* file,
+    uint64_t* offset,
+    BlockHandle* block_handle) {
+  block_handle->set_offset(*offset);
+  block_handle->set_size(block_contents.size());
+
+  const Status s = file->Append(block_contents);
+  if (s.ok()) {
+    *offset += block_contents.size();
+  }
+  return s;
+}
+
+}  // namespace
+
+// kPlainTableMagicNumber was picked by running
+//    echo rocksdb.table.plain | sha1sum
+// and taking the leading 64 bits.
+extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull;
+// Magic number of the older plain-table footer format; Finish() below still
+// writes footers with this legacy value.
+extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull;
+
+// Creates a builder that writes to *file.  user_key_len of 0 means
+// variable-length user keys; otherwise every key must be exactly
+// user_key_len bytes (asserted in Add()).
+PlainTableBuilder::PlainTableBuilder(const Options& options,
+                                     WritableFile* file,
+                                     uint32_t user_key_len) :
+    options_(options), file_(file), user_key_len_(user_key_len) {
+  properties_.fixed_key_len = user_key_len;
+
+  // for plain table, we put all the data in a big chunk.
+  properties_.num_data_blocks = 1;
+  // emphasize that currently plain table doesn't have persistent index or
+  // filter block.
+  properties_.index_size = 0;
+  properties_.filter_size = 0;
+  properties_.format_version = 0;
+}
+
+// REQUIRES: Either Finish() or Abandon() has been called.  Does not own or
+// close file_.
+PlainTableBuilder::~PlainTableBuilder() {
+}
+
+// Appends one record directly to the output file.  On-disk layout per
+// record:
+//   [varint32 user_key_size]   (only when keys are variable length)
+//   key bytes                  (user key only for seqno-0 kTypeValue rows,
+//                               full internal key otherwise)
+//   [flag byte for seqno-0 kTypeValue rows][varint32 value_size]
+//   value bytes
+void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
+  // Internal keys end with an 8-byte (sequence, type) trailer.
+  size_t user_key_size = key.size() - 8;
+  assert(user_key_len_ == 0 || user_key_size == user_key_len_);
+
+  if (!IsFixedLength()) {
+    // Write key length
+    char key_size_buf[5];  // tmp buffer for key size as varint32
+    char* ptr = EncodeVarint32(key_size_buf, user_key_size);
+    assert(ptr <= key_size_buf + sizeof(key_size_buf));
+    auto len = ptr - key_size_buf;
+    file_->Append(Slice(key_size_buf, len));
+    offset_ += len;
+  }
+
+  // Write key
+  ParsedInternalKey parsed_key;
+  if (!ParseInternalKey(key, &parsed_key)) {
+    status_ = Status::Corruption(Slice());
+    return;
+  }
+  // For value size as varint32 (up to 5 bytes).
+  // If the row is of value type with seqId 0, flush the special flag together
+  // in this buffer to save one file append call, which takes 1 byte.
+  char value_size_buf[6];
+  size_t value_size_buf_size = 0;
+  if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) {
+    // Store only the user key; the flag byte tells the reader this row's
+    // trailer is (0, kTypeValue).
+    file_->Append(Slice(key.data(), user_key_size));
+    offset_ += user_key_size;
+    value_size_buf[0] = PlainTableFactory::kValueTypeSeqId0;
+    value_size_buf_size = 1;
+  } else {
+    file_->Append(key);
+    offset_ += key.size();
+  }
+
+  // Write value length
+  int value_size = value.size();
+  char* end_ptr =
+      EncodeVarint32(value_size_buf + value_size_buf_size, value_size);
+  assert(end_ptr <= value_size_buf + sizeof(value_size_buf));
+  value_size_buf_size = end_ptr - value_size_buf;
+  file_->Append(Slice(value_size_buf, value_size_buf_size));
+
+  // Write value
+  // NOTE(review): the Append() statuses in this method are not checked; an
+  // I/O error would only surface from a later call -- confirm intentional.
+  file_->Append(value);
+  offset_ += value_size + value_size_buf_size;
+
+  properties_.num_entries++;
+  properties_.raw_key_size += key.size();
+  properties_.raw_value_size += value.size();
+
+  // notify property collectors
+  NotifyCollectTableCollectorsOnAdd(
+      key,
+      value,
+      options_.table_properties_collectors,
+      options_.info_log.get()
+  );
+}
+
+// Returns the sticky error status (set, e.g., when key parsing fails in
+// Add()); OK if no error has occurred.
+Status PlainTableBuilder::status() const { return status_; }
+
+// Finalizes the file by writing the trailing metadata: the properties meta
+// block, the metaindex block that points at it, and the footer (which uses
+// the legacy plain-table magic number).  Stops using file_ after returning.
+Status PlainTableBuilder::Finish() {
+  assert(!closed_);
+  closed_ = true;
+
+  // Everything written so far forms the single data "block".
+  properties_.data_size = offset_;
+
+  // Write the following blocks
+  //  1. [meta block: properties]
+  //  2. [metaindex block]
+  //  3. [footer]
+  MetaIndexBuilder meta_index_builer;
+
+  PropertyBlockBuilder property_block_builder;
+  // -- Add basic properties
+  property_block_builder.AddTableProperty(properties_);
+
+  // -- Add user collected properties
+  NotifyCollectTableCollectorsOnFinish(
+      options_.table_properties_collectors,
+      options_.info_log.get(),
+      &property_block_builder
+  );
+
+  // -- Write property block
+  BlockHandle property_block_handle;
+  auto s = WriteBlock(
+      property_block_builder.Finish(),
+      file_,
+      &offset_,
+      &property_block_handle
+  );
+  if (!s.ok()) {
+    return s;
+  }
+  meta_index_builer.Add(kPropertiesBlock, property_block_handle);
+
+  // -- write metaindex block
+  BlockHandle metaindex_block_handle;
+  s = WriteBlock(
+      meta_index_builer.Finish(),
+      file_,
+      &offset_,
+      &metaindex_block_handle
+  );
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Write Footer
+  // no need to write out new footer if we're using default checksum
+  Footer footer(kLegacyPlainTableMagicNumber);
+  footer.set_metaindex_handle(metaindex_block_handle);
+  // Plain table has no index block; record a null handle.
+  footer.set_index_handle(BlockHandle::NullBlockHandle());
+  std::string footer_encoding;
+  footer.EncodeTo(&footer_encoding);
+  s = file_->Append(footer_encoding);
+  if (s.ok()) {
+    offset_ += footer_encoding.size();
+  }
+
+  return s;
+}
+
+// Marks the builder as done without writing the trailing metadata; the
+// partially written file content is simply left as-is.
+void PlainTableBuilder::Abandon() {
+  closed_ = true;
+}
+
+// Number of records successfully added so far.
+uint64_t PlainTableBuilder::NumEntries() const {
+  return properties_.num_entries;
+}
+
+// Bytes written so far; after a successful Finish() this is the final file
+// size including the metadata blocks and footer.
+uint64_t PlainTableBuilder::FileSize() const {
+  return offset_;
+}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h
new file mode 100644 (file)
index 0000000..7bc388b
--- /dev/null
@@ -0,0 +1,84 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// IndexedTable is a simple table format for UNIT TEST ONLY. It is not built
+// as production quality.
+
+#ifndef ROCKSDB_LITE
+#pragma once
+#include <stdint.h>
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "table/table_builder.h"
+#include "rocksdb/table_properties.h"
+
+namespace rocksdb {
+
+class BlockBuilder;
+class BlockHandle;
+class WritableFile;
+class TableBuilder;
+
+class PlainTableBuilder: public TableBuilder {
+public:
+  // Create a builder that will store the contents of the table it is
+  // building in *file.  Does not close the file.  It is up to the
+  // caller to close the file after calling Finish(). The output file
+  // will be part of level specified by 'level'.  A value of -1 means
+  // that the caller does not know which level the output file will reside.
+  PlainTableBuilder(const Options& options, WritableFile* file,
+                    uint32_t user_key_size);
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~PlainTableBuilder();
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override;
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override;
+
+  // Finish building the table.  Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish() override;
+
+  // Indicate that the contents of this builder should be abandoned.  Stops
+  // using the file passed to the constructor after this function returns.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Abandon() override;
+
+  // Number of calls to Add() so far.
+  uint64_t NumEntries() const override;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  uint64_t FileSize() const override;
+
+private:
+  Options options_;
+  WritableFile* file_;       // Not owned; the caller closes it.
+  uint64_t offset_ = 0;      // Bytes written to file_ so far.
+  Status status_;            // Sticky error status (e.g. key parse failure).
+  TableProperties properties_;
+
+  // 0 means variable-length user keys; otherwise every user key must have
+  // exactly this length.
+  const size_t user_key_len_;
+  bool closed_ = false;  // Either Finish() or Abandon() has been called.
+
+  // A non-zero fixed key length lets Add() omit the per-record key-size
+  // prefix.
+  bool IsFixedLength() const {
+    return user_key_len_ > 0;
+  }
+
+  // No copying allowed
+  PlainTableBuilder(const PlainTableBuilder&) = delete;
+  void operator=(const PlainTableBuilder&) = delete;
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc
new file mode 100644 (file)
index 0000000..4e84468
--- /dev/null
@@ -0,0 +1,50 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#include "table/plain_table_factory.h"
+
+#include <memory>
+#include <stdint.h>
+#include "db/dbformat.h"
+#include "table/plain_table_builder.h"
+#include "table/plain_table_reader.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+// Opens a plain-table file for reading, forwarding the factory's tuning
+// parameters (bloom bits per key, hash table ratio, index sparseness) to
+// the reader.
+Status PlainTableFactory::NewTableReader(const Options& options,
+                                         const EnvOptions& soptions,
+                                         const InternalKeyComparator& icomp,
+                                         unique_ptr<RandomAccessFile>&& file,
+                                         uint64_t file_size,
+                                         unique_ptr<TableReader>* table) const {
+  return PlainTableReader::Open(options, soptions, icomp, std::move(file),
+                                file_size, table, bloom_bits_per_key_,
+                                hash_table_ratio_, index_sparseness_);
+}
+
+// Creates a PlainTableBuilder for writing.  The compression_type argument
+// is not used by the plain-table builder.
+TableBuilder* PlainTableFactory::NewTableBuilder(
+    const Options& options, const InternalKeyComparator& internal_comparator,
+    WritableFile* file, CompressionType compression_type) const {
+  return new PlainTableBuilder(options, file, user_key_len_);
+}
+
+// Public factory function; see PlainTableFactory's constructor for the
+// meaning of each parameter.  The caller owns the returned object.
+extern TableFactory* NewPlainTableFactory(uint32_t user_key_len,
+                                          int bloom_bits_per_key,
+                                          double hash_table_ratio,
+                                          size_t index_sparseness) {
+  return new PlainTableFactory(user_key_len, bloom_bits_per_key,
+                               hash_table_ratio, index_sparseness);
+}
+
+// Same as NewPlainTableFactory() but with hash_table_ratio fixed to 0,
+// i.e. the prefix hash table is skipped in favor of binary search only.
+extern TableFactory* NewTotalOrderPlainTableFactory(uint32_t user_key_len,
+                                                    int bloom_bits_per_key,
+                                                    size_t index_sparseness) {
+  return new PlainTableFactory(user_key_len, bloom_bits_per_key, 0,
+                               index_sparseness);
+}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h
new file mode 100644 (file)
index 0000000..84af22f
--- /dev/null
@@ -0,0 +1,88 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <memory>
+#include <stdint.h>
+
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+
+struct Options;
+struct EnvOptions;
+
+using std::unique_ptr;
+class Status;
+class RandomAccessFile;
+class WritableFile;
+class Table;
+class TableBuilder;
+
+// IndexedTable requires fixed length key, configured as a constructor
+// parameter of the factory class. Output file format:
+// +-------------+-----------------+
+// | version     | user_key_length |
+// +------------++------------------------------+  <= key1 offset
+// | [key_size] |  key1       | value_size  |   |
+// +------------+-------------+-------------+   |
+// | value1                                     |
+// |                                            |
+// +----------------------------------------+---+  <= key2 offset
+// | [key_size] |  key2       | value_size  |   |
+// +------------+-------------+-------------+   |
+// | value2                                     |
+// |                                            |
+// |        ......                              |
+// +-----------------+--------------------------+
+// If user_key_length = kPlainTableVariableLength, it means the key is variable
+// length, there will be an extra field for key size encoded before every key.
+class PlainTableFactory : public TableFactory {
+ public:
+  ~PlainTableFactory() {}
+  // user_key_size is the length of the user key. If it is set to be
+  // kPlainTableVariableLength, then it means variable length. Otherwise, all
+  // the keys need to have the fixed length of this value. bloom_bits_per_key
+  // is the number of bits used for the bloom filter per key.
+  // hash_table_ratio is the desired utilization of the hash table used for
+  // prefix hashing.
+  // hash_table_ratio = number of prefixes / #buckets in the hash table
+  // hash_table_ratio = 0 means skip hash table but only relying on binary
+  // search.
+  // index_sparseness determines index interval for keys
+  // inside the same prefix. It will be the maximum number of linear search
+  // required after hash and binary search.
+  // index_sparseness = 0 means index for every key.
+  explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength,
+                             int bloom_bits_per_key = 0,
+                             double hash_table_ratio = 0.75,
+                             size_t index_sparseness = 16)
+      : user_key_len_(user_key_len),
+        bloom_bits_per_key_(bloom_bits_per_key),
+        hash_table_ratio_(hash_table_ratio),
+        index_sparseness_(index_sparseness) {}
+  const char* Name() const override { return "PlainTable"; }
+  Status NewTableReader(const Options& options, const EnvOptions& soptions,
+                        const InternalKeyComparator& internal_comparator,
+                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                        unique_ptr<TableReader>* table) const override;
+  TableBuilder* NewTableBuilder(const Options& options,
+                                const InternalKeyComparator& icomparator,
+                                WritableFile* file,
+                                CompressionType compression_type) const
+      override;
+
+  // Flag byte written by PlainTableBuilder::Add() for rows with sequence
+  // id 0 and value type.
+  // NOTE(review): 0xFF does not fit in a signed char (implementation-defined
+  // conversion); consider unsigned char -- confirm against the reader's
+  // comparison logic.
+  static const char kValueTypeSeqId0 = 0xFF;
+
+ private:
+  uint32_t user_key_len_;
+  int bloom_bits_per_key_;
+  double hash_table_ratio_;
+  size_t index_sparseness_;
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc
new file mode 100644 (file)
index 0000000..1962017
--- /dev/null
@@ -0,0 +1,756 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#include "table/plain_table_reader.h"
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+
+#include "table/block.h"
+#include "table/filter_block.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/two_level_iterator.h"
+#include "table/plain_table_factory.h"
+
+#include "util/coding.h"
+#include "util/dynamic_bloom.h"
+#include "util/hash.h"
+#include "util/histogram.h"
+#include "util/murmurhash.h"
+#include "util/perf_context_imp.h"
+#include "util/stop_watch.h"
+
+
+namespace rocksdb {
+
+namespace {
+
+// Hash a slice with the fixed seed 397. The same function is used for both
+// prefix hashing (bucket selection) and full-key bloom hashing, so the seed
+// must stay consistent across the reader.
+inline uint32_t GetSliceHash(const Slice& s) {
+  return Hash(s.data(), s.size(), 397) ;
+}
+
+// Map a 32-bit hash onto one of `num_buckets` hash-table buckets.
+inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
+  return hash % num_buckets;
+}
+
+// Safely getting a uint32_t element from a char array, where, starting from
+// `base`, every 4 bytes are considered as a fixed 32 bit integer.
+inline uint32_t GetFixed32Element(const char* base, size_t offset) {
+  return DecodeFixed32(base + offset * sizeof(uint32_t));
+}
+
+}  // namespace
+
+// Iterator to iterate IndexedTable
+class PlainTableIterator : public Iterator {
+ public:
+  explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek);
+  ~PlainTableIterator();
+
+  bool Valid() const;
+
+  void SeekToFirst();
+
+  void SeekToLast();
+
+  void Seek(const Slice& target);
+
+  void Next();
+
+  void Prev();
+
+  Slice key() const;
+
+  Slice value() const;
+
+  Status status() const;
+
+ private:
+  PlainTableReader* table_;
+  bool use_prefix_seek_;
+  uint32_t offset_;
+  uint32_t next_offset_;
+  IterKey key_;
+  Slice value_;
+  Status status_;
+  // No copying allowed
+  PlainTableIterator(const PlainTableIterator&) = delete;
+  void operator=(const Iterator&) = delete;
+};
+
+extern const uint64_t kPlainTableMagicNumber;
+// Constructs a reader over an already-opened plain table file.
+// `table_properties` must be non-null: data_end_offset_ and user_key_len_
+// are read from it here. Ownership of the properties object is NOT taken
+// yet (table_properties_ starts as nullptr); PopulateIndex() takes it later.
+PlainTableReader::PlainTableReader(
+    const Options& options, unique_ptr<RandomAccessFile>&& file,
+    const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
+    uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio,
+    size_t index_sparseness, const TableProperties* table_properties)
+    : options_(options),
+      soptions_(storage_options),
+      file_(std::move(file)),
+      internal_comparator_(icomparator),
+      file_size_(file_size),
+      kHashTableRatio(hash_table_ratio),
+      kBloomBitsPerKey(bloom_bits_per_key),
+      kIndexIntervalForSamePrefixKeys(index_sparseness),
+      table_properties_(nullptr),
+      data_end_offset_(table_properties->data_size),
+      user_key_len_(table_properties->fixed_key_len) {
+  // hash_table_ratio == 0 selects binary-search-only mode; negative values
+  // are programmer error.
+  assert(kHashTableRatio >= 0.0);
+}
+
+// All members are RAII-managed (unique_ptr / shared_ptr / vector), so the
+// compiler-generated destructor is sufficient.
+PlainTableReader::~PlainTableReader() = default;
+
+// Opens a plain table file and returns a fully indexed TableReader through
+// `table_reader`. Reads the table properties from the file, constructs the
+// reader, then builds the in-memory index and bloom filter. Requires
+// options.allow_mmap_reads because the reader keeps Slices pointing into the
+// mmapped file data.
+Status PlainTableReader::Open(
+    const Options& options, const EnvOptions& soptions,
+    const InternalKeyComparator& internal_comparator,
+    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+    unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
+    double hash_table_ratio, size_t index_sparseness) {
+  assert(options.allow_mmap_reads);
+
+  // File offsets are stored in 32 bits (top bit reserved as a flag), so
+  // files of kMaxFileSize (2GB) or more cannot be addressed.
+  if (file_size > kMaxFileSize) {
+    return Status::NotSupported("File is too large for PlainTableReader!");
+  }
+
+  TableProperties* props = nullptr;
+  auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+                               options.env, options.info_log.get(), &props);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
+      options, std::move(file), soptions, internal_comparator, file_size,
+      bloom_bits_per_key, hash_table_ratio, index_sparseness, props));
+
+  // -- Populate Index
+  // PopulateIndex takes ownership of props, so the properties object is
+  // released by the reader even when indexing fails below.
+  s = new_reader->PopulateIndex(props);
+  if (!s.ok()) {
+    return s;
+  }
+
+  *table_reader = std::move(new_reader);
+  return s;
+}
+
+// Intentionally a no-op: plain table reads the mmapped file directly and has
+// no readahead or caching state to adjust for compaction-style scans.
+void PlainTableReader::SetupForCompaction() {
+}
+
+// Creates a new iterator over this table. Prefix seek is available exactly
+// when a prefix extractor was configured in the options.
+Iterator* PlainTableReader::NewIterator(const ReadOptions& options) {
+  const bool prefix_seek_enabled = options_.prefix_extractor != nullptr;
+  return new PlainTableIterator(this, prefix_seek_enabled);
+}
+
+// One raw index entry produced while scanning the file; later linked into
+// per-bucket lists by BucketizeIndexesAndFillBloom().
+struct PlainTableReader::IndexRecord {
+  uint32_t hash; // hash of the prefix
+  uint32_t offset; // offset of a row
+  IndexRecord* next;
+};
+
+// Helper class to track all the index records
+// Records are appended into fixed-size heap arrays ("groups") so that growth
+// never relocates existing records; At() can therefore return stable pointers
+// that are later chained into per-bucket linked lists.
+class PlainTableReader::IndexRecordList {
+ public:
+  explicit IndexRecordList(size_t num_records_per_group)
+      : kNumRecordsPerGroup(num_records_per_group),
+        current_group_(nullptr),
+        // Start "full" so the first AddRecord() allocates the first group.
+        num_records_in_current_group_(num_records_per_group) {}
+
+  ~IndexRecordList() {
+    for (size_t i = 0; i < groups_.size(); i++) {
+      delete[] groups_[i];
+    }
+  }
+
+  // Appends one (hash, file offset) record; note the murmur_t hash is
+  // narrowed to 32 bits on store.
+  void AddRecord(murmur_t hash, uint32_t offset) {
+    if (num_records_in_current_group_ == kNumRecordsPerGroup) {
+      current_group_ = AllocateNewGroup();
+      num_records_in_current_group_ = 0;
+    }
+    auto& new_record = current_group_[num_records_in_current_group_++];
+    new_record.hash = hash;
+    new_record.offset = offset;
+    new_record.next = nullptr;
+  }
+
+  // Total number of records added. NOTE(review): when no record was ever
+  // added, groups_.size() is 0 and this expression relies on unsigned
+  // wrap-around to yield 0 -- correct, but fragile if modified.
+  size_t GetNumRecords() const {
+    return (groups_.size() - 1) * kNumRecordsPerGroup +
+           num_records_in_current_group_;
+  }
+  // Stable pointer to the index-th record (records never move once added).
+  IndexRecord* At(size_t index) {
+    return &(groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]);
+  }
+
+ private:
+  IndexRecord* AllocateNewGroup() {
+    IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
+    groups_.push_back(result);
+    return result;
+  }
+
+  // Each group in `groups_` contains fix-sized records (determined by
+  // kNumRecordsPerGroup). Which can help us minimize the cost if resizing
+  // occurs.
+  const size_t kNumRecordsPerGroup;
+  IndexRecord* current_group_;
+  // List of arrays allocated
+  std::vector<IndexRecord*> groups_;
+  size_t num_records_in_current_group_;
+};
+
+// Scans every row in the data region once, producing one IndexRecord
+// (prefix hash, row offset) for every kIndexIntervalForSamePrefixKeys-th key
+// of each prefix, and counting distinct prefixes into *num_prefixes.
+// If bloom_ is already allocated (total order mode), every full user key's
+// hash is added to it during the scan.
+Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
+                                                 int* num_prefixes) const {
+  Slice prev_key_prefix_slice;
+  uint32_t prev_key_prefix_hash = 0;
+  uint32_t pos = data_start_offset_;
+  int num_keys_per_prefix = 0;
+  bool is_first_record = true;
+  HistogramImpl keys_per_prefix_hist;
+  // Need map to be ordered to make sure sub indexes generated
+  // are in order.
+
+  *num_prefixes = 0;
+  while (pos < data_end_offset_) {
+    // Remember the row's starting offset before Next() advances pos.
+    uint32_t key_offset = pos;
+    ParsedInternalKey key;
+    Slice value_slice;
+    Status s = Next(&pos, &key, &value_slice);
+    if (!s.ok()) {
+      return s;
+    }
+    if (bloom_) {
+      // total order mode and bloom filter is enabled.
+      bloom_->AddHash(GetSliceHash(key.user_key));
+    }
+    Slice key_prefix_slice = GetPrefix(key);
+
+    // A new prefix starts here: count it and reset the per-prefix counter.
+    if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
+      ++(*num_prefixes);
+      if (!is_first_record) {
+        keys_per_prefix_hist.Add(num_keys_per_prefix);
+      }
+      num_keys_per_prefix = 0;
+      prev_key_prefix_slice = key_prefix_slice;
+      prev_key_prefix_hash = GetSliceHash(key_prefix_slice);
+    }
+
+    if (kIndexIntervalForSamePrefixKeys == 0 ||
+        num_keys_per_prefix++ % kIndexIntervalForSamePrefixKeys == 0) {
+      // Add an index key for every kIndexIntervalForSamePrefixKeys keys
+      record_list->AddRecord(prev_key_prefix_hash, key_offset);
+    }
+    is_first_record = false;
+  }
+
+  // Account for the last prefix, which the loop above never flushes.
+  keys_per_prefix_hist.Add(num_keys_per_prefix);
+  Log(options_.info_log, "Number of Keys per prefix Histogram: %s",
+      keys_per_prefix_hist.ToString().c_str());
+
+  return Status::OK();
+}
+
+// Sizes and allocates the hash-table index (and, in prefix mode, the bloom
+// filter) based on the number of distinct prefixes observed in the file.
+void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
+  index_.reset();
+
+  if (options_.prefix_extractor.get() != nullptr) {
+    // NOTE(review): num_prefixes * kBloomBitsPerKey is computed in 32 bits;
+    // very large files with high bits-per-key could overflow -- confirm
+    // acceptable bounds.
+    uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey;
+    if (bloom_total_bits > 0) {
+      bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality));
+    }
+  }
+
+  if (options_.prefix_extractor.get() == nullptr || kHashTableRatio <= 0) {
+    // Fall back to pure binary search if the user fails to specify a prefix
+    // extractor.
+    index_size_ = 1;
+  } else {
+    // Size the table so its load factor is approximately kHashTableRatio.
+    double hash_table_size_multipier = 1.0 / kHashTableRatio;
+    index_size_ = num_prefixes * hash_table_size_multipier + 1;
+  }
+  index_.reset(new uint32_t[index_size_]);
+}
+
+// Distributes all index records into hash buckets, building a reversed
+// linked list per bucket, and (in prefix mode) adds each distinct prefix
+// hash to the bloom filter. Returns the number of bytes the sub-index will
+// need: a varint count plus one fixed 32-bit offset per entry, but only for
+// buckets holding more than one entry.
+size_t PlainTableReader::BucketizeIndexesAndFillBloom(
+    IndexRecordList* record_list, std::vector<IndexRecord*>* hash_to_offsets,
+    std::vector<uint32_t>* entries_per_bucket) {
+  bool first = true;
+  uint32_t prev_hash = 0;
+  size_t num_records = record_list->GetNumRecords();
+  for (size_t i = 0; i < num_records; i++) {
+    IndexRecord* index_record = record_list->At(i);
+    uint32_t cur_hash = index_record->hash;
+    // Records arrive grouped by prefix, so a hash change marks a new prefix;
+    // add each prefix hash to the bloom filter exactly once.
+    if (first || prev_hash != cur_hash) {
+      prev_hash = cur_hash;
+      first = false;
+      if (bloom_ && !IsTotalOrderMode()) {
+        bloom_->AddHash(cur_hash);
+      }
+    }
+    uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
+    // Push-front onto the bucket's list, yielding reverse insertion order.
+    IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
+    index_record->next = prev_bucket_head;
+    (*hash_to_offsets)[bucket] = index_record;
+    (*entries_per_bucket)[bucket]++;
+  }
+  size_t sub_index_size = 0;
+  for (auto entry_count : *entries_per_bucket) {
+    if (entry_count <= 1) {
+      continue;
+    }
+    // Only buckets with more than 1 entry will have subindex.
+    sub_index_size += VarintLength(entry_count);
+    // total bytes needed to store these entries' in-file offsets.
+    sub_index_size += entry_count * kOffsetLen;
+  }
+  return sub_index_size;
+}
+
+// Writes the final index_ and sub_index_ structures from the bucketized
+// records. Each bucket entry is either: data_end_offset_ (empty bucket), a
+// direct file offset (single-entry bucket), or a sub_index_ offset with the
+// kSubIndexMask flag bit set (multi-entry bucket needing binary search).
+void PlainTableReader::FillIndexes(
+    const size_t kSubIndexSize,
+    const std::vector<IndexRecord*>& hash_to_offsets,
+    const std::vector<uint32_t>& entries_per_bucket) {
+  Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
+      kSubIndexSize);
+  sub_index_.reset(new char[kSubIndexSize]);
+  size_t sub_index_offset = 0;
+  for (int i = 0; i < index_size_; i++) {
+    uint32_t num_keys_for_bucket = entries_per_bucket[i];
+    switch (num_keys_for_bucket) {
+    case 0:
+      // No key for bucket
+      index_[i] = data_end_offset_;
+      break;
+    case 1:
+      // point directly to the file offset
+      index_[i] = hash_to_offsets[i]->offset;
+      break;
+    default:
+      // point to second level indexes.
+      index_[i] = sub_index_offset | kSubIndexMask;
+      // Write the entry count as a varint32 header.
+      char* prev_ptr = &sub_index_[sub_index_offset];
+      char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
+      sub_index_offset += (cur_ptr - prev_ptr);
+      char* sub_index_pos = &sub_index_[sub_index_offset];
+      IndexRecord* record = hash_to_offsets[i];
+      int j;
+      // The bucket list is in reverse insertion order, so fill the offset
+      // array back-to-front to restore ascending file-offset order.
+      for (j = num_keys_for_bucket - 1; j >= 0 && record;
+           j--, record = record->next) {
+        EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
+      }
+      assert(j == -1 && record == nullptr);
+      sub_index_offset += kOffsetLen * num_keys_for_bucket;
+      assert(sub_index_offset <= kSubIndexSize);
+      break;
+    }
+  }
+  assert(sub_index_offset == kSubIndexSize);
+
+  Log(options_.info_log, "hash table size: %d, suffix_map length %zu",
+      index_size_, kSubIndexSize);
+}
+
+// Builds the full in-memory index for the table: mmaps the file data, scans
+// all rows to collect index records, allocates the hash table and bloom
+// filter, then bucketizes records and fills the final index structures.
+// Takes ownership of `props`. Must be called before any query.
+Status PlainTableReader::PopulateIndex(TableProperties* props) {
+  assert(props != nullptr);
+  table_properties_.reset(props);
+
+  // options.prefix_extractor is required for a hash-based look-up.
+  if (options_.prefix_extractor.get() == nullptr && kHashTableRatio != 0) {
+    return Status::NotSupported(
+        "PlainTable requires a prefix extractor enable prefix hash mode.");
+  }
+
+  // Get mmapped memory to file_data_.
+  Status s = file_->Read(0, file_size_, &file_data_, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+
+  IndexRecordList record_list(kRecordsPerGroup);
+  // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
+  // for a prefix (starting from the first one), generate a record of (hash,
+  // offset) and append it to IndexRecordList, which is a data structure created
+  // to store them.
+  int num_prefixes;
+
+  // Allocate bloom filter here for total order mode.
+  if (IsTotalOrderMode()) {
+    uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey;
+    if (num_bloom_bits > 0) {
+      bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality));
+    }
+  }
+
+  s = PopulateIndexRecordList(&record_list, &num_prefixes);
+  if (!s.ok()) {
+    return s;
+  }
+  // Calculated hash table and bloom filter size and allocate memory for indexes
+  // and bloom filter based on the number of prefixes.
+  AllocateIndexAndBloom(num_prefixes);
+
+  // Bucketize all the index records to a temp data structure, in which for
+  // each bucket, we generate a linked list of IndexRecord, in reversed order.
+  std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
+  std::vector<uint32_t> entries_per_bucket(index_size_, 0);
+  size_t sub_index_size_needed = BucketizeIndexesAndFillBloom(
+      &record_list, &hash_to_offsets, &entries_per_bucket);
+  // From the temp data structure, populate indexes.
+  FillIndexes(sub_index_size_needed, hash_to_offsets, entries_per_bucket);
+
+  // Fill two table properties.
+  // TODO(sdong): after we have the feature of storing index in file, this
+  // properties need to be populated to index_size instead.
+  props->user_collected_properties["plain_table_hash_table_size"] =
+      std::to_string(index_size_ * 4U);
+  props->user_collected_properties["plain_table_sub_index_size"] =
+      std::to_string(sub_index_size_needed);
+
+  return Status::OK();
+}
+
+Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
+                                   uint32_t prefix_hash, bool& prefix_matched,
+                                   uint32_t* offset) const {
+  prefix_matched = false;
+  int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
+  uint32_t bucket_value = index_[bucket];
+  if (bucket_value == data_end_offset_) {
+    *offset = data_end_offset_;
+    return Status::OK();
+  } else if ((bucket_value & kSubIndexMask) == 0) {
+    // point directly to the file
+    *offset = bucket_value;
+    return Status::OK();
+  }
+
+  // point to sub-index, need to do a binary search
+  uint32_t low = 0;
+  uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask;
+
+  const char* index_ptr = &sub_index_[prefix_index_offset];
+  uint32_t upper_bound = 0;
+  const char* base_ptr = GetVarint32Ptr(index_ptr, index_ptr + 4, &upper_bound);
+  uint32_t high = upper_bound;
+  ParsedInternalKey mid_key;
+  ParsedInternalKey parsed_target;
+  if (!ParseInternalKey(target, &parsed_target)) {
+    return Status::Corruption(Slice());
+  }
+
+  // The key is between [low, high). Do a binary search between it.
+  while (high - low > 1) {
+    uint32_t mid = (high + low) / 2;
+    uint32_t file_offset = GetFixed32Element(base_ptr, mid);
+    size_t tmp;
+    Status s = ReadKey(file_data_.data() + file_offset, &mid_key, &tmp);
+    if (!s.ok()) {
+      return s;
+    }
+    int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
+    if (cmp_result < 0) {
+      low = mid;
+    } else {
+      if (cmp_result == 0) {
+        // Happen to have found the exact key or target is smaller than the
+        // first key after base_offset.
+        prefix_matched = true;
+        *offset = file_offset;
+        return Status::OK();
+      } else {
+        high = mid;
+      }
+    }
+  }
+  // Both of the key at the position low or low+1 could share the same
+  // prefix as target. We need to rule out one of them to avoid to go
+  // to the wrong prefix.
+  ParsedInternalKey low_key;
+  size_t tmp;
+  uint32_t low_key_offset = GetFixed32Element(base_ptr, low);
+  Status s = ReadKey(file_data_.data() + low_key_offset, &low_key, &tmp);
+  if (GetPrefix(low_key) == prefix) {
+    prefix_matched = true;
+    *offset = low_key_offset;
+  } else if (low + 1 < upper_bound) {
+    // There is possible a next prefix, return it
+    prefix_matched = false;
+    *offset = GetFixed32Element(base_ptr, low + 1);
+  } else {
+    // target is larger than a key of the last prefix in this bucket
+    // but with a different prefix. Key does not exist.
+    *offset = data_end_offset_;
+  }
+  return Status::OK();
+}
+
+// Returns false only when the bloom filter definitely excludes the hash.
+// Without a filter configured, everything may match.
+bool PlainTableReader::MatchBloom(uint32_t hash) const {
+  if (!bloom_) {
+    return true;
+  }
+  return bloom_->MayContainHash(hash);
+}
+
+// Prefix of a parsed internal key: delegate to the user-key based helper.
+Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) const {
+  const Slice& user_key = target.user_key;
+  return GetPrefixFromUserKey(user_key);
+}
+
+Status PlainTableReader::ReadKey(const char* start, ParsedInternalKey* key,
+                                 size_t* bytes_read) const {
+  const char* key_ptr = nullptr;
+  *bytes_read = 0;
+  size_t user_key_size = 0;
+  if (IsFixedLength()) {
+    user_key_size = user_key_len_;
+    key_ptr = start;
+  } else {
+    uint32_t tmp_size = 0;
+    key_ptr =
+        GetVarint32Ptr(start, file_data_.data() + data_end_offset_, &tmp_size);
+    if (key_ptr == nullptr) {
+      return Status::Corruption(
+          "Unexpected EOF when reading the next key's size");
+    }
+    user_key_size = (size_t)tmp_size;
+    *bytes_read = key_ptr - start;
+  }
+  if (key_ptr + user_key_size + 1 >= file_data_.data() + data_end_offset_) {
+    return Status::Corruption("Unexpected EOF when reading the next key");
+  }
+
+  if (*(key_ptr + user_key_size) == PlainTableFactory::kValueTypeSeqId0) {
+    // Special encoding for the row with seqID=0
+    key->user_key = Slice(key_ptr, user_key_size);
+    key->sequence = 0;
+    key->type = kTypeValue;
+    *bytes_read += user_key_size + 1;
+  } else {
+    if (start + user_key_size + 8 >= file_data_.data() + data_end_offset_) {
+      return Status::Corruption(
+          "Unexpected EOF when reading internal bytes of the next key");
+    }
+    if (!ParseInternalKey(Slice(key_ptr, user_key_size + 8), key)) {
+      return Status::Corruption(
+          Slice("Incorrect value type found when reading the next key"));
+    }
+    *bytes_read += user_key_size + 8;
+  }
+
+  return Status::OK();
+}
+
+// Reads the key/value pair at *offset and advances *offset to the next row.
+// *offset == data_end_offset_ is the end-of-data sentinel and is returned
+// unchanged; offsets beyond the data region are corruption.
+Status PlainTableReader::Next(uint32_t* offset, ParsedInternalKey* key,
+                              Slice* value) const {
+  if (*offset == data_end_offset_) {
+    *offset = data_end_offset_;
+    return Status::OK();
+  }
+
+  if (*offset > data_end_offset_) {
+    return Status::Corruption("Offset is out of file size");
+  }
+
+  const char* start = file_data_.data() + *offset;
+  size_t bytes_for_key;
+  Status s = ReadKey(start, key, &bytes_for_key);
+  if (!s.ok()) {
+    return s;
+  }
+  // The value follows the key as a varint32 length plus the value bytes.
+  uint32_t value_size;
+  const char* value_ptr = GetVarint32Ptr(
+      start + bytes_for_key, file_data_.data() + data_end_offset_, &value_size);
+  if (value_ptr == nullptr) {
+    return Status::Corruption(
+        "Unexpected EOF when reading the next value's size.");
+  }
+  *offset = *offset + (value_ptr - start) + value_size;
+  if (*offset > data_end_offset_) {
+    return Status::Corruption("Unexpected EOF when reading the next value. ");
+  }
+  // The slice points into the mmapped file; no copy is made.
+  *value = Slice(value_ptr, value_size);
+
+  return Status::OK();
+}
+
+// Point lookup: checks the bloom filter, finds a starting offset via the
+// index, then scans forward calling `saver` on every entry at or after the
+// target until saver returns false or the prefix no longer matches.
+Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
+                             void* arg,
+                             bool (*saver)(void*, const ParsedInternalKey&,
+                                           const Slice&, bool),
+                             void (*mark_key_may_exist)(void*)) {
+  // Check bloom filter first.
+  Slice prefix_slice;
+  uint32_t prefix_hash;
+  if (IsTotalOrderMode()) {
+    // Match whole user key for bloom filter check.
+    if (!MatchBloom(GetSliceHash(GetUserKey(target)))) {
+      return Status::OK();
+    }
+    // in total order mode, there is only one bucket 0, and we always use empty
+    // prefix.
+    prefix_slice = Slice();
+    prefix_hash = 0;
+  } else {
+    prefix_slice = GetPrefix(target);
+    prefix_hash = GetSliceHash(prefix_slice);
+    if (!MatchBloom(prefix_hash)) {
+      return Status::OK();
+    }
+  }
+  uint32_t offset;
+  bool prefix_match;
+  Status s =
+      GetOffset(target, prefix_slice, prefix_hash, prefix_match, &offset);
+  if (!s.ok()) {
+    return s;
+  }
+  ParsedInternalKey found_key;
+  ParsedInternalKey parsed_target;
+  if (!ParseInternalKey(target, &parsed_target)) {
+    return Status::Corruption(Slice());
+  }
+
+  Slice found_value;
+  while (offset < data_end_offset_) {
+    // Note: this `s` intentionally shadows the outer Status above.
+    Status s = Next(&offset, &found_key, &found_value);
+    if (!s.ok()) {
+      return s;
+    }
+    if (!prefix_match) {
+      // Need to verify prefix for the first key found if it is not yet
+      // checked.
+      if (GetPrefix(found_key) != prefix_slice) {
+        return Status::OK();
+      }
+      prefix_match = true;
+    }
+    if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
+      if (!(*saver)(arg, found_key, found_value, true)) {
+        break;
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// PlainTable has no block structure to estimate a key's position from, so
+// every key reports offset 0. The parameter name is commented out because it
+// is deliberately unused.
+uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/) {
+  return 0;
+}
+
+// The iterator starts out invalid: both offsets are parked at the table's
+// end-of-data sentinel until a Seek*() positions it.
+PlainTableIterator::PlainTableIterator(PlainTableReader* table,
+                                       bool use_prefix_seek)
+    : table_(table),
+      use_prefix_seek_(use_prefix_seek),
+      offset_(table->data_end_offset_),
+      next_offset_(table->data_end_offset_) {}
+
+// No owned resources beyond RAII members; default destruction suffices.
+PlainTableIterator::~PlainTableIterator() = default;
+
+// The iterator is positioned on a real entry only while offset_ lies inside
+// the table's data region [data_start_offset_, data_end_offset_).
+bool PlainTableIterator::Valid() const {
+  return offset_ >= table_->data_start_offset_ &&
+         offset_ < table_->data_end_offset_;
+}
+
+// Position at the first row of the data region, then let Next() decode it.
+void PlainTableIterator::SeekToFirst() {
+  next_offset_ = table_->data_start_offset_;
+  if (next_offset_ < table_->data_end_offset_) {
+    Next();
+  } else {
+    // Empty data region: park both offsets at end so Valid() is false.
+    next_offset_ = offset_ = table_->data_end_offset_;
+  }
+}
+
+// Backward positioning is not implemented for PlainTable; asserts in debug
+// builds and records a NotSupported status in release builds.
+void PlainTableIterator::SeekToLast() {
+  assert(false);
+  status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable");
+}
+
+// Positions the iterator at the first entry >= target. Uses the prefix hash
+// index (and bloom filter) unless the table is in total order mode, then
+// scans linearly within the prefix to find the exact position.
+void PlainTableIterator::Seek(const Slice& target) {
+  // If the user doesn't set prefix seek option and we are not able to do a
+  // total Seek(). assert failure.
+  if (!use_prefix_seek_ && table_->index_size_ > 1) {
+    assert(false);
+    status_ = Status::NotSupported(
+        "PlainTable cannot issue non-prefix seek unless in total order mode.");
+    offset_ = next_offset_ = table_->data_end_offset_;
+    return;
+  }
+
+  Slice prefix_slice = table_->GetPrefix(target);
+  uint32_t prefix_hash = 0;
+  // Bloom filter is ignored in total-order mode.
+  if (!table_->IsTotalOrderMode()) {
+    prefix_hash = GetSliceHash(prefix_slice);
+    if (!table_->MatchBloom(prefix_hash)) {
+      // Bloom filter says the prefix definitely isn't present.
+      offset_ = next_offset_ = table_->data_end_offset_;
+      return;
+    }
+  }
+  bool prefix_match;
+  status_ = table_->GetOffset(target, prefix_slice, prefix_hash, prefix_match,
+                              &next_offset_);
+  if (!status_.ok()) {
+    offset_ = next_offset_ = table_->data_end_offset_;
+    return;
+  }
+
+  if (next_offset_ < table_-> data_end_offset_) {
+    // Linear scan forward until the first key >= target within the prefix.
+    for (Next(); status_.ok() && Valid(); Next()) {
+      if (!prefix_match) {
+        // Need to verify the first key's prefix
+        if (table_->GetPrefix(key()) != prefix_slice) {
+          offset_ = next_offset_ = table_->data_end_offset_;
+          break;
+        }
+        prefix_match = true;
+      }
+      if (table_->internal_comparator_.Compare(key(), target) >= 0) {
+        break;
+      }
+    }
+  } else {
+    offset_ = table_->data_end_offset_;
+  }
+}
+
+// Advances to the entry at next_offset_, copying its key into key_ and
+// pointing value_ into the mmapped file. On decode failure the iterator is
+// invalidated and the error kept in status_.
+// Fix: removed the unused local `Slice tmp_slice`.
+void PlainTableIterator::Next() {
+  offset_ = next_offset_;
+  if (offset_ < table_->data_end_offset_) {
+    ParsedInternalKey parsed_key;
+    status_ = table_->Next(&next_offset_, &parsed_key, &value_);
+    if (status_.ok()) {
+      // Make a copy in this case. TODO optimize.
+      key_.SetInternalKey(parsed_key);
+    } else {
+      offset_ = next_offset_ = table_->data_end_offset_;
+    }
+  }
+}
+
+// Backward iteration is not implemented. Mirror SeekToLast(): besides the
+// debug assert, record a NotSupported status so release builds report the
+// misuse instead of silently doing nothing.
+void PlainTableIterator::Prev() {
+  assert(false);
+  status_ = Status::NotSupported("Prev() is not supported in PlainTable");
+}
+
+// Current internal key; valid only while Valid() is true. The returned slice
+// points at the copy stored in key_ by Next().
+Slice PlainTableIterator::key() const {
+  assert(Valid());
+  return key_.GetKey();
+}
+
+// Current value; valid only while Valid() is true. The slice points directly
+// into the mmapped file data.
+Slice PlainTableIterator::value() const {
+  assert(Valid());
+  return value_;
+}
+
+// Last error recorded by Seek*/Next/Prev; OK while iteration is healthy.
+Status PlainTableIterator::status() const {
+  return status_;
+}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h
new file mode 100644 (file)
index 0000000..756439b
--- /dev/null
@@ -0,0 +1,261 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <unordered_map>
+#include <memory>
+#include <vector>
+#include <string>
+#include <stdint.h>
+
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/table_reader.h"
+#include "table/plain_table_factory.h"
+
+namespace rocksdb {
+
+class Block;
+class BlockHandle;
+class Footer;
+struct Options;
+class RandomAccessFile;
+struct ReadOptions;
+class TableCache;
+class TableReader;
+class DynamicBloom;
+class InternalKeyComparator;
+
+using std::unique_ptr;
+using std::unordered_map;
+extern const uint32_t kPlainTableVariableLength;
+
+// Based on following output file format shown in plain_table_factory.h
+// When opening the output file, IndexedTableReader creates a hash table
+// from key prefixes to offset of the output file. IndexedTable will decide
+// whether it points to the data offset of the first key with the key prefix
+// or the offset of it. If too many keys share this prefix, it will
+// create a binary search-able index from the suffix to offset on disk.
+//
+// The implementation of IndexedTableReader requires output file is mmaped
+class PlainTableReader: public TableReader {
+ public:
+  static Status Open(const Options& options, const EnvOptions& soptions,
+                     const InternalKeyComparator& internal_comparator,
+                     unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                     unique_ptr<TableReader>* table,
+                     const int bloom_bits_per_key, double hash_table_ratio,
+                     size_t index_sparseness);
+
+  Iterator* NewIterator(const ReadOptions&);
+
+  Status Get(const ReadOptions&, const Slice& key, void* arg,
+             bool (*result_handler)(void* arg, const ParsedInternalKey& k,
+                                    const Slice& v, bool),
+             void (*mark_key_may_exist)(void*) = nullptr);
+
+  uint64_t ApproximateOffsetOf(const Slice& key);
+
+  void SetupForCompaction();
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const {
+    return table_properties_;
+  }
+
+  PlainTableReader(const Options& options, unique_ptr<RandomAccessFile>&& file,
+                   const EnvOptions& storage_options,
+                   const InternalKeyComparator& internal_comparator,
+                   uint64_t file_size, int bloom_num_bits,
+                   double hash_table_ratio, size_t index_sparseness,
+                   const TableProperties* table_properties);
+  virtual ~PlainTableReader();
+
+ protected:
+  // Check bloom filter to see whether it might contain this prefix.
+  // The hash of the prefix is given, since it can be reused for index lookup
+  // too.
+  virtual bool MatchBloom(uint32_t hash) const;
+
+  // PopulateIndex() builds index of keys. It must be called before any query
+  // to the table.
+  //
+  // props: the table properties object that need to be stored. Ownership of
+  //        the object will be passed.
+  //
+  // index_ contains buckets size of index_size_, each is a
+  // 32-bit integer. The lower 31 bits contain an offset value (explained below)
+  // and the first bit of the integer indicates type of the offset.
+  //
+  // +--------------+------------------------------------------------------+
+  // | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     +
+  // +--------------+------------------------------------------------------+
+  //
+  // Explanation for the "flag bit":
+  //
+  // 0 indicates that the bucket contains only one prefix (no conflict when
+  //   hashing this prefix), whose first row starts from this offset of the
+  // file.
+  // 1 indicates that the bucket contains more than one prefixes, or there
+  //   are too many rows for one prefix so we need a binary search for it. In
+  //   this case, the offset indicates the offset of sub_index_ holding the
+  //   binary search indexes of keys for those rows. Those binary search indexes
+  //   are organized in this way:
+  //
+  // The first 4 bytes, indicate how many indexes (N) are stored after it. After
+  // it, there are N 32-bit integers, each points of an offset of the file,
+  // which
+  // points to starting of a row. Those offsets need to be guaranteed to be in
+  // ascending order so the keys they are pointing to are also in ascending
+  // order
+  // to make sure we can use them to do binary searches. Below is visual
+  // presentation of a bucket.
+  //
+  // <begin>
+  //   number_of_records:  varint32
+  //   record 1 file offset:  fixedint32
+  //   record 2 file offset:  fixedint32
+  //    ....
+  //   record N file offset:  fixedint32
+  // <end>
+  Status PopulateIndex(TableProperties* props);
+
+ private:
+  struct IndexRecord;
+  class IndexRecordList;
+
+  // Plain table maintains an index and a sub index.
+  // index is implemented by a hash table.
+  // subindex is a big of memory array.
+  // For more details about the in-memory index, please refer to:
+  // https://github.com/facebook/rocksdb/wiki/PlainTable-Format
+  // #wiki-in-memory-index-format
+  std::unique_ptr<uint32_t[]> index_;
+  int index_size_ = 0;
+  std::unique_ptr<char[]> sub_index_;
+
+  Options options_;
+  const EnvOptions& soptions_;
+  unique_ptr<RandomAccessFile> file_;
+
+  const InternalKeyComparator internal_comparator_;
+  // represents plain table's current status.
+  Status status_;
+
+  Slice file_data_;
+  uint32_t file_size_;
+
+  const double kHashTableRatio;
+  const int kBloomBitsPerKey;
+  // To speed up the search for keys with same prefix, we'll add index key for
+  // every N keys, where the "N" is determined by
+  // kIndexIntervalForSamePrefixKeys
+  const size_t kIndexIntervalForSamePrefixKeys = 16;
+  // Bloom filter is used to rule out non-existent key
+  unique_ptr<DynamicBloom> bloom_;
+
+  std::shared_ptr<const TableProperties> table_properties_;
+  // data_start_offset_ and data_end_offset_ defines the range of the
+  // sst file that stores data.
+  const uint32_t data_start_offset_ = 0;
+  const uint32_t data_end_offset_;
+  const size_t user_key_len_;
+
+  static const size_t kNumInternalBytes = 8;
+  static const uint32_t kSubIndexMask = 0x80000000;
+  static const size_t kOffsetLen = sizeof(uint32_t);
+  static const uint64_t kMaxFileSize = 1u << 31;
+  static const size_t kRecordsPerGroup = 256;
+
+  bool IsFixedLength() const {
+    return user_key_len_ != kPlainTableVariableLength;
+  }
+
+  size_t GetFixedInternalKeyLength() const {
+    return user_key_len_ + kNumInternalBytes;
+  }
+
+  friend class TableCache;
+  friend class PlainTableIterator;
+
+  // Internal helper function to generate an IndexRecordList object from all
+  // the rows, which contains index records as a list.
+  // If bloom_ is not null, all the keys' full-key hash will be added to the
+  // bloom filter.
+  Status PopulateIndexRecordList(IndexRecordList* record_list,
+                                 int* num_prefixes) const;
+
+  // Internal helper function to allocate memory for indexes and bloom filters
+  void AllocateIndexAndBloom(int num_prefixes);
+
+  // Internal helper function to bucket index record list to hash buckets.
+  // bucket_header is a vector of size hash_table_size_, with each entry
+  // containing a linklist of IndexRecord hashed to the same bucket, in reverse
+  // order.
+  // of offsets for the hash, in reversed order.
+  // entries_per_bucket is sized of index_size_. The value is how many index
+  // records are there in bucket_headers for the same bucket.
+  size_t BucketizeIndexesAndFillBloom(
+      IndexRecordList* record_list, std::vector<IndexRecord*>* bucket_headers,
+      std::vector<uint32_t>* entries_per_bucket);
+
+  // Internal helper class to fill the indexes and bloom filters to internal
+  // data structures. bucket_headers and entries_per_bucket are bucketized
+  // indexes and counts generated by BucketizeIndexesAndFillBloom().
+  void FillIndexes(const size_t kSubIndexSize,
+                   const std::vector<IndexRecord*>& bucket_headers,
+                   const std::vector<uint32_t>& entries_per_bucket);
+
+  // Read a plain table key from the position `start`. The read content
+  // will be written to `key` and the size of read bytes will be populated
+  // in `bytes_read`.
+  Status ReadKey(const char* row_ptr, ParsedInternalKey* key,
+                 size_t* bytes_read) const;
+  // Read the key and value at `offset` to parameters `key` and `value`.
+  // On success, `offset` will be updated as the offset for the next key.
+  Status Next(uint32_t* offset, ParsedInternalKey* key, Slice* value) const;
+  // Get file offset for key target.
+  // return value prefix_matched is set to true if the offset is confirmed
+  // for a key with the same prefix as target.
+  Status GetOffset(const Slice& target, const Slice& prefix,
+                   uint32_t prefix_hash, bool& prefix_matched,
+                   uint32_t* offset) const;
+
+  Slice GetUserKey(const Slice& key) const {
+    return Slice(key.data(), key.size() - 8);
+  }
+
+  Slice GetPrefix(const Slice& target) const {
+    assert(target.size() >= 8);  // target is internal key
+    return GetPrefixFromUserKey(GetUserKey(target));
+  }
+
+  inline Slice GetPrefix(const ParsedInternalKey& target) const;
+
+  Slice GetPrefixFromUserKey(const Slice& user_key) const {
+    if (!IsTotalOrderMode()) {
+      return options_.prefix_extractor->Transform(user_key);
+    } else {
+      // Use empty slice as prefix if prefix_extractor is not set. In that case,
+      // it falls back to pure binary search and total iterator seek is
+      // supported.
+      return Slice();
+    }
+  }
+
+  bool IsTotalOrderMode() const {
+    return (options_.prefix_extractor.get() == nullptr);
+  }
+
+  // No copying allowed
+  explicit PlainTableReader(const TableReader&) = delete;
+  void operator=(const TableReader&) = delete;
+};
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/table/table_builder.h b/table/table_builder.h
new file mode 100644 (file)
index 0000000..ee32cff
--- /dev/null
@@ -0,0 +1,55 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+namespace rocksdb {
+
+class Slice;
+class Status;
+
+// TableBuilder provides the interface used to build a Table
+// (an immutable and sorted map from keys to values).
+//
+// Multiple threads can invoke const methods on a TableBuilder without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same TableBuilder must use
+// external synchronization.
+class TableBuilder {
+ public:
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  virtual ~TableBuilder() {}
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  virtual void Add(const Slice& key, const Slice& value) = 0;
+
+  // Return non-ok iff some error has been detected.
+  virtual Status status() const = 0;
+
+  // Finish building the table.
+  // REQUIRES: Finish(), Abandon() have not been called
+  virtual Status Finish() = 0;
+
+  // Indicate that the contents of this builder should be abandoned.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  virtual void Abandon() = 0;
+
+  // Number of calls to Add() so far.
+  virtual uint64_t NumEntries() const = 0;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  virtual uint64_t FileSize() const = 0;
+};
+
+}  // namespace rocksdb
diff --git a/table/table_properties.cc b/table/table_properties.cc
new file mode 100644 (file)
index 0000000..c7e1419
--- /dev/null
@@ -0,0 +1,115 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/table_properties.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+
+namespace {
+  void AppendProperty(
+      std::string& props,
+      const std::string& key,
+      const std::string& value,
+      const std::string& prop_delim,
+      const std::string& kv_delim) {
+    props.append(key);
+    props.append(kv_delim);
+    props.append(value);
+    props.append(prop_delim);
+  }
+
+  template <class TValue>
+  void AppendProperty(
+      std::string& props,
+      const std::string& key,
+      const TValue& value,
+      const std::string& prop_delim,
+      const std::string& kv_delim) {
+    AppendProperty(
+        props, key, std::to_string(value), prop_delim, kv_delim
+    );
+  }
+}
+
+std::string TableProperties::ToString(
+    const std::string& prop_delim,
+    const std::string& kv_delim) const {
+  std::string result;
+  result.reserve(1024);
+
+  // Basic Info
+  AppendProperty(result, "# data blocks", num_data_blocks, prop_delim,
+                 kv_delim);
+  AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim);
+
+  AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim);
+  AppendProperty(result, "raw average key size",
+                 num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0,
+                 prop_delim, kv_delim);
+  AppendProperty(result, "raw value size", raw_value_size, prop_delim,
+                 kv_delim);
+  AppendProperty(result, "raw average value size",
+                 num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0,
+                 prop_delim, kv_delim);
+
+  AppendProperty(result, "data block size", data_size, prop_delim, kv_delim);
+  AppendProperty(result, "index block size", index_size, prop_delim, kv_delim);
+  AppendProperty(result, "filter block size", filter_size, prop_delim,
+                 kv_delim);
+  AppendProperty(result, "(estimated) table size",
+                 data_size + index_size + filter_size, prop_delim, kv_delim);
+
+  AppendProperty(
+      result, "filter policy name",
+      filter_policy_name.empty() ? std::string("N/A") : filter_policy_name,
+      prop_delim, kv_delim);
+
+  return result;
+}
+
+const std::string TablePropertiesNames::kDataSize  =
+    "rocksdb.data.size";
+const std::string TablePropertiesNames::kIndexSize =
+    "rocksdb.index.size";
+const std::string TablePropertiesNames::kFilterSize =
+    "rocksdb.filter.size";
+const std::string TablePropertiesNames::kRawKeySize =
+    "rocksdb.raw.key.size";
+const std::string TablePropertiesNames::kRawValueSize =
+    "rocksdb.raw.value.size";
+const std::string TablePropertiesNames::kNumDataBlocks =
+    "rocksdb.num.data.blocks";
+const std::string TablePropertiesNames::kNumEntries =
+    "rocksdb.num.entries";
+const std::string TablePropertiesNames::kFilterPolicy =
+    "rocksdb.filter.policy";
+const std::string TablePropertiesNames::kFormatVersion =
+    "rocksdb.format.version";
+const std::string TablePropertiesNames::kFixedKeyLen =
+    "rocksdb.fixed.key.length";
+
+extern const std::string kPropertiesBlock = "rocksdb.properties";
+// Old property block name for backward compatibility
+extern const std::string kPropertiesBlockOldName = "rocksdb.stats";
+
+// Seek to the properties block. On return, *is_found is set to true iff
+// the iterator is positioned at the properties block.
+Status SeekToPropertiesBlock(Iterator* meta_iter, bool* is_found) {
+  *is_found = true;
+  meta_iter->Seek(kPropertiesBlock);
+  if (meta_iter->status().ok() &&
+      (!meta_iter->Valid() || meta_iter->key() != kPropertiesBlock)) {
+    meta_iter->Seek(kPropertiesBlockOldName);
+    if (meta_iter->status().ok() &&
+        (!meta_iter->Valid() || meta_iter->key() != kPropertiesBlockOldName)) {
+      *is_found = false;
+    }
+  }
+  return meta_iter->status();
+}
+
+}  // namespace rocksdb
diff --git a/table/table_reader.h b/table/table_reader.h
new file mode 100644 (file)
index 0000000..02a2d16
--- /dev/null
@@ -0,0 +1,66 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+
+namespace rocksdb {
+
+class Iterator;
+struct ParsedInternalKey;
+class Slice;
+struct ReadOptions;
+struct TableProperties;
+
+// A Table is a sorted map from strings to strings.  Tables are
+// immutable and persistent.  A Table may be safely accessed from
+// multiple threads without external synchronization.
+class TableReader {
+ public:
+  virtual ~TableReader() {}
+
+  // Returns a new iterator over the table contents.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  virtual Iterator* NewIterator(const ReadOptions&) = 0;
+
+  // Given a key, return an approximate byte offset in the file where
+  // the data for that key begins (or would begin if the key were
+  // present in the file).  The returned value is in terms of file
+  // bytes, and so includes effects like compression of the underlying data.
+  // E.g., the approximate offset of the last key in the table will
+  // be close to the file length.
+  virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0;
+
+  // Set up the table for Compaction. Might change some parameters with
+  // posix_fadvise
+  virtual void SetupForCompaction() = 0;
+
+  virtual std::shared_ptr<const TableProperties> GetTableProperties() const = 0;
+
+  // Calls (*result_handler)(handle_context, ...) repeatedly, starting with
+  // the entry found after a call to Seek(key), until result_handler returns
+  // false, where k is the actual internal key for a row found and v is the
+  // value of the key. didIO is true if I/O is involved in the operation. May
+  // not make such a call if the filter policy says the key is not present.
+  //
+  // mark_key_may_exist_handler needs to be called when it is configured to be
+  // memory only and the key is not found in the block cache, with
+  // the parameter to be handle_context.
+  //
+  // readOptions is the options for the read
+  // key is the key to search for
+  virtual Status Get(
+      const ReadOptions& readOptions, const Slice& key, void* handle_context,
+      bool (*result_handler)(void* arg, const ParsedInternalKey& k,
+                             const Slice& v, bool didIO),
+      void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0;
+};
+
+}  // namespace rocksdb
diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc
new file mode 100644 (file)
index 0000000..a0ff0d7
--- /dev/null
@@ -0,0 +1,271 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <gflags/gflags.h>
+
+#include "rocksdb/db.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "port/atomic_pointer.h"
+#include "table/block_based_table_factory.h"
+#include "table/plain_table_factory.h"
+#include "table/table_builder.h"
+#include "util/histogram.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+namespace {
+// Make a key in which i determines the first 4 characters and j determines
+// the last 4 characters.
+static std::string MakeKey(int i, int j, bool through_db) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "%04d__key___%04d", i, j);
+  if (through_db) {
+    return std::string(buf);
+  }
+  // If we directly query table, which operates on internal keys
+  // instead of user keys, we need to add 8 bytes of internal
+  // information (row type etc) to user key to make an internal
+  // key.
+  InternalKey key(std::string(buf), 0, ValueType::kTypeValue);
+  return key.Encode().ToString();
+}
+
+static bool DummySaveValue(void* arg, const ParsedInternalKey& ikey,
+                           const Slice& v, bool didIO) {
+  return false;
+}
+
+uint64_t Now(Env* env, bool measured_by_nanosecond) {
+  return measured_by_nanosecond ? env->NowNanos() : env->NowMicros();
+}
+}  // namespace
+
+// A very simple benchmark.
+// Create a table with roughly numKey1 * numKey2 keys,
+// where there are numKey1 prefixes of the key, each has numKey2 number of
+// distinguished key, differing in the suffix part.
+// If if_query_empty_keys = false, query the existing keys numKey1 * numKey2
+// times randomly.
+// If if_query_empty_keys = true, query numKey1 * numKey2 random empty keys.
+// Print out the total time.
+// If through_db=true, a full DB will be created and queries will be against
+// it. Otherwise, operations will be directly through table level.
+//
+// If for_iterator=true, instead of just querying one key each time, it queries
+// a range sharing the same prefix.
+namespace {
+void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
+                          ReadOptions& read_options, int num_keys1,
+                          int num_keys2, int num_iter, int prefix_len,
+                          bool if_query_empty_keys, bool for_iterator,
+                          bool through_db, bool measured_by_nanosecond) {
+  rocksdb::InternalKeyComparator ikc(opts.comparator);
+
+  std::string file_name = test::TmpDir()
+      + "/rocksdb_table_reader_benchmark";
+  std::string dbname = test::TmpDir() + "/rocksdb_table_reader_bench_db";
+  WriteOptions wo;
+  unique_ptr<WritableFile> file;
+  Env* env = Env::Default();
+  TableBuilder* tb = nullptr;
+  DB* db = nullptr;
+  Status s;
+  if (!through_db) {
+    env->NewWritableFile(file_name, &file, env_options);
+    tb = opts.table_factory->NewTableBuilder(opts, ikc, file.get(),
+                                             CompressionType::kNoCompression);
+  } else {
+    s = DB::Open(opts, dbname, &db);
+    ASSERT_OK(s);
+    ASSERT_TRUE(db != nullptr);
+  }
+  // Populate the table with num_keys1 * num_keys2 keys
+  for (int i = 0; i < num_keys1; i++) {
+    for (int j = 0; j < num_keys2; j++) {
+      std::string key = MakeKey(i * 2, j, through_db);
+      if (!through_db) {
+        tb->Add(key, key);
+      } else {
+        db->Put(wo, key, key);
+      }
+    }
+  }
+  if (!through_db) {
+    tb->Finish();
+    file->Close();
+  } else {
+    db->Flush(FlushOptions());
+  }
+
+  unique_ptr<TableReader> table_reader;
+  unique_ptr<RandomAccessFile> raf;
+  if (!through_db) {
+    Status s = env->NewRandomAccessFile(file_name, &raf, env_options);
+    uint64_t file_size;
+    env->GetFileSize(file_name, &file_size);
+    s = opts.table_factory->NewTableReader(
+        opts, env_options, ikc, std::move(raf), file_size, &table_reader);
+  }
+
+  Random rnd(301);
+  std::string result;
+  HistogramImpl hist;
+
+  void* arg = nullptr;
+  for (int it = 0; it < num_iter; it++) {
+    for (int i = 0; i < num_keys1; i++) {
+      for (int j = 0; j < num_keys2; j++) {
+        int r1 = rnd.Uniform(num_keys1) * 2;
+        int r2 = rnd.Uniform(num_keys2);
+        if (if_query_empty_keys) {
+          r1++;
+          r2 = num_keys2 * 2 - r2;
+        }
+
+        if (!for_iterator) {
+          // Query one existing key;
+          std::string key = MakeKey(r1, r2, through_db);
+          uint64_t start_time = Now(env, measured_by_nanosecond);
+          port::MemoryBarrier();
+          if (!through_db) {
+            s = table_reader->Get(read_options, key, arg, DummySaveValue,
+                                  nullptr);
+          } else {
+            s = db->Get(read_options, key, &result);
+          }
+          port::MemoryBarrier();
+          hist.Add(Now(env, measured_by_nanosecond) - start_time);
+        } else {
+          int r2_len;
+          if (if_query_empty_keys) {
+            r2_len = 0;
+          } else {
+            r2_len = rnd.Uniform(num_keys2) + 1;
+            if (r2_len + r2 > num_keys2) {
+              r2_len = num_keys2 - r2;
+            }
+          }
+          std::string start_key = MakeKey(r1, r2, through_db);
+          std::string end_key = MakeKey(r1, r2 + r2_len, through_db);
+          uint64_t total_time = 0;
+          uint64_t start_time = Now(env, measured_by_nanosecond);
+          port::MemoryBarrier();
+          Iterator* iter;
+          if (!through_db) {
+            iter = table_reader->NewIterator(read_options);
+          } else {
+            iter = db->NewIterator(read_options);
+          }
+          int count = 0;
+          for(iter->Seek(start_key); iter->Valid(); iter->Next()) {
+            if (if_query_empty_keys) {
+              break;
+            }
+            // verify key;
+            port::MemoryBarrier();
+            total_time += Now(env, measured_by_nanosecond) - start_time;
+            assert(Slice(MakeKey(r1, r2 + count, through_db)) == iter->key());
+            start_time = Now(env, measured_by_nanosecond);
+            if (++count >= r2_len) {
+              break;
+            }
+          }
+          if (count != r2_len) {
+            fprintf(
+                stderr, "Iterator cannot iterate expected number of entries. "
+                "Expected %d but got %d\n", r2_len, count);
+            assert(false);
+          }
+          delete iter;
+          port::MemoryBarrier();
+          total_time += Now(env, measured_by_nanosecond) - start_time;
+          hist.Add(total_time);
+        }
+      }
+    }
+  }
+
+  fprintf(
+      stderr,
+      "==================================================="
+      "====================================================\n"
+      "InMemoryTableSimpleBenchmark: %20s   num_key1:  %5d   "
+      "num_key2: %5d  %10s\n"
+      "==================================================="
+      "===================================================="
+      "\nHistogram (unit: %s): \n%s",
+      opts.table_factory->Name(), num_keys1, num_keys2,
+      for_iterator ? "iterator" : (if_query_empty_keys ? "empty" : "non_empty"),
+      measured_by_nanosecond ? "nanosecond" : "microsecond",
+      hist.ToString().c_str());
+  if (!through_db) {
+    env->DeleteFile(file_name);
+  } else {
+    delete db;
+    db = nullptr;
+    DestroyDB(dbname, opts);
+  }
+}
+}  // namespace
+}  // namespace rocksdb
+
+DEFINE_bool(query_empty, false, "query non-existing keys instead of existing "
+            "ones.");
+DEFINE_int32(num_keys1, 4096, "number of distinguish prefix of keys");
+DEFINE_int32(num_keys2, 512, "number of distinguish keys for each prefix");
+DEFINE_int32(iter, 3, "query non-existing keys instead of existing ones");
+DEFINE_int32(prefix_len, 16, "Prefix length used for iterators and indexes");
+DEFINE_bool(iterator, false, "For test iterator");
+DEFINE_bool(through_db, false, "If enable, a DB instance will be created and "
+            "the query will be against DB. Otherwise, will be directly against "
+            "a table reader.");
+DEFINE_bool(plain_table, false, "Use PlainTable");
+DEFINE_string(time_unit, "microsecond",
+              "The time unit used for measuring performance. User can specify "
+              "`microsecond` (default) or `nanosecond`");
+
+int main(int argc, char** argv) {
+  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                          " [OPTIONS]...");
+  google::ParseCommandLineFlags(&argc, &argv, true);
+
+  rocksdb::TableFactory* tf = new rocksdb::BlockBasedTableFactory();
+  rocksdb::Options options;
+  if (FLAGS_prefix_len < 16) {
+    options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(
+        FLAGS_prefix_len));
+  }
+  rocksdb::ReadOptions ro;
+  rocksdb::EnvOptions env_options;
+  options.create_if_missing = true;
+  options.compression = rocksdb::CompressionType::kNoCompression;
+
+  if (FLAGS_plain_table) {
+    options.allow_mmap_reads = true;
+    env_options.use_mmap_reads = true;
+    tf = new rocksdb::PlainTableFactory(16, (FLAGS_prefix_len == 16) ? 0 : 8,
+                                        0.75);
+    options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(
+        FLAGS_prefix_len));
+  } else {
+    tf = new rocksdb::BlockBasedTableFactory();
+  }
+  // if user provides invalid options, just fall back to microsecond.
+  bool measured_by_nanosecond = FLAGS_time_unit == "nanosecond";
+
+  options.table_factory =
+      std::shared_ptr<rocksdb::TableFactory>(tf);
+  rocksdb::TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1,
+                                FLAGS_num_keys2, FLAGS_iter, FLAGS_prefix_len,
+                                FLAGS_query_empty, FLAGS_iterator,
+                                FLAGS_through_db, measured_by_nanosecond);
+  delete tf;
+  return 0;
+}
diff --git a/table/table_test.cc b/table/table_test.cc
new file mode 100644 (file)
index 0000000..dd81bae
--- /dev/null
@@ -0,0 +1,1805 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <memory>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+
+#include "table/block.h"
+#include "table/block_based_table_builder.h"
+#include "table/block_based_table_factory.h"
+#include "table/block_based_table_reader.h"
+#include "table/block_builder.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/plain_table_factory.h"
+
+#include "util/random.h"
+#include "util/statistics.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+extern const uint64_t kLegacyPlainTableMagicNumber;
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
+
+namespace {
+
+// Return reverse of "key".
+// Used to test non-lexicographic comparators.
+std::string Reverse(const Slice& key) {
+  auto rev = key.ToString();
+  std::reverse(rev.begin(), rev.end());
+  return rev;
+}
+
+class ReverseKeyComparator : public Comparator {
+ public:
+  virtual const char* Name() const {
+    return "rocksdb.ReverseBytewiseComparator";
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    return BytewiseComparator()->Compare(Reverse(a), Reverse(b));
+  }
+
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+    std::string s = Reverse(*start);
+    std::string l = Reverse(limit);
+    BytewiseComparator()->FindShortestSeparator(&s, l);
+    *start = Reverse(s);
+  }
+
+  virtual void FindShortSuccessor(std::string* key) const {
+    std::string s = Reverse(*key);
+    BytewiseComparator()->FindShortSuccessor(&s);
+    *key = Reverse(s);
+  }
+};
+
+ReverseKeyComparator reverse_key_comparator;
+
+void Increment(const Comparator* cmp, std::string* key) {
+  if (cmp == BytewiseComparator()) {
+    key->push_back('\0');
+  } else {
+    assert(cmp == &reverse_key_comparator);
+    std::string rev = Reverse(*key);
+    rev.push_back('\0');
+    *key = Reverse(rev);
+  }
+}
+
+// An STL comparator that uses a Comparator
+struct STLLessThan {
+  const Comparator* cmp;
+
+  STLLessThan() : cmp(BytewiseComparator()) { }
+  explicit STLLessThan(const Comparator* c) : cmp(c) { }
+  bool operator()(const std::string& a, const std::string& b) const {
+    return cmp->Compare(Slice(a), Slice(b)) < 0;
+  }
+};
+
+}  // namespace
+
+class StringSink: public WritableFile {
+ public:
+  ~StringSink() { }
+
+  const std::string& contents() const { return contents_; }
+
+  virtual Status Close() { return Status::OK(); }
+  virtual Status Flush() { return Status::OK(); }
+  virtual Status Sync() { return Status::OK(); }
+
+  virtual Status Append(const Slice& data) {
+    contents_.append(data.data(), data.size());
+    return Status::OK();
+  }
+
+ private:
+  std::string contents_;
+};
+
+
+class StringSource: public RandomAccessFile {
+ public:
+  StringSource(const Slice& contents, uint64_t uniq_id, bool mmap)
+      : contents_(contents.data(), contents.size()), uniq_id_(uniq_id),
+        mmap_(mmap) {
+  }
+
+  virtual ~StringSource() { }
+
+  uint64_t Size() const { return contents_.size(); }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                       char* scratch) const {
+    if (offset > contents_.size()) {
+      return Status::InvalidArgument("invalid Read offset");
+    }
+    if (offset + n > contents_.size()) {
+      n = contents_.size() - offset;
+    }
+    if (!mmap_) {
+      memcpy(scratch, &contents_[offset], n);
+      *result = Slice(scratch, n);
+    } else {
+      *result = Slice(&contents_[offset], n);
+    }
+    return Status::OK();
+  }
+
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    if (max_size < 20) {
+      return 0;
+    }
+
+    char* rid = id;
+    rid = EncodeVarint64(rid, uniq_id_);
+    rid = EncodeVarint64(rid, 0);
+    return static_cast<size_t>(rid-id);
+  }
+
+ private:
+  std::string contents_;
+  uint64_t uniq_id_;
+  bool mmap_;
+};
+
+typedef std::map<std::string, std::string, STLLessThan> KVMap;
+
+// Helper class for tests to unify the interface between
+// BlockBuilder/TableBuilder and Block/Table.
+class Constructor {
+ public:
+  explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) {}
+  virtual ~Constructor() { }
+
+  void Add(const std::string& key, const Slice& value) {
+    data_[key] = value.ToString();
+  }
+
+  // Finish constructing the data structure with all the keys that have
+  // been added so far.  Returns the keys in sorted order in "*keys"
+  // and stores the key/value pairs in "*kvmap"
+  void Finish(const Options& options,
+              const InternalKeyComparator& internal_comparator,
+              std::vector<std::string>* keys, KVMap* kvmap) {
+    last_internal_key_ = &internal_comparator;
+    *kvmap = data_;
+    keys->clear();
+    for (KVMap::const_iterator it = data_.begin();
+         it != data_.end();
+         ++it) {
+      keys->push_back(it->first);
+    }
+    data_.clear();
+    Status s = FinishImpl(options, internal_comparator, *kvmap);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+  }
+
+  // Construct the data structure from the data in "data"
+  virtual Status FinishImpl(const Options& options,
+                            const InternalKeyComparator& internal_comparator,
+                            const KVMap& data) = 0;
+
+  virtual Iterator* NewIterator() const = 0;
+
+  virtual const KVMap& data() { return data_; }
+
+  virtual DB* db() const { return nullptr; }  // Overridden in DBConstructor
+
+ protected:
+  const InternalKeyComparator* last_internal_key_;
+
+ private:
+  KVMap data_;
+};
+
+class BlockConstructor: public Constructor {
+ public:
+  explicit BlockConstructor(const Comparator* cmp)
+      : Constructor(cmp),
+        comparator_(cmp),
+        block_(nullptr) { }
+  ~BlockConstructor() {
+    delete block_;
+  }
+  virtual Status FinishImpl(const Options& options,
+                            const InternalKeyComparator& internal_comparator,
+                            const KVMap& data) {
+    delete block_;
+    block_ = nullptr;
+    BlockBuilder builder(options, &internal_comparator);
+
+    for (KVMap::const_iterator it = data.begin();
+         it != data.end();
+         ++it) {
+      builder.Add(it->first, it->second);
+    }
+    // Open the block
+    data_ = builder.Finish().ToString();
+    BlockContents contents;
+    contents.data = data_;
+    contents.cachable = false;
+    contents.heap_allocated = false;
+    block_ = new Block(contents);
+    return Status::OK();
+  }
+  virtual Iterator* NewIterator() const {
+    return block_->NewIterator(comparator_);
+  }
+
+ private:
+  const Comparator* comparator_;
+  std::string data_;
+  Block* block_;
+
+  BlockConstructor();
+};
+
+// A helper class that converts internal format keys into user keys
+class KeyConvertingIterator: public Iterator {
+ public:
+  explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { }
+  virtual ~KeyConvertingIterator() { delete iter_; }
+  virtual bool Valid() const { return iter_->Valid(); }
+  virtual void Seek(const Slice& target) {
+    ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue);
+    std::string encoded;
+    AppendInternalKey(&encoded, ikey);
+    iter_->Seek(encoded);
+  }
+  virtual void SeekToFirst() { iter_->SeekToFirst(); }
+  virtual void SeekToLast() { iter_->SeekToLast(); }
+  virtual void Next() { iter_->Next(); }
+  virtual void Prev() { iter_->Prev(); }
+
+  virtual Slice key() const {
+    assert(Valid());
+    ParsedInternalKey key;
+    if (!ParseInternalKey(iter_->key(), &key)) {
+      status_ = Status::Corruption("malformed internal key");
+      return Slice("corrupted key");
+    }
+    return key.user_key;
+  }
+
+  virtual Slice value() const { return iter_->value(); }
+  virtual Status status() const {
+    return status_.ok() ? iter_->status() : status_;
+  }
+
+ private:
+  mutable Status status_;
+  Iterator* iter_;
+
+  // No copying allowed
+  KeyConvertingIterator(const KeyConvertingIterator&);
+  void operator=(const KeyConvertingIterator&);
+};
+
+class TableConstructor: public Constructor {
+ public:
+  explicit TableConstructor(const Comparator* cmp,
+                            bool convert_to_internal_key = false)
+      : Constructor(cmp),
+        convert_to_internal_key_(convert_to_internal_key) {}
+  ~TableConstructor() { Reset(); }
+
+  virtual Status FinishImpl(const Options& options,
+                            const InternalKeyComparator& internal_comparator,
+                            const KVMap& data) {
+    Reset();
+    sink_.reset(new StringSink());
+    unique_ptr<TableBuilder> builder;
+    builder.reset(options.table_factory->NewTableBuilder(
+        options, internal_comparator, sink_.get(), options.compression));
+
+    for (KVMap::const_iterator it = data.begin();
+         it != data.end();
+         ++it) {
+      if (convert_to_internal_key_) {
+        ParsedInternalKey ikey(it->first, kMaxSequenceNumber, kTypeValue);
+        std::string encoded;
+        AppendInternalKey(&encoded, ikey);
+        builder->Add(encoded, it->second);
+      } else {
+        builder->Add(it->first, it->second);
+      }
+      ASSERT_TRUE(builder->status().ok());
+    }
+    Status s = builder->Finish();
+    ASSERT_TRUE(s.ok()) << s.ToString();
+
+    ASSERT_EQ(sink_->contents().size(), builder->FileSize());
+
+    // Open the table
+    uniq_id_ = cur_uniq_id_++;
+    source_.reset(new StringSource(sink_->contents(), uniq_id_,
+                                   options.allow_mmap_reads));
+    return options.table_factory->NewTableReader(
+        options, soptions, internal_comparator, std::move(source_),
+        sink_->contents().size(), &table_reader_);
+  }
+
+  virtual Iterator* NewIterator() const {
+    ReadOptions ro;
+    Iterator* iter = table_reader_->NewIterator(ro);
+    if (convert_to_internal_key_) {
+      return new KeyConvertingIterator(iter);
+    } else {
+      return iter;
+    }
+  }
+
+  uint64_t ApproximateOffsetOf(const Slice& key) const {
+    return table_reader_->ApproximateOffsetOf(key);
+  }
+
+  virtual Status Reopen(const Options& options) {
+    source_.reset(
+        new StringSource(sink_->contents(), uniq_id_,
+                         options.allow_mmap_reads));
+    return options.table_factory->NewTableReader(
+        options, soptions, *last_internal_key_, std::move(source_),
+        sink_->contents().size(), &table_reader_);
+  }
+
+  virtual TableReader* table_reader() {
+    return table_reader_.get();
+  }
+
+ private:
+  void Reset() {
+    uniq_id_ = 0;
+    table_reader_.reset();
+    sink_.reset();
+    source_.reset();
+  }
+  bool convert_to_internal_key_;
+
+  uint64_t uniq_id_;
+  unique_ptr<StringSink> sink_;
+  unique_ptr<StringSource> source_;
+  unique_ptr<TableReader> table_reader_;
+
+  TableConstructor();
+
+  static uint64_t cur_uniq_id_;
+  const EnvOptions soptions;
+};
+uint64_t TableConstructor::cur_uniq_id_ = 1;
+
// Builds the key/value data inside a MemTable rather than a table file.
class MemTableConstructor: public Constructor {
 public:
  explicit MemTableConstructor(const Comparator* cmp)
      : Constructor(cmp),
        internal_comparator_(cmp),
        table_factory_(new SkipListFactory) {
    Options options;
    options.memtable_factory = table_factory_;
    memtable_ = new MemTable(internal_comparator_, options);
    memtable_->Ref();  // MemTables are reference counted.
  }
  ~MemTableConstructor() {
    // Unref() returns the memtable pointer once it should be deleted.
    delete memtable_->Unref();
  }
  // Replaces the memtable with a fresh one and inserts |data| with
  // increasing sequence numbers starting at 1.
  // NOTE(review): the internal_comparator parameter is ignored; the new
  // memtable reuses the comparator captured at construction time.
  virtual Status FinishImpl(const Options& options,
                            const InternalKeyComparator& internal_comparator,
                            const KVMap& data) {
    delete memtable_->Unref();
    Options memtable_options;
    memtable_options.memtable_factory = table_factory_;
    memtable_ = new MemTable(internal_comparator_, memtable_options);
    memtable_->Ref();
    int seq = 1;
    for (KVMap::const_iterator it = data.begin();
         it != data.end();
         ++it) {
      memtable_->Add(seq, kTypeValue, it->first, it->second);
      seq++;
    }
    return Status::OK();
  }
  // Wrapped in a KeyConvertingIterator to translate key encodings for the
  // harness.
  virtual Iterator* NewIterator() const {
    return new KeyConvertingIterator(memtable_->NewIterator(ReadOptions()));
  }

 private:
  InternalKeyComparator internal_comparator_;
  MemTable* memtable_;
  std::shared_ptr<SkipListFactory> table_factory_;
};
+
// Builds the key/value data in a full RocksDB instance opened in a
// temporary directory.
class DBConstructor: public Constructor {
 public:
  explicit DBConstructor(const Comparator* cmp)
      : Constructor(cmp),
        comparator_(cmp) {
    db_ = nullptr;
    NewDB();
  }
  ~DBConstructor() {
    delete db_;
  }
  // Recreates the database from scratch and writes each pair in |data|
  // via its own WriteBatch.
  // NOTE(review): the options/internal_comparator parameters are unused;
  // NewDB() builds its own Options.
  virtual Status FinishImpl(const Options& options,
                            const InternalKeyComparator& internal_comparator,
                            const KVMap& data) {
    delete db_;
    db_ = nullptr;
    NewDB();
    for (KVMap::const_iterator it = data.begin();
         it != data.end();
         ++it) {
      WriteBatch batch;
      batch.Put(it->first, it->second);
      ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok());
    }
    return Status::OK();
  }
  virtual Iterator* NewIterator() const {
    return db_->NewIterator(ReadOptions());
  }

  virtual DB* db() const { return db_; }

 private:
  // Destroys any database left over at the fixture path, then opens a
  // fresh one with a tiny write buffer to force flushing/merging early.
  void NewDB() {
    std::string name = test::TmpDir() + "/table_testdb";

    Options options;
    options.comparator = comparator_;
    Status status = DestroyDB(name, options);
    ASSERT_TRUE(status.ok()) << status.ToString();

    options.create_if_missing = true;
    options.error_if_exists = true;
    options.write_buffer_size = 10000;  // Something small to force merging
    status = DB::Open(options, name, &db_);
    ASSERT_TRUE(status.ok()) << status.ToString();
  }

  const Comparator* comparator_;
  DB* db_;
};
+
// True iff this binary was built with Snappy and the library can compress
// a short sample buffer.
static bool SnappyCompressionSupported() {
#ifndef SNAPPY
  return false;
#else
  std::string compressed;
  Slice input = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
  return port::Snappy_Compress(Options().compression_opts, input.data(),
                               input.size(), &compressed);
#endif
}
+
// True iff this binary was built with zlib and the library can compress
// a short sample buffer.
static bool ZlibCompressionSupported() {
#ifndef ZLIB
  return false;
#else
  std::string compressed;
  Slice input = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
  return port::Zlib_Compress(Options().compression_opts, input.data(),
                             input.size(), &compressed);
#endif
}
+
// True iff this binary was built with bzip2 and the library can compress
// a short sample buffer.
static bool BZip2CompressionSupported() {
#ifndef BZIP2
  return false;
#else
  std::string compressed;
  Slice input = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
  return port::BZip2_Compress(Options().compression_opts, input.data(),
                              input.size(), &compressed);
#endif
}
+
// True iff this binary was built with LZ4 and the library can compress
// a short sample buffer.
static bool LZ4CompressionSupported() {
#ifndef LZ4
  return false;
#else
  std::string compressed;
  Slice input = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
  return port::LZ4_Compress(Options().compression_opts, input.data(),
                            input.size(), &compressed);
#endif
}
+
// True iff this binary was built with LZ4 (the LZ4 build flag also gates
// LZ4HC) and the library can compress a short sample buffer.
static bool LZ4HCCompressionSupported() {
#ifndef LZ4
  return false;
#else
  std::string compressed;
  Slice input = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
  return port::LZ4HC_Compress(Options().compression_opts, input.data(),
                              input.size(), &compressed);
#endif
}
+
// Selects which Constructor implementation (and thus storage format) a
// generated test configuration runs against; see Harness::Init().
enum TestType {
  BLOCK_BASED_TABLE_TEST,         // TableConstructor, block-based factory
  PLAIN_TABLE_SEMI_FIXED_PREFIX,  // plain table, FixedOrLessPrefixTransform
  PLAIN_TABLE_FULL_STR_PREFIX,    // plain table, noop (full-key) transform
  PLAIN_TABLE_TOTAL_ORDER,        // plain table, no prefix extractor
  BLOCK_TEST,                     // BlockConstructor
  MEMTABLE_TEST,                  // MemTableConstructor
  DB_TEST                         // DBConstructor (full database)
};
+
// One generated test configuration; GenerateArgList() builds the cross
// product of these fields.
struct TestArgs {
  TestType type;
  bool reverse_compare;  // run with the reverse key comparator
  int restart_interval;  // block restart interval (skipped for plain table)
  CompressionType compression;
};
+
+static std::vector<TestArgs> GenerateArgList() {
+  std::vector<TestArgs> test_args;
+  std::vector<TestType> test_types = {
+      BLOCK_BASED_TABLE_TEST,      PLAIN_TABLE_SEMI_FIXED_PREFIX,
+      PLAIN_TABLE_FULL_STR_PREFIX, PLAIN_TABLE_TOTAL_ORDER,
+      BLOCK_TEST,                  MEMTABLE_TEST,
+      DB_TEST};
+  std::vector<bool> reverse_compare_types = {false, true};
+  std::vector<int> restart_intervals = {16, 1, 1024};
+
+  // Only add compression if it is supported
+  std::vector<CompressionType> compression_types;
+  compression_types.push_back(kNoCompression);
+  if (SnappyCompressionSupported()) {
+    compression_types.push_back(kSnappyCompression);
+  }
+  if (ZlibCompressionSupported()) {
+    compression_types.push_back(kZlibCompression);
+  }
+  if (BZip2CompressionSupported()) {
+    compression_types.push_back(kBZip2Compression);
+  }
+  if (LZ4CompressionSupported()) {
+    compression_types.push_back(kLZ4Compression);
+  }
+  if (LZ4HCCompressionSupported()) {
+    compression_types.push_back(kLZ4HCCompression);
+  }
+
+  for (auto test_type : test_types) {
+    for (auto reverse_compare : reverse_compare_types) {
+      if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX ||
+          test_type == PLAIN_TABLE_FULL_STR_PREFIX) {
+        // Plain table doesn't use restart index or compression.
+        TestArgs one_arg;
+        one_arg.type = test_type;
+        one_arg.reverse_compare = reverse_compare;
+        one_arg.restart_interval = restart_intervals[0];
+        one_arg.compression = compression_types[0];
+        test_args.push_back(one_arg);
+        continue;
+      }
+
+      for (auto restart_interval : restart_intervals) {
+        for (auto compression_type : compression_types) {
+          TestArgs one_arg;
+          one_arg.type = test_type;
+          one_arg.reverse_compare = reverse_compare;
+          one_arg.restart_interval = restart_interval;
+          one_arg.compression = compression_type;
+          test_args.push_back(one_arg);
+        }
+      }
+    }
+  }
+  return test_args;
+}
+
// In order to make all tests run for the plain table format, including
// those operating on empty keys, create a new prefix transformer which
// returns a fixed-size prefix if the slice is not shorter than the prefix
// length, and the full slice if it is shorter.
+class FixedOrLessPrefixTransform : public SliceTransform {
+ private:
+  const size_t prefix_len_;
+
+ public:
+  explicit FixedOrLessPrefixTransform(size_t prefix_len) :
+      prefix_len_(prefix_len) {
+  }
+
+  virtual const char* Name() const {
+    return "rocksdb.FixedPrefix";
+  }
+
+  virtual Slice Transform(const Slice& src) const {
+    assert(InDomain(src));
+    if (src.size() < prefix_len_) {
+      return src;
+    }
+    return Slice(src.data(), prefix_len_);
+  }
+
+  virtual bool InDomain(const Slice& src) const {
+    return true;
+  }
+
+  virtual bool InRange(const Slice& dst) const {
+    return (dst.size() <= prefix_len_);
+  }
+};
+
// Drives a randomized battery of iterator checks (forward scan, backward
// scan, random seek/next/prev) against whichever Constructor the TestArgs
// select, comparing every observation against an STL map "model" holding
// the same data.
class Harness {
 public:
  Harness() : constructor_(nullptr) { }

  // Builds a fresh constructor matching |args|: comparator direction,
  // restart interval, compression and table format all come from args.
  void Init(const TestArgs& args) {
    delete constructor_;
    constructor_ = nullptr;
    options_ = Options();

    options_.block_restart_interval = args.restart_interval;
    options_.compression = args.compression;
    // Use shorter block size for tests to exercise block boundary
    // conditions more.
    options_.block_size = 256;
    if (args.reverse_compare) {
      options_.comparator = &reverse_key_comparator;
    }

    internal_comparator_.reset(
        new test::PlainInternalKeyComparator(options_.comparator));

    support_prev_ = true;
    only_support_prefix_seek_ = false;
    BlockBasedTableOptions table_options;
    switch (args.type) {
      case BLOCK_BASED_TABLE_TEST:
        table_options.flush_block_policy_factory.reset(
            new FlushBlockBySizePolicyFactory());
        options_.table_factory.reset(new BlockBasedTableFactory(table_options));
        constructor_ = new TableConstructor(options_.comparator);
        break;
      // The plain-table cases disable Prev() coverage, require mmap
      // reads, and swap in a real InternalKeyComparator.
      case PLAIN_TABLE_SEMI_FIXED_PREFIX:
        support_prev_ = false;
        only_support_prefix_seek_ = true;
        options_.prefix_extractor.reset(new FixedOrLessPrefixTransform(2));
        options_.allow_mmap_reads = true;
        options_.table_factory.reset(NewPlainTableFactory());
        constructor_ = new TableConstructor(options_.comparator, true);
        internal_comparator_.reset(
            new InternalKeyComparator(options_.comparator));
        break;
      case PLAIN_TABLE_FULL_STR_PREFIX:
        support_prev_ = false;
        only_support_prefix_seek_ = true;
        options_.prefix_extractor.reset(NewNoopTransform());
        options_.allow_mmap_reads = true;
        options_.table_factory.reset(NewPlainTableFactory());
        constructor_ = new TableConstructor(options_.comparator, true);
        internal_comparator_.reset(
            new InternalKeyComparator(options_.comparator));
        break;
      case PLAIN_TABLE_TOTAL_ORDER:
        support_prev_ = false;
        only_support_prefix_seek_ = false;
        options_.prefix_extractor = nullptr;
        options_.allow_mmap_reads = true;
        options_.table_factory.reset(NewTotalOrderPlainTableFactory());
        constructor_ = new TableConstructor(options_.comparator, true);
        internal_comparator_.reset(
            new InternalKeyComparator(options_.comparator));
        break;
      case BLOCK_TEST:
        constructor_ = new BlockConstructor(options_.comparator);
        break;
      case MEMTABLE_TEST:
        constructor_ = new MemTableConstructor(options_.comparator);
        break;
      case DB_TEST:
        constructor_ = new DBConstructor(options_.comparator);
        break;
    }
  }

  ~Harness() {
    delete constructor_;
  }

  void Add(const std::string& key, const std::string& value) {
    constructor_->Add(key, value);
  }

  // Materializes everything added so far, then runs the scan suites.
  // Backward scanning is skipped for formats without Prev() support.
  void Test(Random* rnd) {
    std::vector<std::string> keys;
    KVMap data;
    constructor_->Finish(options_, *internal_comparator_, &keys, &data);

    TestForwardScan(keys, data);
    if (support_prev_) {
      TestBackwardScan(keys, data);
    }
    TestRandomAccess(rnd, keys, data);
  }

  // Walks the iterator front-to-back, checking every entry against the
  // model map and that Valid() is false before and after the scan.
  void TestForwardScan(const std::vector<std::string>& keys,
                       const KVMap& data) {
    Iterator* iter = constructor_->NewIterator();
    ASSERT_TRUE(!iter->Valid());
    iter->SeekToFirst();
    for (KVMap::const_iterator model_iter = data.begin();
         model_iter != data.end();
         ++model_iter) {
      ASSERT_EQ(ToString(data, model_iter), ToString(iter));
      iter->Next();
    }
    ASSERT_TRUE(!iter->Valid());
    delete iter;
  }

  // Mirror of TestForwardScan using SeekToLast()/Prev().
  void TestBackwardScan(const std::vector<std::string>& keys,
                        const KVMap& data) {
    Iterator* iter = constructor_->NewIterator();
    ASSERT_TRUE(!iter->Valid());
    iter->SeekToLast();
    for (KVMap::const_reverse_iterator model_iter = data.rbegin();
         model_iter != data.rend();
         ++model_iter) {
      ASSERT_EQ(ToString(data, model_iter), ToString(iter));
      iter->Prev();
    }
    ASSERT_TRUE(!iter->Valid());
    delete iter;
  }

  // Performs 200 random operations (Next/SeekToFirst/Seek, plus
  // Prev/SeekToLast when supported), applying each to both the iterator
  // and the model and asserting the two stay in lockstep.
  void TestRandomAccess(Random* rnd,
                        const std::vector<std::string>& keys,
                        const KVMap& data) {
    static const bool kVerbose = false;
    Iterator* iter = constructor_->NewIterator();
    ASSERT_TRUE(!iter->Valid());
    KVMap::const_iterator model_iter = data.begin();
    if (kVerbose) fprintf(stderr, "---\n");
    for (int i = 0; i < 200; i++) {
      // Only toss among the first three cases when Prev() is unsupported.
      const int toss = rnd->Uniform(support_prev_ ? 5 : 3);
      switch (toss) {
        case 0: {
          if (iter->Valid()) {
            if (kVerbose) fprintf(stderr, "Next\n");
            iter->Next();
            ++model_iter;
            ASSERT_EQ(ToString(data, model_iter), ToString(iter));
          }
          break;
        }

        case 1: {
          if (kVerbose) fprintf(stderr, "SeekToFirst\n");
          iter->SeekToFirst();
          model_iter = data.begin();
          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
          break;
        }

        case 2: {
          std::string key = PickRandomKey(rnd, keys);
          model_iter = data.lower_bound(key);
          if (kVerbose) fprintf(stderr, "Seek '%s'\n",
                                EscapeString(key).c_str());
          iter->Seek(Slice(key));
          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
          break;
        }

        case 3: {
          if (iter->Valid()) {
            if (kVerbose) fprintf(stderr, "Prev\n");
            iter->Prev();
            if (model_iter == data.begin()) {
              model_iter = data.end();   // Wrap around to invalid value
            } else {
              --model_iter;
            }
            ASSERT_EQ(ToString(data, model_iter), ToString(iter));
          }
          break;
        }

        case 4: {
          if (kVerbose) fprintf(stderr, "SeekToLast\n");
          iter->SeekToLast();
          if (keys.empty()) {
            model_iter = data.end();
          } else {
            std::string last = data.rbegin()->first;
            model_iter = data.lower_bound(last);
          }
          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
          break;
        }
      }
    }
    delete iter;
  }

  // Renders a model position as "END" or "'key->value'" so it can be
  // compared directly with ToString(iter).
  std::string ToString(const KVMap& data, const KVMap::const_iterator& it) {
    if (it == data.end()) {
      return "END";
    } else {
      return "'" + it->first + "->" + it->second + "'";
    }
  }

  // Reverse-iterator overload of the renderer above.
  std::string ToString(const KVMap& data,
                       const KVMap::const_reverse_iterator& it) {
    if (it == data.rend()) {
      return "END";
    } else {
      return "'" + it->first + "->" + it->second + "'";
    }
  }

  // Renders the iterator's current position in the same format.
  std::string ToString(const Iterator* it) {
    if (!it->Valid()) {
      return "END";
    } else {
      return "'" + it->key().ToString() + "->" + it->value().ToString() + "'";
    }
  }

  // Picks a seek target: an existing key, or (when Prev() is supported)
  // something slightly smaller or slightly larger than an existing key.
  std::string PickRandomKey(Random* rnd, const std::vector<std::string>& keys) {
    if (keys.empty()) {
      return "foo";
    } else {
      const int index = rnd->Uniform(keys.size());
      std::string result = keys[index];
      switch (rnd->Uniform(support_prev_ ? 3 : 1)) {
        case 0:
          // Return an existing key
          break;
        case 1: {
          // Attempt to return something smaller than an existing key
          if (result.size() > 0 && result[result.size() - 1] > '\0'
              && (!only_support_prefix_seek_
                  || options_.prefix_extractor->Transform(result).size()
                  < result.size())) {
            result[result.size() - 1]--;
          }
          break;
      }
        case 2: {
          // Return something larger than an existing key
          Increment(options_.comparator, &result);
          break;
        }
      }
      return result;
    }
  }

  // Returns nullptr if not running against a DB
  DB* db() const { return constructor_->db(); }

 private:
  Options options_ = Options();
  Constructor* constructor_;
  bool support_prev_;            // gates backward-scan / Prev() coverage
  bool only_support_prefix_seek_;
  shared_ptr<InternalKeyComparator> internal_comparator_;
};
+
// Returns true when low <= val <= high; on failure also logs the three
// values to stderr so the enclosing assertion gives a useful message.
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
  if (val < low || val > high) {
    fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
            (unsigned long long)(val),
            (unsigned long long)(low),
            (unsigned long long)(high));
    return false;
  }
  return true;
}
+
+// Tests against all kinds of tables
class TableTest {
 public:
  // Lazily creates a PlainInternalKeyComparator wrapping |comp| and
  // returns it.  NOTE: the comparator built on the first call is cached
  // and reused by all later calls, regardless of the |comp| they pass.
  const InternalKeyComparator& GetPlainInternalComparator(
      const Comparator* comp) {
    if (!plain_internal_comparator) {
      plain_internal_comparator.reset(
          new test::PlainInternalKeyComparator(comp));
    }
    return *plain_internal_comparator;
  }

 private:
  std::unique_ptr<InternalKeyComparator> plain_internal_comparator;
};
+
// Empty fixtures used only to group the TEST() cases below by format.
class GeneralTableTest : public TableTest {};
class BlockBasedTableTest : public TableTest {};
class PlainTableTest : public TableTest {};
// Property tests need no table fixture at all.
class TablePropertyTest {};
+
+// This test serves as the living tutorial for the prefix scan of user collected
+// properties.
+TEST(TablePropertyTest, PrefixScanTest) {
+  UserCollectedProperties props{{"num.111.1", "1"},
+                                {"num.111.2", "2"},
+                                {"num.111.3", "3"},
+                                {"num.333.1", "1"},
+                                {"num.333.2", "2"},
+                                {"num.333.3", "3"},
+                                {"num.555.1", "1"},
+                                {"num.555.2", "2"},
+                                {"num.555.3", "3"}, };
+
+  // prefixes that exist
+  for (const std::string& prefix : {"num.111", "num.333", "num.555"}) {
+    int num = 0;
+    for (auto pos = props.lower_bound(prefix);
+         pos != props.end() &&
+             pos->first.compare(0, prefix.size(), prefix) == 0;
+         ++pos) {
+      ++num;
+      auto key = prefix + "." + std::to_string(num);
+      ASSERT_EQ(key, pos->first);
+      ASSERT_EQ(std::to_string(num), pos->second);
+    }
+    ASSERT_EQ(3, num);
+  }
+
+  // prefixes that don't exist
+  for (const std::string& prefix :
+       {"num.000", "num.222", "num.444", "num.666"}) {
+    auto pos = props.lower_bound(prefix);
+    ASSERT_TRUE(pos == props.end() ||
+                pos->first.compare(0, prefix.size(), prefix) != 0);
+  }
+}
+
+// This test include all the basic checks except those for index size and block
+// size, which will be conducted in separated unit tests.
TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) {
  TableConstructor c(BytewiseComparator());

  // Nine entries: every key is 2 bytes and every value 4 bytes, which
  // makes the expected raw sizes exact multiples of the entry count.
  c.Add("a1", "val1");
  c.Add("b2", "val2");
  c.Add("c3", "val3");
  c.Add("d4", "val4");
  c.Add("e5", "val5");
  c.Add("f6", "val6");
  c.Add("g7", "val7");
  c.Add("h8", "val8");
  c.Add("j9", "val9");

  std::vector<std::string> keys;
  KVMap kvmap;
  Options options;
  options.compression = kNoCompression;
  options.block_restart_interval = 1;

  c.Finish(options, GetPlainInternalComparator(options.comparator), &keys,
           &kvmap);

  auto& props = *c.table_reader()->GetTableProperties();
  ASSERT_EQ(kvmap.size(), props.num_entries);

  auto raw_key_size = kvmap.size() * 2ul;
  auto raw_value_size = kvmap.size() * 4ul;

  ASSERT_EQ(raw_key_size, props.raw_key_size);
  ASSERT_EQ(raw_value_size, props.raw_value_size);
  ASSERT_EQ(1ul, props.num_data_blocks);
  ASSERT_EQ("", props.filter_policy_name);  // no filter policy is used

  // Verify data size.
  // Rebuild the identical single block by hand; the table's data_size
  // property must equal that block plus its trailer.
  BlockBuilder block_builder(options, options.comparator);
  for (const auto& item : kvmap) {
    block_builder.Add(item.first, item.second);
  }
  Slice content = block_builder.Finish();
  ASSERT_EQ(content.size() + kBlockTrailerSize, props.data_size);
}
+
// The name of the configured filter policy must be recorded in the
// table's properties.
TEST(BlockBasedTableTest, FilterPolicyNameProperties) {
  TableConstructor c(BytewiseComparator());
  c.Add("a1", "val1");
  std::vector<std::string> keys;
  KVMap kvmap;
  Options options;
  std::unique_ptr<const FilterPolicy> filter_policy(NewBloomFilterPolicy(10));
  options.filter_policy = filter_policy.get();

  c.Finish(options, GetPlainInternalComparator(options.comparator), &keys,
           &kvmap);
  auto& props = *c.table_reader()->GetTableProperties();
  ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name);
}
+
// Convenience wrapper: returns |len| random characters from *rnd.
static std::string RandomString(Random* rnd, int len) {
  std::string r;
  test::RandomString(rnd, len, &r);
  return r;
}
+
+void AddInternalKey(TableConstructor* c, const std::string prefix,
+                    int suffix_len = 800) {
+  static Random rnd(1023);
+  InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue);
+  c->Add(k.Encode().ToString(), "v");
+}
+
// Exercises BlockBasedTable's hash-based index (kHashSearch) with a
// 3-byte fixed prefix extractor: seeks for existing keys, for absent
// keys that share an existing prefix, and for absent prefixes.
TEST(TableTest, HashIndexTest) {
  TableConstructor c(BytewiseComparator());

  // keys with prefix length 3, make sure the key/value is big enough to fill
  // one block
  AddInternalKey(&c, "0015");
  AddInternalKey(&c, "0035");

  AddInternalKey(&c, "0054");
  AddInternalKey(&c, "0055");

  AddInternalKey(&c, "0056");
  AddInternalKey(&c, "0057");

  AddInternalKey(&c, "0058");
  AddInternalKey(&c, "0075");

  AddInternalKey(&c, "0076");
  AddInternalKey(&c, "0095");

  std::vector<std::string> keys;
  KVMap kvmap;
  Options options;
  BlockBasedTableOptions table_options;
  table_options.index_type = BlockBasedTableOptions::kHashSearch;
  options.table_factory.reset(new BlockBasedTableFactory(table_options));

  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
  options.block_cache = NewLRUCache(1024);
  // Block size chosen so the ten ~800-byte entries spread over 5 blocks
  // (verified via num_data_blocks below).
  options.block_size = 1700;

  std::unique_ptr<InternalKeyComparator> comparator(
      new InternalKeyComparator(BytewiseComparator()));
  c.Finish(options, *comparator, &keys, &kvmap);
  auto reader = c.table_reader();

  auto props = c.table_reader()->GetTableProperties();
  ASSERT_EQ(5u, props->num_data_blocks);

  std::unique_ptr<Iterator> hash_iter(reader->NewIterator(ReadOptions()));

  // -- Find keys do not exist, but have common prefix.
  std::vector<std::string> prefixes = {"001", "003", "005", "007", "009"};
  std::vector<std::string> lower_bound = {keys[0], keys[1], keys[2],
                                          keys[7], keys[9], };

  // find the lower bound of the prefix
  for (size_t i = 0; i < prefixes.size(); ++i) {
    hash_iter->Seek(InternalKey(prefixes[i], 0, kTypeValue).Encode());
    ASSERT_OK(hash_iter->status());
    ASSERT_TRUE(hash_iter->Valid());

    // seek the first element in the block
    ASSERT_EQ(lower_bound[i], hash_iter->key().ToString());
    ASSERT_EQ("v", hash_iter->value().ToString());
  }

  // find the upper bound of prefixes
  std::vector<std::string> upper_bound = {keys[1], keys[2], keys[7], keys[9], };

  // find existing keys
  for (const auto& item : kvmap) {
    auto ukey = ExtractUserKey(item.first).ToString();
    hash_iter->Seek(ukey);

    // ASSERT_OK(regular_iter->status());
    ASSERT_OK(hash_iter->status());

    // ASSERT_TRUE(regular_iter->Valid());
    ASSERT_TRUE(hash_iter->Valid());

    ASSERT_EQ(item.first, hash_iter->key().ToString());
    ASSERT_EQ(item.second, hash_iter->value().ToString());
  }

  for (size_t i = 0; i < prefixes.size(); ++i) {
    // the key is greater than any existing keys.
    auto key = prefixes[i] + "9";
    hash_iter->Seek(InternalKey(key, 0, kTypeValue).Encode());

    ASSERT_OK(hash_iter->status());
    if (i == prefixes.size() - 1) {
      // last key
      ASSERT_TRUE(!hash_iter->Valid());
    } else {
      ASSERT_TRUE(hash_iter->Valid());
      // seek the first element in the block
      ASSERT_EQ(upper_bound[i], hash_iter->key().ToString());
      ASSERT_EQ("v", hash_iter->value().ToString());
    }
  }

  // find keys with prefix that don't match any of the existing prefixes.
  std::vector<std::string> non_exist_prefixes = {"002", "004", "006", "008"};
  for (const auto& prefix : non_exist_prefixes) {
    hash_iter->Seek(InternalKey(prefix, 0, kTypeValue).Encode());
    // regular_iter->Seek(prefix);

    ASSERT_OK(hash_iter->status());
    ASSERT_TRUE(!hash_iter->Valid());
  }
}
+
// It's very hard to figure out the index block size of a block accurately.
// To make sure we cover the index size, we just make sure that as the key
// count grows, the index block size also grows.
TEST(BlockBasedTableTest, IndexSizeStat) {
  uint64_t last_index_size = 0;

  // We need to use random keys since pure human-readable text
  // may compress well, resulting in an insignificant change of index
  // block size.
  Random rnd(test::RandomSeed());
  std::vector<std::string> keys;

  for (int i = 0; i < 100; ++i) {
    keys.push_back(RandomString(&rnd, 10000));
  }

  // Each time we load one more key to the table. the table index block
  // size is expected to be larger than last time's.
  for (size_t i = 1; i < keys.size(); ++i) {
    TableConstructor c(BytewiseComparator());
    for (size_t j = 0; j < i; ++j) {
      c.Add(keys[j], "val");
    }

    std::vector<std::string> ks;
    KVMap kvmap;
    Options options;
    // Disable compression and restart sharing so index growth is strict.
    options.compression = kNoCompression;
    options.block_restart_interval = 1;

    c.Finish(options, GetPlainInternalComparator(options.comparator), &ks,
             &kvmap);
    auto index_size = c.table_reader()->GetTableProperties()->index_size;
    ASSERT_GT(index_size, last_index_size);
    last_index_size = index_size;
  }
}
+
// The num_data_blocks property should match one-entry-per-block when each
// entry nearly fills a block on its own.
TEST(BlockBasedTableTest, NumBlockStat) {
  Random rnd(test::RandomSeed());
  TableConstructor c(BytewiseComparator());
  Options options;
  options.compression = kNoCompression;
  options.block_restart_interval = 1;
  options.block_size = 1000;

  for (int i = 0; i < 10; ++i) {
    // the key/val are slightly smaller than block size, so that each block
    // holds roughly one key/value pair.
    c.Add(RandomString(&rnd, 900), "val");
  }

  std::vector<std::string> ks;
  KVMap kvmap;
  c.Finish(options, GetPlainInternalComparator(options.comparator), &ks,
           &kvmap);
  ASSERT_EQ(kvmap.size(),
            c.table_reader()->GetTableProperties()->num_data_blocks);
}
+
+// A simple tool that takes the snapshot of block cache statistics.
class BlockCachePropertiesSnapshot {
 public:
  // Captures the current block-cache hit/miss tickers (total, index,
  // data, filter) from |statistics|.
  explicit BlockCachePropertiesSnapshot(Statistics* statistics) {
    block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_MISS);
    block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_HIT);
    index_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS);
    index_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT);
    data_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_DATA_MISS);
    data_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_DATA_HIT);
    filter_block_cache_miss =
        statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS);
    filter_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT);
  }

  // Asserts the index-block tickers alone.
  void AssertIndexBlockStat(int64_t index_block_cache_miss,
                            int64_t index_block_cache_hit) {
    ASSERT_EQ(index_block_cache_miss, this->index_block_cache_miss);
    ASSERT_EQ(index_block_cache_hit, this->index_block_cache_hit);
  }

  // Asserts the filter-block tickers alone.
  void AssertFilterBlockStat(int64_t filter_block_cache_miss,
                             int64_t filter_block_cache_hit) {
    ASSERT_EQ(filter_block_cache_miss, this->filter_block_cache_miss);
    ASSERT_EQ(filter_block_cache_hit, this->filter_block_cache_hit);
  }

  // Check if the fetched props matches the expected ones.
  // TODO(kailiu) Use this only when you disabled filter policy!
  // Also cross-checks that the totals equal index + data counts, which
  // only holds when no filter blocks hit the cache.
  void AssertEqual(int64_t index_block_cache_miss,
                   int64_t index_block_cache_hit, int64_t data_block_cache_miss,
                   int64_t data_block_cache_hit) const {
    ASSERT_EQ(index_block_cache_miss, this->index_block_cache_miss);
    ASSERT_EQ(index_block_cache_hit, this->index_block_cache_hit);
    ASSERT_EQ(data_block_cache_miss, this->data_block_cache_miss);
    ASSERT_EQ(data_block_cache_hit, this->data_block_cache_hit);
    ASSERT_EQ(index_block_cache_miss + data_block_cache_miss,
              this->block_cache_miss);
    ASSERT_EQ(index_block_cache_hit + data_block_cache_hit,
              this->block_cache_hit);
  }

 private:
  int64_t block_cache_miss = 0;
  int64_t block_cache_hit = 0;
  int64_t index_block_cache_miss = 0;
  int64_t index_block_cache_hit = 0;
  int64_t data_block_cache_miss = 0;
  int64_t data_block_cache_hit = 0;
  int64_t filter_block_cache_miss = 0;
  int64_t filter_block_cache_hit = 0;
};
+
+// Make sure, by default, index/filter blocks were pre-loaded (meaning we won't
+// use block cache to store them).
TEST(BlockBasedTableTest, BlockCacheDisabledTest) {
  Options options;
  options.create_if_missing = true;
  options.statistics = CreateDBStatistics();
  options.block_cache = NewLRUCache(1024);
  std::unique_ptr<const FilterPolicy> filter_policy(NewBloomFilterPolicy(10));
  options.filter_policy = filter_policy.get();
  BlockBasedTableOptions table_options;
  // Intentionally commented out: table_options.cache_index_and_filter_blocks =
  // true;
  options.table_factory.reset(new BlockBasedTableFactory(table_options));
  std::vector<std::string> keys;
  KVMap kvmap;

  TableConstructor c(BytewiseComparator());
  c.Add("key", "value");
  c.Finish(options, GetPlainInternalComparator(options.comparator), &keys,
           &kvmap);

  // preloading filter/index blocks is enabled.
  auto reader = dynamic_cast<BlockBasedTable*>(c.table_reader());
  ASSERT_TRUE(reader->TEST_filter_block_preloaded());
  ASSERT_TRUE(reader->TEST_index_reader_preloaded());

  {
    // nothing happens in the beginning
    BlockCachePropertiesSnapshot props(options.statistics.get());
    props.AssertIndexBlockStat(0, 0);
    props.AssertFilterBlockStat(0, 0);
  }

  {
    // a hack that just to trigger BlockBasedTable::GetFilter.
    // Since the blocks were preloaded, no cache tickers may move.
    reader->Get(ReadOptions(), "non-exist-key", nullptr, nullptr, nullptr);
    BlockCachePropertiesSnapshot props(options.statistics.get());
    props.AssertIndexBlockStat(0, 0);
    props.AssertFilterBlockStat(0, 0);
  }
}
+
// Due to the difficulties of the interaction between statistics, this test
// only tests the case when "index block is put to block cache"
+TEST(BlockBasedTableTest, FilterBlockInBlockCache) {
+  // -- Table construction
+  Options options;
+  options.create_if_missing = true;
+  options.statistics = CreateDBStatistics();
+  options.block_cache = NewLRUCache(1024);
+
+  // Enable the cache for index/filter blocks
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  std::vector<std::string> keys;
+  KVMap kvmap;
+
+  TableConstructor c(BytewiseComparator());
+  c.Add("key", "value");
+  c.Finish(options, GetPlainInternalComparator(options.comparator), &keys,
+           &kvmap);
+  // preloading filter/index blocks is prohibited.
+  auto reader = dynamic_cast<BlockBasedTable*>(c.table_reader());
+  ASSERT_TRUE(!reader->TEST_filter_block_preloaded());
+  ASSERT_TRUE(!reader->TEST_index_reader_preloaded());
+
+  // -- PART 1: Open with regular block cache.
+  // Since block_cache is disabled, no cache activities will be involved.
+  unique_ptr<Iterator> iter;
+
+  // At first, no block will be accessed.
+  {
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    // index will be added to block cache.
+    props.AssertEqual(1,  // index block miss
+                      0, 0, 0);
+  }
+
+  // Only index block will be accessed
+  {
+    iter.reset(c.NewIterator());
+    BlockCachePropertiesSnapshot props(options.statistics.get());
    // NOTE: to help better highlight the "delta" of each ticker, I use
    // <last_value> + <added_value> to indicate the increment of changed
    // value; other numbers remain the same.
+    props.AssertEqual(1, 0 + 1,  // index block hit
+                      0, 0);
+  }
+
+  // Only data block will be accessed
+  {
+    iter->SeekToFirst();
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertEqual(1, 1, 0 + 1,  // data block miss
+                      0);
+  }
+
+  // Data block will be in cache
+  {
+    iter.reset(c.NewIterator());
+    iter->SeekToFirst();
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertEqual(1, 1 + 1, /* index block hit */
+                      1, 0 + 1 /* data block hit */);
+  }
+  // release the iterator so that the block cache can reset correctly.
+  iter.reset();
+
+  // -- PART 2: Open without block cache
+  options.block_cache.reset();
+  options.statistics = CreateDBStatistics();  // reset the stats
+  c.Reopen(options);
+
+  {
+    iter.reset(c.NewIterator());
+    iter->SeekToFirst();
+    ASSERT_EQ("key", iter->key().ToString());
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    // Nothing is affected at all
+    props.AssertEqual(0, 0, 0, 0);
+  }
+
+  // -- PART 3: Open with very small block cache
+  // In this test, no block will ever get hit since the block cache is
+  // too small to fit even one entry.
+  options.block_cache = NewLRUCache(1);
+  c.Reopen(options);
+  {
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertEqual(1,  // index block miss
+                      0, 0, 0);
+  }
+
+
+  {
+    // Both index and data block get accessed.
+    // It first cache index block then data block. But since the cache size
+    // is only 1, index block will be purged after data block is inserted.
+    iter.reset(c.NewIterator());
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertEqual(1 + 1,  // index block miss
+                      0, 0,   // data block miss
+                      0);
+  }
+
+  {
+    // SeekToFirst() accesses data block. With similar reason, we expect data
+    // block's cache miss.
+    iter->SeekToFirst();
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertEqual(2, 0, 0 + 1,  // data block miss
+                      0);
+  }
+}
+
+TEST(BlockBasedTableTest, BlockCacheLeak) {
+  // Check that when we reopen a table we don't lose access to blocks already
+  // in the cache. This test checks whether the Table actually makes use of the
+  // unique ID from the file.
+
+  Options opt;
+  unique_ptr<InternalKeyComparator> ikc;
+  ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
+  opt.block_size = 1024;
+  opt.compression = kNoCompression;
+  opt.block_cache =
+      NewLRUCache(16 * 1024 * 1024);  // big enough so we don't ever
+                                      // lose cached values.
+
+  TableConstructor c(BytewiseComparator());
+  c.Add("k01", "hello");
+  c.Add("k02", "hello2");
+  c.Add("k03", std::string(10000, 'x'));
+  c.Add("k04", std::string(200000, 'x'));
+  c.Add("k05", std::string(300000, 'x'));
+  c.Add("k06", "hello3");
+  c.Add("k07", std::string(100000, 'x'));
+  std::vector<std::string> keys;
+  KVMap kvmap;
+  c.Finish(opt, *ikc, &keys, &kvmap);
+
+  // Scan the whole table once so every data block is pulled into the cache.
+  unique_ptr<Iterator> iter(c.NewIterator());
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    iter->key();
+    iter->value();
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+
+  // After reopening, the blocks cached by the previous reader must still be
+  // reachable for every key.
+  ASSERT_OK(c.Reopen(opt));
+  auto table_reader = dynamic_cast<BlockBasedTable*>(c.table_reader());
+  for (const std::string& key : keys) {
+    ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key));
+  }
+}
+
+TEST(PlainTableTest, BasicPlainTableProperties) {
+  // Builds a 26-entry plain table into a StringSink and verifies the table
+  // properties that ReadTableProperties() recovers from it.
+  PlainTableFactory factory(8, 8, 0);
+  StringSink sink;
+  Options options;
+  InternalKeyComparator ikc(options.comparator);
+  std::unique_ptr<TableBuilder> builder(
+      factory.NewTableBuilder(options, ikc, &sink, kNoCompression));
+
+  for (char c = 'a'; c <= 'z'; ++c) {
+    std::string key(8, c);
+    key.append("\1       ");  // PlainTable expects internal key structure
+    std::string value(28, c + 42);
+    builder->Add(key, value);
+  }
+  ASSERT_OK(builder->Finish());
+
+  // NOTE(review): 72242 looks like an arbitrary fake unique-file id for the
+  // StringSource -- confirm it is not meant to be sink.contents().size().
+  StringSource source(sink.contents(), 72242, true);
+
+  TableProperties* props = nullptr;
+  auto s = ReadTableProperties(&source, sink.contents().size(),
+                               kPlainTableMagicNumber, Env::Default(), nullptr,
+                               &props);
+  std::unique_ptr<TableProperties> props_guard(props);
+  ASSERT_OK(s);
+
+  // No on-disk index/filter blocks are expected; raw key size is
+  // 26 keys * (8 user bytes + 8-byte appended suffix) = 16 * 26.
+  ASSERT_EQ(0ul, props->index_size);
+  ASSERT_EQ(0ul, props->filter_size);
+  ASSERT_EQ(16ul * 26, props->raw_key_size);
+  ASSERT_EQ(28ul * 26, props->raw_value_size);
+  ASSERT_EQ(26ul, props->num_entries);
+  ASSERT_EQ(1ul, props->num_data_blocks);
+}
+
+TEST(GeneralTableTest, ApproximateOffsetOfPlain) {
+  TableConstructor c(BytewiseComparator());
+  c.Add("k01", "hello");
+  c.Add("k02", "hello2");
+  c.Add("k03", std::string(10000, 'x'));
+  c.Add("k04", std::string(200000, 'x'));
+  c.Add("k05", std::string(300000, 'x'));
+  c.Add("k06", "hello3");
+  c.Add("k07", std::string(100000, 'x'));
+  std::vector<std::string> keys;
+  KVMap kvmap;
+  Options options;
+  test::PlainInternalKeyComparator internal_comparator(options.comparator);
+  options.block_size = 1024;
+  options.compression = kNoCompression;
+  c.Finish(options, internal_comparator, &keys, &kvmap);
+
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"),      0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"),   10000,  11000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 210000, 211000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"),  210000, 211000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"),  510000, 511000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"),  510000, 511000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"),  610000, 612000));
+}
+
+static void DoCompressionTest(CompressionType comp) {
+  Random rnd(301);
+  TableConstructor c(BytewiseComparator());
+  std::string tmp;
+  c.Add("k01", "hello");
+  c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
+  c.Add("k03", "hello3");
+  c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
+  std::vector<std::string> keys;
+  KVMap kvmap;
+  Options options;
+  test::PlainInternalKeyComparator ikc(options.comparator);
+  options.block_size = 1024;
+  options.compression = comp;
+  c.Finish(options, ikc, &keys, &kvmap);
+
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"),    2000,   3000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"),    2000,   3000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"),    4000,   6100));
+}
+
+TEST(GeneralTableTest, ApproximateOffsetOfCompressed) {
+  std::vector<CompressionType> compression_state;
+  if (!SnappyCompressionSupported()) {
+    fprintf(stderr, "skipping snappy compression tests\n");
+  } else {
+    compression_state.push_back(kSnappyCompression);
+  }
+
+  if (!ZlibCompressionSupported()) {
+    fprintf(stderr, "skipping zlib compression tests\n");
+  } else {
+    compression_state.push_back(kZlibCompression);
+  }
+
+  // TODO(kailiu) DoCompressionTest() doesn't work with BZip2.
+  /*
+  if (!BZip2CompressionSupported()) {
+    fprintf(stderr, "skipping bzip2 compression tests\n");
+  } else {
+    compression_state.push_back(kBZip2Compression);
+  }
+  */
+
+  if (!LZ4CompressionSupported()) {
+    fprintf(stderr, "skipping lz4 compression tests\n");
+  } else {
+    compression_state.push_back(kLZ4Compression);
+  }
+
+  if (!LZ4HCCompressionSupported()) {
+    fprintf(stderr, "skipping lz4hc compression tests\n");
+  } else {
+    compression_state.push_back(kLZ4HCCompression);
+  }
+
+  for (auto state : compression_state) {
+    DoCompressionTest(state);
+  }
+}
+
+// Randomized stress: for each table configuration, builds tables with an
+// increasing number of random entries and runs the harness' Test() pass
+// over them.
+TEST(Harness, Randomized) {
+  std::vector<TestArgs> args = GenerateArgList();
+  for (unsigned int i = 0; i < args.size(); i++) {
+    Init(args[i]);
+    Random rnd(test::RandomSeed() + 5);
+    // Ramp up slowly below 50 entries, then jump in steps of 200.
+    for (int num_entries = 0; num_entries < 2000;
+         num_entries += (num_entries < 50 ? 1 : 200)) {
+      if ((num_entries % 10) == 0) {
+        fprintf(stderr, "case %d of %d: num_entries = %d\n", (i + 1),
+                static_cast<int>(args.size()), num_entries);
+      }
+      for (int e = 0; e < num_entries; e++) {
+        std::string v;
+        Add(test::RandomKey(&rnd, rnd.Skewed(4)),
+            test::RandomString(&rnd, rnd.Skewed(5), &v).ToString());
+      }
+      Test(&rnd);
+    }
+  }
+}
+
+// Long-running variant against a real DB (DB_TEST): inserts 100k random
+// entries, runs the harness Test() pass, then checks that SST files exist.
+TEST(Harness, RandomizedLongDB) {
+  Random rnd(test::RandomSeed());
+  TestArgs args = { DB_TEST, false, 16, kNoCompression };
+  Init(args);
+  int num_entries = 100000;
+  for (int e = 0; e < num_entries; e++) {
+    std::string v;
+    Add(test::RandomKey(&rnd, rnd.Skewed(4)),
+        test::RandomString(&rnd, rnd.Skewed(5), &v).ToString());
+  }
+  Test(&rnd);
+
+  // We must have created enough data to force merging
+  int files = 0;
+  for (int level = 0; level < db()->NumberLevels(); level++) {
+    std::string value;
+    char name[100];
+    snprintf(name, sizeof(name), "rocksdb.num-files-at-level%d", level);
+    ASSERT_TRUE(db()->GetProperty(name, &value));
+    files += atoi(value.c_str());
+  }
+  // At least one file must exist across all levels.
+  ASSERT_GT(files, 0);
+}
+
+// Empty fixture tag required by the test harness.
+class MemTableTest { };
+
+// Directly exercises MemTable: inserts a small WriteBatch and dumps the
+// contents through the memtable iterator.
+TEST(MemTableTest, Simple) {
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto table_factory = std::make_shared<SkipListFactory>();
+  Options options;
+  options.memtable_factory = table_factory;
+  MemTable* memtable = new MemTable(cmp, options);
+  memtable->Ref();
+  WriteBatch batch;
+  WriteBatchInternal::SetSequence(&batch, 100);
+  batch.Put(std::string("k1"), std::string("v1"));
+  batch.Put(std::string("k2"), std::string("v2"));
+  batch.Put(std::string("k3"), std::string("v3"));
+  batch.Put(std::string("largekey"), std::string("vlarge"));
+  ColumnFamilyMemTablesDefault cf_mems_default(memtable, &options);
+  ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &cf_mems_default).ok());
+
+  Iterator* iter = memtable->NewIterator(ReadOptions());
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    fprintf(stderr, "key: '%s' -> '%s'\n",
+            iter->key().ToString().c_str(),
+            iter->value().ToString().c_str());
+    iter->Next();
+  }
+
+  delete iter;
+  // Unref() hands the pointer back once the refcount drops so it can be
+  // deleted here.
+  delete memtable->Unref();
+}
+
+// Test the empty key
+TEST(Harness, SimpleEmptyKey) {
+  for (const auto& test_args : GenerateArgList()) {
+    Init(test_args);
+    Random rnd(test::RandomSeed() + 1);
+    Add("", "v");
+    Test(&rnd);
+  }
+}
+
+// A single short key/value pair, across every table configuration.
+TEST(Harness, SimpleSingle) {
+  for (const auto& test_args : GenerateArgList()) {
+    Init(test_args);
+    Random rnd(test::RandomSeed() + 2);
+    Add("abc", "v");
+    Test(&rnd);
+  }
+}
+
+// Several keys, including one that is a prefix of another.
+TEST(Harness, SimpleMulti) {
+  for (const auto& test_args : GenerateArgList()) {
+    Init(test_args);
+    Random rnd(test::RandomSeed() + 3);
+    Add("abc", "v");
+    Add("abcd", "v");
+    Add("ac", "v2");
+    Test(&rnd);
+  }
+}
+
+// A key consisting entirely of 0xff bytes (upper comparator boundary).
+TEST(Harness, SimpleSpecialKey) {
+  for (const auto& test_args : GenerateArgList()) {
+    Init(test_args);
+    Random rnd(test::RandomSeed() + 4);
+    Add("\xff\xff", "v3");
+    Test(&rnd);
+  }
+}
+
+TEST(Harness, FooterTests) {
+  {
+    // upconvert legacy block based
+    std::string encoded;
+    Footer footer(kLegacyBlockBasedTableMagicNumber);
+    BlockHandle meta_index(10, 5), index(20, 15);
+    footer.set_metaindex_handle(meta_index);
+    footer.set_index_handle(index);
+    footer.EncodeTo(&encoded);
+    Footer decoded_footer;
+    Slice encoded_slice(encoded);
+    decoded_footer.DecodeFrom(&encoded_slice);
+    ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
+    ASSERT_EQ(decoded_footer.checksum(), kCRC32c);
+    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+    ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+    ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+  }
+  {
+    // xxhash block based
+    std::string encoded;
+    Footer footer(kBlockBasedTableMagicNumber);
+    BlockHandle meta_index(10, 5), index(20, 15);
+    footer.set_metaindex_handle(meta_index);
+    footer.set_index_handle(index);
+    footer.set_checksum(kxxHash);
+    footer.EncodeTo(&encoded);
+    Footer decoded_footer;
+    Slice encoded_slice(encoded);
+    decoded_footer.DecodeFrom(&encoded_slice);
+    ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
+    ASSERT_EQ(decoded_footer.checksum(), kxxHash);
+    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+    ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+    ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+  }
+  {
+    // upconvert legacy plain table
+    std::string encoded;
+    Footer footer(kLegacyPlainTableMagicNumber);
+    BlockHandle meta_index(10, 5), index(20, 15);
+    footer.set_metaindex_handle(meta_index);
+    footer.set_index_handle(index);
+    footer.EncodeTo(&encoded);
+    Footer decoded_footer;
+    Slice encoded_slice(encoded);
+    decoded_footer.DecodeFrom(&encoded_slice);
+    ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
+    ASSERT_EQ(decoded_footer.checksum(), kCRC32c);
+    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+    ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+    ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+  }
+  {
+    // xxhash block based
+    std::string encoded;
+    Footer footer(kPlainTableMagicNumber);
+    BlockHandle meta_index(10, 5), index(20, 15);
+    footer.set_metaindex_handle(meta_index);
+    footer.set_index_handle(index);
+    footer.set_checksum(kxxHash);
+    footer.EncodeTo(&encoded);
+    Footer decoded_footer;
+    Slice encoded_slice(encoded);
+    decoded_footer.DecodeFrom(&encoded_slice);
+    ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
+    ASSERT_EQ(decoded_footer.checksum(), kxxHash);
+    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+    ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+    ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  // Run every TEST() registered above via the rocksdb test harness.
+  return rocksdb::test::RunAllTests();
+}
diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc
new file mode 100644 (file)
index 0000000..990f181
--- /dev/null
@@ -0,0 +1,190 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/two_level_iterator.h"
+
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block.h"
+#include "table/format.h"
+
+namespace rocksdb {
+
+namespace {
+
+// Two-level iterator: first_level_iter_ walks an index whose values identify
+// "blocks"; for each index entry, state_->NewSecondaryIterator() opens a
+// second-level iterator over that block's key/value pairs.
+class TwoLevelIterator: public Iterator {
+ public:
+  explicit TwoLevelIterator(TwoLevelIteratorState* state,
+      Iterator* first_level_iter);
+
+  virtual ~TwoLevelIterator() {}
+
+  virtual void Seek(const Slice& target);
+  virtual void SeekToFirst();
+  virtual void SeekToLast();
+  virtual void Next();
+  virtual void Prev();
+
+  // The iterator is positioned iff the second-level iterator is.
+  virtual bool Valid() const {
+    return second_level_iter_.Valid();
+  }
+  virtual Slice key() const {
+    assert(Valid());
+    return second_level_iter_.key();
+  }
+  virtual Slice value() const {
+    assert(Valid());
+    return second_level_iter_.value();
+  }
+  // First-level errors take precedence, then second-level, then any error
+  // previously recorded via SaveError().
+  virtual Status status() const {
+    // It'd be nice if status() returned a const Status& instead of a Status
+    if (!first_level_iter_.status().ok()) {
+      return first_level_iter_.status();
+    } else if (second_level_iter_.iter() != nullptr &&
+               !second_level_iter_.status().ok()) {
+      return second_level_iter_.status();
+    } else {
+      return status_;
+    }
+  }
+
+ private:
+  // Records the first non-OK status seen; later errors are ignored.
+  void SaveError(const Status& s) {
+    if (status_.ok() && !s.ok()) status_ = s;
+  }
+  void SkipEmptyDataBlocksForward();
+  void SkipEmptyDataBlocksBackward();
+  void SetSecondLevelIterator(Iterator* iter);
+  void InitDataBlock();
+
+  std::unique_ptr<TwoLevelIteratorState> state_;
+  IteratorWrapper first_level_iter_;
+  IteratorWrapper second_level_iter_;  // May be nullptr
+  Status status_;
+  // If second_level_iter is non-nullptr, then "data_block_handle_" holds the
+  // index value passed to state_->NewSecondaryIterator() to create the
+  // second_level_iter.
+  std::string data_block_handle_;
+};
+
+// Takes ownership of `state` (held in a unique_ptr) and of
+// `first_level_iter` (managed by the IteratorWrapper member).
+TwoLevelIterator::TwoLevelIterator(TwoLevelIteratorState* state,
+    Iterator* first_level_iter)
+  : state_(state), first_level_iter_(first_level_iter) {}
+
+void TwoLevelIterator::Seek(const Slice& target) {
+  // Optional prefix bloom check: if the target's prefix cannot match,
+  // invalidate the iterator without touching the index at all.
+  if (state_->check_prefix_may_match &&
+      !state_->PrefixMayMatch(target)) {
+    SetSecondLevelIterator(nullptr);
+    return;
+  }
+  first_level_iter_.Seek(target);
+
+  InitDataBlock();
+  if (second_level_iter_.iter() != nullptr) {
+    second_level_iter_.Seek(target);
+  }
+  SkipEmptyDataBlocksForward();
+}
+
+// Position at the first entry of the first non-empty block.
+void TwoLevelIterator::SeekToFirst() {
+  first_level_iter_.SeekToFirst();
+  InitDataBlock();
+  if (second_level_iter_.iter() != nullptr) {
+    second_level_iter_.SeekToFirst();
+  }
+  SkipEmptyDataBlocksForward();
+}
+
+// Position at the last entry of the last non-empty block.
+void TwoLevelIterator::SeekToLast() {
+  first_level_iter_.SeekToLast();
+  InitDataBlock();
+  if (second_level_iter_.iter() != nullptr) {
+    second_level_iter_.SeekToLast();
+  }
+  SkipEmptyDataBlocksBackward();
+}
+
+// Step within the current block; roll over to the next block when exhausted.
+void TwoLevelIterator::Next() {
+  assert(Valid());
+  second_level_iter_.Next();
+  SkipEmptyDataBlocksForward();
+}
+
+// Step within the current block; roll back to the previous block when
+// exhausted.
+void TwoLevelIterator::Prev() {
+  assert(Valid());
+  second_level_iter_.Prev();
+  SkipEmptyDataBlocksBackward();
+}
+
+
+// Advances over missing/exhausted blocks until the second-level iterator is
+// valid or the index is exhausted.  An Incomplete() status (e.g. a cache-only
+// read that missed) stops the skipping so callers can observe it.
+void TwoLevelIterator::SkipEmptyDataBlocksForward() {
+  while (second_level_iter_.iter() == nullptr ||
+         (!second_level_iter_.Valid() &&
+         !second_level_iter_.status().IsIncomplete())) {
+    // Move to next block
+    if (!first_level_iter_.Valid()) {
+      SetSecondLevelIterator(nullptr);
+      return;
+    }
+    first_level_iter_.Next();
+    InitDataBlock();
+    if (second_level_iter_.iter() != nullptr) {
+      second_level_iter_.SeekToFirst();
+    }
+  }
+}
+
+// Mirror of SkipEmptyDataBlocksForward() for reverse iteration.
+void TwoLevelIterator::SkipEmptyDataBlocksBackward() {
+  while (second_level_iter_.iter() == nullptr ||
+         (!second_level_iter_.Valid() &&
+         !second_level_iter_.status().IsIncomplete())) {
+    // Move to the previous block
+    if (!first_level_iter_.Valid()) {
+      SetSecondLevelIterator(nullptr);
+      return;
+    }
+    first_level_iter_.Prev();
+    InitDataBlock();
+    if (second_level_iter_.iter() != nullptr) {
+      second_level_iter_.SeekToLast();
+    }
+  }
+}
+
+// Replaces the second-level iterator, preserving any error the outgoing
+// iterator carried so status() can still report it.
+void TwoLevelIterator::SetSecondLevelIterator(Iterator* iter) {
+  if (second_level_iter_.iter() != nullptr) {
+    SaveError(second_level_iter_.status());
+  }
+  second_level_iter_.Set(iter);
+}
+
+void TwoLevelIterator::InitDataBlock() {
+  if (!first_level_iter_.Valid()) {
+    SetSecondLevelIterator(nullptr);
+  } else {
+    Slice handle = first_level_iter_.value();
+    if (second_level_iter_.iter() != nullptr
+        && handle.compare(data_block_handle_) == 0) {
+      // second_level_iter is already constructed with this iterator, so
+      // no need to change anything
+    } else {
+      Iterator* iter = state_->NewSecondaryIterator(handle);
+      data_block_handle_.assign(handle.data(), handle.size());
+      SetSecondLevelIterator(iter);
+    }
+  }
+}
+
+}  // namespace
+
+// Factory; ownership of both arguments passes to the returned iterator.
+Iterator* NewTwoLevelIterator(TwoLevelIteratorState* state,
+      Iterator* first_level_iter) {
+  return new TwoLevelIterator(state, first_level_iter);
+}
+
+}  // namespace rocksdb
diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h
new file mode 100644 (file)
index 0000000..b808338
--- /dev/null
@@ -0,0 +1,45 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/iterator.h"
+#include "rocksdb/env.h"
+#include "table/iterator_wrapper.h"
+
+namespace rocksdb {
+
+struct ReadOptions;
+class InternalKeyComparator;
+
+// Callback interface supplied by the caller of NewTwoLevelIterator():
+// it produces second-level iterators and answers prefix-filter queries.
+struct TwoLevelIteratorState {
+  explicit TwoLevelIteratorState(bool check_prefix_may_match)
+    : check_prefix_may_match(check_prefix_may_match) {}
+
+  virtual ~TwoLevelIteratorState() {}
+  // Creates the second-level iterator for the block identified by `handle`.
+  virtual Iterator* NewSecondaryIterator(const Slice& handle) = 0;
+  virtual bool PrefixMayMatch(const Slice& internal_key) = 0;
+
+  // Whether Seek() should consult PrefixMayMatch() before positioning.
+  bool check_prefix_may_match;
+};
+
+
+// Return a new two level iterator.  A two-level iterator contains a
+// first-level (index) iterator whose values identify a sequence of blocks,
+// where each block is itself a sequence of key,value pairs.  The returned
+// two-level iterator yields the concatenation of all key/value pairs in the
+// sequence of blocks.  Takes ownership of "state" and "first_level_iter" and
+// will delete them when no longer needed.
+//
+// state->NewSecondaryIterator() is used to convert a first-level value into
+// an iterator over the contents of the corresponding block.
+extern Iterator* NewTwoLevelIterator(TwoLevelIteratorState* state,
+      Iterator* first_level_iter);
+
+}  // namespace rocksdb
diff --git a/tools/auto_sanity_test.sh b/tools/auto_sanity_test.sh
new file mode 100755 (executable)
index 0000000..2d63c0a
--- /dev/null
@@ -0,0 +1,71 @@
+TMP_DIR="/tmp/rocksdb-sanity-test"
+
+if [ "$#" -lt 2 ]; then
+  echo "usage: ./auto_sanity_test.sh [new_commit] [old_commit]"
+  echo "Missing either [new_commit] or [old_commit], perform sanity check with the latest and 10th latest commits."
+  recent_commits=`git log | grep -e "^commit [a-z0-9]\+$"| head -n10 | sed -e 's/commit //g'`
+  commit_new=`echo "$recent_commits" | head -n1`
+  commit_old=`echo "$recent_commits" | tail -n1`
+  echo "the most recent commits are:"
+  echo "$recent_commits"
+else
+  commit_new=$1
+  commit_old=$2
+fi
+
+if [ ! -d $TMP_DIR ]; then
+  mkdir $TMP_DIR
+fi
+dir_new="${TMP_DIR}/${commit_new}"
+dir_old="${TMP_DIR}/${commit_old}"
+
+# Builds db_sanity_test from the currently checked-out tree; aborts the
+# whole script if the build fails.
+function makestuff() {
+  echo "make clean"
+  make clean > /dev/null
+  echo "make db_sanity_test -j32"
+  make db_sanity_test -j32 > /dev/null
+  if [ $? -ne 0 ]; then
+    echo "[ERROR] Failed to perform 'make db_sanity_test'"
+    exit 1
+  fi
+}
+
+rm -r -f $dir_new
+rm -r -f $dir_old
+
+echo "Running db sanity check with commits $commit_new and $commit_old."
+
+echo "============================================================="
+echo "Making build $commit_new"
+makestuff
+mv db_sanity_test new_db_sanity_test
+echo "Creating db based on the new commit --- $commit_new"
+./new_db_sanity_test $dir_new create
+
+echo "============================================================="
+echo "Making build $commit_old"
+makestuff
+mv db_sanity_test old_db_sanity_test
+echo "Creating db based on the old commit --- $commit_old"
+./old_db_sanity_test $dir_old create
+
+echo "============================================================="
+echo "Verifying new db $dir_new using the old commit --- $commit_old"
+./old_db_sanity_test $dir_new verify
+if [ $? -ne 0 ]; then
+  echo "[ERROR] Verification of $dir_new using commit $commit_old failed."
+  exit 2
+fi
+
+echo "============================================================="
+echo "Verifying old db $dir_old using the new commit --- $commit_new"
+./new_db_sanity_test $dir_old verify
+if [ $? -ne 0 ]; then
+  echo "[ERROR] Verification of $dir_old using commit $commit_new failed."
+  exit 2
+fi
+
+rm old_db_sanity_test
+rm new_db_sanity_test
+
+echo "Auto sanity test passed!"
diff --git a/tools/blob_store_bench.cc b/tools/blob_store_bench.cc
new file mode 100644 (file)
index 0000000..60a0b84
--- /dev/null
@@ -0,0 +1,280 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+// BlobStore does costly asserts to make sure it's running correctly, which
+// significantly impacts benchmark runtime.
+// NDEBUG must be defined *before* any header that defines assert();
+// defining it after the includes (as before) left the asserts compiled in.
+#ifndef NDEBUG
+#define NDEBUG
+#endif
+
+#include <cstdio>
+#include <vector>
+#include <atomic>
+
+#include "rocksdb/env.h"
+#include "util/blob_store.h"
+#include "util/testutil.h"
+
+#define KB 1024LL
+#define MB 1024*1024LL
+
+using namespace rocksdb;
+using namespace std;
+
+// used by all threads
+uint64_t timeout_sec;
+Env *env;
+BlobStore* bs;
+
+namespace {
+// Produce a random string of `len` bytes via the shared test helper.
+std::string RandomString(Random* rnd, uint64_t len) {
+  std::string result;
+  test::RandomString(rnd, len, &result);
+  return result;
+}
+}  // namespace
+
+struct Result {
+  uint32_t writes;
+  uint32_t reads;
+  uint32_t deletes;
+  uint64_t data_written;
+  uint64_t data_read;
+
+  void print() {
+    printf("Total writes = %u\n", writes);
+    printf("Total reads = %u\n", reads);
+    printf("Total deletes = %u\n", deletes);
+    printf("Write throughput = %lf MB/s\n",
+           (double)data_written / (1024*1024.0) / timeout_sec);
+    printf("Read throughput = %lf MB/s\n",
+           (double)data_read / (1024*1024.0) / timeout_sec);
+    printf("Total throughput = %lf MB/s\n",
+           (double)(data_read + data_written) / (1024*1024.0) / timeout_sec);
+  }
+
+  Result() {
+    writes = reads = deletes = data_read = data_written = 0;
+  }
+
+  Result (uint32_t writes, uint32_t reads, uint32_t deletes,
+          uint64_t data_written, uint64_t data_read) :
+    writes(writes), reads(reads), deletes(deletes),
+    data_written(data_written), data_read(data_read) {}
+
+};
+
+namespace {
+// Element-wise sum of two thread results, used to aggregate totals.
+Result operator + (const Result &a, const Result &b) {
+  Result sum;
+  sum.writes = a.writes + b.writes;
+  sum.reads = a.reads + b.reads;
+  sum.deletes = a.deletes + b.deletes;
+  sum.data_written = a.data_written + b.data_written;
+  sum.data_read = a.data_read + b.data_read;
+  return sum;
+}
+}  // namespace
+
+// Per-thread benchmark configuration plus its result and completion flag.
+struct WorkerThread {
+  uint64_t data_size_from, data_size_to;  // blob size range for writes
+  double read_ratio;                      // fraction of ops that are reads
+  uint64_t working_set_size; // start deleting once you reach this
+  Result result;
+  atomic<bool> stopped;      // set by the worker when its time budget expires
+
+  WorkerThread(uint64_t data_size_from, uint64_t data_size_to,
+                double read_ratio, uint64_t working_set_size) :
+    data_size_from(data_size_from), data_size_to(data_size_to),
+    read_ratio(read_ratio), working_set_size(working_set_size),
+    stopped(false) {}
+
+  // Copy ctor deliberately resets `stopped` (atomics are not copyable).
+  WorkerThread(const WorkerThread& wt) :
+    data_size_from(wt.data_size_from), data_size_to(wt.data_size_to),
+    read_ratio(wt.read_ratio), working_set_size(wt.working_set_size),
+    stopped(false) {}
+};
+
+// Thread entry point: mixes reads/writes against the global BlobStore per the
+// thread's read_ratio until `timeout_sec` elapses, deleting random blobs
+// whenever the live data exceeds working_set_size.
+// NOTE(review): uses rand() from multiple threads (not thread-safe) -- likely
+// acceptable for a benchmark, but confirm.
+static void WorkerThreadBody(void* arg) {
+  WorkerThread* t = reinterpret_cast<WorkerThread*>(arg);
+  Random rnd(5);
+  string buf;
+  vector<pair<Blob, uint64_t>> blobs;
+  vector<string> random_strings;
+
+  // Pre-generate payloads so the write path doesn't pay for string creation.
+  for (int i = 0; i < 10; ++i) {
+    random_strings.push_back(RandomString(&rnd, t->data_size_to));
+  }
+
+  uint64_t total_size = 0;
+
+  uint64_t start_micros = env->NowMicros();
+  while (env->NowMicros() - start_micros < timeout_sec * 1000 * 1000) {
+    if (blobs.size() && rand() < RAND_MAX * t->read_ratio) {
+      // read
+      int bi = rand() % blobs.size();
+      Status s = bs->Get(blobs[bi].first, &buf);
+      assert(s.ok());
+      t->result.data_read += buf.size();
+      t->result.reads++;
+    } else {
+      // write
+      // NOTE(review): modulo by (data_size_to - data_size_from) divides by
+      // zero if the two bounds are equal -- confirm callers never do that.
+      uint64_t size = rand() % (t->data_size_to - t->data_size_from) +
+        t->data_size_from;
+      total_size += size;
+      string put_str = random_strings[rand() % random_strings.size()];
+      blobs.push_back(make_pair(Blob(), size));
+      Status s = bs->Put(Slice(put_str.data(), size), &blobs.back().first);
+      assert(s.ok());
+      t->result.data_written += size;
+      t->result.writes++;
+    }
+
+    // Evict random blobs until we're back under the working-set budget.
+    while (total_size >= t->working_set_size) {
+      // delete random
+      int bi = rand() % blobs.size();
+      total_size -= blobs[bi].second;
+      bs->Delete(blobs[bi].first);
+      blobs.erase(blobs.begin() + bi);
+      t->result.deletes++;
+    }
+  }
+  t->stopped.store(true);
+}
+
+namespace {
+// Launches one thread per WorkerThread config, waits for all of them to
+// finish, sums their results, and tears down the configs and the global
+// BlobStore.
+Result StartBenchmark(vector<WorkerThread*>& config) {
+  for (auto w : config) {
+    env->StartThread(WorkerThreadBody, w);
+  }
+
+  Result result;
+
+  for (auto w : config) {
+    // Busy-wait until the worker flags completion (burns a core per waiter).
+    while (!w->stopped.load());
+    result = result + w->result;
+  }
+
+  for (auto w : config) {
+    delete w;
+  }
+
+  delete bs;
+
+  return result;
+}
+
+vector<WorkerThread*> SetupBenchmarkBalanced() {
+  string test_path;
+  env->GetTestDirectory(&test_path);
+  test_path.append("/blob_store");
+
+  // config start
+  uint32_t block_size = 16*KB;
+  uint32_t file_size = 1*MB;
+  double read_write_ratio = 0.5;
+  uint64_t data_read_from = 16*KB;
+  uint64_t data_read_to = 32*KB;
+  int number_of_threads = 10;
+  uint64_t working_set_size = 5*MB;
+  timeout_sec = 5;
+  // config end
+
+  bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env);
+
+  vector <WorkerThread*> config;
+
+  for (int i = 0; i < number_of_threads; ++i) {
+    config.push_back(new WorkerThread(data_read_from,
+                                      data_read_to,
+                                      read_write_ratio,
+                                      working_set_size));
+  };
+
+  return config;
+}
+
+vector<WorkerThread*> SetupBenchmarkWriteHeavy() {
+  string test_path;
+  env->GetTestDirectory(&test_path);
+  test_path.append("/blob_store");
+
+  // config start
+  uint32_t block_size = 16*KB;
+  uint32_t file_size = 1*MB;
+  double read_write_ratio = 0.1;
+  uint64_t data_read_from = 16*KB;
+  uint64_t data_read_to = 32*KB;
+  int number_of_threads = 10;
+  uint64_t working_set_size = 5*MB;
+  timeout_sec = 5;
+  // config end
+
+  bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env);
+
+  vector <WorkerThread*> config;
+
+  for (int i = 0; i < number_of_threads; ++i) {
+    config.push_back(new WorkerThread(data_read_from,
+                                      data_read_to,
+                                      read_write_ratio,
+                                      working_set_size));
+  };
+
+  return config;
+}
+
+vector<WorkerThread*> SetupBenchmarkReadHeavy() {
+  string test_path;
+  env->GetTestDirectory(&test_path);
+  test_path.append("/blob_store");
+
+  // config start
+  uint32_t block_size = 16*KB;
+  uint32_t file_size = 1*MB;
+  double read_write_ratio = 0.9;
+  uint64_t data_read_from = 16*KB;
+  uint64_t data_read_to = 32*KB;
+  int number_of_threads = 10;
+  uint64_t working_set_size = 5*MB;
+  timeout_sec = 5;
+  // config end
+
+  bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env);
+
+  vector <WorkerThread*> config;
+
+  for (int i = 0; i < number_of_threads; ++i) {
+    config.push_back(new WorkerThread(data_read_from,
+                                      data_read_to,
+                                      read_write_ratio,
+                                      working_set_size));
+  };
+
+  return config;
+}
+}  // namespace
+
+int main(int argc, const char** argv) {
+  // Fixed seed so all three runs are reproducible.
+  srand(33);
+  env = Env::Default();
+
+  // Each scope sets up its own BlobStore; StartBenchmark() deletes it.
+  {
+    printf("--- Balanced read/write benchmark ---\n");
+    vector <WorkerThread*> config = SetupBenchmarkBalanced();
+    Result r = StartBenchmark(config);
+    r.print();
+  }
+  {
+    printf("--- Write heavy benchmark ---\n");
+    vector <WorkerThread*> config = SetupBenchmarkWriteHeavy();
+    Result r = StartBenchmark(config);
+    r.print();
+  }
+  {
+    printf("--- Read heavy benchmark ---\n");
+    vector <WorkerThread*> config = SetupBenchmarkReadHeavy();
+    Result r = StartBenchmark(config);
+    r.print();
+  }
+
+  return 0;
+}
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
new file mode 100644 (file)
index 0000000..3c93eca
--- /dev/null
@@ -0,0 +1,150 @@
+#! /usr/bin/env python
+import os
+import re
+import sys
+import time
+import random
+import getopt
+import logging
+import tempfile
+import subprocess
+import shutil
+
+# This script runs and kills db_stress multiple times. It checks consistency
+# in case of unsafe crashes in RocksDB.
+
+def main(argv):
+    """Blackbox crash test driver.
+
+    Repeatedly launches ./db_stress against one shared DB directory, lets
+    it run for `interval` seconds, then kills it, until `duration` seconds
+    have elapsed.  Each new run re-verifies the data left behind by the
+    previous (killed) run.  Any output on db_stress's stderr is treated as
+    a consistency failure and aborts this script with exit code 2.
+    """
+    try:
+        opts, args = getopt.getopt(argv, "hd:t:i:o:b:")
+    except getopt.GetoptError:
+        print("db_crashtest.py -d <duration_test> -t <#threads> "
+              "-i <interval for one run> -o <ops_per_thread> "
+              "-b <write_buffer_size>\n")
+        sys.exit(2)
+
+    # default values, will be overridden by cmdline args
+    interval = 120  # time for one db_stress instance to run
+    duration = 6000  # total time for this script to test db_stress
+    threads = 32
+    # since we will be killing anyway, use large value for ops_per_thread
+    ops_per_thread = 100000000
+    write_buf_size = 4 * 1024 * 1024
+
+    for opt, arg in opts:
+        if opt == '-h':
+            print("db_crashtest.py -d <duration_test>"
+                  " -t <#threads> -i <interval for one run>"
+                  " -o <ops_per_thread> -b <write_buffer_size>\n")
+            sys.exit()
+        elif opt == "-d":
+            duration = int(arg)
+        elif opt == "-t":
+            threads = int(arg)
+        elif opt == "-i":
+            interval = int(arg)
+        elif opt == "-o":
+            ops_per_thread = int(arg)
+        elif opt == "-b":
+            write_buf_size = int(arg)
+        else:
+            print("db_crashtest.py -d <duration_test>"
+                  " -t <#threads> -i <interval for one run>"
+                  " -o <ops_per_thread> -b <write_buffer_size>\n")
+            sys.exit(2)
+
+    exit_time = time.time() + duration
+
+    print("Running blackbox-crash-test with \ninterval_between_crash="
+          + str(interval) + "\ntotal-duration=" + str(duration)
+          + "\nthreads=" + str(threads) + "\nops_per_thread="
+          + str(ops_per_thread) + "\nwrite_buffer_size="
+          + str(write_buf_size) + "\n")
+
+    # One DB dir is shared by every run so each run verifies the data the
+    # previous (killed) run left behind; --destroy_db_initially=0 below.
+    dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_')
+
+    while time.time() < exit_time:
+        run_had_errors = False
+        killtime = time.time() + interval
+
+        # Collapse the multi-line template into a single shell command line.
+        # TODO(review): '\s+' should be a raw string (r'\s+'); it happens to
+        # work but raises DeprecationWarning on newer Pythons.
+        cmd = re.sub('\s+', ' ', """
+            ./db_stress
+            --test_batches_snapshots=1
+            --ops_per_thread=%s
+            --threads=%s
+            --write_buffer_size=%s
+            --destroy_db_initially=0
+            --reopen=20
+            --readpercent=45
+            --prefixpercent=5
+            --writepercent=35
+            --delpercent=5
+            --iterpercent=10
+            --db=%s
+            --max_key=100000000
+            --disable_seek_compaction=%s
+            --mmap_read=%s
+            --block_size=16384
+            --cache_size=1048576
+            --open_files=500000
+            --verify_checksum=1
+            --sync=0
+            --progress_reports=0
+            --disable_wal=0
+            --disable_data_sync=1
+            --target_file_size_base=2097152
+            --target_file_size_multiplier=2
+            --max_write_buffer_number=3
+            --max_background_compactions=20
+            --max_bytes_for_level_base=10485760
+            --filter_deletes=%s
+            --memtablerep=prefix_hash
+            --prefix_size=7
+            """ % (ops_per_thread,
+                   threads,
+                   write_buf_size,
+                   dbname,
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   random.randint(0, 1)))
+
+        child = subprocess.Popen([cmd],
+                                 stderr=subprocess.PIPE, shell=True)
+        print("Running db_stress with pid=%d: %s\n\n"
+              % (child.pid, cmd))
+
+        # Poll once a second until the interval expires or the child exits
+        # on its own (unexpected; logged and the kill is skipped).
+        stop_early = False
+        while time.time() < killtime:
+            if child.poll() is not None:
+                print("WARNING: db_stress ended before kill: exitcode=%d\n"
+                      % child.returncode)
+                stop_early = True
+                break
+            time.sleep(1)
+
+        if not stop_early:
+            # Re-check: the child may have exited between the last poll and
+            # now; only kill it if it is still alive.
+            if child.poll() is not None:
+                print("WARNING: db_stress ended before kill: exitcode=%d\n"
+                      % child.returncode)
+            else:
+                child.kill()
+                print("KILLED %d\n" % child.pid)
+                time.sleep(1)  # time to stabilize after a kill
+
+        # Drain stderr; any non-empty line is treated as a test failure.
+        # NOTE(review): on Python 3 readline() would return bytes, so the
+        # '' comparison assumes Python 2 -- confirm before porting.
+        while True:
+            line = child.stderr.readline().strip()
+            if line != '':
+                run_had_errors = True
+                print('***' + line + '^')
+            else:
+                break
+
+        if run_had_errors:
+            sys.exit(2)
+
+        time.sleep(1)  # time to stabilize before the next run
+
+    # we need to clean up after ourselves -- only do this on test success
+    shutil.rmtree(dbname, True)
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/tools/db_crashtest2.py b/tools/db_crashtest2.py
new file mode 100644 (file)
index 0000000..0a12b5a
--- /dev/null
@@ -0,0 +1,168 @@
+#! /usr/bin/env python
+import os
+import re
+import sys
+import time
+import random
+import getopt
+import logging
+import tempfile
+import subprocess
+import shutil
+
+# This python script runs db_stress multiple times. Some runs with
+# kill_random_test that causes rocksdb to crash at various points in code.
+
+def main(argv):
+    """Whitebox crash test driver (Python 2 syntax).
+
+    Cycles through three check modes per iteration until `duration`
+    seconds elapse: (0) run db_stress with --kill_random_test so it
+    crashes itself at random points in the code, (1) a normal run under
+    universal compaction, (2) a normal run.  A fresh DB dir is created
+    per run.  Exit codes and the words 'error'/'fail' in db_stress's
+    output decide pass/fail.
+    """
+    try:
+        opts, args = getopt.getopt(argv, "hd:t:k:o:b:")
+    except getopt.GetoptError:
+        # TODO(review): this prints the exception *class*, not the caught
+        # instance; use "except getopt.GetoptError as e: print str(e)".
+        print str(getopt.GetoptError)
+        print "db_crashtest2.py -d <duration_test> -t <#threads> " \
+              "-k <kills with prob 1/k> -o <ops_per_thread> "\
+              "-b <write_buffer_size>\n"
+        sys.exit(2)
+
+    # default values, will be overridden by cmdline args
+    kill_random_test = 97  # kill with probability 1/97 by default
+    duration = 10000  # total time for this script to test db_stress
+    threads = 32
+    ops_per_thread = 200000
+    write_buf_size = 4 * 1024 * 1024
+
+    for opt, arg in opts:
+        if opt == '-h':
+            print "db_crashtest2.py -d <duration_test> -t <#threads> " \
+                  "-k <kills with prob 1/k> -o <ops_per_thread> " \
+                  "-b <write_buffer_size>\n"
+            sys.exit()
+        elif opt == "-d":
+            duration = int(arg)
+        elif opt == "-t":
+            threads = int(arg)
+        elif opt == "-k":
+            kill_random_test = int(arg)
+        elif opt == "-o":
+            ops_per_thread = int(arg)
+        elif opt == "-b":
+            write_buf_size = int(arg)
+        else:
+            print "unrecognized option " + str(opt) + "\n"
+            print "db_crashtest2.py -d <duration_test> -t <#threads> " \
+                  "-k <kills with prob 1/k> -o <ops_per_thread> " \
+                  "-b <write_buffer_size>\n"
+            sys.exit(2)
+
+    exit_time = time.time() + duration
+
+    print "Running whitebox-crash-test with \ntotal-duration=" + str(duration) \
+          + "\nthreads=" + str(threads) + "\nops_per_thread=" \
+          + str(ops_per_thread) + "\nwrite_buffer_size=" \
+          + str(write_buf_size) + "\n"
+
+    total_check_mode = 3
+    check_mode = 0
+
+    while time.time() < exit_time:
+        killoption = ""
+        if check_mode == 0:
+            # run with kill_random_test
+            killoption = " --kill_random_test=" + str(kill_random_test)
+            # use large ops per thread since we will kill it anyway
+            additional_opts = "--ops_per_thread=" + \
+                              str(100 * ops_per_thread) + killoption
+        elif check_mode == 1:
+            # normal run with universal compaction mode
+            additional_opts = "--ops_per_thread=" + str(ops_per_thread) + \
+                              " --compaction_style=1"
+        else:
+            # normal run
+            additional_opts = "--ops_per_thread=" + str(ops_per_thread)
+
+        dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_')
+        # TODO(review): '\s+' should be a raw string (r'\s+').
+        cmd = re.sub('\s+', ' ', """
+            ./db_stress
+            --test_batches_snapshots=%s
+            --threads=%s
+            --write_buffer_size=%s
+            --destroy_db_initially=0
+            --reopen=20
+            --readpercent=45
+            --prefixpercent=5
+            --writepercent=35
+            --delpercent=5
+            --iterpercent=10
+            --db=%s
+            --max_key=100000000
+            --disable_seek_compaction=%s
+            --mmap_read=%s
+            --block_size=16384
+            --cache_size=1048576
+            --open_files=500000
+            --verify_checksum=1
+            --sync=0
+            --progress_reports=0
+            --disable_wal=0
+            --disable_data_sync=1
+            --target_file_size_base=2097152
+            --target_file_size_multiplier=2
+            --max_write_buffer_number=3
+            --max_background_compactions=20
+            --max_bytes_for_level_base=10485760
+            --filter_deletes=%s
+            --memtablerep=prefix_hash
+            --prefix_size=7
+            %s
+            """ % (random.randint(0, 1),
+                   threads,
+                   write_buf_size,
+                   dbname,
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   additional_opts))
+
+        print "Running:" + cmd + "\n"
+
+        popen = subprocess.Popen([cmd], stdout=subprocess.PIPE,
+                                 stderr=subprocess.STDOUT,
+                                 shell=True)
+        stdoutdata, stderrdata = popen.communicate()
+        retncode = popen.returncode
+        msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
+               check_mode, killoption, retncode))
+        print msg
+        print stdoutdata
+
+        # A self-kill run should die by signal (negative returncode);
+        # a normal run should exit cleanly (zero).
+        expected = False
+        if (killoption == '') and (retncode == 0):
+            # we expect zero retncode if no kill option
+            expected = True
+        elif killoption != '' and retncode < 0:
+            # we expect negative retncode if kill option was given
+            expected = True
+
+        if not expected:
+            print "TEST FAILED. See kill option and exit code above!!!\n"
+            sys.exit(1)
+
+        # Scan the (case-folded) output for error markers, ignoring the
+        # benign "got errors 0 times" progress line.
+        stdoutdata = stdoutdata.lower()
+        errorcount = (stdoutdata.count('error') -
+                      stdoutdata.count('got errors 0 times'))
+        print "#times error occurred in output is " + str(errorcount) + "\n"
+
+        if (errorcount > 0):
+            print "TEST FAILED. Output has 'error'!!!\n"
+            sys.exit(2)
+        if (stdoutdata.find('fail') >= 0):
+            print "TEST FAILED. Output has 'fail'!!!\n"
+            sys.exit(2)
+        # we need to clean up after ourselves -- only do this on test success
+        shutil.rmtree(dbname, True)
+
+        # Rotate to the next check mode for the following iteration.
+        check_mode = (check_mode + 1) % total_check_mode
+
+        time.sleep(1)  # time to stabilize after a kill
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc
new file mode 100644 (file)
index 0000000..27cb6d5
--- /dev/null
@@ -0,0 +1,134 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <cstdio>
+
+#include <gflags/gflags.h>
+
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/types.h"
+#include "port/atomic_pointer.h"
+#include "util/testutil.h"
+
+
+// Run a thread to perform Put's.
+// Another thread uses GetUpdatesSince API to keep getting the updates.
+// options :
+// --num_inserts = the num of inserts the first thread should perform.
+// --wal_ttl = the wal ttl for the run.
+
+using namespace rocksdb;
+
+// Arguments passed to DataPumpThreadBody: how many records to Put and the
+// target DB handle.
+struct DataPumpThread {
+  size_t no_records;
+  DB* db; // Assumption DB is Open'ed already.
+};
+
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+static void DataPumpThreadBody(void* arg) {
+  DataPumpThread* t = reinterpret_cast<DataPumpThread*>(arg);
+  DB* db = t->db;
+  Random rnd(301);
+  size_t i = 0;
+  while(i++ < t->no_records) {
+    if(!db->Put(WriteOptions(), Slice(RandomString(&rnd, 500)),
+                Slice(RandomString(&rnd, 500))).ok()) {
+      fprintf(stderr, "Error in put\n");
+      exit(1);
+    }
+  }
+}
+
+// Shared state for ReplicationThreadBody.
+struct ReplicationThread {
+  port::AtomicPointer stop;   // used as an atomic flag: non-null => keep running
+  DB* db;
+  volatile size_t no_read;    // number of updates consumed so far
+};
+
+// Thread body: tail the DB's write-ahead log via GetUpdatesSince(),
+// checking that batch sequence numbers arrive consecutively; exits the
+// process on any gap. Runs until the `stop` pointer is cleared to null.
+static void ReplicationThreadBody(void* arg) {
+  ReplicationThread* t = reinterpret_cast<ReplicationThread*>(arg);
+  DB* db = t->db;
+  unique_ptr<TransactionLogIterator> iter;
+  SequenceNumber currentSeqNum = 1;
+  while (t->stop.Acquire_Load() != nullptr) {
+    iter.reset();
+    Status s;  // NOTE(review): unused; the status below is checked inline.
+    // Spin until an iterator is available (e.g. no writes have landed yet),
+    // bailing out if we are asked to stop in the meantime.
+    while(!db->GetUpdatesSince(currentSeqNum, &iter).ok()) {
+      if (t->stop.Acquire_Load() == nullptr) {
+        return;
+      }
+    }
+    fprintf(stderr, "Refreshing iterator\n");
+    for(;iter->Valid(); iter->Next(), t->no_read++, currentSeqNum++) {
+      BatchResult res = iter->GetBatch();
+      // Any mismatch means an update was skipped or replayed out of order.
+      if (res.sequence != currentSeqNum) {
+        fprintf(stderr,
+                "Missed a seq no. b/w %ld and %ld\n",
+                (long)currentSeqNum,
+                (long)res.sequence);
+        exit(1);
+      }
+    }
+  }
+}
+
+DEFINE_uint64(num_inserts, 1000, "the num of inserts the first thread should"
+              " perform.");
+DEFINE_uint64(wal_ttl_seconds, 1000, "the wal ttl for the run(in seconds)");
+DEFINE_uint64(wal_size_limit_MB, 10, "the wal size limit for the run"
+              "(in MB)");
+
+// Entry point: opens a fresh DB with the requested WAL TTL/size limits,
+// starts one writer thread (DataPumpThreadBody) and one log-tailing thread
+// (ReplicationThreadBody), then waits until the reader has seen every
+// inserted record.
+int main(int argc, const char** argv) {
+  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+    " --num_inserts=<num_inserts> --wal_ttl_seconds=<WAL_ttl_seconds>" +
+    " --wal_size_limit_MB=<WAL_size_limit_MB>");
+  google::ParseCommandLineFlags(&argc, const_cast<char***>(&argv), true);
+
+  Env* env = Env::Default();
+  std::string default_db_path;
+  env->GetTestDirectory(&default_db_path);
+  default_db_path += "db_repl_stress";
+  Options options;
+  options.create_if_missing = true;
+  options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
+  options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
+  DB* db;
+  // Start from a clean slate each run.
+  DestroyDB(default_db_path, options);
+
+  Status s = DB::Open(options, default_db_path, &db);
+
+  if (!s.ok()) {
+    fprintf(stderr, "Could not open DB due to %s\n", s.ToString().c_str());
+    exit(1);
+  }
+
+  DataPumpThread dataPump;
+  dataPump.no_records = FLAGS_num_inserts;
+  dataPump.db = db;
+  env->StartThread(DataPumpThreadBody, &dataPump);
+
+  ReplicationThread replThread;
+  replThread.db = db;
+  replThread.no_read = 0;
+  replThread.stop.Release_Store(env); // store something to make it non-null.
+
+  env->StartThread(ReplicationThreadBody, &replThread);
+  // Busy-wait until the replication thread has caught up with all inserts.
+  while(replThread.no_read < FLAGS_num_inserts);
+  replThread.stop.Release_Store(nullptr);
+  if (replThread.no_read < dataPump.no_records) {
+    // no. read should be => than inserted.
+    fprintf(stderr, "No. of Record's written and read not same\nRead : %zu"
+            " Written : %zu\n", replThread.no_read, dataPump.no_records);
+    exit(1);
+  }
+  fprintf(stderr, "Successful!\n");
+  exit(0);
+}
diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc
new file mode 100644 (file)
index 0000000..e970f5e
--- /dev/null
@@ -0,0 +1,203 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <cstdio>
+#include <vector>
+#include <memory>
+
+#include "include/rocksdb/db.h"
+#include "include/rocksdb/options.h"
+#include "include/rocksdb/env.h"
+#include "include/rocksdb/slice.h"
+#include "include/rocksdb/status.h"
+#include "include/rocksdb/comparator.h"
+#include "include/rocksdb/table.h"
+#include "include/rocksdb/slice_transform.h"
+
+namespace rocksdb {
+
+// Base class for round-trip sanity tests. Each subclass supplies a name and
+// a set of Options; Create() writes one million key/value pairs into a
+// fresh DB at path_ + Name(), and Verify() reopens that DB with the same
+// options and checks every pair is present and intact.
+class SanityTest {
+ public:
+  explicit SanityTest(const std::string& path)
+      : env_(Env::Default()), path_(path) {
+    env_->CreateDirIfMissing(path);  // status ignored; DB::Open will fail loudly
+  }
+  virtual ~SanityTest() {}
+
+  virtual std::string Name() const = 0;
+  virtual Options GetOptions() const = 0;
+
+  // Destroys any previous DB at this test's path, then writes keyN -> valueN
+  // for N in [0, 1000000). Returns the first non-OK status encountered.
+  Status Create() {
+    Options options = GetOptions();
+    options.create_if_missing = true;
+    std::string dbname = path_ + Name();
+    DestroyDB(dbname, options);
+    DB* db;
+    Status s = DB::Open(options, dbname, &db);
+    std::unique_ptr<DB> db_guard(db);  // closes the DB on every return path
+    if (!s.ok()) {
+      return s;
+    }
+    for (int i = 0; i < 1000000; ++i) {
+      std::string k = "key" + std::to_string(i);
+      std::string v = "value" + std::to_string(i);
+      s = db->Put(WriteOptions(), Slice(k), Slice(v));
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    return Status::OK();
+  }
+  // Reopens the DB written by Create() and checks all 1M pairs round-trip.
+  Status Verify() {
+    DB* db;
+    std::string dbname = path_ + Name();
+    Status s = DB::Open(GetOptions(), dbname, &db);
+    std::unique_ptr<DB> db_guard(db);  // closes the DB on every return path
+    if (!s.ok()) {
+      return s;
+    }
+    for (int i = 0; i < 1000000; ++i) {
+      std::string k = "key" + std::to_string(i);
+      std::string v = "value" + std::to_string(i);
+      std::string result;
+      s = db->Get(ReadOptions(), Slice(k), &result);
+      if (!s.ok()) {
+        return s;
+      }
+      if (result != v) {
+        return Status::Corruption("Unexpected value for key " + k);
+      }
+    }
+    return Status::OK();
+  }
+
+ private:
+  Env* env_;
+  std::string const path_;
+};
+
+class SanityTestBasic : public SanityTest {
+ public:
+  explicit SanityTestBasic(const std::string& path) : SanityTest(path) {}
+  virtual Options GetOptions() const {
+    Options options;
+    options.create_if_missing = true;
+    return options;
+  }
+  virtual std::string Name() const { return "Basic"; }
+};
+
+class SanityTestSpecialComparator : public SanityTest {
+ public:
+  explicit SanityTestSpecialComparator(const std::string& path)
+      : SanityTest(path) {
+    options_.comparator = new NewComparator();
+  }
+  ~SanityTestSpecialComparator() { delete options_.comparator; }
+  virtual Options GetOptions() const { return options_; }
+  virtual std::string Name() const { return "SpecialComparator"; }
+
+ private:
+  class NewComparator : public Comparator {
+   public:
+    virtual const char* Name() const { return "rocksdb.NewComparator"; }
+    virtual int Compare(const Slice& a, const Slice& b) const {
+      return BytewiseComparator()->Compare(a, b);
+    }
+    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+      BytewiseComparator()->FindShortestSeparator(s, l);
+    }
+    virtual void FindShortSuccessor(std::string* key) const {
+      BytewiseComparator()->FindShortSuccessor(key);
+    }
+  };
+  Options options_;
+};
+
+class SanityTestZlibCompression : public SanityTest {
+ public:
+  explicit SanityTestZlibCompression(const std::string& path)
+      : SanityTest(path) {
+    options_.compression = kZlibCompression;
+  }
+  virtual Options GetOptions() const { return options_; }
+  virtual std::string Name() const { return "ZlibCompression"; }
+
+ private:
+  Options options_;
+};
+
+class SanityTestPlainTableFactory : public SanityTest {
+ public:
+  explicit SanityTestPlainTableFactory(const std::string& path)
+      : SanityTest(path) {
+    options_.table_factory.reset(NewPlainTableFactory());
+    options_.prefix_extractor.reset(NewFixedPrefixTransform(2));
+    options_.allow_mmap_reads = true;
+  }
+  ~SanityTestPlainTableFactory() {}
+  virtual Options GetOptions() const { return options_; }
+  virtual std::string Name() const { return "PlainTable"; }
+
+ private:
+  Options options_;
+};
+
+namespace {
+bool RunSanityTests(const std::string& command, const std::string& path) {
+  std::vector<SanityTest*> sanity_tests = {
+      new SanityTestBasic(path),
+      new SanityTestSpecialComparator(path),
+      new SanityTestZlibCompression(path),
+      new SanityTestPlainTableFactory(path)};
+
+  if (command == "create") {
+    fprintf(stderr, "Creating...\n");
+  } else {
+    fprintf(stderr, "Verifying...\n");
+  }
+  for (auto sanity_test : sanity_tests) {
+    Status s;
+    fprintf(stderr, "%s -- ", sanity_test->Name().c_str());
+    if (command == "create") {
+      s = sanity_test->Create();
+    } else {
+      assert(command == "verify");
+      s = sanity_test->Verify();
+    }
+    fprintf(stderr, "%s\n", s.ToString().c_str());
+    if (!s.ok()) {
+      fprintf(stderr, "FAIL\n");
+      return false;
+    }
+
+    delete sanity_test;
+  }
+  return true;
+}
+}  // namespace
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  std::string path, command;
+  bool ok = (argc == 3);
+  if (ok) {
+    path = std::string(argv[1]);
+    command = std::string(argv[2]);
+    ok = (command == "create" || command == "verify");
+  }
+  if (!ok) {
+    fprintf(stderr, "Usage: %s <path> [create|verify] \n", argv[0]);
+    exit(1);
+  }
+  if (path.back() != '/') {
+    path += "/";
+  }
+
+  bool sanity_ok = rocksdb::RunSanityTests(command, path);
+
+  return sanity_ok ? 0 : 1;
+}
diff --git a/tools/db_stress.cc b/tools/db_stress.cc
new file mode 100644 (file)
index 0000000..c774171
--- /dev/null
@@ -0,0 +1,1732 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The test uses an array to compare against values written to the database.
+// Keys written to the array are in 1:1 correspondence to the actual values in
+// the database according to the formula in the function GenerateValue.
+
+// Space is reserved in the array from 0 to FLAGS_max_key and values are
+// randomly written/deleted/read from those positions. During verification we
+// compare all the positions in the array. To shorten/elongate the running
+// time, you could change the settings: FLAGS_max_key, FLAGS_ops_per_thread,
+// (sometimes also FLAGS_threads).
+//
+// NOTE that if FLAGS_test_batches_snapshots is set, the test will have
+// different behavior. See comment of the flag for details.
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <gflags/gflags.h>
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/cache.h"
+#include "utilities/db_ttl.h"
+#include "rocksdb/env.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/histogram.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/testutil.h"
+#include "util/logging.h"
+#include "hdfs/env_hdfs.h"
+#include "utilities/merge_operators.h"
+
+static const long KB = 1024;
+
+
// gflags validator: accept only values that fit in an unsigned 32-bit
// integer; print a diagnostic to stderr and return false otherwise.
static bool ValidateUint32Range(const char* flagname, uint64_t value) {
  const uint64_t max_u32 = std::numeric_limits<uint32_t>::max();
  if (value <= max_u32) {
    return true;
  }
  fprintf(stderr,
          "Invalid value for --%s: %lu, overflow\n",
          flagname,
          (unsigned long)value);
  return false;
}
+
+DEFINE_uint64(seed, 2341234, "Seed for PRNG");
+static const bool FLAGS_seed_dummy __attribute__((unused)) =
+    google::RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range);
+
+DEFINE_int64(max_key, 1 * KB* KB,
+             "Max number of key/values to place in database");
+
+DEFINE_int32(column_families, 10, "Number of column families");
+
+DEFINE_bool(test_batches_snapshots, false,
+            "If set, the test uses MultiGet(), Multiut() and MultiDelete()"
+            " which read/write/delete multiple keys in a batch. In this mode,"
+            " we do not verify db content by comparing the content with the "
+            "pre-allocated array. Instead, we do partial verification inside"
+            " MultiGet() by checking various values in a batch. Benefit of"
+            " this mode:\n"
+            "\t(a) No need to acquire mutexes during writes (less cache "
+            "flushes in multi-core leading to speed up)\n"
+            "\t(b) No long validation at the end (more speed up)\n"
+            "\t(c) Test snapshot and atomicity of batch writes");
+
+DEFINE_int32(threads, 32, "Number of concurrent threads to run.");
+
+DEFINE_int32(ttl, -1,
+             "Opens the db with this ttl value if this is not -1. "
+             "Carefully specify a large value such that verifications on "
+             "deleted values don't fail");
+
+DEFINE_int32(value_size_mult, 8,
+             "Size of value will be this number times rand_int(1,3) bytes");
+
+DEFINE_bool(verify_before_write, false, "Verify before write");
+
+DEFINE_bool(histogram, false, "Print histogram of operation timings");
+
+DEFINE_bool(destroy_db_initially, true,
+            "Destroys the database dir before start if this is true");
+
+DEFINE_bool(verbose, false, "Verbose");
+
+DEFINE_bool(progress_reports, true,
+            "If true, db_stress will report number of finished operations");
+
+DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size,
+             "Number of bytes to buffer in memtable before compacting");
+
+DEFINE_int32(max_write_buffer_number,
+             rocksdb::Options().max_write_buffer_number,
+             "The number of in-memory memtables. "
+             "Each memtable is of size FLAGS_write_buffer_size.");
+
+DEFINE_int32(min_write_buffer_number_to_merge,
+             rocksdb::Options().min_write_buffer_number_to_merge,
+             "The minimum number of write buffers that will be merged together "
+             "before writing to storage. This is cheap because it is an "
+             "in-memory merge. If this feature is not enabled, then all these "
+             "write buffers are flushed to L0 as separate files and this "
+             "increases read amplification because a get request has to check "
+             "in all of these files. Also, an in-memory merge may result in "
+             "writing less data to storage if there are duplicate records in"
+             " each of these individual write buffers.");
+
+DEFINE_int32(open_files, rocksdb::Options().max_open_files,
+             "Maximum number of files to keep open at the same time "
+             "(use default if == 0)");
+
+DEFINE_int64(compressed_cache_size, -1,
+             "Number of bytes to use as a cache of compressed data."
+             " Negative means use default settings.");
+
+DEFINE_int32(compaction_style, rocksdb::Options().compaction_style, "");
+
+DEFINE_int32(level0_file_num_compaction_trigger,
+             rocksdb::Options().level0_file_num_compaction_trigger,
+             "Level0 compaction start trigger");
+
+DEFINE_int32(level0_slowdown_writes_trigger,
+             rocksdb::Options().level0_slowdown_writes_trigger,
+             "Number of files in level-0 that will slow down writes");
+
+DEFINE_int32(level0_stop_writes_trigger,
+             rocksdb::Options().level0_stop_writes_trigger,
+             "Number of files in level-0 that will trigger put stop.");
+
+DEFINE_int32(block_size, rocksdb::Options().block_size,
+             "Number of bytes in a block.");
+
+DEFINE_int32(max_background_compactions,
+             rocksdb::Options().max_background_compactions,
+             "The maximum number of concurrent background compactions "
+             "that can occur in parallel.");
+
+DEFINE_int32(max_background_flushes, rocksdb::Options().max_background_flushes,
+             "The maximum number of concurrent background flushes "
+             "that can occur in parallel.");
+
+DEFINE_int32(universal_size_ratio, 0, "The ratio of file sizes that trigger"
+             " compaction in universal style");
+
+DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files to "
+             "compact in universal style compaction");
+
+DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
+             " in universal style compaction");
+
+DEFINE_int32(universal_max_size_amplification_percent, 0,
+             "The max size amplification for universal style compaction");
+
+DEFINE_int32(clear_column_family_one_in, 1000000,
+             "With a chance of 1/N, delete a column family and then recreate "
+             "it again. If N == 0, never drop/create column families. "
+             "When test_batches_snapshots is true, this flag has no effect");
+
+DEFINE_int64(cache_size, 2 * KB * KB * KB,
+             "Number of bytes to use as a cache of uncompressed data.");
+
// gflags validator: accept zero and positive values; print a diagnostic to
// stderr and return false for negative ones.
static bool ValidateInt32Positive(const char* flagname, int32_t value) {
  if (value >= 0) {
    return true;
  }
  fprintf(stderr, "Invalid value for --%s: %d, must be >=0\n",
          flagname, value);
  return false;
}
+DEFINE_int32(reopen, 10, "Number of times database reopens");
+static const bool FLAGS_reopen_dummy __attribute__((unused)) =
+    google::RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive);
+
+DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. "
+             "Negative means use default settings.");
+
+DEFINE_string(db, "", "Use the db with the following name.");
+
+DEFINE_bool(verify_checksum, false,
+            "Verify checksum for every block read from storage");
+
+DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads,
+            "Allow reads to occur via mmap-ing files");
+
+// Database statistics
+static std::shared_ptr<rocksdb::Statistics> dbstats;
+DEFINE_bool(statistics, false, "Create database statistics");
+
+DEFINE_bool(sync, false, "Sync all writes to disk");
+
+DEFINE_bool(disable_data_sync, false,
+            "If true, do not wait until data is synced to disk.");
+
+DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
+
+DEFINE_int32(kill_random_test, 0,
+             "If non-zero, kill at various points in source code with "
+             "probability 1/this");
+static const bool FLAGS_kill_random_test_dummy __attribute__((unused)) =
+    google::RegisterFlagValidator(&FLAGS_kill_random_test,
+                                  &ValidateInt32Positive);
+extern int rocksdb_kill_odds;
+
+DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
+
+DEFINE_int32(target_file_size_base, 64 * KB,
+             "Target level-1 file size for compaction");
+
+DEFINE_int32(target_file_size_multiplier, 1,
+             "A multiplier to compute targe level-N file size (N >= 2)");
+
+DEFINE_uint64(max_bytes_for_level_base, 256 * KB, "Max bytes for level-1");
+
+DEFINE_int32(max_bytes_for_level_multiplier, 2,
+             "A multiplier to compute max bytes for level-N (N >= 2)");
+
// gflags validator: accept values in the inclusive percentage range
// [0, 100]; print a diagnostic to stderr and return false otherwise.
static bool ValidateInt32Percent(const char* flagname, int32_t value) {
  if (0 <= value && value <= 100) {
    return true;
  }
  fprintf(stderr, "Invalid value for --%s: %d, 0<= pct <=100 \n",
          flagname, value);
  return false;
}
+DEFINE_int32(readpercent, 10,
+             "Ratio of reads to total workload (expressed as a percentage)");
+static const bool FLAGS_readpercent_dummy __attribute__((unused)) =
+    google::RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent);
+
+DEFINE_int32(prefixpercent, 20,
+             "Ratio of prefix iterators to total workload (expressed as a"
+             " percentage)");
+static const bool FLAGS_prefixpercent_dummy __attribute__((unused)) =
+    google::RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent);
+
+DEFINE_int32(writepercent, 45,
+             " Ratio of deletes to total workload (expressed as a percentage)");
+static const bool FLAGS_writepercent_dummy __attribute__((unused)) =
+    google::RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent);
+
+DEFINE_int32(delpercent, 15,
+             "Ratio of deletes to total workload (expressed as a percentage)");
+static const bool FLAGS_delpercent_dummy __attribute__((unused)) =
+    google::RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent);
+
+DEFINE_int32(iterpercent, 10, "Ratio of iterations to total workload"
+             " (expressed as a percentage)");
+static const bool FLAGS_iterpercent_dummy __attribute__((unused)) =
+    google::RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent);
+
+DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run");
+static const bool FLAGS_num_iterations_dummy __attribute__((unused)) =
+    google::RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range);
+
+DEFINE_bool(disable_seek_compaction, false,
+            "Option to disable compation triggered by read.");
+
+namespace {
+enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
+  assert(ctype);
+
+  if (!strcasecmp(ctype, "none"))
+    return rocksdb::kNoCompression;
+  else if (!strcasecmp(ctype, "snappy"))
+    return rocksdb::kSnappyCompression;
+  else if (!strcasecmp(ctype, "zlib"))
+    return rocksdb::kZlibCompression;
+  else if (!strcasecmp(ctype, "bzip2"))
+    return rocksdb::kBZip2Compression;
+  else if (!strcasecmp(ctype, "lz4"))
+    return rocksdb::kLZ4Compression;
+  else if (!strcasecmp(ctype, "lz4hc"))
+    return rocksdb::kLZ4HCCompression;
+
+  fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
+  return rocksdb::kSnappyCompression; //default value
+}
+}  // namespace
+
+DEFINE_string(compression_type, "snappy",
+              "Algorithm to use to compress the database");
+static enum rocksdb::CompressionType FLAGS_compression_type_e =
+    rocksdb::kSnappyCompression;
+
+DEFINE_string(hdfs, "", "Name of hdfs environment");
+// posix or hdfs environment
+static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();
+
+DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread.");
+static const bool FLAGS_ops_per_thread_dummy __attribute__((unused)) =
+    google::RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range);
+
+DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock");
+static const bool FLAGS_log2_keys_per_lock_dummy __attribute__((unused)) =
+    google::RegisterFlagValidator(&FLAGS_log2_keys_per_lock,
+                                  &ValidateUint32Range);
+
+DEFINE_int32(purge_redundant_percent, 50,
+             "Percentage of times we want to purge redundant keys in memory "
+             "before flushing");
+static const bool FLAGS_purge_redundant_percent_dummy __attribute__((unused)) =
+    google::RegisterFlagValidator(&FLAGS_purge_redundant_percent,
+                                  &ValidateInt32Percent);
+
+DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop"
+            " the delete if key not present");
+
+enum RepFactory {
+  kSkipList,
+  kHashSkipList,
+  kVectorRep
+};
+
+namespace {
+enum RepFactory StringToRepFactory(const char* ctype) {
+  assert(ctype);
+
+  if (!strcasecmp(ctype, "skip_list"))
+    return kSkipList;
+  else if (!strcasecmp(ctype, "prefix_hash"))
+    return kHashSkipList;
+  else if (!strcasecmp(ctype, "vector"))
+    return kVectorRep;
+
+  fprintf(stdout, "Cannot parse memreptable %s\n", ctype);
+  return kSkipList;
+}
+}  // namespace
+
+static enum RepFactory FLAGS_rep_factory;
+DEFINE_string(memtablerep, "prefix_hash", "");
+
// Gflags validator for --prefix_size: the prefix must fit in an 8-byte key.
static bool ValidatePrefixSize(const char* flagname, int32_t value) {
  const bool ok = (value >= 0 && value <= 8);
  if (!ok) {
    fprintf(stderr, "Invalid value for --%s: %d. 0 <= PrefixSize <= 8\n",
            flagname, value);
  }
  return ok;
}
+DEFINE_int32(prefix_size, 7, "Control the prefix size for HashSkipListRep");
+static const bool FLAGS_prefix_size_dummy =
+  google::RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
+
+DEFINE_bool(use_merge, false, "On true, replaces all writes with a Merge "
+            "that behaves like a Put");
+
+
+namespace rocksdb {
+
+// convert long to a big-endian slice key
+static std::string Key(long val) {
+  std::string little_endian_key;
+  std::string big_endian_key;
+  PutFixed64(&little_endian_key, val);
+  assert(little_endian_key.size() == sizeof(val));
+  big_endian_key.resize(sizeof(val));
+  for (int i=0; i<(int)sizeof(val); i++) {
+    big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i];
+  }
+  return big_endian_key;
+}
+
// Renders str as "0x"-prefixed uppercase hex, two digits per byte.
static std::string StringToHex(const std::string& str) {
  static const char kHexDigits[] = "0123456789ABCDEF";
  std::string result = "0x";
  for (unsigned char c : str) {
    result.push_back(kHexDigits[c >> 4]);
    result.push_back(kHexDigits[c & 0x0f]);
  }
  return result;
}
+
+
+class StressTest;
+namespace {
+
+class Stats {
+ private:
+  double start_;
+  double finish_;
+  double seconds_;
+  long done_;
+  long gets_;
+  long prefixes_;
+  long writes_;
+  long deletes_;
+  long iterator_size_sums_;
+  long founds_;
+  long iterations_;
+  long errors_;
+  int next_report_;
+  size_t bytes_;
+  double last_op_finish_;
+  HistogramImpl hist_;
+
+ public:
+  Stats() { }
+
+  void Start() {
+    next_report_ = 100;
+    hist_.Clear();
+    done_ = 0;
+    gets_ = 0;
+    prefixes_ = 0;
+    writes_ = 0;
+    deletes_ = 0;
+    iterator_size_sums_ = 0;
+    founds_ = 0;
+    iterations_ = 0;
+    errors_ = 0;
+    bytes_ = 0;
+    seconds_ = 0;
+    start_ = FLAGS_env->NowMicros();
+    last_op_finish_ = start_;
+    finish_ = start_;
+  }
+
+  void Merge(const Stats& other) {
+    hist_.Merge(other.hist_);
+    done_ += other.done_;
+    gets_ += other.gets_;
+    prefixes_ += other.prefixes_;
+    writes_ += other.writes_;
+    deletes_ += other.deletes_;
+    iterator_size_sums_ += other.iterator_size_sums_;
+    founds_ += other.founds_;
+    iterations_ += other.iterations_;
+    errors_ += other.errors_;
+    bytes_ += other.bytes_;
+    seconds_ += other.seconds_;
+    if (other.start_ < start_) start_ = other.start_;
+    if (other.finish_ > finish_) finish_ = other.finish_;
+  }
+
+  void Stop() {
+    finish_ = FLAGS_env->NowMicros();
+    seconds_ = (finish_ - start_) * 1e-6;
+  }
+
+  void FinishedSingleOp() {
+    if (FLAGS_histogram) {
+      double now = FLAGS_env->NowMicros();
+      double micros = now - last_op_finish_;
+      hist_.Add(micros);
+      if (micros > 20000) {
+        fprintf(stdout, "long op: %.1f micros%30s\r", micros, "");
+      }
+      last_op_finish_ = now;
+    }
+
+      done_++;
+    if (FLAGS_progress_reports) {
+      if (done_ >= next_report_) {
+        if      (next_report_ < 1000)   next_report_ += 100;
+        else if (next_report_ < 5000)   next_report_ += 500;
+        else if (next_report_ < 10000)  next_report_ += 1000;
+        else if (next_report_ < 50000)  next_report_ += 5000;
+        else if (next_report_ < 100000) next_report_ += 10000;
+        else if (next_report_ < 500000) next_report_ += 50000;
+        else                            next_report_ += 100000;
+        fprintf(stdout, "... finished %ld ops%30s\r", done_, "");
+      }
+    }
+  }
+
+  void AddBytesForWrites(int nwrites, size_t nbytes) {
+    writes_ += nwrites;
+    bytes_ += nbytes;
+  }
+
+  void AddGets(int ngets, int nfounds) {
+    founds_ += nfounds;
+    gets_ += ngets;
+  }
+
+  void AddPrefixes(int nprefixes, int count) {
+    prefixes_ += nprefixes;
+    iterator_size_sums_ += count;
+  }
+
+  void AddIterations(int n) {
+    iterations_ += n;
+  }
+
+  void AddDeletes(int n) {
+    deletes_ += n;
+  }
+
+  void AddErrors(int n) {
+    errors_ += n;
+  }
+
+  void Report(const char* name) {
+    std::string extra;
+    if (bytes_ < 1 || done_ < 1) {
+      fprintf(stderr, "No writes or ops?\n");
+      return;
+    }
+
+    double elapsed = (finish_ - start_) * 1e-6;
+    double bytes_mb = bytes_ / 1048576.0;
+    double rate = bytes_mb / elapsed;
+    double throughput = (double)done_/elapsed;
+
+    fprintf(stdout, "%-12s: ", name);
+    fprintf(stdout, "%.3f micros/op %ld ops/sec\n",
+            seconds_ * 1e6 / done_, (long)throughput);
+    fprintf(stdout, "%-12s: Wrote %.2f MB (%.2f MB/sec) (%ld%% of %ld ops)\n",
+            "", bytes_mb, rate, (100*writes_)/done_, done_);
+    fprintf(stdout, "%-12s: Wrote %ld times\n", "", writes_);
+    fprintf(stdout, "%-12s: Deleted %ld times\n", "", deletes_);
+    fprintf(stdout, "%-12s: %ld read and %ld found the key\n", "",
+            gets_, founds_);
+    fprintf(stdout, "%-12s: Prefix scanned %ld times\n", "", prefixes_);
+    fprintf(stdout, "%-12s: Iterator size sum is %ld\n", "",
+            iterator_size_sums_);
+    fprintf(stdout, "%-12s: Iterated %ld times\n", "", iterations_);
+    fprintf(stdout, "%-12s: Got errors %ld times\n", "", errors_);
+
+    if (FLAGS_histogram) {
+      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
+    }
+    fflush(stdout);
+  }
+};
+
// State shared by all concurrent executions of the same benchmark.
// Holds the expected value of every key (values_) plus per-key-group
// mutexes (key_locks_) so writers can update the DB and the expected
// state together, and the mutex/condvar rendezvous used to phase the
// worker threads (see StressTest::Run and ThreadBody).
class SharedState {
 public:
  // Stored in values_ to mean "key not present".
  static const uint32_t SENTINEL;

  explicit SharedState(StressTest* stress_test)
      : cv_(&mu_),
        seed_(FLAGS_seed),
        max_key_(FLAGS_max_key),
        log2_keys_per_lock_(FLAGS_log2_keys_per_lock),
        num_threads_(FLAGS_threads),
        num_initialized_(0),
        num_populated_(0),
        vote_reopen_(0),
        num_done_(0),
        start_(false),
        start_verify_(false),
        stress_test_(stress_test),
        verification_failure_(false) {
    if (FLAGS_test_batches_snapshots) {
      // In batched mode the expected-value table and key locks are never
      // allocated; Put/Get/Delete/GetMutexForKey must not be used then.
      fprintf(stdout, "No lock creation because test_batches_snapshots set\n");
      return;
    }
    values_.resize(FLAGS_column_families);

    for (int i = 0; i < FLAGS_column_families; ++i) {
      values_[i] = std::vector<uint32_t>(max_key_, SENTINEL);
    }

    // One lock covers 2^log2_keys_per_lock_ consecutive keys; round up so
    // a trailing partial group also gets a lock.
    long num_locks = (max_key_ >> log2_keys_per_lock_);
    if (max_key_ & ((1 << log2_keys_per_lock_) - 1)) {
      num_locks++;
    }
    fprintf(stdout, "Creating %ld locks\n", num_locks * FLAGS_column_families);
    key_locks_.resize(FLAGS_column_families);
    for (int i = 0; i < FLAGS_column_families; ++i) {
      key_locks_[i] = std::vector<port::Mutex>(num_locks);
    }
  }

  ~SharedState() {}

  port::Mutex* GetMutex() {
    return &mu_;
  }

  port::CondVar* GetCondVar() {
    return &cv_;
  }

  StressTest* GetStressTest() const {
    return stress_test_;
  }

  long GetMaxKey() const {
    return max_key_;
  }

  uint32_t GetNumThreads() const {
    return num_threads_;
  }

  // The Inc*/All* phase counters below are plain longs; callers hold
  // GetMutex() around them (see StressTest::Run and ThreadBody).
  void IncInitialized() {
    num_initialized_++;
  }

  void IncOperated() {
    num_populated_++;
  }

  void IncDone() {
    num_done_++;
  }

  void IncVotedReopen() {
    vote_reopen_ = (vote_reopen_ + 1) % num_threads_;
  }

  bool AllInitialized() const {
    return num_initialized_ >= num_threads_;
  }

  bool AllOperated() const {
    return num_populated_ >= num_threads_;
  }

  bool AllDone() const {
    return num_done_ >= num_threads_;
  }

  // True once every thread has voted (the modulo in IncVotedReopen wraps
  // back to zero on the last vote).
  bool AllVotedReopen() {
    return (vote_reopen_ == 0);
  }

  void SetStart() {
    start_ = true;
  }

  void SetStartVerify() {
    start_verify_ = true;
  }

  bool Started() const {
    return start_;
  }

  bool VerifyStarted() const {
    return start_verify_;
  }

  void SetVerificationFailure() { verification_failure_.store(true); }

  bool HasVerificationFailedYet() { return verification_failure_.load(); }

  // Lock guarding the group of keys that contains `key` in column family
  // `cf`.
  port::Mutex* GetMutexForKey(int cf, long key) {
    return &key_locks_[cf][key >> log2_keys_per_lock_];
  }

  // Acquire every key lock of a column family (used while dropping and
  // recreating it in OperateDb).
  void LockColumnFamily(int cf) {
    for (auto& mutex : key_locks_[cf]) {
      mutex.Lock();
    }
  }

  void UnlockColumnFamily(int cf) {
    for (auto& mutex : key_locks_[cf]) {
      mutex.Unlock();
    }
  }

  // Reset expected values for a column family (after drop/recreate).
  void ClearColumnFamily(int cf) {
    std::fill(values_[cf].begin(), values_[cf].end(), SENTINEL);
  }

  // Record the expected value base for (cf, key). Callers hold the key's
  // mutex.
  void Put(int cf, long key, uint32_t value_base) {
    values_[cf][key] = value_base;
  }

  uint32_t Get(int cf, long key) const { return values_[cf][key]; }

  void Delete(int cf, long key) { values_[cf][key] = SENTINEL; }

  uint32_t GetSeed() const { return seed_; }

 private:
  port::Mutex mu_;
  port::CondVar cv_;
  const uint32_t seed_;
  const long max_key_;
  const uint32_t log2_keys_per_lock_;
  const int num_threads_;
  long num_initialized_;
  long num_populated_;
  long vote_reopen_;
  long num_done_;
  bool start_;
  bool start_verify_;
  StressTest* stress_test_;
  std::atomic<bool> verification_failure_;

  // values_[cf][key] is the expected value base, or SENTINEL if absent.
  std::vector<std::vector<uint32_t>> values_;
  // key_locks_[cf][key >> log2_keys_per_lock_] guards a group of keys.
  std::vector<std::vector<port::Mutex>> key_locks_;
};

const uint32_t SharedState::SENTINEL = 0xffffffff;
+
// Per-thread state for concurrent executions of the same benchmark.
struct ThreadState {
  uint32_t tid; // 0..n-1
  Random rand;  // Has different seeds for different threads
  SharedState* shared;  // not owned
  Stats stats;  // this thread's counters; merged into thread 0 by Run()

  // The seed offset keeps per-thread RNG streams distinct while staying
  // reproducible for a fixed --seed.
  ThreadState(uint32_t index, SharedState *shared)
      : tid(index),
        rand(1000 + index + shared->GetSeed()),
        shared(shared) {
  }
};
+
+}  // namespace
+
+class StressTest {
+ public:
  // Builds caches and the bloom filter policy from flags. When
  // --destroy_db_initially is set, deletes leftover "heap-" dump files
  // and destroys any existing DB so the run starts from a clean slate.
  StressTest()
      : cache_(NewLRUCache(FLAGS_cache_size)),
        compressed_cache_(FLAGS_compressed_cache_size >= 0
                              ? NewLRUCache(FLAGS_compressed_cache_size)
                              : nullptr),
        filter_policy_(FLAGS_bloom_bits >= 0
                           ? NewBloomFilterPolicy(FLAGS_bloom_bits)
                           : nullptr),
        db_(nullptr),
        new_column_family_name_(0),
        num_times_reopened_(0) {
    if (FLAGS_destroy_db_initially) {
      std::vector<std::string> files;
      FLAGS_env->GetChildren(FLAGS_db, &files);
      for (unsigned int i = 0; i < files.size(); i++) {
        if (Slice(files[i]).starts_with("heap-")) {
          FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
        }
      }
      DestroyDB(FLAGS_db, Options());
    }
  }
+
  // Column family handles are released before the DB that owns them.
  ~StressTest() {
    for (auto cf : column_families_) {
      delete cf;
    }
    column_families_.clear();
    delete db_;
    delete filter_policy_;
  }
+
  // Drives the whole test: spawns one ThreadBody per thread, shepherds
  // them through the init -> operate -> verify -> done phases via the
  // shared mutex/condvar, then merges per-thread stats into thread 0 and
  // reports. Returns false if any thread recorded a verification failure.
  bool Run() {
    PrintEnv();
    Open();
    SharedState shared(this);
    uint32_t n = shared.GetNumThreads();

    std::vector<ThreadState*> threads(n);
    for (uint32_t i = 0; i < n; i++) {
      threads[i] = new ThreadState(i, &shared);
      FLAGS_env->StartThread(ThreadBody, threads[i]);
    }
    // Each thread goes through the following states:
    // initializing -> wait for others to init -> read/populate/depopulate
    // wait for others to operate -> verify -> done

    {
      MutexLock l(shared.GetMutex());
      // Wait until every worker has checked in, then release them all.
      while (!shared.AllInitialized()) {
        shared.GetCondVar()->Wait();
      }

      double now = FLAGS_env->NowMicros();
      fprintf(stdout, "%s Starting database operations\n",
              FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());

      shared.SetStart();
      shared.GetCondVar()->SignalAll();
      while (!shared.AllOperated()) {
        shared.GetCondVar()->Wait();
      }

      now = FLAGS_env->NowMicros();
      if (FLAGS_test_batches_snapshots) {
        fprintf(stdout, "%s Limited verification already done during gets\n",
                FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
      } else {
        fprintf(stdout, "%s Starting verification\n",
                FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
      }

      shared.SetStartVerify();
      shared.GetCondVar()->SignalAll();
      while (!shared.AllDone()) {
        shared.GetCondVar()->Wait();
      }
    }

    // Fold all per-thread stats into thread 0 and report once.
    for (unsigned int i = 1; i < n; i++) {
      threads[0]->stats.Merge(threads[i]->stats);
    }
    threads[0]->stats.Report("Stress Test");

    for (unsigned int i = 0; i < n; i++) {
      delete threads[i];
      threads[i] = nullptr;
    }
    double now = FLAGS_env->NowMicros();
    if (!FLAGS_test_batches_snapshots) {
      fprintf(stdout, "%s Verification successful\n",
              FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
    }
    PrintStatistics();

    if (shared.HasVerificationFailedYet()) {
      printf("Verification failed :(\n");
      return false;
    }
    return true;
  }
+
+ private:
+
  // Thread entry point. Synchronizes with the driver in Run() via the
  // shared mutex/condvar: announce init, wait for start, run OperateDb,
  // announce operated, wait for the verify phase, run VerifyDb (skipped
  // in batched mode, which verifies inline), then announce done.
  static void ThreadBody(void* v) {
    ThreadState* thread = reinterpret_cast<ThreadState*>(v);
    SharedState* shared = thread->shared;

    {
      MutexLock l(shared->GetMutex());
      shared->IncInitialized();
      if (shared->AllInitialized()) {
        // Last thread in wakes the driver waiting in Run().
        shared->GetCondVar()->SignalAll();
      }
      while (!shared->Started()) {
        shared->GetCondVar()->Wait();
      }
    }
    thread->shared->GetStressTest()->OperateDb(thread);

    {
      MutexLock l(shared->GetMutex());
      shared->IncOperated();
      if (shared->AllOperated()) {
        shared->GetCondVar()->SignalAll();
      }
      while (!shared->VerifyStarted()) {
        shared->GetCondVar()->Wait();
      }
    }

    if (!FLAGS_test_batches_snapshots) {
      thread->shared->GetStressTest()->VerifyDb(thread);
    }

    {
      MutexLock l(shared->GetMutex());
      shared->IncDone();
      if (shared->AllDone()) {
        shared->GetCondVar()->SignalAll();
      }
    }

  }
+
  // Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ...
  // ("9"+K, "9"+V) in DB atomically i.e in a single batch.
  // Also refer MultiGet.
  Status MultiPut(ThreadState* thread, const WriteOptions& writeoptions,
                  ColumnFamilyHandle* column_family, const Slice& key,
                  const Slice& value, size_t sz) {
    std::string keys[10] = {"9", "8", "7", "6", "5",
                            "4", "3", "2", "1", "0"};
    std::string values[10] = {"9", "8", "7", "6", "5",
                              "4", "3", "2", "1", "0"};
    Slice value_slices[10];
    WriteBatch batch;
    Status s;
    for (int i = 0; i < 10; i++) {
      keys[i] += key.ToString();
      values[i] += value.ToString();
      value_slices[i] = values[i];
      // --use_merge replaces every Put with a Merge (per the flag's help
      // text, the merge operator is expected to behave like a Put).
      if (FLAGS_use_merge) {
        batch.Merge(column_family, keys[i], value_slices[i]);
      } else {
        batch.Put(column_family, keys[i], value_slices[i]);
      }
    }

    s = db_->Write(writeoptions, &batch);
    if (!s.ok()) {
      fprintf(stderr, "multiput error: %s\n", s.ToString().c_str());
      thread->stats.AddErrors(1);
    } else {
      // we did 10 writes each of size sz + 1
      thread->stats.AddBytesForWrites(10, (sz + 1) * 10);
    }

    return s;
  }
+
+  // Given a key K, this deletes ("0"+K), ("1"+K),... ("9"+K)
+  // in DB atomically i.e in a single batch. Also refer MultiGet.
+  Status MultiDelete(ThreadState* thread, const WriteOptions& writeoptions,
+                     ColumnFamilyHandle* column_family, const Slice& key) {
+    std::string keys[10] = {"9", "7", "5", "3", "1",
+                            "8", "6", "4", "2", "0"};
+
+    WriteBatch batch;
+    Status s;
+    for (int i = 0; i < 10; i++) {
+      keys[i] += key.ToString();
+      batch.Delete(column_family, keys[i]);
+    }
+
+    s = db_->Write(writeoptions, &batch);
+    if (!s.ok()) {
+      fprintf(stderr, "multidelete error: %s\n", s.ToString().c_str());
+      thread->stats.AddErrors(1);
+    } else {
+      thread->stats.AddDeletes(10);
+    }
+
+    return s;
+  }
+
  // Given a key K, this gets values for "0"+K, "1"+K,..."9"+K
  // in the same snapshot, and verifies that all the values are of the form
  // "0"+V, "1"+V,..."9"+V.
  // ASSUMES that MultiPut was used to put (K, V) into the DB.
  Status MultiGet(ThreadState* thread, const ReadOptions& readoptions,
                  ColumnFamilyHandle* column_family, const Slice& key,
                  std::string* value) {
    std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"};
    Slice key_slices[10];
    std::string values[10];
    ReadOptions readoptionscopy = readoptions;
    // All ten gets read from one snapshot so they see a consistent state.
    readoptionscopy.snapshot = db_->GetSnapshot();
    Status s;
    for (int i = 0; i < 10; i++) {
      keys[i] += key.ToString();
      key_slices[i] = keys[i];
      s = db_->Get(readoptionscopy, column_family, key_slices[i], value);
      if (!s.ok() && !s.IsNotFound()) {
        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
        values[i] = "";
        thread->stats.AddErrors(1);
        // we continue after error rather than exiting so that we can
        // find more errors if any
      } else if (s.IsNotFound()) {
        values[i] = "";
        thread->stats.AddGets(1, 0);
      } else {
        values[i] = *value;

        char expected_prefix = (keys[i])[0];
        char actual_prefix = (values[i])[0];
        if (actual_prefix != expected_prefix) {
          fprintf(stderr, "error expected prefix = %c actual = %c\n",
                  expected_prefix, actual_prefix);
        }
        (values[i])[0] = ' '; // blank out the differing character
      }
    }
    db_->ReleaseSnapshot(readoptionscopy.snapshot);

    // Now that we retrieved all values, check that they all match
    for (int i = 1; i < 10; i++) {
      if (values[i] != values[0]) {
        fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n",
                key.ToString(true).c_str(), StringToHex(values[0]).c_str(),
                StringToHex(values[i]).c_str());
      // we continue after error rather than exiting so that we can
      // find more errors if any
      }
    }

    // NOTE: the status returned is only that of the last Get above.
    return s;
  }
+
  // Given a key, this does prefix scans for "0"+P, "1"+P,..."9"+P
  // in the same snapshot where P is the first FLAGS_prefix_size - 1 bytes
  // of the key. Each of these 10 scans returns a series of values;
  // each series should be the same length, and it is verified for each
  // index i that all the i'th values are of the form "0"+V, "1"+V,..."9"+V.
  // ASSUMES that MultiPut was used to put (K, V)
  Status MultiPrefixScan(ThreadState* thread, const ReadOptions& readoptions,
                         ColumnFamilyHandle* column_family,
                         const Slice& key) {
    std::string prefixes[10] = {"0", "1", "2", "3", "4",
                                "5", "6", "7", "8", "9"};
    Slice prefix_slices[10];
    ReadOptions readoptionscopy[10];
    // One snapshot shared by all ten iterators keeps the scans consistent.
    const Snapshot* snapshot = db_->GetSnapshot();
    Iterator* iters[10];
    // NOTE(review): `s` is never assigned a non-OK value below, so the
    // AddErrors branch at the end is unreachable; iterator errors are
    // caught only by the asserts on iters[i]->status().
    Status s = Status::OK();
    for (int i = 0; i < 10; i++) {
      prefixes[i] += key.ToString();
      prefixes[i].resize(FLAGS_prefix_size);
      prefix_slices[i] = Slice(prefixes[i]);
      readoptionscopy[i] = readoptions;
      readoptionscopy[i].snapshot = snapshot;
      iters[i] = db_->NewIterator(readoptionscopy[i], column_family);
      iters[i]->Seek(prefix_slices[i]);
    }

    // Walk all ten iterators in lockstep; iterator 0 decides when the
    // prefix range ends.
    int count = 0;
    while (iters[0]->Valid() && iters[0]->key().starts_with(prefix_slices[0])) {
      count++;
      std::string values[10];
      // get list of all values for this iteration
      for (int i = 0; i < 10; i++) {
        // no iterator should finish before the first one
        assert(iters[i]->Valid() &&
               iters[i]->key().starts_with(prefix_slices[i]));
        values[i] = iters[i]->value().ToString();

        char expected_first = (prefixes[i])[0];
        char actual_first = (values[i])[0];

        if (actual_first != expected_first) {
          fprintf(stderr, "error expected first = %c actual = %c\n",
                  expected_first, actual_first);
        }
        (values[i])[0] = ' '; // blank out the differing character
      }
      // make sure all values are equivalent
      for (int i = 0; i < 10; i++) {
        if (values[i] != values[0]) {
          fprintf(stderr, "error : %d, inconsistent values for prefix %s: %s, %s\n",
                  i, prefixes[i].c_str(), StringToHex(values[0]).c_str(),
                  StringToHex(values[i]).c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        }
        iters[i]->Next();
      }
    }

    // cleanup iterators and snapshot
    for (int i = 0; i < 10; i++) {
      // if the first iterator finished, they should have all finished
      assert(!iters[i]->Valid() ||
             !iters[i]->key().starts_with(prefix_slices[i]));
      assert(iters[i]->status().ok());
      delete iters[i];
    }
    db_->ReleaseSnapshot(snapshot);

    if (s.ok()) {
      thread->stats.AddPrefixes(1, count);
    } else {
      thread->stats.AddErrors(1);
    }

    return s;
  }
+
+  // Given a key K, this creates an iterator which scans to K and then
+  // does a random sequence of Next/Prev operations.
+  Status MultiIterate(ThreadState* thread, const ReadOptions& readoptions,
+                      ColumnFamilyHandle* column_family, const Slice& key) {
+    Status s;
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ReadOptions readoptionscopy = readoptions;
+    readoptionscopy.snapshot = snapshot;
+    unique_ptr<Iterator> iter(db_->NewIterator(readoptionscopy, column_family));
+
+    iter->Seek(key);
+    for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); i++) {
+      if (thread->rand.OneIn(2)) {
+        iter->Next();
+      } else {
+        iter->Prev();
+      }
+    }
+
+    if (s.ok()) {
+      thread->stats.AddIterations(1);
+    } else {
+      thread->stats.AddErrors(1);
+    }
+
+    db_->ReleaseSnapshot(snapshot);
+
+    return s;
+  }
+
  // Main per-thread workload loop: runs FLAGS_ops_per_thread operations,
  // choosing read / prefix scan / write / delete / iterate according to
  // the *percent flags, periodically voting with the other threads to
  // reopen the DB, and occasionally dropping and recreating a column
  // family.
  void OperateDb(ThreadState* thread) {
    ReadOptions read_opts(FLAGS_verify_checksum, true);
    WriteOptions write_opts;
    char value[100];
    long max_key = thread->shared->GetMaxKey();
    std::string from_db;
    if (FLAGS_sync) {
      write_opts.sync = true;
    }
    write_opts.disableWAL = FLAGS_disable_wal;
    // Operation-type thresholds over prob_op in [0,100):
    // [0, read) read, [read, prefixBound) prefix scan,
    // [prefixBound, writeBound) write, [writeBound, delBound) delete,
    // rest iterate.
    const int prefixBound = (int)FLAGS_readpercent + (int)FLAGS_prefixpercent;
    const int writeBound = prefixBound + (int)FLAGS_writepercent;
    const int delBound = writeBound + (int)FLAGS_delpercent;

    thread->stats.Start();
    for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
      if (thread->shared->HasVerificationFailedYet()) {
        break;
      }
      // Periodically rendezvous with all threads to reopen the DB; the
      // last thread to vote performs the reopen and wakes the rest.
      if (i != 0 && (i % (FLAGS_ops_per_thread / (FLAGS_reopen + 1))) == 0) {
        {
          thread->stats.FinishedSingleOp();
          MutexLock l(thread->shared->GetMutex());
          thread->shared->IncVotedReopen();
          if (thread->shared->AllVotedReopen()) {
            thread->shared->GetStressTest()->Reopen();
            thread->shared->GetCondVar()->SignalAll();
          }
          else {
            thread->shared->GetCondVar()->Wait();
          }
          // Commenting this out as we don't want to reset stats on each open.
          // thread->stats.Start();
        }
      }

      if (!FLAGS_test_batches_snapshots &&
          FLAGS_clear_column_family_one_in != 0) {
        if (thread->rand.OneIn(FLAGS_clear_column_family_one_in)) {
          // drop column family and then create it again (can't drop default)
          int cf = thread->rand.Next() % (FLAGS_column_families - 1) + 1;
          std::string new_name =
              std::to_string(new_column_family_name_.fetch_add(1));
          {
            MutexLock l(thread->shared->GetMutex());
            fprintf(
                stdout,
                "[CF %d] Dropping and recreating column family. new name: %s\n",
                cf, new_name.c_str());
          }
          // Hold every key lock of the CF so no other thread operates on
          // it while it is dropped and recreated.
          thread->shared->LockColumnFamily(cf);
          Status s __attribute__((unused));
          s = db_->DropColumnFamily(column_families_[cf]);
          delete column_families_[cf];
          assert(s.ok());
          s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name,
                                      &column_families_[cf]);
          column_family_names_[cf] = new_name;
          thread->shared->ClearColumnFamily(cf);
          assert(s.ok());
          thread->shared->UnlockColumnFamily(cf);
        }
      }

      long rand_key = thread->rand.Next() % max_key;
      int rand_column_family = thread->rand.Next() % FLAGS_column_families;
      std::string keystr = Key(rand_key);
      Slice key = keystr;
      int prob_op = thread->rand.Uniform(100);
      // In non-batched mode, hold the key's lock so the DB and the
      // shared expected-value table are updated together.
      std::unique_ptr<MutexLock> l;
      if (!FLAGS_test_batches_snapshots) {
        l.reset(new MutexLock(
            thread->shared->GetMutexForKey(rand_column_family, rand_key)));
      }
      auto column_family = column_families_[rand_column_family];

      if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) {
        // OPERATION read
        if (!FLAGS_test_batches_snapshots) {
          Status s = db_->Get(read_opts, column_family, key, &from_db);
          if (s.ok()) {
            // found case
            thread->stats.AddGets(1, 1);
          } else if (s.IsNotFound()) {
            // not found case
            thread->stats.AddGets(1, 0);
          } else {
            // errors case
            thread->stats.AddErrors(1);
          }
        } else {
          MultiGet(thread, read_opts, column_family, key, &from_db);
        }
      } else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) {
        // OPERATION prefix scan
        // keys are 8 bytes long, prefix size is FLAGS_prefix_size. There are
        // (8 - FLAGS_prefix_size) bytes besides the prefix. So there will
        // be 2 ^ ((8 - FLAGS_prefix_size) * 8) possible keys with the same
        // prefix
        if (!FLAGS_test_batches_snapshots) {
          Slice prefix = Slice(key.data(), FLAGS_prefix_size);
          Iterator* iter = db_->NewIterator(read_opts, column_family);
          int64_t count = 0;
          for (iter->Seek(prefix);
               iter->Valid() && iter->key().starts_with(prefix); iter->Next()) {
            ++count;
          }
          assert(count <=
                 (static_cast<int64_t>(1) << ((8 - FLAGS_prefix_size) * 8)));
          if (iter->status().ok()) {
            thread->stats.AddPrefixes(1, count);
          } else {
            thread->stats.AddErrors(1);
          }
          delete iter;
        } else {
          MultiPrefixScan(thread, read_opts, column_family, key);
        }
      } else if (prefixBound <= prob_op && prob_op < writeBound) {
        // OPERATION write
        uint32_t value_base = thread->rand.Next();
        size_t sz = GenerateValue(value_base, value, sizeof(value));
        Slice v(value, sz);
        if (!FLAGS_test_batches_snapshots) {
          // Optionally verify the key's current value before overwriting;
          // on mismatch, stop this thread's workload.
          if (FLAGS_verify_before_write) {
            std::string keystr2 = Key(rand_key);
            Slice k = keystr2;
            Status s = db_->Get(read_opts, column_family, k, &from_db);
            if (VerifyValue(rand_column_family, rand_key, read_opts,
                            thread->shared, from_db, s, true) == false) {
              break;
            }
          }
          thread->shared->Put(rand_column_family, rand_key, value_base);
          if (FLAGS_use_merge) {
            db_->Merge(write_opts, column_family, key, v);
          } else {
            db_->Put(write_opts, column_family, key, v);
          }
          thread->stats.AddBytesForWrites(1, sz);
        } else {
          MultiPut(thread, write_opts, column_family, key, v, sz);
        }
        PrintKeyValue(rand_column_family, rand_key, value, sz);
      } else if (writeBound <= prob_op && prob_op < delBound) {
        // OPERATION delete
        if (!FLAGS_test_batches_snapshots) {
          thread->shared->Delete(rand_column_family, rand_key);
          db_->Delete(write_opts, column_family, key);
          thread->stats.AddDeletes(1);
        } else {
          MultiDelete(thread, write_opts, column_family, key);
        }
      } else {
        // OPERATION iterate
        MultiIterate(thread, read_opts, column_family, key);
      }
      thread->stats.FinishedSingleOp();
    }

    thread->stats.Stop();
  }
+
+  // Verify this thread's slice of the key space against the expected
+  // values recorded in SharedState.  Each thread checks a contiguous
+  // range [start, end); the last thread also takes the remainder.
+  // Stops early as soon as any thread has flagged a verification failure.
+  void VerifyDb(ThreadState* thread) const {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    auto shared = thread->shared;
+    // NOTE(review): these are function-local statics, so they are computed
+    // once for the whole process -- assumes max key count and thread count
+    // never change between calls; confirm if StressTest is ever re-run.
+    static const long max_key = shared->GetMaxKey();
+    static const long keys_per_thread = max_key / shared->GetNumThreads();
+    long start = keys_per_thread * thread->tid;
+    long end = start + keys_per_thread;
+    if (thread->tid == shared->GetNumThreads() - 1) {
+      end = max_key;
+    }
+    for (size_t cf = 0; cf < column_families_.size(); ++cf) {
+      if (thread->shared->HasVerificationFailedYet()) {
+        break;
+      }
+      // Randomly pick one of two verification strategies per column
+      // family: a single iterator sweep, or point Gets per key.
+      if (!thread->rand.OneIn(2)) {
+        // Use iterator to verify this range
+        unique_ptr<Iterator> iter(
+            db_->NewIterator(options, column_families_[cf]));
+        iter->Seek(Key(start));
+        for (long i = start; i < end; i++) {
+          if (thread->shared->HasVerificationFailedYet()) {
+            break;
+          }
+          // TODO(ljin): update "long" to uint64_t
+          // Reseek when the prefix changes
+          if (i % (static_cast<int64_t>(1) << 8 * (8 - FLAGS_prefix_size)) ==
+              0) {
+            iter->Seek(Key(i));
+          }
+          std::string from_db;
+          std::string keystr = Key(i);
+          Slice k = keystr;
+          Status s = iter->status();
+          if (iter->Valid()) {
+            if (iter->key().compare(k) > 0) {
+              // Iterator is already past k: the key is missing from the DB.
+              s = Status::NotFound(Slice());
+            } else if (iter->key().compare(k) == 0) {
+              from_db = iter->value().ToString();
+              iter->Next();
+            } else if (iter->key().compare(k) < 0) {
+              VerificationAbort(shared, "An out of range key was found", cf, i);
+            }
+          } else {
+            // The iterator found no value for the key in question, so do not
+            // move to the next item in the iterator
+            s = Status::NotFound(Slice());
+          }
+          VerifyValue(cf, i, options, shared, from_db, s, true);
+          if (from_db.length()) {
+            PrintKeyValue(cf, i, from_db.data(), from_db.length());
+          }
+        }
+      } else {
+        // Use Get to verify this range
+        for (long i = start; i < end; i++) {
+          if (thread->shared->HasVerificationFailedYet()) {
+            break;
+          }
+          std::string from_db;
+          std::string keystr = Key(i);
+          Slice k = keystr;
+          Status s = db_->Get(options, column_families_[cf], k, &from_db);
+          VerifyValue(cf, i, options, shared, from_db, s, true);
+          if (from_db.length()) {
+            PrintKeyValue(cf, i, from_db.data(), from_db.length());
+          }
+        }
+      }
+    }
+  }
+
+  // Report a verification mismatch for (cf, key) and latch the shared
+  // failure flag so every worker thread stops verifying.
+  void VerificationAbort(SharedState* shared, std::string msg, int cf,
+                         long key) const {
+    fprintf(stdout, "Verification failed for column family %d key %ld: %s\n",
+            cf, key, msg.c_str());
+    shared->SetVerificationFailure();
+  }
+
+  // Compare what the DB returned for (cf, key) against the expectation
+  // recorded in SharedState.  Returns false (after flagging global
+  // verification failure) on any mismatch; true when consistent.
+  bool VerifyValue(int cf, long key, const ReadOptions& opts,
+                   SharedState* shared, const std::string& value_from_db,
+                   Status s, bool strict = false) const {
+    if (shared->HasVerificationFailedYet()) {
+      return false;
+    }
+    // compare value_from_db with the value in the shared state
+    char value[100];
+    uint32_t value_base = shared->Get(cf, key);
+    // SENTINEL appears to mark keys with no recorded expected value; in
+    // non-strict mode any DB answer is accepted for such keys.
+    if (value_base == SharedState::SENTINEL && !strict) {
+      return true;
+    }
+
+    if (s.ok()) {
+      if (value_base == SharedState::SENTINEL) {
+        VerificationAbort(shared, "Unexpected value found", cf, key);
+        return false;
+      }
+      // Regenerate the expected value from its stored seed and compare
+      // both length and contents byte-for-byte.
+      size_t sz = GenerateValue(value_base, value, sizeof(value));
+      if (value_from_db.length() != sz) {
+        VerificationAbort(shared, "Length of value read is not equal", cf, key);
+        return false;
+      }
+      if (memcmp(value_from_db.data(), value, sz) != 0) {
+        VerificationAbort(shared, "Contents of value read don't match", cf,
+                          key);
+        return false;
+      }
+    } else {
+      // Read failed (e.g. NotFound): only acceptable when no value was
+      // expected for this key.
+      if (value_base != SharedState::SENTINEL) {
+        VerificationAbort(shared, "Value not found: " + s.ToString(), cf, key);
+        return false;
+      }
+    }
+    return true;
+  }
+
+  static void PrintKeyValue(int cf, uint32_t key, const char* value,
+                            size_t sz) {
+    if (!FLAGS_verbose) {
+      return;
+    }
+    fprintf(stdout, "[CF %d] %u ==> (%u) ", cf, key, (unsigned int)sz);
+    for (size_t i = 0; i < sz; i++) {
+      fprintf(stdout, "%X", value[i]);
+    }
+    fprintf(stdout, "\n");
+  }
+
+  static size_t GenerateValue(uint32_t rand, char *v, size_t max_sz) {
+    size_t value_sz = ((rand % 3) + 1) * FLAGS_value_size_mult;
+    assert(value_sz <= max_sz && value_sz >= sizeof(uint32_t));
+    *((uint32_t*)v) = rand;
+    for (size_t i=sizeof(uint32_t); i < value_sz; i++) {
+      v[i] = (char)(rand ^ i);
+    }
+    v[value_sz] = '\0';
+    return value_sz; // the size of the value set.
+  }
+
+  void PrintEnv() const {
+    fprintf(stdout, "RocksDB version     : %d.%d\n", kMajorVersion,
+            kMinorVersion);
+    fprintf(stdout, "Column families     : %d\n", FLAGS_column_families);
+    if (!FLAGS_test_batches_snapshots) {
+      fprintf(stdout, "Clear CFs one in    : %d\n",
+              FLAGS_clear_column_family_one_in);
+    }
+    fprintf(stdout, "Number of threads   : %d\n", FLAGS_threads);
+    fprintf(stdout,
+            "Ops per thread      : %lu\n",
+            (unsigned long)FLAGS_ops_per_thread);
+    std::string ttl_state("unused");
+    if (FLAGS_ttl > 0) {
+      ttl_state = NumberToString(FLAGS_ttl);
+    }
+    fprintf(stdout, "Time to live(sec)   : %s\n", ttl_state.c_str());
+    fprintf(stdout, "Read percentage     : %d%%\n", FLAGS_readpercent);
+    fprintf(stdout, "Prefix percentage   : %d%%\n", FLAGS_prefixpercent);
+    fprintf(stdout, "Write percentage    : %d%%\n", FLAGS_writepercent);
+    fprintf(stdout, "Delete percentage   : %d%%\n", FLAGS_delpercent);
+    fprintf(stdout, "Iterate percentage  : %d%%\n", FLAGS_iterpercent);
+    fprintf(stdout, "Write-buffer-size   : %d\n", FLAGS_write_buffer_size);
+    fprintf(stdout,
+            "Iterations          : %lu\n",
+            (unsigned long)FLAGS_num_iterations);
+    fprintf(stdout,
+            "Max key             : %lu\n",
+            (unsigned long)FLAGS_max_key);
+    fprintf(stdout, "Ratio #ops/#keys    : %f\n",
+            (1.0 * FLAGS_ops_per_thread * FLAGS_threads)/FLAGS_max_key);
+    fprintf(stdout, "Num times DB reopens: %d\n", FLAGS_reopen);
+    fprintf(stdout, "Batches/snapshots   : %d\n",
+            FLAGS_test_batches_snapshots);
+    fprintf(stdout, "Purge redundant %%   : %d\n",
+            FLAGS_purge_redundant_percent);
+    fprintf(stdout, "Deletes use filter  : %d\n",
+            FLAGS_filter_deletes);
+    fprintf(stdout, "Num keys per lock   : %d\n",
+            1 << FLAGS_log2_keys_per_lock);
+
+    const char* compression = "";
+    switch (FLAGS_compression_type_e) {
+      case rocksdb::kNoCompression:
+        compression = "none";
+        break;
+      case rocksdb::kSnappyCompression:
+        compression = "snappy";
+        break;
+      case rocksdb::kZlibCompression:
+        compression = "zlib";
+        break;
+      case rocksdb::kBZip2Compression:
+        compression = "bzip2";
+        break;
+      case rocksdb::kLZ4Compression:
+        compression = "lz4";
+      case rocksdb::kLZ4HCCompression:
+        compression = "lz4hc";
+        break;
+      }
+
+    fprintf(stdout, "Compression         : %s\n", compression);
+
+    const char* memtablerep = "";
+    switch (FLAGS_rep_factory) {
+      case kSkipList:
+        memtablerep = "skip_list";
+        break;
+      case kHashSkipList:
+        memtablerep = "prefix_hash";
+        break;
+      case kVectorRep:
+        memtablerep = "vector";
+        break;
+    }
+
+    fprintf(stdout, "Memtablerep         : %s\n", memtablerep);
+
+    fprintf(stdout, "------------------------------------------------\n");
+  }
+
+  // Build options_ from the command-line flags and open (or reopen) the
+  // database, creating any missing column families.  Exits the process on
+  // an open failure or on inconsistent flag combinations.
+  void Open() {
+    assert(db_ == nullptr);
+    options_.block_cache = cache_;
+    options_.block_cache_compressed = compressed_cache_;
+    options_.write_buffer_size = FLAGS_write_buffer_size;
+    options_.max_write_buffer_number = FLAGS_max_write_buffer_number;
+    options_.min_write_buffer_number_to_merge =
+        FLAGS_min_write_buffer_number_to_merge;
+    options_.max_background_compactions = FLAGS_max_background_compactions;
+    options_.max_background_flushes = FLAGS_max_background_flushes;
+    options_.compaction_style =
+        static_cast<rocksdb::CompactionStyle>(FLAGS_compaction_style);
+    options_.block_size = FLAGS_block_size;
+    options_.filter_policy = filter_policy_;
+    options_.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size));
+    options_.max_open_files = FLAGS_open_files;
+    options_.statistics = dbstats;
+    options_.env = FLAGS_env;
+    options_.disableDataSync = FLAGS_disable_data_sync;
+    options_.use_fsync = FLAGS_use_fsync;
+    options_.allow_mmap_reads = FLAGS_mmap_read;
+    // Global knob for the fault-injection ("kill point") test machinery.
+    rocksdb_kill_odds = FLAGS_kill_random_test;
+    options_.target_file_size_base = FLAGS_target_file_size_base;
+    options_.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
+    options_.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
+    options_.max_bytes_for_level_multiplier =
+        FLAGS_max_bytes_for_level_multiplier;
+    options_.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
+    options_.level0_slowdown_writes_trigger =
+        FLAGS_level0_slowdown_writes_trigger;
+    options_.level0_file_num_compaction_trigger =
+        FLAGS_level0_file_num_compaction_trigger;
+    options_.compression = FLAGS_compression_type_e;
+    options_.create_if_missing = true;
+    options_.disable_seek_compaction = FLAGS_disable_seek_compaction;
+    // Deliberately tiny manifest to exercise manifest rollover.
+    options_.max_manifest_file_size = 10 * 1024;
+    options_.filter_deletes = FLAGS_filter_deletes;
+    // prefix_hash memtables require a prefix extractor, and vice versa.
+    if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) {
+      fprintf(stderr,
+            "prefix_size should be non-zero iff memtablerep == prefix_hash\n");
+      exit(1);
+    }
+    switch (FLAGS_rep_factory) {
+      case kHashSkipList:
+        options_.memtable_factory.reset(NewHashSkipListRepFactory());
+        break;
+      case kSkipList:
+        // no need to do anything
+        break;
+      case kVectorRep:
+        options_.memtable_factory.reset(new VectorRepFactory());
+        break;
+    }
+    // Randomly (but reproducibly) disable purging of redundant key/values
+    // during flush on a fraction of runs.
+    static Random purge_percent(1000); // no benefit from non-determinism here
+    if (static_cast<int32_t>(purge_percent.Uniform(100)) <
+        FLAGS_purge_redundant_percent - 1) {
+      options_.purge_redundant_kvs_while_flush = false;
+    }
+
+    if (FLAGS_use_merge) {
+      options_.merge_operator = MergeOperators::CreatePutOperator();
+    }
+
+    // set universal style compaction configurations, if applicable
+    if (FLAGS_universal_size_ratio != 0) {
+      options_.compaction_options_universal.size_ratio =
+          FLAGS_universal_size_ratio;
+    }
+    if (FLAGS_universal_min_merge_width != 0) {
+      options_.compaction_options_universal.min_merge_width =
+          FLAGS_universal_min_merge_width;
+    }
+    if (FLAGS_universal_max_merge_width != 0) {
+      options_.compaction_options_universal.max_merge_width =
+          FLAGS_universal_max_merge_width;
+    }
+    if (FLAGS_universal_max_size_amplification_percent != 0) {
+      options_.compaction_options_universal.max_size_amplification_percent =
+          FLAGS_universal_max_size_amplification_percent;
+    }
+
+    fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
+
+    Status s;
+    // FLAGS_ttl == -1 means TTL is unused: open a plain DB with the full
+    // set of column families.  Otherwise open through DBWithTTL (which
+    // does not use column families here).
+    if (FLAGS_ttl == -1) {
+      std::vector<std::string> existing_column_families;
+      s = DB::ListColumnFamilies(DBOptions(options_), FLAGS_db,
+                                 &existing_column_families);  // ignore errors
+      if (!s.ok()) {
+        // DB doesn't exist
+        assert(existing_column_families.empty());
+        assert(column_family_names_.empty());
+        column_family_names_.push_back(kDefaultColumnFamilyName);
+      } else if (column_family_names_.empty()) {
+        // this is the first call to the function Open()
+        column_family_names_ = existing_column_families;
+      } else {
+        // this is a reopen. just assert that existing column_family_names are
+        // equivalent to what we remember
+        auto sorted_cfn = column_family_names_;
+        sort(sorted_cfn.begin(), sorted_cfn.end());
+        sort(existing_column_families.begin(), existing_column_families.end());
+        if (sorted_cfn != existing_column_families) {
+          fprintf(stderr,
+                  "Expected column families differ from the existing:\n");
+          printf("Expected: {");
+          for (auto cf : sorted_cfn) {
+            printf("%s ", cf.c_str());
+          }
+          printf("}\n");
+          printf("Existing: {");
+          for (auto cf : existing_column_families) {
+            printf("%s ", cf.c_str());
+          }
+          printf("}\n");
+        }
+        assert(sorted_cfn == existing_column_families);
+      }
+      std::vector<ColumnFamilyDescriptor> cf_descriptors;
+      // Non-default CFs are named with increasing integers; keep the
+      // next-name counter ahead of every name we have seen.
+      for (auto name : column_family_names_) {
+        if (name != kDefaultColumnFamilyName) {
+          new_column_family_name_ =
+              std::max(new_column_family_name_.load(), std::stoi(name) + 1);
+        }
+        cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_));
+      }
+      s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors,
+                   &column_families_, &db_);
+      if (s.ok()) {
+        // Create any column families still missing to reach the requested
+        // --column_families count.
+        while (s.ok() &&
+               column_families_.size() < (size_t)FLAGS_column_families) {
+          ColumnFamilyHandle* cf = nullptr;
+          std::string name = std::to_string(new_column_family_name_.load());
+          new_column_family_name_++;
+          s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), name, &cf);
+          column_families_.push_back(cf);
+          column_family_names_.push_back(name);
+        }
+      }
+      assert(!s.ok() || column_families_.size() ==
+                            static_cast<size_t>(FLAGS_column_families));
+    } else {
+      DBWithTTL* db_with_ttl;
+      s = DBWithTTL::Open(options_, FLAGS_db, &db_with_ttl, FLAGS_ttl);
+      db_ = db_with_ttl;
+    }
+    if (!s.ok()) {
+      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+  }
+
+  // Close the DB (releasing every column family handle first) and open
+  // it again, logging how many reopens have happened so far.
+  void Reopen() {
+    for (size_t i = 0; i < column_families_.size(); ++i) {
+      delete column_families_[i];
+    }
+    column_families_.clear();
+    delete db_;
+    db_ = nullptr;
+
+    num_times_reopened_++;
+    double now_micros = FLAGS_env->NowMicros();
+    uint64_t now_secs = static_cast<uint64_t>(now_micros) / 1000000;
+    fprintf(stdout, "%s Reopening database for the %dth time\n",
+            FLAGS_env->TimeToString(now_secs).c_str(), num_times_reopened_);
+    Open();
+  }
+
+  // Dump accumulated DB statistics, if statistics collection was enabled.
+  void PrintStatistics() {
+    if (!dbstats) {
+      return;
+    }
+    fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
+  }
+
+ private:
+  shared_ptr<Cache> cache_;             // block cache handed to options_
+  shared_ptr<Cache> compressed_cache_;  // compressed block cache
+  const FilterPolicy* filter_policy_;   // table filter policy
+  DB* db_;                              // DB under test; nullptr when closed
+  Options options_;                     // options used for every (re)open
+  std::vector<ColumnFamilyHandle*> column_families_;  // open CF handles
+  std::vector<std::string> column_family_names_;      // CF names seen so far
+  std::atomic<int> new_column_family_name_;  // next numeric CF name to use
+  int num_times_reopened_;              // count of Reopen() calls
+};
+
+}  // namespace rocksdb
+
+// db_stress entry point: parse gflags, validate flag combinations, pick a
+// default DB path if none was given, then run the stress test.  Returns 0
+// on success, 1 on verification failure.
+int main(int argc, char** argv) {
+  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                          " [OPTIONS]...");
+  google::ParseCommandLineFlags(&argc, &argv, true);
+
+  if (FLAGS_statistics) {
+    dbstats = rocksdb::CreateDBStatistics();
+  }
+  // Translate the string-valued flags into their enum counterparts.
+  FLAGS_compression_type_e =
+    StringToCompressionType(FLAGS_compression_type.c_str());
+  if (!FLAGS_hdfs.empty()) {
+    FLAGS_env  = new rocksdb::HdfsEnv(FLAGS_hdfs);
+  }
+  FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
+
+  // The number of background threads should be at least as much the
+  // max number of concurrent compactions.
+  FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions);
+
+  // Reject flag combinations that would make the run meaningless or
+  // unsafe before doing any work.
+  if (FLAGS_prefixpercent > 0 && FLAGS_prefix_size <= 0) {
+    fprintf(stderr,
+            "Error: prefixpercent is non-zero while prefix_size is "
+            "not positive!\n");
+    exit(1);
+  }
+  if (FLAGS_test_batches_snapshots && FLAGS_prefix_size <= 0) {
+    fprintf(stderr,
+            "Error: please specify prefix_size for "
+            "test_batches_snapshots test!\n");
+    exit(1);
+  }
+  if ((FLAGS_readpercent + FLAGS_prefixpercent +
+       FLAGS_writepercent + FLAGS_delpercent + FLAGS_iterpercent) != 100) {
+      fprintf(stderr,
+              "Error: Read+Prefix+Write+Delete+Iterate percents != 100!\n");
+      exit(1);
+  }
+  if (FLAGS_disable_wal == 1 && FLAGS_reopen > 0) {
+      fprintf(stderr, "Error: Db cannot reopen safely with disable_wal set!\n");
+      exit(1);
+  }
+  if ((unsigned)FLAGS_reopen >= FLAGS_ops_per_thread) {
+      fprintf(stderr,
+              "Error: #DB-reopens should be < ops_per_thread\n"
+              "Provided reopens = %d and ops_per_thread = %lu\n",
+              FLAGS_reopen,
+              (unsigned long)FLAGS_ops_per_thread);
+      exit(1);
+  }
+
+  // Choose a location for the test database if none given with --db=<path>
+  if (FLAGS_db.empty()) {
+      std::string default_db_path;
+      rocksdb::Env::Default()->GetTestDirectory(&default_db_path);
+      default_db_path += "/dbstress";
+      FLAGS_db = default_db_path;
+  }
+
+  rocksdb::StressTest stress;
+  if (stress.Run()) {
+    return 0;
+  } else {
+    return 1;
+  }
+}
diff --git a/tools/ldb.cc b/tools/ldb.cc
new file mode 100644 (file)
index 0000000..4581b80
--- /dev/null
@@ -0,0 +1,13 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#include "rocksdb/ldb_tool.h"
+
+// Thin launcher: hand the command line straight to the LDB tool.
+int main(int argc, char** argv) {
+  rocksdb::LDBTool ldb_tool;
+  ldb_tool.Run(argc, argv);
+  return 0;
+}
diff --git a/tools/ldb_test.py b/tools/ldb_test.py
new file mode 100644 (file)
index 0000000..b4ef522
--- /dev/null
@@ -0,0 +1,383 @@
+import os
+import os.path
+import shutil
+import subprocess
+import time
+import unittest
+import tempfile
+
+def my_check_output(*popenargs, **kwargs):
+    """
+    If we had python 2.7, we should simply use subprocess.check_output.
+    This is a stop-gap solution for python 2.6
+    """
+    if 'stdout' in kwargs:
+        raise ValueError('stdout argument not allowed, it will be overridden.')
+    process = subprocess.Popen(stderr=subprocess.PIPE, stdout=subprocess.PIPE,
+                               *popenargs, **kwargs)
+    output, unused_err = process.communicate()
+    retcode = process.poll()
+    if retcode:
+        cmd = kwargs.get("args")
+        if cmd is None:
+            cmd = popenargs[0]
+        raise Exception("Exit code is not 0.  It is %d.  Command: %s" %
+                (retcode, cmd))
+    return output
+
+def run_err_null(cmd):
+    return os.system(cmd + " 2>/dev/null ")
+
+class LDBTestCase(unittest.TestCase):
+    def setUp(self):
+        self.TMP_DIR  = tempfile.mkdtemp(prefix="ldb_test_")
+        self.DB_NAME = "testdb"
+
+    def tearDown(self):
+        # Guard against rm -rf'ing a system path if TMP_DIR was ever
+        # mis-assigned, then remove the per-test scratch directory.
+        assert(self.TMP_DIR.strip() != "/"
+                and self.TMP_DIR.strip() != "/tmp"
+                and self.TMP_DIR.strip() != "/tmp/") #Just some paranoia
+
+        shutil.rmtree(self.TMP_DIR)
+
+    def dbParam(self, dbName):
+        return "--db=%s" % os.path.join(self.TMP_DIR, dbName)
+
+    def assertRunOKFull(self, params, expectedOutput, unexpected=False):
+        """
+        All command-line params must be specified.
+        Allows full flexibility in testing; for example: missing db param.
+
+        Runs ./ldb with `params` (filtering out the "Created bg thread"
+        noise line) and asserts the output equals expectedOutput, or
+        differs from it when `unexpected` is True.
+        """
+
+        output = my_check_output("./ldb %s |grep -v \"Created bg thread\"" %
+                            params, shell=True)
+        if not unexpected:
+            self.assertEqual(output.strip(), expectedOutput.strip())
+        else:
+            self.assertNotEqual(output.strip(), expectedOutput.strip())
+
+    def assertRunFAILFull(self, params):
+        """
+        All command-line params must be specified.
+        Allows full flexibility in testing; for example: missing db param.
+
+        Passes only if running ./ldb with `params` fails (exits non-zero,
+        which my_check_output surfaces as an exception).
+        """
+        try:
+
+            # my_check_output raises when the command exits non-zero --
+            # exactly the outcome this assertion expects.
+            my_check_output("./ldb %s >/dev/null 2>&1 |grep -v \"Created bg \
+                thread\"" % params, shell=True)
+        except Exception, e:
+            return
+        self.fail(
+            "Exception should have been raised for command with params: %s" %
+            params)
+
+    def assertRunOK(self, params, expectedOutput, unexpected=False):
+        """
+        Uses the default test db.
+
+        Convenience wrapper around assertRunOKFull that prepends the
+        --db argument for the default test database.
+        """
+        self.assertRunOKFull("%s %s" % (self.dbParam(self.DB_NAME), params),
+                             expectedOutput, unexpected)
+
+    def assertRunFAIL(self, params):
+        """
+        Uses the default test db.
+
+        Convenience wrapper around assertRunFAILFull that prepends the
+        --db argument for the default test database.
+        """
+        self.assertRunFAILFull("%s %s" % (self.dbParam(self.DB_NAME), params))
+
+    def testSimpleStringPutGet(self):
+        # End-to-end coverage of put/get/scan/delete/checkconsistency on
+        # plain string keys, including scan ranges and max_keys limits.
+        print "Running testSimpleStringPutGet..."
+        self.assertRunFAIL("put x1 y1")
+        self.assertRunOK("put --create_if_missing x1 y1", "OK")
+        self.assertRunOK("get x1", "y1")
+        self.assertRunFAIL("get x2")
+
+        self.assertRunOK("put x2 y2", "OK")
+        self.assertRunOK("get x1", "y1")
+        self.assertRunOK("get x2", "y2")
+        self.assertRunFAIL("get x3")
+
+        self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2")
+        self.assertRunOK("put x3 y3", "OK")
+
+        self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2\nx3 : y3")
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3")
+        self.assertRunOK("scan --from=x", "x1 : y1\nx2 : y2\nx3 : y3")
+
+        self.assertRunOK("scan --to=x2", "x1 : y1")
+        self.assertRunOK("scan --from=x1 --to=z --max_keys=1", "x1 : y1")
+        self.assertRunOK("scan --from=x1 --to=z --max_keys=2",
+                "x1 : y1\nx2 : y2")
+
+        self.assertRunOK("scan --from=x1 --to=z --max_keys=3",
+                "x1 : y1\nx2 : y2\nx3 : y3")
+        self.assertRunOK("scan --from=x1 --to=z --max_keys=4",
+                "x1 : y1\nx2 : y2\nx3 : y3")
+        self.assertRunOK("scan --from=x1 --to=x2", "x1 : y1")
+        self.assertRunOK("scan --from=x2 --to=x4", "x2 : y2\nx3 : y3")
+        self.assertRunFAIL("scan --from=x4 --to=z") # No results => FAIL
+        self.assertRunFAIL("scan --from=x1 --to=z --max_keys=foo")
+
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3")
+
+        self.assertRunOK("delete x1", "OK")
+        self.assertRunOK("scan", "x2 : y2\nx3 : y3")
+
+        self.assertRunOK("delete NonExistentKey", "OK")
+        # It is weird that GET and SCAN raise exception for
+        # non-existent key, while delete does not
+
+        self.assertRunOK("checkconsistency", "OK")
+
+    def dumpDb(self, params, dumpFile):
+        return 0 == run_err_null("./ldb dump %s > %s" % (params, dumpFile))
+
+    def loadDb(self, params, dumpFile):
+        return 0 == run_err_null("cat %s | ./ldb load %s" % (dumpFile, params))
+
+    def testStringBatchPut(self):
+        # batchput writes several key/value pairs in one batch; also checks
+        # that an odd number of arguments is rejected.
+        print "Running testStringBatchPut..."
+        self.assertRunOK("batchput x1 y1 --create_if_missing", "OK")
+        self.assertRunOK("scan", "x1 : y1")
+        self.assertRunOK("batchput x2 y2 x3 y3 \"x4 abc\" \"y4 xyz\"", "OK")
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 abc : y4 xyz")
+        self.assertRunFAIL("batchput")
+        self.assertRunFAIL("batchput k1")
+        self.assertRunFAIL("batchput k1 v1 k2")
+
+    def testCountDelimDump(self):
+        # dump --count_delim groups keys by the prefix before a delimiter
+        # (default "."), reporting per-group counts and sizes.
+        print "Running testCountDelimDump..."
+        self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK")
+        self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK")
+        self.assertRunOK("dump --count_delim", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
+        self.assertRunOK("dump --count_delim=\".\"", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
+        self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK")
+        self.assertRunOK("dump --count_delim=\",\"", "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8")
+
+    def testCountDelimIDump(self):
+        # NOTE(review): this body is byte-for-byte identical to
+        # testCountDelimDump above and never invokes the idump command --
+        # presumably it was meant to exercise "idump --count_delim";
+        # confirm and update.
+        print "Running testCountDelimIDump..."
+        self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK")
+        self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK")
+        self.assertRunOK("dump --count_delim", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
+        self.assertRunOK("dump --count_delim=\".\"", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
+        self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK")
+        self.assertRunOK("dump --count_delim=\",\"", "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8")
+
+    def testInvalidCmdLines(self):
+        # NOTE(review): a second method with this exact name is defined
+        # further down in the class; Python keeps only the later
+        # definition, so this copy never runs.  Its trailing "hex has
+        # invalid boolean value" comment also has no matching assertion.
+        print "Running testInvalidCmdLines..."
+        # db not specified
+        self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing")
+        # No param called he
+        self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing")
+        # max_keys is not applicable for put
+        self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing")
+        # hex has invalid boolean value
+
+    def testHexPutGet(self):
+        # Exercises the --hex / --key_hex / --value_hex encodings for put,
+        # get, scan, batchput and delete ("a1" == 0x6131, "b1" == 0x6231).
+        print "Running testHexPutGet..."
+        self.assertRunOK("put a1 b1 --create_if_missing", "OK")
+        self.assertRunOK("scan", "a1 : b1")
+        self.assertRunOK("scan --hex", "0x6131 : 0x6231")
+        self.assertRunFAIL("put --hex 6132 6232")
+        self.assertRunOK("put --hex 0x6132 0x6232", "OK")
+        self.assertRunOK("scan --hex", "0x6131 : 0x6231\n0x6132 : 0x6232")
+        self.assertRunOK("scan", "a1 : b1\na2 : b2")
+        self.assertRunOK("get a1", "b1")
+        self.assertRunOK("get --hex 0x6131", "0x6231")
+        self.assertRunOK("get a2", "b2")
+        self.assertRunOK("get --hex 0x6132", "0x6232")
+        self.assertRunOK("get --key_hex 0x6132", "b2")
+        self.assertRunOK("get --key_hex --value_hex 0x6132", "0x6232")
+        self.assertRunOK("get --value_hex a2", "0x6232")
+        self.assertRunOK("scan --key_hex --value_hex",
+                "0x6131 : 0x6231\n0x6132 : 0x6232")
+        self.assertRunOK("scan --hex --from=0x6131 --to=0x6133",
+                "0x6131 : 0x6231\n0x6132 : 0x6232")
+        self.assertRunOK("scan --hex --from=0x6131 --to=0x6132",
+                "0x6131 : 0x6231")
+        self.assertRunOK("scan --key_hex", "0x6131 : b1\n0x6132 : b2")
+        self.assertRunOK("scan --value_hex", "a1 : 0x6231\na2 : 0x6232")
+        self.assertRunOK("batchput --hex 0x6133 0x6233 0x6134 0x6234", "OK")
+        self.assertRunOK("scan", "a1 : b1\na2 : b2\na3 : b3\na4 : b4")
+        self.assertRunOK("delete --hex 0x6133", "OK")
+        self.assertRunOK("scan", "a1 : b1\na2 : b2\na4 : b4")
+        self.assertRunOK("checkconsistency", "OK")
+
+    def testTtlPutGet(self):
+        # Values written with --ttl carry an appended timestamp; reads must
+        # also pass --ttl to strip it, otherwise the raw bytes show through.
+        print "Running testTtlPutGet..."
+        self.assertRunOK("put a1 b1 --ttl --create_if_missing", "OK")
+        self.assertRunOK("scan --hex", "0x6131 : 0x6231", True)
+        self.assertRunOK("dump --ttl ", "a1 ==> b1", True)
+        self.assertRunOK("dump --hex --ttl ",
+                         "0x6131 ==> 0x6231\nKeys in range: 1")
+        self.assertRunOK("scan --hex --ttl", "0x6131 : 0x6231")
+        self.assertRunOK("get --value_hex a1", "0x6231", True)
+        self.assertRunOK("get --ttl a1", "b1")
+        self.assertRunOK("put a3 b3 --create_if_missing", "OK")
+        # fails because timestamp's length is greater than value's
+        self.assertRunFAIL("get --ttl a3")
+        self.assertRunOK("checkconsistency", "OK")
+
+    def testInvalidCmdLines(self):
+        # Malformed command lines must be rejected.  (This redefinition
+        # supersedes the identically-named method earlier in the class.)
+        print "Running testInvalidCmdLines..."
+        # db not specified
+        self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing")
+        # No param called he
+        self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing")
+        # max_keys is not applicable for put
+        self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing")
+        # hex has invalid boolean value
+        self.assertRunFAIL("put 0x6133 0x6233 --hex=Boo --create_if_missing")
+
+    def testDumpLoad(self):
+        print "Running testDumpLoad..."
+        self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4",
+                "OK")
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+        origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+
+        # Dump and load without any additional params specified
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump1")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump1")
+        self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # Dump and load in hex
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump2")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump2")
+        self.assertTrue(self.dumpDb("--db=%s --hex" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --hex --create_if_missing" % loadedDbPath, dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # Dump only a portion of the key range
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump3")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump3")
+        self.assertTrue(self.dumpDb(
+            "--db=%s --from=x1 --to=x3" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2")
+
+        # Dump upto max_keys rows
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump4")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump4")
+        self.assertTrue(self.dumpDb(
+            "--db=%s --max_keys=3" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3")
+
+        # Load into an existing db, create_if_missing is not specified
+        self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb("--db=%s" % loadedDbPath, dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # Dump and load with WAL disabled
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump5")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump5")
+        self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --disable_wal --create_if_missing" % loadedDbPath,
+            dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # Dump and load with lots of extra params specified
+        extraParams = " ".join(["--bloom_bits=14", "--compression_type=bzip2",
+                                "--block_size=1024", "--auto_compaction=true",
+                                "--write_buffer_size=4194304",
+                                "--file_size=2097152"])
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump6")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump6")
+        self.assertTrue(self.dumpDb(
+            "--db=%s %s" % (origDbPath, extraParams), dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s %s --create_if_missing" % (loadedDbPath, extraParams),
+            dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # Dump with count_only
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump7")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump7")
+        self.assertTrue(self.dumpDb(
+            "--db=%s --count_only" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
+        # DB should have atleast one value for scan to work
+        self.assertRunOKFull("put --db=%s k1 v1" % loadedDbPath, "OK")
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath, "k1 : v1")
+
+        # Dump command fails because of typo in params
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump8")
+        self.assertFalse(self.dumpDb(
+            "--db=%s --create_if_missing" % origDbPath, dumpFilePath))
+
    def testMiscAdminTask(self):
        """Smoke-tests admin subcommands (compact, reduce_levels, dump_wal)
        against a populated DB, checking that data survives each operation."""
        print "Running testMiscAdminTask..."
        # These tests need to be improved; for example with asserts about
        # whether compaction or level reduction actually took place.
        self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4",
                "OK")
        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
        origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME)

        # Full-range compaction must not lose any keys.
        self.assertTrue(0 == run_err_null(
            "./ldb compact --db=%s" % origDbPath))
        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")

        # Shrink and then re-grow the level count; data must stay intact.
        self.assertTrue(0 == run_err_null(
            "./ldb reduce_levels --db=%s --new_levels=2" % origDbPath))
        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")

        self.assertTrue(0 == run_err_null(
            "./ldb reduce_levels --db=%s --new_levels=3" % origDbPath))
        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")

        # Range-limited compaction, with plain and hex-encoded key bounds.
        self.assertTrue(0 == run_err_null(
            "./ldb compact --db=%s --from=x1 --to=x3" % origDbPath))
        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")

        self.assertTrue(0 == run_err_null(
            "./ldb compact --db=%s --hex --from=0x6131 --to=0x6134"
            % origDbPath))
        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")

        #TODO(dilip): Not sure what should be passed to WAL.Currently corrupted.
        self.assertTrue(0 == run_err_null(
            "./ldb dump_wal --db=%s --walfile=%s --header" % (
                origDbPath, os.path.join(origDbPath, "LOG"))))
        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
    def testCheckConsistency(self):
        """Verifies checkconsistency passes on a healthy DB and fails after
        an SST file has been corrupted or deleted."""
        print "Running testCheckConsistency..."

        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
        self.assertRunOK("put x1 y1 --create_if_missing", "OK")
        self.assertRunOK("put x2 y2", "OK")
        self.assertRunOK("get x1", "y1")
        self.assertRunOK("checkconsistency", "OK")

        # Locate one SST file belonging to the freshly created DB.
        sstFilePath = my_check_output("ls %s" % os.path.join(dbPath, "*.sst"),
                                      shell=True)

        # Modify the file
        my_check_output("echo 'evil' > %s" % sstFilePath, shell=True)
        self.assertRunFAIL("checkconsistency")

        # Delete the file
        my_check_output("rm -f %s" % sstFilePath, shell=True)
        self.assertRunFAIL("checkconsistency")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc
new file mode 100644 (file)
index 0000000..b588b52
--- /dev/null
@@ -0,0 +1,197 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "rocksdb/db.h"
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testutil.h"
+#include "util/testharness.h"
+#include "util/ldb_cmd.h"
+
+namespace rocksdb {
+
+class ReduceLevelTest {
+public:
+  ReduceLevelTest() {
+    dbname_ = test::TmpDir() + "/db_reduce_levels_test";
+    DestroyDB(dbname_, Options());
+    db_ = nullptr;
+  }
+
+  Status OpenDB(bool create_if_missing, int levels,
+      int mem_table_compact_level);
+
+  Status Put(const std::string& k, const std::string& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  std::string Get(const std::string& k) {
+    ReadOptions options;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  Status CompactMemTable() {
+    if (db_ == nullptr) {
+      return Status::InvalidArgument("DB not opened.");
+    }
+    DBImpl* db_impl = reinterpret_cast<DBImpl*>(db_);
+    return db_impl->TEST_FlushMemTable();
+  }
+
+  void CloseDB() {
+    if (db_ != nullptr) {
+      delete db_;
+      db_ = nullptr;
+    }
+  }
+
+  bool ReduceLevels(int target_level);
+
+  int FilesOnLevel(int level) {
+    std::string property;
+    ASSERT_TRUE(
+        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
+                         &property));
+    return atoi(property.c_str());
+  }
+
+private:
+  std::string dbname_;
+  DB* db_;
+};
+
+Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels,
+    int mem_table_compact_level) {
+  rocksdb::Options opt;
+  opt.num_levels = num_levels;
+  opt.create_if_missing = create_if_missing;
+  opt.max_mem_compaction_level = mem_table_compact_level;
+  rocksdb::Status st = rocksdb::DB::Open(opt, dbname_, &db_);
+  if (!st.ok()) {
+    fprintf(stderr, "Can't open the db:%s\n", st.ToString().c_str());
+  }
+  return st;
+}
+
+bool ReduceLevelTest::ReduceLevels(int target_level) {
+  std::vector<std::string> args = rocksdb::ReduceDBLevelsCommand::PrepareArgs(
+      dbname_, target_level, false);
+  LDBCommand* level_reducer = LDBCommand::InitFromCmdLineArgs(args);
+  level_reducer->Run();
+  bool is_succeed = level_reducer->GetExecuteState().IsSucceed();
+  delete level_reducer;
+  return is_succeed;
+}
+
// Reducing the level count must move the single file sitting on the
// last level down to the new last level each time, without data loss.
TEST(ReduceLevelTest, Last_Level) {
  // create files on all levels;
  ASSERT_OK(OpenDB(true, 4, 3));
  ASSERT_OK(Put("aaaa", "11111"));
  ASSERT_OK(CompactMemTable());
  // max_mem_compaction_level=3 sends the flushed file to level 3.
  ASSERT_EQ(FilesOnLevel(3), 1);
  CloseDB();

  ASSERT_TRUE(ReduceLevels(3));
  ASSERT_OK(OpenDB(true, 3, 1));
  ASSERT_EQ(FilesOnLevel(2), 1);
  CloseDB();

  ASSERT_TRUE(ReduceLevels(2));
  ASSERT_OK(OpenDB(true, 2, 1));
  ASSERT_EQ(FilesOnLevel(1), 1);
  CloseDB();
}
+
// A file flushed to level 0 must remain readable while the level count
// is repeatedly reduced (no file movement is required in this case).
TEST(ReduceLevelTest, Top_Level) {
  // create files on all levels;
  ASSERT_OK(OpenDB(true, 5, 0));
  ASSERT_OK(Put("aaaa", "11111"));
  ASSERT_OK(CompactMemTable());
  ASSERT_EQ(FilesOnLevel(0), 1);
  CloseDB();

  ASSERT_TRUE(ReduceLevels(4));
  ASSERT_OK(OpenDB(true, 4, 0));
  CloseDB();

  ASSERT_TRUE(ReduceLevels(3));
  ASSERT_OK(OpenDB(true, 3, 0));
  CloseDB();

  ASSERT_TRUE(ReduceLevels(2));
  ASSERT_OK(OpenDB(true, 2, 0));
  CloseDB();
}
+
// Places one file on each of levels 1-4 (by reopening with increasing
// max_mem_compaction_level), then reduces the level count step by step
// and checks that every key stays readable.
TEST(ReduceLevelTest, All_Levels) {
  // create files on all levels;
  ASSERT_OK(OpenDB(true, 5, 1));
  ASSERT_OK(Put("a", "a11111"));
  ASSERT_OK(CompactMemTable());
  ASSERT_EQ(FilesOnLevel(1), 1);
  CloseDB();

  ASSERT_OK(OpenDB(true, 5, 2));
  ASSERT_OK(Put("b", "b11111"));
  ASSERT_OK(CompactMemTable());
  ASSERT_EQ(FilesOnLevel(1), 1);
  ASSERT_EQ(FilesOnLevel(2), 1);
  CloseDB();

  ASSERT_OK(OpenDB(true, 5, 3));
  ASSERT_OK(Put("c", "c11111"));
  ASSERT_OK(CompactMemTable());
  ASSERT_EQ(FilesOnLevel(1), 1);
  ASSERT_EQ(FilesOnLevel(2), 1);
  ASSERT_EQ(FilesOnLevel(3), 1);
  CloseDB();

  ASSERT_OK(OpenDB(true, 5, 4));
  ASSERT_OK(Put("d", "d11111"));
  ASSERT_OK(CompactMemTable());
  ASSERT_EQ(FilesOnLevel(1), 1);
  ASSERT_EQ(FilesOnLevel(2), 1);
  ASSERT_EQ(FilesOnLevel(3), 1);
  ASSERT_EQ(FilesOnLevel(4), 1);
  CloseDB();

  // Each reduction step must preserve all four keys.
  ASSERT_TRUE(ReduceLevels(4));
  ASSERT_OK(OpenDB(true, 4, 0));
  ASSERT_EQ("a11111", Get("a"));
  ASSERT_EQ("b11111", Get("b"));
  ASSERT_EQ("c11111", Get("c"));
  ASSERT_EQ("d11111", Get("d"));
  CloseDB();

  ASSERT_TRUE(ReduceLevels(3));
  ASSERT_OK(OpenDB(true, 3, 0));
  ASSERT_EQ("a11111", Get("a"));
  ASSERT_EQ("b11111", Get("b"));
  ASSERT_EQ("c11111", Get("c"));
  ASSERT_EQ("d11111", Get("d"));
  CloseDB();

  ASSERT_TRUE(ReduceLevels(2));
  ASSERT_OK(OpenDB(true, 2, 0));
  ASSERT_EQ("a11111", Get("a"));
  ASSERT_EQ("b11111", Get("b"));
  ASSERT_EQ("c11111", Get("c"));
  ASSERT_EQ("d11111", Get("d"));
  CloseDB();
}
+
+}
+
// Runs every TEST(...) registered above via the rocksdb test harness.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc
new file mode 100644 (file)
index 0000000..9a144bb
--- /dev/null
@@ -0,0 +1,367 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#include <map>
+#include <string>
+#include <vector>
+#include <inttypes.h>
+
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based_table_factory.h"
+#include "table/plain_table_factory.h"
+#include "table/meta_blocks.h"
+#include "table/block.h"
+#include "table/block_builder.h"
+#include "table/format.h"
+#include "util/ldb_cmd.h"
+#include "util/random.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
// Reads a single .sst file: supports sequential scans (with optional
// key bounds and an entry cap) and table-property extraction. The
// constructor eagerly opens the file; any failure is remembered and
// surfaced through the Read* methods.
class SstFileReader {
 public:
  explicit SstFileReader(const std::string& file_name,
                         bool verify_checksum,
                         bool output_hex);

  // Scans the file in key order, printing entries when print_kv is set.
  Status ReadSequential(bool print_kv,
                        uint64_t read_num,
                        bool has_from,
                        const std::string& from_key,
                        bool has_to,
                        const std::string& to_key);

  Status ReadTableProperties(
      std::shared_ptr<const TableProperties>* table_properties);
  // Total number of entries visited by ReadSequential so far.
  uint64_t GetReadNumber() { return read_num_; }

 private:
  // Opens file_path and builds table_reader_; called by the ctor.
  Status NewTableReader(const std::string& file_path);
  // Configures options_ for the format implied by the magic number.
  Status SetTableOptionsByMagicNumber(uint64_t table_magic_number,
                                      RandomAccessFile* file,
                                      uint64_t file_size);

  std::string file_name_;
  uint64_t read_num_;
  bool verify_checksum_;
  bool output_hex_;
  EnvOptions soptions_;

  // Status of the constructor-time open; returned by Read* on failure.
  Status init_result_;
  unique_ptr<TableReader> table_reader_;
  unique_ptr<RandomAccessFile> file_;
  // options_ and internal_comparator_ will also be used in
  // ReadSequential internally (specifically, seek-related operations)
  Options options_;
  InternalKeyComparator internal_comparator_;
};
+
// Opens file_path and eagerly creates a table reader for it; any
// failure is stored in init_result_ rather than thrown.
SstFileReader::SstFileReader(const std::string& file_path,
                             bool verify_checksum,
                             bool output_hex)
    :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum),
    output_hex_(output_hex), internal_comparator_(BytewiseComparator()) {
  fprintf(stdout, "Process %s\n", file_path.c_str());

  init_result_ = NewTableReader(file_name_);
}
+
+extern uint64_t kBlockBasedTableMagicNumber;
+extern uint64_t kPlainTableMagicNumber;
+
+Status SstFileReader::NewTableReader(const std::string& file_path) {
+  uint64_t magic_number;
+
+  // read table magic number
+  Footer footer;
+
+  unique_ptr<RandomAccessFile> file;
+  uint64_t file_size;
+  Status s = options_.env->NewRandomAccessFile(file_path, &file_, soptions_);
+  if (s.ok()) {
+    s = options_.env->GetFileSize(file_path, &file_size);
+  }
+  if (s.ok()) {
+    s = ReadFooterFromFile(file_.get(), file_size, &footer);
+  }
+  if (s.ok()) {
+    magic_number = footer.table_magic_number();
+  }
+
+  if (s.ok()) {
+    if (magic_number == kPlainTableMagicNumber) {
+      soptions_.use_mmap_reads = true;
+    }
+    options_.comparator = &internal_comparator_;
+    s = SetTableOptionsByMagicNumber(magic_number, file_.get(), file_size);
+  }
+
+  if (s.ok()) {
+    s = options_.table_factory->NewTableReader(
+        options_, soptions_, internal_comparator_, std::move(file_), file_size,
+        &table_reader_);
+  }
+  return s;
+}
+
+Status SstFileReader::SetTableOptionsByMagicNumber(uint64_t table_magic_number,
+                                                   RandomAccessFile* file,
+                                                   uint64_t file_size) {
+  TableProperties* table_properties;
+  Status s = rocksdb::ReadTableProperties(file, file_size, table_magic_number,
+                                          options_.env, options_.info_log.get(),
+                                          &table_properties);
+  if (!s.ok()) {
+    return s;
+  }
+  std::unique_ptr<TableProperties> props_guard(table_properties);
+
+  if (table_magic_number == kBlockBasedTableMagicNumber) {
+    options_.table_factory = std::make_shared<BlockBasedTableFactory>();
+    fprintf(stdout, "Sst file format: block-based\n");
+  } else if (table_magic_number == kPlainTableMagicNumber) {
+    options_.allow_mmap_reads = true;
+    options_.table_factory = std::make_shared<PlainTableFactory>(
+        table_properties->fixed_key_len, 2, 0.8);
+    options_.prefix_extractor.reset(NewNoopTransform());
+    fprintf(stdout, "Sst file format: plain table\n");
+  } else {
+    char error_msg_buffer[80];
+    snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1,
+             "Unsupported table magic number --- %lx",
+             (long)table_magic_number);
+    return Status::InvalidArgument(error_msg_buffer);
+  }
+
+  return Status::OK();
+}
+
// Iterates over the table file, optionally bounded below by from_key
// (inclusive), above by to_key (exclusive), and capped at read_num
// entries; prints each key/value when print_kv is set. Returns the
// iterator's final status.
Status SstFileReader::ReadSequential(bool print_kv,
                                     uint64_t read_num,
                                     bool has_from,
                                     const std::string& from_key,
                                     bool has_to,
                                     const std::string& to_key) {
  if (!table_reader_) {
    // Construction failed; report the stored error.
    return init_result_;
  }

  Iterator* iter = table_reader_->NewIterator(ReadOptions(verify_checksum_,
                                                         false));
  uint64_t i = 0;
  if (has_from) {
    // Seek to the first internal key >= from_key.
    InternalKey ikey(from_key, kMaxSequenceNumber, kValueTypeForSeek);
    iter->Seek(ikey.Encode());
  } else {
    iter->SeekToFirst();
  }
  for (; iter->Valid(); iter->Next()) {
    Slice key = iter->key();
    Slice value = iter->value();
    ++i;
    if (read_num > 0 && i > read_num)
      break;

    ParsedInternalKey ikey;
    if (!ParseInternalKey(key, &ikey)) {
      // Report and skip malformed internal keys instead of aborting.
      std::cerr << "Internal Key ["
                << key.ToString(true /* in hex*/)
                << "] parse error!\n";
      continue;
    }

    // If end marker was specified, we stop before it
    if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) {
      break;
    }

    if (print_kv) {
      fprintf(stdout, "%s => %s\n",
          ikey.DebugString(output_hex_).c_str(),
          value.ToString(output_hex_).c_str());
    }
  }

  // i counts entries visited, which is what GetReadNumber() reports.
  read_num_ += i;

  Status ret = iter->status();
  delete iter;
  return ret;
}
+
+Status SstFileReader::ReadTableProperties(
+    std::shared_ptr<const TableProperties>* table_properties) {
+  if (!table_reader_) {
+    return init_result_;
+  }
+
+  *table_properties = table_reader_->GetTableProperties();
+  return init_result_;
+}
+
+}  // namespace rocksdb
+
// Prints the sst_dump usage line to stderr.
static void print_help() {
  const char* usage =
      "sst_dump [--command=check|scan] [--verify_checksum] "
      "--file=data_dir_OR_sst_file"
      " [--output_hex]"
      " [--input_key_hex]"
      " [--from=<user_key>]"
      " [--to=<user_key>]"
      " [--read_num=NUM]"
      " [--show_properties]\n";
  fputs(usage, stderr);
}
+
namespace {
// Decodes a "0x"-prefixed hex string into raw bytes.
// Throws a const char* message when the input lacks the "0x" prefix
// (including inputs shorter than two characters).
std::string HexToString(const std::string& str) {
  std::string parsed;
  // Guard the length first: indexing str[1] on a shorter string would
  // be out of range.
  if (str.length() < 2 || str[0] != '0' || str[1] != 'x') {
    fprintf(stderr, "Invalid hex input %s.  Must start with 0x\n",
            str.c_str());
    throw "Invalid hex input";
  }

  for (unsigned int i = 2; i < str.length();) {
    int c;
    sscanf(str.c_str() + i, "%2X", &c);
    parsed.push_back(c);
    i += 2;
  }
  return parsed;
}
}  // namespace
+
+int main(int argc, char** argv) {
+  const char* dir_or_file = nullptr;
+  uint64_t read_num = -1;
+  std::string command;
+
+  char junk;
+  uint64_t n;
+  bool verify_checksum = false;
+  bool output_hex = false;
+  bool input_key_hex = false;
+  bool has_from = false;
+  bool has_to = false;
+  bool show_properties = false;
+  std::string from_key;
+  std::string to_key;
+  for (int i = 1; i < argc; i++) {
+    if (strncmp(argv[i], "--file=", 7) == 0) {
+      dir_or_file = argv[i] + 7;
+    } else if (strcmp(argv[i], "--output_hex") == 0) {
+      output_hex = true;
+    } else if (strcmp(argv[i], "--input_key_hex") == 0) {
+      input_key_hex = true;
+    } else if (sscanf(argv[i],
+               "--read_num=%lu%c",
+               (unsigned long*)&n, &junk) == 1) {
+      read_num = n;
+    } else if (strcmp(argv[i], "--verify_checksum") == 0) {
+      verify_checksum = true;
+    } else if (strncmp(argv[i], "--command=", 10) == 0) {
+      command = argv[i] + 10;
+    } else if (strncmp(argv[i], "--from=", 7) == 0) {
+      from_key = argv[i] + 7;
+      has_from = true;
+    } else if (strncmp(argv[i], "--to=", 5) == 0) {
+      to_key = argv[i] + 5;
+      has_to = true;
+    } else if (strcmp(argv[i], "--show_properties") == 0) {
+      show_properties = true;
+    } else {
+      print_help();
+      exit(1);
+    }
+  }
+
+
+  if (input_key_hex) {
+    if (has_from) {
+      from_key = HexToString(from_key);
+    }
+    if (has_to) {
+      to_key = HexToString(to_key);
+    }
+  }
+
+  if (dir_or_file == nullptr) {
+    print_help();
+    exit(1);
+  }
+
+  std::vector<std::string> filenames;
+  rocksdb::Env* env = rocksdb::Env::Default();
+  rocksdb::Status st = env->GetChildren(dir_or_file, &filenames);
+  bool dir = true;
+  if (!st.ok()) {
+    filenames.clear();
+    filenames.push_back(dir_or_file);
+    dir = false;
+  }
+
+  fprintf(stdout, "from [%s] to [%s]\n",
+      rocksdb::Slice(from_key).ToString(true).c_str(),
+      rocksdb::Slice(to_key).ToString(true).c_str());
+
+  uint64_t total_read = 0;
+  for (size_t i = 0; i < filenames.size(); i++) {
+    std::string filename = filenames.at(i);
+    if (filename.length() <= 4 ||
+        filename.rfind(".sst") != filename.length() - 4) {
+      // ignore
+      continue;
+    }
+    if (dir) {
+      filename = std::string(dir_or_file) + "/" + filename;
+    }
+    rocksdb::SstFileReader reader(filename, verify_checksum,
+                                  output_hex);
+    rocksdb::Status st;
+    // scan all files in give file path.
+    if (command == "" || command == "scan" || command == "check") {
+      st = reader.ReadSequential(command != "check",
+                                 read_num > 0 ? (read_num - total_read) :
+                                                read_num,
+                                 has_from, from_key, has_to, to_key);
+      if (!st.ok()) {
+        fprintf(stderr, "%s: %s\n", filename.c_str(),
+            st.ToString().c_str());
+      }
+      total_read += reader.GetReadNumber();
+      if (read_num > 0 && total_read > read_num) {
+        break;
+      }
+    }
+    if (show_properties) {
+      std::shared_ptr<const rocksdb::TableProperties> table_properties;
+      st = reader.ReadTableProperties(&table_properties);
+      if (!st.ok()) {
+        fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str());
+      } else {
+        fprintf(stdout,
+                "Table Properties:\n"
+                "------------------------------\n"
+                "  %s",
+                table_properties->ToString("\n  ", ": ").c_str());
+        fprintf(stdout, "# deleted keys: %zd\n",
+                rocksdb::GetDeletedKeys(
+                    table_properties->user_collected_properties));
+      }
+    }
+  }
+}
diff --git a/util/arena.cc b/util/arena.cc
new file mode 100644 (file)
index 0000000..9b2cb82
--- /dev/null
@@ -0,0 +1,93 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/arena.h"
+#include <algorithm>
+
+namespace rocksdb {
+
+const size_t Arena::kMinBlockSize = 4096;
+const size_t Arena::kMaxBlockSize = 2 << 30;
+static const int kAlignUnit = sizeof(void*);
+
+size_t OptimizeBlockSize(size_t block_size) {
+  // Make sure block_size is in optimal range
+  block_size = std::max(Arena::kMinBlockSize, block_size);
+  block_size = std::min(Arena::kMaxBlockSize, block_size);
+
+  // make sure block_size is the multiple of kAlignUnit
+  if (block_size % kAlignUnit != 0) {
+    block_size = (1 + block_size / kAlignUnit) * kAlignUnit;
+  }
+
+  return block_size;
+}
+
// The block size is normalized by OptimizeBlockSize up front, so the
// invariants asserted here hold for any caller-supplied value.
Arena::Arena(size_t block_size) : kBlockSize(OptimizeBlockSize(block_size)) {
  assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize &&
         kBlockSize % kAlignUnit == 0);
}
+
+Arena::~Arena() {
+  for (const auto& block : blocks_) {
+    delete[] block;
+  }
+}
+
// Called when the current block cannot satisfy the request: either
// gives the request its own block (large requests) or starts a fresh
// block and carves the request out of it.
char* Arena::AllocateFallback(size_t bytes, bool aligned) {
  if (bytes > kBlockSize / 4) {
    ++irregular_block_num;
    // Object is more than a quarter of our block size.  Allocate it separately
    // to avoid wasting too much space in leftover bytes.
    return AllocateNewBlock(bytes);
  }

  // We waste the remaining space in the current block.
  auto block_head = AllocateNewBlock(kBlockSize);
  alloc_bytes_remaining_ = kBlockSize - bytes;

  // Aligned requests are served from the front of the block and
  // unaligned ones from the back, so both cursors are reset here.
  if (aligned) {
    aligned_alloc_ptr_ = block_head + bytes;
    unaligned_alloc_ptr_ = block_head + kBlockSize;
    return block_head;
  } else {
    aligned_alloc_ptr_ = block_head;
    unaligned_alloc_ptr_ = block_head + kBlockSize - bytes;
    return unaligned_alloc_ptr_;
  }
}
+
// Returns `bytes` of storage aligned to the pointer size, skipping
// forward over any padding needed to reach the next aligned address.
char* Arena::AllocateAligned(size_t bytes) {
  assert((kAlignUnit & (kAlignUnit - 1)) ==
         0);  // Pointer size should be a power of 2
  size_t current_mod =
      reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
  size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod);
  // Padding (slop) is consumed from the block along with the payload.
  size_t needed = bytes + slop;
  char* result;
  if (needed <= alloc_bytes_remaining_) {
    result = aligned_alloc_ptr_ + slop;
    aligned_alloc_ptr_ += needed;
    alloc_bytes_remaining_ -= needed;
  } else {
    // AllocateFallback always returned aligned memory
    result = AllocateFallback(bytes, true /* aligned */);
  }
  assert((reinterpret_cast<uintptr_t>(result) & (kAlignUnit - 1)) == 0);
  return result;
}
+
+char* Arena::AllocateNewBlock(size_t block_bytes) {
+  char* block = new char[block_bytes];
+  blocks_memory_ += block_bytes;
+  blocks_.push_back(block);
+  return block;
+}
+
+}  // namespace rocksdb
diff --git a/util/arena.h b/util/arena.h
new file mode 100644 (file)
index 0000000..6ce5a43
--- /dev/null
@@ -0,0 +1,100 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Arena is an implementation of Arena class. For a request of small size,
+// it allocates a block with pre-defined block size. For a request of big
+// size, it uses malloc to directly get the requested size.
+
+#pragma once
+#include <cstddef>
+#include <vector>
+#include <assert.h>
+#include <stdint.h>
+#include "util/arena.h"
+
+namespace rocksdb {
+
// A simple memory arena: small requests are carved out of pre-sized
// blocks (aligned ones from the front, unaligned ones from the back),
// large requests get a dedicated "irregular" block. All memory is
// released at once by the destructor.
class Arena {
 public:
  // No copying allowed
  Arena(const Arena&) = delete;
  void operator=(const Arena&) = delete;

  static const size_t kMinBlockSize;
  static const size_t kMaxBlockSize;

  explicit Arena(size_t block_size = kMinBlockSize);
  ~Arena();

  // Unaligned allocation; bytes must be > 0.
  char* Allocate(size_t bytes);

  // Allocation aligned to the pointer size.
  char* AllocateAligned(size_t bytes);

  // Returns an estimate of the total memory usage of data allocated
  // by the arena (exclude the space allocated but not yet used for future
  // allocations).
  size_t ApproximateMemoryUsage() const {
    return blocks_memory_ + blocks_.capacity() * sizeof(char*) -
           alloc_bytes_remaining_;
  }

  // Total bytes requested from the system so far.
  size_t MemoryAllocatedBytes() const { return blocks_memory_; }

  // Bytes still available in the currently active block.
  size_t AllocatedAndUnused() const { return alloc_bytes_remaining_; }

  // If an allocation is too big, we'll allocate an irregular block with the
  // same size of that allocation.
  // NOTE(review): this is the only virtual member and Arena has no
  // virtual destructor — confirm whether subclassing is intended.
  virtual size_t IrregularBlockNum() const { return irregular_block_num; }

  size_t BlockSize() const { return kBlockSize; }

 private:
  // Number of bytes allocated in one block
  const size_t kBlockSize;
  // Array of new[] allocated memory blocks
  typedef std::vector<char*> Blocks;
  Blocks blocks_;
  size_t irregular_block_num = 0;

  // Stats for current active block.
  // For each block, we allocate aligned memory chucks from one end and
  // allocate unaligned memory chucks from the other end. Otherwise the
  // memory waste for alignment will be higher if we allocate both types of
  // memory from one direction.
  char* unaligned_alloc_ptr_ = nullptr;
  char* aligned_alloc_ptr_ = nullptr;
  // How many bytes left in currently active block?
  size_t alloc_bytes_remaining_ = 0;

  char* AllocateFallback(size_t bytes, bool aligned);
  char* AllocateNewBlock(size_t block_bytes);

  // Bytes of memory in blocks allocated so far
  size_t blocks_memory_ = 0;
};
+
// Serves unaligned allocations from the back of the current block;
// falls back to a new block when the remaining space is insufficient.
inline char* Arena::Allocate(size_t bytes) {
  // The semantics of what to return are a bit messy if we allow
  // 0-byte allocations, so we disallow them here (we don't need
  // them for our internal use).
  assert(bytes > 0);
  if (bytes <= alloc_bytes_remaining_) {
    unaligned_alloc_ptr_ -= bytes;
    alloc_bytes_remaining_ -= bytes;
    return unaligned_alloc_ptr_;
  }
  return AllocateFallback(bytes, false /* unaligned */);
}
+
+// check and adjust the block_size so that the return value is
+//  1. in the range of [kMinBlockSize, kMaxBlockSize].
+//  2. the multiple of align unit.
+extern size_t OptimizeBlockSize(size_t block_size);
+
+}  // namespace rocksdb
diff --git a/util/arena_test.cc b/util/arena_test.cc
new file mode 100644 (file)
index 0000000..1b2b531
--- /dev/null
@@ -0,0 +1,133 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/arena.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
// Empty fixture tag consumed by the TEST macros below.
class ArenaTest {};

// Constructing and destroying an arena with no allocations must be safe.
TEST(ArenaTest, Empty) { Arena arena0; }
+
// MemoryAllocatedBytes() must track exactly what was requested from the
// system: one irregular block per large request, a single shared block
// for a batch of small requests.
TEST(ArenaTest, MemoryAllocatedBytes) {
  const int N = 17;
  size_t req_sz;  // requested size
  size_t bsz = 8192;  // block size
  size_t expected_memory_allocated;

  Arena arena(bsz);

  // requested size > quarter of a block:
  //   allocate requested size separately
  req_sz = 3001;
  for (int i = 0; i < N; i++) {
    arena.Allocate(req_sz);
  }
  expected_memory_allocated = req_sz * N;
  ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);

  // requested size < quarter of a block:
  //   allocate a block with the default size, then try to use unused part
  //   of the block. So one new block will be allocated for the first
  //   Allocate(99) call. All the remaining calls won't lead to new allocation.
  req_sz = 99;
  for (int i = 0; i < N; i++) {
    arena.Allocate(req_sz);
  }
  expected_memory_allocated += bsz;
  ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);

  // requested size > quarter of a block:
  //   allocate requested size separately
  req_sz = 99999999;
  for (int i = 0; i < N; i++) {
    arena.Allocate(req_sz);
  }
  expected_memory_allocated += req_sz * N;
  ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);
}
+
+// Make sure we didn't count the allocate but not used memory space in
+// Arena::ApproximateMemoryUsage()
// Make sure we didn't count the allocate but not used memory space in
// Arena::ApproximateMemoryUsage()
TEST(ArenaTest, ApproximateMemoryUsageTest) {
  const size_t kBlockSize = 4096;
  const size_t kEntrySize = kBlockSize / 8;
  const size_t kZero = 0;
  Arena arena(kBlockSize);
  ASSERT_EQ(kZero, arena.ApproximateMemoryUsage());

  // How many kEntrySize allocations fit in one block (8 here).
  auto num_blocks = kBlockSize / kEntrySize;

  // first allocation
  arena.AllocateAligned(kEntrySize);
  auto mem_usage = arena.MemoryAllocatedBytes();
  ASSERT_EQ(mem_usage, kBlockSize);
  auto usage = arena.ApproximateMemoryUsage();
  ASSERT_LT(usage, mem_usage);
  // Filling the rest of the block must not allocate more system memory,
  // and usage should grow by exactly kEntrySize per allocation.
  for (size_t i = 1; i < num_blocks; ++i) {
    arena.AllocateAligned(kEntrySize);
    ASSERT_EQ(mem_usage, arena.MemoryAllocatedBytes());
    ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize);
    usage = arena.ApproximateMemoryUsage();
  }
  ASSERT_GT(usage, mem_usage);
}
+
// Randomized stress test: interleaves aligned and unaligned allocations
// of varied sizes, fills each with a known pattern, then verifies that
// no allocation was overwritten by a later one.
TEST(ArenaTest, Simple) {
  std::vector<std::pair<size_t, char*>> allocated;
  Arena arena;
  const int N = 100000;
  size_t bytes = 0;
  Random rnd(301);
  for (int i = 0; i < N; i++) {
    size_t s;
    if (i % (N / 10) == 0) {
      s = i;
    } else {
      s = rnd.OneIn(4000)
              ? rnd.Uniform(6000)
              : (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20));
    }
    if (s == 0) {
      // Our arena disallows size 0 allocations.
      s = 1;
    }
    char* r;
    if (rnd.OneIn(10)) {
      r = arena.AllocateAligned(s);
    } else {
      r = arena.Allocate(s);
    }

    for (unsigned int b = 0; b < s; b++) {
      // Fill the "i"th allocation with a known bit pattern
      r[b] = i % 256;
    }
    bytes += s;
    allocated.push_back(std::make_pair(s, r));
    ASSERT_GE(arena.ApproximateMemoryUsage(), bytes);
    if (i > N / 10) {
      // Overhead should stay within 10% once enough data is in.
      ASSERT_LE(arena.ApproximateMemoryUsage(), bytes * 1.10);
    }
  }
  for (unsigned int i = 0; i < allocated.size(); i++) {
    size_t num_bytes = allocated[i].first;
    const char* p = allocated[i].second;
    for (unsigned int b = 0; b < num_bytes; b++) {
      // Check the "i"th allocation for the known bit pattern
      ASSERT_EQ(int(p[b]) & 0xff, (int)(i % 256));
    }
  }
}
+
+}  // namespace rocksdb
+
// Entry point: runs all arena tests through the rocksdb test harness.
int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }
diff --git a/util/auto_roll_logger.cc b/util/auto_roll_logger.cc
new file mode 100644 (file)
index 0000000..19c2b8c
--- /dev/null
@@ -0,0 +1,116 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "util/auto_roll_logger.h"
+#include "util/mutexlock.h"
+
+using namespace std;
+
+namespace rocksdb {
+
+// -- AutoRollLogger
+// Opens a fresh logger at log_fname_ and resets the cached-time bookkeeping.
+// Any failure is latched into status_ so later calls (and GetStatus()) can
+// observe it; returns the resulting status.
+Status AutoRollLogger::ResetLogger() {
+  status_ = env_->NewLogger(log_fname_, &logger_);
+
+  if (!status_.ok()) {
+    return status_;
+  }
+
+  // Size-based rolling requires the underlying logger to report its size.
+  if (logger_->GetLogFileSize() ==
+      (size_t)Logger::DO_NOT_SUPPORT_GET_LOG_FILE_SIZE) {
+    status_ = Status::NotSupported(
+        "The underlying logger doesn't support GetLogFileSize()");
+  }
+  if (status_.ok()) {
+    // Restart the time-based rolling clock from "now" (seconds resolution).
+    cached_now = static_cast<uint64_t>(env_->NowMicros() * 1e-6);
+    ctime_ = cached_now;
+    cached_now_access_count = 0;
+  }
+
+  return status_;
+}
+
+// Archives the current LOG file by renaming it to a timestamped "old" info
+// log name. The rename result is intentionally ignored: if the file does not
+// exist yet (first run) there is nothing to roll.
+void AutoRollLogger::RollLogFile() {
+  std::string old_fname = OldInfoLogFileName(
+      dbname_, env_->NowMicros(), db_absolute_path_, db_log_dir_);
+  env_->RenameFile(log_fname_, old_fname);
+}
+
+// Writes one log entry, first rolling the file if the time or size limit has
+// been reached. Callers must have verified GetStatus().ok() (e.g. via
+// CreateLoggerFromOptions) — the assert below enforces that contract.
+void AutoRollLogger::Logv(const char* format, va_list ap) {
+  assert(GetStatus().ok());
+
+  std::shared_ptr<Logger> logger;
+  {
+    // Only the roll-check / reset is serialized; the actual write is not.
+    MutexLock l(&mutex_);
+    if ((kLogFileTimeToRoll > 0 && LogExpired()) ||
+        (kMaxLogFileSize > 0 && logger_->GetLogFileSize() >= kMaxLogFileSize)) {
+      RollLogFile();
+      Status s = ResetLogger();
+      if (!s.ok()) {
+        // can't really log the error if creating a new LOG file failed
+        return;
+      }
+    }
+
+    // pin down the current logger_ instance before releasing the mutex.
+    logger = logger_;
+  }
+
+  // Another thread could have put a new Logger instance into logger_ by now.
+  // However, since logger is still hanging on to the previous instance
+  // (reference count is not zero), we don't have to worry about it being
+  // deleted while we are accessing it.
+  // Note that logv itself is not mutex protected to allow maximum concurrency,
+  // as thread safety should have been handled by the underlying logger.
+  logger->Logv(format, ap);
+}
+
+// Returns true once the current file has been open for at least
+// kLogFileTimeToRoll seconds. To avoid calling env_->NowMicros() on every
+// record, the time is only refreshed every call_NowMicros_every_N_records_
+// calls — so expiry detection can lag by up to that many records.
+// Caller must hold mutex_ (called from Logv under the lock).
+bool AutoRollLogger::LogExpired() {
+  if (cached_now_access_count >= call_NowMicros_every_N_records_) {
+    cached_now = static_cast<uint64_t>(env_->NowMicros() * 1e-6);
+    cached_now_access_count = 0;
+  }
+
+  ++cached_now_access_count;
+  return cached_now >= ctime_ + kLogFileTimeToRoll;
+}
+
+// Builds the appropriate info-log Logger for the given options: an
+// AutoRollLogger when rolling (by size and/or time) is requested, otherwise a
+// plain env logger after archiving any pre-existing LOG file. On success
+// *logger holds the new instance; on failure *logger is left untouched (auto
+// roll case) or may still be set by env->NewLogger (plain case).
+Status CreateLoggerFromOptions(
+    const std::string& dbname,
+    const std::string& db_log_dir,
+    Env* env,
+    const DBOptions& options,
+    std::shared_ptr<Logger>* logger) {
+  std::string db_absolute_path;
+  env->GetAbsolutePath(dbname, &db_absolute_path);
+  std::string fname = InfoLogFileName(dbname, db_absolute_path, db_log_dir);
+
+  // Currently we only support roll by time-to-roll and log size
+  if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) {
+    AutoRollLogger* result = new AutoRollLogger(
+        env, dbname, db_log_dir,
+        options.max_log_file_size,
+        options.log_file_time_to_roll, options.info_log_level);
+    // The constructor may have failed (e.g. could not create the file);
+    // surface that instead of handing back a broken logger.
+    Status s = result->GetStatus();
+    if (!s.ok()) {
+      delete result;
+    } else {
+      logger->reset(result);
+    }
+    return s;
+  } else {
+    // Open a log file in the same directory as the db
+    env->CreateDir(dbname);  // In case it does not exist
+    env->RenameFile(fname, OldInfoLogFileName(dbname, env->NowMicros(),
+                                              db_absolute_path, db_log_dir));
+    auto s = env->NewLogger(fname, logger);
+    if (logger->get() != nullptr) {
+      (*logger)->SetInfoLogLevel(options.info_log_level);
+    }
+    return s;
+  }
+}
+
+}  // namespace rocksdb
diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h
new file mode 100644 (file)
index 0000000..c592d79
--- /dev/null
@@ -0,0 +1,91 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Logger implementation that can be shared by all environments
+// where enough posix functionality is available.
+
+#pragma once
+#include "db/filename.h"
+#include "port/port.h"
+#include "util/posix_logger.h"
+
+namespace rocksdb {
+
+// Rolls the log file by size and/or time
+class AutoRollLogger : public Logger {
+ public:
+  // Creates the logger and immediately (1) archives any pre-existing LOG
+  // file and (2) opens a fresh one. Construction failures are not thrown;
+  // callers must check GetStatus() before logging.
+  //   log_max_size          roll when file reaches this size; 0 = disabled
+  //   log_file_time_to_roll roll after this many seconds; 0 = disabled
+  AutoRollLogger(Env* env, const std::string& dbname,
+                 const std::string& db_log_dir, size_t log_max_size,
+                 size_t log_file_time_to_roll,
+                 const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL)
+      : Logger(log_level),
+        dbname_(dbname),
+        db_log_dir_(db_log_dir),
+        env_(env),
+        status_(Status::OK()),
+        kMaxLogFileSize(log_max_size),
+        kLogFileTimeToRoll(log_file_time_to_roll),
+        cached_now(static_cast<uint64_t>(env_->NowMicros() * 1e-6)),
+        ctime_(cached_now),
+        cached_now_access_count(0),
+        call_NowMicros_every_N_records_(100),
+        mutex_() {
+    env->GetAbsolutePath(dbname, &db_absolute_path_);
+    log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_);
+    RollLogFile();
+    ResetLogger();
+  }
+
+  // Writes an entry, rolling the file first if a limit was reached.
+  // NOTE(review): implements Logger::Logv — consider marking it
+  // virtual/override explicitly for clarity.
+  void Logv(const char* format, va_list ap);
+
+  // check if the logger has encountered any problem.
+  Status GetStatus() {
+    return status_;
+  }
+
+  // Size of the currently active log file (delegates to the inner logger).
+  size_t GetLogFileSize() const {
+    return logger_->GetLogFileSize();
+  }
+
+  virtual ~AutoRollLogger() {
+  }
+
+  // Test hook: how many records may pass between NowMicros() refreshes in
+  // LogExpired(); 0 forces a time check on every record.
+  void SetCallNowMicrosEveryNRecords(uint64_t call_NowMicros_every_N_records) {
+    call_NowMicros_every_N_records_ = call_NowMicros_every_N_records;
+  }
+
+ private:
+
+  bool LogExpired();
+  Status ResetLogger();
+  void RollLogFile();
+
+  std::string log_fname_; // Current active info log's file name.
+  std::string dbname_;
+  std::string db_log_dir_;
+  std::string db_absolute_path_;
+  Env* env_;
+  // The logger actually performing the writes; swapped out on each roll.
+  std::shared_ptr<Logger> logger_;
+  // current status of the logger
+  Status status_;
+  const size_t kMaxLogFileSize;
+  const size_t kLogFileTimeToRoll;
+  // to avoid frequent env->NowMicros() calls, we cached the current time
+  // (seconds). ctime_ is when the current file was opened.
+  uint64_t cached_now;
+  uint64_t ctime_;
+  uint64_t cached_now_access_count;
+  uint64_t call_NowMicros_every_N_records_;
+  // Serializes roll-check/reset in Logv; the write itself is unlocked.
+  port::Mutex mutex_;
+};
+
+// Facade to create a logger automatically from DBOptions.
+Status CreateLoggerFromOptions(
+    const std::string& dbname,
+    const std::string& db_log_dir,
+    Env* env,
+    const DBOptions& options,
+    std::shared_ptr<Logger>* logger);
+
+}  // namespace rocksdb
diff --git a/util/auto_roll_logger_test.cc b/util/auto_roll_logger_test.cc
new file mode 100755 (executable)
index 0000000..c49894f
--- /dev/null
@@ -0,0 +1,292 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <string>
+#include <cmath>
+#include <iostream>
+#include <fstream>
+#include <iterator>
+#include <algorithm>
+#include "util/testharness.h"
+#include "util/auto_roll_logger.h"
+#include "rocksdb/db.h"
+#include <sys/stat.h>
+#include <errno.h>
+
+using namespace std;
+
+namespace rocksdb {
+
+// Test fixture: owns a scratch directory (kTestDir) that InitTestDb() wipes
+// and recreates, plus shared helpers to drive size- and time-based rolling.
+class AutoRollLoggerTest {
+ public:
+  static void InitTestDb() {
+    string deleteCmd = "rm -rf " + kTestDir;
+    ASSERT_TRUE(system(deleteCmd.c_str()) == 0);
+    Env::Default()->CreateDir(kTestDir);
+  }
+
+  // Logs until the file exceeds log_max_size and verifies it was rolled.
+  void RollLogFileBySizeTest(AutoRollLogger* logger,
+                             size_t log_max_size,
+                             const string& log_message);
+  // Logs, sleeps past `time` seconds, and verifies a new file was created.
+  // Returns the new file's creation time.
+  uint64_t RollLogFileByTimeTest(AutoRollLogger* logger,
+                                 size_t time,
+                                 const string& log_message);
+
+  static const string kSampleMessage;
+  static const string kTestDir;
+  static const string kLogFile;
+  static Env* env;
+};
+
+const string AutoRollLoggerTest::kSampleMessage(
+    "this is the message to be written to the log file!!");
+const string AutoRollLoggerTest::kTestDir(test::TmpDir() + "/db_log_test");
+const string AutoRollLoggerTest::kLogFile(test::TmpDir() + "/db_log_test/LOG");
+Env* AutoRollLoggerTest::env = Env::Default();
+
+// In this test we only want to Log some simple log message with
+// no format. LogMessage() provides such a simple interface and
+// avoids the [format-security] warning which occurs when you
+// call Log(logger, log_message) directly.
+namespace {
+// Logs `message` verbatim at the logger's current level.
+void LogMessage(Logger* logger, const char* message) {
+  Log(logger, "%s", message);
+}
+
+// Logs `message` verbatim at an explicit level.
+void LogMessage(const InfoLogLevel log_level, Logger* logger,
+                const char* message) {
+  Log(log_level, logger, "%s", message);
+}
+}  // namespace
+
+namespace {
+// Stores the creation time (st_ctime) of `fname` into `*file_ctime`, or 0 if
+// the file cannot be stat()ed.
+void GetFileCreateTime(const std::string& fname, uint64_t* file_ctime) {
+  struct stat s;
+  if (stat(fname.c_str(), &s) != 0) {
+    *file_ctime = (uint64_t)0;
+    // Bug fix: without this return, the assignment below would overwrite
+    // *file_ctime with the uninitialized contents of `s`.
+    return;
+  }
+  *file_ctime = static_cast<uint64_t>(s.st_ctime);
+}
+}  // namespace
+
+// Drives size-based rolling: measures the on-disk cost of one message, fills
+// the file to just under log_max_size (verifying no roll happens), then logs
+// past the limit and verifies the file was replaced by a fresh one.
+void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger,
+                                               size_t log_max_size,
+                                               const string& log_message) {
+  logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL);
+  // measure the size of each message, which is supposed
+  // to be equal or greater than log_message.size()
+  LogMessage(logger, log_message.c_str());
+  size_t message_size = logger->GetLogFileSize();
+  size_t current_log_size = message_size;
+
+  // Test the cases when the log file will not be rolled.
+  while (current_log_size + message_size < log_max_size) {
+    LogMessage(logger, log_message.c_str());
+    current_log_size += message_size;
+    ASSERT_EQ(current_log_size, logger->GetLogFileSize());
+  }
+
+  // Now the log file will be rolled
+  LogMessage(logger, log_message.c_str());
+  // Since rotation is checked before actual logging, we need to
+  // trigger the rotation by logging another message.
+  LogMessage(logger, log_message.c_str());
+
+  // After the roll, the new file should contain exactly one message.
+  ASSERT_TRUE(message_size == logger->GetLogFileSize());
+}
+
+// Drives time-based rolling: logs repeatedly without exceeding `time`
+// seconds (verifying the file is NOT rolled), then sleeps past the limit and
+// verifies a new, smaller file with a later creation time exists.
+// Returns the new file's creation time (seconds granularity via st_ctime).
+uint64_t AutoRollLoggerTest::RollLogFileByTimeTest(
+    AutoRollLogger* logger, size_t time, const string& log_message) {
+  uint64_t expected_create_time;
+  uint64_t actual_create_time;
+  uint64_t total_log_size;
+  ASSERT_OK(env->GetFileSize(kLogFile, &total_log_size));
+  GetFileCreateTime(kLogFile, &expected_create_time);
+  // Force a NowMicros() check on every record so expiry is detected promptly.
+  logger->SetCallNowMicrosEveryNRecords(0);
+
+  // -- Write to the log for several times, which is supposed
+  // to be finished before time.
+  for (int i = 0; i < 10; ++i) {
+     LogMessage(logger, log_message.c_str());
+     ASSERT_OK(logger->GetStatus());
+     // Make sure we always write to the same log file (by
+     // checking the create time);
+     GetFileCreateTime(kLogFile, &actual_create_time);
+
+     // Also make sure the log size is increasing.
+     ASSERT_EQ(expected_create_time, actual_create_time);
+     ASSERT_GT(logger->GetLogFileSize(), total_log_size);
+     total_log_size = logger->GetLogFileSize();
+  }
+
+  // -- Make the log file expire
+  sleep(time);
+  LogMessage(logger, log_message.c_str());
+
+  // At this time, the new log file should be created.
+  GetFileCreateTime(kLogFile, &actual_create_time);
+  ASSERT_GT(actual_create_time, expected_create_time);
+  ASSERT_LT(logger->GetLogFileSize(), total_log_size);
+  expected_create_time = actual_create_time;
+
+  return expected_create_time;
+}
+
+// Size-only rolling: time-to-roll is 0 (disabled).
+TEST(AutoRollLoggerTest, RollLogFileBySize) {
+    InitTestDb();
+    size_t log_max_size = 1024 * 5;
+
+    AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0);
+
+    RollLogFileBySizeTest(&logger, log_max_size,
+                          kSampleMessage + ":RollLogFileBySize");
+}
+
+// Time-based rolling; also verifies the constructor creates the LOG file.
+TEST(AutoRollLoggerTest, RollLogFileByTime) {
+    size_t time = 1;
+    size_t log_size = 1024 * 5;
+
+    InitTestDb();
+    // -- Test the existence of file during the server restart.
+    ASSERT_TRUE(!env->FileExists(kLogFile));
+    AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 1);
+    ASSERT_TRUE(env->FileExists(kLogFile));
+
+    RollLogFileByTimeTest(&logger, time, kSampleMessage + ":RollLogFileByTime");
+}
+
+TEST(AutoRollLoggerTest,
+     OpenLogFilesMultipleTimesWithOptionLog_max_size) {
+  // If only 'log_max_size' options is specified, then every time
+  // when rocksdb is restarted, a new empty log file will be created.
+  InitTestDb();
+  // WORKAROUND:
+  // avoid compiler's complaint of "comparison between signed
+  // and unsigned integer expressions" because literal 0 is
+  // treated as "signed".
+  size_t kZero = 0;
+  size_t log_size = 1024;
+
+  AutoRollLogger* logger = new AutoRollLogger(
+    Env::Default(), kTestDir, "", log_size, 0);
+
+  LogMessage(logger, kSampleMessage.c_str());
+  ASSERT_GT(logger->GetLogFileSize(), kZero);
+  delete logger;
+
+  // reopens the log file and an empty log file will be created.
+  logger = new AutoRollLogger(
+    Env::Default(), kTestDir, "", log_size, 0);
+  ASSERT_EQ(logger->GetLogFileSize(), kZero);
+  delete logger;
+}
+
+// Both limits enabled at once: either trigger alone must still roll the file.
+TEST(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) {
+  size_t time = 1, log_max_size = 1024 * 5;
+
+  InitTestDb();
+
+  AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, time);
+
+  // Test the ability to roll by size
+  RollLogFileBySizeTest(
+      &logger, log_max_size,
+      kSampleMessage + ":CompositeRollByTimeAndSizeLogger");
+
+  // Test the ability to roll by Time
+  RollLogFileByTimeTest( &logger, time,
+      kSampleMessage + ":CompositeRollByTimeAndSizeLogger");
+}
+
+TEST(AutoRollLoggerTest, CreateLoggerFromOptions) {
+  DBOptions options;
+  shared_ptr<Logger> logger;
+
+  // Normal logger
+  ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger));
+  ASSERT_TRUE(dynamic_cast<PosixLogger*>(logger.get()));
+
+  // Only roll by size
+  InitTestDb();
+  options.max_log_file_size = 1024;
+  ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger));
+  AutoRollLogger* auto_roll_logger =
+    dynamic_cast<AutoRollLogger*>(logger.get());
+  ASSERT_TRUE(auto_roll_logger);
+  RollLogFileBySizeTest(
+      auto_roll_logger, options.max_log_file_size,
+      kSampleMessage + ":CreateLoggerFromOptions - size");
+
+  // Only roll by Time
+  InitTestDb();
+  options.max_log_file_size = 0;
+  options.log_file_time_to_roll = 1;
+  ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger));
+  auto_roll_logger =
+    dynamic_cast<AutoRollLogger*>(logger.get());
+  RollLogFileByTimeTest(
+      auto_roll_logger, options.log_file_time_to_roll,
+      kSampleMessage + ":CreateLoggerFromOptions - time");
+
+  // roll by both Time and size
+  InitTestDb();
+  options.max_log_file_size = 1024 * 5;
+  options.log_file_time_to_roll = 1;
+  ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger));
+  auto_roll_logger =
+    dynamic_cast<AutoRollLogger*>(logger.get());
+  RollLogFileBySizeTest(
+      auto_roll_logger, options.max_log_file_size,
+      kSampleMessage + ":CreateLoggerFromOptions - both");
+  RollLogFileByTimeTest(
+      auto_roll_logger, options.log_file_time_to_roll,
+      kSampleMessage + ":CreateLoggerFromOptions - both");
+}
+
+// Verifies level filtering: for every configured level, only messages at or
+// above that level reach the file. The expected line count is accumulated in
+// log_lines and compared against the actual newline count of the LOG file.
+TEST(AutoRollLoggerTest, InfoLogLevel) {
+  InitTestDb();
+
+  size_t log_size = 8192;
+  size_t log_lines = 0;
+  // an extra-scope to force the AutoRollLogger to flush the log file when it
+  // becomes out of scope.
+  {
+    AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0);
+    for (int log_level = InfoLogLevel::FATAL_LEVEL;
+         log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) {
+      logger.SetInfoLogLevel((InfoLogLevel)log_level);
+      for (int log_type = InfoLogLevel::DEBUG_LEVEL;
+           log_type <= InfoLogLevel::FATAL_LEVEL; log_type++) {
+        // log messages with log level smaller than log_level will not be
+        // logged.
+        LogMessage((InfoLogLevel)log_type, &logger, kSampleMessage.c_str());
+      }
+      log_lines += InfoLogLevel::FATAL_LEVEL - log_level + 1;
+    }
+    // Repeat using the per-level convenience wrappers (Debug/Info/...).
+    for (int log_level = InfoLogLevel::FATAL_LEVEL;
+         log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) {
+      logger.SetInfoLogLevel((InfoLogLevel)log_level);
+
+      // again, messages with level smaller than log_level will not be logged.
+      Debug(&logger, "%s", kSampleMessage.c_str());
+      Info(&logger, "%s", kSampleMessage.c_str());
+      Warn(&logger, "%s", kSampleMessage.c_str());
+      Error(&logger, "%s", kSampleMessage.c_str());
+      Fatal(&logger, "%s", kSampleMessage.c_str());
+      log_lines += InfoLogLevel::FATAL_LEVEL - log_level + 1;
+    }
+  }
+  std::ifstream inFile(AutoRollLoggerTest::kLogFile.c_str());
+  size_t lines = std::count(std::istreambuf_iterator<char>(inFile),
+                         std::istreambuf_iterator<char>(), '\n');
+  ASSERT_EQ(log_lines, lines);
+  inFile.close();
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/autovector.h b/util/autovector.h
new file mode 100644 (file)
index 0000000..212073e
--- /dev/null
@@ -0,0 +1,307 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+namespace rocksdb {
+
+#ifdef ROCKSDB_LITE
+template <class T, size_t kSize = 8>
+class autovector : public std::vector<T> {};
+#else
+// A vector that leverages pre-allocated stack-based array to achieve better
+// performance for array with small amount of items.
+//
+// The interface resembles that of vector, but with less features since we aim
+// to solve the problem that we have in hand, rather than implementing a
+// full-fledged generic container.
+//
+// Currently we don't support:
+//  * reserve()/shrink_to_fit()/resize()
+//     If used correctly, in most cases, people should not touch the
+//     underlying vector at all.
+//  * random insert()/erase(), please only use push_back()/pop_back().
+//  * No move/swap operations. Each autovector instance has a
+//     stack-allocated array and if we want support move/swap operations, we
+//     need to copy the arrays other than just swapping the pointers. In this
+//     case we'll just explicitly forbid these operations since they may
+//     lead users to make false assumption by thinking they are inexpensive
+//     operations.
+//
+// Naming style of public methods almost follows that of the STL's.
+template <class T, size_t kSize = 8>
+class autovector {
+ public:
+  // General STL-style container member types.
+  typedef T value_type;
+  typedef typename std::vector<T>::difference_type difference_type;
+  typedef typename std::vector<T>::size_type size_type;
+  typedef value_type& reference;
+  typedef const value_type& const_reference;
+  typedef value_type* pointer;
+  typedef const value_type* const_pointer;
+
+  // This class is the base for regular/const iterator
+  template <class TAutoVector, class TValueType>
+  class iterator_impl {
+   public:
+    // -- iterator traits
+    typedef iterator_impl<TAutoVector, TValueType> self_type;
+    typedef TValueType value_type;
+    typedef TValueType& reference;
+    typedef TValueType* pointer;
+    typedef typename TAutoVector::difference_type difference_type;
+    typedef std::random_access_iterator_tag iterator_category;
+
+    iterator_impl(TAutoVector* vect, size_t index)
+        : vect_(vect), index_(index) {};
+    iterator_impl(const iterator_impl&) = default;
+    ~iterator_impl() {}
+    iterator_impl& operator=(const iterator_impl&) = default;
+
+    // -- Advancement
+    // iterator++
+    self_type& operator++() {
+      ++index_;
+      return *this;
+    }
+
+    // ++iterator
+    self_type operator++(int) {
+      auto old = *this;
+      ++index_;
+      return old;
+    }
+
+    // iterator--
+    self_type& operator--() {
+      --index_;
+      return *this;
+    }
+
+    // --iterator
+    self_type operator--(int) {
+      auto old = *this;
+      --index_;
+      return old;
+    }
+
+    self_type operator-(difference_type len) {
+      return self_type(vect_, index_ - len);
+    }
+
+    difference_type operator-(const self_type& other) {
+      assert(vect_ == other.vect_);
+      return index_ - other.index_;
+    }
+
+    self_type operator+(difference_type len) {
+      return self_type(vect_, index_ + len);
+    }
+
+    self_type& operator+=(difference_type len) {
+      index_ += len;
+      return *this;
+    }
+
+    self_type& operator-=(difference_type len) {
+      index_ -= len;
+      return *this;
+    }
+
+    // -- Reference
+    reference operator*() {
+      assert(vect_->size() >= index_);
+      return (*vect_)[index_];
+    }
+    pointer operator->() {
+      assert(vect_->size() >= index_);
+      return &(*vect_)[index_];
+    }
+
+    // -- Logical Operators
+    bool operator==(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ == other.index_;
+    }
+
+    bool operator!=(const self_type& other) const { return !(*this == other); }
+
+    bool operator>(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ > other.index_;
+    }
+
+    bool operator<(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ < other.index_;
+    }
+
+    bool operator>=(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ >= other.index_;
+    }
+
+    bool operator<=(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ <= other.index_;
+    }
+
+   private:
+    TAutoVector* vect_ = nullptr;
+    size_t index_ = 0;
+  };
+
+  typedef iterator_impl<autovector, value_type> iterator;
+  typedef iterator_impl<const autovector, const value_type> const_iterator;
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+
+  autovector() = default;
+  ~autovector() = default;
+
+  // -- Immutable operations
+  // Indicate if all data resides in in-stack data structure.
+  bool only_in_stack() const {
+    // If no element was inserted at all, the vector's capacity will be `0`.
+    return vect_.capacity() == 0;
+  }
+
+  size_type size() const { return num_stack_items_ + vect_.size(); }
+
+  bool empty() const { return size() == 0; }
+
+  // will not check boundry
+  const_reference operator[](size_type n) const {
+    return n < kSize ? values_[n] : vect_[n - kSize];
+  }
+
+  reference operator[](size_type n) {
+    return n < kSize ? values_[n] : vect_[n - kSize];
+  }
+
+  // will check boundry
+  const_reference at(size_type n) const {
+    if (n >= size()) {
+      throw std::out_of_range("autovector: index out of range");
+    }
+    return (*this)[n];
+  }
+
+  reference at(size_type n) {
+    if (n >= size()) {
+      throw std::out_of_range("autovector: index out of range");
+    }
+    return (*this)[n];
+  }
+
+  reference front() {
+    assert(!empty());
+    return *begin();
+  }
+
+  const_reference front() const {
+    assert(!empty());
+    return *begin();
+  }
+
+  reference back() {
+    assert(!empty());
+    return *(end() - 1);
+  }
+
+  const_reference back() const {
+    assert(!empty());
+    return *(end() - 1);
+  }
+
+  // -- Mutable Operations
+  void push_back(T&& item) {
+    if (num_stack_items_ < kSize) {
+      values_[num_stack_items_++] = std::move(item);
+    } else {
+      vect_.push_back(item);
+    }
+  }
+
+  void push_back(const T& item) { push_back(value_type(item)); }
+
+  template <class... Args>
+  void emplace_back(Args&&... args) {
+    push_back(value_type(args...));
+  }
+
+  void pop_back() {
+    assert(!empty());
+    if (!vect_.empty()) {
+      vect_.pop_back();
+    } else {
+      --num_stack_items_;
+    }
+  }
+
+  void clear() {
+    num_stack_items_ = 0;
+    vect_.clear();
+  }
+
+  // -- Copy and Assignment
+  autovector& assign(const autovector& other);
+
+  autovector(const autovector& other) { assign(other); }
+
+  autovector& operator=(const autovector& other) { return assign(other); }
+
+  // move operation are disallowed since it is very hard to make sure both
+  // autovectors are allocated from the same function stack.
+  autovector& operator=(autovector&& other) = delete;
+  autovector(autovector&& other) = delete;
+
+  // -- Iterator Operations
+  iterator begin() { return iterator(this, 0); }
+
+  const_iterator begin() const { return const_iterator(this, 0); }
+
+  iterator end() { return iterator(this, this->size()); }
+
+  const_iterator end() const { return const_iterator(this, this->size()); }
+
+  reverse_iterator rbegin() { return reverse_iterator(end()); }
+
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(end());
+  }
+
+  reverse_iterator rend() { return reverse_iterator(begin()); }
+
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(begin());
+  }
+
+ private:
+  size_type num_stack_items_ = 0;  // current number of items
+  value_type values_[kSize];       // the first `kSize` items
+  // used only if there are more than `kSize` items.
+  std::vector<T> vect_;
+};
+
+// Deep-copies `other` into *this: the heap-backed tail first, then the
+// in-stack prefix. Elements beyond num_stack_items_ in values_ are left
+// untouched (they are logically dead).
+template <class T, size_t kSize>
+autovector<T, kSize>& autovector<T, kSize>::assign(const autovector& other) {
+  // copy the internal vector
+  vect_.assign(other.vect_.begin(), other.vect_.end());
+
+  // copy array
+  num_stack_items_ = other.num_stack_items_;
+  std::copy(other.values_, other.values_ + num_stack_items_, values_);
+
+  return *this;
+}
+#endif  // ROCKSDB_LITE
+}  // namespace rocksdb
diff --git a/util/autovector_test.cc b/util/autovector_test.cc
new file mode 100644 (file)
index 0000000..88744cf
--- /dev/null
@@ -0,0 +1,294 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <atomic>
+#include <iostream>
+
+#include "rocksdb/env.h"
+#include "util/autovector.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+using namespace std;
+
+class AutoVectorTest { };
+
+const unsigned long kSize = 8;
+// Pushes well past the stack capacity, checking the stack/heap transition at
+// exactly kSize elements, then pops everything back down.
+TEST(AutoVectorTest, PushBackAndPopBack) {
+  autovector<size_t, kSize> vec;
+  ASSERT_TRUE(vec.empty());
+  ASSERT_EQ(0ul, vec.size());
+
+  for (size_t i = 0; i < 1000 * kSize; ++i) {
+    vec.push_back(i);
+    ASSERT_TRUE(!vec.empty());
+    if (i < kSize) {
+      ASSERT_TRUE(vec.only_in_stack());
+    } else {
+      ASSERT_TRUE(!vec.only_in_stack());
+    }
+    ASSERT_EQ(i + 1, vec.size());
+    ASSERT_EQ(i, vec[i]);
+    ASSERT_EQ(i, vec.at(i));
+  }
+
+  size_t size = vec.size();
+  while (size != 0) {
+    vec.pop_back();
+    // will always be in heap
+    ASSERT_TRUE(!vec.only_in_stack());
+    ASSERT_EQ(--size, vec.size());
+  }
+
+  ASSERT_TRUE(vec.empty());
+}
+
+// Same stack/heap transition checks as PushBackAndPopBack, but constructing
+// elements in place via emplace_back with a multi-argument value type.
+TEST(AutoVectorTest, EmplaceBack) {
+  typedef std::pair<size_t, std::string> ValueType;
+  autovector<ValueType, kSize> vec;
+
+  for (size_t i = 0; i < 1000 * kSize; ++i) {
+    vec.emplace_back(i, std::to_string(i + 123));
+    ASSERT_TRUE(!vec.empty());
+    if (i < kSize) {
+      ASSERT_TRUE(vec.only_in_stack());
+    } else {
+      ASSERT_TRUE(!vec.only_in_stack());
+    }
+
+    ASSERT_EQ(i + 1, vec.size());
+    ASSERT_EQ(i, vec[i].first);
+    ASSERT_EQ(std::to_string(i + 123), vec[i].second);
+  }
+
+  vec.clear();
+  // clear() resets the count but keeps the heap vector's capacity, so the
+  // container is no longer "stack only".
+  ASSERT_TRUE(vec.empty());
+  ASSERT_TRUE(!vec.only_in_stack());
+}
+
+namespace {
+// Asserts that two autovectors are element-wise equal and share the same
+// size/emptiness/storage-location state.
+void AssertEqual(
+    const autovector<size_t, kSize>& a, const autovector<size_t, kSize>& b) {
+  ASSERT_EQ(a.size(), b.size());
+  ASSERT_EQ(a.empty(), b.empty());
+  ASSERT_EQ(a.only_in_stack(), b.only_in_stack());
+  for (size_t i = 0; i < a.size(); ++i) {
+    ASSERT_EQ(a[i], b[i]);
+  }
+}
+}  // namespace
+
+// Exercises the copy constructor and copy assignment for both a
+// stack-resident (kSize / 2) and a heap-spilled (kSize * 1000) source.
+TEST(AutoVectorTest, CopyAndAssignment) {
+  // Test both heap-allocated and stack-allocated cases.
+  for (auto size : { kSize / 2, kSize * 1000 }) {
+    autovector<size_t, kSize> vec;
+    for (size_t i = 0; i < size; ++i) {
+      vec.push_back(i);
+    }
+
+    {
+      autovector<size_t, kSize> other;
+      other = vec;
+      AssertEqual(other, vec);
+    }
+
+    {
+      autovector<size_t, kSize> other(vec);
+      AssertEqual(other, vec);
+    }
+  }
+}
+
+// Exercises the full random-access iterator surface: deref, comparison,
+// forward/reverse traversal (mutable and const), post-increment, operator->,
+// and the arithmetic operators (+, -, +=, difference).
+TEST(AutoVectorTest, Iterators) {
+  autovector<std::string, kSize> vec;
+  for (size_t i = 0; i < kSize * 1000; ++i) {
+    vec.push_back(std::to_string(i));
+  }
+
+  // basic operator test
+  ASSERT_EQ(vec.front(), *vec.begin());
+  ASSERT_EQ(vec.back(), *(vec.end() - 1));
+  ASSERT_TRUE(vec.begin() < vec.end());
+
+  // non-const iterator
+  size_t index = 0;
+  for (const auto& item : vec) {
+    ASSERT_EQ(vec[index++], item);
+  }
+
+  index = vec.size() - 1;
+  for (auto pos = vec.rbegin(); pos != vec.rend(); ++pos) {
+    ASSERT_EQ(vec[index--], *pos);
+  }
+
+  // const iterator
+  const auto& cvec = vec;
+  index = 0;
+  for (const auto& item : cvec) {
+    ASSERT_EQ(cvec[index++], item);
+  }
+
+  index = vec.size() - 1;
+  for (auto pos = cvec.rbegin(); pos != cvec.rend(); ++pos) {
+    ASSERT_EQ(cvec[index--], *pos);
+  }
+
+  // forward and backward
+  auto pos = vec.begin();
+  while (pos != vec.end()) {
+    auto old_val = *pos;
+    auto old = pos++;
+    // HACK: make sure -> works
+    ASSERT_TRUE(!old->empty());
+    ASSERT_EQ(old_val, *old);
+    // Distinct elements ("0", "1", ...) guarantee adjacent values differ.
+    ASSERT_TRUE(pos == vec.end() || old_val != *pos);
+  }
+
+  pos = vec.begin();
+  for (size_t i = 0; i < vec.size(); i += 2) {
+    // Cannot use ASSERT_EQ since that macro depends on iostream serialization
+    ASSERT_TRUE(pos + 2 - 2 == pos);
+    pos += 2;
+    ASSERT_TRUE(pos >= vec.begin());
+    ASSERT_TRUE(pos <= vec.end());
+
+    size_t diff = static_cast<size_t>(pos - vec.begin());
+    ASSERT_EQ(i + 2, diff);
+  }
+}
+
+namespace {
+// Generates `size` distinct string keys of the form "item-<index>".
+vector<string> GetTestKeys(size_t size) {
+  vector<string> keys;
+  keys.resize(size);
+
+  int index = 0;
+  for (auto& key : keys) {
+    key = "item-" + to_string(index++);
+  }
+  return keys;
+}
+}  // namespace
+
+// Benchmark helper: constructs `ops` fresh TVector instances, inserting
+// `item_size` pre-generated elements into each, and prints the elapsed time.
+// `items` must hold at least ops * item_size elements (consumed sequentially
+// so every insertion uses a distinct value).
+template<class TVector>
+void BenchmarkVectorCreationAndInsertion(
+    string name, size_t ops, size_t item_size,
+    const std::vector<typename TVector::value_type>& items) {
+  auto env = Env::Default();
+
+  int index = 0;
+  auto start_time = env->NowNanos();
+  auto ops_remaining = ops;
+  while(ops_remaining--) {
+    TVector v;
+    for (size_t i = 0; i < item_size; ++i) {
+      v.push_back(items[index++]);
+    }
+  }
+  auto elapsed = env->NowNanos() - start_time;
+  cout << "created " << ops << " " << name << " instances:\n\t"
+       << "each was inserted with " << item_size << " elements\n\t"
+       << "total time elapsed: " << elapsed << " (ns)" << endl;
+}
+
+// Benchmark helper: iterates a TVector of `elem_size` strings `ops` times,
+// summing element lengths, and prints the elapsed time. The accumulated sum
+// is returned so the compiler cannot elide the loop.
+template <class TVector>
+size_t BenchmarkSequenceAccess(string name, size_t ops, size_t elem_size) {
+  TVector v;
+  for (const auto& item : GetTestKeys(elem_size)) {
+    v.push_back(item);
+  }
+  auto env = Env::Default();
+
+  auto ops_remaining = ops;
+  auto start_time = env->NowNanos();
+  size_t total = 0;
+  while (ops_remaining--) {
+    auto end = v.end();
+    for (auto pos = v.begin(); pos != end; ++pos) {
+      total += pos->size();
+    }
+  }
+  auto elapsed = env->NowNanos() - start_time;
+  cout << "performed " << ops << " sequence access against " << name << "\n\t"
+       << "size: " << elem_size << "\n\t"
+       << "total time elapsed: " << elapsed << " (ns)" << endl;
+  // HACK avoid compiler's optimization to ignore total
+  return total;
+}
+
+// This test case only reports the performance between std::vector<string>
+// and autovector<string>. We chose string for comparison because in most
+// of our use cases we used std::vector<string>.
+// Informational benchmark (reports timings via stdout; makes no assertions):
+// compares std::vector against autovector for construction+insertion (string
+// and uint64_t payloads) and for repeated sequential access.
+TEST(AutoVectorTest, PerfBench) {
+  // We run same operations for kOps times in order to get a more fair result.
+  size_t kOps = 100000;
+
+  // Creation and insertion test
+  // Test the case when there is:
+  //  * no element inserted: internal array of std::vector may not really get
+  //    initialize.
+  //  * one element inserted: internal array of std::vector must have
+  //    initialized.
+  //  * kSize elements inserted. This shows the most time we'll spend if we
+  //    keep everything in stack.
+  //  * 2 * kSize elements inserted. The internal vector of
+  //    autovector must have been initialized.
+  cout << "=====================================================" << endl;
+  cout << "Creation and Insertion Test (value type: std::string)" << endl;
+  cout << "=====================================================" << endl;
+
+  // pre-generated unique keys
+  auto string_keys = GetTestKeys(kOps * 2 * kSize);
+  for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) {
+    BenchmarkVectorCreationAndInsertion<vector<string>>(
+      "vector<string>", kOps, insertions, string_keys
+    );
+    BenchmarkVectorCreationAndInsertion<autovector<string, kSize>>(
+      "autovector<string>", kOps, insertions, string_keys
+    );
+    cout << "-----------------------------------" << endl;
+  }
+
+  cout << "=====================================================" << endl;
+  cout << "Creation and Insertion Test (value type: uint64_t)" << endl;
+  cout << "=====================================================" << endl;
+
+  // pre-generated unique keys
+  vector<uint64_t> int_keys(kOps * 2 * kSize);
+  for (size_t i = 0; i < kOps * 2 * kSize; ++i) {
+    int_keys[i] = i;
+  }
+  for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) {
+    BenchmarkVectorCreationAndInsertion<vector<uint64_t>>(
+      "vector<uint64_t>", kOps, insertions, int_keys
+    );
+    BenchmarkVectorCreationAndInsertion<autovector<uint64_t, kSize>>(
+      "autovector<uint64_t>", kOps, insertions, int_keys
+    );
+    cout << "-----------------------------------" << endl;
+  }
+
+  // Sequence Access Test
+  cout << "=====================================================" << endl;
+  cout << "Sequence Access Test" << endl;
+  cout << "=====================================================" << endl;
+  for (auto elem_size : { kSize / 2, kSize, 2 * kSize }) {
+    BenchmarkSequenceAccess<vector<string>>(
+        "vector", kOps, elem_size
+    );
+    BenchmarkSequenceAccess<autovector<string, kSize>>(
+        "autovector", kOps, elem_size
+    );
+    cout << "-----------------------------------" << endl;
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/benchharness.cc b/util/benchharness.cc
new file mode 100644 (file)
index 0000000..8cd3700
--- /dev/null
@@ -0,0 +1,398 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// This code is derived from Benchmark.cpp implemented in Folly, the opensourced
+// Facebook C++ library available at https://github.com/facebook/folly
+// The code has removed any dependence on other folly and boost libraries
+
+#include "util/benchharness.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+using std::function;
+using std::get;
+using std::make_pair;
+using std::max;
+using std::min;
+using std::pair;
+using std::sort;
+using std::string;
+using std::tuple;
+using std::vector;
+
+DEFINE_bool(benchmark, false, "Run benchmarks.");
+
+DEFINE_int64(bm_min_usec, 100,
+             "Minimum # of microseconds we'll accept for each benchmark.");
+
+DEFINE_int64(bm_min_iters, 1,
+             "Minimum # of iterations we'll try for each benchmark.");
+
+DEFINE_int32(bm_max_secs, 1,
+             "Maximum # of seconds we'll spend on each benchmark.");
+
+
+namespace rocksdb {
+namespace benchmark {
+
+BenchmarkSuspender::NanosecondsSpent BenchmarkSuspender::nsSpent;
+
+typedef function<uint64_t(unsigned int)> BenchmarkFun;
+static vector<tuple<const char*, const char*, BenchmarkFun>> benchmarks;
+
+// Add the global baseline: an empty benchmark whose measured cost (timing
+// overhead) is subtracted from every other benchmark's result (see
+// RunBenchmarks). `asm volatile("")` keeps the body from being optimized away.
+BENCHMARK(globalBenchmarkBaseline) {
+  asm volatile("");
+}
+
+// Registers a benchmark in the file-local `benchmarks` vector; invoked by
+// the BENCHMARK* macros via AddBenchmark()/AddBenchmark_n().
+void detail::AddBenchmarkImpl(const char* file, const char* name,
+                              BenchmarkFun fun) {
+  benchmarks.emplace_back(file, name, std::move(fun));
+}
+
+/**
+ * Given a point, gives density at that point as a number 0.0 < x <=
+ * 1.0. The result is 1.0 if all samples are equal to where, and
+ * decreases near 0 if all points are far away from it. The density is
+ * computed with the help of a radial basis function.
+ */
+static double Density(const double * begin, const double *const end,
+                      const double where, const double bandwidth) {
+  assert(begin < end);
+  assert(bandwidth > 0.0);
+  double sum = 0.0;
+  for (auto i = begin; i < end; i++) {
+    // Gaussian radial basis function: each sample contributes exp(-d^2),
+    // where d is its distance from `where` in units of `bandwidth`.
+    auto d = (*i - where) / bandwidth;
+    sum += exp(- d * d);
+  }
+  // Normalize by sample count so the result lies in (0.0, 1.0].
+  return sum / (end - begin);
+}
+
+/**
+ * Computes mean and variance for a bunch of data points. Note that
+ * mean is currently not being used.
+ */
+static pair<double, double>
+MeanVariance(const double * begin, const double *const end) {
+  assert(begin < end);
+  double sum = 0.0, sum2 = 0.0;
+  for (auto i = begin; i < end; i++) {
+    sum += *i;
+    sum2 += *i * *i;
+  }
+  auto const n = end - begin;
+  // Despite the name, the second member is the *standard deviation*:
+  // sqrt of the population variance E[x^2] - E[x]^2.
+  return make_pair(sum / n, sqrt((sum2 - sum * sum / n) / n));
+}
+
+/**
+ * Computes the mode of a sample set through brute force. Assumes
+ * input is sorted.
+ */
+static double Mode(const double * begin, const double *const end) {
+  assert(begin < end);
+  // Lower bound and upper bound for result and their respective
+  // densities.
+  auto
+    result = 0.0,
+    bestDensity = 0.0;
+
+  // Get the variance so we pass it down to Density()
+  // (MeanVariance returns the standard deviation in .second).
+  auto const sigma = MeanVariance(begin, end).second;
+  if (!sigma) {
+    // No variance means constant signal
+    return *begin;
+  }
+
+  for (auto i = begin; i < end; i++) {
+    assert(i == begin || *i >= i[-1]);  // enforce the "input sorted" contract
+    auto candidate = Density(begin, end, *i, sigma * sqrt(2.0));
+    if (candidate > bestDensity) {
+      // Found a new best
+      bestDensity = candidate;
+      result = *i;
+    } else {
+      // Density is decreasing... we could break here if we definitely
+      // knew this is unimodal.
+    }
+  }
+
+  return result;
+}
+
+/**
+ * Given a bunch of benchmark samples, estimate the actual run time.
+ */
+static double EstimateTime(double * begin, double * end) {
+  assert(begin < end);
+
+  // Current state of the art: get the minimum. After some
+  // experimentation, it seems taking the minimum is the best.
+
+  return *std::min_element(begin, end);
+
+  // NOTE: everything below this point is unreachable; it is kept only as a
+  // reference implementation of the mode-based estimate described next.
+
+  // What follows after estimates the time as the mode of the
+  // distribution.
+
+  // Select the awesomest (i.e. most frequent) result. We do this by
+  // sorting and then computing the longest run length.
+  sort(begin, end);
+
+  // Eliminate outliers. A time much larger than the minimum time is
+  // considered an outlier.
+  while (end[-1] > 2.0 * *begin) {
+    --end;
+    if (begin == end) {
+//      LOG(INFO) << *begin;
+    }
+    assert(begin < end);
+  }
+
+  double result = 0;
+
+  /* Code used just for comparison purposes */ {
+    unsigned bestFrequency = 0;
+    unsigned candidateFrequency = 1;
+    double candidateValue = *begin;
+    for (auto current = begin + 1; ; ++current) {
+      if (current == end || *current != candidateValue) {
+        // Done with the current run, see if it was best
+        if (candidateFrequency > bestFrequency) {
+          bestFrequency = candidateFrequency;
+          result = candidateValue;
+        }
+        if (current == end) {
+          break;
+        }
+        // Start a new run
+        candidateValue = *current;
+        candidateFrequency = 1;
+      } else {
+        // Cool, inside a run, increase the frequency
+        ++candidateFrequency;
+      }
+    }
+  }
+
+  result = Mode(begin, end);
+
+  return result;
+}
+
+// Runs `fun` over up to `epochs` epochs (bounded by --bm_max_secs) and
+// returns the estimated nanoseconds per iteration, after subtracting
+// `globalBaseline` (the cost of an empty benchmark).
+static double RunBenchmarkGetNSPerIteration(const BenchmarkFun& fun,
+                                            const double globalBaseline) {
+  // The key here is accuracy; too low numbers means the accuracy was
+  // coarse. We up the ante until we get to at least minNanoseconds
+  // timings.
+  static const auto minNanoseconds = FLAGS_bm_min_usec * 1000UL;
+
+  // We do measurements in several epochs and take the minimum, to
+  // account for jitter.
+  static const unsigned int epochs = 1000;
+  // We establish a total time budget as we don't want a measurement
+  // to take too long. This will curtail the number of actual epochs.
+  const uint64_t timeBudgetInNs = FLAGS_bm_max_secs * 1000000000;
+  auto env = Env::Default();
+  uint64_t global = env->NowNanos();
+
+  double epochResults[epochs] = { 0 };
+  size_t actualEpochs = 0;
+
+  for (; actualEpochs < epochs; ++actualEpochs) {
+    // Double the iteration count until a single run is long enough to trust.
+    // NOTE(review): FLAGS_bm_min_iters (int64) is narrowed into unsigned —
+    // assumes the flag is set to a small positive value; confirm.
+    for (unsigned int n = FLAGS_bm_min_iters; n < (1UL << 30); n *= 2) {
+      auto const nsecs = fun(n);
+      if (nsecs < minNanoseconds) {
+        continue;
+      }
+      // We got an accurate enough timing, done. But only save if
+      // smaller than the current result.
+      epochResults[actualEpochs] = max(0.0,
+          static_cast<double>(nsecs) / n - globalBaseline);
+      // Done with the current epoch, we got a meaningful timing.
+      break;
+    }
+    uint64_t now = env->NowNanos();
+    if ((now - global) >= timeBudgetInNs) {
+      // No more time budget available.
+      ++actualEpochs;
+      break;
+    }
+  }
+
+  // If the benchmark was basically drowned in baseline noise, it's
+  // possible it became negative.
+  return max(0.0, EstimateTime(epochResults, epochResults + actualEpochs));
+}
+
+struct ScaleInfo {
+  double boundary;
+  const char* suffix;
+};
+
+static const ScaleInfo kTimeSuffixes[] {
+  { 365.25 * 24 * 3600, "years" },
+  { 24 * 3600, "days" },
+  { 3600, "hr" },
+  { 60, "min" },
+  { 1, "s" },
+  { 1E-3, "ms" },
+  { 1E-6, "us" },
+  { 1E-9, "ns" },
+  { 1E-12, "ps" },
+  { 1E-15, "fs" },
+  { 0, nullptr },
+};
+
+static const ScaleInfo kMetricSuffixes[] {
+  { 1E24, "Y" },  // yotta
+  { 1E21, "Z" },  // zetta
+  { 1E18, "X" },  // "exa" written with suffix 'X' so as to not create
+                  //   confusion with scientific notation
+  { 1E15, "P" },  // peta
+  { 1E12, "T" },  // tera
+  { 1E9, "G" },   // giga
+  { 1E6, "M" },   // mega
+  { 1E3, "K" },   // kilo
+  { 1, "" },
+  { 1E-3, "m" },  // milli
+  { 1E-6, "u" },  // micro
+  { 1E-9, "n" },  // nano
+  { 1E-12, "p" },  // pico
+  { 1E-15, "f" },  // femto
+  { 1E-18, "a" },  // atto
+  { 1E-21, "z" },  // zepto
+  { 1E-24, "y" },  // yocto
+  { 0, nullptr },
+};
+
+// Formats `n` using the largest suffix from `scales` whose boundary it
+// reaches. The tables end with a {0, nullptr} sentinel, so the scan below
+// always stops at the last real entry.
+static string HumanReadable(double n, unsigned int decimals,
+                            const ScaleInfo* scales) {
+  if (std::isinf(n) || std::isnan(n)) {
+    return std::to_string(n);
+  }
+
+  const double absValue = fabs(n);
+  const ScaleInfo* scale = scales;
+  while (absValue < scale[0].boundary && scale[1].suffix != nullptr) {
+    ++scale;
+  }
+
+  const double scaledValue = n / scale->boundary;
+  char a[80];
+  snprintf(a, sizeof(a), "%.*f%s", decimals, scaledValue, scale->suffix);
+  return a;
+}
+
+// Formats a duration in seconds with a time suffix ("ms", "us", ...).
+static string ReadableTime(double n, unsigned int decimals) {
+  return HumanReadable(n, decimals, kTimeSuffixes);
+}
+
+// Formats a count with a metric suffix ("K", "M", ...).
+static string MetricReadable(double n, unsigned int decimals) {
+  return HumanReadable(n, decimals, kMetricSuffixes);
+}
+
+static void PrintBenchmarkResultsAsTable(
+  const vector<tuple<const char*, const char*, double> >& data) {
+  // Width available
+  static const uint columns = 76;
+
+  // Compute the longest benchmark name
+  size_t longestName = 0;
+  for (size_t i = 1; i < benchmarks.size(); i++) {
+    longestName = max(longestName, strlen(get<1>(benchmarks[i])));
+  }
+
+  // Print a horizontal rule
+  auto separator = [&](char pad) {
+    puts(string(columns, pad).c_str());
+  };
+
+  // Print header for a file
+  auto header = [&](const char* file) {
+    separator('=');
+    printf("%-*srelative  time/iter  iters/s\n",
+           columns - 28, file);
+    separator('=');
+  };
+
+  double baselineNsPerIter = std::numeric_limits<double>::max();
+  const char* lastFile = "";
+
+  for (auto& datum : data) {
+    auto file = get<0>(datum);
+    if (strcmp(file, lastFile)) {
+      // New file starting
+      header(file);
+      lastFile = file;
+    }
+
+    string s = get<1>(datum);
+    if (s == "-") {
+      separator('-');
+      continue;
+    }
+    bool useBaseline /* = void */;
+    if (s[0] == '%') {
+      s.erase(0, 1);
+      useBaseline = true;
+    } else {
+      baselineNsPerIter = get<2>(datum);
+      useBaseline = false;
+    }
+    s.resize(columns - 29, ' ');
+    auto nsPerIter = get<2>(datum);
+    auto secPerIter = nsPerIter / 1E9;
+    auto itersPerSec = 1 / secPerIter;
+    if (!useBaseline) {
+      // Print without baseline
+      printf("%*s           %9s  %7s\n",
+             static_cast<int>(s.size()), s.c_str(),
+             ReadableTime(secPerIter, 2).c_str(),
+             MetricReadable(itersPerSec, 2).c_str());
+    } else {
+      // Print with baseline
+      auto rel = baselineNsPerIter / nsPerIter * 100.0;
+      printf("%*s %7.2f%%  %9s  %7s\n",
+             static_cast<int>(s.size()), s.c_str(),
+             rel,
+             ReadableTime(secPerIter, 2).c_str(),
+             MetricReadable(itersPerSec, 2).c_str());
+    }
+  }
+  separator('=');
+}
+
+// Entry point: measures benchmarks.front() as the global baseline, then
+// every other registered benchmark with the baseline subtracted, and prints
+// the results. "-" entries (BENCHMARK_DRAW_LINE) are not timed.
+// NOTE(review): assumes globalBenchmarkBaseline is registered first; static
+// initialization order across translation units is not guaranteed — confirm.
+void RunBenchmarks() {
+  ASSERT_TRUE(!benchmarks.empty());
+
+  vector<tuple<const char*, const char*, double>> results;
+  // Every benchmark except the baseline produces one row.
+  results.reserve(benchmarks.size() - 1);
+
+  // PLEASE KEEP QUIET. MEASUREMENTS IN PROGRESS.
+
+  auto const globalBaseline = RunBenchmarkGetNSPerIteration(
+    get<2>(benchmarks.front()), 0);
+  for (size_t i = 1; i < benchmarks.size(); i++) {
+    double elapsed = 0.0;
+    if (strcmp(get<1>(benchmarks[i]), "-") != 0) {  // skip separators
+      elapsed = RunBenchmarkGetNSPerIteration(get<2>(benchmarks[i]),
+                                              globalBaseline);
+    }
+    results.emplace_back(get<0>(benchmarks[i]),
+                         get<1>(benchmarks[i]), elapsed);
+  }
+
+  // PLEASE MAKE NOISE. MEASUREMENTS DONE.
+
+  PrintBenchmarkResultsAsTable(results);
+}
+
+}  // namespace benchmark
+}  // namespace rocksdb
diff --git a/util/benchharness.h b/util/benchharness.h
new file mode 100644 (file)
index 0000000..6d010cb
--- /dev/null
@@ -0,0 +1,350 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// This code is derived from Benchmark.h implemented in Folly, the opensourced
+// Facebook C++ library available at https://github.com/facebook/folly
+// The code has removed any dependence on other folly and boost libraries
+
+#pragma once
+
+#include <gflags/gflags.h>
+
+#include <cassert>
+#include <functional>
+#include <limits>
+
+#include "util/testharness.h"
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+namespace benchmark {
+
+/**
+ * Runs all benchmarks defined. Usually put in main().
+ */
+void RunBenchmarks();
+
+namespace detail {
+
+/**
+ * Adds a benchmark wrapped in a std::function. Only used
+ * internally. Pass by value is intentional.
+ */
+void AddBenchmarkImpl(const char* file,
+                      const char* name,
+                      std::function<uint64_t(unsigned int)>);
+
+}  // namespace detail
+
+
+/**
+ * Supporting type for BENCHMARK_SUSPEND defined below.
+ */
+struct BenchmarkSuspender {
+  BenchmarkSuspender() { start_ = Env::Default()->NowNanos(); }
+
+  BenchmarkSuspender(const BenchmarkSuspender&) = delete;
+  BenchmarkSuspender(BenchmarkSuspender && rhs) {
+    start_ = rhs.start_;
+    rhs.start_ = 0;
+  }
+
+  BenchmarkSuspender& operator=(const BenchmarkSuspender &) = delete;
+  BenchmarkSuspender& operator=(BenchmarkSuspender && rhs) {
+    if (start_ > 0) {
+      tally();
+    }
+    start_ = rhs.start_;
+    rhs.start_ = 0;
+    return *this;
+  }
+
+  ~BenchmarkSuspender() {
+    if (start_ > 0) {
+      tally();
+    }
+  }
+
+  void Dismiss() {
+    assert(start_ > 0);
+    tally();
+    start_ = 0;
+  }
+
+  void Rehire() { start_ = Env::Default()->NowNanos(); }
+
+  /**
+   * This helps the macro definition. To get around the dangers of
+   * operator bool, returns a pointer to member (which allows no
+   * arithmetic).
+   */
+  /* implicit */
+  operator int BenchmarkSuspender::*() const { return nullptr; }
+
+  /**
+   * Accumulates nanoseconds spent outside benchmark.
+   */
+  typedef uint64_t NanosecondsSpent;
+  static NanosecondsSpent nsSpent;
+
+ private:
+  void tally() {
+    uint64_t end = Env::Default()->NowNanos();
+    nsSpent += start_ - end;
+    start_ = end;
+  }
+
+  uint64_t start_;
+};
+
+/**
+ * Adds a benchmark. Usually not called directly but instead through
+ * the macro BENCHMARK defined below. The lambda function involved
+ * must take exactly one parameter of type unsigned, and the benchmark
+ * uses it with counter semantics (iteration occurs inside the
+ * function).
+ */
+template <typename Lambda>
+void
+AddBenchmark_n(const char* file, const char* name, Lambda&& lambda) {
+  // Wrap the user lambda in a timing harness. Capture is by value
+  // (intentional, see the comment on AddBenchmarkImpl) so the closure
+  // remains valid after this call returns.
+  auto execute = [=](unsigned int times) -> uint64_t {
+    BenchmarkSuspender::nsSpent = 0;
+    uint64_t start, end;
+    auto env = Env::Default();
+
+    // CORE MEASUREMENT STARTS
+    start = env->NowNanos();
+    lambda(times);
+    end = env->NowNanos();
+    // CORE MEASUREMENT ENDS
+    // Time accumulated inside BENCHMARK_SUSPEND blocks is excluded.
+    return (end - start) - BenchmarkSuspender::nsSpent;
+  };
+
+  detail::AddBenchmarkImpl(file, name,
+                           std::function<uint64_t(unsigned int)>(execute));
+}
+
+/**
+ * Adds a benchmark. Usually not called directly but instead through
+ * the macro BENCHMARK defined below. The lambda function involved
+ * must take zero parameters, and the benchmark calls it repeatedly
+ * (iteration occurs outside the function).
+ */
+template <typename Lambda>
+void
+AddBenchmark(const char* file, const char* name, Lambda&& lambda) {
+  // Adapt a zero-argument lambda to the counted form by looping `times`
+  // iterations around it.
+  AddBenchmark_n(file, name, [=](unsigned int times) {
+      while (times-- > 0) {
+        lambda();
+      }
+    });
+}
+
+}  // namespace benchmark
+}  // namespace rocksdb
+
+/**
+ * FB_ONE_OR_NONE(hello, world) expands to hello and
+ * FB_ONE_OR_NONE(hello) expands to nothing. This macro is used to
+ * insert or eliminate text based on the presence of another argument.
+ */
+#define FB_ONE_OR_NONE(a, ...) FB_THIRD(a, ## __VA_ARGS__, a)
+#define FB_THIRD(a, b, ...) __VA_ARGS__
+
+#define FB_CONCATENATE_IMPL(s1, s2) s1##s2
+#define FB_CONCATENATE(s1, s2) FB_CONCATENATE_IMPL(s1, s2)
+
+#define FB_ANONYMOUS_VARIABLE(str) FB_CONCATENATE(str, __LINE__)
+
+#define FB_STRINGIZE(x) #x
+
+/**
+ * Introduces a benchmark function. Used internally, see BENCHMARK and
+ * friends below.
+ */
+#define BENCHMARK_IMPL_N(funName, stringName, paramType, paramName)     \
+  static void funName(paramType);                                       \
+  static bool FB_ANONYMOUS_VARIABLE(rocksdbBenchmarkUnused) = (         \
+    ::rocksdb::benchmark::AddBenchmark_n(__FILE__, stringName,          \
+      [](paramType paramName) { funName(paramName); }),                 \
+    true);                                                              \
+  static void funName(paramType paramName)
+
+#define BENCHMARK_IMPL(funName, stringName)                             \
+  static void funName();                                                \
+  static bool FB_ANONYMOUS_VARIABLE(rocksdbBenchmarkUnused) = (         \
+    ::rocksdb::benchmark::AddBenchmark(__FILE__, stringName,            \
+      []() { funName(); }),                                             \
+    true);                                                              \
+  static void funName()
+
+/**
+ * Introduces a benchmark function. Use with either one or two
+ * arguments. The first is the name of the benchmark. Use something
+ * descriptive, such as insertVectorBegin. The second argument may be
+ * missing, or could be a symbolic counter. The counter dictates how
+ * many internal iteration the benchmark does. Example:
+ *
+ * BENCHMARK(vectorPushBack) {
+ *   vector<int> v;
+ *   v.push_back(42);
+ * }
+ *
+ * BENCHMARK_N(insertVectorBegin, n) {
+ *   vector<int> v;
+ *   FOR_EACH_RANGE (i, 0, n) {
+ *     v.insert(v.begin(), 42);
+ *   }
+ * }
+ */
+#define BENCHMARK_N(name, ...)                                  \
+  BENCHMARK_IMPL_N(                                             \
+    name,                                                       \
+    FB_STRINGIZE(name),                                         \
+    FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__),                   \
+    __VA_ARGS__)
+
+#define BENCHMARK(name)                                         \
+  BENCHMARK_IMPL(                                               \
+    name,                                                       \
+    FB_STRINGIZE(name))
+
+/**
+ * Defines a benchmark that passes a parameter to another one. This is
+ * common for benchmarks that need a "problem size" in addition to
+ * "number of iterations". Consider:
+ *
+ * void pushBack(uint n, size_t initialSize) {
+ *   vector<int> v;
+ *   BENCHMARK_SUSPEND {
+ *     v.resize(initialSize);
+ *   }
+ *   FOR_EACH_RANGE (i, 0, n) {
+ *    v.push_back(i);
+ *   }
+ * }
+ * BENCHMARK_PARAM(pushBack, 0)
+ * BENCHMARK_PARAM(pushBack, 1000)
+ * BENCHMARK_PARAM(pushBack, 1000000)
+ *
+ * The benchmark above estimates the speed of push_back at different
+ * initial sizes of the vector. The framework will pass 0, 1000, and
+ * 1000000 for initialSize, and the iteration count for n.
+ */
+#define BENCHMARK_PARAM(name, param)                                    \
+  BENCHMARK_NAMED_PARAM(name, param, param)
+
+/*
+ * Like BENCHMARK_PARAM(), but allows a custom name to be specified for each
+ * parameter, rather than using the parameter value.
+ *
+ * Useful when the parameter value is not a valid token for string pasting,
+ * or when you want to specify multiple parameter arguments.
+ *
+ * For example:
+ *
+ * void addValue(uint n, int64_t bucketSize, int64_t min, int64_t max) {
+ *   Histogram<int64_t> hist(bucketSize, min, max);
+ *   int64_t num = min;
+ *   FOR_EACH_RANGE (i, 0, n) {
+ *     hist.addValue(num);
+ *     ++num;
+ *     if (num > max) { num = min; }
+ *   }
+ * }
+ *
+ * BENCHMARK_NAMED_PARAM(addValue, 0_to_100, 1, 0, 100)
+ * BENCHMARK_NAMED_PARAM(addValue, 0_to_1000, 10, 0, 1000)
+ * BENCHMARK_NAMED_PARAM(addValue, 5k_to_20k, 250, 5000, 20000)
+ */
+#define BENCHMARK_NAMED_PARAM(name, param_name, ...)                    \
+  BENCHMARK_IMPL(                                                       \
+      FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),              \
+      FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")",              \
+      unsigned,                                                         \
+      iters) {                                                          \
+    name(iters, ## __VA_ARGS__);                                        \
+  }
+
+/**
+ * Just like BENCHMARK, but prints the time relative to a
+ * baseline. The baseline is the most recent BENCHMARK() seen in
+ * lexical order. Example:
+ *
+ * // This is the baseline
+ * BENCHMARK_N(insertVectorBegin, n) {
+ *   vector<int> v;
+ *   FOR_EACH_RANGE (i, 0, n) {
+ *     v.insert(v.begin(), 42);
+ *   }
+ * }
+ *
+ * BENCHMARK_RELATIVE_N(insertListBegin, n) {
+ *   list<int> s;
+ *   FOR_EACH_RANGE (i, 0, n) {
+ *     s.insert(s.begin(), 42);
+ *   }
+ * }
+ *
+ * Any number of relative benchmark can be associated with a
+ * baseline. Another BENCHMARK() occurrence effectively establishes a
+ * new baseline.
+ */
+#define BENCHMARK_RELATIVE_N(name, ...)                         \
+  BENCHMARK_IMPL_N(                                             \
+    name,                                                       \
+    "%" FB_STRINGIZE(name),                                     \
+    FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__),                   \
+    __VA_ARGS__)
+
+#define BENCHMARK_RELATIVE(name)                                \
+  BENCHMARK_IMPL(                                               \
+    name,                                                       \
+    "%" FB_STRINGIZE(name))
+
+/**
+ * A combination of BENCHMARK_RELATIVE and BENCHMARK_PARAM.
+ */
+#define BENCHMARK_RELATIVE_PARAM(name, param)                           \
+  BENCHMARK_RELATIVE_NAMED_PARAM(name, param, param)
+
+/**
+ * A combination of BENCHMARK_RELATIVE and BENCHMARK_NAMED_PARAM.
+ */
+#define BENCHMARK_RELATIVE_NAMED_PARAM(name, param_name, ...)           \
+  BENCHMARK_IMPL(                                                       \
+      FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),              \
+      "%" FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")",          \
+      unsigned,                                                         \
+      iters) {                                                          \
+    name(iters, ## __VA_ARGS__);                                        \
+  }
+
+/**
+ * Draws a line of dashes.
+ */
+#define BENCHMARK_DRAW_LINE()                                       \
+  static bool FB_ANONYMOUS_VARIABLE(rocksdbBenchmarkUnused) = (     \
+    ::rocksdb::benchmark::AddBenchmark(__FILE__, "-", []() { }),               \
+    true);
+
+/**
+ * Allows execution of code that doesn't count toward the benchmark's
+ * time budget. Example:
+ *
+ * BENCHMARK_START_GROUP(insertVectorBegin, n) {
+ *   vector<int> v;
+ *   BENCHMARK_SUSPEND {
+ *     v.reserve(n);
+ *   }
+ *   FOR_EACH_RANGE (i, 0, n) {
+ *     v.insert(v.begin(), 42);
+ *   }
+ * }
+ */
+#define BENCHMARK_SUSPEND                               \
+  if (auto FB_ANONYMOUS_VARIABLE(BENCHMARK_SUSPEND) =   \
+      ::rocksdb::benchmark::BenchmarkSuspender()) {}               \
+  else
diff --git a/util/benchharness_test.cc b/util/benchharness_test.cc
new file mode 100644 (file)
index 0000000..75ff658
--- /dev/null
@@ -0,0 +1,67 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#include "util/benchharness.h"
+#include <vector>
+
+namespace rocksdb {
+
+// Baseline: repeated front insertion into a vector (quadratic by design).
+BENCHMARK(insertFrontVector) {
+  std::vector<int> v;
+  for (int i = 0; i < 100; i++) {
+    v.insert(v.begin(), i);
+  }
+}
+
+// Measured relative to insertFrontVector (the most recent BENCHMARK).
+BENCHMARK_RELATIVE(insertBackVector) {
+  std::vector<int> v;
+  for (size_t i = 0; i < 100; i++) {
+    v.insert(v.end(), i);
+  }
+}
+
+// Counted variant: the harness supplies the iteration count n.
+BENCHMARK_N(insertFrontVector_n, n) {
+  std::vector<int> v;
+  for (size_t i = 0; i < n; i++) {
+    v.insert(v.begin(), i);
+  }
+}
+
+BENCHMARK_RELATIVE_N(insertBackVector_n, n) {
+  std::vector<int> v;
+  for (size_t i = 0; i < n; i++) {
+    v.insert(v.end(), i);
+  }
+}
+
+BENCHMARK_N(insertFrontEnd_n, n) {
+  std::vector<int> v;
+  for (size_t i = 0; i < n; i++) {
+    v.insert(v.begin(), i);
+  }
+  for (size_t i = 0; i < n; i++) {
+    v.insert(v.end(), i);
+  }
+}
+
+// Same as insertFrontEnd_n, but the back-insertion half runs inside
+// BENCHMARK_SUSPEND and is therefore excluded from the measured time.
+BENCHMARK_RELATIVE_N(insertFrontEndSuspend_n, n) {
+  std::vector<int> v;
+  for (size_t i = 0; i < n; i++) {
+    v.insert(v.begin(), i);
+  }
+  BENCHMARK_SUSPEND {
+    for (size_t i = 0; i < n; i++) {
+      v.insert(v.end(), i);
+    }
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  // Runs every BENCHMARK* registered above and prints the result table.
+  rocksdb::benchmark::RunBenchmarks();
+  return 0;
+}
diff --git a/util/blob_store.cc b/util/blob_store.cc
new file mode 100644 (file)
index 0000000..daaf4bc
--- /dev/null
@@ -0,0 +1,270 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#include "util/blob_store.h"
+
+namespace rocksdb {
+
+using namespace std;
+
+// BlobChunk
+// True iff this chunk ends exactly where `chunk` begins, in the same bucket
+// (i.e. the two can be merged into a single contiguous chunk).
+bool BlobChunk::ImmediatelyBefore(const BlobChunk& chunk) const {
+  // overlapping!?
+  assert(!Overlap(chunk));
+  // size == 0 is a marker, not a block
+  return size != 0 &&
+    bucket_id == chunk.bucket_id &&
+    offset + size == chunk.offset;
+}
+
+// True iff the two chunks share at least one block of the same bucket.
+// Zero-sized chunks (markers) never overlap anything.
+bool BlobChunk::Overlap(const BlobChunk &chunk) const {
+  return size != 0 && chunk.size != 0 && bucket_id == chunk.bucket_id &&
+    ((offset >= chunk.offset && offset < chunk.offset + chunk.size) ||
+     (chunk.offset >= offset && chunk.offset < offset + size));
+}
+
+// Blob
+// Serializes the chunk list as consecutive fixed 32-bit triples
+// (bucket_id, offset, size); parsed back by Blob::Blob(const string&).
+string Blob::ToString() const {
+  string ret;
+  for (auto chunk : chunks) {
+    PutFixed32(&ret, chunk.bucket_id);
+    PutFixed32(&ret, chunk.offset);
+    PutFixed32(&ret, chunk.size);
+  }
+  return ret;
+}
+
+Blob::Blob(const std::string& blob) {
+  for (uint32_t i = 0; i < blob.size(); ) {
+    uint32_t t[3] = {0};
+    for (int j = 0; j < 3 && i + sizeof(uint32_t) - 1 < blob.size();
+                    ++j, i += sizeof(uint32_t)) {
+      t[j] = DecodeFixed32(blob.data() + i);
+    }
+    chunks.push_back(BlobChunk(t[0], t[1], t[2]));
+  }
+}
+
+// FreeList
+// Returns the blob's chunks to the free list, coalescing each chunk with
+// the list tail when the two are contiguous in the same bucket.
+Status FreeList::Free(const Blob& blob) {
+  // add it back to the free list
+  for (auto chunk : blob.chunks) {
+    free_blocks_ += chunk.size;
+    if (fifo_free_chunks_.size() &&
+        fifo_free_chunks_.back().ImmediatelyBefore(chunk)) {
+      fifo_free_chunks_.back().size += chunk.size;
+    } else {
+      fifo_free_chunks_.push_back(chunk);
+    }
+  }
+
+  return Status::OK();
+}
+
+// Carves `blocks` blocks off the front of the FIFO free list into *blob,
+// possibly as multiple chunks. Returns Status::Incomplete when the list
+// does not hold enough blocks; the caller may grow the store and retry
+// (see BlobStore::Allocate).
+Status FreeList::Allocate(uint32_t blocks, Blob* blob) {
+  if (free_blocks_ < blocks) {
+    return Status::Incomplete("");
+  }
+
+  blob->chunks.clear();
+  free_blocks_ -= blocks;
+
+  while (blocks > 0) {
+    assert(fifo_free_chunks_.size() > 0);
+    auto& front = fifo_free_chunks_.front();
+    if (front.size > blocks) {
+      // Split the front chunk: take `blocks` blocks, leave the remainder.
+      blob->chunks.push_back(BlobChunk(front.bucket_id, front.offset, blocks));
+      front.offset += blocks;
+      front.size -= blocks;
+      blocks = 0;
+    } else {
+      // Consume the front chunk entirely.
+      blob->chunks.push_back(front);
+      blocks -= front.size;
+      fifo_free_chunks_.pop_front();
+    }
+  }
+  assert(blocks == 0);
+
+  return Status::OK();
+}
+
+// Debug helper: true iff any chunk of `blob` overlaps any free chunk.
+// O(|blob.chunks| * |free list|); used only inside asserts (see
+// BlobStore::Get).
+bool FreeList::Overlap(const Blob &blob) const {
+  for (auto chunk : blob.chunks) {
+    for (auto itr = fifo_free_chunks_.begin();
+         itr != fifo_free_chunks_.end();
+         ++itr) {
+      if (itr->Overlap(chunk)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// BlobStore
+// Creates `directory` if needed and opens the first bucket file. Buckets
+// are created lazily, up to max_buckets, each holding blocks_per_bucket
+// blocks of block_size bytes.
+BlobStore::BlobStore(const string& directory,
+                     uint64_t block_size,
+                     uint32_t blocks_per_bucket,
+                     uint32_t max_buckets,
+                     Env* env) :
+    directory_(directory),
+    block_size_(block_size),
+    blocks_per_bucket_(blocks_per_bucket),
+    env_(env),
+    max_buckets_(max_buckets) {
+  env_->CreateDirIfMissing(directory_);
+
+  storage_options_.use_mmap_writes = false;
+  storage_options_.use_mmap_reads = false;
+
+  buckets_size_ = 0;
+  // Raw array of unique_ptr slots for bucket files; released in ~BlobStore.
+  buckets_ = new unique_ptr<RandomRWFile>[max_buckets_];
+
+  // NOTE(review): CreateNewBucket documents that free_list_mutex_ must be
+  // held; that is moot during single-threaded construction.
+  CreateNewBucket();
+}
+
+BlobStore::~BlobStore() {
+  // TODO we don't care about recovery for now
+  // Destroys the slot array; the unique_ptrs close the bucket files.
+  delete [] buckets_;
+}
+
+Status BlobStore::Put(const Slice& value, Blob* blob) {
+  // convert size to number of blocks
+  Status s = Allocate((value.size() + block_size_ - 1) / block_size_, blob);
+  if (!s.ok()) {
+    return s;
+  }
+  auto size_left = (uint64_t) value.size();
+
+  uint64_t offset = 0; // in bytes, not blocks
+  for (auto chunk : blob->chunks) {
+    uint64_t write_size = min(chunk.size * block_size_, size_left);
+    assert(chunk.bucket_id < buckets_size_);
+    s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_,
+                                               Slice(value.data() + offset,
+                                                     write_size));
+    if (!s.ok()) {
+      Delete(*blob);
+      return s;
+    }
+    offset += write_size;
+    size_left -= write_size;
+    if (write_size < chunk.size * block_size_) {
+      // if we have any space left in the block, fill it up with zeros
+      string zero_string(chunk.size * block_size_ - write_size, 0);
+      s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_ +
+                                                    write_size,
+                                                 Slice(zero_string));
+    }
+  }
+
+  if (size_left > 0) {
+    Delete(*blob);
+    return Status::Corruption("Tried to write more data than fits in the blob");
+  }
+
+  return Status::OK();
+}
+
+Status BlobStore::Get(const Blob& blob,
+                      string* value) const {
+  {
+    // assert that it doesn't overlap with free list
+    // it will get compiled out for release
+    MutexLock l(&free_list_mutex_);
+    assert(!free_list_.Overlap(blob));
+  }
+
+  value->resize(blob.Size() * block_size_);
+
+  uint64_t offset = 0; // in bytes, not blocks
+  for (auto chunk : blob.chunks) {
+    Slice result;
+    assert(chunk.bucket_id < buckets_size_);
+    Status s;
+    s = buckets_[chunk.bucket_id].get()->Read(chunk.offset * block_size_,
+                                              chunk.size * block_size_,
+                                              &result,
+                                              &value->at(offset));
+    if (!s.ok()) {
+      value->clear();
+      return s;
+    }
+    if (result.size() < chunk.size * block_size_) {
+      value->clear();
+      return Status::Corruption("Could not read in from file");
+    }
+    offset += chunk.size * block_size_;
+  }
+
+  // remove the '\0's at the end of the string
+  value->erase(find(value->begin(), value->end(), '\0'), value->end());
+
+  return Status::OK();
+}
+
+// Marks the blob's blocks as free; the data is not wiped, only reusable.
+Status BlobStore::Delete(const Blob& blob) {
+  MutexLock l(&free_list_mutex_);
+  return free_list_.Free(blob);
+}
+
+// Flushes every bucket file to storage; stops at the first failure.
+Status BlobStore::Sync() {
+  for (size_t i = 0; i < buckets_size_; ++i) {
+    Status s = buckets_[i].get()->Sync();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+
+// Reserves `blocks` blocks from the free list, growing the store by one
+// bucket and retrying once if the first attempt comes up short.
+Status BlobStore::Allocate(uint32_t blocks, Blob* blob) {
+  MutexLock l(&free_list_mutex_);
+  Status s;
+
+  s = free_list_.Allocate(blocks, blob);
+  if (!s.ok()) {
+    s = CreateNewBucket();
+    if (!s.ok()) {
+      return s;
+    }
+    s = free_list_.Allocate(blocks, blob);
+  }
+
+  return s;
+}
+
+// called with free_list_mutex_ held
+Status BlobStore::CreateNewBucket() {
+  MutexLock l(&buckets_mutex_);
+
+  if (buckets_size_ >= max_buckets_) {
+    return Status::NotSupported("Max size exceeded\n");
+  }
+
+  int new_bucket_id = buckets_size_;
+
+  char fname[200];
+  sprintf(fname, "%s/%d.bs", directory_.c_str(), new_bucket_id);
+
+  Status s = env_->NewRandomRWFile(string(fname),
+                                   &buckets_[new_bucket_id],
+                                   storage_options_);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // whether Allocate succeeds or not, does not affect the overall correctness
+  // of this function - calling Allocate is really optional
+  // (also, tmpfs does not support allocate)
+  buckets_[new_bucket_id].get()->Allocate(0, block_size_ * blocks_per_bucket_);
+
+  buckets_size_ = new_bucket_id + 1;
+
+  return free_list_.Free(Blob(new_bucket_id, 0, blocks_per_bucket_));
+}
+
+} // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/util/blob_store.h b/util/blob_store.h
new file mode 100644 (file)
index 0000000..ce86337
--- /dev/null
@@ -0,0 +1,163 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#pragma once
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+#include "util/coding.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <deque>
+#include <iostream>
+#include <list>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+// A contiguous run of blocks inside a single bucket file.
+struct BlobChunk {
+  uint32_t bucket_id;
+  uint32_t offset; // in blocks
+  uint32_t size; // in blocks
+  BlobChunk() {}
+  BlobChunk(uint32_t bucket_id, uint32_t offset, uint32_t size) :
+    bucket_id(bucket_id), offset(offset), size(size) {}
+
+  // returns true if it's immediately before chunk
+  bool ImmediatelyBefore(const BlobChunk& chunk) const;
+  // returns true if chunks overlap
+  bool Overlap(const BlobChunk &chunk) const;
+};
+
+// We represent each Blob as a string in format:
+// bucket_id offset size|bucket_id offset size...
+// The string can be used to reference the Blob stored on external
+// device/file
+// Not thread-safe!
+struct Blob {
+  // Generates the string
+  std::string ToString() const;
+  // Parses the previously generated string
+  explicit Blob(const std::string& blob);
+  // Creates unfragmented Blob
+  Blob(uint32_t bucket_id, uint32_t offset, uint32_t size) {
+    SetOneChunk(bucket_id, offset, size);
+  }
+  Blob() {}
+
+  void SetOneChunk(uint32_t bucket_id, uint32_t offset, uint32_t size) {
+    chunks.clear();
+    chunks.push_back(BlobChunk(bucket_id, offset, size));
+  }
+
+  uint32_t Size() const { // in blocks
+    uint32_t ret = 0;
+    for (auto chunk : chunks) {
+      ret += chunk.size;
+    }
+    assert(ret > 0);
+    return ret;
+  }
+
+  // bucket_id, offset, size
+  std::vector<BlobChunk> chunks;
+};
+
+// Keeps a FIFO list of free chunks
+// NOTE(review): the line below and the old per-method comments
+// disagreed about thread-safety (the class also owns a mutex_). From
+// this header alone the intended contract is unclear; BlobStore
+// serializes all calls under free_list_mutex_, so treat FreeList as
+// externally synchronized -- TODO confirm and drop the unused mutex_.
+// NOT thread-safe. Externally synchronized
+class FreeList {
+ public:
+  FreeList() :
+    free_blocks_(0) {}
+  ~FreeList() {}
+
+  // Allocates a blob of 'blocks' blocks. Stores the allocated blob in
+  // 'blob'. Returns non-OK status if it failed to allocate.
+  Status Allocate(uint32_t blocks, Blob* blob);
+  // Frees the blob for reuse.
+  Status Free(const Blob& blob);
+
+  // returns true if blob is overlapping with any of the
+  // chunks stored in free list
+  bool Overlap(const Blob &blob) const;
+
+ private:
+  std::deque<BlobChunk> fifo_free_chunks_;
+  // total number of free blocks across all chunks
+  uint32_t free_blocks_;
+  mutable port::Mutex mutex_;
+};
+
+// thread-safe
+class BlobStore {
+ public:
+   // directory - wherever the blobs should be stored. It will be created
+   //   if missing
+   // block_size - self explanatory
+   // blocks_per_bucket - how many blocks we want to keep in one bucket.
+   //   Bucket is a device or a file that we use to store the blobs.
+   //   If we don't have enough blocks to allocate a new blob, we will
+   //   try to create a new file or device.
+   // max_buckets - maximum number of buckets BlobStore will create
+   //   BlobStore max size in bytes is
+   //     max_buckets * blocks_per_bucket * block_size
+   // env - env for creating new files
+  BlobStore(const std::string& directory,
+            uint64_t block_size,
+            uint32_t blocks_per_bucket,
+            uint32_t max_buckets,
+            Env* env);
+  ~BlobStore();
+
+  // Allocates space for value.size bytes (rounded up to be multiple of
+  // block size) and writes value.size bytes from value.data to a backing store.
+  // Sets Blob blob that can then be used for addressing the
+  // stored value. Returns non-OK status on error.
+  Status Put(const Slice& value, Blob* blob);
+  // Value needs to have enough space to store all the loaded stuff.
+  // This function is thread safe!
+  Status Get(const Blob& blob, std::string* value) const;
+  // Frees the blob for reuse, but does not delete the data
+  // on the backing store.
+  Status Delete(const Blob& blob);
+  // Sync all opened files that are modified
+  Status Sync();
+
+ private:
+  const std::string directory_;
+  // block_size_ is uint64_t because when we multiply with
+  // blocks_size_ we want the result to be uint64_t or
+  // we risk overflowing
+  const uint64_t block_size_;
+  const uint32_t blocks_per_bucket_;
+  Env* env_;
+  EnvOptions storage_options_;
+  // protected by free_list_mutex_
+  FreeList free_list_;
+  // free_list_mutex_ is locked BEFORE buckets_mutex_
+  mutable port::Mutex free_list_mutex_;
+  // protected by buckets_mutex_
+  // array of buckets
+  unique_ptr<RandomRWFile>* buckets_;
+  // number of buckets in the array
+  uint32_t buckets_size_;
+  // upper bound on buckets_size_; reaching it makes allocation fail
+  uint32_t max_buckets_;
+  mutable port::Mutex buckets_mutex_;
+
+  // Calls FreeList allocate. If free list can't allocate
+  // new blob, creates new bucket and tries again
+  // Thread-safe
+  Status Allocate(uint32_t blocks, Blob* blob);
+
+  // Creates a new backing store and adds all the blocks
+  // from the new backing store to the free list
+  Status CreateNewBucket();
+};
+
+} // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/util/blob_store_test.cc b/util/blob_store_test.cc
new file mode 100644 (file)
index 0000000..f199f5d
--- /dev/null
@@ -0,0 +1,200 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/blob_store.h"
+
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "util/random.h"
+
+#include <cstdlib>
+#include <string>
+
+namespace rocksdb {
+
+using namespace std;
+
+// Empty fixture class; the test harness only uses its name as a label.
+class BlobStoreTest { };
+
+// Round-trip: serializing a Blob and parsing the string back must
+// reproduce the same chunk list (compared via the serialized form).
+TEST(BlobStoreTest, RangeParseTest) {
+  Blob e;
+  for (int i = 0; i < 5; ++i) {
+    e.chunks.push_back(BlobChunk(rand(), rand(), rand()));
+  }
+  string x = e.ToString();
+  Blob nx(x);
+
+  ASSERT_EQ(nx.ToString(), x);
+}
+
+// make sure we're reusing the freed space
+TEST(BlobStoreTest, SanityTest) {
+  const uint64_t block_size = 10;
+  const uint32_t blocks_per_file = 20;
+  Random random(5);
+
+  BlobStore blob_store(test::TmpDir() + "/blob_store_test",
+                       block_size,
+                       blocks_per_file,
+                       1000,
+                       Env::Default());
+
+  string buf;
+
+  // put string of size 170
+  test::RandomString(&random, 170, &buf);
+  Blob r1;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r1));
+  // use the first file
+  for (size_t i = 0; i < r1.chunks.size(); ++i) {
+    ASSERT_EQ(r1.chunks[0].bucket_id, 0u);
+  }
+
+  // put string of size 30
+  test::RandomString(&random, 30, &buf);
+  Blob r2;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r2));
+  // use the first file
+  for (size_t i = 0; i < r2.chunks.size(); ++i) {
+    ASSERT_EQ(r2.chunks[0].bucket_id, 0u);
+  }
+
+  // delete blob of size 170
+  ASSERT_OK(blob_store.Delete(r1));
+
+  // put a string of size 100
+  test::RandomString(&random, 100, &buf);
+  Blob r3;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r3));
+  // use the first file
+  for (size_t i = 0; i < r3.chunks.size(); ++i) {
+    ASSERT_EQ(r3.chunks[0].bucket_id, 0u);
+  }
+
+  // put a string of size 70
+  test::RandomString(&random, 70, &buf);
+  Blob r4;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r4));
+  // use the first file
+  for (size_t i = 0; i < r4.chunks.size(); ++i) {
+    ASSERT_EQ(r4.chunks[0].bucket_id, 0u);
+  }
+
+  // put a string of size 5
+  test::RandomString(&random, 5, &buf);
+  Blob r5;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r5));
+  // now you get to use the second file
+  for (size_t i = 0; i < r5.chunks.size(); ++i) {
+    ASSERT_EQ(r5.chunks[0].bucket_id, 1u);
+  }
+}
+
+TEST(BlobStoreTest, FragmentedChunksTest) {
+  const uint64_t block_size = 10;
+  const uint32_t blocks_per_file = 20;
+  Random random(5);
+
+  BlobStore blob_store(test::TmpDir() + "/blob_store_test",
+                       block_size,
+                       blocks_per_file,
+                       1000,
+                       Env::Default());
+
+  string buf;
+
+  vector <Blob> r(4);
+
+  // put 4 strings of size 50
+  for (int k = 0; k < 4; ++k)  {
+    test::RandomString(&random, 50, &buf);
+    ASSERT_OK(blob_store.Put(Slice(buf), &r[k]));
+    // use the first file
+    for (size_t i = 0; i < r[k].chunks.size(); ++i) {
+      ASSERT_EQ(r[k].chunks[0].bucket_id, 0u);
+    }
+  }
+
+  // delete the first and third
+  ASSERT_OK(blob_store.Delete(r[0]));
+  ASSERT_OK(blob_store.Delete(r[2]));
+
+  // put string of size 100. it should reuse space that we deleting
+  // by deleting first and third strings of size 50
+  test::RandomString(&random, 100, &buf);
+  Blob r2;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r2));
+  // use the first file
+  for (size_t i = 0; i < r2.chunks.size(); ++i) {
+    ASSERT_EQ(r2.chunks[0].bucket_id, 0u);
+  }
+}
+
+TEST(BlobStoreTest, CreateAndStoreTest) {
+  const uint64_t block_size = 10;
+  const uint32_t blocks_per_file = 1000;
+  const int max_blurb_size = 300;
+  Random random(5);
+
+  BlobStore blob_store(test::TmpDir() + "/blob_store_test",
+                       block_size,
+                       blocks_per_file,
+                       10000,
+                       Env::Default());
+  vector<pair<Blob, string>> ranges;
+
+  for (int i = 0; i < 2000; ++i) {
+    int decision = rand() % 5;
+    if (decision <= 2 || ranges.size() == 0) {
+      string buf;
+      int size_blocks = (rand() % max_blurb_size + 1);
+      int string_size = size_blocks * block_size - (rand() % block_size);
+      test::RandomString(&random, string_size, &buf);
+      Blob r;
+      ASSERT_OK(blob_store.Put(Slice(buf), &r));
+      ranges.push_back(make_pair(r, buf));
+    } else if (decision == 3) {
+      int ti = rand() % ranges.size();
+      string out_buf;
+      ASSERT_OK(blob_store.Get(ranges[ti].first, &out_buf));
+      ASSERT_EQ(ranges[ti].second, out_buf);
+    } else {
+      int ti = rand() % ranges.size();
+      ASSERT_OK(blob_store.Delete(ranges[ti].first));
+      ranges.erase(ranges.begin() + ti);
+    }
+  }
+  ASSERT_OK(blob_store.Sync());
+}
+
+TEST(BlobStoreTest, MaxSizeTest) {
+  const uint64_t block_size = 10;
+  const uint32_t blocks_per_file = 100;
+  const int max_buckets = 10;
+  Random random(5);
+
+  BlobStore blob_store(test::TmpDir() + "/blob_store_test",
+                       block_size,
+                       blocks_per_file,
+                       max_buckets,
+                       Env::Default());
+  string buf;
+  for (int i = 0; i < max_buckets; ++i) {
+    test::RandomString(&random, 1000, &buf);
+    Blob r;
+    ASSERT_OK(blob_store.Put(Slice(buf), &r));
+  }
+
+  test::RandomString(&random, 1000, &buf);
+  Blob r;
+  // should fail because max size
+  Status s = blob_store.Put(Slice(buf), &r);
+  ASSERT_EQ(s.ok(), false);
+}
+
+}  // namespace rocksdb
+
+// Test entry point: runs every TEST registered above.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/bloom.cc b/util/bloom.cc
new file mode 100644 (file)
index 0000000..78ae04a
--- /dev/null
@@ -0,0 +1,111 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/filter_policy.h"
+
+#include "rocksdb/slice.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+namespace {
+// Default hash for the built-in bloom filter: util/hash.h Hash with a
+// fixed seed.
+static uint32_t BloomHash(const Slice& key) {
+  return Hash(key.data(), key.size(), 0xbc9f1d34);
+}
+
+// FilterPolicy implementing a standard bloom filter with double
+// hashing: one base hash, k probe positions derived by repeatedly
+// adding a rotated delta (see [Kirsch, Mitzenmacher 2006]).
+class BloomFilterPolicy : public FilterPolicy {
+ private:
+  size_t bits_per_key_;
+  size_t k_;  // number of probes per key, clamped to [1, 30]
+  uint32_t (*hash_func_)(const Slice& key);
+
+  void initialize() {
+    // We intentionally round down to reduce probing cost a little bit
+    k_ = static_cast<size_t>(bits_per_key_ * 0.69);  // 0.69 =~ ln(2)
+    if (k_ < 1) k_ = 1;
+    if (k_ > 30) k_ = 30;
+  }
+
+ public:
+  // Constructor taking a custom hash function.
+  explicit BloomFilterPolicy(int bits_per_key,
+                             uint32_t (*hash_func)(const Slice& key))
+      : bits_per_key_(bits_per_key), hash_func_(hash_func) {
+    initialize();
+  }
+  explicit BloomFilterPolicy(int bits_per_key)
+      : bits_per_key_(bits_per_key) {
+    hash_func_ = BloomHash;
+    initialize();
+  }
+
+  virtual const char* Name() const {
+    return "rocksdb.BuiltinBloomFilter";
+  }
+
+  // Appends a filter for keys[0..n-1] to *dst. Layout: the filter bits
+  // followed by one trailing byte recording k (the probe count).
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+    // Compute bloom filter size (in both bits and bytes)
+    size_t bits = n * bits_per_key_;
+
+    // For small n, we can see a very high false positive rate.  Fix it
+    // by enforcing a minimum bloom filter length.
+    if (bits < 64) bits = 64;
+
+    size_t bytes = (bits + 7) / 8;
+    bits = bytes * 8;
+
+    const size_t init_size = dst->size();
+    dst->resize(init_size + bytes, 0);
+    dst->push_back(static_cast<char>(k_));  // Remember # of probes in filter
+    char* array = &(*dst)[init_size];
+    for (size_t i = 0; i < (size_t)n; i++) {
+      // Use double-hashing to generate a sequence of hash values.
+      // See analysis in [Kirsch,Mitzenmacher 2006].
+      uint32_t h = hash_func_(keys[i]);
+      const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+      for (size_t j = 0; j < k_; j++) {
+        const uint32_t bitpos = h % bits;
+        array[bitpos/8] |= (1 << (bitpos % 8));
+        h += delta;
+      }
+    }
+  }
+
+  // Returns false only when 'key' is definitely not in the filter;
+  // true may be a false positive.
+  virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const {
+    const size_t len = bloom_filter.size();
+    if (len < 2) return false;
+
+    const char* array = bloom_filter.data();
+    const size_t bits = (len - 1) * 8;
+
+    // Use the encoded k so that we can read filters generated by
+    // bloom filters created using different parameters.
+    const size_t k = array[len-1];
+    if (k > 30) {
+      // Reserved for potentially new encodings for short bloom filters.
+      // Consider it a match.
+      return true;
+    }
+
+    uint32_t h = hash_func_(key);
+    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+    for (size_t j = 0; j < k; j++) {
+      const uint32_t bitpos = h % bits;
+      if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false;
+      h += delta;
+    }
+    return true;
+  }
+};
+}
+
+// Factory for the built-in bloom filter policy; the caller takes
+// ownership of the returned object.
+const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) {
+  return new BloomFilterPolicy(bits_per_key);
+}
+
+}  // namespace rocksdb
diff --git a/util/bloom_test.cc b/util/bloom_test.cc
new file mode 100644 (file)
index 0000000..2c430e2
--- /dev/null
@@ -0,0 +1,170 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <gflags/gflags.h>
+
+#include "rocksdb/filter_policy.h"
+
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+DEFINE_int32(bits_per_key, 10, "");
+
+namespace rocksdb {
+
+static const int kVerbose = 1;
+
+// Writes i's in-memory byte representation into buffer and returns a
+// Slice over it; buffer must outlive the returned Slice.
+static Slice Key(int i, char* buffer) {
+  memcpy(buffer, &i, sizeof(i));
+  return Slice(buffer, sizeof(i));
+}
+
+// Harness that accumulates keys, lazily builds a bloom filter from
+// them, and reports membership / false-positive statistics.
+class BloomTest {
+ private:
+  const FilterPolicy* policy_;
+  std::string filter_;
+  std::vector<std::string> keys_;  // keys added since the last Build()
+
+ public:
+  BloomTest() : policy_(NewBloomFilterPolicy(FLAGS_bits_per_key)) { }
+
+  ~BloomTest() {
+    delete policy_;
+  }
+
+  void Reset() {
+    keys_.clear();
+    filter_.clear();
+  }
+
+  void Add(const Slice& s) {
+    keys_.push_back(s.ToString());
+  }
+
+  // Builds filter_ from the pending keys and clears them.
+  void Build() {
+    std::vector<Slice> key_slices;
+    for (size_t i = 0; i < keys_.size(); i++) {
+      key_slices.push_back(Slice(keys_[i]));
+    }
+    filter_.clear();
+    policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_);
+    keys_.clear();
+    if (kVerbose >= 2) DumpFilter();
+  }
+
+  size_t FilterSize() const {
+    return filter_.size();
+  }
+
+  // Prints the filter's bits (excluding the trailing probe-count byte).
+  void DumpFilter() {
+    fprintf(stderr, "F(");
+    for (size_t i = 0; i+1 < filter_.size(); i++) {
+      const unsigned int c = static_cast<unsigned int>(filter_[i]);
+      for (int j = 0; j < 8; j++) {
+        fprintf(stderr, "%c", (c & (1 <<j)) ? '1' : '.');
+      }
+    }
+    fprintf(stderr, ")\n");
+  }
+
+  // Rebuilds the filter if keys are pending, then queries it.
+  bool Matches(const Slice& s) {
+    if (!keys_.empty()) {
+      Build();
+    }
+    return policy_->KeyMayMatch(s, filter_);
+  }
+
+  // Fraction of 10000 keys that were never added but still match.
+  double FalsePositiveRate() {
+    char buffer[sizeof(int)];
+    int result = 0;
+    for (int i = 0; i < 10000; i++) {
+      if (Matches(Key(i + 1000000000, buffer))) {
+        result++;
+      }
+    }
+    return result / 10000.0;
+  }
+};
+
+// An empty filter must report every key as absent.
+TEST(BloomTest, EmptyFilter) {
+  ASSERT_TRUE(! Matches("hello"));
+  ASSERT_TRUE(! Matches("world"));
+}
+
+// Added keys always match; the two unseen keys are expected not to
+// (no false positives at this tiny size).
+TEST(BloomTest, Small) {
+  Add("hello");
+  Add("world");
+  ASSERT_TRUE(Matches("hello"));
+  ASSERT_TRUE(Matches("world"));
+  ASSERT_TRUE(! Matches("x"));
+  ASSERT_TRUE(! Matches("foo"));
+}
+
+// Step schedule for VaryingLengths: +1 below 10, +10 below 100,
+// +100 below 1000, +1000 afterwards.
+static int NextLength(int length) {
+  int step;
+  if (length < 10) {
+    step = 1;
+  } else if (length < 100) {
+    step = 10;
+  } else if (length < 1000) {
+    step = 100;
+  } else {
+    step = 1000;
+  }
+  return length + step;
+}
+
+// Builds filters over a range of key counts, checking the size bound,
+// the absence of false negatives, and a <= 2% false positive rate.
+TEST(BloomTest, VaryingLengths) {
+  char buffer[sizeof(int)];
+
+  // Count number of filters that significantly exceed the false positive rate
+  int mediocre_filters = 0;
+  int good_filters = 0;
+
+  for (int length = 1; length <= 10000; length = NextLength(length)) {
+    Reset();
+    for (int i = 0; i < length; i++) {
+      Add(Key(i, buffer));
+    }
+    Build();
+
+    // 10 bits per key plus slack for the minimum size and probe byte.
+    ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 40)) << length;
+
+    // All added keys must match
+    for (int i = 0; i < length; i++) {
+      ASSERT_TRUE(Matches(Key(i, buffer)))
+          << "Length " << length << "; key " << i;
+    }
+
+    // Check false positive rate
+    double rate = FalsePositiveRate();
+    if (kVerbose >= 1) {
+      fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
+              rate*100.0, length, static_cast<int>(FilterSize()));
+    }
+    ASSERT_LE(rate, 0.02);   // Must not be over 2%
+    if (rate > 0.0125) mediocre_filters++;  // Allowed, but not too often
+    else good_filters++;
+  }
+  if (kVerbose >= 1) {
+    fprintf(stderr, "Filters: %d good, %d mediocre\n",
+            good_filters, mediocre_filters);
+  }
+  ASSERT_LE(mediocre_filters, good_filters/5);
+}
+
+// Different bits-per-byte
+
+}  // namespace rocksdb
+
+// Parses gflags command-line flags (e.g. --bits_per_key), then runs
+// all registered tests.
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/build_version.h b/util/build_version.h
new file mode 100644 (file)
index 0000000..2035a78
--- /dev/null
@@ -0,0 +1,16 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#if !defined(IOS_CROSS_COMPILE)
+// If we compile with Xcode, we don't run build_detect_version, so we
+// don't generate these variables.
+// these variables tell us about the git config and time
+extern const char* rocksdb_build_git_sha;
+
+// these variables tell us when the compilation occurred
+extern const char* rocksdb_build_compile_time;
+extern const char* rocksdb_build_compile_date;
+#endif
diff --git a/util/cache.cc b/util/cache.cc
new file mode 100644 (file)
index 0000000..f1c48a8
--- /dev/null
@@ -0,0 +1,481 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "rocksdb/cache.h"
+#include "port/port.h"
+#include "util/autovector.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+
+namespace rocksdb {
+
+// Out-of-line definition of the Cache interface destructor.
+Cache::~Cache() {
+}
+
+namespace {
+
+// LRU cache implementation
+
+// An entry is a variable length heap-allocated structure.  Entries
+// are kept in a circular doubly linked list ordered by access time.
+struct LRUHandle {
+  void* value;
+  void (*deleter)(const Slice&, void* value);
+  LRUHandle* next_hash;  // chain link within a HandleTable bucket
+  LRUHandle* next;       // LRU list links
+  LRUHandle* prev;
+  size_t charge;      // TODO(opt): Only allow uint32_t?
+  size_t key_length;
+  uint32_t refs;
+  uint32_t hash;      // Hash of key(); used for fast sharding and comparisons
+  char key_data[1];   // Beginning of key
+
+  Slice key() const {
+    // For cheaper lookups, we allow a temporary Handle object
+    // to store a pointer to a key in "value".
+    // next == this marks such a temporary; a real list entry can never
+    // point at itself.
+    if (next == this) {
+      return *(reinterpret_cast<Slice*>(value));
+    } else {
+      return Slice(key_data, key_length);
+    }
+  }
+};
+
+// We provide our own simple hash table since it removes a whole bunch
+// of porting hacks and is also faster than some of the built-in hash
+// table implementations in some of the compiler/runtime combinations
+// we have tested.  E.g., readrandom speeds up by ~5% over the g++
+// 4.4.3's builtin hashtable.
+class HandleTable {
+ public:
+  HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); }
+  ~HandleTable() { delete[] list_; }
+
+  // Returns the entry matching key/hash, or nullptr if absent.
+  LRUHandle* Lookup(const Slice& key, uint32_t hash) {
+    return *FindPointer(key, hash);
+  }
+
+  // Inserts h. Returns the displaced entry with the same key (which
+  // the caller must release), or nullptr if the key was not present.
+  LRUHandle* Insert(LRUHandle* h) {
+    LRUHandle** ptr = FindPointer(h->key(), h->hash);
+    LRUHandle* old = *ptr;
+    h->next_hash = (old == nullptr ? nullptr : old->next_hash);
+    *ptr = h;
+    if (old == nullptr) {
+      ++elems_;
+      if (elems_ > length_) {
+        // Since each cache entry is fairly large, we aim for a small
+        // average linked list length (<= 1).
+        Resize();
+      }
+    }
+    return old;
+  }
+
+  // Unlinks and returns the entry for key/hash (caller disposes of
+  // it), or nullptr if absent.
+  LRUHandle* Remove(const Slice& key, uint32_t hash) {
+    LRUHandle** ptr = FindPointer(key, hash);
+    LRUHandle* result = *ptr;
+    if (result != nullptr) {
+      *ptr = result->next_hash;
+      --elems_;
+    }
+    return result;
+  }
+
+ private:
+  // The table consists of an array of buckets where each bucket is
+  // a linked list of cache entries that hash into the bucket.
+  uint32_t length_;   // always a power of two, so hash & (length_-1) works
+  uint32_t elems_;
+  LRUHandle** list_;
+
+  // Return a pointer to slot that points to a cache entry that
+  // matches key/hash.  If there is no such cache entry, return a
+  // pointer to the trailing slot in the corresponding linked list.
+  LRUHandle** FindPointer(const Slice& key, uint32_t hash) {
+    LRUHandle** ptr = &list_[hash & (length_ - 1)];
+    while (*ptr != nullptr &&
+           ((*ptr)->hash != hash || key != (*ptr)->key())) {
+      ptr = &(*ptr)->next_hash;
+    }
+    return ptr;
+  }
+
+  // Grows the bucket array (doubling from a floor of 16) and rehashes
+  // every entry into it.
+  void Resize() {
+    uint32_t new_length = 16;
+    while (new_length < elems_ * 1.5) {
+      new_length *= 2;
+    }
+    LRUHandle** new_list = new LRUHandle*[new_length];
+    memset(new_list, 0, sizeof(new_list[0]) * new_length);
+    uint32_t count = 0;
+    for (uint32_t i = 0; i < length_; i++) {
+      LRUHandle* h = list_[i];
+      while (h != nullptr) {
+        LRUHandle* next = h->next_hash;
+        uint32_t hash = h->hash;
+        LRUHandle** ptr = &new_list[hash & (new_length - 1)];
+        h->next_hash = *ptr;
+        *ptr = h;
+        h = next;
+        count++;
+      }
+    }
+    assert(elems_ == count);
+    delete[] list_;
+    list_ = new_list;
+    length_ = new_length;
+  }
+};
+
+// A single shard of sharded cache.
+class LRUCache {
+ public:
+  LRUCache();
+  ~LRUCache();
+
+  // Separate from constructor so caller can easily make an array of LRUCache
+  void SetCapacity(size_t capacity) { capacity_ = capacity; }
+  // Max number of entries Insert may scan for unreferenced entries to
+  // evict before falling back to strict LRU eviction (0 disables).
+  void SetRemoveScanCountLimit(size_t remove_scan_count_limit) {
+    remove_scan_count_limit_ = remove_scan_count_limit;
+  }
+
+  // Like Cache methods, but with an extra "hash" parameter.
+  Cache::Handle* Insert(const Slice& key, uint32_t hash,
+                        void* value, size_t charge,
+                        void (*deleter)(const Slice& key, void* value));
+  Cache::Handle* Lookup(const Slice& key, uint32_t hash);
+  void Release(Cache::Handle* handle);
+  void Erase(const Slice& key, uint32_t hash);
+  // Although in some platforms the update of size_t is atomic, to make sure
+  // GetUsage() works correctly under any platforms, we'll protect this
+  // function with mutex.
+  size_t GetUsage() const {
+    MutexLock l(&mutex_);
+    return usage_;
+  }
+
+  // Invokes 'callback' on every cached entry; takes mutex_ only when
+  // thread_safe is true.
+  void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                              bool thread_safe);
+
+ private:
+  void LRU_Remove(LRUHandle* e);
+  void LRU_Append(LRUHandle* e);
+  // Just reduce the reference count by 1.
+  // Return true if last reference
+  bool Unref(LRUHandle* e);
+  // Call deleter and free
+  void FreeEntry(LRUHandle* e);
+
+  // Initialized before use.
+  size_t capacity_;
+  uint32_t remove_scan_count_limit_;
+
+  // mutex_ protects the following state.
+  // We don't count mutex_ as the cache's internal state so semantically we
+  // don't mind mutex_ invoking the non-const actions.
+  mutable port::Mutex mutex_;
+  // total charge of the entries currently in the LRU list
+  size_t usage_;
+
+  // Dummy head of LRU list.
+  // lru.prev is newest entry, lru.next is oldest entry.
+  LRUHandle lru_;
+
+  HandleTable table_;
+};
+
+// capacity_ and remove_scan_count_limit_ are deliberately not set
+// here; callers configure them via SetCapacity /
+// SetRemoveScanCountLimit before use (see ShardedLRUCache::init).
+LRUCache::LRUCache()
+    : usage_(0) {
+  // Make empty circular linked list
+  lru_.next = &lru_;
+  lru_.prev = &lru_;
+}
+
+// All client handles must have been released before destruction: each
+// remaining entry is expected to hold only the cache's own reference.
+LRUCache::~LRUCache() {
+  for (LRUHandle* e = lru_.next; e != &lru_; ) {
+    LRUHandle* next = e->next;
+    assert(e->refs == 1);  // Error if caller has an unreleased handle
+    if (Unref(e)) {
+      FreeEntry(e);
+    }
+    e = next;
+  }
+}
+
+// Drops one reference. Returns true iff that was the last one, in
+// which case the caller must FreeEntry(e).
+bool LRUCache::Unref(LRUHandle* e) {
+  assert(e->refs > 0);
+  e->refs--;
+  return e->refs == 0;
+}
+
+// Runs the user-supplied deleter and releases the entry's storage
+// (entries are malloc'd in Insert). Only legal once refs is zero.
+void LRUCache::FreeEntry(LRUHandle* e) {
+  assert(e->refs == 0);
+  (*e->deleter)(e->key(), e->value);
+  free(e);
+}
+
+// Invokes callback(value, charge) on every entry. Holds mutex_ only
+// when thread_safe is true; otherwise the caller is responsible for
+// preventing concurrent mutation.
+void LRUCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                      bool thread_safe) {
+  if (thread_safe) {
+    mutex_.Lock();
+  }
+  for (auto e = lru_.next; e != &lru_; e = e->next) {
+    callback(e->value, e->charge);
+  }
+  if (thread_safe) {
+    mutex_.Unlock();
+  }
+}
+
+// Unlinks e from the circular LRU list and subtracts its charge from
+// usage_. Callers hold mutex_.
+void LRUCache::LRU_Remove(LRUHandle* e) {
+  e->next->prev = e->prev;
+  e->prev->next = e->next;
+  usage_ -= e->charge;
+}
+
+// Links e in as the newest entry and adds its charge to usage_.
+// Callers hold mutex_.
+void LRUCache::LRU_Append(LRUHandle* e) {
+  // Make "e" newest entry by inserting just before lru_
+  e->next = &lru_;
+  e->prev = lru_.prev;
+  e->prev->next = e;
+  e->next->prev = e;
+  usage_ += e->charge;
+}
+
+// Looks up 'key'. On a hit, bumps the entry's refcount and moves it
+// to the most-recently-used end of the LRU list.
+Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) {
+  MutexLock guard(&mutex_);
+  LRUHandle* entry = table_.Lookup(key, hash);
+  if (entry == nullptr) {
+    return nullptr;
+  }
+  entry->refs++;
+  // Re-append to mark the entry as most recently used.
+  LRU_Remove(entry);
+  LRU_Append(entry);
+  return reinterpret_cast<Cache::Handle*>(entry);
+}
+
+// Drops one reference taken by Lookup/Insert. If that was the last
+// reference, the deleter runs outside the mutex.
+void LRUCache::Release(Cache::Handle* handle) {
+  LRUHandle* entry = reinterpret_cast<LRUHandle*>(handle);
+  bool should_free;
+  {
+    MutexLock guard(&mutex_);
+    should_free = Unref(entry);
+  }
+  if (should_free) {
+    FreeEntry(entry);
+  }
+}
+
+// Inserts an entry, evicting unreferenced entries as needed to bring
+// usage_ back under capacity_. The new entry starts with two refs:
+// one owned by the cache, one by the returned handle. Deleters of
+// displaced/evicted entries run after mutex_ is released.
+Cache::Handle* LRUCache::Insert(
+    const Slice& key, uint32_t hash, void* value, size_t charge,
+    void (*deleter)(const Slice& key, void* value)) {
+
+  // The key bytes are stored inline at the end of the struct
+  // (key_data[1] already accounts for one byte, hence the -1).
+  LRUHandle* e = reinterpret_cast<LRUHandle*>(
+      malloc(sizeof(LRUHandle)-1 + key.size()));
+  autovector<LRUHandle*> last_reference_list;
+
+  e->value = value;
+  e->deleter = deleter;
+  e->charge = charge;
+  e->key_length = key.size();
+  e->hash = hash;
+  e->refs = 2;  // One from LRUCache, one for the returned handle
+  memcpy(e->key_data, key.data(), key.size());
+
+  {
+    MutexLock l(&mutex_);
+
+    LRU_Append(e);
+
+    // An existing entry with the same key is displaced and released.
+    LRUHandle* old = table_.Insert(e);
+    if (old != nullptr) {
+      LRU_Remove(old);
+      if (Unref(old)) {
+        last_reference_list.push_back(old);
+      }
+    }
+
+    if (remove_scan_count_limit_ > 0) {
+      // Try to free the space by evicting the entries that are only
+      // referenced by the cache first.
+      LRUHandle* cur = lru_.next;
+      for (unsigned int scanCount = 0;
+           usage_ > capacity_ && cur != &lru_
+           && scanCount < remove_scan_count_limit_; scanCount++) {
+        LRUHandle* next = cur->next;
+        if (cur->refs <= 1) {
+          LRU_Remove(cur);
+          table_.Remove(cur->key(), cur->hash);
+          if (Unref(cur)) {
+            last_reference_list.push_back(cur);
+          }
+        }
+        cur = next;
+      }
+    }
+
+    // Free the space following strict LRU policy until enough space
+    // is freed.
+    while (usage_ > capacity_ && lru_.next != &lru_) {
+      LRUHandle* old = lru_.next;
+      LRU_Remove(old);
+      table_.Remove(old->key(), old->hash);
+      if (Unref(old)) {
+        last_reference_list.push_back(old);
+      }
+    }
+  }
+
+  // we free the entries here outside of mutex for
+  // performance reasons
+  for (auto entry : last_reference_list) {
+    FreeEntry(entry);
+  }
+
+  return reinterpret_cast<Cache::Handle*>(e);
+}
+
+// Removes the entry for key/hash, if present. Its deleter runs
+// outside the mutex when the cache held the last reference.
+void LRUCache::Erase(const Slice& key, uint32_t hash) {
+  LRUHandle* e;
+  bool last_reference = false;
+  {
+    MutexLock l(&mutex_);
+    e = table_.Remove(key, hash);
+    if (e != nullptr) {
+      LRU_Remove(e);
+      last_reference = Unref(e);
+    }
+  }
+  // mutex not held here
+  // last_reference will only be true if e != nullptr
+  if (last_reference) {
+    FreeEntry(e);
+  }
+}
+
+// Defaults used by the NewLRUCache overloads that omit these
+// parameters.
+static int kNumShardBits = 4;          // default values, can be overridden
+static int kRemoveScanCountLimit = 0; // default values, can be overridden
+
+class ShardedLRUCache : public Cache {
+ private:
+  LRUCache* shards_;
+  port::Mutex id_mutex_;
+  uint64_t last_id_;
+  int num_shard_bits_;
+  size_t capacity_;
+
+  static inline uint32_t HashSlice(const Slice& s) {
+    return Hash(s.data(), s.size(), 0);
+  }
+
+  uint32_t Shard(uint32_t hash) {
+    // Note, hash >> 32 yields hash in gcc, not the zero we expect!
+    return (num_shard_bits_ > 0) ? (hash >> (32 - num_shard_bits_)) : 0;
+  }
+
+  void init(size_t capacity, int numbits, int removeScanCountLimit) {
+    num_shard_bits_ = numbits;
+    capacity_ = capacity;
+    int num_shards = 1 << num_shard_bits_;
+    shards_ = new LRUCache[num_shards];
+    const size_t per_shard = (capacity + (num_shards - 1)) / num_shards;
+    for (int s = 0; s < num_shards; s++) {
+      shards_[s].SetCapacity(per_shard);
+      shards_[s].SetRemoveScanCountLimit(removeScanCountLimit);
+    }
+  }
+
+ public:
+  explicit ShardedLRUCache(size_t capacity)
+      : last_id_(0) {
+    init(capacity, kNumShardBits, kRemoveScanCountLimit);
+  }
+  ShardedLRUCache(size_t capacity, int num_shard_bits,
+                  int removeScanCountLimit)
+     : last_id_(0) {
+    init(capacity, num_shard_bits, removeScanCountLimit);
+  }
+  virtual ~ShardedLRUCache() {
+    delete[] shards_;
+  }
+  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value)) {
+    const uint32_t hash = HashSlice(key);
+    return shards_[Shard(hash)].Insert(key, hash, value, charge, deleter);
+  }
+  virtual Handle* Lookup(const Slice& key) {
+    const uint32_t hash = HashSlice(key);
+    return shards_[Shard(hash)].Lookup(key, hash);
+  }
+  virtual void Release(Handle* handle) {
+    LRUHandle* h = reinterpret_cast<LRUHandle*>(handle);
+    shards_[Shard(h->hash)].Release(handle);
+  }
+  virtual void Erase(const Slice& key) {
+    const uint32_t hash = HashSlice(key);
+    shards_[Shard(hash)].Erase(key, hash);
+  }
+  virtual void* Value(Handle* handle) {
+    return reinterpret_cast<LRUHandle*>(handle)->value;
+  }
+  virtual uint64_t NewId() {
+    MutexLock l(&id_mutex_);
+    return ++(last_id_);
+  }
+  virtual size_t GetCapacity() const {
+    return capacity_;
+  }
+
+  virtual size_t GetUsage() const {
+    // We will not lock the cache when getting the usage from shards.
+    // for (size_t i = 0; i < num_shard_bits_; ++i)
+    int num_shards = 1 << num_shard_bits_;
+    size_t usage = 0;
+    for (int s = 0; s < num_shards; s++) {
+      usage += shards_[s].GetUsage();
+    }
+    return usage;
+  }
+
+  virtual void DisownData() {
+    shards_ = nullptr;
+  }
+
+  virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                      bool thread_safe) override {
+    int num_shards = 1 << num_shard_bits_;
+    for (int s = 0; s < num_shards; s++) {
+      shards_[s].ApplyToAllCacheEntries(callback, thread_safe);
+    }
+  }
+};
+
+}  // end anonymous namespace
+
+// Creates a sharded LRU cache with default shard count and no
+// remove-scan limit.
+shared_ptr<Cache> NewLRUCache(size_t capacity) {
+  return NewLRUCache(capacity, kNumShardBits);
+}
+
+shared_ptr<Cache> NewLRUCache(size_t capacity, int num_shard_bits) {
+  return NewLRUCache(capacity, num_shard_bits, kRemoveScanCountLimit);
+}
+
+// Returns nullptr when num_shard_bits is unreasonably large (>= 20).
+shared_ptr<Cache> NewLRUCache(size_t capacity, int num_shard_bits,
+                              int removeScanCountLimit) {
+  if (num_shard_bits >= 20) {
+    return nullptr;  // the cache cannot be sharded into too many fine pieces
+  }
+  return std::make_shared<ShardedLRUCache>(capacity,
+                                           num_shard_bits,
+                                           removeScanCountLimit);
+}
+
+}  // namespace rocksdb
diff --git a/util/cache_test.cc b/util/cache_test.cc
new file mode 100644 (file)
index 0000000..c12cdb7
--- /dev/null
@@ -0,0 +1,449 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/cache.h"
+
+#include <vector>
+#include <string>
+#include <iostream>
+#include "util/coding.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+// Conversions between numeric keys/values and the types expected by Cache.
+static std::string EncodeKey(int k) {
+  std::string result;
+  PutFixed32(&result, k);
+  return result;
+}
+static int DecodeKey(const Slice& k) {
+  assert(k.size() == 4);
+  return DecodeFixed32(k.data());
+}
+static void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); }
+static int DecodeValue(void* v) { return reinterpret_cast<uintptr_t>(v); }
+
+class CacheTest {
+ public:
+  static CacheTest* current_;
+
+  static void Deleter(const Slice& key, void* v) {
+    current_->deleted_keys_.push_back(DecodeKey(key));
+    current_->deleted_values_.push_back(DecodeValue(v));
+  }
+
+  static const int kCacheSize = 1000;
+  static const int kNumShardBits = 4;
+  static const int kRemoveScanCountLimit = 16;
+
+  static const int kCacheSize2 = 100;
+  static const int kNumShardBits2 = 2;
+  static const int kRemoveScanCountLimit2 = 200;
+
+  std::vector<int> deleted_keys_;
+  std::vector<int> deleted_values_;
+  shared_ptr<Cache> cache_;
+  shared_ptr<Cache> cache2_;
+
+  CacheTest() :
+      cache_(NewLRUCache(kCacheSize, kNumShardBits, kRemoveScanCountLimit)),
+      cache2_(NewLRUCache(kCacheSize2, kNumShardBits2,
+                          kRemoveScanCountLimit2)) {
+    current_ = this;
+  }
+
+  ~CacheTest() {
+  }
+
+  int Lookup(shared_ptr<Cache> cache, int key) {
+    Cache::Handle* handle = cache->Lookup(EncodeKey(key));
+    const int r = (handle == nullptr) ? -1 : DecodeValue(cache->Value(handle));
+    if (handle != nullptr) {
+      cache->Release(handle);
+    }
+    return r;
+  }
+
+  void Insert(shared_ptr<Cache> cache, int key, int value, int charge = 1) {
+    cache->Release(cache->Insert(EncodeKey(key), EncodeValue(value), charge,
+                                  &CacheTest::Deleter));
+  }
+
+  void Erase(shared_ptr<Cache> cache, int key) {
+    cache->Erase(EncodeKey(key));
+  }
+
+
+  int Lookup(int key) {
+    return Lookup(cache_, key);
+  }
+
+  void Insert(int key, int value, int charge = 1) {
+    Insert(cache_, key, value, charge);
+  }
+
+  void Erase(int key) {
+    Erase(cache_, key);
+  }
+
+  int Lookup2(int key) {
+    return Lookup(cache2_, key);
+  }
+
+  void Insert2(int key, int value, int charge = 1) {
+    Insert(cache2_, key, value, charge);
+  }
+
+  void Erase2(int key) {
+    Erase(cache2_, key);
+  }
+};
+CacheTest* CacheTest::current_;
+
+namespace {
+void dumbDeleter(const Slice& key, void* value) { }
+}  // namespace
+
+TEST(CacheTest, UsageTest) {
+  // cache is shared_ptr and will be automatically cleaned up.
+  const uint64_t kCapacity = 100000;
+  auto cache = NewLRUCache(kCapacity, 8, 200);
+
+  size_t usage = 0;
+  const char* value = "abcdef";
+  // make sure everything will be cached
+  for (int i = 1; i < 100; ++i) {
+    std::string key(i, 'a');
+    auto kv_size = key.size() + 5;
+    cache->Release(
+        cache->Insert(key, (void*)value, kv_size, dumbDeleter)
+    );
+    usage += kv_size;
+    ASSERT_EQ(usage, cache->GetUsage());
+  }
+
+  // make sure the cache will be overloaded
+  for (uint64_t i = 1; i < kCapacity; ++i) {
+    auto key = std::to_string(i);
+    cache->Release(
+        cache->Insert(key, (void*)value, key.size() + 5, dumbDeleter)
+    );
+  }
+
+  // the usage should be close to the capacity
+  ASSERT_GT(kCapacity, cache->GetUsage());
+  ASSERT_LT(kCapacity * 0.95, cache->GetUsage());
+}
+
+TEST(CacheTest, HitAndMiss) {
+  ASSERT_EQ(-1, Lookup(100));
+
+  Insert(100, 101);
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(-1,  Lookup(200));
+  ASSERT_EQ(-1,  Lookup(300));
+
+  Insert(200, 201);
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(-1,  Lookup(300));
+
+  Insert(100, 102);
+  ASSERT_EQ(102, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(-1,  Lookup(300));
+
+  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+}
+
+TEST(CacheTest, Erase) {
+  Erase(200);
+  ASSERT_EQ(0U, deleted_keys_.size());
+
+  Insert(100, 101);
+  Insert(200, 201);
+  Erase(100);
+  ASSERT_EQ(-1,  Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+
+  Erase(100);
+  ASSERT_EQ(-1,  Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(1U, deleted_keys_.size());
+}
+
+TEST(CacheTest, EntriesArePinned) {
+  Insert(100, 101);
+  Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(101, DecodeValue(cache_->Value(h1)));
+
+  Insert(100, 102);
+  Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
+  ASSERT_EQ(0U, deleted_keys_.size());
+
+  cache_->Release(h1);
+  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+
+  Erase(100);
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(1U, deleted_keys_.size());
+
+  cache_->Release(h2);
+  ASSERT_EQ(2U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[1]);
+  ASSERT_EQ(102, deleted_values_[1]);
+}
+
+TEST(CacheTest, EvictionPolicy) {
+  Insert(100, 101);
+  Insert(200, 201);
+
+  // Frequently used entry must be kept around
+  for (int i = 0; i < kCacheSize + 100; i++) {
+    Insert(1000+i, 2000+i);
+    ASSERT_EQ(2000+i, Lookup(1000+i));
+    ASSERT_EQ(101, Lookup(100));
+  }
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(-1, Lookup(200));
+}
+
+TEST(CacheTest, EvictionPolicyRef) {
+  Insert(100, 101);
+  Insert(101, 102);
+  Insert(102, 103);
+  Insert(103, 104);
+  Insert(200, 101);
+  Insert(201, 102);
+  Insert(202, 103);
+  Insert(203, 104);
+  Cache::Handle* h201 = cache_->Lookup(EncodeKey(200));
+  Cache::Handle* h202 = cache_->Lookup(EncodeKey(201));
+  Cache::Handle* h203 = cache_->Lookup(EncodeKey(202));
+  Cache::Handle* h204 = cache_->Lookup(EncodeKey(203));
+  Insert(300, 101);
+  Insert(301, 102);
+  Insert(302, 103);
+  Insert(303, 104);
+
+  // Insert entries much more than Cache capacity
+  for (int i = 0; i < kCacheSize + 100; i++) {
+    Insert(1000 + i, 2000 + i);
+  }
+
+  // Check whether the entries inserted in the beginning
+  // are evicted. Ones without extra ref are evicted and
+  // those with are not.
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(-1, Lookup(101));
+  ASSERT_EQ(-1, Lookup(102));
+  ASSERT_EQ(-1, Lookup(103));
+
+  ASSERT_EQ(-1, Lookup(300));
+  ASSERT_EQ(-1, Lookup(301));
+  ASSERT_EQ(-1, Lookup(302));
+  ASSERT_EQ(-1, Lookup(303));
+
+  ASSERT_EQ(101, Lookup(200));
+  ASSERT_EQ(102, Lookup(201));
+  ASSERT_EQ(103, Lookup(202));
+  ASSERT_EQ(104, Lookup(203));
+
+  // Cleaning up all the handles
+  cache_->Release(h201);
+  cache_->Release(h202);
+  cache_->Release(h203);
+  cache_->Release(h204);
+}
+
+TEST(CacheTest, EvictionPolicyRef2) {
+  std::vector<Cache::Handle*> handles;
+
+  Insert(100, 101);
+  // Insert entries much more than Cache capacity
+  for (int i = 0; i < kCacheSize + 100; i++) {
+    Insert(1000 + i, 2000 + i);
+    if (i < kCacheSize ) {
+      handles.push_back(cache_->Lookup(EncodeKey(1000 + i)));
+    }
+  }
+
+  // Make sure referenced keys are also possible to be deleted
+  // if there are not sufficient non-referenced keys
+  for (int i = 0; i < 5; i++) {
+    ASSERT_EQ(-1, Lookup(1000 + i));
+  }
+
+  for (int i = kCacheSize; i < kCacheSize + 100; i++) {
+    ASSERT_EQ(2000 + i, Lookup(1000 + i));
+  }
+  ASSERT_EQ(-1, Lookup(100));
+
+  // Cleaning up all the handles
+  while (handles.size() > 0) {
+    cache_->Release(handles.back());
+    handles.pop_back();
+  }
+}
+
+TEST(CacheTest, EvictionPolicyRefLargeScanLimit) {
+  std::vector<Cache::Handle*> handles2;
+
+  // cache2_ was built with a RemoveScanCountLimit larger than its cache
+  // size, so this exercises that boundary condition.
+
+  // Populate the cache with 10 more keys than its size.
+  // Reference all keys except one close to the end.
+  for (int i = 0; i < kCacheSize2 + 10; i++) {
+    Insert2(1000 + i, 2000+i);
+    if (i != kCacheSize2 ) {
+      handles2.push_back(cache2_->Lookup(EncodeKey(1000 + i)));
+    }
+  }
+
+  // Make sure referenced keys are also possible to be deleted
+  // if there are not sufficient non-referenced keys
+  for (int i = 0; i < 3; i++) {
+    ASSERT_EQ(-1, Lookup2(1000 + i));
+  }
+  // The non-referenced value is deleted even if it's accessed
+  // recently.
+  ASSERT_EQ(-1, Lookup2(1000 + kCacheSize2));
+  // Other values recently accessed are not deleted since they
+  // are referenced.
+  for (int i = kCacheSize2 - 10; i < kCacheSize2 + 10; i++) {
+    if (i != kCacheSize2) {
+      ASSERT_EQ(2000 + i, Lookup2(1000 + i));
+    }
+  }
+
+  // Cleaning up all the handles
+  while (handles2.size() > 0) {
+    cache2_->Release(handles2.back());
+    handles2.pop_back();
+  }
+}
+
+
+
+TEST(CacheTest, HeavyEntries) {
+  // Add a bunch of light and heavy entries and then count the combined
+  // size of items still in the cache, which must be approximately the
+  // same as the total capacity.
+  const int kLight = 1;
+  const int kHeavy = 10;
+  int added = 0;
+  int index = 0;
+  while (added < 2*kCacheSize) {
+    const int weight = (index & 1) ? kLight : kHeavy;
+    Insert(index, 1000+index, weight);
+    added += weight;
+    index++;
+  }
+
+  int cached_weight = 0;
+  for (int i = 0; i < index; i++) {
+    const int weight = (i & 1 ? kLight : kHeavy);
+    int r = Lookup(i);
+    if (r >= 0) {
+      cached_weight += weight;
+      ASSERT_EQ(1000+i, r);
+    }
+  }
+  ASSERT_LE(cached_weight, kCacheSize + kCacheSize/10);
+}
+
+TEST(CacheTest, NewId) {
+  uint64_t a = cache_->NewId();
+  uint64_t b = cache_->NewId();
+  ASSERT_NE(a, b);
+}
+
+
+class Value {
+ private:
+  int v_;
+ public:
+  explicit Value(int v) : v_(v) { }
+
+  ~Value() { std::cout << v_ << " is destructed\n"; }
+};
+
+namespace {
+void deleter(const Slice& key, void* value) {
+  delete (Value *)value;
+}
+}  // namespace
+
+TEST(CacheTest, BadEviction) {
+  int n = 10;
+
+  // a LRUCache with n entries and one shard only
+  std::shared_ptr<Cache> cache = NewLRUCache(n, 0);
+
+  std::vector<Cache::Handle*> handles(n+1);
+
+  // Insert n+1 entries, but not releasing.
+  for (int i = 0; i < n+1; i++) {
+    std::string key = std::to_string(i+1);
+    handles[i] = cache->Insert(key, new Value(i+1), 1, &deleter);
+  }
+
+  // Guess what's in the cache now?
+  for (int i = 0; i < n+1; i++) {
+    std::string key = std::to_string(i+1);
+    auto h = cache->Lookup(key);
+    std::cout << key << (h?" found\n":" not found\n");
+    // Only the first entry should be missing
+    ASSERT_TRUE(h || i == 0);
+    if (h) cache->Release(h);
+  }
+
+  for (int i = 0; i < n+1; i++) {
+    cache->Release(handles[i]);
+  }
+  std::cout << "Poor entries\n";
+}
+
+namespace {
+std::vector<std::pair<int, int>> callback_state;
+void callback(void* entry, size_t charge) {
+  callback_state.push_back({DecodeValue(entry), static_cast<int>(charge)});
+}
+};
+
+TEST(CacheTest, ApplyToAllCacheEntiresTest) {
+  std::vector<std::pair<int, int>> inserted;
+  callback_state.clear();
+
+  for (int i = 0; i < 10; ++i) {
+    Insert(i, i * 2, i + 1);
+    inserted.push_back({i * 2, i + 1});
+  }
+  cache_->ApplyToAllCacheEntries(callback, true);
+
+  sort(inserted.begin(), inserted.end());
+  sort(callback_state.begin(), callback_state.end());
+  ASSERT_TRUE(inserted == callback_state);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/coding.cc b/util/coding.cc
new file mode 100644 (file)
index 0000000..31ae0e3
--- /dev/null
@@ -0,0 +1,169 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/coding.h"
+
+#include <algorithm>
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+
+namespace rocksdb {
+
+char* EncodeVarint32(char* dst, uint32_t v) {
+  // Operate on characters as unsigneds
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  static const int B = 128;
+  if (v < (1 << 7)) {
+    *(ptr++) = v;
+  } else if (v < (1 << 14)) {
+    *(ptr++) = v | B;
+    *(ptr++) = v >> 7;
+  } else if (v < (1 << 21)) {
+    *(ptr++) = v | B;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = v >> 14;
+  } else if (v < (1 << 28)) {
+    *(ptr++) = v | B;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = (v >> 14) | B;
+    *(ptr++) = v >> 21;
+  } else {
+    *(ptr++) = v | B;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = (v >> 14) | B;
+    *(ptr++) = (v >> 21) | B;
+    *(ptr++) = v >> 28;
+  }
+  return reinterpret_cast<char*>(ptr);
+}
+
+const char* GetVarint32PtrFallback(const char* p, const char* limit,
+                                   uint32_t* value) {
+  uint32_t result = 0;
+  for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
+    uint32_t byte = *(reinterpret_cast<const unsigned char*>(p));
+    p++;
+    if (byte & 128) {
+      // More bytes are present
+      result |= ((byte & 127) << shift);
+    } else {
+      result |= (byte << shift);
+      *value = result;
+      return reinterpret_cast<const char*>(p);
+    }
+  }
+  return nullptr;
+}
+
+const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
+  uint64_t result = 0;
+  for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) {
+    uint64_t byte = *(reinterpret_cast<const unsigned char*>(p));
+    p++;
+    if (byte & 128) {
+      // More bytes are present
+      result |= ((byte & 127) << shift);
+    } else {
+      result |= (byte << shift);
+      *value = result;
+      return reinterpret_cast<const char*>(p);
+    }
+  }
+  return nullptr;
+}
+
+void BitStreamPutInt(char* dst, size_t dstlen, size_t offset,
+                     uint32_t bits, uint64_t value) {
+  assert((offset + bits + 7)/8 <= dstlen);
+  assert(bits <= 64);
+
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+
+  size_t byteOffset = offset / 8;
+  size_t bitOffset = offset % 8;
+
+  // This prevents unused variable warnings when compiling.
+#ifndef NDEBUG
+  // Store truncated value.
+  uint64_t origValue = (bits < 64)?(value & (((uint64_t)1 << bits) - 1)):value;
+  uint32_t origBits = bits;
+#endif
+
+  while (bits > 0) {
+    size_t bitsToGet = std::min<size_t>(bits, 8 - bitOffset);
+    unsigned char mask = ((1 << bitsToGet) - 1);
+
+    ptr[byteOffset] = (ptr[byteOffset] & ~(mask << bitOffset)) +
+                      ((value & mask) << bitOffset);
+
+    value >>= bitsToGet;
+    byteOffset += 1;
+    bitOffset = 0;
+    bits -= bitsToGet;
+  }
+
+  assert(origValue == BitStreamGetInt(dst, dstlen, offset, origBits));
+}
+
+uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset,
+                         uint32_t bits) {
+  assert((offset + bits + 7)/8 <= srclen);
+  assert(bits <= 64);
+
+  const unsigned char* ptr = reinterpret_cast<const unsigned char*>(src);
+
+  uint64_t result = 0;
+
+  size_t byteOffset = offset / 8;
+  size_t bitOffset = offset % 8;
+  size_t shift = 0;
+
+  while (bits > 0) {
+    size_t bitsToGet = std::min<size_t>(bits, 8 - bitOffset);
+    unsigned char mask = ((1 << bitsToGet) - 1);
+
+    result += (uint64_t)((ptr[byteOffset] >> bitOffset) & mask) << shift;
+
+    shift += bitsToGet;
+    byteOffset += 1;
+    bitOffset = 0;
+    bits -= bitsToGet;
+  }
+
+  return result;
+}
+
+void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits,
+                     uint64_t value) {
+  assert((offset + bits + 7)/8 <= dst->size());
+
+  const size_t kTmpBufLen = sizeof(value) + 1;
+  char tmpBuf[kTmpBufLen];
+
+  // Number of bytes of tmpBuf being used
+  const size_t kUsedBytes = (offset%8 + bits)/8;
+
+  // Copy relevant parts of dst to tmpBuf
+  for (size_t idx = 0; idx <= kUsedBytes; ++idx) {
+    tmpBuf[idx] = (*dst)[offset/8 + idx];
+  }
+
+  BitStreamPutInt(tmpBuf, kTmpBufLen, offset%8, bits, value);
+
+  // Copy tmpBuf back to dst
+  for (size_t idx = 0; idx <= kUsedBytes; ++idx) {
+    (*dst)[offset/8 + idx] = tmpBuf[idx];
+  }
+
+  // Do the check here too as we are working with a buffer.
+  assert(((bits < 64)?(value & (((uint64_t)1 << bits) - 1)):value) ==
+         BitStreamGetInt(dst, offset, bits));
+}
+
+}  // namespace rocksdb
diff --git a/util/coding.h b/util/coding.h
new file mode 100644 (file)
index 0000000..8ffba51
--- /dev/null
@@ -0,0 +1,294 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Endian-neutral encoding:
+// * Fixed-length numbers are encoded with least-significant byte first
+// * In addition we support variable length "varint" encoding
+// * Strings are encoded prefixed by their length in varint format
+
+#pragma once
+#include <algorithm>
+#include <stdint.h>
+#include <string.h>
+#include <string>
+
+#include "rocksdb/write_batch.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+// The maximum length of a varint in bytes for 32 and 64 bits respectively.
+const unsigned int kMaxVarint32Length = 5;
+const unsigned int kMaxVarint64Length = 10;
+
+// Standard Put... routines append to a string
+extern void PutFixed32(std::string* dst, uint32_t value);
+extern void PutFixed64(std::string* dst, uint64_t value);
+extern void PutVarint32(std::string* dst, uint32_t value);
+extern void PutVarint64(std::string* dst, uint64_t value);
+extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value);
+extern void PutLengthPrefixedSliceParts(std::string* dst,
+                                        const SliceParts& slice_parts);
+
+// Standard Get... routines parse a value from the beginning of a Slice
+// and advance the slice past the parsed value.
+extern bool GetVarint32(Slice* input, uint32_t* value);
+extern bool GetVarint64(Slice* input, uint64_t* value);
+extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
+// This function assumes data is well-formed.
+extern Slice GetLengthPrefixedSlice(const char* data);
+
+extern Slice GetSliceUntil(Slice* slice, char delimiter);
+
+// Pointer-based variants of GetVarint...  These either store a value
+// in *v and return a pointer just past the parsed value, or return
+// nullptr on error.  These routines only look at bytes in the range
+// [p..limit-1]
+extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v);
+extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v);
+
+// Returns the length of the varint32 or varint64 encoding of "v"
+extern int VarintLength(uint64_t v);
+
+// Lower-level versions of Put... that write directly into a character buffer
+// REQUIRES: dst has enough space for the value being written
+extern void EncodeFixed32(char* dst, uint32_t value);
+extern void EncodeFixed64(char* dst, uint64_t value);
+
+// Lower-level versions of Put... that write directly into a character buffer
+// and return a pointer just past the last byte written.
+// REQUIRES: dst has enough space for the value being written
+extern char* EncodeVarint32(char* dst, uint32_t value);
+extern char* EncodeVarint64(char* dst, uint64_t value);
+
+// Lower-level versions of Get... that read directly from a character buffer
+// without any bounds checking.
+
+inline uint32_t DecodeFixed32(const char* ptr) {
+  if (port::kLittleEndian) {
+    // Load the raw bytes
+    uint32_t result;
+    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    return result;
+  } else {
+    return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0])))
+        | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8)
+        | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16)
+        | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24));
+  }
+}
+
+inline uint64_t DecodeFixed64(const char* ptr) {
+  if (port::kLittleEndian) {
+    // Load the raw bytes
+    uint64_t result;
+    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    return result;
+  } else {
+    uint64_t lo = DecodeFixed32(ptr);
+    uint64_t hi = DecodeFixed32(ptr + 4);
+    return (hi << 32) | lo;
+  }
+}
+
+// Internal routine for use by fallback path of GetVarint32Ptr
+extern const char* GetVarint32PtrFallback(const char* p,
+                                          const char* limit,
+                                          uint32_t* value);
+inline const char* GetVarint32Ptr(const char* p,
+                                  const char* limit,
+                                  uint32_t* value) {
+  if (p < limit) {
+    uint32_t result = *(reinterpret_cast<const unsigned char*>(p));
+    if ((result & 128) == 0) {
+      *value = result;
+      return p + 1;
+    }
+  }
+  return GetVarint32PtrFallback(p, limit, value);
+}
+
+// Writes an unsigned integer with bits number of bits with its least
+// significant bit at offset.
+// Bits are numbered from 0 to 7 in the first byte, 8 to 15 in the second and
+// so on.
+// value is truncated to its "bits" least-significant bits before writing.
+// REQUIRES: (offset+bits+7)/8 <= dstlen
+// REQUIRES: bits <= 64
+extern void BitStreamPutInt(char* dst, size_t dstlen, size_t offset,
+                            uint32_t bits, uint64_t value);
+
+// Reads an unsigned integer with bits number of bits with its least
+// significant bit at offset.
+// Bits are numbered in the same way as BitStreamPutInt().
+// REQUIRES: (offset+bits+7)/8 <= srclen
+// REQUIRES: bits <= 64
+extern uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset,
+                                uint32_t bits);
+
+// Convenience functions
+extern void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits,
+                            uint64_t value);
+extern uint64_t BitStreamGetInt(const std::string* src, size_t offset,
+                                uint32_t bits);
+extern uint64_t BitStreamGetInt(const Slice* src, size_t offset,
+                                uint32_t bits);
+
+// -- Implementation of the functions declared above
+inline void EncodeFixed32(char* buf, uint32_t value) {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+  memcpy(buf, &value, sizeof(value));
+#else
+  buf[0] = value & 0xff;
+  buf[1] = (value >> 8) & 0xff;
+  buf[2] = (value >> 16) & 0xff;
+  buf[3] = (value >> 24) & 0xff;
+#endif
+}
+
+inline void EncodeFixed64(char* buf, uint64_t value) {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+  memcpy(buf, &value, sizeof(value));
+#else
+  buf[0] = value & 0xff;
+  buf[1] = (value >> 8) & 0xff;
+  buf[2] = (value >> 16) & 0xff;
+  buf[3] = (value >> 24) & 0xff;
+  buf[4] = (value >> 32) & 0xff;
+  buf[5] = (value >> 40) & 0xff;
+  buf[6] = (value >> 48) & 0xff;
+  buf[7] = (value >> 56) & 0xff;
+#endif
+}
+
+inline void PutFixed32(std::string* dst, uint32_t value) {
+  char buf[sizeof(value)];
+  EncodeFixed32(buf, value);
+  dst->append(buf, sizeof(buf));
+}
+
+inline void PutFixed64(std::string* dst, uint64_t value) {
+  char buf[sizeof(value)];
+  EncodeFixed64(buf, value);
+  dst->append(buf, sizeof(buf));
+}
+
+inline void PutVarint32(std::string* dst, uint32_t v) {
+  char buf[5];
+  char* ptr = EncodeVarint32(buf, v);
+  dst->append(buf, ptr - buf);
+}
+
+inline char* EncodeVarint64(char* dst, uint64_t v) {
+  static const unsigned int B = 128;
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  while (v >= B) {
+    *(ptr++) = (v & (B - 1)) | B;
+    v >>= 7;
+  }
+  *(ptr++) = static_cast<unsigned char>(v);
+  return reinterpret_cast<char*>(ptr);
+}
+
+inline void PutVarint64(std::string* dst, uint64_t v) {
+  char buf[10];
+  char* ptr = EncodeVarint64(buf, v);
+  dst->append(buf, ptr - buf);
+}
+
+inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
+  PutVarint32(dst, value.size());
+  dst->append(value.data(), value.size());
+}
+
+inline void PutLengthPrefixedSliceParts(std::string* dst,
+                                        const SliceParts& slice_parts) {
+  uint32_t total_bytes = 0;
+  for (int i = 0; i < slice_parts.num_parts; ++i) {
+    total_bytes += slice_parts.parts[i].size();
+  }
+  PutVarint32(dst, total_bytes);
+  for (int i = 0; i < slice_parts.num_parts; ++i) {
+    dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size());
+  }
+}
+
+inline int VarintLength(uint64_t v) {
+  int len = 1;
+  while (v >= 128) {
+    v >>= 7;
+    len++;
+  }
+  return len;
+}
+
+inline bool GetVarint32(Slice* input, uint32_t* value) {
+  const char* p = input->data();
+  const char* limit = p + input->size();
+  const char* q = GetVarint32Ptr(p, limit, value);
+  if (q == nullptr) {
+    return false;
+  } else {
+    *input = Slice(q, limit - q);
+    return true;
+  }
+}
+
+inline bool GetVarint64(Slice* input, uint64_t* value) {
+  const char* p = input->data();
+  const char* limit = p + input->size();
+  const char* q = GetVarint64Ptr(p, limit, value);
+  if (q == nullptr) {
+    return false;
+  } else {
+    *input = Slice(q, limit - q);
+    return true;
+  }
+}
+
+inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) {
+  uint32_t len = 0;
+  if (GetVarint32(input, &len) && input->size() >= len) {
+    *result = Slice(input->data(), len);
+    input->remove_prefix(len);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+inline Slice GetLengthPrefixedSlice(const char* data) {
+  uint32_t len = 0;
+  // +5: a varint32 occupies at most 5 bytes; we assume "data" is not corrupted
+  auto p = GetVarint32Ptr(data, data + 5 /* limit */, &len);
+  return Slice(p, len);
+}
+
+inline Slice GetSliceUntil(Slice* slice, char delimiter) {
+  uint32_t len = 0;
+  for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) {
+    // nothing
+  }
+
+  Slice ret(slice->data(), len);
+  slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0));
+  return ret;
+}
+
+inline uint64_t BitStreamGetInt(const std::string* src, size_t offset,
+                                uint32_t bits) {
+  return BitStreamGetInt(src->data(), src->size(), offset, bits);
+}
+
+inline uint64_t BitStreamGetInt(const Slice* src, size_t offset,
+                                uint32_t bits) {
+  return BitStreamGetInt(src->data(), src->size(), offset, bits);
+}
+
+}  // namespace rocksdb
diff --git a/util/coding_test.cc b/util/coding_test.cc
new file mode 100644 (file)
index 0000000..ed542d6
--- /dev/null
@@ -0,0 +1,296 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/coding.h"
+
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class Coding { };
+
+TEST(Coding, Fixed32) {
+  std::string s;
+  for (uint32_t v = 0; v < 100000; v++) {
+    PutFixed32(&s, v);
+  }
+
+  const char* p = s.data();
+  for (uint32_t v = 0; v < 100000; v++) {
+    uint32_t actual = DecodeFixed32(p);
+    ASSERT_EQ(v, actual);
+    p += sizeof(uint32_t);
+  }
+}
+
+TEST(Coding, Fixed64) {
+  std::string s;
+  for (int power = 0; power <= 63; power++) {
+    uint64_t v = static_cast<uint64_t>(1) << power;
+    PutFixed64(&s, v - 1);
+    PutFixed64(&s, v + 0);
+    PutFixed64(&s, v + 1);
+  }
+
+  const char* p = s.data();
+  for (int power = 0; power <= 63; power++) {
+    uint64_t v = static_cast<uint64_t>(1) << power;
+    uint64_t actual = 0;
+    actual = DecodeFixed64(p);
+    ASSERT_EQ(v-1, actual);
+    p += sizeof(uint64_t);
+
+    actual = DecodeFixed64(p);
+    ASSERT_EQ(v+0, actual);
+    p += sizeof(uint64_t);
+
+    actual = DecodeFixed64(p);
+    ASSERT_EQ(v+1, actual);
+    p += sizeof(uint64_t);
+  }
+}
+
+// Test that encoding routines generate little-endian encodings
+TEST(Coding, EncodingOutput) {
+  std::string dst;
+  PutFixed32(&dst, 0x04030201);
+  ASSERT_EQ(4U, dst.size());
+  ASSERT_EQ(0x01, static_cast<int>(dst[0]));
+  ASSERT_EQ(0x02, static_cast<int>(dst[1]));
+  ASSERT_EQ(0x03, static_cast<int>(dst[2]));
+  ASSERT_EQ(0x04, static_cast<int>(dst[3]));
+
+  dst.clear();
+  PutFixed64(&dst, 0x0807060504030201ull);
+  ASSERT_EQ(8U, dst.size());
+  ASSERT_EQ(0x01, static_cast<int>(dst[0]));
+  ASSERT_EQ(0x02, static_cast<int>(dst[1]));
+  ASSERT_EQ(0x03, static_cast<int>(dst[2]));
+  ASSERT_EQ(0x04, static_cast<int>(dst[3]));
+  ASSERT_EQ(0x05, static_cast<int>(dst[4]));
+  ASSERT_EQ(0x06, static_cast<int>(dst[5]));
+  ASSERT_EQ(0x07, static_cast<int>(dst[6]));
+  ASSERT_EQ(0x08, static_cast<int>(dst[7]));
+}
+
+TEST(Coding, Varint32) {
+  std::string s;
+  for (uint32_t i = 0; i < (32 * 32); i++) {
+    uint32_t v = (i / 32) << (i % 32);
+    PutVarint32(&s, v);
+  }
+
+  const char* p = s.data();
+  const char* limit = p + s.size();
+  for (uint32_t i = 0; i < (32 * 32); i++) {
+    uint32_t expected = (i / 32) << (i % 32);
+    uint32_t actual = 0;
+    const char* start = p;
+    p = GetVarint32Ptr(p, limit, &actual);
+    ASSERT_TRUE(p != nullptr);
+    ASSERT_EQ(expected, actual);
+    ASSERT_EQ(VarintLength(actual), p - start);
+  }
+  ASSERT_EQ(p, s.data() + s.size());
+}
+
+TEST(Coding, Varint64) {
+  // Construct the list of values to check
+  std::vector<uint64_t> values;
+  // Some special values
+  values.push_back(0);
+  values.push_back(100);
+  values.push_back(~static_cast<uint64_t>(0));
+  values.push_back(~static_cast<uint64_t>(0) - 1);
+  for (uint32_t k = 0; k < 64; k++) {
+    // Test values near powers of two
+    const uint64_t power = 1ull << k;
+    values.push_back(power);
+    values.push_back(power-1);
+    values.push_back(power+1);
+  };
+
+  std::string s;
+  for (unsigned int i = 0; i < values.size(); i++) {
+    PutVarint64(&s, values[i]);
+  }
+
+  const char* p = s.data();
+  const char* limit = p + s.size();
+  for (unsigned int i = 0; i < values.size(); i++) {
+    ASSERT_TRUE(p < limit);
+    uint64_t actual = 0;
+    const char* start = p;
+    p = GetVarint64Ptr(p, limit, &actual);
+    ASSERT_TRUE(p != nullptr);
+    ASSERT_EQ(values[i], actual);
+    ASSERT_EQ(VarintLength(actual), p - start);
+  }
+  ASSERT_EQ(p, limit);
+
+}
+
+TEST(Coding, Varint32Overflow) {
+  uint32_t result;
+  std::string input("\x81\x82\x83\x84\x85\x11");
+  ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result)
+              == nullptr);
+}
+
+TEST(Coding, Varint32Truncation) {
+  uint32_t large_value = (1u << 31) + 100;
+  std::string s;
+  PutVarint32(&s, large_value);
+  uint32_t result;
+  for (unsigned int len = 0; len < s.size() - 1; len++) {
+    ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == nullptr);
+  }
+  ASSERT_TRUE(
+      GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != nullptr);
+  ASSERT_EQ(large_value, result);
+}
+
+TEST(Coding, Varint64Overflow) {
+  uint64_t result;
+  std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11");
+  ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result)
+              == nullptr);
+}
+
+TEST(Coding, Varint64Truncation) {
+  uint64_t large_value = (1ull << 63) + 100ull;
+  std::string s;
+  PutVarint64(&s, large_value);
+  uint64_t result;
+  for (unsigned int len = 0; len < s.size() - 1; len++) {
+    ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == nullptr);
+  }
+  ASSERT_TRUE(
+      GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != nullptr);
+  ASSERT_EQ(large_value, result);
+}
+
+TEST(Coding, Strings) {
+  std::string s;
+  PutLengthPrefixedSlice(&s, Slice(""));
+  PutLengthPrefixedSlice(&s, Slice("foo"));
+  PutLengthPrefixedSlice(&s, Slice("bar"));
+  PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x')));
+
+  Slice input(s);
+  Slice v;
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ("", v.ToString());
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ("foo", v.ToString());
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ("bar", v.ToString());
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ(std::string(200, 'x'), v.ToString());
+  ASSERT_EQ("", input.ToString());
+}
+
// Exercises BitStreamPutInt/BitStreamGetInt on a raw char buffer: byte
// aligned writes, unaligned writes, multiple fields within one byte, and a
// full 64-bit value.  The buffer has one guard byte past the end that must
// remain zero, proving no write runs past kNumBytes.
TEST(Coding, BitStream) {
  const int kNumBytes = 10;
  char bytes[kNumBytes+1];
  for (int i = 0; i < kNumBytes + 1; ++i) {
      bytes[i] = '\0';
  }

  // Simple byte aligned test.
  for (int i = 0; i < kNumBytes; ++i) {
    BitStreamPutInt(bytes, kNumBytes, i*8, 8, 255-i);

    ASSERT_EQ((unsigned char)bytes[i], (unsigned char)(255-i));
  }
  for (int i = 0; i < kNumBytes; ++i) {
    ASSERT_EQ(BitStreamGetInt(bytes, kNumBytes, i*8, 8), (uint32_t)(255-i));
  }
  // Guard byte untouched.
  ASSERT_EQ(bytes[kNumBytes], '\0');

  // Write and read back at strange offsets
  for (int i = 0; i < kNumBytes + 1; ++i) {
      bytes[i] = '\0';
  }
  // 4-bit fields written at stride 5 (offset i*5+1) straddle byte boundaries.
  for (int i = 0; i < kNumBytes; ++i) {
    BitStreamPutInt(bytes, kNumBytes, i*5+1, 4, (i * 7) % (1 << 4));
  }
  for (int i = 0; i < kNumBytes; ++i) {
    ASSERT_EQ(BitStreamGetInt(bytes, kNumBytes, i*5+1, 4),
              (uint32_t)((i * 7) % (1 << 4)));
  }
  ASSERT_EQ(bytes[kNumBytes], '\0');

  // Create 11011011 as a bit pattern
  for (int i = 0; i < kNumBytes + 1; ++i) {
      bytes[i] = '\0';
  }
  // Three independent 2-bit fields per byte (bits 0-1, 3-4, 6-7).
  for (int i = 0; i < kNumBytes; ++i) {
    BitStreamPutInt(bytes, kNumBytes, i*8, 2, 3);
    BitStreamPutInt(bytes, kNumBytes, i*8+3, 2, 3);
    BitStreamPutInt(bytes, kNumBytes, i*8+6, 2, 3);

    ASSERT_EQ((unsigned char)bytes[i],
              (unsigned char)(3 + (3 << 3) + (3 << 6)));
  }
  ASSERT_EQ(bytes[kNumBytes], '\0');


  // Test large values
  for (int i = 0; i < kNumBytes + 1; ++i) {
      bytes[i] = '\0';
  }
  // A full 64-bit all-ones value fills exactly the first 8 bytes.
  BitStreamPutInt(bytes, kNumBytes, 0, 64, (uint64_t)(-1));
  for (int i = 0; i < 64/8; ++i) {
    ASSERT_EQ((unsigned char)bytes[i],
              (unsigned char)(255));
  }
  ASSERT_EQ(bytes[64/8], '\0');


}
+
// Exercises the std::string and Slice convenience overloads of
// BitStreamPutInt/BitStreamGetInt.
TEST(Coding, BitStreamConvenienceFuncs) {
  std::string bytes(1, '\0');

  // Check that independent changes to byte are preserved.
  BitStreamPutInt(&bytes, 0, 2, 3);
  BitStreamPutInt(&bytes, 3, 2, 3);
  BitStreamPutInt(&bytes, 6, 2, 3);
  ASSERT_EQ((unsigned char)bytes[0], (unsigned char)(3 + (3 << 3) + (3 << 6)));
  ASSERT_EQ(BitStreamGetInt(&bytes, 0, 2), 3u);
  ASSERT_EQ(BitStreamGetInt(&bytes, 3, 2), 3u);
  ASSERT_EQ(BitStreamGetInt(&bytes, 6, 2), 3u);
  // The Slice overload must observe the same bits.
  Slice slice(bytes);
  ASSERT_EQ(BitStreamGetInt(&slice, 0, 2), 3u);
  ASSERT_EQ(BitStreamGetInt(&slice, 3, 2), 3u);
  ASSERT_EQ(BitStreamGetInt(&slice, 6, 2), 3u);

  // Test overlapping crossing over byte boundaries
  bytes = std::string(2, '\0');
  // 15 = 0b1111 written at bit offset 6: two bits land in byte 0's top
  // bits, the other two in byte 1's bottom bits.
  BitStreamPutInt(&bytes, 6, 4, 15);
  ASSERT_EQ((unsigned char)bytes[0], 3 << 6);
  ASSERT_EQ((unsigned char)bytes[1], 3);
  ASSERT_EQ(BitStreamGetInt(&bytes, 6, 4), 15u);
  slice = Slice(bytes);
  ASSERT_EQ(BitStreamGetInt(&slice, 6, 4), 15u);

  // Test 64-bit number
  bytes = std::string(64/8, '\0');
  BitStreamPutInt(&bytes, 0, 64, (uint64_t)(-1));
  ASSERT_EQ(BitStreamGetInt(&bytes, 0, 64), (uint64_t)(-1));
  slice = Slice(bytes);
  ASSERT_EQ(BitStreamGetInt(&slice, 0, 64), (uint64_t)(-1));
}
+
+}  // namespace rocksdb
+
// Test binary entry point: runs every TEST registered above via the
// rocksdb test harness; returns nonzero on failure.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
diff --git a/util/comparator.cc b/util/comparator.cc
new file mode 100644 (file)
index 0000000..adeacac
--- /dev/null
@@ -0,0 +1,86 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <stdint.h>
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "port/port.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+Comparator::~Comparator() { }
+
+namespace {
+class BytewiseComparatorImpl : public Comparator {
+ public:
+  BytewiseComparatorImpl() { }
+
+  virtual const char* Name() const {
+    return "leveldb.BytewiseComparator";
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    return a.compare(b);
+  }
+
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+    // Find length of common prefix
+    size_t min_length = std::min(start->size(), limit.size());
+    size_t diff_index = 0;
+    while ((diff_index < min_length) &&
+           ((*start)[diff_index] == limit[diff_index])) {
+      diff_index++;
+    }
+
+    if (diff_index >= min_length) {
+      // Do not shorten if one string is a prefix of the other
+    } else {
+      uint8_t diff_byte = static_cast<uint8_t>((*start)[diff_index]);
+      if (diff_byte < static_cast<uint8_t>(0xff) &&
+          diff_byte + 1 < static_cast<uint8_t>(limit[diff_index])) {
+        (*start)[diff_index]++;
+        start->resize(diff_index + 1);
+        assert(Compare(*start, limit) < 0);
+      }
+    }
+  }
+
+  virtual void FindShortSuccessor(std::string* key) const {
+    // Find first character that can be incremented
+    size_t n = key->size();
+    for (size_t i = 0; i < n; i++) {
+      const uint8_t byte = (*key)[i];
+      if (byte != static_cast<uint8_t>(0xff)) {
+        (*key)[i] = byte + 1;
+        key->resize(i+1);
+        return;
+      }
+    }
+    // *key is a run of 0xffs.  Leave it alone.
+  }
+};
+}  // namespace
+
// Singleton storage for the default comparator.  port::InitOnce guarantees
// InitModule runs exactly once even with concurrent first callers; the
// instance is intentionally never deleted (it lives for the process
// lifetime).
static port::OnceType once = LEVELDB_ONCE_INIT;
static const Comparator* bytewise;

static void InitModule() {
  bytewise = new BytewiseComparatorImpl;
}

// Returns the process-wide bytewise comparator singleton.
const Comparator* BytewiseComparator() {
  port::InitOnce(&once, InitModule);
  return bytewise;
}
+
+}  // namespace rocksdb
diff --git a/util/crc32c.cc b/util/crc32c.cc
new file mode 100644 (file)
index 0000000..d27fb4b
--- /dev/null
@@ -0,0 +1,393 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A portable implementation of crc32c, optimized to handle
+// four bytes at a time.
+
+#include "util/crc32c.h"
+
+#include <stdint.h>
+#ifdef __SSE4_2__
+#include <nmmintrin.h>
+#endif
+#include "util/coding.h"
+
+namespace rocksdb {
+namespace crc32c {
+
+static const uint32_t table0_[256] = {
+  0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4,
+  0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
+  0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
+  0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
+  0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b,
+  0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
+  0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54,
+  0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
+  0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a,
+  0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
+  0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5,
+  0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
+  0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45,
+  0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
+  0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a,
+  0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
+  0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48,
+  0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
+  0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687,
+  0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
+  0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927,
+  0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
+  0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8,
+  0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
+  0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096,
+  0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
+  0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859,
+  0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
+  0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9,
+  0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
+  0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36,
+  0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
+  0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c,
+  0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
+  0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043,
+  0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
+  0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3,
+  0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
+  0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c,
+  0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
+  0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652,
+  0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
+  0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d,
+  0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
+  0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d,
+  0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
+  0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2,
+  0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
+  0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530,
+  0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
+  0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff,
+  0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
+  0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f,
+  0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
+  0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90,
+  0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
+  0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee,
+  0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
+  0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321,
+  0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
+  0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81,
+  0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
+  0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e,
+  0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351
+};
+static const uint32_t table1_[256] = {
+  0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899,
+  0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945,
+  0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21,
+  0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd,
+  0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918,
+  0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4,
+  0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0,
+  0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c,
+  0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b,
+  0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47,
+  0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823,
+  0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff,
+  0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a,
+  0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6,
+  0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2,
+  0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e,
+  0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d,
+  0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41,
+  0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25,
+  0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9,
+  0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c,
+  0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0,
+  0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4,
+  0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78,
+  0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f,
+  0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43,
+  0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27,
+  0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb,
+  0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e,
+  0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2,
+  0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6,
+  0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a,
+  0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260,
+  0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc,
+  0x66d73941, 0x7575a136, 0x419209af, 0x523091d8,
+  0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004,
+  0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1,
+  0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d,
+  0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059,
+  0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185,
+  0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162,
+  0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be,
+  0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da,
+  0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306,
+  0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3,
+  0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f,
+  0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b,
+  0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287,
+  0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464,
+  0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8,
+  0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc,
+  0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600,
+  0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5,
+  0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439,
+  0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d,
+  0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781,
+  0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766,
+  0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba,
+  0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de,
+  0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502,
+  0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7,
+  0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b,
+  0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f,
+  0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483
+};
+static const uint32_t table2_[256] = {
+  0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073,
+  0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469,
+  0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6,
+  0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac,
+  0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9,
+  0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3,
+  0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c,
+  0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726,
+  0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67,
+  0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d,
+  0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2,
+  0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8,
+  0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed,
+  0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7,
+  0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828,
+  0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32,
+  0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa,
+  0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0,
+  0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f,
+  0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75,
+  0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20,
+  0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a,
+  0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5,
+  0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff,
+  0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe,
+  0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4,
+  0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b,
+  0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161,
+  0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634,
+  0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e,
+  0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1,
+  0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb,
+  0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730,
+  0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a,
+  0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5,
+  0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def,
+  0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba,
+  0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0,
+  0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f,
+  0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065,
+  0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24,
+  0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e,
+  0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1,
+  0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb,
+  0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae,
+  0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4,
+  0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b,
+  0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71,
+  0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9,
+  0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3,
+  0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c,
+  0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36,
+  0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63,
+  0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79,
+  0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6,
+  0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc,
+  0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd,
+  0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7,
+  0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238,
+  0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622,
+  0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177,
+  0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d,
+  0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2,
+  0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8
+};
+static const uint32_t table3_[256] = {
+  0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939,
+  0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca,
+  0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf,
+  0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c,
+  0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804,
+  0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7,
+  0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2,
+  0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11,
+  0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2,
+  0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41,
+  0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54,
+  0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7,
+  0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f,
+  0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c,
+  0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69,
+  0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a,
+  0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de,
+  0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d,
+  0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538,
+  0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb,
+  0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3,
+  0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610,
+  0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405,
+  0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6,
+  0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255,
+  0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6,
+  0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3,
+  0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040,
+  0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368,
+  0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b,
+  0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e,
+  0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d,
+  0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006,
+  0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5,
+  0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0,
+  0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213,
+  0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b,
+  0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8,
+  0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd,
+  0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e,
+  0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d,
+  0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e,
+  0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b,
+  0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698,
+  0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0,
+  0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443,
+  0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656,
+  0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5,
+  0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1,
+  0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12,
+  0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07,
+  0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4,
+  0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc,
+  0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f,
+  0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a,
+  0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9,
+  0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a,
+  0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99,
+  0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c,
+  0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f,
+  0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57,
+  0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4,
+  0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1,
+  0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842
+};
+
// Used to fetch a naturally-aligned 32-bit word in little endian byte-order
// (DecodeFixed32 performs the byte-order handling portably).
static inline uint32_t LE_LOAD32(const uint8_t *p) {
  return DecodeFixed32(reinterpret_cast<const char*>(p));
}
+
#ifdef __SSE4_2__
// Fetch a 64-bit word in little endian byte-order; only needed by the
// SSE4.2 hardware path below.
static inline uint64_t LE_LOAD64(const uint8_t *p) {
  return DecodeFixed64(reinterpret_cast<const char*>(p));
}
#endif
+
// Software CRC32C step: folds 8 bytes from *p into the running CRC state *l
// as two table-driven 4-byte rounds, advancing *p by 8 so its stride
// matches the SSE4.2 Fast_CRC32 path.
static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) {
  uint32_t c = *l ^ LE_LOAD32(*p);
  *p += 4;
  *l = table3_[c & 0xff] ^
  table2_[(c >> 8) & 0xff] ^
  table1_[(c >> 16) & 0xff] ^
  table0_[c >> 24];
  // Second 4-byte round; together the two rounds consume 8 bytes.
  c = *l ^ LE_LOAD32(*p);
  *p += 4;
  *l = table3_[c & 0xff] ^
  table2_[(c >> 8) & 0xff] ^
  table1_[(c >> 16) & 0xff] ^
  table0_[c >> 24];
}
+
// One 8-byte CRC32C step.  When compiled with SSE4.2 enabled this uses the
// hardware crc32 instruction; otherwise it falls back to the table-driven
// software step, so both builds consume 8 bytes per call.
static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) {
#ifdef __SSE4_2__
  *l = _mm_crc32_u64(*l, LE_LOAD64(*p));
  *p += 8;
#else
  Slow_CRC32(l, p);
#endif
}
+
// Computes the CRC32C of buf[0,size-1] seeded with "crc", using the given
// per-8-byte CRC step (hardware or software).  Taking the step as a
// template parameter keeps the inner loop free of an indirect call.
template<void (*CRC32)(uint64_t*, uint8_t const**)>
uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
  const uint8_t *e = p + size;
  // CRC state is kept pre-inverted, per the CRC32C definition.
  uint64_t l = crc ^ 0xffffffffu;

// Align n to (1 << m) byte boundary
#define ALIGN(n, m)     ((n + ((1 << m) - 1)) & ~((1 << m) - 1))

// One byte of table-driven CRC; used for the unaligned head and the tail.
#define STEP1 do {                              \
    int c = (l & 0xff) ^ *p++;                  \
    l = table0_[c] ^ (l >> 8);                  \
} while (0)


  // Point x at first 16-byte aligned byte in string.  This might be
  // just past the end of the string.
  const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
  const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4));
  if (x <= e) {
    // Process bytes until finished or p is 16-byte aligned
    while (p != x) {
      STEP1;
    }
  }
  // Process bytes 16 at a time
  while ((e-p) >= 16) {
    CRC32(&l, &p);
    CRC32(&l, &p);
  }
  // Process bytes 8 at a time
  while ((e-p) >= 8) {
    CRC32(&l, &p);
  }
  // Process the last few bytes
  while (p != e) {
    STEP1;
  }
#undef STEP1
#undef ALIGN
  // Undo the pre-inversion before returning.
  return l ^ 0xffffffffu;
}
+
// Runtime detection of SSE4.2 support on the host CPU.
static bool isSSE42() {
#if defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
  uint32_t c_;
  uint32_t d_;
  // CPUID leaf 1: SSE4.2 support is reported in bit 20 of ECX.
  __asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx");
  return c_ & (1U << 20);  // copied from CpuId.h in Folly.
#else
  return false;
#endif
}
+
// Signature shared by the two ExtendImpl instantiations.
typedef uint32_t (*Function)(uint32_t, const char*, size_t);

// Pick the fastest Extend implementation supported by the current CPU.
static inline Function Choose_Extend() {
  return isSSE42() ? ExtendImpl<Fast_CRC32> : ExtendImpl<Slow_CRC32>;
}

// Resolved once during static initialization of this translation unit.
// NOTE(review): if Extend() is called from another TU's static
// initialization, ChosenExtend may not be set yet -- confirm callers.
Function ChosenExtend = Choose_Extend();

uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
  return ChosenExtend(crc, buf, size);
}
+
+}  // namespace crc32c
+}  // namespace rocksdb
diff --git a/util/crc32c.h b/util/crc32c.h
new file mode 100644 (file)
index 0000000..e5e6e14
--- /dev/null
@@ -0,0 +1,46 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+namespace rocksdb {
+namespace crc32c {
+
+// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the
+// crc32c of some string A.  Extend() is often used to maintain the
+// crc32c of a stream of data.
+extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n);
+
// Return the crc32c of data[0,n-1].
// (Equivalent to Extend() starting from a zero crc.)
inline uint32_t Value(const char* data, size_t n) {
  return Extend(0, data, n);
}
+
static const uint32_t kMaskDelta = 0xa282ead8ul;

// Return a masked representation of crc.
//
// Motivation: it is problematic to compute the CRC of a string that
// contains embedded CRCs.  Therefore we recommend that CRCs stored
// somewhere (e.g., in files) should be masked before being stored.
inline uint32_t Mask(uint32_t crc) {
  // Rotate right by 15 bits, then add a constant.
  const uint32_t rotated = (crc >> 15) | (crc << 17);
  return rotated + kMaskDelta;
}

// Return the crc whose masked representation is masked_crc.
// Exact inverse of Mask(): subtract the constant, rotate left by 15 bits.
inline uint32_t Unmask(uint32_t masked_crc) {
  const uint32_t unrotated = masked_crc - kMaskDelta;
  return (unrotated << 15) | (unrotated >> 17);
}
+
+}  // namespace crc32c
+}  // namespace rocksdb
diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc
new file mode 100644 (file)
index 0000000..300c9d3
--- /dev/null
@@ -0,0 +1,77 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/crc32c.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+namespace crc32c {
+
+class CRC { };
+
// Verifies the implementation against published CRC32C test vectors.
TEST(CRC, StandardResults) {
  // From rfc3720 section B.4.
  char buf[32];

  // 32 zero bytes.
  memset(buf, 0, sizeof(buf));
  ASSERT_EQ(0x8a9136aaU, Value(buf, sizeof(buf)));

  // 32 0xff bytes.
  memset(buf, 0xff, sizeof(buf));
  ASSERT_EQ(0x62a8ab43U, Value(buf, sizeof(buf)));

  // Ascending bytes 0..31.
  for (int i = 0; i < 32; i++) {
    buf[i] = i;
  }
  ASSERT_EQ(0x46dd794eU, Value(buf, sizeof(buf)));

  // Descending bytes 31..0.
  for (int i = 0; i < 32; i++) {
    buf[i] = 31 - i;
  }
  ASSERT_EQ(0x113fdb5cU, Value(buf, sizeof(buf)));

  // 48-byte command-PDU example from the same RFC section.
  unsigned char data[48] = {
    0x01, 0xc0, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
    0x14, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x04, 0x00,
    0x00, 0x00, 0x00, 0x14,
    0x00, 0x00, 0x00, 0x18,
    0x28, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
    0x02, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
  };
  ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));
}
+
+TEST(CRC, Values) {
+  ASSERT_NE(Value("a", 1), Value("foo", 3));
+}
+
+TEST(CRC, Extend) {
+  ASSERT_EQ(Value("hello world", 11),
+            Extend(Value("hello ", 6), "world", 5));
+}
+
+TEST(CRC, Mask) {
+  uint32_t crc = Value("foo", 3);
+  ASSERT_NE(crc, Mask(crc));
+  ASSERT_NE(crc, Mask(Mask(crc)));
+  ASSERT_EQ(crc, Unmask(Mask(crc)));
+  ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc)))));
+}
+
+}  // namespace crc32c
+}  // namespace rocksdb
+
// Test binary entry point: runs every TEST registered above via the
// rocksdb test harness; returns nonzero on failure.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc
new file mode 100644 (file)
index 0000000..a4c8e11
--- /dev/null
@@ -0,0 +1,50 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include "dynamic_bloom.h"
+
+#include <algorithm>
+
+#include "port/port.h"
+#include "rocksdb/slice.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+namespace {
+// Default hash used by DynamicBloom when the caller supplies none:
+// rocksdb's Hash() with a fixed seed.
+static uint32_t BloomHash(const Slice& key) {
+  return Hash(key.data(), key.size(), 0xbc9f1d34);
+}
+}
+
+DynamicBloom::DynamicBloom(uint32_t total_bits,
+                           uint32_t cl_per_block,
+                           uint32_t num_probes,
+                           uint32_t (*hash_func)(const Slice& key))
+  // A non-zero cl_per_block selects the cache-line "blocked" layout.
+  : kBlocked(cl_per_block > 0),
+    kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8),
+    // Blocked: round total_bits up to a whole number of blocks;
+    // unblocked: round up to a whole number of bytes.
+    kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock
+                              * kBitsPerBlock :
+                           total_bits + 7) / 8 * 8),
+    kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1),
+    kNumProbes(num_probes),
+    hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {
+  // NOTE(review): the two arms of this assert look swapped -- one would
+  // expect the blocked layout to require kTotalBits >= kBitsPerBlock and
+  // the flat layout only kTotalBits > 0.  Confirm the intended invariant.
+  assert(kBlocked ? kTotalBits > 0 : kTotalBits >= kBitsPerBlock);
+  assert(kNumProbes > 0);
+
+  // Zero-initialized bit array.  When blocked, over-allocate by one cache
+  // line so data_ can be rounded up to a cache-line boundary below.
+  uint32_t sz = kTotalBits / 8;
+  if (kBlocked) {
+    sz += CACHE_LINE_SIZE - 1;
+  }
+  raw_ = new unsigned char[sz]();
+  if (kBlocked && (reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE)) {
+    data_ = raw_ + CACHE_LINE_SIZE -
+      reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE;
+  } else {
+    data_ = raw_;
+  }
+}
+
+}  // rocksdb
diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h
new file mode 100644 (file)
index 0000000..efc461c
--- /dev/null
@@ -0,0 +1,101 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+
+namespace rocksdb {
+
+class Slice;
+
+// A bloom filter over a fixed number of bits, filled incrementally.
+// Writers must be single-threaded; readers may run concurrently.
+class DynamicBloom {
+ public:
+  // total_bits: fixed total bits for the bloom
+  // num_probes: number of hash probes for a single key
+  // cl_per_block: block size in cache lines. When this is non-zero, a
+  //               query/set is done within a block to improve cache locality.
+  // hash_func:  customized hash function
+  explicit DynamicBloom(uint32_t total_bits, uint32_t cl_per_block = 0,
+      uint32_t num_probes = 6,
+      uint32_t (*hash_func)(const Slice& key) = nullptr);
+
+  ~DynamicBloom() {
+    delete[] raw_;
+  }
+
+  // Assuming single threaded access to this function.
+  void Add(const Slice& key);
+
+  // Assuming single threaded access to this function.
+  void AddHash(uint32_t hash);
+
+  // Multithreaded access to this function is OK
+  bool MayContain(const Slice& key);
+
+  // Multithreaded access to this function is OK
+  bool MayContainHash(uint32_t hash);
+
+ private:
+  const bool kBlocked;           // true => cache-line-blocked probing
+  const uint32_t kBitsPerBlock;  // bits per block (0 when not blocked)
+  const uint32_t kTotalBits;     // total bits in the filter (byte multiple)
+  const uint32_t kNumBlocks;     // number of blocks (1 when not blocked)
+  const uint32_t kNumProbes;     // hash probes per key
+
+  uint32_t (*hash_func_)(const Slice& key);  // key -> 32-bit hash
+  unsigned char* data_;  // possibly-aligned view into raw_ holding the bits
+  unsigned char* raw_;   // owned allocation, freed in the destructor
+};
+
+// Hashes the key and delegates; single-writer only.
+inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); }
+
+// Hashes the key and delegates; safe for concurrent readers.
+inline bool DynamicBloom::MayContain(const Slice& key) {
+  return (MayContainHash(hash_func_(key)));
+}
+
+// Returns false only if the key was definitely never added; true may be
+// a false positive.
+inline bool DynamicBloom::MayContainHash(uint32_t h) {
+  const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+  if (kBlocked) {
+    // All probes stay inside one block, chosen by a second, independent
+    // rotation of the hash.
+    uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * kBitsPerBlock;
+    for (uint32_t i = 0; i < kNumProbes; ++i) {
+      const uint32_t bitpos = b + h % kBitsPerBlock;
+      if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
+        return false;
+      }
+      h += delta;  // double hashing: advance to the next probe position
+    }
+  } else {
+    for (uint32_t i = 0; i < kNumProbes; ++i) {
+      const uint32_t bitpos = h % kTotalBits;
+      if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
+        return false;
+      }
+      h += delta;
+    }
+  }
+  return true;
+}
+
+// Sets the kNumProbes probe bits for this hash; mirrors MayContainHash.
+inline void DynamicBloom::AddHash(uint32_t h) {
+  const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+  if (kBlocked) {
+    uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * kBitsPerBlock;
+    for (uint32_t i = 0; i < kNumProbes; ++i) {
+      const uint32_t bitpos = b + h % kBitsPerBlock;
+      data_[bitpos / 8] |= (1 << (bitpos % 8));
+      h += delta;
+    }
+  } else {
+    for (uint32_t i = 0; i < kNumProbes; ++i) {
+      const uint32_t bitpos = h % kTotalBits;
+      data_[bitpos / 8] |= (1 << (bitpos % 8));
+      h += delta;
+    }
+  }
+}
+
+}  // rocksdb
diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc
new file mode 100644 (file)
index 0000000..4a34d50
--- /dev/null
@@ -0,0 +1,202 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <algorithm>
+#include <gflags/gflags.h>
+
+#include "dynamic_bloom.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "util/stop_watch.h"
+
+DEFINE_int32(bits_per_key, 10, "");
+DEFINE_int32(num_probes, 6, "");
+DEFINE_bool(enable_perf, false, "");
+
+namespace rocksdb {
+
+// Encodes integer i (host byte order) into an 8-byte key in buffer.
+static Slice Key(uint64_t i, char* buffer) {
+  memcpy(buffer, &i, sizeof(i));
+  return Slice(buffer, sizeof(i));
+}
+
+// Tag class used by the TEST macro to group the cases below.
+class DynamicBloomTest {
+};
+
+// A filter with nothing added must report nothing as present, in both
+// the flat and the cache-line-blocked layouts.
+TEST(DynamicBloomTest, EmptyFilter) {
+  DynamicBloom bloom1(100, 0, 2);
+  ASSERT_TRUE(!bloom1.MayContain("hello"));
+  ASSERT_TRUE(!bloom1.MayContain("world"));
+
+  DynamicBloom bloom2(CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2);
+  ASSERT_TRUE(!bloom2.MayContain("hello"));
+  ASSERT_TRUE(!bloom2.MayContain("world"));
+}
+
+// Added keys must be found; two absent keys should miss, in both layouts.
+TEST(DynamicBloomTest, Small) {
+  DynamicBloom bloom1(100, 0, 2);
+  bloom1.Add("hello");
+  bloom1.Add("world");
+  ASSERT_TRUE(bloom1.MayContain("hello"));
+  ASSERT_TRUE(bloom1.MayContain("world"));
+  ASSERT_TRUE(!bloom1.MayContain("x"));
+  ASSERT_TRUE(!bloom1.MayContain("foo"));
+
+  DynamicBloom bloom2(CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2);
+  bloom2.Add("hello");
+  bloom2.Add("world");
+  ASSERT_TRUE(bloom2.MayContain("hello"));
+  ASSERT_TRUE(bloom2.MayContain("world"));
+  ASSERT_TRUE(!bloom2.MayContain("x"));
+  ASSERT_TRUE(!bloom2.MayContain("foo"));
+}
+
+// Step generator for key counts: 1..10 by 1, then by 10, 100, 1000.
+static uint32_t NextNum(uint32_t num) {
+  if (num < 10) {
+    num += 1;
+  } else if (num < 100) {
+    num += 10;
+  } else if (num < 1000) {
+    num += 100;
+  } else {
+    num += 1000;
+  }
+  return num;
+}
+
+// Measures the false-positive rate across key counts and block sizes,
+// and requires that "mediocre" filters (>1.25% FP) stay rare.
+TEST(DynamicBloomTest, VaryingLengths) {
+  char buffer[sizeof(uint64_t)];
+
+  // Count number of filters that significantly exceed the false positive rate
+  int mediocre_filters = 0;
+  int good_filters = 0;
+  uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
+
+  fprintf(stderr, "bits_per_key: %d  num_probes: %d\n",
+          FLAGS_bits_per_key, num_probes);
+
+  for (uint32_t cl_per_block = 0; cl_per_block < num_probes;
+      ++cl_per_block) {
+    for (uint32_t num = 1; num <= 10000; num = NextNum(num)) {
+      uint32_t bloom_bits = 0;
+      if (cl_per_block == 0) {
+        // Flat layout: at least 64 bits regardless of key count.
+        bloom_bits = std::max(num * FLAGS_bits_per_key, 64U);
+      } else {
+        // Blocked layout: at least one full block's worth of bits.
+        bloom_bits = std::max(num * FLAGS_bits_per_key,
+            cl_per_block * CACHE_LINE_SIZE * 8);
+      }
+      DynamicBloom bloom(bloom_bits, cl_per_block, num_probes);
+      for (uint64_t i = 0; i < num; i++) {
+        bloom.Add(Key(i, buffer));
+        ASSERT_TRUE(bloom.MayContain(Key(i, buffer)));
+      }
+
+      // All added keys must match
+      for (uint64_t i = 0; i < num; i++) {
+        ASSERT_TRUE(bloom.MayContain(Key(i, buffer)))
+          << "Num " << num << "; key " << i;
+      }
+
+      // Check false positive rate
+
+      // Probe 10000 keys that were never added.
+      int result = 0;
+      for (uint64_t i = 0; i < 10000; i++) {
+        if (bloom.MayContain(Key(i + 1000000000, buffer))) {
+          result++;
+        }
+      }
+      double rate = result / 10000.0;
+
+      fprintf(stderr, "False positives: %5.2f%% @ num = %6u, bloom_bits = %6u, "
+              "cl per block = %u\n", rate*100.0, num, bloom_bits, cl_per_block);
+
+      if (rate > 0.0125)
+        mediocre_filters++;  // Allowed, but not too often
+      else
+        good_filters++;
+    }
+
+    // NOTE(review): the counters accumulate across cl_per_block values,
+    // so each outer iteration asserts on the running total so far.
+    fprintf(stderr, "Filters: %d good, %d mediocre\n",
+            good_filters, mediocre_filters);
+    ASSERT_LE(mediocre_filters, good_filters/5);
+  }
+}
+
+// Micro-benchmark: average add/query latency for the flat layout and for
+// each blocked layout; only runs when --enable_perf is given.
+TEST(DynamicBloomTest, perf) {
+  StopWatchNano timer(Env::Default());
+  uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
+
+  if (!FLAGS_enable_perf) {
+    return;
+  }
+
+  for (uint64_t m = 1; m <= 8; ++m) {
+    const uint64_t num_keys = m * 8 * 1024 * 1024;
+    fprintf(stderr, "testing %" PRIu64 "M keys\n", m * 8);
+
+    DynamicBloom std_bloom(num_keys * 10, 0, num_probes);
+
+    timer.Start();
+    for (uint64_t i = 1; i <= num_keys; ++i) {
+      std_bloom.Add(Slice(reinterpret_cast<const char*>(&i), 8));
+    }
+
+    uint64_t elapsed = timer.ElapsedNanos();
+    fprintf(stderr, "standard bloom, avg add latency %" PRIu64 "\n",
+            elapsed / num_keys);
+
+    // Every added key must be found; count doubles as a liveness check.
+    uint64_t count = 0;
+    timer.Start();
+    for (uint64_t i = 1; i <= num_keys; ++i) {
+      if (std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8))) {
+        ++count;
+      }
+    }
+    elapsed = timer.ElapsedNanos();
+    fprintf(stderr, "standard bloom, avg query latency %" PRIu64 "\n",
+            elapsed / count);
+    ASSERT_TRUE(count == num_keys);
+
+    for (uint32_t cl_per_block = 1; cl_per_block <= num_probes;
+        ++cl_per_block) {
+      DynamicBloom blocked_bloom(num_keys * 10, cl_per_block, num_probes);
+
+      timer.Start();
+      for (uint64_t i = 1; i <= num_keys; ++i) {
+        blocked_bloom.Add(Slice(reinterpret_cast<const char*>(&i), 8));
+      }
+
+      // NOTE(review): these inner declarations shadow the outer elapsed/
+      // count, and %d is used for the uint32_t cl_per_block (should be %u).
+      uint64_t elapsed = timer.ElapsedNanos();
+      fprintf(stderr, "blocked bloom(%d), avg add latency %" PRIu64 "\n",
+              cl_per_block, elapsed / num_keys);
+
+      uint64_t count = 0;
+      timer.Start();
+      for (uint64_t i = 1; i <= num_keys; ++i) {
+        if (blocked_bloom.MayContain(
+              Slice(reinterpret_cast<const char*>(&i), 8))) {
+          ++count;
+        }
+      }
+
+      elapsed = timer.ElapsedNanos();
+      fprintf(stderr, "blocked bloom(%d), avg query latency %" PRIu64 "\n",
+              cl_per_block, elapsed / count);
+      ASSERT_TRUE(count == num_keys);
+    }
+  }
+}
+
+}  // namespace rocksdb
+
+// Parses gflags, then runs every TEST registered above.
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/env.cc b/util/env.cc
new file mode 100644 (file)
index 0000000..1c0cae4
--- /dev/null
@@ -0,0 +1,251 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/env.h"
+
+#include <sys/time.h>
+#include "rocksdb/options.h"
+#include "util/arena.h"
+#include "util/autovector.h"
+
+namespace rocksdb {
+
+// Out-of-line destructor definitions anchor the vtables of the abstract
+// Env interfaces in this translation unit.
+Env::~Env() {
+}
+
+SequentialFile::~SequentialFile() {
+}
+
+RandomAccessFile::~RandomAccessFile() {
+}
+
+WritableFile::~WritableFile() {
+}
+
+Logger::~Logger() {
+}
+
+FileLock::~FileLock() {
+}
+
+// Flushes buffered log output; a null logger is tolerated.
+void LogFlush(Logger *info_log) {
+  if (info_log) {
+    info_log->Flush();
+  }
+}
+
+// Log() without an explicit level logs at INFO.
+void Log(Logger* info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+// Logs at the caller-supplied level.
+void Log(const InfoLogLevel log_level, Logger* info_log, const char* format,
+         ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(log_level, format, ap);
+    va_end(ap);
+  }
+}
+
+// Per-level convenience wrappers; all forward to Logger::Logv and are
+// no-ops when info_log is null.
+void Debug(Logger* info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::DEBUG_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+void Info(Logger* info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+void Warn(Logger* info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::WARN_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+void Error(Logger* info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::ERROR_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+void Fatal(Logger* info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::FATAL_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+// shared_ptr<Logger> overloads mirroring the raw-pointer family above.
+void LogFlush(const shared_ptr<Logger>& info_log) {
+  if (info_log) {
+    info_log->Flush();
+  }
+}
+
+void Log(const InfoLogLevel log_level, const shared_ptr<Logger>& info_log,
+         const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(log_level, format, ap);
+    va_end(ap);
+  }
+}
+
+void Debug(const shared_ptr<Logger>& info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::DEBUG_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+void Info(const shared_ptr<Logger>& info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+void Warn(const shared_ptr<Logger>& info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::WARN_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+void Error(const shared_ptr<Logger>& info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::ERROR_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+void Fatal(const shared_ptr<Logger>& info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::FATAL_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+// Level-less overload: logs at INFO.
+void Log(const shared_ptr<Logger>& info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+// Writes "data" to a fresh file named "fname", optionally syncing it.
+// On any failure the partially written file is deleted before returning.
+Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname,
+                         bool should_sync) {
+  unique_ptr<WritableFile> file;
+  EnvOptions soptions;
+  Status s = env->NewWritableFile(fname, &file, soptions);
+  if (!s.ok()) {
+    return s;
+  }
+  s = file->Append(data);
+  if (s.ok() && should_sync) {
+    s = file->Sync();
+  }
+  if (!s.ok()) {
+    // Best-effort cleanup: don't leave a truncated file behind.
+    env->DeleteFile(fname);
+  }
+  return s;
+}
+
+Status ReadFileToString(Env* env, const std::string& fname, std::string* data) {
+  EnvOptions soptions;
+  data->clear();
+  unique_ptr<SequentialFile> file;
+  Status s = env->NewSequentialFile(fname, &file, soptions);
+  if (!s.ok()) {
+    return s;
+  }
+  static const int kBufferSize = 8192;
+  char* space = new char[kBufferSize];
+  while (true) {
+    Slice fragment;
+    s = file->Read(kBufferSize, &fragment, space);
+    if (!s.ok()) {
+      break;
+    }
+    data->append(fragment.data(), fragment.size());
+    if (fragment.empty()) {
+      break;
+    }
+  }
+  delete[] space;
+  return s;
+}
+
+EnvWrapper::~EnvWrapper() {
+}
+
+namespace {  // anonymous namespace
+
+// Copies the env-related knobs from DBOptions into an EnvOptions.
+void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
+  env_options->use_os_buffer = options.allow_os_buffer;
+  env_options->use_mmap_reads = options.allow_mmap_reads;
+  env_options->use_mmap_writes = options.allow_mmap_writes;
+  env_options->set_fd_cloexec = options.is_fd_close_on_exec;
+  env_options->bytes_per_sync = options.bytes_per_sync;
+}
+
+}
+
+// Default implementations: no log/manifest-specific tuning; concrete
+// Envs may override to adjust the options.
+EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options) const {
+  return env_options;
+}
+
+EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const {
+  return env_options;
+}
+
+EnvOptions::EnvOptions(const DBOptions& options) {
+  AssignEnvOptions(this, options);
+}
+
+// Default-constructed EnvOptions take their values from default DBOptions.
+EnvOptions::EnvOptions() {
+  DBOptions options;
+  AssignEnvOptions(this, options);
+}
+
+
+}  // namespace rocksdb
diff --git a/util/env_hdfs.cc b/util/env_hdfs.cc
new file mode 100644 (file)
index 0000000..c724b23
--- /dev/null
@@ -0,0 +1,523 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifdef USE_HDFS
+#ifndef ROCKSDB_HDFS_FILE_C
+#define ROCKSDB_HDFS_FILE_C
+
+#include <algorithm>
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include <iostream>
+#include <sstream>
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "hdfs/hdfs.h"
+#include "hdfs/env_hdfs.h"
+
+//
+// This file defines an HDFS environment for rocksdb. It uses the libhdfs
+// api to access HDFS. All HDFS files created by one instance of rocksdb
+// will reside on the same HDFS cluster.
+//
+
+namespace rocksdb {
+
+namespace {
+
+// Log error message
+// Wraps an errno value into a Status::IOError with context.
+static Status IOError(const std::string& context, int err_number) {
+  return Status::IOError(context, strerror(err_number));
+}
+
+// assume that there is one global logger for now. It is not thread-safe,
+// but need not be because the logger is initialized at db-open time.
+static Logger* mylog = nullptr;
+
+// Used for reading a file from HDFS. It implements both sequential-read
+// access methods as well as random read access methods.
+class HdfsReadableFile: virtual public SequentialFile, virtual public RandomAccessFile {
+ private:
+  hdfsFS fileSys_;
+  std::string filename_;
+  hdfsFile hfile_;
+
+ public:
+  HdfsReadableFile(hdfsFS fileSys, const std::string& fname)
+      : fileSys_(fileSys), filename_(fname), hfile_(nullptr) {
+    Log(mylog, "[hdfs] HdfsReadableFile opening file %s\n",
+        filename_.c_str());
+    hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_RDONLY, 0, 0, 0);
+    Log(mylog, "[hdfs] HdfsReadableFile opened file %s hfile_=0x%p\n",
+            filename_.c_str(), hfile_);
+  }
+
+  virtual ~HdfsReadableFile() {
+    Log(mylog, "[hdfs] HdfsReadableFile closing file %s\n",
+       filename_.c_str());
+    hdfsCloseFile(fileSys_, hfile_);
+    Log(mylog, "[hdfs] HdfsReadableFile closed file %s\n",
+        filename_.c_str());
+    hfile_ = nullptr;
+  }
+
+  bool isValid() {
+    return hfile_ != nullptr;
+  }
+
+  // sequential access, read data at current offset in file
+  virtual Status Read(size_t n, Slice* result, char* scratch) {
+    Status s;
+    Log(mylog, "[hdfs] HdfsReadableFile reading %s %ld\n",
+        filename_.c_str(), n);
+    size_t bytes_read = hdfsRead(fileSys_, hfile_, scratch, (tSize)n);
+    Log(mylog, "[hdfs] HdfsReadableFile read %s\n", filename_.c_str());
+    *result = Slice(scratch, bytes_read);
+    if (bytes_read < n) {
+      if (feof()) {
+        // We leave status as ok if we hit the end of the file
+      } else {
+        // A partial read with an error: return a non-ok status
+        s = IOError(filename_, errno);
+      }
+    }
+    return s;
+  }
+
+  // random access, read data from specified offset in file
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    Status s;
+    Log(mylog, "[hdfs] HdfsReadableFile preading %s\n", filename_.c_str());
+    ssize_t bytes_read = hdfsPread(fileSys_, hfile_, offset,
+                                   (void*)scratch, (tSize)n);
+    Log(mylog, "[hdfs] HdfsReadableFile pread %s\n", filename_.c_str());
+    *result = Slice(scratch, (bytes_read < 0) ? 0 : bytes_read);
+    if (bytes_read < 0) {
+      // An error: return a non-ok status
+      s = IOError(filename_, errno);
+    }
+    return s;
+  }
+
+  virtual Status Skip(uint64_t n) {
+    Log(mylog, "[hdfs] HdfsReadableFile skip %s\n", filename_.c_str());
+    // get current offset from file
+    tOffset current = hdfsTell(fileSys_, hfile_);
+    if (current < 0) {
+      return IOError(filename_, errno);
+    }
+    // seek to new offset in file
+    tOffset newoffset = current + n;
+    int val = hdfsSeek(fileSys_, hfile_, newoffset);
+    if (val < 0) {
+      return IOError(filename_, errno);
+    }
+    return Status::OK();
+  }
+
+ private:
+
+  // returns true if we are at the end of file, false otherwise
+  bool feof() {
+    Log(mylog, "[hdfs] HdfsReadableFile feof %s\n", filename_.c_str());
+    if (hdfsTell(fileSys_, hfile_) == fileSize()) {
+      return true;
+    }
+    return false;
+  }
+
+  // the current size of the file
+  tOffset fileSize() {
+    Log(mylog, "[hdfs] HdfsReadableFile fileSize %s\n", filename_.c_str());
+    hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, filename_.c_str());
+    tOffset size = 0L;
+    if (pFileInfo != nullptr) {
+      size = pFileInfo->mSize;
+      hdfsFreeFileInfo(pFileInfo, 1);
+    } else {
+      throw rocksdb::HdfsFatalException("fileSize on unknown file " +
+                                            filename_);
+    }
+    return size;
+  }
+};
+
+// Appends to an existing file in HDFS.
+class HdfsWritableFile: public WritableFile {
+ private:
+  hdfsFS fileSys_;
+  std::string filename_;
+  hdfsFile hfile_;
+
+ public:
+  HdfsWritableFile(hdfsFS fileSys, const std::string& fname)
+      : fileSys_(fileSys), filename_(fname) , hfile_(nullptr) {
+    Log(mylog, "[hdfs] HdfsWritableFile opening %s\n", filename_.c_str());
+    hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_WRONLY, 0, 0, 0);
+    Log(mylog, "[hdfs] HdfsWritableFile opened %s\n", filename_.c_str());
+    assert(hfile_ != nullptr);
+  }
+  virtual ~HdfsWritableFile() {
+    if (hfile_ != nullptr) {
+      Log(mylog, "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str());
+      hdfsCloseFile(fileSys_, hfile_);
+      Log(mylog, "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str());
+      hfile_ = nullptr;
+    }
+  }
+
+  // If the file was successfully created, then this returns true.
+  // Otherwise returns false.
+  bool isValid() {
+    return hfile_ != nullptr;
+  }
+
+  // The name of the file, mostly needed for debug logging.
+  const std::string& getName() {
+    return filename_;
+  }
+
+  virtual Status Append(const Slice& data) {
+    Log(mylog, "[hdfs] HdfsWritableFile Append %s\n", filename_.c_str());
+    const char* src = data.data();
+    size_t left = data.size();
+    size_t ret = hdfsWrite(fileSys_, hfile_, src, left);
+    Log(mylog, "[hdfs] HdfsWritableFile Appended %s\n", filename_.c_str());
+    if (ret != left) {
+      return IOError(filename_, errno);
+    }
+    return Status::OK();
+  }
+
+  virtual Status Flush() {
+    return Status::OK();
+  }
+
+  virtual Status Sync() {
+    Status s;
+    Log(mylog, "[hdfs] HdfsWritableFile Sync %s\n", filename_.c_str());
+    if (hdfsFlush(fileSys_, hfile_) == -1) {
+      return IOError(filename_, errno);
+    }
+    if (hdfsSync(fileSys_, hfile_) == -1) {
+      return IOError(filename_, errno);
+    }
+    Log(mylog, "[hdfs] HdfsWritableFile Synced %s\n", filename_.c_str());
+    return Status::OK();
+  }
+
+  // This is used by HdfsLogger to write data to the debug log file
+  virtual Status Append(const char* src, size_t size) {
+    if (hdfsWrite(fileSys_, hfile_, src, size) != (tSize)size) {
+      return IOError(filename_, errno);
+    }
+    return Status::OK();
+  }
+
+  virtual Status Close() {
+    Log(mylog, "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str());
+    if (hdfsCloseFile(fileSys_, hfile_) != 0) {
+      return IOError(filename_, errno);
+    }
+    Log(mylog, "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str());
+    hfile_ = nullptr;
+    return Status::OK();
+  }
+};
+
+// The object that implements the debug logs to reside in HDFS.
+class HdfsLogger : public Logger {
+ private:
+  HdfsWritableFile* file_;
+  uint64_t (*gettid_)();  // Return the thread id for the current thread
+
+ public:
+  HdfsLogger(HdfsWritableFile* f, uint64_t (*gettid)(),
+             const InfoLogLevel log_level = InfoLogLevel::ERROR)
+      : Logger(log_level), file_(f), gettid_(gettid) {
+    Log(mylog, "[hdfs] HdfsLogger opened %s\n",
+            file_->getName().c_str());
+  }
+
+  virtual ~HdfsLogger() {
+    Log(mylog, "[hdfs] HdfsLogger closed %s\n",
+            file_->getName().c_str());
+    delete file_;
+    if (mylog != nullptr && mylog == this) {
+      mylog = nullptr;
+    }
+  }
+
+  virtual void Logv(const char* format, va_list ap) {
+    const uint64_t thread_id = (*gettid_)();
+
+    // We try twice: the first time with a fixed-size stack allocated buffer,
+    // and the second time with a much larger dynamically allocated buffer.
+    char buffer[500];
+    for (int iter = 0; iter < 2; iter++) {
+      char* base;
+      int bufsize;
+      if (iter == 0) {
+        bufsize = sizeof(buffer);
+        base = buffer;
+      } else {
+        bufsize = 30000;
+        base = new char[bufsize];
+      }
+      char* p = base;
+      char* limit = base + bufsize;
+
+      struct timeval now_tv;
+      gettimeofday(&now_tv, nullptr);
+      const time_t seconds = now_tv.tv_sec;
+      struct tm t;
+      localtime_r(&seconds, &t);
+      p += snprintf(p, limit - p,
+                    "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
+                    t.tm_year + 1900,
+                    t.tm_mon + 1,
+                    t.tm_mday,
+                    t.tm_hour,
+                    t.tm_min,
+                    t.tm_sec,
+                    static_cast<int>(now_tv.tv_usec),
+                    static_cast<long long unsigned int>(thread_id));
+
+      // Print the message
+      if (p < limit) {
+        va_list backup_ap;
+        va_copy(backup_ap, ap);
+        p += vsnprintf(p, limit - p, format, backup_ap);
+        va_end(backup_ap);
+      }
+
+      // Truncate to available space if necessary
+      if (p >= limit) {
+        if (iter == 0) {
+          continue;       // Try again with larger buffer
+        } else {
+          p = limit - 1;
+        }
+      }
+
+      // Add newline if necessary
+      if (p == base || p[-1] != '\n') {
+        *p++ = '\n';
+      }
+
+      assert(p <= limit);
+      file_->Append(base, p-base);
+      file_->Flush();
+      if (base != buffer) {
+        delete[] base;
+      }
+      break;
+    }
+  }
+};
+
+}  // namespace
+
+// Finally, the hdfs environment
+
+// open a file for sequential reading
+// Opens an HDFS file for sequential reading.
+// NOTE(review): `new` never returns nullptr, so the error branch is
+// effectively dead; consider checking f->isValid() as NewWritableFile
+// does.
+Status HdfsEnv::NewSequentialFile(const std::string& fname,
+                                 SequentialFile** result) {
+  HdfsReadableFile* f = new HdfsReadableFile(fileSys_, fname);
+  if (f == nullptr) {
+    *result = nullptr;
+    return IOError(fname, errno);
+  }
+  *result = dynamic_cast<SequentialFile*>(f);
+  return Status::OK();
+}
+
+// Opens an HDFS file for random (positional) reading.
+// NOTE(review): same dead nullptr check as above.
+Status HdfsEnv::NewRandomAccessFile(const std::string& fname,
+                                   RandomAccessFile** result) {
+  HdfsReadableFile* f = new HdfsReadableFile(fileSys_, fname);
+  if (f == nullptr) {
+    *result = nullptr;
+    return IOError(fname, errno);
+  }
+  *result = dynamic_cast<RandomAccessFile*>(f);
+  return Status::OK();
+}
+
+// Creates a new HDFS file for writing; fails if the open did not succeed.
+Status HdfsEnv::NewWritableFile(const std::string& fname,
+                               WritableFile** result) {
+  Status s;
+  HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname);
+  if (f == nullptr || !f->isValid()) {
+    *result = nullptr;
+    return IOError(fname, errno);
+  }
+  *result = dynamic_cast<WritableFile*>(f);
+  return Status::OK();
+}
+
+// Random read/write files are not supported by libhdfs.
+Status HdfsEnv::NewRandomRWFile(const std::string& fname,
+                                unique_ptr<RandomRWFile>* result,
+                                const EnvOptions& options) {
+  return Status::NotSupported("NewRandomRWFile not supported on HdfsEnv");
+}
+
+// Directory objects are not yet supported on HDFS.
+// Fix: this was written as a namespace-scope free function carrying a
+// `virtual` specifier and no class qualification, which does not compile;
+// it is the HdfsEnv override and must be qualified accordingly.
+Status HdfsEnv::NewDirectory(const std::string& name,
+                             unique_ptr<Directory>* result) {
+  return Status::NotSupported("NewDirectory not yet supported on HdfsEnv");
+}
+
+// Returns true iff hdfsExists reports the path as present.
+bool HdfsEnv::FileExists(const std::string& fname) {
+  int value = hdfsExists(fileSys_, fname.c_str());
+  if (value == 0) {
+    return true;
+  }
+  return false;
+}
+
+// Lists the basenames of the entries under "path".  A missing directory
+// yields an empty result; other failures throw HdfsFatalException.
+Status HdfsEnv::GetChildren(const std::string& path,
+                            std::vector<std::string>* result) {
+  int value = hdfsExists(fileSys_, path.c_str());
+  switch (value) {
+  case 0: {
+    int numEntries = 0;
+    hdfsFileInfo* pHdfsFileInfo = 0;
+    pHdfsFileInfo = hdfsListDirectory(fileSys_, path.c_str(), &numEntries);
+    if (numEntries >= 0) {
+      for(int i = 0; i < numEntries; i++) {
+        // Keep only the component after the last '/'.
+        char* pathname = pHdfsFileInfo[i].mName;
+        char* filename = rindex(pathname, '/');
+        if (filename != nullptr) {
+          result->push_back(filename+1);
+        }
+      }
+      if (pHdfsFileInfo != nullptr) {
+        hdfsFreeFileInfo(pHdfsFileInfo, numEntries);
+      }
+    } else {
+      // numEntries < 0 indicates error
+      Log(mylog, "hdfsListDirectory call failed with error ");
+      throw HdfsFatalException("hdfsListDirectory call failed negative error.\n");
+    }
+    break;
+  }
+  case 1:           // directory does not exist, exit
+    break;
+  default:          // anything else should be an error
+    Log(mylog, "hdfsListDirectory call failed with error ");
+    throw HdfsFatalException("hdfsListDirectory call failed with error.\n");
+  }
+  return Status::OK();
+}
+
+// NOTE(review): several definitions below end with a stray ';' after the
+// closing brace; harmless in C++11 but worth cleaning up.
+Status HdfsEnv::DeleteFile(const std::string& fname) {
+  if (hdfsDelete(fileSys_, fname.c_str()) == 0) {
+    return Status::OK();
+  }
+  return IOError(fname, errno);
+};
+
+Status HdfsEnv::CreateDir(const std::string& name) {
+  if (hdfsCreateDirectory(fileSys_, name.c_str()) == 0) {
+    return Status::OK();
+  }
+  return IOError(name, errno);
+};
+
+Status HdfsEnv::CreateDirIfMissing(const std::string& name) {
+  const int value = hdfsExists(fileSys_, name.c_str());
+  //  Not atomic. state might change b/w hdfsExists and CreateDir.
+  if (value == 0) {
+    return Status::OK();
+  } else {
+    return CreateDir(name);
+  }
+};
+
+// hdfsDelete handles directories as well, so DeleteDir just delegates.
+Status HdfsEnv::DeleteDir(const std::string& name) {
+  return DeleteFile(name);
+};
+
+// Fetches the file size via hdfsGetPathInfo; *size is 0 on failure.
+Status HdfsEnv::GetFileSize(const std::string& fname, uint64_t* size) {
+  *size = 0L;
+  hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, fname.c_str());
+  if (pFileInfo != nullptr) {
+    *size = pFileInfo->mSize;
+    hdfsFreeFileInfo(pFileInfo, 1);
+    return Status::OK();
+  }
+  return IOError(fname, errno);
+}
+
+// Fetches the last-modification time via hdfsGetPathInfo.
+Status HdfsEnv::GetFileModificationTime(const std::string& fname,
+                                        uint64_t* time) {
+  hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, fname.c_str());
+  if (pFileInfo != nullptr) {
+    *time = static_cast<uint64_t>(pFileInfo->mLastMod);
+    hdfsFreeFileInfo(pFileInfo, 1);
+    return Status::OK();
+  }
+  return IOError(fname, errno);
+
+}
+
+// The rename is not atomic. HDFS does not allow a renaming if the
+// target already exists. So, we delete the target before attemting the
+// rename.
+Status HdfsEnv::RenameFile(const std::string& src, const std::string& target) {
+  hdfsDelete(fileSys_, target.c_str());
+  if (hdfsRename(fileSys_, src.c_str(), target.c_str()) == 0) {
+    return Status::OK();
+  }
+  return IOError(src, errno);
+}
+
+// File locking is a no-op on HDFS (no atomic check-and-create available).
+Status HdfsEnv::LockFile(const std::string& fname, FileLock** lock) {
+  // there isn's a very good way to atomically check and create
+  // a file via libhdfs
+  *lock = nullptr;
+  return Status::OK();
+}
+
+Status HdfsEnv::UnlockFile(FileLock* lock) {
+  return Status::OK();
+}
+
+Status HdfsEnv::NewLogger(const std::string& fname,
+                          shared_ptr<Logger>* result) {
+  HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname);
+  if (f == nullptr || !f->isValid()) {
+    *result = nullptr;
+    return IOError(fname, errno);
+  }
+  HdfsLogger* h = new HdfsLogger(f, &HdfsEnv::gettid);
+  *result = h;
+  if (mylog == nullptr) {
+    // mylog = h; // uncomment this for detailed logging
+  }
+  return Status::OK();
+}
+
+}  // namespace rocksdb
+
+#endif // ROCKSDB_HDFS_FILE_C
+
+#else // USE_HDFS
+
+// dummy placeholders used when HDFS is not available
+#include "rocksdb/env.h"
+#include "hdfs/env_hdfs.h"
+namespace rocksdb {
+ // Stub used when the build lacks USE_HDFS: any attempt to use HdfsEnv
+ // fails with NotSupported at runtime.
+ Status HdfsEnv::NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options) {
+   return Status::NotSupported("Not compiled with hdfs support");
+ }
+}
+
+#endif
diff --git a/util/env_posix.cc b/util/env_posix.cc
new file mode 100644 (file)
index 0000000..5cbd5bd
--- /dev/null
@@ -0,0 +1,1654 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <deque>
+#include <set>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#ifdef OS_LINUX
+#include <sys/statfs.h>
+#endif
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+#if defined(OS_LINUX)
+#include <linux/fs.h>
+#include <fcntl.h>
+#endif
+#if defined(LEVELDB_PLATFORM_ANDROID)
+#include <sys/stat.h>
+#endif
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/posix_logger.h"
+#include "util/random.h"
+#include <signal.h>
+
+// Get nano time for mach systems
+#ifdef __MACH__
+#include <mach/clock.h>
+#include <mach/mach.h>
+#endif
+
+#if !defined(TMPFS_MAGIC)
+#define TMPFS_MAGIC 0x01021994
+#endif
+#if !defined(XFS_SUPER_MAGIC)
+#define XFS_SUPER_MAGIC 0x58465342
+#endif
+#if !defined(EXT4_SUPER_MAGIC)
+#define EXT4_SUPER_MAGIC 0xEF53
+#endif
+
+// For non linux platform, the following macros are used only as place
+// holder.
+#ifndef OS_LINUX
+#define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */
+#define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */
+#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
+#define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */
+#define POSIX_FADV_DONTNEED 4 /* [MC1] dont need these pages */
+#endif
+
+// This is only set from db_stress.cc and for testing only.
+// If non-zero, kill at various points in source code with probability 1/this
+int rocksdb_kill_odds = 0;
+
+namespace rocksdb {
+
+namespace {
+
+// A wrapper for posix_fadvise. On Linux it forwards to posix_fadvise and
+// returns its result (0 on success, an error number on failure). On
+// platforms without fadvise support it is a no-op that returns 0.
+int Fadvise(int fd, off_t offset, size_t len, int advice) {
+#ifdef OS_LINUX
+  return posix_fadvise(fd, offset, len, advice);
+#else
+  return 0;  // simply do nothing.
+#endif
+}
+
+// list of pathnames that are locked
+static std::set<std::string> lockedFiles;
+static port::Mutex mutex_lockedFiles;
+
+// Builds a Status::IOError that pairs the caller-supplied context (usually
+// a file name) with the strerror() text for the given errno value.
+static Status IOError(const std::string& context, int err_number) {
+  return Status::IOError(context, strerror(err_number));
+}
+
+#ifdef NDEBUG
+// empty in release build
+#define TEST_KILL_RANDOM(rocksdb_kill_odds)
+#else
+
+// Kill the process with probability 1/odds for testing (crash-recovery
+// tests driven from db_stress). Sends SIGTERM to the current process.
+// NOTE(review): the RNG is re-seeded with time(nullptr) on every call, so
+// all calls within the same second draw from the same sequence — acceptable
+// for test-only fault injection, but not uniformly random per call.
+static void TestKillRandom(int odds, const std::string& srcfile,
+                           int srcline) {
+  time_t curtime = time(nullptr);
+  Random r((uint32_t)curtime);
+
+  assert(odds > 0);
+  bool crash = r.OneIn(odds);
+  if (crash) {
+    fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline);
+    fflush(stdout);
+    kill(getpid(), SIGTERM);
+  }
+}
+
+// To avoid crashing always at some frequently executed codepaths (during
+// kill random test), use this factor to reduce odds
+#define REDUCE_ODDS 2
+#define REDUCE_ODDS2 4
+
+#define TEST_KILL_RANDOM(rocksdb_kill_odds) {   \
+  if (rocksdb_kill_odds > 0) { \
+    TestKillRandom(rocksdb_kill_odds, __FILE__, __LINE__);     \
+  } \
+}
+
+#endif
+
+#if defined(OS_LINUX)
+namespace {
+  // Encodes a unique id for the open file `fd` into id[0..max_size) and
+  // returns the number of bytes written, or 0 on failure (buffer too small,
+  // fstat failure, or FS_IOC_GETVERSION unsupported on this filesystem).
+  // The id is the varint-encoded triple (device, inode, generation), which
+  // changes when an inode is recycled — see FS_IOC_GETVERSION.
+  static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
+    if (max_size < kMaxVarint64Length*3) {
+      return 0;
+    }
+
+    struct stat buf;
+    int result = fstat(fd, &buf);
+    if (result == -1) {
+      return 0;
+    }
+
+    long version = 0;
+    result = ioctl(fd, FS_IOC_GETVERSION, &version);
+    if (result == -1) {
+      return 0;
+    }
+    uint64_t uversion = (uint64_t)version;
+
+    char* rid = id;
+    rid = EncodeVarint64(rid, buf.st_dev);
+    rid = EncodeVarint64(rid, buf.st_ino);
+    rid = EncodeVarint64(rid, uversion);
+    assert(rid >= id);
+    return static_cast<size_t>(rid-id);
+  }
+}
+#endif
+
+// Sequential reads via buffered stdio (fread_unlocked). EOF is not treated
+// as an error; short reads at end-of-file return OK with a short slice.
+class PosixSequentialFile: public SequentialFile {
+ private:
+  std::string filename_;
+  FILE* file_;
+  int fd_;              // fileno(file_), used only for fadvise calls
+  bool use_os_buffer_;  // if false, drop pages from the OS cache after reads
+
+ public:
+  PosixSequentialFile(const std::string& fname, FILE* f,
+      const EnvOptions& options)
+      : filename_(fname), file_(f), fd_(fileno(f)),
+        use_os_buffer_(options.use_os_buffer) {
+  }
+  virtual ~PosixSequentialFile() { fclose(file_); }
+
+  // Reads up to n bytes into scratch; *result points at what was read.
+  // Retries when fread is interrupted by a signal (EINTR).
+  virtual Status Read(size_t n, Slice* result, char* scratch) {
+    Status s;
+    size_t r = 0;
+    do {
+      r = fread_unlocked(scratch, 1, n, file_);
+    } while (r == 0 && ferror(file_) && errno == EINTR);
+    *result = Slice(scratch, r);
+    if (r < n) {
+      if (feof(file_)) {
+        // We leave status as ok if we hit the end of the file
+        // We also clear the error so that the reads can continue
+        // if a new data is written to the file
+        clearerr(file_);
+      } else {
+        // A partial read with an error: return a non-ok status
+        s = IOError(filename_, errno);
+      }
+    }
+    if (!use_os_buffer_) {
+      // we need to fadvise away the entire range of pages because
+      // we do not want readahead pages to be cached.
+      Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
+    }
+    return s;
+  }
+
+  // Advances the read position by n bytes without reading them.
+  virtual Status Skip(uint64_t n) {
+    if (fseek(file_, n, SEEK_CUR)) {
+      return IOError(filename_, errno);
+    }
+    return Status::OK();
+  }
+
+  // Drops [offset, offset+length) from the OS page cache (Linux only;
+  // a no-op success elsewhere).
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+#ifndef OS_LINUX
+    return Status::OK();
+#else
+    // free OS pages
+    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+    if (ret == 0) {
+      return Status::OK();
+    }
+    return IOError(filename_, errno);
+#endif
+  }
+};
+
+// pread() based random-access. Safe for concurrent Read() calls since
+// pread does not move a shared file offset.
+class PosixRandomAccessFile: public RandomAccessFile {
+ private:
+  std::string filename_;
+  int fd_;
+  bool use_os_buffer_;  // if false, drop pages from the OS cache after reads
+
+ public:
+  PosixRandomAccessFile(const std::string& fname, int fd,
+                        const EnvOptions& options)
+      : filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) {
+    assert(!options.use_mmap_reads);
+  }
+  virtual ~PosixRandomAccessFile() { close(fd_); }
+
+  // Reads up to n bytes starting at offset; retries on EINTR. On failure
+  // *result is an empty slice and an IOError is returned.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    Status s;
+    ssize_t r = -1;
+    do {
+      r = pread(fd_, scratch, n, static_cast<off_t>(offset));
+    } while (r < 0 && errno == EINTR);
+    *result = Slice(scratch, (r < 0) ? 0 : r);
+    if (r < 0) {
+      // An error: return a non-ok status
+      s = IOError(filename_, errno);
+    }
+    if (!use_os_buffer_) {
+      // we need to fadvise away the entire range of pages because
+      // we do not want readahead pages to be cached.
+      Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
+    }
+    return s;
+  }
+
+#ifdef OS_LINUX
+  // Unique id = (device, inode, generation) of the open file; see
+  // GetUniqueIdFromFile above.
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return GetUniqueIdFromFile(fd_, id, max_size);
+  }
+#endif
+
+  // Maps the abstract access-pattern hint onto the matching fadvise advice
+  // for the whole file (offset 0, length 0 = entire file).
+  virtual void Hint(AccessPattern pattern) {
+    switch(pattern) {
+      case NORMAL:
+        Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
+        break;
+      case RANDOM:
+        Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
+        break;
+      case SEQUENTIAL:
+        Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
+        break;
+      case WILLNEED:
+        Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
+        break;
+      case DONTNEED:
+        Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
+        break;
+      default:
+        assert(false);
+        break;
+    }
+  }
+
+  // Drops [offset, offset+length) from the OS page cache (Linux only).
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+#ifndef OS_LINUX
+    return Status::OK();
+#else
+    // free OS pages
+    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+    if (ret == 0) {
+      return Status::OK();
+    }
+    return IOError(filename_, errno);
+#endif
+  }
+};
+
+// mmap() based random-access: the whole file is mapped read-only at
+// construction and Read() is a pointer offset into the mapping, so no
+// syscall and no copy into scratch is needed.
+class PosixMmapReadableFile: public RandomAccessFile {
+ private:
+  int fd_;
+  std::string filename_;
+  void* mmapped_region_;  // base of the mapping; owned, munmap'd in dtor
+  size_t length_;         // mapping (= file) length in bytes
+
+ public:
+  // base[0,length-1] contains the mmapped contents of the file.
+  PosixMmapReadableFile(const int fd, const std::string& fname,
+                        void* base, size_t length,
+                        const EnvOptions& options)
+      : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
+    fd_ = fd_ + 0;  // suppress the warning for used variables
+    assert(options.use_mmap_reads);
+    assert(options.use_os_buffer);
+  }
+  virtual ~PosixMmapReadableFile() {
+    int ret = munmap(mmapped_region_, length_);
+    if (ret != 0) {
+      fprintf(stdout, "failed to munmap %p length %zu \n",
+              mmapped_region_, length_);
+    }
+  }
+
+  // Returns a slice aliasing the mapping; scratch is unused. Reads past
+  // the end of the mapping fail with EINVAL rather than being truncated.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    Status s;
+    if (offset + n > length_) {
+      *result = Slice();
+      s = IOError(filename_, EINVAL);
+    } else {
+      *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
+    }
+    return s;
+  }
+  // Drops [offset, offset+length) from the OS page cache (Linux only).
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+#ifndef OS_LINUX
+    return Status::OK();
+#else
+    // free OS pages
+    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+    if (ret == 0) {
+      return Status::OK();
+    }
+    return IOError(filename_, errno);
+#endif
+  }
+};
+
+// We preallocate up to an extra megabyte and use memcpy to append new
+// data to the file.  This is safe since we either properly close the
+// file before reading from it, or for log files, the reading code
+// knows enough to skip zero suffixes.
+//
+// Writes go through a sliding mmap'd window: [base_, limit_) maps the file
+// region starting at file_offset_; dst_ is the append cursor and last_sync_
+// tracks how far Sync() has already pushed data out. The window doubles in
+// size (up to 1MB) each time it is remapped.
+class PosixMmapFile : public WritableFile {
+ private:
+  std::string filename_;
+  int fd_;
+  size_t page_size_;
+  size_t map_size_;       // How much extra memory to map at a time
+  char* base_;            // The mapped region
+  char* limit_;           // Limit of the mapped region
+  char* dst_;             // Where to write next  (in range [base_,limit_])
+  char* last_sync_;       // Where have we synced up to
+  uint64_t file_offset_;  // Offset of base_ in file
+  // Have we done an munmap of unsynced data?
+  bool pending_sync_;
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  bool fallocate_with_keep_size_;
+#endif
+
+  // Roundup x to a multiple of y
+  static size_t Roundup(size_t x, size_t y) {
+    return ((x + y - 1) / y) * y;
+  }
+
+  // Rounds s down to a page boundary (page_size_ is a power of two,
+  // asserted in the constructor).
+  size_t TruncateToPageBoundary(size_t s) {
+    s -= (s & (page_size_ - 1));
+    assert((s % page_size_) == 0);
+    return s;
+  }
+
+  // Unmaps the current window, advancing file_offset_ past it and growing
+  // map_size_ for the next window. Returns false if munmap failed.
+  bool UnmapCurrentRegion() {
+    bool result = true;
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    if (base_ != nullptr) {
+      if (last_sync_ < limit_) {
+        // Defer syncing this data until next Sync() call, if any
+        pending_sync_ = true;
+      }
+      if (munmap(base_, limit_ - base_) != 0) {
+        result = false;
+      }
+      file_offset_ += limit_ - base_;
+      base_ = nullptr;
+      limit_ = nullptr;
+      last_sync_ = nullptr;
+      dst_ = nullptr;
+
+      // Increase the amount we map the next time, but capped at 1MB
+      if (map_size_ < (1<<20)) {
+        map_size_ *= 2;
+      }
+    }
+    return result;
+  }
+
+  // Allocates file space for the next window (fallocate, falling back to
+  // posix_fallocate) and maps it at file_offset_. Only supported when
+  // ROCKSDB_FALLOCATE_PRESENT is defined.
+  Status MapNewRegion() {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+    assert(base_ == nullptr);
+
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    // we can't fallocate with FALLOC_FL_KEEP_SIZE here
+    int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
+    if (alloc_status != 0) {
+      // fallback to posix_fallocate
+      alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
+    }
+    if (alloc_status != 0) {
+      return Status::IOError("Error allocating space to file : " + filename_ +
+        "Error : " + strerror(alloc_status));
+    }
+
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
+                     fd_, file_offset_);
+    if (ptr == MAP_FAILED) {
+      return Status::IOError("MMap failed on " + filename_);
+    }
+
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+
+    base_ = reinterpret_cast<char*>(ptr);
+    limit_ = base_ + map_size_;
+    dst_ = base_;
+    last_sync_ = base_;
+    return Status::OK();
+#else
+    return Status::NotSupported("This platform doesn't support fallocate()");
+#endif
+  }
+
+ public:
+  PosixMmapFile(const std::string& fname, int fd, size_t page_size,
+                const EnvOptions& options)
+      : filename_(fname),
+        fd_(fd),
+        page_size_(page_size),
+        map_size_(Roundup(65536, page_size)),
+        base_(nullptr),
+        limit_(nullptr),
+        dst_(nullptr),
+        last_sync_(nullptr),
+        file_offset_(0),
+        pending_sync_(false) {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+    fallocate_with_keep_size_ = options.fallocate_with_keep_size;
+#endif
+    assert((page_size & (page_size - 1)) == 0);
+    assert(options.use_mmap_writes);
+  }
+
+
+  ~PosixMmapFile() {
+    if (fd_ >= 0) {
+      PosixMmapFile::Close();
+    }
+  }
+
+  // Copies data into the mapped window, remapping a fresh window whenever
+  // the current one fills up. When the window is full the loop re-enters
+  // with avail == 0 and copies nothing on that iteration.
+  virtual Status Append(const Slice& data) {
+    const char* src = data.data();
+    size_t left = data.size();
+    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS);
+    PrepareWrite(GetFileSize(), left);
+    while (left > 0) {
+      assert(base_ <= dst_);
+      assert(dst_ <= limit_);
+      size_t avail = limit_ - dst_;
+      if (avail == 0) {
+        if (UnmapCurrentRegion()) {
+          Status s = MapNewRegion();
+          if (!s.ok()) {
+            return s;
+          }
+          TEST_KILL_RANDOM(rocksdb_kill_odds);
+        }
+      }
+
+      size_t n = (left <= avail) ? left : avail;
+      memcpy(dst_, src, n);
+      dst_ += n;
+      src += n;
+      left -= n;
+    }
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    return Status::OK();
+  }
+
+  // Unmaps the window and truncates away the unused tail of the last
+  // preallocated region, so the file size matches the data written.
+  virtual Status Close() {
+    Status s;
+    size_t unused = limit_ - dst_;
+
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+
+    if (!UnmapCurrentRegion()) {
+      s = IOError(filename_, errno);
+    } else if (unused > 0) {
+      // Trim the extra space at the end of the file
+      if (ftruncate(fd_, file_offset_ - unused) < 0) {
+        s = IOError(filename_, errno);
+      }
+    }
+
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+
+    if (close(fd_) < 0) {
+      if (s.ok()) {
+        s = IOError(filename_, errno);
+      }
+    }
+
+    fd_ = -1;
+    base_ = nullptr;
+    limit_ = nullptr;
+    return s;
+  }
+
+  // Nothing to flush: writes land directly in the shared mapping.
+  virtual Status Flush() {
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    return Status::OK();
+  }
+
+  // fdatasync()s any data that was unmapped before being synced, then
+  // msync()s the page range [last_sync_, dst_) of the current window.
+  virtual Status Sync() {
+    Status s;
+
+    if (pending_sync_) {
+      // Some unmapped data was not synced
+      TEST_KILL_RANDOM(rocksdb_kill_odds);
+      pending_sync_ = false;
+      if (fdatasync(fd_) < 0) {
+        s = IOError(filename_, errno);
+      }
+      TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS);
+    }
+
+    if (dst_ > last_sync_) {
+      // Find the beginnings of the pages that contain the first and last
+      // bytes to be synced.
+      size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
+      size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
+      last_sync_ = dst_;
+      TEST_KILL_RANDOM(rocksdb_kill_odds);
+      if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
+        s = IOError(filename_, errno);
+      }
+      TEST_KILL_RANDOM(rocksdb_kill_odds);
+    }
+
+    return s;
+  }
+
+  /**
+   * Flush data as well as metadata to stable storage.
+   */
+  virtual Status Fsync() {
+    if (pending_sync_) {
+      // Some unmapped data was not synced
+      TEST_KILL_RANDOM(rocksdb_kill_odds);
+      pending_sync_ = false;
+      if (fsync(fd_) < 0) {
+        return IOError(filename_, errno);
+      }
+      TEST_KILL_RANDOM(rocksdb_kill_odds);
+    }
+    // This invocation to Sync will not issue the call to
+    // fdatasync because pending_sync_ has already been cleared.
+    return Sync();
+  }
+
+  /**
+   * Get the size of valid data in the file. This will not match the
+   * size that is returned from the filesystem because we use mmap
+   * to extend file by map_size every time.
+   */
+  virtual uint64_t GetFileSize() {
+    size_t used = dst_ - base_;
+    return file_offset_ + used;
+  }
+
+  // Drops [offset, offset+length) from the OS page cache (Linux only).
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+#ifndef OS_LINUX
+    return Status::OK();
+#else
+    // free OS pages
+    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+    if (ret == 0) {
+      return Status::OK();
+    }
+    return IOError(filename_, errno);
+#endif
+  }
+
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  // Preallocates [offset, offset+len); with FALLOC_FL_KEEP_SIZE the
+  // apparent file size is left unchanged.
+  virtual Status Allocate(off_t offset, off_t len) {
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    int alloc_status = fallocate(
+        fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
+    if (alloc_status == 0) {
+      return Status::OK();
+    } else {
+      return IOError(filename_, errno);
+    }
+  }
+#endif
+};
+
+// Use posix write to write data to a file. Appends are staged in an
+// in-memory buffer (buf_, capacity_ bytes, doubling up to 1MB) and pushed
+// to the OS with write() on Flush/Close or when the buffer is full.
+// pending_sync_/pending_fsync_ let Sync/Fsync skip the syscall when no
+// write happened since the last one.
+class PosixWritableFile : public WritableFile {
+ private:
+  const std::string filename_;
+  int fd_;
+  size_t cursize_;      // current size of cached data in buf_
+  size_t capacity_;     // max size of buf_
+  unique_ptr<char[]> buf_;           // a buffer to cache writes
+  uint64_t filesize_;   // logical file size: bytes Append()ed so far
+  bool pending_sync_;
+  bool pending_fsync_;
+  uint64_t last_sync_size_;   // filesize_ at the last RangeSync
+  uint64_t bytes_per_sync_;   // RangeSync cadence; 0 disables it
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  bool fallocate_with_keep_size_;
+#endif
+
+ public:
+  PosixWritableFile(const std::string& fname, int fd, size_t capacity,
+                    const EnvOptions& options)
+      : filename_(fname),
+        fd_(fd),
+        cursize_(0),
+        capacity_(capacity),
+        buf_(new char[capacity]),
+        filesize_(0),
+        pending_sync_(false),
+        pending_fsync_(false),
+        last_sync_size_(0),
+        bytes_per_sync_(options.bytes_per_sync) {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+    fallocate_with_keep_size_ = options.fallocate_with_keep_size;
+#endif
+    assert(!options.use_mmap_writes);
+  }
+
+  ~PosixWritableFile() {
+    if (fd_ >= 0) {
+      PosixWritableFile::Close();
+    }
+  }
+
+  // Buffers small appends; flushes first when the data does not fit, and
+  // bypasses the buffer entirely (direct write() loop) for data larger
+  // than the (possibly grown) buffer.
+  virtual Status Append(const Slice& data) {
+    const char* src = data.data();
+    size_t left = data.size();
+    Status s;
+    pending_sync_ = true;
+    pending_fsync_ = true;
+
+    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
+
+    PrepareWrite(GetFileSize(), left);
+    // if there is no space in the cache, then flush
+    if (cursize_ + left > capacity_) {
+      s = Flush();
+      if (!s.ok()) {
+        return s;
+      }
+      // Increase the buffer size, but capped at 1MB
+      if (capacity_ < (1<<20)) {
+        capacity_ *= 2;
+        buf_.reset(new char[capacity_]);
+      }
+      assert(cursize_ == 0);
+    }
+
+    // if the write fits into the cache, then write to cache
+    // otherwise do a write() syscall to write to OS buffers.
+    if (cursize_ + left <= capacity_) {
+      memcpy(buf_.get()+cursize_, src, left);
+      cursize_ += left;
+    } else {
+      while (left != 0) {
+        ssize_t done = write(fd_, src, left);
+        if (done < 0) {
+          if (errno == EINTR) {
+            continue;
+          }
+          return IOError(filename_, errno);
+        }
+        TEST_KILL_RANDOM(rocksdb_kill_odds);
+
+        left -= done;
+        src += done;
+      }
+    }
+    filesize_ += data.size();
+    return Status::OK();
+  }
+
+  // Flushes buffered data, trims any preallocated-but-unwritten tail of
+  // the file, and closes the descriptor.
+  virtual Status Close() {
+    Status s;
+    s = Flush(); // flush cache to OS
+    if (!s.ok()) {
+      return s;
+    }
+
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+
+    size_t block_size;
+    size_t last_allocated_block;
+    GetPreallocationStatus(&block_size, &last_allocated_block);
+    if (last_allocated_block > 0) {
+      // trim the extra space preallocated at the end of the file
+      int dummy __attribute__((unused));
+      dummy = ftruncate(fd_, filesize_);  // ignore errors
+    }
+
+    if (close(fd_) < 0) {
+      if (s.ok()) {
+        s = IOError(filename_, errno);
+      }
+    }
+    fd_ = -1;
+    return s;
+  }
+
+  // write out the cached data to the OS cache
+  virtual Status Flush() {
+    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
+    size_t left = cursize_;
+    char* src = buf_.get();
+    while (left != 0) {
+      ssize_t done = write(fd_, src, left);
+      if (done < 0) {
+        if (errno == EINTR) {
+          continue;
+        }
+        return IOError(filename_, errno);
+      }
+      TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
+      left -= done;
+      src += done;
+    }
+    cursize_ = 0;
+
+    // sync OS cache to disk for every bytes_per_sync_
+    // TODO: give log file and sst file different options (log
+    // files could be potentially cached in OS for their whole
+    // life time, thus we might not want to flush at all).
+    if (bytes_per_sync_ &&
+        filesize_ - last_sync_size_ >= bytes_per_sync_) {
+      RangeSync(last_sync_size_, filesize_ - last_sync_size_);
+      last_sync_size_ = filesize_;
+    }
+
+    return Status::OK();
+  }
+
+  // Flush + fdatasync (data only; skipped if nothing was written since
+  // the last sync).
+  virtual Status Sync() {
+    Status s = Flush();
+    if (!s.ok()) {
+      return s;
+    }
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    if (pending_sync_ && fdatasync(fd_) < 0) {
+      return IOError(filename_, errno);
+    }
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    pending_sync_ = false;
+    return Status::OK();
+  }
+
+  // Flush + fsync (data and metadata); clears both pending flags.
+  virtual Status Fsync() {
+    Status s = Flush();
+    if (!s.ok()) {
+      return s;
+    }
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    if (pending_fsync_ && fsync(fd_) < 0) {
+      return IOError(filename_, errno);
+    }
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    pending_fsync_ = false;
+    pending_sync_ = false;
+    return Status::OK();
+  }
+
+  // Logical size (bytes appended), not the on-disk size, which may be
+  // larger due to preallocation.
+  virtual uint64_t GetFileSize() {
+    return filesize_;
+  }
+
+  // Drops [offset, offset+length) from the OS page cache (Linux only).
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+#ifndef OS_LINUX
+    return Status::OK();
+#else
+    // free OS pages
+    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+    if (ret == 0) {
+      return Status::OK();
+    }
+    return IOError(filename_, errno);
+#endif
+  }
+
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  // Preallocates [offset, offset+len); with FALLOC_FL_KEEP_SIZE the
+  // apparent file size is left unchanged.
+  virtual Status Allocate(off_t offset, off_t len) {
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    int alloc_status = fallocate(
+        fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
+    if (alloc_status == 0) {
+      return Status::OK();
+    } else {
+      return IOError(filename_, errno);
+    }
+  }
+
+  // Asynchronously initiates writeback of the byte range (does not wait
+  // for completion; see sync_file_range(2) with SYNC_FILE_RANGE_WRITE).
+  virtual Status RangeSync(off64_t offset, off64_t nbytes) {
+    if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) {
+      return Status::OK();
+    } else {
+      return IOError(filename_, errno);
+    }
+  }
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return GetUniqueIdFromFile(fd_, id, max_size);
+  }
+#endif
+};
+
+// Unbuffered random read/write file based on pread/pwrite. No mmap
+// support (asserted in the constructor). pending_sync_/pending_fsync_
+// let Sync/Fsync skip the syscall when nothing was written.
+class PosixRandomRWFile : public RandomRWFile {
+ private:
+  const std::string filename_;
+  int fd_;
+  bool pending_sync_;
+  bool pending_fsync_;
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  bool fallocate_with_keep_size_;
+#endif
+
+ public:
+  PosixRandomRWFile(const std::string& fname, int fd, const EnvOptions& options)
+      : filename_(fname),
+        fd_(fd),
+        pending_sync_(false),
+        pending_fsync_(false) {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+    fallocate_with_keep_size_ = options.fallocate_with_keep_size;
+#endif
+    assert(!options.use_mmap_writes && !options.use_mmap_reads);
+  }
+
+  ~PosixRandomRWFile() {
+    if (fd_ >= 0) {
+      Close();
+    }
+  }
+
+  // Writes all of data at the given offset, looping over partial writes
+  // and retrying on EINTR.
+  virtual Status Write(uint64_t offset, const Slice& data) {
+    const char* src = data.data();
+    size_t left = data.size();
+    Status s;
+    pending_sync_ = true;
+    pending_fsync_ = true;
+
+    while (left != 0) {
+      ssize_t done = pwrite(fd_, src, left, offset);
+      if (done < 0) {
+        if (errno == EINTR) {
+          continue;
+        }
+        return IOError(filename_, errno);
+      }
+
+      left -= done;
+      src += done;
+      offset += done;
+    }
+
+    return Status::OK();
+  }
+
+  // Single pread at the given offset; *result is empty on error.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    Status s;
+    ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset));
+    *result = Slice(scratch, (r < 0) ? 0 : r);
+    if (r < 0) {
+      s = IOError(filename_, errno);
+    }
+    return s;
+  }
+
+  virtual Status Close() {
+    Status s = Status::OK();
+    if (fd_ >= 0 && close(fd_) < 0) {
+      s = IOError(filename_, errno);
+    }
+    fd_ = -1;
+    return s;
+  }
+
+  // fdatasync (data only), skipped if nothing was written since last sync.
+  virtual Status Sync() {
+    if (pending_sync_ && fdatasync(fd_) < 0) {
+      return IOError(filename_, errno);
+    }
+    pending_sync_ = false;
+    return Status::OK();
+  }
+
+  // fsync (data and metadata); clears both pending flags.
+  virtual Status Fsync() {
+    if (pending_fsync_ && fsync(fd_) < 0) {
+      return IOError(filename_, errno);
+    }
+    pending_fsync_ = false;
+    pending_sync_ = false;
+    return Status::OK();
+  }
+
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  // Preallocates [offset, offset+len); with FALLOC_FL_KEEP_SIZE the
+  // apparent file size is left unchanged.
+  virtual Status Allocate(off_t offset, off_t len) {
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    int alloc_status = fallocate(
+        fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
+    if (alloc_status == 0) {
+      return Status::OK();
+    } else {
+      return IOError(filename_, errno);
+    }
+  }
+#endif
+};
+
+// Directory handle used only to fsync directory metadata (e.g. after a
+// file create/rename inside it). Owns and closes the descriptor.
+class PosixDirectory : public Directory {
+ public:
+  // Takes ownership of fd, an open directory file descriptor.
+  explicit PosixDirectory(int fd) : fd_(fd) {}
+
+  ~PosixDirectory() { close(fd_); }
+
+  // Force the directory's metadata out to stable storage.
+  virtual Status Fsync() {
+    return (fsync(fd_) == -1) ? IOError("directory", errno) : Status::OK();
+  }
+
+ private:
+  int fd_;
+};
+
+// Acquires (lock == true) or releases a whole-file fcntl write lock on fd,
+// returning the fcntl result (0 on success, -1 with errno set on failure).
+// A process-wide set of locked pathnames guards against the same process
+// re-locking a file, since fcntl locks do not conflict within one process.
+static int LockOrUnlock(const std::string& fname, int fd, bool lock) {
+  mutex_lockedFiles.Lock();
+  if (lock) {
+    // If it already exists in the lockedFiles set, then it is already locked,
+    // and fail this lock attempt. Otherwise, insert it into lockedFiles.
+    // This check is needed because fcntl() does not detect lock conflict
+    // if the fcntl is issued by the same thread that earlier acquired
+    // this lock.
+    if (lockedFiles.insert(fname).second == false) {
+      mutex_lockedFiles.Unlock();
+      errno = ENOLCK;
+      return -1;
+    }
+  } else {
+    // If we are unlocking, then verify that we had locked it earlier,
+    // it should already exist in lockedFiles. Remove it from lockedFiles.
+    if (lockedFiles.erase(fname) != 1) {
+      mutex_lockedFiles.Unlock();
+      errno = ENOLCK;
+      return -1;
+    }
+  }
+  errno = 0;
+  struct flock f;
+  memset(&f, 0, sizeof(f));
+  f.l_type = (lock ? F_WRLCK : F_UNLCK);
+  f.l_whence = SEEK_SET;
+  f.l_start = 0;
+  f.l_len = 0;        // Lock/unlock entire file
+  int value = fcntl(fd, F_SETLK, &f);
+  if (value == -1 && lock) {
+    // if there is an error in locking, then remove the pathname from lockedfiles
+    lockedFiles.erase(fname);
+  }
+  mutex_lockedFiles.Unlock();
+  return value;
+}
+
+// Opaque lock token handed out by LockFile: records the descriptor holding
+// the fcntl lock and the pathname tracked in lockedFiles.
+class PosixFileLock : public FileLock {
+ public:
+  int fd_;
+  std::string filename;
+};
+
+
+namespace {
+// Aborts the process if a pthread call failed; pthread functions return
+// the error number directly rather than setting errno.
+void PthreadCall(const char* label, int result) {
+  if (result != 0) {
+    fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
+    exit(1);
+  }
+}
+}
+
+class PosixEnv : public Env {
+ public:
+  PosixEnv();
+
+  // Waits for every background thread this Env spawned before tearing down.
+  virtual ~PosixEnv(){
+    for (const auto tid : threads_to_join_) {
+      pthread_join(tid, nullptr);
+    }
+  }
+
+  // Marks fd close-on-exec unless the caller opted out via
+  // options->set_fd_cloexec. fd 0 (and negative fds) are left untouched.
+  void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
+    if ((options == nullptr || options->set_fd_cloexec) && fd > 0) {
+      fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
+    }
+  }
+
+  // Opens fname read-only via stdio, retrying fopen on EINTR. On success
+  // *result owns a PosixSequentialFile; on failure it is cleared.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options) {
+    result->reset();
+    FILE* f = nullptr;
+    do {
+      f = fopen(fname.c_str(), "r");
+    } while (f == nullptr && errno == EINTR);
+    if (f == nullptr) {
+      *result = nullptr;  // NOTE(review): redundant — reset() above already cleared it
+      return IOError(fname, errno);
+    } else {
+      int fd = fileno(f);
+      SetFD_CLOEXEC(fd, &options);
+      result->reset(new PosixSequentialFile(fname, f, options));
+      return Status::OK();
+    }
+  }
+
+  // Opens fname for random reads. Uses an mmap-backed reader when
+  // options.use_mmap_reads is set and the address space is 64-bit
+  // (the fd is closed after mapping — the mapping keeps the file alive);
+  // otherwise uses a pread-based reader that owns the fd.
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options) {
+    result->reset();
+    Status s;
+    int fd = open(fname.c_str(), O_RDONLY);
+    SetFD_CLOEXEC(fd, &options);
+    if (fd < 0) {
+      s = IOError(fname, errno);
+    } else if (options.use_mmap_reads && sizeof(void*) >= 8) {
+      // Use of mmap for random reads has been removed because it
+      // kills performance when storage is fast.
+      // Use mmap when virtual address-space is plentiful.
+      uint64_t size;
+      s = GetFileSize(fname, &size);
+      if (s.ok()) {
+        void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
+        if (base != MAP_FAILED) {
+          result->reset(new PosixMmapReadableFile(fd, fname, base,
+                                                  size, options));
+        } else {
+          s = IOError(fname, errno);
+        }
+      }
+      close(fd);
+    } else {
+      result->reset(new PosixRandomAccessFile(fname, fd, options));
+    }
+    return s;
+  }
+
+  // Creates (truncating) fname for writing, retrying open on EINTR.
+  // Chooses the mmap-based writer when options.use_mmap_writes is set and
+  // the filesystem supports fast preallocation (checked once per process);
+  // otherwise falls back to the buffered write()-based writer.
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) {
+    result->reset();
+    Status s;
+    int fd = -1;
+    do {
+      fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+    } while (fd < 0 && errno == EINTR);
+    if (fd < 0) {
+      s = IOError(fname, errno);
+    } else {
+      SetFD_CLOEXEC(fd, &options);
+      if (options.use_mmap_writes) {
+        if (!checkedDiskForMmap_) {
+          // this will be executed once in the program's lifetime.
+          // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
+          if (!SupportsFastAllocate(fname)) {
+            forceMmapOff = true;
+          }
+          checkedDiskForMmap_ = true;
+        }
+      }
+      if (options.use_mmap_writes && !forceMmapOff) {
+        result->reset(new PosixMmapFile(fname, fd, page_size_, options));
+      } else {
+        // disable mmap writes
+        EnvOptions no_mmap_writes_options = options;
+        no_mmap_writes_options.use_mmap_writes = false;
+
+        result->reset(
+            new PosixWritableFile(fname, fd, 65536, no_mmap_writes_options)
+        );
+      }
+    }
+    return s;
+  }
+
+  // Opens (creating if absent, without truncating) fname for random
+  // read/write access. mmap modes are rejected up front.
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options) {
+    result->reset();
+    // no support for mmap yet
+    if (options.use_mmap_writes || options.use_mmap_reads) {
+      return Status::NotSupported("No support for mmap read/write yet");
+    }
+    Status s;
+    const int fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644);
+    if (fd < 0) {
+      s = IOError(fname, errno);
+    } else {
+      SetFD_CLOEXEC(fd, &options);
+      result->reset(new PosixRandomRWFile(fname, fd, options));
+    }
+    return s;
+  }
+
+  // Opens the directory `name` (read-only open with no flags) so callers
+  // can fsync its metadata through the returned PosixDirectory.
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result) {
+    result->reset();
+    const int fd = open(name.c_str(), 0);
+    if (fd < 0) {
+      return IOError(name, errno);
+    } else {
+      result->reset(new PosixDirectory(fd));
+    }
+    return Status::OK();
+  }
+
+  // Existence check via access(2); true iff the path is reachable.
+  virtual bool FileExists(const std::string& fname) {
+    return access(fname.c_str(), F_OK) == 0;
+  }
+
+  // List all entries of `dir` into *result. Per readdir(3) semantics the
+  // listing includes "." and "..".
+  // NOTE(review): a readdir() failure mid-iteration is indistinguishable from
+  // end-of-stream here (errno is not checked after the loop) — confirm callers
+  // tolerate a silently truncated listing.
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) {
+    result->clear();
+    DIR* d = opendir(dir.c_str());
+    if (d == nullptr) {
+      return IOError(dir, errno);
+    }
+    struct dirent* entry;
+    while ((entry = readdir(d)) != nullptr) {
+      result->push_back(entry->d_name);
+    }
+    closedir(d);
+    return Status::OK();
+  }
+
+  // Remove the file `fname` via unlink(2).
+  virtual Status DeleteFile(const std::string& fname) {
+    Status result;
+    if (unlink(fname.c_str()) != 0) {
+      result = IOError(fname, errno);
+    }
+    return result;
+  };
+
+  // Create directory `name` with mode 0755; fails if it already exists.
+  virtual Status CreateDir(const std::string& name) {
+    Status result;
+    if (mkdir(name.c_str(), 0755) != 0) {
+      result = IOError(name, errno);
+    }
+    return result;
+  };
+
+  // Like CreateDir, but EEXIST is tolerated as long as the existing path is
+  // actually a directory (checked via DirExists).
+  virtual Status CreateDirIfMissing(const std::string& name) {
+    Status result;
+    if (mkdir(name.c_str(), 0755) != 0) {
+      if (errno != EEXIST) {
+        result = IOError(name, errno);
+      } else if (!DirExists(name)) { // Check that name is actually a
+                                     // directory.
+        // Message is taken from mkdir
+        result = Status::IOError("`"+name+"' exists but is not a directory");
+      }
+    }
+    return result;
+  };
+
+  // Remove the (empty) directory `name` via rmdir(2).
+  virtual Status DeleteDir(const std::string& name) {
+    Status result;
+    if (rmdir(name.c_str()) != 0) {
+      result = IOError(name, errno);
+    }
+    return result;
+  };
+
+  // Report the size of `fname` in bytes; on stat failure *size is zeroed and
+  // an IOError is returned.
+  virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
+    Status s;
+    struct stat sbuf;
+    if (stat(fname.c_str(), &sbuf) != 0) {
+      *size = 0;
+      s = IOError(fname, errno);
+    } else {
+      *size = sbuf.st_size;
+    }
+    return s;
+  }
+
+  // Report the last-modification time (st_mtime, seconds) of `fname`.
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime) {
+    struct stat s;
+    if (stat(fname.c_str(), &s) !=0) {
+      return IOError(fname, errno);
+    }
+    *file_mtime = static_cast<uint64_t>(s.st_mtime);
+    return Status::OK();
+  }
+  // Atomically rename `src` to `target` via rename(2); the error (if any) is
+  // reported against the source path.
+  virtual Status RenameFile(const std::string& src, const std::string& target) {
+    Status result;
+    if (rename(src.c_str(), target.c_str()) != 0) {
+      result = IOError(src, errno);
+    }
+    return result;
+  }
+
+  // Acquire an advisory lock on `fname` (creating the file if needed) and
+  // hand ownership of the open fd to a new PosixFileLock on success.
+  // On lock failure the fd is closed before returning; *lock stays nullptr
+  // unless the whole sequence succeeds.
+  virtual Status LockFile(const std::string& fname, FileLock** lock) {
+    *lock = nullptr;
+    Status result;
+    int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
+    if (fd < 0) {
+      result = IOError(fname, errno);
+    } else if (LockOrUnlock(fname, fd, true) == -1) {
+      result = IOError("lock " + fname, errno);
+      close(fd);
+    } else {
+      SetFD_CLOEXEC(fd, nullptr);
+      PosixFileLock* my_lock = new PosixFileLock;
+      my_lock->fd_ = fd;
+      my_lock->filename = fname;
+      *lock = my_lock;
+    }
+    return result;
+  }
+
+  // Release a lock previously returned by LockFile. The fd is always closed
+  // and the PosixFileLock freed, even if the unlock call itself fails.
+  virtual Status UnlockFile(FileLock* lock) {
+    PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
+    Status result;
+    if (LockOrUnlock(my_lock->filename, my_lock->fd_, false) == -1) {
+      result = IOError("unlock", errno);
+    }
+    close(my_lock->fd_);
+    delete my_lock;
+    return result;
+  }
+
+  // Queue `function(arg)` on the background thread pool for `pri`.
+  // Defined out-of-line below (delegates to thread_pools_[pri]).
+  virtual void Schedule(void (*function)(void*), void* arg, Priority pri = LOW);
+
+  // Run `function(arg)` on a dedicated, joinable thread (see WaitForJoin).
+  virtual void StartThread(void (*function)(void* arg), void* arg);
+
+  // Join every thread created via StartThread.
+  virtual void WaitForJoin();
+
+  // NOTE(review): only this declaration is marked `override`; the sibling
+  // virtuals above are not — consider making the annotations consistent.
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override;
+
+  // Pick a per-user scratch directory for tests: $TEST_TMPDIR if set,
+  // otherwise /tmp/rocksdbtest-<euid>. The directory is created best-effort;
+  // CreateDir's status is deliberately ignored since it may already exist.
+  virtual Status GetTestDirectory(std::string* result) {
+    const char* env = getenv("TEST_TMPDIR");
+    if (env && env[0] != '\0') {
+      *result = env;
+    } else {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid()));
+      *result = buf;
+    }
+    // Directory may already exist
+    CreateDir(*result);
+    return Status::OK();
+  }
+
+  // Derive a numeric thread id by copying the raw bytes of pthread_self()
+  // into a uint64_t (pthread_t is opaque and may be smaller or larger than
+  // 8 bytes, hence the min() on the copy size).
+  static uint64_t gettid() {
+    pthread_t tid = pthread_self();
+    uint64_t thread_id = 0;
+    memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
+    return thread_id;
+  }
+
+  // Create an info-log writer backed by a freshly truncated stdio FILE.
+  // The PosixLogger takes ownership of `f`; gettid is passed so log lines
+  // can be tagged with the writing thread.
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result) {
+    FILE* f = fopen(fname.c_str(), "w");
+    if (f == nullptr) {
+      result->reset();
+      return IOError(fname, errno);
+    } else {
+      int fd = fileno(f);
+      SetFD_CLOEXEC(fd, nullptr);
+      result->reset(new PosixLogger(f, &PosixEnv::gettid, this));
+      return Status::OK();
+    }
+  }
+
+  // Wall-clock time in microseconds since the epoch (gettimeofday based).
+  virtual uint64_t NowMicros() {
+    struct timeval tv;
+    // TODO(kailiu) MAC DON'T HAVE THIS
+    gettimeofday(&tv, nullptr);
+    return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+  }
+
+  // Monotonic (Linux) or Mach calendar-clock (OS X) time in nanoseconds.
+  // NOTE(review): on Linux the early return makes the trailing return
+  // unreachable, and on a platform defining neither OS_LINUX nor __MACH__
+  // `ts` is never declared, so the trailing return would not compile there —
+  // confirm the supported-platform set.
+  virtual uint64_t NowNanos() {
+#ifdef OS_LINUX
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+#elif __MACH__
+    clock_serv_t cclock;
+    mach_timespec_t ts;
+    host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
+    clock_get_time(cclock, &ts);
+    mach_port_deallocate(mach_task_self(), cclock);
+#endif
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+  }
+
+  // Block the calling thread for roughly `micros` microseconds.
+  virtual void SleepForMicroseconds(int micros) {
+    usleep(micros);
+  }
+
+  // Copy the local host name into `name` (capacity `len`). Bad-argument
+  // errnos map to InvalidArgument; anything else becomes an IOError.
+  virtual Status GetHostName(char* name, uint64_t len) {
+    int ret = gethostname(name, len);
+    if (ret < 0) {
+      if (errno == EFAULT || errno == EINVAL)
+        return Status::InvalidArgument(strerror(errno));
+      else
+        return IOError("GetHostName", errno);
+    }
+    return Status::OK();
+  }
+
+  // Current wall-clock time as seconds since the epoch.
+  virtual Status GetCurrentTime(int64_t* unix_time) {
+    time_t ret = time(nullptr);
+    if (ret == (time_t) -1) {
+      return IOError("GetCurrentTime", errno);
+    }
+    *unix_time = (int64_t) ret;
+    return Status::OK();
+  }
+
+  // Resolve `db_path` to an absolute path. Already-absolute inputs (leading
+  // '/') pass through unchanged; otherwise the current working directory is
+  // returned (note: the relative component is not appended).
+  virtual Status GetAbsolutePath(const std::string& db_path,
+      std::string* output_path) {
+    if (db_path.find('/') == 0) {
+      *output_path = db_path;
+      return Status::OK();
+    }
+
+    // getcwd fails with ERANGE if the cwd exceeds this fixed 256-byte buffer.
+    char the_path[256];
+    char* ret = getcwd(the_path, 256);
+    if (ret == nullptr) {
+      return Status::IOError(strerror(errno));
+    }
+
+    *output_path = ret;
+    return Status::OK();
+  }
+
+  // Allow increasing the number of worker threads.
+  // Delegates to the per-priority pool; the pool only ever grows its limit.
+  virtual void SetBackgroundThreads(int num, Priority pri) {
+    assert(pri >= Priority::LOW && pri <= Priority::HIGH);
+    thread_pools_[pri].SetBackgroundThreads(num);
+  }
+
+  // Format an epoch-seconds value as "YYYY/MM/DD-HH:MM:SS " in local time.
+  // NOTE(review): the string is resized to 64 before snprintf, so the
+  // returned value keeps length 64 including the bytes after the NUL
+  // terminator — confirm callers treat it as a C string rather than relying
+  // on std::string::size().
+  virtual std::string TimeToString(uint64_t secondsSince1970) {
+    const time_t seconds = (time_t)secondsSince1970;
+    struct tm t;
+    int maxsize = 64;
+    std::string dummy;
+    dummy.reserve(maxsize);
+    dummy.resize(maxsize);
+    char* p = &dummy[0];
+    localtime_r(&seconds, &t);
+    snprintf(p, maxsize,
+             "%04d/%02d/%02d-%02d:%02d:%02d ",
+             t.tm_year + 1900,
+             t.tm_mon + 1,
+             t.tm_mday,
+             t.tm_hour,
+             t.tm_min,
+             t.tm_sec);
+    return dummy;
+  }
+
+  // Tune EnvOptions for WAL writes: mmap writes off, fallocate keeps size.
+  EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const {
+    EnvOptions optimized = env_options;
+    optimized.use_mmap_writes = false;
+    // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
+    // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
+    // test and make this false
+    optimized.fallocate_with_keep_size = true;
+    return optimized;
+  }
+
+  // Same tuning for MANIFEST writes.
+  EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options) const {
+    EnvOptions optimized = env_options;
+    optimized.use_mmap_writes = false;
+    optimized.fallocate_with_keep_size = true;
+    return optimized;
+  }
+
+ private:
+  // Set once the filesystem has been probed via SupportsFastAllocate (the
+  // check in NewWritableFile runs only the first time).
+  bool checkedDiskForMmap_;
+  bool forceMmapOff; // do we override Env options?
+
+
+  // Returns true iff the named directory exists and is a directory.
+  virtual bool DirExists(const std::string& dname) {
+    struct stat statbuf;
+    if (stat(dname.c_str(), &statbuf) == 0) {
+      return S_ISDIR(statbuf.st_mode);
+    }
+    return false; // stat() failed return false
+  }
+
+  // Whitelist filesystems (ext4/xfs/tmpfs, by statfs f_type magic) that are
+  // known to handle fallocate well; everywhere else mmap writes get forced
+  // off. Always false when fallocate support wasn't detected at build time.
+  bool SupportsFastAllocate(const std::string& path) {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+    struct statfs s;
+    if (statfs(path.c_str(), &s)){
+      return false;
+    }
+    switch (s.f_type) {
+      case EXT4_SUPER_MAGIC:
+        return true;
+      case XFS_SUPER_MAGIC:
+        return true;
+      case TMPFS_MAGIC:
+        return true;
+      default:
+        return false;
+    }
+#else
+    return false;
+#endif
+  }
+
+  size_t page_size_;
+
+
+  // Fixed-purpose background worker pool: a mutex/condvar-protected FIFO of
+  // (function, arg) items drained by lazily spawned pthreads. Thread count
+  // only ever grows (SetBackgroundThreads raises the limit; threads are
+  // created on demand in Schedule). queue_len_ mirrors queue_.size() so
+  // GetQueueLen can report it without taking the mutex.
+  class ThreadPool {
+   public:
+    ThreadPool()
+        : total_threads_limit_(1),
+          bgthreads_(0),
+          queue_(),
+          queue_len_(0),
+          exit_all_threads_(false) {
+      PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
+      PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, nullptr));
+    }
+
+    // Signals all workers to exit and joins them; queued-but-unstarted items
+    // are dropped.
+    ~ThreadPool() {
+      PthreadCall("lock", pthread_mutex_lock(&mu_));
+      assert(!exit_all_threads_);
+      exit_all_threads_ = true;
+      PthreadCall("signalall", pthread_cond_broadcast(&bgsignal_));
+      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+      for (const auto tid : bgthreads_) {
+        pthread_join(tid, nullptr);
+      }
+    }
+
+    // Worker loop: pop one item under the lock, run it unlocked, repeat
+    // until exit_all_threads_ is observed.
+    void BGThread() {
+      while (true) {
+        // Wait until there is an item that is ready to run
+        PthreadCall("lock", pthread_mutex_lock(&mu_));
+        while (queue_.empty() && !exit_all_threads_) {
+          PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_));
+        }
+        if (exit_all_threads_) { // mechanism to let BG threads exit safely
+          PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+          break;
+        }
+        void (*function)(void*) = queue_.front().function;
+        void* arg = queue_.front().arg;
+        queue_.pop_front();
+        queue_len_.store(queue_.size(), std::memory_order_relaxed);
+
+        PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+        (*function)(arg);
+      }
+    }
+
+    // pthread entry point; `arg` is the owning ThreadPool.
+    static void* BGThreadWrapper(void* arg) {
+      reinterpret_cast<ThreadPool*>(arg)->BGThread();
+      return nullptr;
+    }
+
+    // Raise (never lower) the worker-thread limit.
+    void SetBackgroundThreads(int num) {
+      PthreadCall("lock", pthread_mutex_lock(&mu_));
+      if (num > total_threads_limit_) {
+        total_threads_limit_ = num;
+      }
+      assert(total_threads_limit_ > 0);
+      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+    }
+
+    // Enqueue one work item, spawning workers up to the limit first.
+    // Silently drops the item if the pool is already shutting down.
+    void Schedule(void (*function)(void*), void* arg) {
+      PthreadCall("lock", pthread_mutex_lock(&mu_));
+
+      if (exit_all_threads_) {
+        PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+        return;
+      }
+      // Start background thread if necessary
+      while ((int)bgthreads_.size() < total_threads_limit_) {
+        pthread_t t;
+        PthreadCall(
+          "create thread",
+          pthread_create(&t,
+                         nullptr,
+                         &ThreadPool::BGThreadWrapper,
+                         this));
+
+        // Set the thread name to aid debugging
+#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 12)
+        char name_buf[16];
+        snprintf(name_buf, sizeof name_buf, "rocksdb:bg%zu", bgthreads_.size());
+        name_buf[sizeof name_buf - 1] = '\0';
+        pthread_setname_np(t, name_buf);
+#endif
+#endif
+
+        bgthreads_.push_back(t);
+      }
+
+      // Add to priority queue
+      queue_.push_back(BGItem());
+      queue_.back().function = function;
+      queue_.back().arg = arg;
+      queue_len_.store(queue_.size(), std::memory_order_relaxed);
+
+      // always wake up at least one waiting thread.
+      PthreadCall("signal", pthread_cond_signal(&bgsignal_));
+
+      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+    }
+
+    // Lock-free snapshot of the pending-item count (may be slightly stale).
+    unsigned int GetQueueLen() const {
+      return queue_len_.load(std::memory_order_relaxed);
+    }
+
+   private:
+    // Entry per Schedule() call
+    struct BGItem { void* arg; void (*function)(void*); };
+    typedef std::deque<BGItem> BGQueue;
+
+    pthread_mutex_t mu_;
+    pthread_cond_t bgsignal_;
+    int total_threads_limit_;
+    std::vector<pthread_t> bgthreads_;
+    BGQueue queue_;
+    std::atomic_uint queue_len_;  // Queue length. Used for stats reporting
+    bool exit_all_threads_;
+  };
+
+  // One pool per Env::Priority slot (sized Priority::TOTAL in the ctor).
+  std::vector<ThreadPool> thread_pools_;
+
+  // Protects threads_to_join_ (threads created via StartThread).
+  pthread_mutex_t mu_;
+  std::vector<pthread_t> threads_to_join_;
+
+};
+
+// Construct with the mmap probe unset, the page size cached, and one
+// ThreadPool per priority level.
+PosixEnv::PosixEnv() : checkedDiskForMmap_(false),
+                       forceMmapOff(false),
+                       page_size_(getpagesize()),
+                       thread_pools_(Priority::TOTAL) {
+  PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
+}
+
+// Dispatch a work item to the pool matching `pri`.
+void PosixEnv::Schedule(void (*function)(void*), void* arg, Priority pri) {
+  assert(pri >= Priority::LOW && pri <= Priority::HIGH);
+  thread_pools_[pri].Schedule(function, arg);
+}
+
+// Pending-item count of the pool for `pri` (relaxed-atomic snapshot).
+unsigned int PosixEnv::GetThreadPoolQueueLen(Priority pri) const {
+  assert(pri >= Priority::LOW && pri <= Priority::HIGH);
+  return thread_pools_[pri].GetQueueLen();
+}
+
+namespace {
+// Heap-allocated trampoline payload for StartThread; freed by the wrapper
+// after the user function returns.
+struct StartThreadState {
+  void (*user_function)(void*);
+  void* arg;
+};
+}
+// pthread entry point: run the user callback, then reclaim the state.
+static void* StartThreadWrapper(void* arg) {
+  StartThreadState* state = reinterpret_cast<StartThreadState*>(arg);
+  state->user_function(state->arg);
+  delete state;
+  return nullptr;
+}
+
+// Spawn a dedicated thread for `function(arg)` and remember its id so
+// WaitForJoin can join it later.
+void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
+  pthread_t t;
+  StartThreadState* state = new StartThreadState;
+  state->user_function = function;
+  state->arg = arg;
+  PthreadCall("start thread",
+              pthread_create(&t, nullptr,  &StartThreadWrapper, state));
+  PthreadCall("lock", pthread_mutex_lock(&mu_));
+  threads_to_join_.push_back(t);
+  PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+}
+
+// Join (and forget) every thread started via StartThread.
+// NOTE(review): threads_to_join_ is read/cleared here without taking mu_ —
+// confirm no concurrent StartThread can race with WaitForJoin.
+void PosixEnv::WaitForJoin() {
+  for (const auto tid : threads_to_join_) {
+    pthread_join(tid, nullptr);
+  }
+  threads_to_join_.clear();
+}
+
+}  // namespace
+
+// Produce a unique id string: prefer the kernel's UUID generator
+// (/proc/sys/kernel/random/uuid); when unavailable, fall back to a
+// "nanos-random" pair of hex words built from NowNanos() and a
+// time-seeded Random64.
+std::string Env::GenerateUniqueId() {
+  std::string uuid_file = "/proc/sys/kernel/random/uuid";
+  if (FileExists(uuid_file)) {
+    std::string uuid;
+    Status s = ReadFileToString(this, uuid_file, &uuid);
+    if (s.ok()) {
+      return uuid;
+    }
+  }
+  // Could not read uuid_file - generate uuid using "nanos-random"
+  Random64 r(time(nullptr));
+  uint64_t random_uuid_portion =
+    r.Uniform(std::numeric_limits<uint64_t>::max());
+  uint64_t nanos_uuid_portion = NowNanos();
+  char uuid2[200];
+  snprintf(uuid2,
+           200,
+           "%lx-%lx",
+           (unsigned long)nanos_uuid_portion,
+           (unsigned long)random_uuid_portion);
+  return uuid2;
+}
+
+// Process-wide default Env: a function-local static PosixEnv, never deleted.
+Env* Env::Default() {
+  static PosixEnv default_env;
+  return &default_env;
+}
+
+}  // namespace rocksdb
diff --git a/util/env_test.cc b/util/env_test.cc
new file mode 100644 (file)
index 0000000..1ac3773
--- /dev/null
@@ -0,0 +1,550 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <sys/types.h>
+
+#include <iostream>
+#include <unordered_set>
+
+#ifdef OS_LINUX
+#include <sys/stat.h>
+#include <unistd.h>
+#endif
+
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/log_buffer.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+static const int kDelayMicros = 100000;
+
+// Test fixture: just holds the default Env (mu_/events_ appear unused by the
+// tests visible here).
+class EnvPosixTest {
+ private:
+  port::Mutex mu_;
+  std::string events_;
+
+ public:
+  Env* env_;
+  EnvPosixTest() : env_(Env::Default()) { }
+};
+
+// Callback that marks its own argument pointer as "called".
+static void SetBool(void* ptr) {
+  reinterpret_cast<port::AtomicPointer*>(ptr)->NoBarrier_Store(ptr);
+}
+
+// A scheduled job runs promptly on the background pool.
+TEST(EnvPosixTest, RunImmediately) {
+  port::AtomicPointer called (nullptr);
+  env_->Schedule(&SetBool, &called);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(called.NoBarrier_Load() != nullptr);
+}
+
+// Jobs scheduled on one (single-thread) pool run in FIFO order: each callback
+// checks the previous id before publishing its own.
+TEST(EnvPosixTest, RunMany) {
+  port::AtomicPointer last_id (nullptr);
+
+  struct CB {
+    port::AtomicPointer* last_id_ptr;   // Pointer to shared slot
+    uintptr_t id;             // Order# for the execution of this callback
+
+    CB(port::AtomicPointer* p, int i) : last_id_ptr(p), id(i) { }
+
+    static void Run(void* v) {
+      CB* cb = reinterpret_cast<CB*>(v);
+      void* cur = cb->last_id_ptr->NoBarrier_Load();
+      ASSERT_EQ(cb->id-1, reinterpret_cast<uintptr_t>(cur));
+      cb->last_id_ptr->Release_Store(reinterpret_cast<void*>(cb->id));
+    }
+  };
+
+  // Schedule in different order than start time
+  CB cb1(&last_id, 1);
+  CB cb2(&last_id, 2);
+  CB cb3(&last_id, 3);
+  CB cb4(&last_id, 4);
+  env_->Schedule(&CB::Run, &cb1);
+  env_->Schedule(&CB::Run, &cb2);
+  env_->Schedule(&CB::Run, &cb3);
+  env_->Schedule(&CB::Run, &cb4);
+
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  void* cur = last_id.Acquire_Load();
+  ASSERT_EQ(4U, reinterpret_cast<uintptr_t>(cur));
+}
+
+// Shared state for the StartThread test below.
+struct State {
+  port::Mutex mu;
+  int val;
+  int num_running;
+};
+
+// Each thread bumps val and decrements the running count under the mutex.
+static void ThreadBody(void* arg) {
+  State* s = reinterpret_cast<State*>(arg);
+  s->mu.Lock();
+  s->val += 1;
+  s->num_running -= 1;
+  s->mu.Unlock();
+}
+
+// Three dedicated threads all run to completion and each increments val once.
+TEST(EnvPosixTest, StartThread) {
+  State state;
+  state.val = 0;
+  state.num_running = 3;
+  for (int i = 0; i < 3; i++) {
+    env_->StartThread(&ThreadBody, &state);
+  }
+  while (true) {
+    state.mu.Lock();
+    int num = state.num_running;
+    state.mu.Unlock();
+    if (num == 0) {
+      break;
+    }
+    Env::Default()->SleepForMicroseconds(kDelayMicros);
+  }
+  ASSERT_EQ(state.val, 3);
+}
+
+// LOW and HIGH priority pools are independent: with kJobs scheduled on each,
+// the number of concurrently running jobs never exceeds that pool's size, and
+// queue lengths reflect the jobs not yet dispatched.
+TEST(EnvPosixTest, TwoPools) {
+
+  class CB {
+   public:
+    CB(const std::string& pool_name, int pool_size)
+        : mu_(),
+          num_running_(0),
+          num_finished_(0),
+          pool_size_(pool_size),
+          pool_name_(pool_name) { }
+
+    static void Run(void* v) {
+      CB* cb = reinterpret_cast<CB*>(v);
+      cb->Run();
+    }
+
+    void Run() {
+      {
+        MutexLock l(&mu_);
+        num_running_++;
+        std::cout << "Pool " << pool_name_ << ": "
+                  << num_running_ << " running threads.\n";
+        // make sure we don't have more than pool_size_ jobs running.
+        ASSERT_LE(num_running_, pool_size_);
+      }
+
+      // sleep for 1 sec
+      Env::Default()->SleepForMicroseconds(1000000);
+
+      {
+        MutexLock l(&mu_);
+        num_running_--;
+        num_finished_++;
+      }
+    }
+
+    int NumFinished() {
+      MutexLock l(&mu_);
+      return num_finished_;
+    }
+
+   private:
+    port::Mutex mu_;
+    int num_running_;
+    int num_finished_;
+    int pool_size_;
+    std::string pool_name_;
+  };
+
+  const int kLowPoolSize = 2;
+  const int kHighPoolSize = 4;
+  const int kJobs = 8;
+
+  CB low_pool_job("low", kLowPoolSize);
+  CB high_pool_job("high", kHighPoolSize);
+
+  env_->SetBackgroundThreads(kLowPoolSize);
+  env_->SetBackgroundThreads(kHighPoolSize, Env::Priority::HIGH);
+
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  // schedule same number of jobs in each pool
+  for (int i = 0; i < kJobs; i++) {
+    env_->Schedule(&CB::Run, &low_pool_job);
+    env_->Schedule(&CB::Run, &high_pool_job, Env::Priority::HIGH);
+  }
+  // Wait a short while for the jobs to be dispatched.
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize),
+            env_->GetThreadPoolQueueLen());
+  ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize),
+            env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  ASSERT_EQ((unsigned int)(kJobs - kHighPoolSize),
+            env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  // wait for all jobs to finish
+  while (low_pool_job.NumFinished() < kJobs ||
+         high_pool_job.NumFinished() < kJobs) {
+    env_->SleepForMicroseconds(kDelayMicros);
+  }
+
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+}
+
+#ifdef OS_LINUX
+// To make sure the Env::GetUniqueId() related tests work correctly, The files
+// should be stored in regular storage like "hard disk" or "flash device".
+// Otherwise we cannot get the correct id.
+//
+// The following function act as the replacement of test::TmpDir() that may be
+// customized by user to be on a storage that doesn't work with GetUniqueId().
+//
+// TODO(kailiu) This function still assumes /tmp/<test-dir> reside in regular
+// storage system.
+namespace {
+// True iff `s` is exactly one varint64 and nothing else (a single-varint id
+// indicates GetUniqueId fell back to a degenerate value).
+bool IsSingleVarint(const std::string& s) {
+  Slice slice(s);
+
+  uint64_t v;
+  if (!GetVarint64(&slice, &v)) {
+    return false;
+  }
+
+  return slice.size() == 0;
+}
+
+// A usable unique id is non-empty and not just a single varint.
+bool IsUniqueIDValid(const std::string& s) {
+  return !s.empty() && !IsSingleVarint(s);
+}
+
+// Scratch buffer shared by the GetUniqueId tests below.
+const size_t MAX_ID_SIZE = 100;
+char temp_id[MAX_ID_SIZE];
+
+// Per-user /tmp test dir assumed to live on "real" storage (see comment
+// above) so GetUniqueId works; created on first use.
+std::string GetOnDiskTestDir() {
+  char base[100];
+  snprintf(base, sizeof(base), "/tmp/rocksdbtest-%d",
+           static_cast<int>(geteuid()));
+  // Directory may already exist
+  Env::Default()->CreateDirIfMissing(base);
+
+  return base;
+}
+}  // namespace
+
+// Only works in linux platforms
+// GetUniqueId is stable: the same file yields the same valid id across
+// repeated opens, including after a delay.
+TEST(EnvPosixTest, RandomAccessUniqueID) {
+  // Create file.
+  const EnvOptions soptions;
+  std::string fname = GetOnDiskTestDir() + "/" + "testfile";
+  unique_ptr<WritableFile> wfile;
+  ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+
+  unique_ptr<RandomAccessFile> file;
+
+  // Get Unique ID
+  ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+  size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+  ASSERT_TRUE(id_size > 0);
+  std::string unique_id1(temp_id, id_size);
+  ASSERT_TRUE(IsUniqueIDValid(unique_id1));
+
+  // Get Unique ID again
+  ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+  id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+  ASSERT_TRUE(id_size > 0);
+  std::string unique_id2(temp_id, id_size);
+  ASSERT_TRUE(IsUniqueIDValid(unique_id2));
+
+  // Get Unique ID again after waiting some time.
+  env_->SleepForMicroseconds(1000000);
+  ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+  id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+  ASSERT_TRUE(id_size > 0);
+  std::string unique_id3(temp_id, id_size);
+  ASSERT_TRUE(IsUniqueIDValid(unique_id3));
+
+  // Check IDs are the same.
+  ASSERT_EQ(unique_id1, unique_id2);
+  ASSERT_EQ(unique_id2, unique_id3);
+
+  // Delete the file
+  env_->DeleteFile(fname);
+}
+
+// only works in linux platforms
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+// With preallocation configured, a small append still reserves at least the
+// requested number of blocks, and closing the file releases the excess.
+TEST(EnvPosixTest, AllocateTest) {
+  std::string fname = GetOnDiskTestDir() + "/preallocate_testfile";
+  EnvOptions soptions;
+  soptions.use_mmap_writes = false;
+  unique_ptr<WritableFile> wfile;
+  ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+
+  // allocate 100 MB
+  size_t kPreallocateSize = 100 * 1024 * 1024;
+  size_t kBlockSize = 512;
+  std::string data = "test";
+  wfile->SetPreallocationBlockSize(kPreallocateSize);
+  ASSERT_OK(wfile->Append(Slice(data)));
+  ASSERT_OK(wfile->Flush());
+
+  struct stat f_stat;
+  stat(fname.c_str(), &f_stat);
+  ASSERT_EQ((unsigned int)data.size(), f_stat.st_size);
+  // verify that blocks are preallocated
+  // Note here that we don't check the exact number of blocks preallocated --
+  // we only require that number of allocated blocks is at least what we expect.
+  // It looks like some FS give us more blocks that we asked for. That's fine.
+  // It might be worth investigating further.
+  auto st_blocks = f_stat.st_blocks;
+  ASSERT_LE((unsigned int)(kPreallocateSize / kBlockSize), st_blocks);
+
+  // close the file, should deallocate the blocks
+  wfile.reset();
+
+  stat(fname.c_str(), &f_stat);
+  ASSERT_EQ((unsigned int)data.size(), f_stat.st_size);
+  // verify that preallocated blocks were deallocated on file close
+  ASSERT_GT(st_blocks, f_stat.st_blocks);
+}
+#endif
+
+// Returns true if any of the strings in ss are the prefix of another string.
+bool HasPrefix(const std::unordered_set<std::string>& ss) {
+  for (const std::string& s: ss) {
+    if (s.empty()) {
+      return true;
+    }
+    for (size_t i = 1; i < s.size(); ++i) {
+      if (ss.count(s.substr(0, i)) != 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// Only works in linux platforms
+// 1000 simultaneously existing files all get distinct, valid, prefix-free
+// unique IDs.
+// NOTE(review): `for (const std::string fname: fnames)` copies each string;
+// a const reference would avoid the copies.
+TEST(EnvPosixTest, RandomAccessUniqueIDConcurrent) {
+  // Check whether a bunch of concurrently existing files have unique IDs.
+  const EnvOptions soptions;
+
+  // Create the files
+  std::vector<std::string> fnames;
+  for (int i = 0; i < 1000; ++i) {
+    fnames.push_back(GetOnDiskTestDir() + "/" + "testfile" + std::to_string(i));
+
+    // Create file.
+    unique_ptr<WritableFile> wfile;
+    ASSERT_OK(env_->NewWritableFile(fnames[i], &wfile, soptions));
+  }
+
+  // Collect and check whether the IDs are unique.
+  std::unordered_set<std::string> ids;
+  for (const std::string fname: fnames) {
+    unique_ptr<RandomAccessFile> file;
+    std::string unique_id;
+    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+    size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+    ASSERT_TRUE(id_size > 0);
+    unique_id = std::string(temp_id, id_size);
+    ASSERT_TRUE(IsUniqueIDValid(unique_id));
+
+    ASSERT_TRUE(ids.count(unique_id) == 0);
+    ids.insert(unique_id);
+  }
+
+  // Delete the files
+  for (const std::string fname: fnames) {
+    ASSERT_OK(env_->DeleteFile(fname));
+  }
+
+  ASSERT_TRUE(!HasPrefix(ids));
+}
+
+// Only works in linux platforms
+// Re-creating a file after deleting it must never reuse a previous unique ID
+// (checked over 1000 create/id/delete cycles).
+TEST(EnvPosixTest, RandomAccessUniqueIDDeletes) {
+  const EnvOptions soptions;
+
+  std::string fname = GetOnDiskTestDir() + "/" + "testfile";
+
+  // Check that after file is deleted we don't get same ID again in a new file.
+  std::unordered_set<std::string> ids;
+  for (int i = 0; i < 1000; ++i) {
+    // Create file.
+    {
+      unique_ptr<WritableFile> wfile;
+      ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+    }
+
+    // Get Unique ID
+    std::string unique_id;
+    {
+      unique_ptr<RandomAccessFile> file;
+      ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+      size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+      ASSERT_TRUE(id_size > 0);
+      unique_id = std::string(temp_id, id_size);
+    }
+
+    ASSERT_TRUE(IsUniqueIDValid(unique_id));
+    ASSERT_TRUE(ids.count(unique_id) == 0);
+    ids.insert(unique_id);
+
+    // Delete the file
+    ASSERT_OK(env_->DeleteFile(fname));
+  }
+
+  ASSERT_TRUE(!HasPrefix(ids));
+}
+
+// Only works in linux platforms
+// InvalidateCache succeeds on writable, random-access, and sequential files,
+// and reads after invalidation still return the written data.
+TEST(EnvPosixTest, InvalidateCache) {
+  const EnvOptions soptions;
+  std::string fname = test::TmpDir() + "/" + "testfile";
+
+  // Create file.
+  {
+    unique_ptr<WritableFile> wfile;
+    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+    ASSERT_OK(wfile.get()->Append(Slice("Hello world")));
+    ASSERT_OK(wfile.get()->InvalidateCache(0, 0));
+    ASSERT_OK(wfile.get()->Close());
+  }
+
+  // Random Read
+  {
+    unique_ptr<RandomAccessFile> file;
+    char scratch[100];
+    Slice result;
+    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+    ASSERT_OK(file.get()->Read(0, 11, &result, scratch));
+    ASSERT_EQ(memcmp(scratch, "Hello world", 11), 0);
+    ASSERT_OK(file.get()->InvalidateCache(0, 11));
+    ASSERT_OK(file.get()->InvalidateCache(0, 0));
+  }
+
+  // Sequential Read
+  {
+    unique_ptr<SequentialFile> file;
+    char scratch[100];
+    Slice result;
+    ASSERT_OK(env_->NewSequentialFile(fname, &file, soptions));
+    ASSERT_OK(file.get()->Read(11, &result, scratch));
+    ASSERT_EQ(memcmp(scratch, "Hello world", 11), 0);
+    ASSERT_OK(file.get()->InvalidateCache(0, 11));
+    ASSERT_OK(file.get()->InvalidateCache(0, 0));
+  }
+  // Delete the file
+  ASSERT_OK(env_->DeleteFile(fname));
+}
+#endif
+
+// Exercise RandomRWFile: allocate, two overlapping writes at offsets 100 and
+// 105, sync/fsync, then verify the merged 16-byte region reads back as
+// "HelloHello world".
+TEST(EnvPosixTest, PosixRandomRWFileTest) {
+  EnvOptions soptions;
+  soptions.use_mmap_writes = soptions.use_mmap_reads = false;
+  std::string fname = test::TmpDir() + "/" + "testfile";
+
+  unique_ptr<RandomRWFile> file;
+  ASSERT_OK(env_->NewRandomRWFile(fname, &file, soptions));
+  // If you run the unit test on tmpfs, then tmpfs might not
+  // support fallocate. It is still better to trigger that
+  // code-path instead of eliminating it completely.
+  file.get()->Allocate(0, 10*1024*1024);
+  ASSERT_OK(file.get()->Write(100, Slice("Hello world")));
+  ASSERT_OK(file.get()->Write(105, Slice("Hello world")));
+  ASSERT_OK(file.get()->Sync());
+  ASSERT_OK(file.get()->Fsync());
+  char scratch[100];
+  Slice result;
+  ASSERT_OK(file.get()->Read(100, 16, &result, scratch));
+  ASSERT_EQ(result.compare("HelloHello world"), 0);
+  ASSERT_OK(file.get()->Close());
+}
+
+// Logger stub that, instead of writing anywhere, counts calls and inspects
+// the formatted message to validate LogBuffer's size limits.
+// NOTE(review): log_count / char_x_count / char_0_count are not initialized
+// here; LogBufferTest below zeroes them before use — confirm no other user
+// relies on default initialization.
+class TestLogger : public Logger {
+ public:
+  virtual void Logv(const char* format, va_list ap) override {
+    log_count++;
+
+    // Sentinel-fill the buffer so char counting below sees exactly what
+    // vsnprintf wrote.
+    char new_format[550];
+    std::fill_n(new_format, sizeof(new_format), '2');
+    {
+      // Format into a scratch copy of ap so the caller's va_list survives.
+      va_list backup_ap;
+      va_copy(backup_ap, ap);
+      int n = vsnprintf(new_format, sizeof(new_format) - 1, format, backup_ap);
+      // 48 bytes for extra information + bytes allocated
+
+      if (new_format[0] == '[') {
+        // "[DEBUG] "
+        ASSERT_TRUE(n <= 56 + (512 - static_cast<int>(sizeof(struct timeval))));
+      } else {
+        ASSERT_TRUE(n <= 48 + (512 - static_cast<int>(sizeof(struct timeval))));
+      }
+      va_end(backup_ap);
+    }
+
+    // Tally the 'x' markers and NUL terminators the test messages contain.
+    for (size_t i = 0; i < sizeof(new_format); i++) {
+      if (new_format[i] == 'x') {
+        char_x_count++;
+      } else if (new_format[i] == '\0') {
+        char_0_count++;
+      }
+    }
+  }
+  int log_count;
+  int char_x_count;
+  int char_0_count;
+};
+
+// LogBuffer batches messages and only hits the Logger on FlushBufferToLog;
+// debug-level messages are dropped until the logger's level allows them.
+// Final counts: 6 flushed messages, 6 terminators, 10 'x' markers.
+TEST(EnvPosixTest, LogBufferTest) {
+  TestLogger test_logger;
+  test_logger.SetInfoLogLevel(InfoLogLevel::INFO_LEVEL);
+  test_logger.log_count = 0;
+  test_logger.char_x_count = 0;
+  test_logger.char_0_count = 0;
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, &test_logger);
+  LogBuffer log_buffer_debug(DEBUG_LEVEL, &test_logger);
+
+  char bytes200[200];
+  std::fill_n(bytes200, sizeof(bytes200), '1');
+  bytes200[sizeof(bytes200) - 1] = '\0';
+  char bytes600[600];
+  std::fill_n(bytes600, sizeof(bytes600), '1');
+  bytes600[sizeof(bytes600) - 1] = '\0';
+  char bytes9000[9000];
+  std::fill_n(bytes9000, sizeof(bytes9000), '1');
+  bytes9000[sizeof(bytes9000) - 1] = '\0';
+
+  LogToBuffer(&log_buffer, "x%sx", bytes200);
+  LogToBuffer(&log_buffer, "x%sx", bytes600);
+  LogToBuffer(&log_buffer, "x%sx%sx%sx", bytes200, bytes200, bytes200);
+  LogToBuffer(&log_buffer, "x%sx%sx", bytes200, bytes600);
+  LogToBuffer(&log_buffer, "x%sx%sx", bytes600, bytes9000);
+
+  // First debug message is dropped (logger still at INFO), second is kept.
+  LogToBuffer(&log_buffer_debug, "x%sx", bytes200);
+  test_logger.SetInfoLogLevel(DEBUG_LEVEL);
+  LogToBuffer(&log_buffer_debug, "x%sx%sx%sx", bytes600, bytes9000, bytes200);
+
+  ASSERT_EQ(0, test_logger.log_count);
+  log_buffer.FlushBufferToLog();
+  log_buffer_debug.FlushBufferToLog();
+  ASSERT_EQ(6, test_logger.log_count);
+  ASSERT_EQ(6, test_logger.char_0_count);
+  ASSERT_EQ(10, test_logger.char_x_count);
+}
+
+}  // namespace rocksdb
+
+// Test entry point: run every registered TEST via the rocksdb harness.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/filelock_test.cc b/util/filelock_test.cc
new file mode 100644 (file)
index 0000000..a9e30a5
--- /dev/null
@@ -0,0 +1,58 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "rocksdb/status.h"
+#include "rocksdb/env.h"
+
+#include <vector>
+#include "util/coding.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
// Fixture for file-lock tests.  Holds the path of a scratch lock file under
// the test temp dir and the default Env used to lock/unlock it.  The TEST
// macro of the rocksdb test harness derives from this class, so test bodies
// can call LockFile()/UnlockFile() directly.
class LockTest {
 public:
  static LockTest* current_;   // most recently constructed fixture instance
  std::string file_;           // path of the file being locked
  rocksdb::Env* env_;          // Env providing LockFile/UnlockFile

  LockTest() : file_(test::TmpDir() + "/db_testlock_file"),
               env_(rocksdb::Env::Default()) {
    current_ = this;
  }

  ~LockTest() {
  }

  // Acquires an exclusive lock on file_; on success *db_lock owns the lock.
  Status LockFile(FileLock** db_lock) {
    return env_->LockFile(file_, db_lock);
  }

  // Releases a lock previously returned by LockFile().
  Status UnlockFile(FileLock* db_lock) {
    return env_->UnlockFile(db_lock);
  }
};
LockTest* LockTest::current_;
+
// Verifies that the same file cannot be locked twice, even from the same
// thread/process: the second LockFile() on an already-held lock must fail
// with an IOError rather than succeed re-entrantly.
TEST(LockTest, LockBySameThread) {
  FileLock* lock1;
  FileLock* lock2;

  // acquire a lock on a file
  ASSERT_OK(LockFile(&lock1));

  // re-acquire the lock on the same file. This should fail.
  ASSERT_TRUE(LockFile(&lock2).IsIOError());

  // release the lock
  ASSERT_OK(UnlockFile(lock1));

}
+
+}  // namespace rocksdb
+
// Entry point: runs every TEST(...) registered in this file via the
// rocksdb test harness; returns nonzero on the first failing assertion.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
diff --git a/util/filter_policy.cc b/util/filter_policy.cc
new file mode 100644 (file)
index 0000000..e950b75
--- /dev/null
@@ -0,0 +1,16 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/filter_policy.h"
+
+namespace rocksdb {
+
+FilterPolicy::~FilterPolicy() { }
+
+}  // namespace rocksdb
diff --git a/util/hash.cc b/util/hash.cc
new file mode 100644 (file)
index 0000000..e38c186
--- /dev/null
@@ -0,0 +1,49 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <string.h>
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+uint32_t Hash(const char* data, size_t n, uint32_t seed) {
+  // Similar to murmur hash
+  const uint32_t m = 0xc6a4a793;
+  const uint32_t r = 24;
+  const char* limit = data + n;
+  uint32_t h = seed ^ (n * m);
+
+  // Pick up four bytes at a time
+  while (data + 4 <= limit) {
+    uint32_t w = DecodeFixed32(data);
+    data += 4;
+    h += w;
+    h *= m;
+    h ^= (h >> 16);
+  }
+
+  // Pick up remaining bytes
+  switch (limit - data) {
+    case 3:
+      h += data[2] << 16;
+      // fall through
+    case 2:
+      h += data[1] << 8;
+      // fall through
+    case 1:
+      h += data[0];
+      h *= m;
+      h ^= (h >> r);
+      break;
+  }
+  return h;
+}
+
+}  // namespace rocksdb
diff --git a/util/hash.h b/util/hash.h
new file mode 100644 (file)
index 0000000..c9eb659
--- /dev/null
@@ -0,0 +1,20 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Simple hash function used for internal data structures
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+namespace rocksdb {
+
+extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
+
+}  // namespace rocksdb
diff --git a/util/hash_cuckoo_rep.cc b/util/hash_cuckoo_rep.cc
new file mode 100644 (file)
index 0000000..d10bc5d
--- /dev/null
@@ -0,0 +1,627 @@
+
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#ifndef ROCKSDB_LITE
+#include "util/hash_cuckoo_rep.h"
+
+#include <algorithm>
+#include <atomic>
+#include <limits>
+#include <queue>
+#include <string>
+#include <memory>
+#include <vector>
+
+#include "rocksdb/memtablerep.h"
+#include "util/murmurhash.h"
+#include "db/memtable.h"
+#include "db/skiplist.h"
+#include "util/stl_wrappers.h"
+
+namespace rocksdb {
+namespace {
+
// the default maximum size of the cuckoo path searching queue
static const int kCuckooPathMaxSearchSteps = 100;

// One node of the BFS over candidate displacements: which bucket would be
// vacated, which step led here, and how deep in the search we are.
struct CuckooStep {
  static const int kNullStep = -1;
  // the bucket id in the cuckoo array.
  int bucket_id_;
  // index of cuckoo-step array that points to its previous step,
  // -1 if it the beginning step.
  int prev_step_id_;
  // the depth of the current step.
  unsigned int depth_;

  CuckooStep() : bucket_id_(-1), prev_step_id_(kNullStep), depth_(1) {}

  // Movable but not copyable: steps live in a fixed buffer and are
  // referenced by index, so copies would be a bug.
  CuckooStep(CuckooStep&&) = default;
  CuckooStep& operator=(CuckooStep&&) = default;

  CuckooStep(const CuckooStep&) = delete;
  CuckooStep& operator=(const CuckooStep&) = delete;

  // NOTE(review): `depth` is taken as int but stored in an unsigned field;
  // callers only pass non-negative depths, but an explicit unsigned
  // parameter would be cleaner — confirm before changing.
  CuckooStep(int bucket_id, int prev_step_id, int depth)
      : bucket_id_(bucket_id), prev_step_id_(prev_step_id), depth_(depth) {}
};
+
// A MemTableRep backed by a fixed-size cuckoo hash table.  Each bucket
// holds an atomic pointer to an entry allocated from the arena; a key may
// live in any of the buckets produced by hash_function_count_ hash
// functions of its user key.  Writes are single-threaded (enforced by the
// caller), reads are lock-free.  When no vacant bucket can be found even
// after cuckoo displacement, entries overflow into backup_table_ and the
// memtable reports itself as full.
class HashCuckooRep : public MemTableRep {
 public:
  explicit HashCuckooRep(const MemTableRep::KeyComparator& compare,
                         Arena* arena, const size_t bucket_count,
                         const unsigned int hash_func_count)
      : MemTableRep(arena),
        compare_(compare),
        arena_(arena),
        bucket_count_(bucket_count),
        cuckoo_path_max_depth_(kDefaultCuckooPathMaxDepth),
        occupied_count_(0),
        hash_function_count_(hash_func_count),
        backup_table_(nullptr) {
    // Placement-new the bucket array in arena memory; the atomics are
    // trivially destructible so no matching destruction is needed.
    char* mem = reinterpret_cast<char*>(
        arena_->Allocate(sizeof(std::atomic<const char*>) * bucket_count_));
    cuckoo_array_ = new (mem) std::atomic<const char*>[bucket_count_];
    for (unsigned int bid = 0; bid < bucket_count_; ++bid) {
      cuckoo_array_[bid].store(nullptr, std::memory_order_relaxed);
    }

    // NOTE(review): sizeof(int*) over-allocates — the path buffer holds
    // ints, so sizeof(int) would suffice.  Harmless (never under-allocates
    // on any supported platform), but worth fixing.
    cuckoo_path_ = reinterpret_cast<int*>(
        arena_->Allocate(sizeof(int*) * (cuckoo_path_max_depth_ + 1)));
    is_nearly_full_ = false;
  }

  // return false, indicating HashCuckooRep does not support merge operator.
  virtual bool IsMergeOperatorSupported() const override { return false; }

  // return false, indicating HashCuckooRep does not support snapshot.
  virtual bool IsSnapshotSupported() const override { return false; }

  // Returns true iff an entry that compares equal to key is in the collection.
  virtual bool Contains(const char* internal_key) const override;

  virtual ~HashCuckooRep() override {}

  // Insert the specified key (internal_key) into the mem-table.  Assertion
  // fails if
  // the current mem-table already contains the specified key.
  virtual void Insert(KeyHandle handle) override;

  // This function returns std::numeric_limits<size_t>::max() in the following
  // two cases to disallow further write operations:
  // 1. when the fullness reaches kMaxFullness.
  // 2. when the backup_table_ is used.
  //
  // otherwise, this function will always return 0.
  virtual size_t ApproximateMemoryUsage() override {
    if (is_nearly_full_) {
      return std::numeric_limits<size_t>::max();
    }
    return 0;
  }

  virtual void Get(const LookupKey& k, void* callback_args,
                   bool (*callback_func)(void* arg,
                                         const char* entry)) override;

  // Iterator over a snapshot of the table's entries, materialized into a
  // vector at GetIterator() time and sorted lazily on first use.
  class Iterator : public MemTableRep::Iterator {
    std::shared_ptr<std::vector<const char*>> bucket_;
    typename std::vector<const char*>::const_iterator mutable cit_;
    const KeyComparator& compare_;
    std::string tmp_;  // For passing to EncodeKey
    bool mutable sorted_;
    void DoSort() const;

   public:
    explicit Iterator(std::shared_ptr<std::vector<const char*>> bucket,
                      const KeyComparator& compare);

    // Initialize an iterator over the specified collection.
    // The returned iterator is not valid.
    // explicit Iterator(const MemTableRep* collection);
    virtual ~Iterator() override{};

    // Returns true iff the iterator is positioned at a valid node.
    virtual bool Valid() const override;

    // Returns the key at the current position.
    // REQUIRES: Valid()
    virtual const char* key() const override;

    // Advances to the next position.
    // REQUIRES: Valid()
    virtual void Next() override;

    // Advances to the previous position.
    // REQUIRES: Valid()
    virtual void Prev() override;

    // Advance to the first entry with a key >= target
    virtual void Seek(const Slice& user_key, const char* memtable_key) override;

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    virtual void SeekToFirst() override;

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    virtual void SeekToLast() override;
  };

  // Fixed-capacity FIFO of CuckooSteps used as the BFS queue by
  // FindCuckooPath.  Steps are addressed by index so a found path can be
  // traced back through prev_step_id_.
  struct CuckooStepBuffer {
    CuckooStepBuffer() : write_index_(0), read_index_(0) {}
    ~CuckooStepBuffer() {}

    int write_index_;
    int read_index_;
    CuckooStep steps_[kCuckooPathMaxSearchSteps];

    CuckooStep& NextWriteBuffer() { return steps_[write_index_++]; }

    inline const CuckooStep& ReadNext() { return steps_[read_index_++]; }

    inline bool HasNewWrite() { return write_index_ > read_index_; }

    inline void reset() {
      write_index_ = 0;
      read_index_ = 0;
    }

    inline bool IsFull() { return write_index_ >= kCuckooPathMaxSearchSteps; }

    // returns the number of steps that has been read
    inline int ReadCount() { return read_index_; }

    // returns the number of steps that has been written to the buffer.
    inline int WriteCount() { return write_index_; }
  };

 private:
  const MemTableRep::KeyComparator& compare_;
  // the pointer to Arena to allocate memory, immutable after construction.
  Arena* const arena_;
  // the number of hash buckets in the hash table.
  const size_t bucket_count_;
  // the maximum depth of the cuckoo path.
  const unsigned int cuckoo_path_max_depth_;
  // the current number of entries in cuckoo_array_ which has been occupied.
  size_t occupied_count_;
  // the current number of hash functions used in the cuckoo hash.
  unsigned int hash_function_count_;
  // the backup MemTableRep to handle the case where cuckoo hash cannot find
  // a vacant bucket for inserting the key of a put request.
  std::shared_ptr<MemTableRep> backup_table_;
  // the array to store pointers, pointing to the actual data.
  std::atomic<const char*>* cuckoo_array_;
  // a buffer to store cuckoo path
  int* cuckoo_path_;
  // a boolean flag indicating whether the fullness of bucket array
  // reaches the point to make the current memtable immutable.
  bool is_nearly_full_;

  // the default maximum depth of the cuckoo path.
  static const unsigned int kDefaultCuckooPathMaxDepth = 10;

  CuckooStepBuffer step_buffer_;

  // returns the bucket id associated with the input slice under the hash
  // function with the specified id.
  unsigned int GetHash(const Slice& slice, const int hash_func_id) const {
    // the seeds used in the Murmur hash to produce different hash functions.
    static const int kMurmurHashSeeds[HashCuckooRepFactory::kMaxHashCount] = {
        545609244,  1769731426, 763324157,  13099088,   592422103,
        1899789565, 248369300,  1984183468, 1613664382, 1491157517};
    return MurmurHash(slice.data(), slice.size(),
                      kMurmurHashSeeds[hash_func_id]) %
           bucket_count_;
  }

  // A cuckoo path is a sequence of bucket ids, where each id points to a
  // location of cuckoo_array_.  This path describes the displacement sequence
  // of entries in order to store the desired data specified by the input user
  // key.  The path starts from one of the locations associated with the
  // specified user key and ends at a vacant space in the cuckoo array. This
  // function will update the cuckoo_path.
  //
  // @return true if it found a cuckoo path.
  bool FindCuckooPath(const char* internal_key, const Slice& user_key,
                      int* cuckoo_path, size_t* cuckoo_path_length,
                      int initial_hash_id = 0);

  // Perform quick insert by checking whether there is a vacant bucket in one
  // of the possible locations of the input key.  If so, then the function will
  // return true and the key will be stored in that vacant bucket.
  //
  // This function is a helper function of FindCuckooPath that discovers the
  // first possible steps of a cuckoo path.  It begins by first computing
  // the possible locations of the input keys (and stores them in bucket_ids.)
  // Then, if one of its possible locations is vacant, then the input key will
  // be stored in that vacant space and the function will return true.
  // Otherwise, the function will return false indicating a complete search
  // of cuckoo-path is needed.
  bool QuickInsert(const char* internal_key, const Slice& user_key,
                   int bucket_ids[], const int initial_hash_id);

  // Unhide default implementations of GetIterator
  using MemTableRep::GetIterator;
  // Returns the pointer to the internal iterator to the buckets where buckets
  // are sorted according to the user specified KeyComparator.  Note that
  // any insert after this function call may affect the sorted nature of
  // the returned iterator.
  virtual MemTableRep::Iterator* GetIterator() override {
    std::vector<const char*> compact_buckets;
    for (unsigned int bid = 0; bid < bucket_count_; ++bid) {
      const char* bucket = cuckoo_array_[bid].load(std::memory_order_relaxed);
      if (bucket != nullptr) {
        compact_buckets.push_back(bucket);
      }
    }
    MemTableRep* backup_table = backup_table_.get();
    if (backup_table != nullptr) {
      std::unique_ptr<MemTableRep::Iterator> iter(backup_table->GetIterator());
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        compact_buckets.push_back(iter->key());
      }
    }
    return new Iterator(
        std::shared_ptr<std::vector<const char*>>(
            new std::vector<const char*>(std::move(compact_buckets))),
        compare_);
  }
};
+
// Looks up the entry whose user key equals that of `key` and, if found,
// invokes callback_func on it.  Probes the candidate buckets in hash-id
// order; acquire loads pair with the release stores in Insert() so a
// non-null bucket is a fully published entry.
void HashCuckooRep::Get(const LookupKey& key, void* callback_args,
                        bool (*callback_func)(void* arg, const char* entry)) {
  Slice user_key = key.user_key();
  for (unsigned int hid = 0; hid < hash_function_count_; ++hid) {
    const char* bucket =
        cuckoo_array_[GetHash(user_key, hid)].load(std::memory_order_acquire);
    if (bucket != nullptr) {
      auto bucket_user_key = UserKey(bucket);
      if (user_key.compare(bucket_user_key) == 0) {
        callback_func(callback_args, bucket);
        break;
      }
    } else {
      // as Put() always stores at the vacant bucket located by the
      // hash function with the smallest possible id, when we first
      // find a vacant bucket in Get(), that means a miss.
      break;
    }
  }
  // Keys that overflowed the cuckoo table live in the backup table, so it
  // must always be consulted as well.
  MemTableRep* backup_table = backup_table_.get();
  if (backup_table != nullptr) {
    backup_table->Get(key, callback_args, callback_func);
  }
}
+
// Inserts the arena-allocated entry referenced by `handle`.  Single-writer
// only: concurrent readers are tolerated, concurrent writers are not.
void HashCuckooRep::Insert(KeyHandle handle) {
  // Beyond this fraction of occupied buckets the memtable declares itself
  // full (see ApproximateMemoryUsage) to avoid degrading insert performance.
  static const float kMaxFullness = 0.90;

  auto* key = static_cast<char*>(handle);
  int initial_hash_id = 0;
  size_t cuckoo_path_length = 0;
  auto user_key = UserKey(key);
  // find cuckoo path
  if (FindCuckooPath(key, user_key, cuckoo_path_, &cuckoo_path_length,
                     initial_hash_id) == false) {
    // FindCuckooPath() failed: no vacant bucket can be reached for this key
    // even after exhausting all hash functions.  Fall back to a backup
    // memtable to store such keys, which also makes this mem-table become
    // immutable.
    if (backup_table_.get() == nullptr) {
      VectorRepFactory factory(10);
      backup_table_.reset(factory.CreateMemTableRep(compare_, arena_, nullptr));
      is_nearly_full_ = true;
    }
    backup_table_->Insert(key);
    return;
  }
  // when reaching this point, means the insert can be done successfully.
  occupied_count_++;
  if (occupied_count_ >= bucket_count_ * kMaxFullness) {
    is_nearly_full_ = true;
  }

  // perform kickout process if the length of cuckoo path > 1.
  if (cuckoo_path_length == 0) return;

  // the cuckoo path stores the kickout path in reverse order.
  // so the kickout or displacement is actually performed
  // in reverse order, which avoids false-negatives on read
  // by moving each key involved in the cuckoo path to the new
  // location before replacing it.
  for (size_t i = 1; i < cuckoo_path_length; ++i) {
    int kicked_out_bid = cuckoo_path_[i - 1];
    int current_bid = cuckoo_path_[i];
    // since we only allow one writer at a time, it is safe to do relaxed read.
    cuckoo_array_[kicked_out_bid]
        .store(cuckoo_array_[current_bid].load(std::memory_order_relaxed),
               std::memory_order_release);
  }
  // Finally publish the new entry into the bucket freed by the displacement
  // chain (or the directly vacant bucket when the path has length 1).
  int insert_key_bid = cuckoo_path_[cuckoo_path_length - 1];
  cuckoo_array_[insert_key_bid].store(key, std::memory_order_release);
}
+
+bool HashCuckooRep::Contains(const char* internal_key) const {
+  auto user_key = UserKey(internal_key);
+  for (unsigned int hid = 0; hid < hash_function_count_; ++hid) {
+    const char* stored_key =
+        cuckoo_array_[GetHash(user_key, hid)].load(std::memory_order_acquire);
+    if (stored_key != nullptr) {
+      if (compare_(internal_key, stored_key) == 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
// Attempts a displacement-free insert of internal_key, filling bucket_ids
// with the key's candidate locations as a side effect for FindCuckooPath.
// Returns true when the key was stored (into a vacant bucket, or over a
// bucket holding the same user key); false when a full cuckoo-path search
// is required.
bool HashCuckooRep::QuickInsert(const char* internal_key, const Slice& user_key,
                                int bucket_ids[], const int initial_hash_id) {
  int cuckoo_bucket_id = -1;

  // Below does the following:
  // 0. Calculate all possible locations of the input key.
  // 1. Check if there is a bucket having same user_key as the input does.
  // 2. If there exists such bucket, then replace this bucket by the newly
  //    insert data and return.  This step also performs duplication check.
  // 3. If no such bucket exists but exists a vacant bucket, then insert the
  //    input data into it.
  // 4. If step 1 to 3 all fail, then return false.
  for (unsigned int hid = initial_hash_id; hid < hash_function_count_; ++hid) {
    bucket_ids[hid] = GetHash(user_key, hid);
    // since only one PUT is allowed at a time, and this is part of the PUT
    // operation, so we can safely perform relaxed load.
    const char* stored_key =
        cuckoo_array_[bucket_ids[hid]].load(std::memory_order_relaxed);
    if (stored_key == nullptr) {
      // Remember the first vacancy but keep scanning: a later bucket may
      // hold the same user key, which must take precedence.
      if (cuckoo_bucket_id == -1) {
        cuckoo_bucket_id = bucket_ids[hid];
      }
    } else {
      const auto bucket_user_key = UserKey(stored_key);
      if (bucket_user_key.compare(user_key) == 0) {
        cuckoo_bucket_id = bucket_ids[hid];
        break;
      }
    }
  }

  if (cuckoo_bucket_id != -1) {
    // Release store publishes the fully built entry to concurrent readers.
    cuckoo_array_[cuckoo_bucket_id]
        .store(internal_key, std::memory_order_release);
    return true;
  }

  return false;
}
+
// Perform pre-check and find the shortest cuckoo path.  A cuckoo path
// is a displacement sequence for inserting the specified input key.
//
// @return true if it successfully found a vacant space or cuckoo-path.
//     If the return value is true but the length of cuckoo_path is zero,
//     then it indicates that a vacant bucket or a bucket with matched user
//     key with the input is found, and a quick insertion is done.
bool HashCuckooRep::FindCuckooPath(const char* internal_key,
                                   const Slice& user_key, int* cuckoo_path,
                                   size_t* cuckoo_path_length,
                                   const int initial_hash_id) {
  int bucket_ids[HashCuckooRepFactory::kMaxHashCount];
  *cuckoo_path_length = 0;

  // Fast path: a vacant (or same-user-key) bucket exists; nothing to move.
  if (QuickInsert(internal_key, user_key, bucket_ids, initial_hash_id)) {
    return true;
  }
  // If this step is reached, then it means:
  // 1. no vacant bucket in any of the possible locations of the input key.
  // 2. none of the possible locations of the input key has the same user
  //    key as the input `internal_key`.

  // the front and back indices for the step_queue_
  step_buffer_.reset();

  // Seed the BFS with one step per candidate bucket of the input key.
  for (unsigned int hid = initial_hash_id; hid < hash_function_count_; ++hid) {
    /// CuckooStep& current_step = step_queue_[front_pos++];
    CuckooStep& current_step = step_buffer_.NextWriteBuffer();
    current_step.bucket_id_ = bucket_ids[hid];
    current_step.prev_step_id_ = CuckooStep::kNullStep;
    current_step.depth_ = 1;
  }

  while (step_buffer_.HasNewWrite()) {
    int step_id = step_buffer_.read_index_;
    const CuckooStep& step = step_buffer_.ReadNext();
    // Since it's a BFS process, then the first step with its depth deeper
    // than the maximum allowed depth indicates all the remaining steps
    // in the step buffer queue will all exceed the maximum depth.
    // Return false immediately indicating we can't find a vacant bucket
    // for the input key before the maximum allowed depth.
    if (step.depth_ >= cuckoo_path_max_depth_) {
      return false;
    }
    // again, we can perform no barrier load safely here as the current
    // thread is the only writer.
    auto bucket_user_key =
        UserKey(cuckoo_array_[step.bucket_id_].load(std::memory_order_relaxed));
    if (step.prev_step_id_ != CuckooStep::kNullStep) {
      if (bucket_user_key.compare(user_key) == 0) {
        // then there is a loop in the current path, stop discovering this path.
        continue;
      }
    }
    // if the current bucket stores at its nth location, then we only consider
    // its mth location where m > n.  This property makes sure that all reads
    // will not miss if we do have data associated to the query key.
    //
    // The n and m in the above statement is the start_hid and hid in the code.
    unsigned int start_hid = hash_function_count_;
    for (unsigned int hid = 0; hid < hash_function_count_; ++hid) {
      bucket_ids[hid] = GetHash(bucket_user_key, hid);
      if (step.bucket_id_ == bucket_ids[hid]) {
        start_hid = hid;
      }
    }
    // must have found a bucket which is its current "home".
    assert(start_hid != hash_function_count_);

    // explore all possible next steps from the current step.
    for (unsigned int hid = start_hid + 1; hid < hash_function_count_; ++hid) {
      CuckooStep& next_step = step_buffer_.NextWriteBuffer();
      next_step.bucket_id_ = bucket_ids[hid];
      next_step.prev_step_id_ = step_id;
      next_step.depth_ = step.depth_ + 1;
      // once a vacant bucket is found, trace back all its previous steps
      // to generate a cuckoo path.
      if (cuckoo_array_[next_step.bucket_id_].load(std::memory_order_relaxed) ==
          nullptr) {
        // store the last step in the cuckoo path.  Note that cuckoo_path
        // stores steps in reverse order.  This allows us to move keys along
        // the cuckoo path by storing each key to the new place first before
        // removing it from the old place.  This property ensures reads will
        // not be missed due to moving keys along the cuckoo path.
        cuckoo_path[(*cuckoo_path_length)++] = next_step.bucket_id_;
        int depth;
        for (depth = step.depth_; depth > 0 && step_id != CuckooStep::kNullStep;
             depth--) {
          const CuckooStep& prev_step = step_buffer_.steps_[step_id];
          cuckoo_path[(*cuckoo_path_length)++] = prev_step.bucket_id_;
          step_id = prev_step.prev_step_id_;
        }
        assert(depth == 0 && step_id == CuckooStep::kNullStep);
        return true;
      }
      if (step_buffer_.IsFull()) {
        // if true, then it reaches maximum number of cuckoo search steps.
        return false;
      }
    }
  }

  // tried all possible paths but were still unable to find a cuckoo path
  // which leads to a vacant bucket.
  return false;
}
+
// Takes shared ownership of the materialized entry vector.  The iterator
// starts out invalid (positioned at end); sorting is deferred until the
// first operation that needs order (see DoSort).
HashCuckooRep::Iterator::Iterator(
    std::shared_ptr<std::vector<const char*>> bucket,
    const KeyComparator& compare)
    : bucket_(bucket),
      cit_(bucket_->end()),
      compare_(compare),
      sorted_(false) {}
+
+void HashCuckooRep::Iterator::DoSort() const {
+  if (!sorted_) {
+    std::sort(bucket_->begin(), bucket_->end(),
+              stl_wrappers::Compare(compare_));
+    cit_ = bucket_->begin();
+    sorted_ = true;
+  }
+}
+
// Returns true iff the iterator is positioned at a valid node.
// Triggers the lazy sort so that position and order are consistent.
bool HashCuckooRep::Iterator::Valid() const {
  DoSort();
  return cit_ != bucket_->end();
}
+
// Returns the key at the current position.
// REQUIRES: Valid()
const char* HashCuckooRep::Iterator::key() const {
  assert(Valid());
  return *cit_;
}
+
+// Advances to the next position.
+// REQUIRES: Valid()
+void HashCuckooRep::Iterator::Next() {
+  assert(Valid());
+  if (cit_ == bucket_->end()) {
+    return;
+  }
+  ++cit_;
+}
+
+// Advances to the previous position.
+// REQUIRES: Valid()
+void HashCuckooRep::Iterator::Prev() {
+  assert(Valid());
+  if (cit_ == bucket_->begin()) {
+    // If you try to go back from the first element, the iterator should be
+    // invalidated. So we set it to past-the-end. This means that you can
+    // treat the container circularly.
+    cit_ = bucket_->end();
+  } else {
+    --cit_;
+  }
+}
+
+// Advance to the first entry with a key >= target
+void HashCuckooRep::Iterator::Seek(const Slice& user_key,
+                                   const char* memtable_key) {
+  DoSort();
+  // Do binary search to find first value not less than the target
+  const char* encoded_key =
+      (memtable_key != nullptr) ? memtable_key : EncodeKey(&tmp_, user_key);
+  cit_ = std::equal_range(bucket_->begin(), bucket_->end(), encoded_key,
+                          [this](const char* a, const char* b) {
+                            return compare_(a, b) < 0;
+                          }).first;
+}
+
// Position at the first entry in collection.
// Final state of iterator is Valid() iff collection is not empty.
void HashCuckooRep::Iterator::SeekToFirst() {
  DoSort();
  cit_ = bucket_->begin();
}
+
+// Position at the last entry in collection.
+// Final state of iterator is Valid() iff collection is not empty.
+void HashCuckooRep::Iterator::SeekToLast() {
+  DoSort();
+  cit_ = bucket_->end();
+  if (bucket_->size() != 0) {
+    --cit_;
+  }
+}
+
+}  // anonymous namespace
+
// Builds a HashCuckooRep sized from the factory's write-buffer budget.
// `transform` is unused: cuckoo hashing has no prefix structure.
MemTableRep* HashCuckooRepFactory::CreateMemTableRep(
    const MemTableRep::KeyComparator& compare, Arena* arena,
    const SliceTransform* transform) {
  // The estimated average fullness.  The write performance of any closed
  // hashing scheme degrades as the fullness of the mem-table increases.
  // Setting kFullness to a value around 0.7 can better avoid write
  // performance degradation while keeping efficient memory usage.
  static const float kFullness = 0.7;
  size_t pointer_size = sizeof(std::atomic<const char*>);
  // Precondition on the factory parameters: at least one entry must fit.
  assert(write_buffer_size_ >= (average_data_size_ + pointer_size));
  // Estimated entry capacity divided by the target fullness, +1 so the
  // bucket count is never zero.
  size_t bucket_count =
      (write_buffer_size_ / (average_data_size_ + pointer_size)) / kFullness +
      1;
  // Clamp the configured hash-function count to [2, kMaxHashCount].
  unsigned int hash_function_count = hash_function_count_;
  if (hash_function_count < 2) {
    hash_function_count = 2;
  }
  if (hash_function_count > kMaxHashCount) {
    hash_function_count = kMaxHashCount;
  }
  return new HashCuckooRep(compare, arena, bucket_count, hash_function_count);
}
+
// Public factory entry point declared in util/hash_cuckoo_rep.h; the caller
// owns the returned factory.
MemTableRepFactory* NewHashCuckooRepFactory(size_t write_buffer_size,
                                            size_t average_data_size,
                                            unsigned int hash_function_count) {
  return new HashCuckooRepFactory(write_buffer_size, average_data_size,
                                  hash_function_count);
}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/util/hash_cuckoo_rep.h b/util/hash_cuckoo_rep.h
new file mode 100644 (file)
index 0000000..8f97ed4
--- /dev/null
@@ -0,0 +1,42 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#pragma once
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/memtablerep.h"
+
+namespace rocksdb {
+
// Factory producing cuckoo-hash memtables.  write_buffer_size and
// average_data_size together determine the bucket count of each created
// table; hash_function_count is clamped to [2, kMaxHashCount] at creation
// time (see CreateMemTableRep).
class HashCuckooRepFactory : public MemTableRepFactory {
 public:
  // maximum number of hash functions used in the cuckoo hash.
  static const unsigned int kMaxHashCount = 10;

  explicit HashCuckooRepFactory(size_t write_buffer_size,
                                size_t average_data_size,
                                unsigned int hash_function_count)
      : write_buffer_size_(write_buffer_size),
        average_data_size_(average_data_size),
        hash_function_count_(hash_function_count) {}

  virtual ~HashCuckooRepFactory() {}

  virtual MemTableRep* CreateMemTableRep(
      const MemTableRep::KeyComparator& compare, Arena* arena,
      const SliceTransform* transform) override;

  virtual const char* Name() const override { return "HashCuckooRepFactory"; }

 private:
  size_t write_buffer_size_;
  size_t average_data_size_;
  const unsigned int hash_function_count_;
};
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc
new file mode 100644 (file)
index 0000000..64aa2d9
--- /dev/null
@@ -0,0 +1,480 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#ifndef ROCKSDB_LITE
+#include "util/hash_linklist_rep.h"
+
+#include "rocksdb/memtablerep.h"
+#include "util/arena.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "port/port.h"
+#include "port/atomic_pointer.h"
+#include "util/murmurhash.h"
+#include "db/memtable.h"
+#include "db/skiplist.h"
+
+namespace rocksdb {
+namespace {
+
+typedef const char* Key;
+
+// Singly-linked list node.  The encoded key is stored inline right after
+// the node header via the zero-length trailing array (a GNU extension),
+// so a single arena allocation covers both the node and its key bytes.
+struct Node {
+  // Accessors/mutators for links.  Wrapped in methods so we can
+  // add the appropriate barriers as necessary.
+  Node* Next() {
+    // Use an 'acquire load' so that we observe a fully initialized
+    // version of the returned Node.
+    return reinterpret_cast<Node*>(next_.Acquire_Load());
+  }
+  void SetNext(Node* x) {
+    // Use a 'release store' so that anybody who reads through this
+    // pointer observes a fully initialized version of the inserted node.
+    next_.Release_Store(x);
+  }
+  // No-barrier variants that can be safely used in a few locations.
+  Node* NoBarrier_Next() {
+    return reinterpret_cast<Node*>(next_.NoBarrier_Load());
+  }
+
+  void NoBarrier_SetNext(Node* x) {
+    next_.NoBarrier_Store(x);
+  }
+
+ private:
+  port::AtomicPointer next_;  // successor within the bucket's list
+ public:
+  char key[0];  // flexible array: key bytes immediately follow the node
+};
+
+// Memtable rep that hashes the transformed (prefixed) user key into a
+// fixed array of buckets; each bucket is a sorted singly linked list of
+// Nodes.  Nodes are published with release stores so readers running
+// concurrently with the (single) writer observe fully built nodes —
+// assumption based on the barrier usage in Insert(); confirm the
+// one-writer contract against the memtable write path.
+class HashLinkListRep : public MemTableRep {
+ public:
+  HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
+                  const SliceTransform* transform, size_t bucket_size);
+
+  // Reserves a node plus `len` bytes of key storage from the arena;
+  // *buf receives the address where the caller encodes the key.
+  virtual KeyHandle Allocate(const size_t len, char** buf) override;
+
+  virtual void Insert(KeyHandle handle) override;
+
+  virtual bool Contains(const char* key) const override;
+
+  virtual size_t ApproximateMemoryUsage() override;
+
+  virtual void Get(const LookupKey& k, void* callback_args,
+                   bool (*callback_func)(void* arg,
+                                         const char* entry)) override;
+
+  virtual ~HashLinkListRep();
+
+  virtual MemTableRep::Iterator* GetIterator() override;
+
+  virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override;
+
+  virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override;
+
+ private:
+  friend class DynamicIterator;
+  // Skip-list type used only for full (total-order) iteration snapshots.
+  typedef SkipList<const char*, const MemTableRep::KeyComparator&> FullList;
+
+  // Number of buckets; fixed at construction time.
+  size_t bucket_size_;
+
+  // Maps slices (which are transformed user keys) to buckets of keys sharing
+  // the same transform.
+  port::AtomicPointer* buckets_;
+
+  // The user-supplied transform whose domain is the user keys.
+  const SliceTransform* transform_;
+
+  const MemTableRep::KeyComparator& compare_;
+
+  bool BucketContains(Node* head, const Slice& key) const;
+
+  // Prefix of an internal key = transform of its user-key portion.
+  Slice GetPrefix(const Slice& internal_key) const {
+    return transform_->Transform(ExtractUserKey(internal_key));
+  }
+
+  size_t GetHash(const Slice& slice) const {
+    return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_;
+  }
+
+  // Acquire-load pairs with the release store performed when a bucket
+  // head is published in Insert().
+  Node* GetBucket(size_t i) const {
+    return static_cast<Node*>(buckets_[i].Acquire_Load());
+  }
+
+  Node* GetBucket(const Slice& slice) const {
+    return GetBucket(GetHash(slice));
+  }
+
+  bool Equal(const Slice& a, const Key& b) const {
+    return (compare_(b, a) == 0);
+  }
+
+
+  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
+
+  bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const {
+    // nullptr n is considered infinite
+    return (n != nullptr) && (compare_(n->key, internal_key) < 0);
+  }
+
+  bool KeyIsAfterNode(const Key& key, const Node* n) const {
+    // nullptr n is considered infinite
+    return (n != nullptr) && (compare_(n->key, key) < 0);
+  }
+
+
+  Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const;
+
+  // Total-order iterator over a sorted skip-list snapshot of all
+  // buckets.  Owns both the snapshot list and the arena backing it.
+  class FullListIterator : public MemTableRep::Iterator {
+   public:
+    explicit FullListIterator(FullList* list, Arena* arena)
+      : iter_(list), full_list_(list), arena_(arena) {}
+
+    virtual ~FullListIterator() {
+    }
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const {
+      return iter_.Valid();
+    }
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const {
+      assert(Valid());
+      return iter_.key();
+    }
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() {
+      assert(Valid());
+      iter_.Next();
+    }
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() {
+      assert(Valid());
+      iter_.Prev();
+    }
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& internal_key, const char* memtable_key) {
+      const char* encoded_key =
+          (memtable_key != nullptr) ?
+              memtable_key : EncodeKey(&tmp_, internal_key);
+      iter_.Seek(encoded_key);
+    }
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() {
+      iter_.SeekToFirst();
+    }
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() {
+      iter_.SeekToLast();
+    }
+   private:
+    FullList::Iterator iter_;
+    // To destruct with the iterator.
+    std::unique_ptr<FullList> full_list_;
+    std::unique_ptr<Arena> arena_;
+    std::string tmp_;       // For passing to EncodeKey
+  };
+
+  // Forward-only iterator over one bucket's linked list.  Prev(),
+  // SeekToFirst() and SeekToLast() are unsupported (prefix iteration has
+  // no total order) and simply invalidate the iterator.
+  class Iterator : public MemTableRep::Iterator {
+   public:
+    explicit Iterator(const HashLinkListRep* const hash_link_list_rep,
+                      Node* head) :
+        hash_link_list_rep_(hash_link_list_rep), head_(head), node_(nullptr) {
+    }
+
+    virtual ~Iterator() {
+    }
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const {
+      return node_ != nullptr;
+    }
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const {
+      assert(Valid());
+      return node_->key;
+    }
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() {
+      assert(Valid());
+      node_ = node_->Next();
+    }
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() {
+      // Prefix iterator does not support total order.
+      // We simply set the iterator to invalid state
+      Reset(nullptr);
+    }
+
+    // Advance to the first entry with a key >= target.
+    // Note: memtable_key is ignored; the seek always re-compares from
+    // the bucket head using the internal key.
+    virtual void Seek(const Slice& internal_key, const char* memtable_key) {
+      node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_,
+                                                              internal_key);
+    }
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() {
+      // Prefix iterator does not support total order.
+      // We simply set the iterator to invalid state
+      Reset(nullptr);
+    }
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() {
+      // Prefix iterator does not support total order.
+      // We simply set the iterator to invalid state
+      Reset(nullptr);
+    }
+
+   protected:
+    // Re-point the iterator at a (possibly different) bucket head and
+    // mark it invalid until the next Seek/SeekToHead.
+    void Reset(Node* head) {
+      head_ = head;
+      node_ = nullptr;
+    }
+   private:
+    friend class HashLinkListRep;
+    const HashLinkListRep* const hash_link_list_rep_;
+    Node* head_;
+    Node* node_;
+    std::string tmp_;       // For passing to EncodeKey
+
+    // Position at the bucket head; used when snapshotting whole buckets.
+    virtual void SeekToHead() {
+      node_ = head_;
+    }
+  };
+
+  // Iterator that re-resolves the target bucket on every Seek, so it
+  // remains usable as new prefixes are inserted into the memtable.
+  class DynamicIterator : public HashLinkListRep::Iterator {
+   public:
+    explicit DynamicIterator(HashLinkListRep& memtable_rep)
+      : HashLinkListRep::Iterator(&memtable_rep, nullptr),
+        memtable_rep_(memtable_rep) {}
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& k, const char* memtable_key) {
+      auto transformed = memtable_rep_.GetPrefix(k);
+      Reset(memtable_rep_.GetBucket(transformed));
+      HashLinkListRep::Iterator::Seek(k, memtable_key);
+    }
+
+   private:
+    // the underlying memtable
+    const HashLinkListRep& memtable_rep_;
+  };
+
+  class EmptyIterator : public MemTableRep::Iterator {
+    // This is used when there wasn't a bucket. It is cheaper than
+    // instantiating an empty bucket over which to iterate.
+   public:
+    EmptyIterator() { }
+    virtual bool Valid() const {
+      return false;
+    }
+    virtual const char* key() const {
+      assert(false);
+      return nullptr;
+    }
+    virtual void Next() { }
+    virtual void Prev() { }
+    virtual void Seek(const Slice& user_key, const char* memtable_key) { }
+    virtual void SeekToFirst() { }
+    virtual void SeekToLast() { }
+   private:
+  };
+};
+
+// The bucket pointer array is carved out of the memtable arena, so it is
+// never explicitly freed; every slot starts out empty (nullptr).
+HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
+                                 Arena* arena, const SliceTransform* transform,
+                                 size_t bucket_size)
+  : MemTableRep(arena),
+    bucket_size_(bucket_size),
+    transform_(transform),
+    compare_(compare) {
+  char* mem = arena_->AllocateAligned(
+      sizeof(port::AtomicPointer) * bucket_size);
+
+  buckets_ = new (mem) port::AtomicPointer[bucket_size];
+
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    buckets_[i].NoBarrier_Store(nullptr);
+  }
+}
+
+// Nothing to release explicitly: nodes and the bucket array both live in
+// the arena, whose lifetime is managed by the memtable.
+HashLinkListRep::~HashLinkListRep() {
+}
+
+// Carve a Node plus `len` bytes of key storage out of the arena in one
+// aligned allocation.  The caller encodes the key into *buf and later
+// hands the returned handle to Insert().
+KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
+  const size_t total_bytes = sizeof(Node) + len;
+  char* raw = arena_->AllocateAligned(total_bytes);
+  Node* node = new (raw) Node();
+  *buf = node->key;
+  return static_cast<void*>(node);
+}
+
+// Link a previously Allocate()d node into its bucket's sorted list.
+// Readers may run concurrently: the new node is fully built before being
+// published via a release store (either the bucket head or prev->next).
+// NOTE(review): the search/update itself is not synchronized, implying a
+// single-writer contract — confirm against the memtable write path.
+void HashLinkListRep::Insert(KeyHandle handle) {
+  Node* x = static_cast<Node*>(handle);
+  assert(!Contains(x->key));
+  Slice internal_key = GetLengthPrefixedSlice(x->key);
+  auto transformed = GetPrefix(internal_key);
+  auto& bucket = buckets_[GetHash(transformed)];
+  Node* head = static_cast<Node*>(bucket.Acquire_Load());
+
+  if (!head) {
+    // First entry for this prefix: publish x as the new bucket head.
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "x" in prev[i].
+    x->NoBarrier_SetNext(nullptr);
+    bucket.Release_Store(static_cast<void*>(x));
+    return;
+  }
+
+  // Walk the list to find the insertion point (first node >= x).
+  Node* cur = head;
+  Node* prev = nullptr;
+  while (true) {
+    if (cur == nullptr) {
+      break;
+    }
+    Node* next = cur->Next();
+    // Make sure the lists are sorted.
+    // If x points to head_ or next points nullptr, it is trivially satisfied.
+    assert((cur == head) || (next == nullptr) ||
+           KeyIsAfterNode(next->key, cur));
+    if (KeyIsAfterNode(internal_key, cur)) {
+      // Keep searching in this list
+      prev = cur;
+      cur = next;
+    } else {
+      break;
+    }
+  }
+
+  // Our data structure does not allow duplicate insertion
+  assert(cur == nullptr || !Equal(x->key, cur->key));
+
+  // NoBarrier_SetNext() suffices since we will add a barrier when
+  // we publish a pointer to "x" in prev[i].
+  x->NoBarrier_SetNext(cur);
+
+  // Publish: either splice after prev (release store inside SetNext) or
+  // install x as the new head of the bucket.
+  if (prev) {
+    prev->SetNext(x);
+  } else {
+    bucket.Release_Store(static_cast<void*>(x));
+  }
+}
+
+// Returns true iff an entry equal to `key` (a length-prefixed encoded
+// internal key) is present in this rep.
+bool HashLinkListRep::Contains(const char* key) const {
+  const Slice internal_key = GetLengthPrefixedSlice(key);
+  Node* const head = GetBucket(GetPrefix(internal_key));
+  return head != nullptr && BucketContains(head, internal_key);
+}
+
+size_t HashLinkListRep::ApproximateMemoryUsage() {
+  // Memory is always allocated from the arena.
+  // The arena tracks its own usage, so this rep reports no extra bytes.
+  return 0;
+}
+
+void HashLinkListRep::Get(const LookupKey& k, void* callback_args,
+                          bool (*callback_func)(void* arg, const char* entry)) {
+  auto transformed = transform_->Transform(k.user_key());
+  auto bucket = GetBucket(transformed);
+  if (bucket != nullptr) {
+    Iterator iter(this, bucket);
+    for (iter.Seek(k.internal_key(), nullptr);
+         iter.Valid() && callback_func(callback_args, iter.key());
+         iter.Next()) {
+    }
+  }
+}
+
+MemTableRep::Iterator* HashLinkListRep::GetIterator() {
+  // allocate a new arena of similar size to the one currently in use
+  Arena* new_arena = new Arena(arena_->BlockSize());
+  auto list = new FullList(compare_, new_arena);
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    auto bucket = GetBucket(i);
+    if (bucket != nullptr) {
+      Iterator itr(this, bucket);
+      for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
+        list->Insert(itr.key());
+      }
+    }
+  }
+  return new FullListIterator(list, new_arena);
+}
+
+// Iterator over the single bucket that `slice`'s prefix maps to; a cheap
+// EmptyIterator is returned when that bucket was never populated.
+MemTableRep::Iterator* HashLinkListRep::GetIterator(const Slice& slice) {
+  Node* const head = GetBucket(transform_->Transform(slice));
+  if (head != nullptr) {
+    return new Iterator(this, head);
+  }
+  return new EmptyIterator();
+}
+
+// Iterator that re-resolves the target bucket on each Seek, so it stays
+// usable while new prefixes are being inserted.
+MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator() {
+  return new DynamicIterator(*this);
+}
+
+// True iff the bucket rooted at `head` holds an entry comparing equal to
+// user_key.
+bool HashLinkListRep::BucketContains(Node* head, const Slice& user_key) const {
+  Node* const candidate = FindGreaterOrEqualInBucket(head, user_key);
+  if (candidate == nullptr) {
+    return false;
+  }
+  return Equal(user_key, candidate->key);
+}
+
+Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
+                                                  const Slice& key) const {
+  Node* x = head;
+  while (true) {
+    if (x == nullptr) {
+      return x;
+    }
+    Node* next = x->Next();
+    // Make sure the lists are sorted.
+    // If x points to head_ or next points nullptr, it is trivially satisfied.
+    assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x));
+    if (KeyIsAfterNode(key, x)) {
+      // Keep searching in this list
+      x = next;
+    } else {
+      break;
+    }
+  }
+  return x;
+}
+
+} // anon namespace
+
+// Builds one bucketed linked-list rep sized with this factory's
+// configured bucket count.  The arena and transform are owned by the
+// caller and must outlive the returned rep.
+MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
+    const MemTableRep::KeyComparator& compare, Arena* arena,
+    const SliceTransform* transform) {
+  return new HashLinkListRep(compare, arena, transform, bucket_count_);
+}
+
+// Public factory entry point for the hash-linklist memtable rep.
+// Caller takes ownership of the returned factory.
+MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count) {
+  return new HashLinkListRepFactory(bucket_count);
+}
+
+} // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/util/hash_linklist_rep.h b/util/hash_linklist_rep.h
new file mode 100644 (file)
index 0000000..f1ab5d5
--- /dev/null
@@ -0,0 +1,36 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#pragma once
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/memtablerep.h"
+
+namespace rocksdb {
+
+// Factory producing hash-linklist memtable reps, each with a fixed
+// number of hash buckets.  Implementation: util/hash_linklist_rep.cc.
+class HashLinkListRepFactory : public MemTableRepFactory {
+ public:
+  // bucket_count: number of hash buckets every created rep will use.
+  explicit HashLinkListRepFactory(size_t bucket_count)
+      : bucket_count_(bucket_count) { }
+
+  virtual ~HashLinkListRepFactory() {}
+
+  virtual MemTableRep* CreateMemTableRep(
+      const MemTableRep::KeyComparator& compare, Arena* arena,
+      const SliceTransform* transform) override;
+
+  virtual const char* Name() const override {
+    return "HashLinkListRepFactory";
+  }
+
+ private:
+  const size_t bucket_count_;
+};
+
+}
+#endif  // ROCKSDB_LITE
diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc
new file mode 100644 (file)
index 0000000..21df9f6
--- /dev/null
@@ -0,0 +1,336 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#ifndef ROCKSDB_LITE
+#include "util/hash_skiplist_rep.h"
+
+#include "rocksdb/memtablerep.h"
+#include "util/arena.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "port/port.h"
+#include "port/atomic_pointer.h"
+#include "util/murmurhash.h"
+#include "db/memtable.h"
+#include "db/skiplist.h"
+
+namespace rocksdb {
+namespace {
+
+// Memtable rep that hashes the transformed (prefixed) user key into a
+// fixed array of buckets, each bucket being its own skip list.  Gives
+// fast prefix seeks while still supporting a merged total-order scan
+// via GetIterator().
+class HashSkipListRep : public MemTableRep {
+ public:
+  HashSkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
+                  const SliceTransform* transform, size_t bucket_size,
+                  int32_t skiplist_height, int32_t skiplist_branching_factor);
+
+  virtual void Insert(KeyHandle handle) override;
+
+  virtual bool Contains(const char* key) const override;
+
+  virtual size_t ApproximateMemoryUsage() override;
+
+  virtual void Get(const LookupKey& k, void* callback_args,
+                   bool (*callback_func)(void* arg,
+                                         const char* entry)) override;
+
+  virtual ~HashSkipListRep();
+
+  virtual MemTableRep::Iterator* GetIterator() override;
+
+  virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override;
+
+  virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override;
+
+ private:
+  friend class DynamicIterator;
+  typedef SkipList<const char*, const MemTableRep::KeyComparator&> Bucket;
+
+  // Number of buckets; fixed at construction time.
+  size_t bucket_size_;
+
+  // Shape parameters forwarded to each per-bucket skip list.
+  const int32_t skiplist_height_;
+  const int32_t skiplist_branching_factor_;
+
+  // Maps slices (which are transformed user keys) to buckets of keys sharing
+  // the same transform.
+  port::AtomicPointer* buckets_;
+
+  // The user-supplied transform whose domain is the user keys.
+  const SliceTransform* transform_;
+
+  const MemTableRep::KeyComparator& compare_;
+  // immutable after construction
+  // NOTE(review): this member appears to shadow a base-class arena_ —
+  // confirm against MemTableRep's declaration.
+  Arena* const arena_;
+
+  inline size_t GetHash(const Slice& slice) const {
+    return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_;
+  }
+  // Acquire-load pairs with the release store in GetInitializedBucket().
+  inline Bucket* GetBucket(size_t i) const {
+    return static_cast<Bucket*>(buckets_[i].Acquire_Load());
+  }
+  inline Bucket* GetBucket(const Slice& slice) const {
+    return GetBucket(GetHash(slice));
+  }
+  // Get a bucket from buckets_. If the bucket hasn't been initialized yet,
+  // initialize it before returning.
+  Bucket* GetInitializedBucket(const Slice& transformed);
+
+  // Iterator over one bucket's skip list (or, when constructed with
+  // own_list=true, over an owned merged list that it deletes on
+  // destruction).  A null list_ means permanently !Valid().
+  class Iterator : public MemTableRep::Iterator {
+   public:
+    explicit Iterator(Bucket* list, bool own_list = true,
+                      Arena* arena = nullptr)
+        : list_(list), iter_(list), own_list_(own_list), arena_(arena) {}
+
+    virtual ~Iterator() {
+      // if we own the list, we should also delete it
+      if (own_list_) {
+        assert(list_ != nullptr);
+        delete list_;
+      }
+    }
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const {
+      return list_ != nullptr && iter_.Valid();
+    }
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const {
+      assert(Valid());
+      return iter_.key();
+    }
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() {
+      assert(Valid());
+      iter_.Next();
+    }
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() {
+      assert(Valid());
+      iter_.Prev();
+    }
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& internal_key, const char* memtable_key) {
+      if (list_ != nullptr) {
+        const char* encoded_key =
+            (memtable_key != nullptr) ?
+                memtable_key : EncodeKey(&tmp_, internal_key);
+        iter_.Seek(encoded_key);
+      }
+    }
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() {
+      if (list_ != nullptr) {
+        iter_.SeekToFirst();
+      }
+    }
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() {
+      if (list_ != nullptr) {
+        iter_.SeekToLast();
+      }
+    }
+   protected:
+    // Re-point the iterator at a different (non-owned) bucket, freeing
+    // the previous list if this iterator owned it.
+    void Reset(Bucket* list) {
+      if (own_list_) {
+        assert(list_ != nullptr);
+        delete list_;
+      }
+      list_ = list;
+      iter_.SetList(list);
+      own_list_ = false;
+    }
+   private:
+    // if list_ is nullptr, we should NEVER call any methods on iter_
+    // if list_ is nullptr, this Iterator is not Valid()
+    Bucket* list_;
+    Bucket::Iterator iter_;
+    // here we track if we own list_. If we own it, we are also
+    // responsible for it's cleaning. This is a poor man's shared_ptr
+    bool own_list_;
+    std::unique_ptr<Arena> arena_;
+    std::string tmp_;       // For passing to EncodeKey
+  };
+
+  // Iterator that re-resolves the target bucket on every Seek, so it
+  // remains usable while new prefixes are being inserted.
+  class DynamicIterator : public HashSkipListRep::Iterator {
+   public:
+    explicit DynamicIterator(const HashSkipListRep& memtable_rep)
+      : HashSkipListRep::Iterator(nullptr, false),
+        memtable_rep_(memtable_rep) {}
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& k, const char* memtable_key) {
+      auto transformed = memtable_rep_.transform_->Transform(ExtractUserKey(k));
+      Reset(memtable_rep_.GetBucket(transformed));
+      HashSkipListRep::Iterator::Seek(k, memtable_key);
+    }
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() {
+      // Prefix iterator does not support total order.
+      // We simply set the iterator to invalid state
+      Reset(nullptr);
+    }
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() {
+      // Prefix iterator does not support total order.
+      // We simply set the iterator to invalid state
+      Reset(nullptr);
+    }
+   private:
+    // the underlying memtable
+    const HashSkipListRep& memtable_rep_;
+  };
+
+  class EmptyIterator : public MemTableRep::Iterator {
+    // This is used when there wasn't a bucket. It is cheaper than
+    // instantiating an empty bucket over which to iterate.
+   public:
+    EmptyIterator() { }
+    virtual bool Valid() const {
+      return false;
+    }
+    virtual const char* key() const {
+      assert(false);
+      return nullptr;
+    }
+    virtual void Next() { }
+    virtual void Prev() { }
+    virtual void Seek(const Slice& internal_key,
+                      const char* memtable_key) { }
+    virtual void SeekToFirst() { }
+    virtual void SeekToLast() { }
+   private:
+  };
+};
+
+// compare/arena/transform are owned by the caller and must outlive this
+// rep.  Unlike the skip-list nodes (arena-allocated), the bucket pointer
+// array is heap-allocated here and released in the destructor.
+HashSkipListRep::HashSkipListRep(const MemTableRep::KeyComparator& compare,
+                                 Arena* arena, const SliceTransform* transform,
+                                 size_t bucket_size, int32_t skiplist_height,
+                                 int32_t skiplist_branching_factor)
+    : MemTableRep(arena),
+      bucket_size_(bucket_size),
+      skiplist_height_(skiplist_height),
+      skiplist_branching_factor_(skiplist_branching_factor),
+      transform_(transform),
+      compare_(compare),
+      arena_(arena) {
+  buckets_ = new port::AtomicPointer[bucket_size];
+
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    buckets_[i].NoBarrier_Store(nullptr);
+  }
+}
+
+// Only the bucket pointer array is heap-owned; the Bucket skip lists are
+// placement-new'd into the arena (see GetInitializedBucket) and need no
+// explicit deletion.
+HashSkipListRep::~HashSkipListRep() {
+  delete[] buckets_;
+}
+
+// Get a bucket from buckets_. If the bucket hasn't been initialized yet,
+// initialize it before returning.  The new bucket is built in the arena
+// and published with a release store so concurrent readers see it fully
+// constructed.
+// NOTE(review): the load/construct/store sequence is not atomic, which
+// implies inserters are externally serialized — confirm against the
+// memtable write path.
+HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket(
+    const Slice& transformed) {
+  size_t hash = GetHash(transformed);
+  auto bucket = GetBucket(hash);
+  if (bucket == nullptr) {
+    auto addr = arena_->AllocateAligned(sizeof(Bucket));
+    bucket = new (addr) Bucket(compare_, arena_, skiplist_height_,
+                               skiplist_branching_factor_);
+    buckets_[hash].Release_Store(static_cast<void*>(bucket));
+  }
+  return bucket;
+}
+
+// Insert the encoded entry referenced by `handle` into the skip list of
+// its prefix bucket, creating that bucket on first use.
+// REQUIRES: no duplicate of this key was inserted previously.
+void HashSkipListRep::Insert(KeyHandle handle) {
+  char* const entry = static_cast<char*>(handle);
+  assert(!Contains(entry));
+  const Slice prefix = transform_->Transform(UserKey(entry));
+  GetInitializedBucket(prefix)->Insert(entry);
+}
+
+// Returns true iff an entry equal to `key` exists in its prefix bucket.
+bool HashSkipListRep::Contains(const char* key) const {
+  Bucket* const bucket = GetBucket(transform_->Transform(UserKey(key)));
+  return bucket != nullptr && bucket->Contains(key);
+}
+
+size_t HashSkipListRep::ApproximateMemoryUsage() {
+  // Report the memory allocated outside the arena: the bucket pointer
+  // array created in the constructor.  (The previous implementation
+  // returned sizeof(buckets_), which is merely the size of the pointer
+  // itself, not of the array it points to.)  Skip-list nodes live in the
+  // arena and are accounted for there.
+  return sizeof(port::AtomicPointer) * bucket_size_;
+}
+
+void HashSkipListRep::Get(const LookupKey& k, void* callback_args,
+                          bool (*callback_func)(void* arg, const char* entry)) {
+  auto transformed = transform_->Transform(k.user_key());
+  auto bucket = GetBucket(transformed);
+  if (bucket != nullptr) {
+    Bucket::Iterator iter(bucket);
+    for (iter.Seek(k.memtable_key().data());
+         iter.Valid() && callback_func(callback_args, iter.key());
+         iter.Next()) {
+    }
+  }
+}
+
+// Build a total-order snapshot by merging every bucket's entries into a
+// single skip list; the returned iterator owns both the merged list and
+// the arena backing it (own_list=true).
+MemTableRep::Iterator* HashSkipListRep::GetIterator() {
+  // allocate a new arena of similar size to the one currently in use
+  Arena* new_arena = new Arena(arena_->BlockSize());
+  auto list = new Bucket(compare_, new_arena);
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    auto bucket = GetBucket(i);
+    if (bucket != nullptr) {
+      Bucket::Iterator itr(bucket);
+      for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
+        list->Insert(itr.key());
+      }
+    }
+  }
+  return new Iterator(list, true, new_arena);
+}
+
+// Iterator over the single bucket that `slice`'s prefix maps to; a cheap
+// EmptyIterator is returned when that bucket was never populated.
+MemTableRep::Iterator* HashSkipListRep::GetIterator(const Slice& slice) {
+  Bucket* const bucket = GetBucket(transform_->Transform(slice));
+  if (bucket != nullptr) {
+    // Non-owning view of the live bucket.
+    return new Iterator(bucket, false);
+  }
+  return new EmptyIterator();
+}
+
+// Iterator that re-resolves the target bucket on each Seek, so it stays
+// usable while new prefixes are being inserted.
+MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator() {
+  return new DynamicIterator(*this);
+}
+
+} // anon namespace
+
+// Builds one bucketed skip-list rep using this factory's configured
+// bucket count and skip-list shape.  The arena and transform are owned
+// by the caller and must outlive the returned rep.
+MemTableRep* HashSkipListRepFactory::CreateMemTableRep(
+    const MemTableRep::KeyComparator& compare, Arena* arena,
+    const SliceTransform* transform) {
+  return new HashSkipListRep(compare, arena, transform, bucket_count_,
+                             skiplist_height_, skiplist_branching_factor_);
+}
+
+// Public factory entry point for the hash-skiplist memtable rep.
+// Caller takes ownership of the returned factory.
+MemTableRepFactory* NewHashSkipListRepFactory(
+    size_t bucket_count, int32_t skiplist_height,
+    int32_t skiplist_branching_factor) {
+  return new HashSkipListRepFactory(bucket_count, skiplist_height,
+      skiplist_branching_factor);
+}
+
+} // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/util/hash_skiplist_rep.h b/util/hash_skiplist_rep.h
new file mode 100644 (file)
index 0000000..16903c6
--- /dev/null
@@ -0,0 +1,43 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#pragma once
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/memtablerep.h"
+
+namespace rocksdb {
+
+// Factory producing hash-skiplist memtable reps.  Implementation:
+// util/hash_skiplist_rep.cc.
+class HashSkipListRepFactory : public MemTableRepFactory {
+ public:
+  // bucket_count: number of hash buckets per rep.
+  // skiplist_height / skiplist_branching_factor: shape parameters
+  //   forwarded to each per-bucket skip list.
+  explicit HashSkipListRepFactory(
+    size_t bucket_count,
+    int32_t skiplist_height,
+    int32_t skiplist_branching_factor)
+      : bucket_count_(bucket_count),
+        skiplist_height_(skiplist_height),
+        skiplist_branching_factor_(skiplist_branching_factor) { }
+
+  virtual ~HashSkipListRepFactory() {}
+
+  virtual MemTableRep* CreateMemTableRep(
+      const MemTableRep::KeyComparator& compare, Arena* arena,
+      const SliceTransform* transform) override;
+
+  virtual const char* Name() const override {
+    return "HashSkipListRepFactory";
+  }
+
+ private:
+  const size_t bucket_count_;
+  const int32_t skiplist_height_;
+  const int32_t skiplist_branching_factor_;
+};
+
+}
+#endif  // ROCKSDB_LITE
diff --git a/util/histogram.cc b/util/histogram.cc
new file mode 100644 (file)
index 0000000..968769c
--- /dev/null
@@ -0,0 +1,198 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/histogram.h"
+
+#include <cassert>
+#include <math.h>
+#include <stdio.h>
+#include "port/port.h"
+
+namespace rocksdb {
+
+// Builds the static value->bucket mapping shared by all histograms.
+// Bucket upper bounds grow roughly exponentially from 1 to 1e9.
+HistogramBucketMapper::HistogramBucketMapper()
+    :
+      // Add newer bucket index here.
+      // Should be always added in sorted order.
+      // If you change this, you also need to change
+      // size of array buckets_ in HistogramImpl
+      bucketValues_(
+          {1,         2,         3,         4,         5,         6,
+           7,         8,         9,         10,        12,        14,
+           16,        18,        20,        25,        30,        35,
+           40,        45,        50,        60,        70,        80,
+           90,        100,       120,       140,       160,       180,
+           200,       250,       300,       350,       400,       450,
+           500,       600,       700,       800,       900,       1000,
+           1200,      1400,      1600,      1800,      2000,      2500,
+           3000,      3500,      4000,      4500,      5000,      6000,
+           7000,      8000,      9000,      10000,     12000,     14000,
+           16000,     18000,     20000,     25000,     30000,     35000,
+           40000,     45000,     50000,     60000,     70000,     80000,
+           90000,     100000,    120000,    140000,    160000,    180000,
+           200000,    250000,    300000,    350000,    400000,    450000,
+           500000,    600000,    700000,    800000,    900000,    1000000,
+           1200000,   1400000,   1600000,   1800000,   2000000,   2500000,
+           3000000,   3500000,   4000000,   4500000,   5000000,   6000000,
+           7000000,   8000000,   9000000,   10000000,  12000000,  14000000,
+           16000000,  18000000,  20000000,  25000000,  30000000,  35000000,
+           40000000,  45000000,  50000000,  60000000,  70000000,  80000000,
+           90000000,  100000000, 120000000, 140000000, 160000000, 180000000,
+           200000000, 250000000, 300000000, 350000000, 400000000, 450000000,
+           500000000, 600000000, 700000000, 800000000, 900000000, 1000000000}),
+      maxBucketValue_(bucketValues_.back()),
+      minBucketValue_(bucketValues_.front()) {
+  // Reverse index: bucket upper-bound value -> bucket position, used by
+  // IndexForValue() for O(log n) lookup.
+  for (size_t i =0; i < bucketValues_.size(); ++i) {
+    valueIndexMap_[bucketValues_[i]] = i;
+  }
+}
+
+// Maps a sample value to its bucket index. Values at or above the largest
+// bound land in the last bucket; values below the smallest bound in bucket 0.
+const size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const {
+  if (value >= maxBucketValue_) {
+    return bucketValues_.size() - 1;
+  } else if ( value >= minBucketValue_ ) {
+    // First bucket whose upper bound is >= value.
+    std::map<uint64_t, uint64_t>::const_iterator lowerBound =
+      valueIndexMap_.lower_bound(value);
+    if (lowerBound != valueIndexMap_.end()) {
+      return lowerBound->second;
+    } else {
+      // Unreachable given the maxBucketValue_ check above; kept defensively.
+      return 0;
+    }
+  } else {
+    return 0;
+  }
+}
+
+namespace {
+  // Process-wide, immutable bucket mapper shared by every HistogramImpl.
+  const HistogramBucketMapper bucketMapper;
+}
+
+// Resets all counters so the histogram reads as freshly constructed.
+void HistogramImpl::Clear() {
+  min_ = bucketMapper.LastValue();  // sentinel: any Add() will lower it
+  max_ = 0;
+  num_ = 0;
+  sum_ = 0;
+  sum_squares_ = 0;
+  memset(buckets_, 0, sizeof buckets_);
+}
+
+// NOTE(review): emptiness is inferred from sum_squares_ == 0, so a histogram
+// that only ever recorded zero-valued samples also reports Empty() == true —
+// confirm this is intended (num_ == 0 would be the stricter check).
+bool HistogramImpl::Empty() { return sum_squares_ == 0; }
+
+// Records one sample: bumps its bucket and updates min/max/sum/sum-of-squares.
+void HistogramImpl::Add(uint64_t value) {
+  const size_t index = bucketMapper.IndexForValue(value);
+  buckets_[index] += 1;
+  if (min_ > value) min_ = value;
+  if (max_ < value) max_ = value;
+  num_++;
+  sum_ += value;
+  sum_squares_ += (value * value);
+}
+
+// Folds another histogram's samples into this one, aggregate by aggregate
+// and bucket by bucket. Both histograms use the same shared bucketMapper,
+// so bucket indices line up.
+void HistogramImpl::Merge(const HistogramImpl& other) {
+  if (other.min_ < min_) min_ = other.min_;
+  if (other.max_ > max_) max_ = other.max_;
+  num_ += other.num_;
+  sum_ += other.sum_;
+  sum_squares_ += other.sum_squares_;
+  for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) {
+    buckets_[b] += other.buckets_[b];
+  }
+}
+
+// The median is, by definition, the 50th percentile.
+double HistogramImpl::Median() const {
+  const double kMedianPercentile = 50.0;
+  return Percentile(kMedianPercentile);
+}
+
+// Approximates the p-th percentile (p in (0, 100]) by locating the bucket
+// where the cumulative count crosses p% of num_ and interpolating linearly
+// within that bucket's value range.
+double HistogramImpl::Percentile(double p) const {
+  double threshold = num_ * (p / 100.0);
+  double sum = 0;
+  for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) {
+    sum += buckets_[b];
+    if (sum >= threshold) {
+      // Scale linearly within this bucket
+      double left_point = (b == 0) ? 0 : bucketMapper.BucketLimit(b-1);
+      double right_point = bucketMapper.BucketLimit(b);
+      double left_sum = sum - buckets_[b];
+      double right_sum = sum;
+      double pos = 0;
+      double right_left_diff = right_sum - left_sum;
+      if (right_left_diff != 0) {
+       pos = (threshold - left_sum) / (right_sum - left_sum);
+      }
+      double r = left_point + (right_point - left_point) * pos;
+      // Clamp to the observed range so interpolation never extrapolates
+      // beyond actual samples.
+      if (r < min_) r = min_;
+      if (r > max_) r = max_;
+      return r;
+    }
+  }
+  return max_;
+}
+
+// Arithmetic mean of all recorded samples; 0 for an empty histogram.
+double HistogramImpl::Average() const {
+  return (num_ == 0.0) ? 0 : sum_ / num_;
+}
+
+// Population standard deviation, computed from the running sums as
+// sqrt((n*sum(x^2) - sum(x)^2) / n^2); 0 for an empty histogram.
+double HistogramImpl::StandardDeviation() const {
+  if (num_ == 0.0) {
+    return 0;
+  }
+  const double variance =
+      (sum_squares_ * num_ - sum_ * sum_) / (num_ * num_);
+  return sqrt(variance);
+}
+
+// Renders a human-readable summary: aggregate stats, key percentiles, and a
+// per-bucket table with counts, percentages, and an ASCII bar chart.
+std::string HistogramImpl::ToString() const {
+  std::string r;
+  char buf[200];
+  snprintf(buf, sizeof(buf),
+           "Count: %.0f  Average: %.4f  StdDev: %.2f\n",
+           num_, Average(), StandardDeviation());
+  r.append(buf);
+  snprintf(buf, sizeof(buf),
+           "Min: %.4f  Median: %.4f  Max: %.4f\n",
+           (num_ == 0.0 ? 0.0 : min_), Median(), max_);
+  r.append(buf);
+  snprintf(buf, sizeof(buf),
+           "Percentiles: "
+           "P50: %.2f P75: %.2f P99: %.2f P99.9: %.2f P99.99: %.2f\n",
+           Percentile(50), Percentile(75), Percentile(99), Percentile(99.9),
+           Percentile(99.99));
+  r.append(buf);
+  r.append("------------------------------------------------------\n");
+  // If num_ == 0 this is inf, but the loop below then skips every (empty)
+  // bucket, so it is never used.
+  const double mult = 100.0 / num_;
+  double sum = 0;
+  for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) {
+    if (buckets_[b] <= 0.0) continue;
+    sum += buckets_[b];
+    snprintf(buf, sizeof(buf),
+             "[ %7lu, %7lu ) %8lu %7.3f%% %7.3f%% ",
+             // left
+             (unsigned long)((b == 0) ? 0 : bucketMapper.BucketLimit(b-1)),
+             (unsigned long)bucketMapper.BucketLimit(b), // right
+             (unsigned long)buckets_[b],                 // count
+             (mult * buckets_[b]),        // percentage
+             (mult * sum));               // cumulative percentage
+    r.append(buf);
+
+    // Add hash marks based on percentage; 20 marks for 100%.
+    int marks = static_cast<int>(20*(buckets_[b] / num_) + 0.5);
+    r.append(marks, '#');
+    r.push_back('\n');
+  }
+  return r;
+}
+
+// Fills the caller-supplied HistogramData struct with the derived statistics
+// (median, P95, P99, mean, standard deviation). data must be non-null.
+void HistogramImpl::Data(HistogramData * const data) const {
+  assert(data);
+  data->median = Median();
+  data->percentile95 = Percentile(95);
+  data->percentile99 = Percentile(99);
+  data->average = Average();
+  data->standard_deviation = StandardDeviation();
+}
+
+} // namespace rocksdb
diff --git a/util/histogram.h b/util/histogram.h
new file mode 100644 (file)
index 0000000..d95588d
--- /dev/null
@@ -0,0 +1,79 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/statistics.h"
+
+#include <cassert>
+#include <string>
+#include <vector>
+#include <map>
+
+namespace rocksdb {
+
+// Shared, immutable mapping between sample values and histogram bucket
+// indices. Constructed once (see the file-local instance in histogram.cc)
+// and then only read.
+class HistogramBucketMapper {
+ public:
+
+  HistogramBucketMapper();
+
+  // converts a value to the bucket index.
+  // NOTE(review): the top-level const on the return type is meaningless for
+  // a by-value size_t and could be dropped (together with the definition in
+  // histogram.cc).
+  const size_t IndexForValue(const uint64_t value) const;
+  // number of buckets required.
+
+  const size_t BucketCount() const {
+    return bucketValues_.size();
+  }
+
+  // Upper bound of the last bucket (largest representable value).
+  uint64_t LastValue() const {
+    return maxBucketValue_;
+  }
+
+  // Upper bound of the first bucket (smallest bucket boundary).
+  uint64_t FirstValue() const {
+    return minBucketValue_;
+  }
+
+  // Upper bound of bucket `bucketNumber`; asserts the index is in range.
+  uint64_t BucketLimit(const uint64_t bucketNumber) const {
+    assert(bucketNumber < BucketCount());
+    return bucketValues_[bucketNumber];
+  }
+
+ private:
+  const std::vector<uint64_t> bucketValues_;  // sorted bucket upper bounds
+  const uint64_t maxBucketValue_;             // == bucketValues_.back()
+  const uint64_t minBucketValue_;             // == bucketValues_.front()
+  std::map<uint64_t, uint64_t> valueIndexMap_;  // bound -> bucket index
+};
+
+class HistogramImpl {
+ public:
+  virtual void Clear();
+  virtual bool Empty();
+  virtual void Add(uint64_t value);
+  void Merge(const HistogramImpl& other);
+
+  virtual std::string ToString() const;
+
+  virtual double Median() const;
+  virtual double Percentile(double p) const;
+  virtual double Average() const;
+  virtual double StandardDeviation() const;
+  virtual void Data(HistogramData * const data) const;
+
+ private:
+  // To be able to use HistogramImpl as thread local variable, its constructor
+  // has to be static. That's why we're using manually values from BucketMapper
+  double min_ = 1000000000;  // this is BucketMapper:LastValue()
+  double max_ = 0;
+  double num_ = 0;
+  double sum_ = 0;
+  double sum_squares_ = 0;
+  uint64_t buckets_[138] = {0};  // this is BucketMapper::BucketCount()
+};
+
+}  // namespace rocksdb
diff --git a/util/histogram_test.cc b/util/histogram_test.cc
new file mode 100644 (file)
index 0000000..065f957
--- /dev/null
@@ -0,0 +1,62 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "util/histogram.h"
+
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+// Empty tag class required by the TEST() macro in util/testharness.h.
+class HistogramTest { };
+
+// Sanity-checks median/percentile/average after inserting 1..100.
+TEST(HistogramTest, BasicOperation) {
+
+  HistogramImpl histogram;
+  for (uint64_t i = 1; i <= 100; i++) {
+    histogram.Add(i);
+  }
+
+  {
+    double median = histogram.Median();
+    // ASSERT_LE(median, 50);
+    ASSERT_GT(median, 0);
+  }
+
+  {
+    // Percentiles are clamped to the observed [min, max] range and must be
+    // monotonically non-decreasing in p.
+    double percentile100 = histogram.Percentile(100.0);
+    ASSERT_LE(percentile100, 100.0);
+    ASSERT_GT(percentile100, 0.0);
+    double percentile99 = histogram.Percentile(99.0);
+    double percentile85 = histogram.Percentile(85.0);
+    ASSERT_LE(percentile99, 99.0);
+    ASSERT_TRUE(percentile99 >= percentile85);
+  }
+
+  ASSERT_EQ(histogram.Average(), 50.5); // avg is accurately calculated.
+}
+
+// A histogram with no samples must report zero for all statistics.
+TEST(HistogramTest, EmptyHistogram) {
+  HistogramImpl histogram;
+  ASSERT_EQ(histogram.Median(), 0.0);
+  ASSERT_EQ(histogram.Percentile(85.0), 0.0);
+  ASSERT_EQ(histogram.Average(), 0.0);
+}
+
+// Clear() must fully reset a populated histogram back to the empty state.
+TEST(HistogramTest, ClearHistogram) {
+  HistogramImpl histogram;
+  for (uint64_t i = 1; i <= 100; i++) {
+    histogram.Add(i);
+  }
+  histogram.Clear();
+  ASSERT_EQ(histogram.Median(), 0);
+  ASSERT_EQ(histogram.Percentile(85.0), 0);
+  ASSERT_EQ(histogram.Average(), 0);
+}
+
+}  // namespace rocksdb
+
+// Test entry point: runs every TEST() registered above via the harness.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc
new file mode 100644 (file)
index 0000000..597179f
--- /dev/null
@@ -0,0 +1,1839 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef ROCKSDB_LITE
+#include "util/ldb_cmd.h"
+
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/log_reader.h"
+#include "db/filename.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/cache.h"
+#include "util/coding.h"
+#include "utilities/ttl/db_ttl_impl.h"
+
+#include <ctime>
+#include <dirent.h>
+#include <sstream>
+#include <string>
+#include <stdexcept>
+
+namespace rocksdb {
+
+using namespace std;
+
+// Canonical names of the command-line options/flags shared by all ldb
+// subcommands (matched against "--<name>[=<value>]" tokens).
+const string LDBCommand::ARG_DB = "db";
+const string LDBCommand::ARG_HEX = "hex";
+const string LDBCommand::ARG_KEY_HEX = "key_hex";
+const string LDBCommand::ARG_VALUE_HEX = "value_hex";
+const string LDBCommand::ARG_TTL = "ttl";
+const string LDBCommand::ARG_TTL_START = "start_time";
+const string LDBCommand::ARG_TTL_END = "end_time";
+const string LDBCommand::ARG_TIMESTAMP = "timestamp";
+const string LDBCommand::ARG_FROM = "from";
+const string LDBCommand::ARG_TO = "to";
+const string LDBCommand::ARG_MAX_KEYS = "max_keys";
+const string LDBCommand::ARG_BLOOM_BITS = "bloom_bits";
+const string LDBCommand::ARG_COMPRESSION_TYPE = "compression_type";
+const string LDBCommand::ARG_BLOCK_SIZE = "block_size";
+const string LDBCommand::ARG_AUTO_COMPACTION = "auto_compaction";
+const string LDBCommand::ARG_WRITE_BUFFER_SIZE = "write_buffer_size";
+const string LDBCommand::ARG_FILE_SIZE = "file_size";
+const string LDBCommand::ARG_CREATE_IF_MISSING = "create_if_missing";
+
+// Separator between key and value in the textual dump/load format.
+const char* LDBCommand::DELIM = " ==> ";
+
+// Convenience overload for raw main()-style arguments: drops argv[0]
+// (the program name) and delegates to the vector-based overload.
+LDBCommand* LDBCommand::InitFromCmdLineArgs(
+  int argc,
+  char** argv,
+  const Options& options
+) {
+  vector<string> args(argv + 1, argv + argc);
+  return InitFromCmdLineArgs(args, options);
+}
+
+/**
+ * Parse the command-line arguments and create the appropriate LDBCommand
+ * instance.
+ * The command line arguments must be in the following format:
+ * ./ldb --db=PATH_TO_DB [--commonOpt1=commonOpt1Val] ..
+ *        COMMAND <PARAM1> <PARAM2> ... [-cmdSpecificOpt1=cmdSpecificOpt1Val] ..
+ * This is similar to the command line format used by HBaseClientTool.
+ * Command name is not included in args.
+ * Returns nullptr if the command-line cannot be parsed.
+ */
+LDBCommand* LDBCommand::InitFromCmdLineArgs(
+  const vector<string>& args,
+  const Options& options
+) {
+  // --x=y command line arguments are added as x->y map entries.
+  map<string, string> option_map;
+
+  // Command-line arguments of the form --hex end up in this array as hex
+  vector<string> flags;
+
+  // Everything other than option_map and flags. Represents commands
+  // and their parameters.  For eg: put key1 value1 go into this vector.
+  vector<string> cmdTokens;
+
+  const string OPTION_PREFIX = "--";
+
+  // Classify each token: "--k=v" -> option, "--k" -> flag, else command token.
+  for (const auto& arg : args) {
+    if (arg[0] == '-' && arg[1] == '-'){
+      vector<string> splits = stringSplit(arg, '=');
+      if (splits.size() == 2) {
+        string optionKey = splits[0].substr(OPTION_PREFIX.size());
+        option_map[optionKey] = splits[1];
+      } else {
+        string optionKey = splits[0].substr(OPTION_PREFIX.size());
+        flags.push_back(optionKey);
+      }
+    } else {
+      cmdTokens.push_back(arg);
+    }
+  }
+
+  if (cmdTokens.size() < 1) {
+    fprintf(stderr, "Command not specified!");
+    return nullptr;
+  }
+
+  // First non-option token is the command name; the rest are its parameters.
+  string cmd = cmdTokens[0];
+  vector<string> cmdParams(cmdTokens.begin()+1, cmdTokens.end());
+  LDBCommand* command = LDBCommand::SelectCommand(
+    cmd,
+    cmdParams,
+    option_map,
+    flags
+  );
+
+  if (command) {
+    command->SetOptions(options);
+  }
+  return command;
+}
+
+// Factory: maps a command name to a newly allocated command object, or
+// returns nullptr if the name is unknown. The caller owns the result.
+LDBCommand* LDBCommand::SelectCommand(
+    const std::string& cmd,
+    const vector<string>& cmdParams,
+    const map<string, string>& option_map,
+    const vector<string>& flags
+  ) {
+
+  if (cmd == GetCommand::Name()) {
+    return new GetCommand(cmdParams, option_map, flags);
+  } else if (cmd == PutCommand::Name()) {
+    return new PutCommand(cmdParams, option_map, flags);
+  } else if (cmd == BatchPutCommand::Name()) {
+    return new BatchPutCommand(cmdParams, option_map, flags);
+  } else if (cmd == ScanCommand::Name()) {
+    return new ScanCommand(cmdParams, option_map, flags);
+  } else if (cmd == DeleteCommand::Name()) {
+    return new DeleteCommand(cmdParams, option_map, flags);
+  } else if (cmd == ApproxSizeCommand::Name()) {
+    return new ApproxSizeCommand(cmdParams, option_map, flags);
+  } else if (cmd == DBQuerierCommand::Name()) {
+    return new DBQuerierCommand(cmdParams, option_map, flags);
+  } else if (cmd == CompactorCommand::Name()) {
+    return new CompactorCommand(cmdParams, option_map, flags);
+  } else if (cmd == WALDumperCommand::Name()) {
+    return new WALDumperCommand(cmdParams, option_map, flags);
+  } else if (cmd == ReduceDBLevelsCommand::Name()) {
+    return new ReduceDBLevelsCommand(cmdParams, option_map, flags);
+  } else if (cmd == ChangeCompactionStyleCommand::Name()) {
+    return new ChangeCompactionStyleCommand(cmdParams, option_map, flags);
+  } else if (cmd == DBDumperCommand::Name()) {
+    return new DBDumperCommand(cmdParams, option_map, flags);
+  } else if (cmd == DBLoaderCommand::Name()) {
+    return new DBLoaderCommand(cmdParams, option_map, flags);
+  } else if (cmd == ManifestDumpCommand::Name()) {
+    return new ManifestDumpCommand(cmdParams, option_map, flags);
+  } else if (cmd == ListColumnFamiliesCommand::Name()) {
+    return new ListColumnFamiliesCommand(cmdParams, option_map, flags);
+  } else if (cmd == InternalDumpCommand::Name()) {
+    return new InternalDumpCommand(cmdParams, option_map, flags);
+  } else if (cmd == CheckConsistencyCommand::Name()) {
+    return new CheckConsistencyCommand(cmdParams, option_map, flags);
+  }
+  return nullptr;
+}
+
+
+/**
+ * Parses the specific integer option and fills in the value.
+ * Returns true if the option is found.
+ * Returns false if the option is not found or if there is an error parsing the
+ * value.  If there is an error, the specified exec_state is also
+ * updated.
+ *
+ * NOTE(review): the "options" parameter is never used — the lookup is done
+ * against the member option_map_. Callers in this file pass option_map_ (or
+ * the map it was constructed from), so behavior matches the intent, but the
+ * unused parameter is misleading; confirm and either use it or drop it.
+ */
+bool LDBCommand::ParseIntOption(const map<string, string>& options,
+                                const string& option, int& value,
+                                LDBCommandExecuteResult& exec_state) {
+
+  map<string, string>::const_iterator itr = option_map_.find(option);
+  if (itr != option_map_.end()) {
+    try {
+      value = stoi(itr->second);
+      return true;
+    } catch(const invalid_argument&) {
+      exec_state = LDBCommandExecuteResult::FAILED(option +
+                      " has an invalid value.");
+    } catch(const out_of_range&) {
+      exec_state = LDBCommandExecuteResult::FAILED(option +
+                      " has a value out-of-range.");
+    }
+  }
+  return false;
+}
+
+/**
+ * Parses the specified option and fills in the value.
+ * Returns true if the option is found.
+ * Returns false otherwise.
+ *
+ * NOTE(review): like ParseIntOption, the "options" parameter is unused and
+ * the lookup goes against the member option_map_ — confirm this is intended.
+ */
+bool LDBCommand::ParseStringOption(const map<string, string>& options,
+                                   const string& option, string* value) {
+  auto itr = option_map_.find(option);
+  if (itr != option_map_.end()) {
+    *value = itr->second;
+    return true;
+  }
+  return false;
+}
+
+// Builds the Options used to open the DB, starting from options_ and
+// applying any tuning arguments (--bloom_bits, --block_size, etc.) the user
+// supplied. Invalid values record a FAILED exec_state_ instead of throwing.
+Options LDBCommand::PrepareOptionsForOpenDB() {
+
+  Options opt = options_;
+  opt.create_if_missing = false;
+
+  map<string, string>::const_iterator itr;
+
+  int bits;
+  if (ParseIntOption(option_map_, ARG_BLOOM_BITS, bits, exec_state_)) {
+    if (bits > 0) {
+      // NOTE(review): NewBloomFilterPolicy returns a raw pointer that is
+      // never freed here — presumably acceptable for a short-lived CLI
+      // process, but verify who owns opt.filter_policy.
+      opt.filter_policy = NewBloomFilterPolicy(bits);
+    } else {
+      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOOM_BITS +
+                      " must be > 0.");
+    }
+  }
+
+  int block_size;
+  if (ParseIntOption(option_map_, ARG_BLOCK_SIZE, block_size, exec_state_)) {
+    if (block_size > 0) {
+      opt.block_size = block_size;
+    } else {
+      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOCK_SIZE +
+                      " must be > 0.");
+    }
+  }
+
+  // --auto_compaction=true/false maps to the inverse flag on Options.
+  itr = option_map_.find(ARG_AUTO_COMPACTION);
+  if (itr != option_map_.end()) {
+    opt.disable_auto_compactions = ! StringToBool(itr->second);
+  }
+
+  itr = option_map_.find(ARG_COMPRESSION_TYPE);
+  if (itr != option_map_.end()) {
+    string comp = itr->second;
+    if (comp == "no") {
+      opt.compression = kNoCompression;
+    } else if (comp == "snappy") {
+      opt.compression = kSnappyCompression;
+    } else if (comp == "zlib") {
+      opt.compression = kZlibCompression;
+    } else if (comp == "bzip2") {
+      opt.compression = kBZip2Compression;
+    } else if (comp == "lz4") {
+      opt.compression = kLZ4Compression;
+    } else if (comp == "lz4hc") {
+      opt.compression = kLZ4HCCompression;
+    } else {
+      // Unknown compression.
+      exec_state_ = LDBCommandExecuteResult::FAILED(
+                      "Unknown compression level: " + comp);
+    }
+  }
+
+  int write_buffer_size;
+  if (ParseIntOption(option_map_, ARG_WRITE_BUFFER_SIZE, write_buffer_size,
+        exec_state_)) {
+    if (write_buffer_size > 0) {
+      opt.write_buffer_size = write_buffer_size;
+    } else {
+      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_WRITE_BUFFER_SIZE +
+                      " must be > 0.");
+    }
+  }
+
+  int file_size;
+  if (ParseIntOption(option_map_, ARG_FILE_SIZE, file_size, exec_state_)) {
+    if (file_size > 0) {
+      opt.target_file_size_base = file_size;
+    } else {
+      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_FILE_SIZE +
+                      " must be > 0.");
+    }
+  }
+
+  return opt;
+}
+
+bool LDBCommand::ParseKeyValue(const string& line, string* key, string* value,
+                              bool is_key_hex, bool is_value_hex) {
+  size_t pos = line.find(DELIM);
+  if (pos != string::npos) {
+    *key = line.substr(0, pos);
+    *value = line.substr(pos + strlen(DELIM));
+    if (is_key_hex) {
+      *key = HexToString(*key);
+    }
+    if (is_value_hex) {
+      *value = HexToString(*value);
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+/**
+ * Make sure that ONLY the command-line options and flags expected by this
+ * command are specified on the command-line.  Extraneous options are usually
+ * the result of user error.
+ * Returns true if all checks pass.  Else returns false, and prints an
+ * appropriate error msg to stderr.
+ */
+bool LDBCommand::ValidateCmdLineOptions() {
+
+  // Every --x=y option must be in this command's whitelist.
+  for (map<string, string>::const_iterator itr = option_map_.begin();
+        itr != option_map_.end(); itr++) {
+    if (find(valid_cmd_line_options_.begin(),
+          valid_cmd_line_options_.end(), itr->first) ==
+          valid_cmd_line_options_.end()) {
+      fprintf(stderr, "Invalid command-line option %s\n", itr->first.c_str());
+      return false;
+    }
+  }
+
+  // Same whitelist check for bare --x flags.
+  for (vector<string>::const_iterator itr = flags_.begin();
+        itr != flags_.end(); itr++) {
+    if (find(valid_cmd_line_options_.begin(),
+          valid_cmd_line_options_.end(), *itr) ==
+          valid_cmd_line_options_.end()) {
+      fprintf(stderr, "Invalid command-line flag %s\n", itr->c_str());
+      return false;
+    }
+  }
+
+  // --db is mandatory unless the command opens no database.
+  if (!NoDBOpen() && option_map_.find(ARG_DB) == option_map_.end()) {
+    fprintf(stderr, "%s must be specified\n", ARG_DB.c_str());
+    return false;
+  }
+
+  return true;
+}
+
+// Parses the optional --from/--to range bounds; absent bounds are tracked
+// via null_from_/null_to_ so DoCommand() can pass nullptr to CompactRange().
+CompactorCommand::CompactorCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+    LDBCommand(options, flags, false,
+               BuildCmdLineOptions({ARG_FROM, ARG_TO, ARG_HEX, ARG_KEY_HEX,
+                                    ARG_VALUE_HEX, ARG_TTL})),
+    null_from_(true), null_to_(true) {
+
+  map<string, string>::const_iterator itr = options.find(ARG_FROM);
+  if (itr != options.end()) {
+    null_from_ = false;
+    from_ = itr->second;
+  }
+
+  itr = options.find(ARG_TO);
+  if (itr != options.end()) {
+    null_to_ = false;
+    to_ = itr->second;
+  }
+
+  // With --key_hex (or --hex), the bounds arrive hex-encoded; decode now.
+  if (is_key_hex_) {
+    if (!null_from_) {
+      from_ = HexToString(from_);
+    }
+    if (!null_to_) {
+      to_ = HexToString(to_);
+    }
+  }
+}
+
+// Appends this command's usage line ("  compact <range args>\n") to ret.
+void CompactorCommand::Help(string& ret) {
+  ret.append("  ").append(CompactorCommand::Name());
+  ret.append(HelpRangeCmdArgs()).append("\n");
+}
+
+// Runs a manual compaction over the (possibly open-ended) [from_, to_]
+// range. A missing bound is passed to CompactRange() as nullptr.
+void CompactorCommand::DoCommand() {
+
+  // Stack-allocated Slices replace the previous new/delete pair: they only
+  // reference from_/to_, which outlive the CompactRange() call, and this
+  // cannot leak on an early exit.
+  Slice begin_storage(from_);
+  Slice end_storage(to_);
+  Slice* begin = null_from_ ? nullptr : &begin_storage;
+  Slice* end = null_to_ ? nullptr : &end_storage;
+
+  db_->CompactRange(begin, end);
+  exec_state_ = LDBCommandExecuteResult::SUCCEED("");
+}
+
+// Option names specific to the "load" command.
+const string DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal";
+const string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load";
+const string DBLoaderCommand::ARG_COMPACT = "compact";
+
+// Reads the loader-specific boolean flags. (Members are default-initialized
+// in the init list and then immediately re-assigned from the flag list.)
+DBLoaderCommand::DBLoaderCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+    LDBCommand(options, flags, false,
+               BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX,
+                                    ARG_FROM, ARG_TO, ARG_CREATE_IF_MISSING,
+                                    ARG_DISABLE_WAL, ARG_BULK_LOAD,
+                                    ARG_COMPACT})),
+    create_if_missing_(false), disable_wal_(false), bulk_load_(false),
+    compact_(false) {
+
+  create_if_missing_ = IsFlagPresent(flags, ARG_CREATE_IF_MISSING);
+  disable_wal_ = IsFlagPresent(flags, ARG_DISABLE_WAL);
+  bulk_load_ = IsFlagPresent(flags, ARG_BULK_LOAD);
+  compact_ = IsFlagPresent(flags, ARG_COMPACT);
+}
+
+// Appends this command's usage line, listing all optional flags, to ret.
+void DBLoaderCommand::Help(string& ret) {
+  ret.append("  ").append(DBLoaderCommand::Name());
+  for (const string& flag : {ARG_CREATE_IF_MISSING, ARG_DISABLE_WAL,
+                             ARG_BULK_LOAD, ARG_COMPACT}) {
+    ret.append(" [--" + flag + "]");
+  }
+  ret.append("\n");
+}
+
+// Starts from the generic ldb options and layers the loader-specific knobs
+// (--create_if_missing, bulk-load tuning) on top.
+Options DBLoaderCommand::PrepareOptionsForOpenDB() {
+  Options opt = LDBCommand::PrepareOptionsForOpenDB();
+  opt.create_if_missing = create_if_missing_;
+  if (bulk_load_) {
+    opt.PrepareForBulkLoad();
+  }
+  return opt;
+}
+
+// Reads "<key> ==> <value>" lines from stdin and writes them into the DB,
+// then optionally triggers a full compaction. Unparseable lines (other than
+// known informational ones) are counted and reported at the end.
+void DBLoaderCommand::DoCommand() {
+  if (!db_) {
+    return;
+  }
+
+  WriteOptions write_options;
+  if (disable_wal_) {
+    write_options.disableWAL = true;
+  }
+
+  int bad_lines = 0;
+  string line;
+  while (getline(cin, line, '\n')) {
+    string key;
+    string value;
+    if (ParseKeyValue(line, &key, &value, is_key_hex_, is_value_hex_)) {
+      // NOTE(review): the Put() Status is discarded, so write failures are
+      // silently ignored — confirm whether load should abort on error.
+      db_->Put(write_options, Slice(key), Slice(value));
+    } else if (0 == line.find("Keys in range:")) {
+      // ignore this line
+    } else if (0 == line.find("Created bg thread 0x")) {
+      // ignore this line
+    } else {
+      bad_lines ++;
+    }
+  }
+
+  if (bad_lines > 0) {
+    cout << "Warning: " << bad_lines << " bad lines ignored." << endl;
+  }
+  if (compact_) {
+    db_->CompactRange(nullptr, nullptr);
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+// Option names specific to the "manifest_dump" command.
+const string ManifestDumpCommand::ARG_VERBOSE = "verbose";
+const string ManifestDumpCommand::ARG_PATH    = "path";
+
+// Appends this command's usage line to ret.
+void ManifestDumpCommand::Help(string& ret) {
+  ret.append("  ")
+     .append(ManifestDumpCommand::Name())
+     .append(" [--" + ARG_VERBOSE + "]")
+     .append(" [--" + ARG_PATH + "=<path_to_manifest_file>]")
+     .append("\n");
+}
+
+// Reads --verbose and the optional explicit --path to the manifest file;
+// when --path is absent DoCommand() searches the db directory instead.
+ManifestDumpCommand::ManifestDumpCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+    LDBCommand(options, flags, false,
+               BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX})),
+    verbose_(false),
+    path_("")
+{
+  verbose_ = IsFlagPresent(flags, ARG_VERBOSE);
+
+  map<string, string>::const_iterator itr = options.find(ARG_PATH);
+  if (itr != options.end()) {
+    path_ = itr->second;
+    if (path_.empty()) {
+      exec_state_ = LDBCommandExecuteResult::FAILED("--path: missing pathname");
+    }
+  }
+}
+
+void ManifestDumpCommand::DoCommand() {
+
+  std::string manifestfile;
+
+  if (!path_.empty()) {
+    manifestfile = path_;
+  } else {
+    bool found = false;
+    // We need to find the manifest file by searching the directory
+    // containing the db for files of the form MANIFEST_[0-9]+
+    DIR* d = opendir(db_path_.c_str());
+    if (d == nullptr) {
+      exec_state_ = LDBCommandExecuteResult::FAILED(
+        db_path_ + " is not a directory");
+      return;
+    }
+    struct dirent* entry;
+    while ((entry = readdir(d)) != nullptr) {
+      unsigned int match;
+      unsigned long long num;
+      if (sscanf(entry->d_name,
+                 "MANIFEST-%ln%ln",
+                 (unsigned long*)&num,
+                 (unsigned long*)&match)
+          && match == strlen(entry->d_name)) {
+        if (!found) {
+          manifestfile = db_path_ + "/" + std::string(entry->d_name);
+          found = true;
+        } else {
+          exec_state_ = LDBCommandExecuteResult::FAILED(
+            "Multiple MANIFEST files found; use --path to select one");
+          return;
+        }
+      }
+    }
+    closedir(d);
+  }
+
+  if (verbose_) {
+    printf("Processing Manifest file %s\n", manifestfile.c_str());
+  }
+
+  Options options;
+  EnvOptions sopt;
+  std::string file(manifestfile);
+  std::string dbname("dummy");
+  std::shared_ptr<Cache> tc(NewLRUCache(
+      options.max_open_files - 10, options.table_cache_numshardbits,
+      options.table_cache_remove_scan_count_limit));
+  VersionSet* versions = new VersionSet(dbname, &options, sopt, tc.get());
+  Status s = versions->DumpManifest(options, file, verbose_, is_key_hex_);
+  if (!s.ok()) {
+    printf("Error in processing file %s %s\n", manifestfile.c_str(),
+           s.ToString().c_str());
+  }
+  if (verbose_) {
+    printf("Processing Manifest file %s done\n", manifestfile.c_str());
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+// Appends this command's usage line to ret.
+void ListColumnFamiliesCommand::Help(string& ret) {
+  ret.append("  ")
+     .append(ListColumnFamiliesCommand::Name())
+     .append(" full_path_to_db_directory ")
+     .append("\n");
+}
+
+// Takes exactly one positional parameter: the full path of the db directory
+// whose column families should be listed. Does not open the db itself.
+ListColumnFamiliesCommand::ListColumnFamiliesCommand(
+    const vector<string>& params, const map<string, string>& options,
+    const vector<string>& flags)
+    : LDBCommand(options, flags, false, {}) {
+
+  if (params.size() != 1) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+        "dbname must be specified for the list_column_families command");
+  } else {
+    dbname_ = params[0];
+  }
+}
+
+void ListColumnFamiliesCommand::DoCommand() {
+  vector<string> column_families;
+  Status s = DB::ListColumnFamilies(DBOptions(), dbname_, &column_families);
+  if (!s.ok()) {
+    printf("Error in processing db %s %s\n", dbname_.c_str(),
+           s.ToString().c_str());
+  } else {
+    printf("Column families in %s: \n{", dbname_.c_str());
+    bool first = true;
+    for (auto cf : column_families) {
+      if (!first) {
+        printf(", ");
+      }
+      first = false;
+      printf("%s", cf.c_str());
+    }
+    printf("}\n");
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+namespace {
+
+// Formats a unix timestamp with the locale's "%c" representation.
+// NOTE(review): localtime() returns a pointer to shared static storage and
++// is not thread-safe; fine for this single-threaded CLI, but confirm before
+// reusing elsewhere.
+string ReadableTime(int unixtime) {
+  char time_buffer [80];
+  time_t rawtime = unixtime;
+  struct tm * timeinfo = localtime(&rawtime);
+  strftime(time_buffer, 80, "%c", timeinfo);
+  return string(time_buffer);
+}
+
+// This function only called when it's the sane case of >1 buckets in time-range
+// Also called only when timekv falls between ttl_start and ttl_end provided
+void IncBucketCounts(vector<uint64_t>& bucket_counts, int ttl_start,
+      int time_range, int bucket_size, int timekv, int num_buckets) {
+  assert(time_range > 0 && timekv >= ttl_start && bucket_size > 0 &&
+    timekv < (ttl_start + time_range) && num_buckets > 1);
+  int bucket = (timekv - ttl_start) / bucket_size;
+  bucket_counts[bucket]++;
+}
+
+// Prints one "Keys in range <t1> to <t2> : <count>" line per time bucket;
+// the last bucket is closed by ttl_end rather than a full bucket_size step.
+void PrintBucketCounts(const vector<uint64_t>& bucket_counts, int ttl_start,
+      int ttl_end, int bucket_size, int num_buckets) {
+  int time_point = ttl_start;
+  for(int i = 0; i < num_buckets - 1; i++, time_point += bucket_size) {
+    fprintf(stdout, "Keys in range %s to %s : %lu\n",
+            ReadableTime(time_point).c_str(),
+            ReadableTime(time_point + bucket_size).c_str(),
+            (unsigned long)bucket_counts[i]);
+  }
+  fprintf(stdout, "Keys in range %s to %s : %lu\n",
+          ReadableTime(time_point).c_str(),
+          ReadableTime(ttl_end).c_str(),
+          (unsigned long)bucket_counts[num_buckets - 1]);
+}
+
+}  // namespace
+
// Option names specific to this command (consumed by the ctor, listed in
// Help()).
const string InternalDumpCommand::ARG_COUNT_ONLY = "count_only";
const string InternalDumpCommand::ARG_COUNT_DELIM = "count_delim";
const string InternalDumpCommand::ARG_STATS = "stats";
const string InternalDumpCommand::ARG_INPUT_KEY_HEX = "input_key_hex";
+
// Parses internal_dump's options: an optional [from, to) range (possibly
// hex-encoded via --input_key_hex), --max_keys, the counting modes
// --count_only / --count_delim, and --stats.
InternalDumpCommand::InternalDumpCommand(const vector<string>& params,
                                         const map<string, string>& options,
                                         const vector<string>& flags) :
    LDBCommand(options, flags, true,
               BuildCmdLineOptions({ ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX,
                                     ARG_FROM, ARG_TO, ARG_MAX_KEYS,
                                     ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS,
                                     ARG_INPUT_KEY_HEX})),
    has_from_(false),
    has_to_(false),
    max_keys_(-1),
    delim_("."),
    count_only_(false),
    count_delim_(false),
    print_stats_(false),
    is_input_key_hex_(false) {

  has_from_ = ParseStringOption(options, ARG_FROM, &from_);
  has_to_ = ParseStringOption(options, ARG_TO, &to_);

  ParseIntOption(options, ARG_MAX_KEYS, max_keys_, exec_state_);
  // --count_delim can carry an explicit delimiter ("--count_delim=<char>")
  // or appear as a bare flag, in which case the default "." is used.
  map<string, string>::const_iterator itr = options.find(ARG_COUNT_DELIM);
  if (itr != options.end()) {
    delim_ = itr->second;
    count_delim_ = true;
   // fprintf(stdout,"delim = %c\n",delim_[0]);
  } else {
    count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM);
    // Redundant with the member initializer above; kept as-is.
    delim_=".";
  }

  print_stats_ = IsFlagPresent(flags, ARG_STATS);
  count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY);
  is_input_key_hex_ = IsFlagPresent(flags, ARG_INPUT_KEY_HEX);

  // With --input_key_hex the range endpoints arrive hex-encoded.
  if (is_input_key_hex_) {
    if (has_from_) {
      from_ = HexToString(from_);
    }
    if (has_to_) {
      to_ = HexToString(to_);
    }
  }
}
+
+void InternalDumpCommand::Help(string& ret) {
+  ret.append("  ");
+  ret.append(InternalDumpCommand::Name());
+  ret.append(HelpRangeCmdArgs());
+  ret.append(" [--" + ARG_INPUT_KEY_HEX + "]");
+  ret.append(" [--" + ARG_MAX_KEYS + "=<N>]");
+  ret.append(" [--" + ARG_COUNT_ONLY + "]");
+  ret.append(" [--" + ARG_COUNT_DELIM + "=<char>]");
+  ret.append(" [--" + ARG_STATS + "]");
+  ret.append("\n");
+}
+
// Dumps every internal entry (all versions, including deletions) in the db,
// optionally restricted to [from_, to_), printing stats, the raw entries,
// or per-prefix counts depending on the flags parsed in the ctor.
void InternalDumpCommand::DoCommand() {
  if (!db_) {
    return;
  }

  if (print_stats_) {
    string stats;
    if (db_->GetProperty("rocksdb.stats", &stats)) {
      fprintf(stdout, "%s\n", stats.c_str());
    }
  }

  // Cast as DBImpl to get internal iterator
  DBImpl* idb = dynamic_cast<DBImpl*>(db_);
  if (!idb) {
    exec_state_ = LDBCommandExecuteResult::FAILED("DB is not DBImpl");
    return;
  }
  // rtype1/rtype2 hold the current and previous key prefix (up to delim_);
  // c counts keys and s1/s2 accumulate sizes within the current prefix group.
  string rtype1,rtype2,row,val;
  rtype2 = "";
  uint64_t c=0;
  uint64_t s1=0,s2=0;
  // Setup internal key iterator
  auto iter = unique_ptr<Iterator>(idb->TEST_NewInternalIterator());
  Status st = iter->status();
  if (!st.ok()) {
    // NOTE(review): execution continues after recording the failure; the
    // loop below will simply see an invalid iterator.
    exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error:"
                                                  + st.ToString());
  }

  if (has_from_) {
    // Seek to the first internal entry >= from_ (max seqno sorts first
    // among entries with the same user key).
    InternalKey ikey(from_, kMaxSequenceNumber, kValueTypeForSeek);
    iter->Seek(ikey.Encode());
  } else {
    iter->SeekToFirst();
  }

  long long count = 0;
  for (; iter->Valid(); iter->Next()) {
    ParsedInternalKey ikey;
    if (!ParseInternalKey(iter->key(), &ikey)) {
      fprintf(stderr, "Internal Key [%s] parse error!\n",
              iter->key().ToString(true /* in hex*/).data());
      // TODO: add error counter
      continue;
    }

    // If end marker was specified, we stop before it
    if (has_to_ && options_.comparator->Compare(ikey.user_key, to_) >= 0) {
      break;
    }

    ++count;
    int k;
    if (count_delim_) {
      rtype1 = "";
      s1=0;
      row = iter->key().ToString();
      val = iter->value().ToString();
      // Size of key plus value, counting up to the first 0x01 or NUL byte
      // in each.
      for(k=0;row[k]!='\x01' && row[k]!='\0';k++)
        s1++;
      for(k=0;val[k]!='\x01' && val[k]!='\0';k++)
        s1++;
      // Extract the key prefix up to the delimiter character.
      for(int j=0;row[j]!=delim_[0] && row[j]!='\0' && row[j]!='\x01';j++)
        rtype1+=row[j];
      // Prefix changed: flush the previous group's count/size and restart.
      if(rtype2.compare("") && rtype2.compare(rtype1)!=0) {
        fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(),
            (long long)c,(long long)s2);
        c=1;
        s2=s1;
        rtype2 = rtype1;
      } else {
        c++;
        s2+=s1;
        rtype2=rtype1;
    }
  }

    if (!count_only_ && !count_delim_) {
      string key = ikey.DebugString(is_key_hex_);
      string value = iter->value().ToString(is_value_hex_);
      std::cout << key << " => " << value << "\n";
    }

    // Terminate if maximum number of keys have been dumped
    if (max_keys_ > 0 && count >= max_keys_) break;
  }
  // Flush the final prefix group, or print the plain total.
  if(count_delim_) {
    fprintf(stdout,"%s => count:%lld\tsize:%lld\n", rtype2.c_str(),
        (long long)c,(long long)s2);
  } else
  fprintf(stdout, "Internal keys in range: %lld\n", (long long) count);
}
+
+
// Option names specific to this command (consumed by the ctor, listed in
// Help()).
const string DBDumperCommand::ARG_COUNT_ONLY = "count_only";
const string DBDumperCommand::ARG_COUNT_DELIM = "count_delim";
const string DBDumperCommand::ARG_STATS = "stats";
const string DBDumperCommand::ARG_TTL_BUCKET = "bucket";
+
+DBDumperCommand::DBDumperCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+    LDBCommand(options, flags, true,
+               BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
+                                    ARG_VALUE_HEX, ARG_FROM, ARG_TO,
+                                    ARG_MAX_KEYS, ARG_COUNT_ONLY,
+                                    ARG_COUNT_DELIM, ARG_STATS, ARG_TTL_START,
+                                    ARG_TTL_END, ARG_TTL_BUCKET,
+                                    ARG_TIMESTAMP})),
+    null_from_(true),
+    null_to_(true),
+    max_keys_(-1),
+    count_only_(false),
+    count_delim_(false),
+    print_stats_(false) {
+
+  map<string, string>::const_iterator itr = options.find(ARG_FROM);
+  if (itr != options.end()) {
+    null_from_ = false;
+    from_ = itr->second;
+  }
+
+  itr = options.find(ARG_TO);
+  if (itr != options.end()) {
+    null_to_ = false;
+    to_ = itr->second;
+  }
+
+  itr = options.find(ARG_MAX_KEYS);
+  if (itr != options.end()) {
+    try {
+      max_keys_ = stoi(itr->second);
+    } catch(const invalid_argument&) {
+      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS +
+                        " has an invalid value");
+    } catch(const out_of_range&) {
+      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS +
+                        " has a value out-of-range");
+    }
+  }
+  itr = options.find(ARG_COUNT_DELIM);
+  if (itr != options.end()) {
+    delim_ = itr->second;
+    count_delim_ = true;
+  } else {
+    count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM);
+    delim_=".";
+  }
+
+  print_stats_ = IsFlagPresent(flags, ARG_STATS);
+  count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY);
+
+  if (is_key_hex_) {
+    if (!null_from_) {
+      from_ = HexToString(from_);
+    }
+    if (!null_to_) {
+      to_ = HexToString(to_);
+    }
+  }
+}
+
+void DBDumperCommand::Help(string& ret) {
+  ret.append("  ");
+  ret.append(DBDumperCommand::Name());
+  ret.append(HelpRangeCmdArgs());
+  ret.append(" [--" + ARG_TTL + "]");
+  ret.append(" [--" + ARG_MAX_KEYS + "=<N>]");
+  ret.append(" [--" + ARG_TIMESTAMP + "]");
+  ret.append(" [--" + ARG_COUNT_ONLY + "]");
+  ret.append(" [--" + ARG_COUNT_DELIM + "=<char>]");
+  ret.append(" [--" + ARG_STATS + "]");
+  ret.append(" [--" + ARG_TTL_BUCKET + "=<N>]");
+  ret.append(" [--" + ARG_TTL_START + "=<N>:- is inclusive]");
+  ret.append(" [--" + ARG_TTL_END + "=<N>:- is exclusive]");
+  ret.append("\n");
+}
+
// Dumps key-values in [from_, to_), honoring the TTL window
// (--ttl_start/--ttl_end), time-bucket histograms (--bucket), per-prefix
// counting (--count_delim), plain counting (--count_only) and --max_keys.
void DBDumperCommand::DoCommand() {
  if (!db_) {
    return;
  }
  // Parse command line args
  uint64_t count = 0;
  if (print_stats_) {
    string stats;
    if (db_->GetProperty("rocksdb.stats", &stats)) {
      fprintf(stdout, "%s\n", stats.c_str());
    }
  }

  // Setup key iterator
  Iterator* iter = db_->NewIterator(ReadOptions());
  Status st = iter->status();
  if (!st.ok()) {
    // NOTE(review): execution continues after recording the failure; the
    // scan loop below will simply see an invalid iterator.
    exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error."
        + st.ToString());
  }

  if (!null_from_) {
    iter->Seek(from_);
  } else {
    iter->SeekToFirst();
  }

  int max_keys = max_keys_;
  // TTL window defaults to [kMinTimestamp, kMaxTimestamp) when unspecified.
  int ttl_start;
  if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) {
    ttl_start = DBWithTTLImpl::kMinTimestamp;  // TTL introduction time
  }
  int ttl_end;
  if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) {
    ttl_end = DBWithTTLImpl::kMaxTimestamp;  // Max time allowed by TTL feature
  }
  if (ttl_end < ttl_start) {
    fprintf(stderr, "Error: End time can't be less than start time\n");
    delete iter;
    return;
  }
  int time_range = ttl_end - ttl_start;
  int bucket_size;
  if (!ParseIntOption(option_map_, ARG_TTL_BUCKET, bucket_size, exec_state_) ||
      bucket_size <= 0) {
    bucket_size = time_range; // Will have just 1 bucket by default
  }
  //cretaing variables for row count of each type
  // rtype1/rtype2 track the current/previous key prefix (up to delim_);
  // c counts keys and s1/s2 accumulate sizes within the current prefix group.
  string rtype1,rtype2,row,val;
  rtype2 = "";
  uint64_t c=0;
  uint64_t s1=0,s2=0;

  // At this point, bucket_size=0 => time_range=0
  uint64_t num_buckets = (bucket_size >= time_range) ? 1 :
    ((time_range + bucket_size - 1) / bucket_size);
  vector<uint64_t> bucket_counts(num_buckets, 0);
  if (is_db_ttl_ && !count_only_ && timestamp_ && !count_delim_) {
    fprintf(stdout, "Dumping key-values from %s to %s\n",
            ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str());
  }

  for (; iter->Valid(); iter->Next()) {
    int rawtime = 0;
    // If end marker was specified, we stop before it
    if (!null_to_ && (iter->key().ToString() >= to_))
      break;
    // Terminate if maximum number of keys have been dumped
    if (max_keys == 0)
      break;
    if (is_db_ttl_) {
      // A TTL db's iterator exposes the per-key write timestamp.
      TtlIterator* it_ttl = dynamic_cast<TtlIterator*>(iter);
      assert(it_ttl);
      rawtime = it_ttl->timestamp();
      if (rawtime < ttl_start || rawtime >= ttl_end) {
        continue;
      }
    }
    if (max_keys > 0) {
      --max_keys;
    }
    if (is_db_ttl_ && num_buckets > 1) {
      // NOTE(review): num_buckets (uint64_t) narrows to int here; fine for
      // realistic bucket counts but worth confirming for extreme windows.
      IncBucketCounts(bucket_counts, ttl_start, time_range, bucket_size,
                      rawtime, num_buckets);
    }
    ++count;
    if (count_delim_) {
      rtype1 = "";
      row = iter->key().ToString();
      val = iter->value().ToString();
      s1 = row.size()+val.size();
      // Extract the key prefix up to the delimiter character.
      for(int j=0;row[j]!=delim_[0] && row[j]!='\0';j++)
        rtype1+=row[j];
      // Prefix changed: flush the previous group's count/size and restart.
      if(rtype2.compare("") && rtype2.compare(rtype1)!=0) {
        fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(),
            (long long )c,(long long)s2);
        c=1;
        s2=s1;
        rtype2 = rtype1;
      } else {
          c++;
          s2+=s1;
          rtype2=rtype1;
      }

    }



    if (!count_only_ && !count_delim_) {
      if (is_db_ttl_ && timestamp_) {
        fprintf(stdout, "%s ", ReadableTime(rawtime).c_str());
      }
      string str = PrintKeyValue(iter->key().ToString(),
                                 iter->value().ToString(), is_key_hex_,
                                 is_value_hex_);
      fprintf(stdout, "%s\n", str.c_str());
    }
  }

  // Final report: histogram, last prefix group, or plain key count.
  if (num_buckets > 1 && is_db_ttl_) {
    PrintBucketCounts(bucket_counts, ttl_start, ttl_end, bucket_size,
                      num_buckets);
  } else if(count_delim_) {
    fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(),
        (long long )c,(long long)s2);
  } else {
    fprintf(stdout, "Keys in range: %lld\n", (long long) count);
  }
  // Clean up
  delete iter;
}
+
// Option names specific to this command (consumed by the ctor, listed in
// Help()).
const string ReduceDBLevelsCommand::ARG_NEW_LEVELS = "new_levels";
const string  ReduceDBLevelsCommand::ARG_PRINT_OLD_LEVELS = "print_old_levels";
+
+ReduceDBLevelsCommand::ReduceDBLevelsCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+    LDBCommand(options, flags, false,
+               BuildCmdLineOptions({ARG_NEW_LEVELS, ARG_PRINT_OLD_LEVELS})),
+    old_levels_(1 << 16),
+    new_levels_(-1),
+    print_old_levels_(false) {
+
+
+  ParseIntOption(option_map_, ARG_NEW_LEVELS, new_levels_, exec_state_);
+  print_old_levels_ = IsFlagPresent(flags, ARG_PRINT_OLD_LEVELS);
+
+  if(new_levels_ <= 0) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+           " Use --" + ARG_NEW_LEVELS + " to specify a new level number\n");
+  }
+}
+
+vector<string> ReduceDBLevelsCommand::PrepareArgs(const string& db_path,
+    int new_levels, bool print_old_level) {
+  vector<string> ret;
+  ret.push_back("reduce_levels");
+  ret.push_back("--" + ARG_DB + "=" + db_path);
+  ret.push_back("--" + ARG_NEW_LEVELS + "=" + to_string(new_levels));
+  if(print_old_level) {
+    ret.push_back("--" + ARG_PRINT_OLD_LEVELS);
+  }
+  return ret;
+}
+
+void ReduceDBLevelsCommand::Help(string& ret) {
+  ret.append("  ");
+  ret.append(ReduceDBLevelsCommand::Name());
+  ret.append(" --" + ARG_NEW_LEVELS + "=<New number of levels>");
+  ret.append(" [--" + ARG_PRINT_OLD_LEVELS + "]");
+  ret.append("\n");
+}
+
+Options ReduceDBLevelsCommand::PrepareOptionsForOpenDB() {
+  Options opt = LDBCommand::PrepareOptionsForOpenDB();
+  opt.num_levels = old_levels_;
+  opt.max_bytes_for_level_multiplier_additional.resize(opt.num_levels, 1);
+  // Disable size compaction
+  opt.max_bytes_for_level_base = 1ULL << 50;
+  opt.max_bytes_for_level_multiplier = 1;
+  opt.max_mem_compaction_level = 0;
+  return opt;
+}
+
// Recovers the manifest (read-only) to find the highest level that currently
// holds any file in the default column family; *levels is set to that
// level + 1. The db itself is never modified.
Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt,
    int* levels) {
  EnvOptions soptions;
  std::shared_ptr<Cache> tc(
      NewLRUCache(opt.max_open_files - 10, opt.table_cache_numshardbits,
                  opt.table_cache_remove_scan_count_limit));
  const InternalKeyComparator cmp(opt.comparator);
  VersionSet versions(db_path_, &opt, soptions, tc.get());
  std::vector<ColumnFamilyDescriptor> dummy;
  ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName,
                                          ColumnFamilyOptions(opt));
  dummy.push_back(dummy_descriptor);
  // We rely the VersionSet::Recover to tell us the internal data structures
  // in the db. And the Recover() should never do any change
  // (like LogAndApply) to the manifest file.
  Status st = versions.Recover(dummy);
  if (!st.ok()) {
    return st;
  }
  // max ends up as the highest level with at least one file, or -1 if the
  // db holds no files at all.
  int max = -1;
  auto default_cfd = versions.GetColumnFamilySet()->GetDefault();
  for (int i = 0; i < default_cfd->NumberLevels(); i++) {
    if (default_cfd->current()->NumLevelFiles(i)) {
      max = i;
    }
  }

  *levels = max + 1;
  return st;
}
+
// Shrinks the db to new_levels_ levels: validates the request, measures the
// old level count, compacts everything to the bottom, then rewrites the
// manifest via VersionSet::ReduceNumberOfLevels.
void ReduceDBLevelsCommand::DoCommand() {
  if (new_levels_ <= 1) {
    exec_state_ = LDBCommandExecuteResult::FAILED(
        "Invalid number of levels.\n");
    return;
  }

  Status st;
  Options opt = PrepareOptionsForOpenDB();
  int old_level_num = -1;
  st = GetOldNumOfLevels(opt, &old_level_num);
  if (!st.ok()) {
    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
    return;
  }

  if (print_old_levels_) {
    fprintf(stdout, "The old number of levels in use is %d\n", old_level_num);
  }

  // Already at (or below) the requested level count: nothing to do.
  if (old_level_num <= new_levels_) {
    return;
  }

  old_levels_ = old_level_num;

  OpenDB();
  if (!db_) {
    return;
  }
  // Compact the whole DB to put all files to the highest level.
  fprintf(stdout, "Compacting the db...\n");
  db_->CompactRange(nullptr, nullptr);
  CloseDB();

  // With all data at the bottom, the manifest can be rewritten with fewer
  // levels.
  EnvOptions soptions;
  st = VersionSet::ReduceNumberOfLevels(db_path_, &opt, soptions, new_levels_);
  if (!st.ok()) {
    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
    return;
  }
}
+
// Option names specific to this command (consumed by the ctor, listed in
// Help()).
const string ChangeCompactionStyleCommand::ARG_OLD_COMPACTION_STYLE =
  "old_compaction_style";
const string ChangeCompactionStyleCommand::ARG_NEW_COMPACTION_STYLE =
  "new_compaction_style";
+
// Validates --old_compaction_style/--new_compaction_style: both must be a
// known style (level or universal), the two must differ, and
// universal -> level is rejected as a no-op.
ChangeCompactionStyleCommand::ChangeCompactionStyleCommand(
      const vector<string>& params, const map<string, string>& options,
      const vector<string>& flags) :
    LDBCommand(options, flags, false,
               BuildCmdLineOptions({ARG_OLD_COMPACTION_STYLE,
                                    ARG_NEW_COMPACTION_STYLE})),
    old_compaction_style_(-1),
    new_compaction_style_(-1) {

  ParseIntOption(option_map_, ARG_OLD_COMPACTION_STYLE, old_compaction_style_,
    exec_state_);
  if (old_compaction_style_ != kCompactionStyleLevel &&
     old_compaction_style_ != kCompactionStyleUniversal) {
    exec_state_ = LDBCommandExecuteResult::FAILED(
      "Use --" + ARG_OLD_COMPACTION_STYLE + " to specify old compaction " +
      "style. Check ldb help for proper compaction style value.\n");
    return;
  }

  ParseIntOption(option_map_, ARG_NEW_COMPACTION_STYLE, new_compaction_style_,
    exec_state_);
  if (new_compaction_style_ != kCompactionStyleLevel &&
     new_compaction_style_ != kCompactionStyleUniversal) {
    exec_state_ = LDBCommandExecuteResult::FAILED(
      "Use --" + ARG_NEW_COMPACTION_STYLE + " to specify new compaction " +
      "style. Check ldb help for proper compaction style value.\n");
    return;
  }

  if (new_compaction_style_ == old_compaction_style_) {
    exec_state_ = LDBCommandExecuteResult::FAILED(
      "Old compaction style is the same as new compaction style. "
      "Nothing to do.\n");
    return;
  }

  // Universal-compacted data is already usable under level compaction, so
  // this direction requires no conversion.
  if (old_compaction_style_ == kCompactionStyleUniversal &&
      new_compaction_style_ == kCompactionStyleLevel) {
    exec_state_ = LDBCommandExecuteResult::FAILED(
      "Convert from universal compaction to level compaction. "
      "Nothing to do.\n");
    return;
  }
}
+
+void ChangeCompactionStyleCommand::Help(string& ret) {
+  ret.append("  ");
+  ret.append(ChangeCompactionStyleCommand::Name());
+  ret.append(" --" + ARG_OLD_COMPACTION_STYLE + "=<Old compaction style: 0 " +
+             "for level compaction, 1 for universal compaction>");
+  ret.append(" --" + ARG_NEW_COMPACTION_STYLE + "=<New compaction style: 0 " +
+             "for level compaction, 1 for universal compaction>");
+  ret.append("\n");
+}
+
+Options ChangeCompactionStyleCommand::PrepareOptionsForOpenDB() {
+  Options opt = LDBCommand::PrepareOptionsForOpenDB();
+
+  if (old_compaction_style_ == kCompactionStyleLevel &&
+      new_compaction_style_ == kCompactionStyleUniversal) {
+    // In order to convert from level compaction to universal compaction, we
+    // need to compact all data into a single file and move it to level 0.
+    opt.disable_auto_compactions = true;
+    opt.target_file_size_base = INT_MAX;
+    opt.target_file_size_multiplier = 1;
+    opt.max_bytes_for_level_base = INT_MAX;
+    opt.max_bytes_for_level_multiplier = 1;
+  }
+
+  return opt;
+}
+
// Performs the level -> universal conversion: prints per-level file counts,
// compacts everything into a single file moved to level 0, then verifies
// that level 0 holds exactly one file and all other levels are empty.
void ChangeCompactionStyleCommand::DoCommand() {
  // print db stats before we have made any change
  std::string property;
  std::string files_per_level;
  for (int i = 0; i < db_->NumberLevels(); i++) {
    db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(i),
                     &property);

    // format print string
    char buf[100];
    snprintf(buf, sizeof(buf), "%s%s", (i ? "," : ""), property.c_str());
    files_per_level += buf;
  }
  fprintf(stdout, "files per level before compaction: %s\n",
          files_per_level.c_str());

  // manual compact into a single file and move the file to level 0
  db_->CompactRange(nullptr, nullptr,
                    true /* reduce level */,
                    0    /* reduce to level 0 */);

  // verify compaction result
  files_per_level = "";
  int num_files = 0;
  for (int i = 0; i < db_->NumberLevels(); i++) {
    db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(i),
                     &property);

    // format print string
    char buf[100];
    snprintf(buf, sizeof(buf), "%s%s", (i ? "," : ""), property.c_str());
    files_per_level += buf;

    num_files = atoi(property.c_str());

    // level 0 should have only 1 file
    if (i == 0 && num_files != 1) {
      exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at "
        "level 0 after compaction is " + std::to_string(num_files) +
        ", not 1.\n");
      return;
    }
    // other levels should have no file
    if (i > 0 && num_files != 0) {
      exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at "
        "level " + std::to_string(i) + " after compaction is " +
        std::to_string(num_files) + ", not 0.\n");
      return;
    }
  }

  fprintf(stdout, "files per level after compaction: %s\n",
          files_per_level.c_str());
}
+
+class InMemoryHandler : public WriteBatch::Handler {
+ public:
+  InMemoryHandler(stringstream& row, bool print_values) : Handler(),row_(row) {
+    print_values_ = print_values;
+  }
+
+  void commonPutMerge(const Slice& key, const Slice& value) {
+    string k = LDBCommand::StringToHex(key.ToString());
+    if (print_values_) {
+      string v = LDBCommand::StringToHex(value.ToString());
+      row_ << k << " : ";
+      row_ << v << " ";
+    } else {
+      row_ << k << " ";
+    }
+  }
+
+  virtual void Put(const Slice& key, const Slice& value) {
+    row_ << "PUT : ";
+    commonPutMerge(key, value);
+  }
+
+  virtual void Merge(const Slice& key, const Slice& value) {
+    row_ << "MERGE : ";
+    commonPutMerge(key, value);
+  }
+
+  virtual void Delete(const Slice& key) {
+    row_ <<",DELETE : ";
+    row_ << LDBCommand::StringToHex(key.ToString()) << " ";
+  }
+
+  virtual ~InMemoryHandler() { };
+
+ private:
+  stringstream & row_;
+  bool print_values_;
+};
+
// Option names specific to this command (consumed by the ctor, listed in
// Help()).
const string WALDumperCommand::ARG_WAL_FILE = "walfile";
const string WALDumperCommand::ARG_PRINT_VALUE = "print_value";
const string WALDumperCommand::ARG_PRINT_HEADER = "header";
+
+WALDumperCommand::WALDumperCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+    LDBCommand(options, flags, true,
+               BuildCmdLineOptions(
+                {ARG_WAL_FILE, ARG_PRINT_HEADER, ARG_PRINT_VALUE})),
+    print_header_(false), print_values_(false) {
+
+  wal_file_.clear();
+
+  map<string, string>::const_iterator itr = options.find(ARG_WAL_FILE);
+  if (itr != options.end()) {
+    wal_file_ = itr->second;
+  }
+
+
+  print_header_ = IsFlagPresent(flags, ARG_PRINT_HEADER);
+  print_values_ = IsFlagPresent(flags, ARG_PRINT_VALUE);
+  if (wal_file_.empty()) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+                    "Argument " + ARG_WAL_FILE + " must be specified.");
+  }
+}
+
+void WALDumperCommand::Help(string& ret) {
+  ret.append("  ");
+  ret.append(WALDumperCommand::Name());
+  ret.append(" --" + ARG_WAL_FILE + "=<write_ahead_log_file_path>");
+  ret.append(" [--" + ARG_PRINT_HEADER + "] ");
+  ret.append(" [--" + ARG_PRINT_VALUE + "] ");
+  ret.append("\n");
+}
+
// Reads the WAL file record by record and prints one row per write batch:
// sequence, entry count, byte size, physical offset, and the batch's
// operations (formatted by InMemoryHandler). Corruptions go to stderr.
void WALDumperCommand::DoCommand() {
  struct StdErrReporter : public log::Reader::Reporter {
    virtual void Corruption(size_t bytes, const Status& s) {
      cerr<<"Corruption detected in log file "<<s.ToString()<<"\n";
    }
  };

  unique_ptr<SequentialFile> file;
  Env* env_ = Env::Default();
  EnvOptions soptions;
  Status status = env_->NewSequentialFile(wal_file_, &file, soptions);
  if (!status.ok()) {
    exec_state_ = LDBCommandExecuteResult::FAILED("Failed to open WAL file " +
      status.ToString());
  } else {
    StdErrReporter reporter;
    log::Reader reader(move(file), &reporter, true, 0);
    string scratch;
    WriteBatch batch;
    Slice record;
    stringstream row;
    if (print_header_) {
      cout<<"Sequence,Count,ByteSize,Physical Offset,Key(s)";
      if (print_values_) {
        cout << " : value ";
      }
      cout << "\n";
    }
    while(reader.ReadRecord(&record, &scratch)) {
      row.str("");
      // Records shorter than the write-batch header cannot be decoded.
      if (record.size() < 12) {
        reporter.Corruption(
            record.size(), Status::Corruption("log record too small"));
      } else {
        WriteBatchInternal::SetContents(&batch, record);
        row<<WriteBatchInternal::Sequence(&batch)<<",";
        row<<WriteBatchInternal::Count(&batch)<<",";
        row<<WriteBatchInternal::ByteSize(&batch)<<",";
        row<<reader.LastRecordOffset()<<",";
        InMemoryHandler handler(row, print_values_);
        batch.Iterate(&handler);
        row<<"\n";
      }
      cout<<row.str();
    }
  }
}
+
+
+GetCommand::GetCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+  LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_TTL, ARG_HEX,
+                                                        ARG_KEY_HEX,
+                                                        ARG_VALUE_HEX})) {
+
+  if (params.size() != 1) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+                    "<key> must be specified for the get command");
+  } else {
+    key_ = params.at(0);
+  }
+
+  if (is_key_hex_) {
+    key_ = HexToString(key_);
+  }
+}
+
+void GetCommand::Help(string& ret) {
+  ret.append("  ");
+  ret.append(GetCommand::Name());
+  ret.append(" <key>");
+  ret.append(" [--" + ARG_TTL + "]");
+  ret.append("\n");
+}
+
+void GetCommand::DoCommand() {
+  string value;
+  Status st = db_->Get(ReadOptions(), key_, &value);
+  if (st.ok()) {
+    fprintf(stdout, "%s\n",
+              (is_value_hex_ ? StringToHex(value) : value).c_str());
+  } else {
+    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+  }
+}
+
+
+ApproxSizeCommand::ApproxSizeCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+  LDBCommand(options, flags, true,
+             BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX,
+                                  ARG_FROM, ARG_TO})) {
+
+  if (options.find(ARG_FROM) != options.end()) {
+    start_key_ = options.find(ARG_FROM)->second;
+  } else {
+    exec_state_ = LDBCommandExecuteResult::FAILED(ARG_FROM +
+                    " must be specified for approxsize command");
+    return;
+  }
+
+  if (options.find(ARG_TO) != options.end()) {
+    end_key_ = options.find(ARG_TO)->second;
+  } else {
+    exec_state_ = LDBCommandExecuteResult::FAILED(ARG_TO +
+                    " must be specified for approxsize command");
+    return;
+  }
+
+  if (is_key_hex_) {
+    start_key_ = HexToString(start_key_);
+    end_key_ = HexToString(end_key_);
+  }
+}
+
+void ApproxSizeCommand::Help(string& ret) {
+  ret.append("  ");
+  ret.append(ApproxSizeCommand::Name());
+  ret.append(HelpRangeCmdArgs());
+  ret.append("\n");
+}
+
+void ApproxSizeCommand::DoCommand() {
+
+  Range ranges[1];
+  ranges[0] = Range(start_key_, end_key_);
+  uint64_t sizes[1];
+  db_->GetApproximateSizes(ranges, 1, sizes);
+  fprintf(stdout, "%lu\n", (unsigned long)sizes[0]);
+  /* Weird that GetApproximateSizes() returns void, although documentation
+   * says that it returns a Status object.
+  if (!st.ok()) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+  }
+  */
+}
+
+
+BatchPutCommand::BatchPutCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+  LDBCommand(options, flags, false,
+             BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX,
+                                  ARG_CREATE_IF_MISSING})) {
+
+  if (params.size() < 2) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+        "At least one <key> <value> pair must be specified batchput.");
+  } else if (params.size() % 2 != 0) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+        "Equal number of <key>s and <value>s must be specified for batchput.");
+  } else {
+    for (size_t i = 0; i < params.size(); i += 2) {
+      string key = params.at(i);
+      string value = params.at(i+1);
+      key_values_.push_back(pair<string, string>(
+                    is_key_hex_ ? HexToString(key) : key,
+                    is_value_hex_ ? HexToString(value) : value));
+    }
+  }
+}
+
+void BatchPutCommand::Help(string& ret) {
+  ret.append("  ");
+  ret.append(BatchPutCommand::Name());
+  ret.append(" <key> <value> [<key> <value>] [..]");
+  ret.append(" [--" + ARG_TTL + "]");
+  ret.append("\n");
+}
+
+void BatchPutCommand::DoCommand() {
+  WriteBatch batch;
+
+  for (vector<pair<string, string>>::const_iterator itr
+        = key_values_.begin(); itr != key_values_.end(); itr++) {
+      batch.Put(itr->first, itr->second);
+  }
+  Status st = db_->Write(WriteOptions(), &batch);
+  if (st.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+  }
+}
+
+Options BatchPutCommand::PrepareOptionsForOpenDB() {
+  Options opt = LDBCommand::PrepareOptionsForOpenDB();
+  opt.create_if_missing = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING);
+  return opt;
+}
+
+
+ScanCommand::ScanCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+    LDBCommand(options, flags, true,
+               BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_TO,
+                                    ARG_VALUE_HEX, ARG_FROM, ARG_TIMESTAMP,
+                                    ARG_MAX_KEYS, ARG_TTL_START, ARG_TTL_END})),
+    start_key_specified_(false),
+    end_key_specified_(false),
+    max_keys_scanned_(-1) {
+
+  map<string, string>::const_iterator itr = options.find(ARG_FROM);
+  if (itr != options.end()) {
+    start_key_ = itr->second;
+    if (is_key_hex_) {
+      start_key_ = HexToString(start_key_);
+    }
+    start_key_specified_ = true;
+  }
+  itr = options.find(ARG_TO);
+  if (itr != options.end()) {
+    end_key_ = itr->second;
+    if (is_key_hex_) {
+      end_key_ = HexToString(end_key_);
+    }
+    end_key_specified_ = true;
+  }
+
+  itr = options.find(ARG_MAX_KEYS);
+  if (itr != options.end()) {
+    try {
+      max_keys_scanned_ = stoi(itr->second);
+    } catch(const invalid_argument&) {
+      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS +
+                        " has an invalid value");
+    } catch(const out_of_range&) {
+      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS +
+                        " has a value out-of-range");
+    }
+  }
+}
+
+void ScanCommand::Help(string& ret) {
+  ret.append("  ");
+  ret.append(ScanCommand::Name());
+  ret.append(HelpRangeCmdArgs());
+  ret.append(" [--" + ARG_TTL + "]");
+  ret.append(" [--" + ARG_TIMESTAMP + "]");
+  ret.append(" [--" + ARG_MAX_KEYS + "=<N>q] ");
+  ret.append(" [--" + ARG_TTL_START + "=<N>:- is inclusive]");
+  ret.append(" [--" + ARG_TTL_END + "=<N>:- is exclusive]");
+  ret.append("\n");
+}
+
// Iterates [start_key_, end_key_) printing "key : value" pairs, filtered by
// the TTL window for TTL dbs and capped at max_keys_scanned_ entries.
void ScanCommand::DoCommand() {

  int num_keys_scanned = 0;
  Iterator* it = db_->NewIterator(ReadOptions());
  if (start_key_specified_) {
    it->Seek(start_key_);
  } else {
    it->SeekToFirst();
  }
  // TTL window defaults to [kMinTimestamp, kMaxTimestamp) when unspecified.
  int ttl_start;
  if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) {
    ttl_start = DBWithTTLImpl::kMinTimestamp;  // TTL introduction time
  }
  int ttl_end;
  if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) {
    ttl_end = DBWithTTLImpl::kMaxTimestamp;  // Max time allowed by TTL feature
  }
  if (ttl_end < ttl_start) {
    fprintf(stderr, "Error: End time can't be less than start time\n");
    delete it;
    return;
  }
  if (is_db_ttl_ && timestamp_) {
    fprintf(stdout, "Scanning key-values from %s to %s\n",
            ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str());
  }
  for ( ;
        it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_);
        it->Next()) {
    string key = it->key().ToString();
    if (is_db_ttl_) {
      // A TTL db's iterator exposes the per-key write timestamp.
      TtlIterator* it_ttl = dynamic_cast<TtlIterator*>(it);
      assert(it_ttl);
      int rawtime = it_ttl->timestamp();
      if (rawtime < ttl_start || rawtime >= ttl_end) {
        continue;
      }
      if (timestamp_) {
        fprintf(stdout, "%s ", ReadableTime(rawtime).c_str());
      }
    }
    string value = it->value().ToString();
    fprintf(stdout, "%s : %s\n",
          (is_key_hex_ ? StringToHex(key) : key).c_str(),
          (is_value_hex_ ? StringToHex(value) : value).c_str()
        );
    num_keys_scanned++;
    if (max_keys_scanned_ >= 0 && num_keys_scanned >= max_keys_scanned_) {
      break;
    }
  }
  if (!it->status().ok()) {  // Check for any errors found during the scan
    exec_state_ = LDBCommandExecuteResult::FAILED(it->status().ToString());
  }
  delete it;
}
+
+
+DeleteCommand::DeleteCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+  LDBCommand(options, flags, false,
+             BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
+
+  if (params.size() != 1) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+                    "KEY must be specified for the delete command");
+  } else {
+    key_ = params.at(0);
+    if (is_key_hex_) {
+      key_ = HexToString(key_);
+    }
+  }
+}
+
+// Appends the usage line for the "delete" command to 'ret'.
+void DeleteCommand::Help(string& ret) {
+  ret.append("  ").append(DeleteCommand::Name() + " <key>").append("\n");
+}
+
+// Deletes key_ from the DB; failures are surfaced through exec_state_.
+void DeleteCommand::DoCommand() {
+  const Status status = db_->Delete(WriteOptions(), key_);
+  if (!status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(status.ToString());
+  } else {
+    fprintf(stdout, "OK\n");
+  }
+}
+
+
+PutCommand::PutCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags) :
+  LDBCommand(options, flags, false,
+             BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX,
+                                  ARG_CREATE_IF_MISSING})) {
+
+  if (params.size() != 2) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(
+                    "<key> and <value> must be specified for the put command");
+  } else {
+    key_ = params.at(0);
+    value_ = params.at(1);
+  }
+
+  if (is_key_hex_) {
+    key_ = HexToString(key_);
+  }
+
+  if (is_value_hex_) {
+    value_ = HexToString(value_);
+  }
+}
+
+// Appends the usage line for the "put" command to 'ret'.
+void PutCommand::Help(string& ret) {
+  ret.append("  ")
+     .append(PutCommand::Name())
+     .append(" <key> <value> ")
+     .append(" [--" + ARG_TTL + "]")
+     .append("\n");
+}
+
+// Writes key_/value_ to the DB; failures are surfaced through exec_state_.
+void PutCommand::DoCommand() {
+  const Status status = db_->Put(WriteOptions(), key_, value_);
+  if (!status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::FAILED(status.ToString());
+  } else {
+    fprintf(stdout, "OK\n");
+  }
+}
+
+// Extends the base options: --create_if_missing lets "put" create a new DB.
+Options PutCommand::PrepareOptionsForOpenDB() {
+  Options opt = LDBCommand::PrepareOptionsForOpenDB();
+  opt.create_if_missing = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING);
+  return opt;
+}
+
+
+// Verbs accepted by the interactive query shell (see DoCommand's REPL loop).
+const char* DBQuerierCommand::HELP_CMD = "help";
+const char* DBQuerierCommand::GET_CMD = "get";
+const char* DBQuerierCommand::PUT_CMD = "put";
+const char* DBQuerierCommand::DELETE_CMD = "delete";
+
+// Constructs the interactive "query" command; no positional params are read.
+DBQuerierCommand::DBQuerierCommand(const vector<string>& params,
+    const map<string, string>& options, const vector<string>& flags) :
+  LDBCommand(options, flags, false,
+             BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
+                                  ARG_VALUE_HEX})) {
+
+}
+
+// Appends the usage/description lines for the "query" command to 'ret'.
+void DBQuerierCommand::Help(string& ret) {
+  ret.append("  ")
+     .append(DBQuerierCommand::Name())
+     .append(" [--" + ARG_TTL + "]")
+     .append("\n")
+     .append("    Starts a REPL shell.  Type help for list of available "
+             "commands.")
+     .append("\n");
+}
+
+void DBQuerierCommand::DoCommand() {
+  if (!db_) {
+    return;
+  }
+
+  ReadOptions read_options;
+  WriteOptions write_options;
+
+  string line;
+  string key;
+  string value;
+  while (getline(cin, line, '\n')) {
+
+    // Parse line into vector<string>
+    vector<string> tokens;
+    size_t pos = 0;
+    while (true) {
+      size_t pos2 = line.find(' ', pos);
+      if (pos2 == string::npos) {
+        break;
+      }
+      tokens.push_back(line.substr(pos, pos2-pos));
+      pos = pos2 + 1;
+    }
+    tokens.push_back(line.substr(pos));
+
+    const string& cmd = tokens[0];
+
+    if (cmd == HELP_CMD) {
+      fprintf(stdout,
+              "get <key>\n"
+              "put <key> <value>\n"
+              "delete <key>\n");
+    } else if (cmd == DELETE_CMD && tokens.size() == 2) {
+      key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]);
+      db_->Delete(write_options, Slice(key));
+      fprintf(stdout, "Successfully deleted %s\n", tokens[1].c_str());
+    } else if (cmd == PUT_CMD && tokens.size() == 3) {
+      key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]);
+      value = (is_value_hex_ ? HexToString(tokens[2]) : tokens[2]);
+      db_->Put(write_options, Slice(key), Slice(value));
+      fprintf(stdout, "Successfully put %s %s\n",
+              tokens[1].c_str(), tokens[2].c_str());
+    } else if (cmd == GET_CMD && tokens.size() == 2) {
+      key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]);
+      if (db_->Get(read_options, Slice(key), &value).ok()) {
+        fprintf(stdout, "%s\n", PrintKeyValue(key, value,
+              is_key_hex_, is_value_hex_).c_str());
+      } else {
+        fprintf(stdout, "Not found %s\n", tokens[1].c_str());
+      }
+    } else {
+      fprintf(stdout, "Unknown command %s\n", line.c_str());
+    }
+  }
+}
+
+// "checkconsistency" accepts no command-specific options or positional params.
+CheckConsistencyCommand::CheckConsistencyCommand(const vector<string>& params,
+    const map<string, string>& options, const vector<string>& flags) :
+  LDBCommand(options, flags, false,
+             BuildCmdLineOptions({})) {
+}
+
+// Appends the usage line for the "checkconsistency" command to 'ret'.
+void CheckConsistencyCommand::Help(string& ret) {
+  ret.append("  ").append(CheckConsistencyCommand::Name()).append("\n");
+}
+
+// Opens the DB read-only with paranoid checks enabled; a successful open
+// implies the on-disk state is consistent.
+void CheckConsistencyCommand::DoCommand() {
+  Options opt = PrepareOptionsForOpenDB();
+  opt.paranoid_checks = true;
+  if (!exec_state_.IsNotStarted()) {
+    return;
+  }
+  // Fixed: must initialize to nullptr — OpenForReadOnly can fail without
+  // assigning 'db', and deleting an uninitialized pointer is undefined
+  // behavior.  delete nullptr is a safe no-op.
+  DB* db = nullptr;
+  Status st = DB::OpenForReadOnly(opt, db_path_, &db, false);
+  delete db;
+  if (st.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+  }
+}
+
+}   // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/util/ldb_cmd.h b/util/ldb_cmd.h
new file mode 100644 (file)
index 0000000..4f760e0
--- /dev/null
@@ -0,0 +1,722 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <stdlib.h>
+#include <algorithm>
+#include <stdio.h>
+
+#include "db/version_set.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "util/logging.h"
+#include "util/ldb_cmd_execute_result.h"
+#include "util/string_util.h"
+#include "utilities/db_ttl.h"
+#include "utilities/ttl/db_ttl_impl.h"
+
+using std::string;
+using std::map;
+using std::vector;
+using std::ostringstream;
+
+namespace rocksdb {
+
+class LDBCommand {
+public:
+
+  // Command-line arguments
+  static const string ARG_DB;
+  static const string ARG_HEX;
+  static const string ARG_KEY_HEX;
+  static const string ARG_VALUE_HEX;
+  static const string ARG_TTL;
+  static const string ARG_TTL_START;
+  static const string ARG_TTL_END;
+  static const string ARG_TIMESTAMP;
+  static const string ARG_FROM;
+  static const string ARG_TO;
+  static const string ARG_MAX_KEYS;
+  static const string ARG_BLOOM_BITS;
+  static const string ARG_COMPRESSION_TYPE;
+  static const string ARG_BLOCK_SIZE;
+  static const string ARG_AUTO_COMPACTION;
+  static const string ARG_WRITE_BUFFER_SIZE;
+  static const string ARG_FILE_SIZE;
+  static const string ARG_CREATE_IF_MISSING;
+
+  static LDBCommand* InitFromCmdLineArgs(
+    const vector<string>& args,
+    const Options& options = Options()
+  );
+
+  static LDBCommand* InitFromCmdLineArgs(
+    int argc,
+    char** argv,
+    const Options& options = Options()
+  );
+
+  bool ValidateCmdLineOptions();
+
+  virtual Options PrepareOptionsForOpenDB();
+
+  virtual void SetOptions(Options options) {
+    options_ = options;
+  }
+
+  virtual bool NoDBOpen() {
+    return false;
+  }
+
+  virtual ~LDBCommand() {
+    if (db_ != nullptr) {
+      delete db_;
+      db_ = nullptr;
+    }
+  }
+
+  /* Run the command, and return the execute result. */
+  void Run() {
+    if (!exec_state_.IsNotStarted()) {
+      return;
+    }
+
+    if (db_ == nullptr && !NoDBOpen()) {
+      OpenDB();
+      if (!exec_state_.IsNotStarted()) {
+        return;
+      }
+    }
+
+    DoCommand();
+    if (exec_state_.IsNotStarted()) {
+      exec_state_ = LDBCommandExecuteResult::SUCCEED("");
+    }
+
+    if (db_ != nullptr) {
+      CloseDB ();
+    }
+  }
+
+  virtual void DoCommand() = 0;
+
+  LDBCommandExecuteResult GetExecuteState() {
+    return exec_state_;
+  }
+
+  void ClearPreviousRunState() {
+    exec_state_.Reset();
+  }
+
+  static string HexToString(const string& str) {
+    string parsed;
+    if (str[0] != '0' || str[1] != 'x') {
+      fprintf(stderr, "Invalid hex input %s.  Must start with 0x\n",
+              str.c_str());
+      throw "Invalid hex input";
+    }
+
+    for (unsigned int i = 2; i < str.length();) {
+      int c;
+      sscanf(str.c_str() + i, "%2X", &c);
+      parsed.push_back(c);
+      i += 2;
+    }
+    return parsed;
+  }
+
+  static string StringToHex(const string& str) {
+    string result = "0x";
+    char buf[10];
+    for (size_t i = 0; i < str.length(); i++) {
+      snprintf(buf, 10, "%02X", (unsigned char)str[i]);
+      result += buf;
+    }
+    return result;
+  }
+
+  static const char* DELIM;
+
+protected:
+
+  LDBCommandExecuteResult exec_state_;
+  string db_path_;
+  DB* db_;
+  DBWithTTL* db_ttl_;
+
+  /**
+   * true implies that this command can work if the db is opened in read-only
+   * mode.
+   */
+  bool is_read_only_;
+
+  /** If true, the key is input/output as hex in get/put/scan/delete etc. */
+  bool is_key_hex_;
+
+  /** If true, the value is input/output as hex in get/put/scan/delete etc. */
+  bool is_value_hex_;
+
+  /** If true, the value is treated as timestamp suffixed */
+  bool is_db_ttl_;
+
+  // If true, the kvs are output with their insert/modify timestamp in a ttl db
+  bool timestamp_;
+
+  /**
+   * Map of options passed on the command-line.
+   */
+  const map<string, string> option_map_;
+
+  /**
+   * Flags passed on the command-line.
+   */
+  const vector<string> flags_;
+
+  /** List of command-line options valid for this command */
+  const vector<string> valid_cmd_line_options_;
+
+  bool ParseKeyValue(const string& line, string* key, string* value,
+                      bool is_key_hex, bool is_value_hex);
+
+  LDBCommand(const map<string, string>& options, const vector<string>& flags,
+             bool is_read_only, const vector<string>& valid_cmd_line_options) :
+      db_(nullptr),
+      is_read_only_(is_read_only),
+      is_key_hex_(false),
+      is_value_hex_(false),
+      is_db_ttl_(false),
+      timestamp_(false),
+      option_map_(options),
+      flags_(flags),
+      valid_cmd_line_options_(valid_cmd_line_options) {
+
+    map<string, string>::const_iterator itr = options.find(ARG_DB);
+    if (itr != options.end()) {
+      db_path_ = itr->second;
+    }
+
+    is_key_hex_ = IsKeyHex(options, flags);
+    is_value_hex_ = IsValueHex(options, flags);
+    is_db_ttl_ = IsFlagPresent(flags, ARG_TTL);
+    timestamp_ = IsFlagPresent(flags, ARG_TIMESTAMP);
+  }
+
+  void OpenDB() {
+    Options opt = PrepareOptionsForOpenDB();
+    if (!exec_state_.IsNotStarted()) {
+      return;
+    }
+    // Open the DB.
+    Status st;
+    if (is_db_ttl_) {
+      if (is_read_only_) {
+        st = DBWithTTL::Open(opt, db_path_, &db_ttl_, 0, true);
+      } else {
+        st = DBWithTTL::Open(opt, db_path_, &db_ttl_);
+      }
+      db_ = db_ttl_;
+    } else if (is_read_only_) {
+      st = DB::OpenForReadOnly(opt, db_path_, &db_);
+    } else {
+      st = DB::Open(opt, db_path_, &db_);
+    }
+    if (!st.ok()) {
+      string msg = st.ToString();
+      exec_state_ = LDBCommandExecuteResult::FAILED(msg);
+    }
+
+    options_ = opt;
+  }
+
+  void CloseDB () {
+    if (db_ != nullptr) {
+      delete db_;
+      db_ = nullptr;
+    }
+  }
+
+  static string PrintKeyValue(const string& key, const string& value,
+        bool is_key_hex, bool is_value_hex) {
+    string result;
+    result.append(is_key_hex ? StringToHex(key) : key);
+    result.append(DELIM);
+    result.append(is_value_hex ? StringToHex(value) : value);
+    return result;
+  }
+
+  static string PrintKeyValue(const string& key, const string& value,
+        bool is_hex) {
+    return PrintKeyValue(key, value, is_hex, is_hex);
+  }
+
+  /**
+   * Return true if the specified flag is present in the specified flags vector
+   */
+  static bool IsFlagPresent(const vector<string>& flags, const string& flag) {
+    return (std::find(flags.begin(), flags.end(), flag) != flags.end());
+  }
+
+  static string HelpRangeCmdArgs() {
+    ostringstream str_stream;
+    str_stream << " ";
+    str_stream << "[--" << ARG_FROM << "] ";
+    str_stream << "[--" << ARG_TO << "] ";
+    return str_stream.str();
+  }
+
+  /**
+   * A helper function that returns a list of command line options
+   * used by this command.  It includes the common options and the ones
+   * passed in.
+   */
+  vector<string> BuildCmdLineOptions(vector<string> options) {
+    vector<string> ret = {ARG_DB, ARG_BLOOM_BITS, ARG_BLOCK_SIZE,
+                          ARG_AUTO_COMPACTION, ARG_COMPRESSION_TYPE,
+                          ARG_WRITE_BUFFER_SIZE, ARG_FILE_SIZE};
+    ret.insert(ret.end(), options.begin(), options.end());
+    return ret;
+  }
+
+  bool ParseIntOption(const map<string, string>& options, const string& option,
+                      int& value, LDBCommandExecuteResult& exec_state);
+
+  bool ParseStringOption(const map<string, string>& options,
+                         const string& option, string* value);
+
+  Options options_;
+
+private:
+
+  /**
+   * Interpret command line options and flags to determine if the key
+   * should be input/output in hex.
+   */
+  bool IsKeyHex(const map<string, string>& options,
+      const vector<string>& flags) {
+    return (IsFlagPresent(flags, ARG_HEX) ||
+        IsFlagPresent(flags, ARG_KEY_HEX) ||
+        ParseBooleanOption(options, ARG_HEX, false) ||
+        ParseBooleanOption(options, ARG_KEY_HEX, false));
+  }
+
+  /**
+   * Interpret command line options and flags to determine if the value
+   * should be input/output in hex.
+   */
+  bool IsValueHex(const map<string, string>& options,
+      const vector<string>& flags) {
+    return (IsFlagPresent(flags, ARG_HEX) ||
+          IsFlagPresent(flags, ARG_VALUE_HEX) ||
+          ParseBooleanOption(options, ARG_HEX, false) ||
+          ParseBooleanOption(options, ARG_VALUE_HEX, false));
+  }
+
+  /**
+   * Returns the value of the specified option as a boolean.
+   * default_val is used if the option is not found in options.
+   * Throws an exception if the value of the option is not
+   * "true" or "false" (case insensitive).
+   */
+  bool ParseBooleanOption(const map<string, string>& options,
+      const string& option, bool default_val) {
+
+    map<string, string>::const_iterator itr = options.find(option);
+    if (itr != options.end()) {
+      string option_val = itr->second;
+      return StringToBool(itr->second);
+    }
+    return default_val;
+  }
+
+  /**
+   * Converts val to a boolean.
+   * val must be either true or false (case insensitive).
+   * Otherwise an exception is thrown.
+   */
+  bool StringToBool(string val) {
+    std::transform(val.begin(), val.end(), val.begin(), ::tolower);
+    if (val == "true") {
+      return true;
+    } else if (val == "false") {
+      return false;
+    } else {
+      throw "Invalid value for boolean argument";
+    }
+  }
+
+  static LDBCommand* SelectCommand(
+    const string& cmd,
+    const vector<string>& cmdParams,
+    const map<string, string>& option_map,
+    const vector<string>& flags
+  );
+
+};
+
+// Declaration of the ldb "compact" command handler.
+class CompactorCommand: public LDBCommand {
+public:
+  static string Name() { return "compact"; }
+
+  CompactorCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags);
+
+  static void Help(string& ret);
+
+  virtual void DoCommand();
+
+private:
+  bool null_from_;
+  string from_;
+  bool null_to_;
+  string to_;
+};
+
+// Declaration of the ldb "dump" command handler.
+class DBDumperCommand: public LDBCommand {
+public:
+  static string Name() { return "dump"; }
+
+  DBDumperCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags);
+
+  static void Help(string& ret);
+
+  virtual void DoCommand();
+
+private:
+  bool null_from_;
+  string from_;
+  bool null_to_;
+  string to_;
+  int max_keys_;
+  string delim_;
+  bool count_only_;
+  bool count_delim_;
+  bool print_stats_;
+
+  // Command-specific option names (defined in ldb_cmd.cc).
+  static const string ARG_COUNT_ONLY;
+  static const string ARG_COUNT_DELIM;
+  static const string ARG_STATS;
+  static const string ARG_TTL_BUCKET;
+};
+
+// Declaration of the ldb "idump" command handler.
+class InternalDumpCommand: public LDBCommand {
+public:
+  static string Name() { return "idump"; }
+
+  InternalDumpCommand(const vector<string>& params,
+                      const map<string, string>& options,
+                      const vector<string>& flags);
+
+  static void Help(string& ret);
+
+  virtual void DoCommand();
+
+private:
+  bool has_from_;
+  string from_;
+  bool has_to_;
+  string to_;
+  int max_keys_;
+  string delim_;
+  bool count_only_;
+  bool count_delim_;
+  bool print_stats_;
+  bool is_input_key_hex_;
+
+  // Command-specific option names (defined in ldb_cmd.cc).
+  static const string ARG_DELIM;
+  static const string ARG_COUNT_ONLY;
+  static const string ARG_COUNT_DELIM;
+  static const string ARG_STATS;
+  static const string ARG_INPUT_KEY_HEX;
+};
+
+// Declaration of the ldb "load" command handler.
+class DBLoaderCommand: public LDBCommand {
+public:
+  static string Name() { return "load"; }
+
+  DBLoaderCommand(string& db_name, vector<string>& args);
+
+  DBLoaderCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags);
+
+  static void Help(string& ret);
+  virtual void DoCommand();
+
+  // Overridden so "load" can honor creation/bulk-load options at open time.
+  virtual Options PrepareOptionsForOpenDB();
+
+private:
+  bool create_if_missing_;
+  bool disable_wal_;
+  bool bulk_load_;
+  bool compact_;
+
+  // Command-specific option names (defined in ldb_cmd.cc).
+  static const string ARG_DISABLE_WAL;
+  static const string ARG_BULK_LOAD;
+  static const string ARG_COMPACT;
+};
+
+// Declaration of the ldb "manifest_dump" command handler.
+class ManifestDumpCommand: public LDBCommand {
+public:
+  static string Name() { return "manifest_dump"; }
+
+  ManifestDumpCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags);
+
+  static void Help(string& ret);
+  virtual void DoCommand();
+
+  // Reads the manifest file directly; no DB open is needed.
+  virtual bool NoDBOpen() {
+    return true;
+  }
+
+private:
+  bool verbose_;
+  string path_;
+
+  // Command-specific option names (defined in ldb_cmd.cc).
+  static const string ARG_VERBOSE;
+  static const string ARG_PATH;
+};
+
+// Declaration of the ldb "list_column_families" command handler.
+class ListColumnFamiliesCommand : public LDBCommand {
+ public:
+  static string Name() { return "list_column_families"; }
+
+  ListColumnFamiliesCommand(const vector<string>& params,
+                            const map<string, string>& options,
+                            const vector<string>& flags);
+
+  static void Help(string& ret);
+  virtual void DoCommand();
+
+  // Lists families without opening the DB itself.
+  virtual bool NoDBOpen() { return true; }
+
+ private:
+  string dbname_;
+};
+
+// Declaration of the ldb "reduce_levels" command handler.
+class ReduceDBLevelsCommand : public LDBCommand {
+public:
+  static string Name() { return "reduce_levels"; }
+
+  ReduceDBLevelsCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags);
+
+  virtual Options PrepareOptionsForOpenDB();
+
+  virtual void DoCommand();
+
+  // Manages its own DB open sequence inside DoCommand.
+  virtual bool NoDBOpen() {
+    return true;
+  }
+
+  static void Help(string& msg);
+
+  // Builds the argument vector needed to invoke this command
+  // programmatically.
+  static vector<string> PrepareArgs(const string& db_path, int new_levels,
+      bool print_old_level = false);
+
+private:
+  int old_levels_;
+  int new_levels_;
+  bool print_old_levels_;
+
+  // Command-specific option names (defined in ldb_cmd.cc).
+  static const string ARG_NEW_LEVELS;
+  static const string ARG_PRINT_OLD_LEVELS;
+
+  Status GetOldNumOfLevels(Options& opt, int* levels);
+};
+
+// Declaration of the ldb "change_compaction_style" command handler.
+class ChangeCompactionStyleCommand : public LDBCommand {
+public:
+  static string Name() { return "change_compaction_style"; }
+
+  ChangeCompactionStyleCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags);
+
+  virtual Options PrepareOptionsForOpenDB();
+
+  virtual void DoCommand();
+
+  static void Help(string& msg);
+
+private:
+  int old_compaction_style_;
+  int new_compaction_style_;
+
+  // Command-specific option names (defined in ldb_cmd.cc).
+  static const string ARG_OLD_COMPACTION_STYLE;
+  static const string ARG_NEW_COMPACTION_STYLE;
+};
+
+// Declaration of the ldb "dump_wal" command handler.
+class WALDumperCommand : public LDBCommand {
+public:
+  static string Name() { return "dump_wal"; }
+
+  WALDumperCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags);
+
+  // Reads the WAL file directly; no DB open is needed.
+  virtual bool  NoDBOpen() {
+    return true;
+  }
+
+  static void Help(string& ret);
+  virtual void DoCommand();
+
+private:
+  bool print_header_;
+  string wal_file_;
+  bool print_values_;
+
+  // Command-specific option names (defined in ldb_cmd.cc).
+  static const string ARG_WAL_FILE;
+  static const string ARG_PRINT_HEADER;
+  static const string ARG_PRINT_VALUE;
+};
+
+
+// Declaration of the ldb "get" command handler.
+class GetCommand : public LDBCommand {
+public:
+  static string Name() { return "get"; }
+
+  GetCommand(const vector<string>& params, const map<string, string>& options,
+      const vector<string>& flags);
+
+  virtual void DoCommand();
+
+  static void Help(string& ret);
+
+private:
+  string key_;
+};
+
+// Declaration of the ldb "approxsize" command handler.
+class ApproxSizeCommand : public LDBCommand {
+public:
+  static string Name() { return "approxsize"; }
+
+  ApproxSizeCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags);
+
+  virtual void DoCommand();
+
+  static void Help(string& ret);
+
+private:
+  string start_key_;
+  string end_key_;
+};
+
+// Declaration of the ldb "batchput" command handler.
+class BatchPutCommand : public LDBCommand {
+public:
+  static string Name() { return "batchput"; }
+
+  BatchPutCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags);
+
+  virtual void DoCommand();
+
+  static void Help(string& ret);
+
+  virtual Options PrepareOptionsForOpenDB();
+
+private:
+  /**
+   * The key-values to be inserted.
+   */
+  vector<std::pair<string, string>> key_values_;
+};
+
+// Declaration of the ldb "scan" command handler.
+class ScanCommand : public LDBCommand {
+public:
+  static string Name() { return "scan"; }
+
+  ScanCommand(const vector<string>& params, const map<string, string>& options,
+      const vector<string>& flags);
+
+  virtual void DoCommand();
+
+  static void Help(string& ret);
+
+private:
+  string start_key_;
+  string end_key_;
+  bool start_key_specified_;
+  bool end_key_specified_;
+  int max_keys_scanned_;   // -1 means unlimited (set in the constructor)
+};
+
+// Declaration of the ldb "delete" command handler.
+class DeleteCommand : public LDBCommand {
+public:
+  static string Name() { return "delete"; }
+
+  DeleteCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags);
+
+  virtual void DoCommand();
+
+  static void Help(string& ret);
+
+private:
+  string key_;
+};
+
+// Declaration of the ldb "put" command handler.
+class PutCommand : public LDBCommand {
+public:
+  static string Name() { return "put"; }
+
+  PutCommand(const vector<string>& params, const map<string, string>& options,
+      const vector<string>& flags);
+
+  virtual void DoCommand();
+
+  static void Help(string& ret);
+
+  // Overridden so "put" can honor --create_if_missing at open time.
+  virtual Options PrepareOptionsForOpenDB();
+
+private:
+  string key_;
+  string value_;
+};
+
+/**
+ * Command that starts up a REPL shell that allows
+ * get/put/delete.
+ */
+class DBQuerierCommand: public LDBCommand {
+public:
+  static string Name() { return "query"; }
+
+  DBQuerierCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags);
+
+  static void Help(string& ret);
+
+  virtual void DoCommand();
+
+private:
+  // Verbs recognized by the shell (defined in ldb_cmd.cc).
+  static const char* HELP_CMD;
+  static const char* GET_CMD;
+  static const char* PUT_CMD;
+  static const char* DELETE_CMD;
+};
+
+// Declaration of the ldb "checkconsistency" command handler.
+class CheckConsistencyCommand : public LDBCommand {
+public:
+  static string Name() { return "checkconsistency"; }
+
+  CheckConsistencyCommand(const vector<string>& params,
+      const map<string, string>& options, const vector<string>& flags);
+
+  virtual void DoCommand();
+
+  // Performs its own read-only open inside DoCommand.
+  virtual bool NoDBOpen() {
+    return true;
+  }
+
+  static void Help(string& ret);
+};
+
+} // namespace rocksdb
diff --git a/util/ldb_cmd_execute_result.h b/util/ldb_cmd_execute_result.h
new file mode 100644 (file)
index 0000000..b9121b2
--- /dev/null
@@ -0,0 +1,76 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+
+namespace rocksdb {
+
+// Holds the outcome of running an LDBCommand: a tri-state
+// (not started / succeeded / failed) plus an optional message.
+class LDBCommandExecuteResult {
+public:
+  enum State {
+    EXEC_NOT_STARTED = 0, EXEC_SUCCEED = 1, EXEC_FAILED = 2,
+  };
+
+  LDBCommandExecuteResult() {
+    state_ = EXEC_NOT_STARTED;
+    message_ = "";
+  }
+
+  LDBCommandExecuteResult(State state, std::string& msg) {
+    state_ = state;
+    message_ = msg;
+  }
+
+  // Human-readable form: "" or the message on success, otherwise the state
+  // prefix ("Failed: " / "Not started: ") followed by the message.
+  std::string ToString() {
+    std::string ret;
+    switch (state_) {
+    case EXEC_SUCCEED:
+      break;
+    case EXEC_FAILED:
+      ret.append("Failed: ");
+      break;
+    case EXEC_NOT_STARTED:
+      ret.append("Not started: ");
+    }
+    if (!message_.empty()) {
+      ret.append(message_);
+    }
+    return ret;
+  }
+
+  // Returns the result to its initial (not started) state so a command
+  // object can be run again.
+  void Reset() {
+    state_ = EXEC_NOT_STARTED;
+    message_ = "";
+  }
+
+  bool IsSucceed() {
+    return state_ == EXEC_SUCCEED;
+  }
+
+  bool IsNotStarted() {
+    return state_ == EXEC_NOT_STARTED;
+  }
+
+  bool IsFailed() {
+    return state_ == EXEC_FAILED;
+  }
+
+  // Factory helpers for the two terminal states.
+  static LDBCommandExecuteResult SUCCEED(std::string msg) {
+    return LDBCommandExecuteResult(EXEC_SUCCEED, msg);
+  }
+
+  static LDBCommandExecuteResult FAILED(std::string msg) {
+    return LDBCommandExecuteResult(EXEC_FAILED, msg);
+  }
+
+private:
+  State state_;
+  std::string message_;
+
+  // Declared private and left unimplemented to forbid equality comparison.
+  bool operator==(const LDBCommandExecuteResult&);
+  bool operator!=(const LDBCommandExecuteResult&);
+};
+
+}
diff --git a/util/ldb_tool.cc b/util/ldb_tool.cc
new file mode 100644 (file)
index 0000000..8439b63
--- /dev/null
@@ -0,0 +1,107 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef ROCKSDB_LITE
+#include "rocksdb/ldb_tool.h"
+#include "util/ldb_cmd.h"
+
+namespace rocksdb {
+
+// Drives the ldb tool: prints the aggregated help text or parses the
+// command line, runs the selected LDBCommand, and exits with its status.
+class LDBCommandRunner {
+public:
+
+  // Assembles and prints the full help text (common options plus each
+  // command's usage line) to stderr.
+  static void PrintHelp(const char* exec_name) {
+    string ret;
+
+    // NOTE(review): banner still says "LevelDB Tool" — presumably a holdover
+    // from the LevelDB fork; confirm intended branding.
+    ret.append("ldb - LevelDB Tool");
+    ret.append("\n\n");
+    ret.append("commands MUST specify --" + LDBCommand::ARG_DB +
+        "=<full_path_to_db_directory> when necessary\n");
+    ret.append("\n");
+    ret.append("The following optional parameters control if keys/values are "
+        "input/output as hex or as plain strings:\n");
+    ret.append("  --" + LDBCommand::ARG_KEY_HEX +
+        " : Keys are input/output as hex\n");
+    ret.append("  --" + LDBCommand::ARG_VALUE_HEX +
+        " : Values are input/output as hex\n");
+    ret.append("  --" + LDBCommand::ARG_HEX +
+        " : Both keys and values are input/output as hex\n");
+    ret.append("\n");
+
+    ret.append("The following optional parameters control the database "
+        "internals:\n");
+    ret.append("  --" + LDBCommand::ARG_TTL +
+        " with 'put','get','scan','dump','query','batchput'"
+        " : DB supports ttl and value is internally timestamp-suffixed\n");
+    ret.append("  --" + LDBCommand::ARG_BLOOM_BITS + "=<int,e.g.:14>\n");
+    ret.append("  --" + LDBCommand::ARG_COMPRESSION_TYPE +
+        "=<no|snappy|zlib|bzip2>\n");
+    ret.append("  --" + LDBCommand::ARG_BLOCK_SIZE +
+        "=<block_size_in_bytes>\n");
+    ret.append("  --" + LDBCommand::ARG_AUTO_COMPACTION + "=<true|false>\n");
+    ret.append("  --" + LDBCommand::ARG_WRITE_BUFFER_SIZE +
+        "=<int,e.g.:4194304>\n");
+    ret.append("  --" + LDBCommand::ARG_FILE_SIZE + "=<int,e.g.:2097152>\n");
+
+    ret.append("\n\n");
+    ret.append("Data Access Commands:\n");
+    PutCommand::Help(ret);
+    GetCommand::Help(ret);
+    BatchPutCommand::Help(ret);
+    ScanCommand::Help(ret);
+    DeleteCommand::Help(ret);
+    DBQuerierCommand::Help(ret);
+    ApproxSizeCommand::Help(ret);
+    CheckConsistencyCommand::Help(ret);
+
+    ret.append("\n\n");
+    ret.append("Admin Commands:\n");
+    WALDumperCommand::Help(ret);
+    CompactorCommand::Help(ret);
+    ReduceDBLevelsCommand::Help(ret);
+    ChangeCompactionStyleCommand::Help(ret);
+    DBDumperCommand::Help(ret);
+    DBLoaderCommand::Help(ret);
+    ManifestDumpCommand::Help(ret);
+    ListColumnFamiliesCommand::Help(ret);
+    InternalDumpCommand::Help(ret);
+
+    fprintf(stderr, "%s\n", ret.c_str());
+  }
+
+  // Parses argv, runs the matching command, prints its result, and exits
+  // the process (exit code 1 on failure, 0 otherwise).
+  static void RunCommand(int argc, char** argv, Options options) {
+    if (argc <= 2) {
+      PrintHelp(argv[0]);
+      exit(1);
+    }
+
+    LDBCommand* cmdObj = LDBCommand::InitFromCmdLineArgs(argc, argv, options);
+    if (cmdObj == nullptr) {
+      fprintf(stderr, "Unknown command\n");
+      PrintHelp(argv[0]);
+      exit(1);
+    }
+
+    if (!cmdObj->ValidateCmdLineOptions()) {
+      exit(1);
+    }
+
+    cmdObj->Run();
+    LDBCommandExecuteResult ret = cmdObj->GetExecuteState();
+    fprintf(stderr, "%s\n", ret.ToString().c_str());
+    delete cmdObj;
+
+    // IsFailed() converts to 1 on failure, 0 on success.
+    exit(ret.IsFailed());
+  }
+
+};
+
+
+// Public entry point of the ldb tool: delegates to LDBCommandRunner.
+void LDBTool::Run(int argc, char** argv, Options options) {
+  LDBCommandRunner::RunCommand(argc, argv, options);
+}
+} // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/util/log_buffer.cc b/util/log_buffer.cc
new file mode 100644 (file)
index 0000000..726c014
--- /dev/null
@@ -0,0 +1,73 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "util/log_buffer.h"
+
+#include <sys/time.h>
+
+namespace rocksdb {
+
+LogBuffer::LogBuffer(const InfoLogLevel log_level,
+                     Logger*info_log)
+    : log_level_(log_level), info_log_(info_log) {}
+
// Format one message into arena-backed storage and append it to the buffer.
// Nothing is written to the Logger here; that happens in FlushBufferToLog().
void LogBuffer::AddLogToBuffer(const char* format, va_list ap) {
  if (!info_log_ || log_level_ < info_log_->GetInfoLogLevel()) {
    // Skip the message because its level is below the logger's threshold.
    return;
  }

  // Each entry gets a fixed-size arena slot: a BufferedLog header
  // (timestamp) followed by the message text, which uses the remainder of
  // the slot via the trailing message[1] member.
  const size_t kLogSizeLimit = 512;
  char* alloc_mem = arena_.AllocateAligned(kLogSizeLimit);
  BufferedLog* buffered_log = new (alloc_mem) BufferedLog();
  char* p = buffered_log->message;
  char* limit = alloc_mem + kLogSizeLimit - 1;  // last byte reserved for '\0'

  // store the time
  gettimeofday(&(buffered_log->now_tv), nullptr);

  // Print the message
  if (p < limit) {
    va_list backup_ap;
    // Formatting may consume the va_list, so work on a copy; the caller's
    // "ap" stays usable.
    va_copy(backup_ap, ap);
    auto n = vsnprintf(p, limit - p, format, backup_ap);
    assert(n >= 0);
    p += n;  // n is the full untruncated length, so p may land past limit
    va_end(backup_ap);
  }

  if (p > limit) {
    p = limit;  // message was truncated; clamp before writing the terminator
  }

  // Add '\0' to the end
  *p = '\0';

  logs_.push_back(buffered_log);
}
+
// Write every buffered entry to info_log_, prefixed with the wall-clock
// time at which it was originally captured, then clear the entry list.
// (The arena backing the entries is not freed here; logs_.clear() only
// drops the pointers.)
void LogBuffer::FlushBufferToLog() {
  for (BufferedLog* log : logs_) {
    const time_t seconds = log->now_tv.tv_sec;
    struct tm t;
    localtime_r(&seconds, &t);  // thread-safe local-time conversion
    Log(log_level_, info_log_,
        "(Original Log Time %04d/%02d/%02d-%02d:%02d:%02d.%06d) %s",
        t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min,
        t.tm_sec, static_cast<int>(log->now_tv.tv_usec), log->message);
  }
  logs_.clear();
}
+
+void LogToBuffer(LogBuffer* log_buffer, const char* format, ...) {
+  if (log_buffer != nullptr) {
+    va_list ap;
+    va_start(ap, format);
+    log_buffer->AddLogToBuffer(format, ap);
+    va_end(ap);
+  }
+}
+
+}  // namespace rocksdb
diff --git a/util/log_buffer.h b/util/log_buffer.h
new file mode 100644 (file)
index 0000000..8ebe92e
--- /dev/null
@@ -0,0 +1,48 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include "rocksdb/env.h"
+#include "util/arena.h"
+#include "util/autovector.h"
+
+namespace rocksdb {
+
+class Logger;
+
+// A class to buffer info log entries and flush them in the end.
+class LogBuffer {
+ public:
+  // log_level: the log level for all the logs
+  // info_log:  logger to write the logs to
+  LogBuffer(const InfoLogLevel log_level, Logger* info_log);
+
+  // Add a log entry to the buffer.
+  void AddLogToBuffer(const char* format, va_list ap);
+
+  size_t IsEmpty() const { return logs_.empty(); }
+
+  // Flush all buffered log to the info log.
+  void FlushBufferToLog();
+
+ private:
+  // One log entry with its timestamp
+  struct BufferedLog {
+    struct timeval now_tv;  // Timestamp of the log
+    char message[1];        // Beginning of log message
+  };
+
+  const InfoLogLevel log_level_;
+  Logger* info_log_;
+  Arena arena_;
+  autovector<BufferedLog*> logs_;
+};
+
// Add log to the LogBuffer for a delayed info logging. It can be used when
// we want to add some logs inside a mutex; the actual write to the Logger
// happens later via LogBuffer::FlushBufferToLog().  Passing a null
// log_buffer is a no-op.
extern void LogToBuffer(LogBuffer* log_buffer, const char* format, ...);
+
+}  // namespace rocksdb
diff --git a/util/log_write_bench.cc b/util/log_write_bench.cc
new file mode 100644 (file)
index 0000000..536d14f
--- /dev/null
@@ -0,0 +1,69 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <gflags/gflags.h>
+
+#include "rocksdb/env.h"
+#include "util/histogram.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+// A simple benchmark to simulate transactional logs
+
+DEFINE_int32(num_records, 6000, "Number of records.");
+DEFINE_int32(record_size, 249, "Size of each record.");
+DEFINE_int32(record_interval, 10000, "Interval between records (microSec)");
+DEFINE_int32(bytes_per_sync, 0, "bytes_per_sync parameter in EnvOptions");
+DEFINE_bool(enable_sync, false, "sync after each write.");
+
+namespace rocksdb {
+void RunBenchmark() {
+  std::string file_name = test::TmpDir() + "/log_write_benchmark.log";
+  Env* env = Env::Default();
+  EnvOptions env_options;
+  env_options.use_mmap_writes = false;
+  env_options.bytes_per_sync = FLAGS_bytes_per_sync;
+  unique_ptr<WritableFile> file;
+  env->NewWritableFile(file_name, &file, env_options);
+
+  std::string record;
+  record.assign('X', FLAGS_record_size);
+
+  HistogramImpl hist;
+
+  uint64_t start_time = env->NowMicros();
+  for (int i = 0; i < FLAGS_num_records; i++) {
+    uint64_t start_nanos = env->NowNanos();
+    file->Append(record);
+    file->Flush();
+    if (FLAGS_enable_sync) {
+      file->Sync();
+    }
+    hist.Add(env->NowNanos() - start_nanos);
+
+    if (i % 1000 == 1) {
+      fprintf(stderr, "Wrote %d records...\n", i);
+    }
+
+    int time_to_sleep =
+        (i + 1) * FLAGS_record_interval - (env->NowMicros() - start_time);
+    if (time_to_sleep > 0) {
+      env->SleepForMicroseconds(time_to_sleep);
+    }
+  }
+
+  fprintf(stderr, "Distribution of latency of append+flush: \n%s",
+          hist.ToString().c_str());
+}
+}  // namespace rocksdb
+
int main(int argc, char** argv) {
  // gflags setup: usage banner plus parsing of the DEFINE_* flags declared
  // at the top of this file.
  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
                          " [OPTIONS]...");
  google::ParseCommandLineFlags(&argc, &argv, true);

  rocksdb::RunBenchmark();
  return 0;
}
diff --git a/util/logging.cc b/util/logging.cc
new file mode 100644 (file)
index 0000000..02e3560
--- /dev/null
@@ -0,0 +1,77 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/logging.h"
+
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
// Append the base-10 representation of "num" to *str.
void AppendNumberTo(std::string* str, uint64_t num) {
  str->append(std::to_string(num));
}
+
+void AppendEscapedStringTo(std::string* str, const Slice& value) {
+  for (size_t i = 0; i < value.size(); i++) {
+    char c = value[i];
+    if (c >= ' ' && c <= '~') {
+      str->push_back(c);
+    } else {
+      char buf[10];
+      snprintf(buf, sizeof(buf), "\\x%02x",
+               static_cast<unsigned int>(c) & 0xff);
+      str->append(buf);
+    }
+  }
+}
+
+std::string NumberToString(uint64_t num) {
+  std::string r;
+  AppendNumberTo(&r, num);
+  return r;
+}
+
+std::string EscapeString(const Slice& value) {
+  std::string r;
+  AppendEscapedStringTo(&r, value);
+  return r;
+}
+
// Parse an unsigned decimal number from the front of "*in" into "*val",
// consuming the digit characters.  Returns false if no digits were found
// or the value overflows uint64_t.
bool ConsumeDecimalNumber(Slice* in, uint64_t* val) {
  uint64_t v = 0;
  int digits = 0;  // number of digit characters consumed so far
  while (!in->empty()) {
    char c = (*in)[0];
    if (c >= '0' && c <= '9') {
      ++digits;
      const unsigned int delta = (c - '0');
      static const uint64_t kMaxUint64 = ~static_cast<uint64_t>(0);
      // Check BEFORE updating v: would v * 10 + delta exceed kMaxUint64?
      if (v > kMaxUint64/10 ||
          (v == kMaxUint64/10 && delta > kMaxUint64%10)) {
        // Overflow
        return false;
      }
      v = (v * 10) + delta;
      in->remove_prefix(1);
    } else {
      break;  // first non-digit stops the scan and is left in *in
    }
  }
  *val = v;
  // Success requires at least one consumed digit.
  return (digits > 0);
}
+
+}  // namespace rocksdb
diff --git a/util/logging.h b/util/logging.h
new file mode 100644 (file)
index 0000000..d8ce45e
--- /dev/null
@@ -0,0 +1,44 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Must not be included from any .h files to avoid polluting the namespace
+// with macros.
+
+#pragma once
+#include <stdio.h>
+#include <stdint.h>
+#include <string>
+#include "port/port.h"
+
namespace rocksdb {

class Slice;
class WritableFile;

// Append a human-readable printout of "num" to *str
extern void AppendNumberTo(std::string* str, uint64_t num);

// Append a human-readable printout of "value" to *str.
// Escapes any non-printable characters found in "value".
extern void AppendEscapedStringTo(std::string* str, const Slice& value);

// Return a human-readable printout of "num"
extern std::string NumberToString(uint64_t num);

// Return a human-readable version of "value".
// Escapes any non-printable characters found in "value".
extern std::string EscapeString(const Slice& value);

// Parse a human-readable number from "*in" into "*val".  On success,
// advances "*in" past the consumed number and sets "*val" to the
// numeric value.  Otherwise (no digits, or uint64_t overflow), returns
// false and leaves *in in an unspecified state.
extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val);

}  // namespace rocksdb
diff --git a/util/manual_compaction_test.cc b/util/manual_compaction_test.cc
new file mode 100644 (file)
index 0000000..dd615f0
--- /dev/null
@@ -0,0 +1,156 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Test for issue 178: a manual compaction causes deleted data to reappear.
+#include <iostream>
+#include <sstream>
+#include <cstdlib>
+
+#include "rocksdb/db.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/write_batch.h"
+#include "util/testharness.h"
+
+using namespace rocksdb;
+
+namespace {
+
+const int kNumKeys = 1100000;
+
// Format the i-th primary key, e.g. Key1(7) == "my_key_7".
std::string Key1(int i) {
  return "my_key_" + std::to_string(i);
}
+
+std::string Key2(int i) {
+  return Key1(i) + "_xxx";
+}
+
// Shared fixture: computes the temp database path and wipes any database
// left over from a previous run.
class ManualCompactionTest {
 public:
  ManualCompactionTest() {
    // Get rid of any state from an old run.
    dbname_ = rocksdb::test::TmpDir() + "/rocksdb_cbug_test";
    DestroyDB(dbname_, rocksdb::Options());
  }

  // Path of the test database; each test opens/destroys a DB here.
  std::string dbname_;
};
+
+class DestroyAllCompactionFilter : public CompactionFilter {
+ public:
+  DestroyAllCompactionFilter() {}
+
+  virtual bool Filter(int level,
+                      const Slice& key,
+                      const Slice& existing_value,
+                      std::string* new_value,
+                      bool* value_changed) const {
+    return existing_value.ToString() == "destroy";
+  }
+
+  virtual const char* Name() const {
+    return "DestroyAllCompactionFilter";
+  }
+};
+
// A manual CompactRange over part of the key space must run every key in
// that range through the compaction filter, for both level-style and
// universal-style compaction.
TEST(ManualCompactionTest, CompactTouchesAllKeys) {
  for (int iter = 0; iter < 2; ++iter) {
    DB* db;
    Options options;
    if (iter == 0) { // level compaction
      options.num_levels = 3;
      options.compaction_style = kCompactionStyleLevel;
    } else { // universal compaction
      options.compaction_style = kCompactionStyleUniversal;
    }
    options.create_if_missing = true;
    options.compression = rocksdb::kNoCompression;
    options.compaction_filter = new DestroyAllCompactionFilter();
    ASSERT_OK(DB::Open(options, dbname_, &db));

    db->Put(WriteOptions(), Slice("key1"), Slice("destroy"));
    db->Put(WriteOptions(), Slice("key2"), Slice("destroy"));
    db->Put(WriteOptions(), Slice("key3"), Slice("value3"));
    db->Put(WriteOptions(), Slice("key4"), Slice("destroy"));

    // Compact only up to "key4".
    Slice key4("key4");
    db->CompactRange(nullptr, &key4);
    // Only key3 should survive: the filter drops every "destroy" value.
    Iterator* itr = db->NewIterator(ReadOptions());
    itr->SeekToFirst();
    ASSERT_TRUE(itr->Valid());
    ASSERT_EQ("key3", itr->key().ToString());
    itr->Next();
    ASSERT_TRUE(!itr->Valid());
    delete itr;

    delete options.compaction_filter;
    delete db;
    DestroyDB(dbname_, options);
  }
}
+
// Regression test for LevelDB issue 178: after deleting a key range and
// manually compacting over an overlapping range, the deleted data must not
// reappear and the live range must remain intact.
TEST(ManualCompactionTest, Test) {

  // Open database.  Disable compression since it affects the creation
  // of layers and the code below is trying to test against a very
  // specific scenario.
  rocksdb::DB* db;
  rocksdb::Options db_options;
  db_options.create_if_missing = true;
  db_options.compression = rocksdb::kNoCompression;
  ASSERT_OK(rocksdb::DB::Open(db_options, dbname_, &db));

  // create first key range
  rocksdb::WriteBatch batch;
  for (int i = 0; i < kNumKeys; i++) {
    batch.Put(Key1(i), "value for range 1 key");
  }
  ASSERT_OK(db->Write(rocksdb::WriteOptions(), &batch));

  // create second key range
  batch.Clear();
  for (int i = 0; i < kNumKeys; i++) {
    batch.Put(Key2(i), "value for range 2 key");
  }
  ASSERT_OK(db->Write(rocksdb::WriteOptions(), &batch));

  // delete second key range
  batch.Clear();
  for (int i = 0; i < kNumKeys; i++) {
    batch.Delete(Key2(i));
  }
  ASSERT_OK(db->Write(rocksdb::WriteOptions(), &batch));

  // compact database over the first (still live) key range
  std::string start_key = Key1(0);
  std::string end_key = Key1(kNumKeys - 1);
  rocksdb::Slice least(start_key.data(), start_key.size());
  rocksdb::Slice greatest(end_key.data(), end_key.size());

  // commenting out the line below causes the example to work correctly
  db->CompactRange(&least, &greatest);

  // count the keys
  rocksdb::Iterator* iter = db->NewIterator(rocksdb::ReadOptions());
  int num_keys = 0;
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    num_keys++;
  }
  delete iter;
  // Every range-1 key must still be present exactly once.
  ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys";

  // close database
  delete db;
  DestroyDB(dbname_, rocksdb::Options());
}
+
+}  // anonymous namespace
+
int main(int argc, char** argv) {
  // The test harness discovers and runs the TEST(...) cases defined above.
  return rocksdb::test::RunAllTests();
}
diff --git a/util/murmurhash.cc b/util/murmurhash.cc
new file mode 100644 (file)
index 0000000..d9d8b70
--- /dev/null
@@ -0,0 +1,183 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+/*
+  Murmurhash from http://sites.google.com/site/murmurhash/
+
+  All code is released to the public domain. For business purposes, Murmurhash is
+  under the MIT license.
+*/
+#include "murmurhash.h"
+
+#if defined(__x86_64__)
+
+// -------------------------------------------------------------------
+//
+// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
+// and endian-ness issues if used across multiple platforms.
+//
+// 64-bit hash for 64-bit platforms
+
// 64-bit MurmurHash2 (public domain, by Austin Appleby).  Produces the same
// values as the canonical implementation; the usual caveats about alignment
// and endianness apply if hashes cross platforms.
uint64_t MurmurHash64A(const void* key, int len, unsigned int seed) {
  const uint64_t kMul = 0xc6a4a7935bd1e995;
  const int kShift = 47;

  uint64_t hash = seed ^ (len * kMul);

  // Mix the input eight bytes at a time.
  const uint64_t* block = (const uint64_t*)key;
  const uint64_t* const block_end = block + (len / 8);
  while (block != block_end) {
    uint64_t chunk = *block++;

    chunk *= kMul;
    chunk ^= chunk >> kShift;
    chunk *= kMul;

    hash ^= chunk;
    hash *= kMul;
  }

  // Fold in the 0-7 trailing bytes, each shifted to its byte position,
  // followed by one multiply iff any trailing bytes existed.
  const unsigned char* tail = (const unsigned char*)block;
  const int remaining = len & 7;
  for (int i = remaining - 1; i >= 0; --i) {
    hash ^= ((uint64_t)tail[i]) << (8 * i);
  }
  if (remaining > 0) {
    hash *= kMul;
  }

  // Final avalanche.
  hash ^= hash >> kShift;
  hash *= kMul;
  hash ^= hash >> kShift;

  return hash;
}
+
+#elif defined(__i386__)
+
+// -------------------------------------------------------------------
+//
+// Note - This code makes a few assumptions about how your machine behaves -
+//
+// 1. We can read a 4-byte value from any address without crashing
+// 2. sizeof(int) == 4
+//
+// And it has a few limitations -
+//
+// 1. It will not work incrementally.
+// 2. It will not produce the same results on little-endian and big-endian
+//    machines.
+
// 32-bit MurmurHash2 (public domain, by Austin Appleby).
unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
{
    // 'm' and 'r' are mixing constants generated offline.
    // They're not really 'magic', they just happen to work well.

    const unsigned int m = 0x5bd1e995;
    const int r = 24;

    // Initialize the hash to a 'random' value

    unsigned int h = seed ^ len;

    // Mix 4 bytes at a time into the hash

    const unsigned char * data = (const unsigned char *)key;

    while(len >= 4)
    {
        // Relies on unaligned 4-byte loads being legal (i386-only branch).
        unsigned int k = *(unsigned int *)data;

        k *= m;
        k ^= k >> r;
        k *= m;

        h *= m;
        h ^= k;

        data += 4;
        len -= 4;
    }

    // Handle the last few bytes of the input array.  The case fall-throughs
    // below are intentional: each trailing byte is XORed in at its byte
    // position, then the hash is multiplied once.

    switch(len)
    {
    case 3: h ^= data[2] << 16;  // fall through
    case 2: h ^= data[1] << 8;   // fall through
    case 1: h ^= data[0];
        h *= m;
    };

    // Do a few final mixes of the hash to ensure the last few
    // bytes are well-incorporated.

    h ^= h >> 13;
    h *= m;
    h ^= h >> 15;

    return h;
}
+
+#else
+
+// -------------------------------------------------------------------
+//
+// Same as MurmurHash2, but endian- and alignment-neutral.
+// Half the speed though, alas.
+
// Endian- and alignment-neutral MurmurHash2 (public domain, by Austin
// Appleby).  Matches MurmurHash2's output at about half the speed, but
// works on any platform.
unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed )
{
  const unsigned int kMul = 0x5bd1e995;
  const int kShift = 24;

  unsigned int hash = seed ^ len;
  const unsigned char* bytes = (const unsigned char*)key;

  // Consume four bytes per iteration, assembled little-endian by hand so
  // the result does not depend on host byte order or alignment.
  for (; len >= 4; bytes += 4, len -= 4) {
    unsigned int chunk = bytes[0] | (bytes[1] << 8) | (bytes[2] << 16) |
                         (bytes[3] << 24);

    chunk *= kMul;
    chunk ^= chunk >> kShift;
    chunk *= kMul;

    hash *= kMul;
    hash ^= chunk;
  }

  // Fold in the trailing 0-3 bytes, then multiply once iff any existed.
  if (len > 0) {
    for (int i = len - 1; i >= 0; --i) {
      hash ^= bytes[i] << (8 * i);
    }
    hash *= kMul;
  }

  // Final avalanche.
  hash ^= hash >> 13;
  hash *= kMul;
  hash ^= hash >> 15;

  return hash;
}
+#endif
diff --git a/util/murmurhash.h b/util/murmurhash.h
new file mode 100644 (file)
index 0000000..faa8655
--- /dev/null
@@ -0,0 +1,42 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+/*
+  Murmurhash from http://sites.google.com/site/murmurhash/
+
+  All code is released to the public domain. For business purposes, Murmurhash is
+  under the MIT license.
+*/
+#pragma once
+#include <stdint.h>
+#include "rocksdb/slice.h"
+
+#if defined(__x86_64__)
+#define MURMUR_HASH MurmurHash64A
+uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed );
+#define MurmurHash MurmurHash64A
+typedef uint64_t murmur_t;
+
+#elif defined(__i386__)
+#define MURMUR_HASH MurmurHash2
+unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed );
+#define MurmurHash MurmurHash2
+typedef unsigned int murmur_t;
+
+#else
+#define MURMUR_HASH MurmurHashNeutral2
+unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed );
+#define MurmurHash MurmurHashNeutral2
+typedef unsigned int murmur_t;
+#endif
+
// Allow Slice to be used as a key in std:: hash containers by hashing its
// bytes with the platform-selected MurmurHash variant.
namespace rocksdb {
struct murmur_hash {
  size_t operator()(const Slice& slice) const {
    // NOTE(review): slice.size() (size_t) is narrowed to the int "len"
    // parameter, and the seed is fixed at 0 — confirm slices can never
    // exceed INT_MAX here.
    return MurmurHash(slice.data(), slice.size(), 0);
  }
};
}  // rocksdb
diff --git a/util/mutexlock.h b/util/mutexlock.h
new file mode 100644 (file)
index 0000000..0f4e5c8
--- /dev/null
@@ -0,0 +1,78 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "port/port.h"
+
+namespace rocksdb {
+
+// Helper class that locks a mutex on construction and unlocks the mutex when
+// the destructor of the MutexLock object is invoked.
+//
+// Typical usage:
+//
+//   void MyClass::MyMethod() {
+//     MutexLock l(&mu_);       // mu_ is an instance variable
+//     ... some complex code, possibly with multiple return paths ...
+//   }
+
+class MutexLock {
+ public:
+  explicit MutexLock(port::Mutex *mu) : mu_(mu) {
+    this->mu_->Lock();
+  }
+  ~MutexLock() { this->mu_->Unlock(); }
+
+ private:
+  port::Mutex *const mu_;
+  // No copying allowed
+  MutexLock(const MutexLock&);
+  void operator=(const MutexLock&);
+};
+
+//
+// Acquire a ReadLock on the specified RWMutex.
+// The Lock will be automatically released then the
+// object goes out of scope.
+//
+class ReadLock {
+ public:
+  explicit ReadLock(port::RWMutex *mu) : mu_(mu) {
+    this->mu_->ReadLock();
+  }
+  ~ReadLock() { this->mu_->Unlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+  // No copying allowed
+  ReadLock(const ReadLock&);
+  void operator=(const ReadLock&);
+};
+
+
+//
+// Acquire a WriteLock on the specified RWMutex.
+// The Lock will be automatically released then the
+// object goes out of scope.
+//
+class WriteLock {
+ public:
+  explicit WriteLock(port::RWMutex *mu) : mu_(mu) {
+    this->mu_->WriteLock();
+  }
+  ~WriteLock() { this->mu_->Unlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+  // No copying allowed
+  WriteLock(const WriteLock&);
+  void operator=(const WriteLock&);
+};
+
+}  // namespace rocksdb
diff --git a/util/options.cc b/util/options.cc
new file mode 100644 (file)
index 0000000..e33d44e
--- /dev/null
@@ -0,0 +1,479 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/options.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <limits>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based_table_factory.h"
+
+namespace rocksdb {
+
// Default per-column-family options.  Initializer order must match the
// member declaration order in the header.
ColumnFamilyOptions::ColumnFamilyOptions()
    : comparator(BytewiseComparator()),
      merge_operator(nullptr),
      compaction_filter(nullptr),
      compaction_filter_factory(std::shared_ptr<CompactionFilterFactory>(
          new DefaultCompactionFilterFactory())),
      compaction_filter_factory_v2(
          new DefaultCompactionFilterFactoryV2()),
      write_buffer_size(4 << 20),  // 4MB memtable
      max_write_buffer_number(2),
      min_write_buffer_number_to_merge(1),
      block_cache(nullptr),
      block_cache_compressed(nullptr),
      block_size(4096),
      block_restart_interval(16),
      compression(kSnappyCompression),
      filter_policy(nullptr),
      prefix_extractor(nullptr),
      whole_key_filtering(true),
      num_levels(7),
      level0_file_num_compaction_trigger(4),
      level0_slowdown_writes_trigger(20),
      level0_stop_writes_trigger(24),
      max_mem_compaction_level(2),
      target_file_size_base(2 * 1048576),      // 2MB
      target_file_size_multiplier(1),
      max_bytes_for_level_base(10 * 1048576),  // 10MB
      max_bytes_for_level_multiplier(10),
      // Uses num_levels, which is initialized above (declared earlier).
      max_bytes_for_level_multiplier_additional(num_levels, 1),
      expanded_compaction_factor(25),
      source_compaction_factor(1),
      max_grandparent_overlap_factor(10),
      disable_seek_compaction(true),
      soft_rate_limit(0.0),
      hard_rate_limit(0.0),
      rate_limit_delay_max_milliseconds(1000),
      no_block_cache(false),
      arena_block_size(0),
      disable_auto_compactions(false),
      purge_redundant_kvs_while_flush(true),
      block_size_deviation(10),
      compaction_style(kCompactionStyleLevel),
      verify_checksums_in_compaction(true),
      filter_deletes(false),
      max_sequential_skip_in_iterations(8),
      memtable_factory(std::shared_ptr<SkipListFactory>(new SkipListFactory)),
      table_factory(
          std::shared_ptr<TableFactory>(new BlockBasedTableFactory())),
      inplace_update_support(false),
      inplace_update_num_locks(10000),
      inplace_callback(nullptr),
      memtable_prefix_bloom_bits(0),
      memtable_prefix_bloom_probes(6),
      bloom_locality(0),
      max_successive_merges(0),
      min_partial_merge_operands(2) {
  assert(memtable_factory.get() != nullptr);
}
+
// Build ColumnFamilyOptions from a legacy monolithic Options object by
// copying the per-column-family fields.
ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
    : comparator(options.comparator),
      merge_operator(options.merge_operator),
      compaction_filter(options.compaction_filter),
      compaction_filter_factory(options.compaction_filter_factory),
      compaction_filter_factory_v2(options.compaction_filter_factory_v2),
      write_buffer_size(options.write_buffer_size),
      max_write_buffer_number(options.max_write_buffer_number),
      min_write_buffer_number_to_merge(
          options.min_write_buffer_number_to_merge),
      block_cache(options.block_cache),
      block_cache_compressed(options.block_cache_compressed),
      block_size(options.block_size),
      block_restart_interval(options.block_restart_interval),
      compression(options.compression),
      compression_per_level(options.compression_per_level),
      compression_opts(options.compression_opts),
      filter_policy(options.filter_policy),
      prefix_extractor(options.prefix_extractor),
      whole_key_filtering(options.whole_key_filtering),
      num_levels(options.num_levels),
      level0_file_num_compaction_trigger(
          options.level0_file_num_compaction_trigger),
      level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger),
      level0_stop_writes_trigger(options.level0_stop_writes_trigger),
      max_mem_compaction_level(options.max_mem_compaction_level),
      target_file_size_base(options.target_file_size_base),
      target_file_size_multiplier(options.target_file_size_multiplier),
      max_bytes_for_level_base(options.max_bytes_for_level_base),
      max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
      max_bytes_for_level_multiplier_additional(
          options.max_bytes_for_level_multiplier_additional),
      expanded_compaction_factor(options.expanded_compaction_factor),
      source_compaction_factor(options.source_compaction_factor),
      max_grandparent_overlap_factor(options.max_grandparent_overlap_factor),
      disable_seek_compaction(options.disable_seek_compaction),
      soft_rate_limit(options.soft_rate_limit),
      hard_rate_limit(options.hard_rate_limit),
      rate_limit_delay_max_milliseconds(
          options.rate_limit_delay_max_milliseconds),
      no_block_cache(options.no_block_cache),
      arena_block_size(options.arena_block_size),
      disable_auto_compactions(options.disable_auto_compactions),
      purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush),
      block_size_deviation(options.block_size_deviation),
      compaction_style(options.compaction_style),
      verify_checksums_in_compaction(options.verify_checksums_in_compaction),
      compaction_options_universal(options.compaction_options_universal),
      filter_deletes(options.filter_deletes),
      max_sequential_skip_in_iterations(
          options.max_sequential_skip_in_iterations),
      memtable_factory(options.memtable_factory),
      table_factory(options.table_factory),
      table_properties_collectors(options.table_properties_collectors),
      inplace_update_support(options.inplace_update_support),
      inplace_update_num_locks(options.inplace_update_num_locks),
      inplace_callback(options.inplace_callback),
      memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits),
      memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes),
      bloom_locality(options.bloom_locality),
      max_successive_merges(options.max_successive_merges),
      min_partial_merge_operands(options.min_partial_merge_operands) {
  assert(memtable_factory.get() != nullptr);
}
+
// Default DB-wide options.  Initializer order must match the member
// declaration order in the header.
DBOptions::DBOptions()
    : create_if_missing(false),
      error_if_exists(false),
      paranoid_checks(true),
      env(Env::Default()),
      info_log(nullptr),
      info_log_level(INFO_LEVEL),
      max_open_files(5000),
      max_total_wal_size(0),
      statistics(nullptr),
      disableDataSync(false),
      use_fsync(false),
      db_stats_log_interval(1800),
      db_log_dir(""),
      wal_dir(""),
      delete_obsolete_files_period_micros(6 * 60 * 60 * 1000000UL),  // 6h
      max_background_compactions(1),
      max_background_flushes(1),
      max_log_file_size(0),
      log_file_time_to_roll(0),
      keep_log_file_num(1000),
      max_manifest_file_size(std::numeric_limits<uint64_t>::max()),
      table_cache_numshardbits(4),
      table_cache_remove_scan_count_limit(16),
      WAL_ttl_seconds(0),
      WAL_size_limit_MB(0),
      manifest_preallocation_size(4 * 1024 * 1024),  // 4MB
      allow_os_buffer(true),
      allow_mmap_reads(false),
      allow_mmap_writes(false),
      is_fd_close_on_exec(true),
      skip_log_error_on_recovery(false),
      stats_dump_period_sec(3600),  // 1 hour
      advise_random_on_open(true),
      access_hint_on_compaction_start(NORMAL),
      use_adaptive_mutex(false),
      bytes_per_sync(0),
      allow_thread_local(true) {}
+
// Build DBOptions from a legacy monolithic Options object by copying the
// DB-wide (non-column-family) fields.
DBOptions::DBOptions(const Options& options)
    : create_if_missing(options.create_if_missing),
      error_if_exists(options.error_if_exists),
      paranoid_checks(options.paranoid_checks),
      env(options.env),
      info_log(options.info_log),
      info_log_level(options.info_log_level),
      max_open_files(options.max_open_files),
      max_total_wal_size(options.max_total_wal_size),
      statistics(options.statistics),
      disableDataSync(options.disableDataSync),
      use_fsync(options.use_fsync),
      db_stats_log_interval(options.db_stats_log_interval),
      db_log_dir(options.db_log_dir),
      wal_dir(options.wal_dir),
      delete_obsolete_files_period_micros(
          options.delete_obsolete_files_period_micros),
      max_background_compactions(options.max_background_compactions),
      max_background_flushes(options.max_background_flushes),
      max_log_file_size(options.max_log_file_size),
      log_file_time_to_roll(options.log_file_time_to_roll),
      keep_log_file_num(options.keep_log_file_num),
      max_manifest_file_size(options.max_manifest_file_size),
      table_cache_numshardbits(options.table_cache_numshardbits),
      table_cache_remove_scan_count_limit(
          options.table_cache_remove_scan_count_limit),
      WAL_ttl_seconds(options.WAL_ttl_seconds),
      WAL_size_limit_MB(options.WAL_size_limit_MB),
      manifest_preallocation_size(options.manifest_preallocation_size),
      allow_os_buffer(options.allow_os_buffer),
      allow_mmap_reads(options.allow_mmap_reads),
      allow_mmap_writes(options.allow_mmap_writes),
      is_fd_close_on_exec(options.is_fd_close_on_exec),
      skip_log_error_on_recovery(options.skip_log_error_on_recovery),
      stats_dump_period_sec(options.stats_dump_period_sec),
      advise_random_on_open(options.advise_random_on_open),
      access_hint_on_compaction_start(options.access_hint_on_compaction_start),
      use_adaptive_mutex(options.use_adaptive_mutex),
      bytes_per_sync(options.bytes_per_sync),
      allow_thread_local(options.allow_thread_local) {}
+
+// Human-readable names for access_hint_on_compaction_start, indexed by
+// the hint's numeric value; used only by DBOptions::Dump below.
+static const char* const access_hints[] = {
+  "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED"
+};
+
+void DBOptions::Dump(Logger* log) const {
+    Log(log,"         Options.error_if_exists: %d", error_if_exists);
+    Log(log,"       Options.create_if_missing: %d", create_if_missing);
+    Log(log,"         Options.paranoid_checks: %d", paranoid_checks);
+    Log(log,"                     Options.env: %p", env);
+    Log(log,"                Options.info_log: %p", info_log.get());
+    Log(log,"          Options.max_open_files: %d", max_open_files);
+    Log(log,"      Options.max_total_wal_size: %" PRIu64, max_total_wal_size);
+    Log(log, "       Options.disableDataSync: %d", disableDataSync);
+    Log(log, "             Options.use_fsync: %d", use_fsync);
+    Log(log, "     Options.max_log_file_size: %zu", max_log_file_size);
+    Log(log, "Options.max_manifest_file_size: %lu",
+        (unsigned long)max_manifest_file_size);
+    Log(log, "     Options.log_file_time_to_roll: %zu", log_file_time_to_roll);
+    Log(log, "     Options.keep_log_file_num: %zu", keep_log_file_num);
+    Log(log, " Options.db_stats_log_interval: %d", db_stats_log_interval);
+    Log(log, "       Options.allow_os_buffer: %d", allow_os_buffer);
+    Log(log, "      Options.allow_mmap_reads: %d", allow_mmap_reads);
+    Log(log, "     Options.allow_mmap_writes: %d", allow_mmap_writes);
+    Log(log, "                             Options.db_log_dir: %s",
+        db_log_dir.c_str());
+    Log(log, "                             Options.wal_dir: %s",
+        wal_dir.c_str());
+    Log(log, "               Options.table_cache_numshardbits: %d",
+        table_cache_numshardbits);
+    Log(log, "    Options.table_cache_remove_scan_count_limit: %d",
+        table_cache_remove_scan_count_limit);
+    Log(log, "    Options.delete_obsolete_files_period_micros: %lu",
+        (unsigned long)delete_obsolete_files_period_micros);
+    Log(log, "             Options.max_background_compactions: %d",
+        max_background_compactions);
+    Log(log, "                 Options.max_background_flushes: %d",
+        max_background_flushes);
+    Log(log, "                        Options.WAL_ttl_seconds: %lu",
+        (unsigned long)WAL_ttl_seconds);
+    Log(log, "                      Options.WAL_size_limit_MB: %lu",
+        (unsigned long)WAL_size_limit_MB);
+    Log(log, "            Options.manifest_preallocation_size: %zu",
+        manifest_preallocation_size);
+    Log(log, "                         Options.allow_os_buffer: %d",
+        allow_os_buffer);
+    Log(log, "                        Options.allow_mmap_reads: %d",
+        allow_mmap_reads);
+    Log(log, "                       Options.allow_mmap_writes: %d",
+        allow_mmap_writes);
+    Log(log, "                     Options.is_fd_close_on_exec: %d",
+        is_fd_close_on_exec);
+    Log(log, "              Options.skip_log_error_on_recovery: %d",
+        skip_log_error_on_recovery);
+    Log(log, "                   Options.stats_dump_period_sec: %u",
+        stats_dump_period_sec);
+    Log(log, "                   Options.advise_random_on_open: %d",
+        advise_random_on_open);
+    Log(log, "         Options.access_hint_on_compaction_start: %s",
+        access_hints[access_hint_on_compaction_start]);
+    Log(log, "                      Options.use_adaptive_mutex: %d",
+        use_adaptive_mutex);
+    Log(log, "                          Options.bytes_per_sync: %lu",
+        (unsigned long)bytes_per_sync);
+}  // DBOptions::Dump
+
+// Write every column-family-scoped option to the info log, one line per
+// option, mirroring DBOptions::Dump for the DB-wide settings.  Pluggable
+// components (comparator, merge operator, factories) are identified by
+// their Name().
+void ColumnFamilyOptions::Dump(Logger* log) const {
+  Log(log, "              Options.comparator: %s", comparator->Name());
+  Log(log, "          Options.merge_operator: %s",
+      merge_operator ? merge_operator->Name() : "None");
+  Log(log, "       Options.compaction_filter_factory: %s",
+      compaction_filter_factory->Name());
+  Log(log, "       Options.compaction_filter_factory_v2: %s",
+      compaction_filter_factory_v2->Name());
+  Log(log, "        Options.memtable_factory: %s", memtable_factory->Name());
+  Log(log, "           Options.table_factory: %s", table_factory->Name());
+  Log(log, "       Options.write_buffer_size: %zd", write_buffer_size);
+  Log(log, " Options.max_write_buffer_number: %d", max_write_buffer_number);
+    Log(log,"             Options.block_cache: %p", block_cache.get());
+    Log(log,"  Options.block_cache_compressed: %p",
+        block_cache_compressed.get());
+    // Cache capacities are only meaningful (and safe to query) when the
+    // corresponding cache was actually configured.
+    if (block_cache) {
+      Log(log,"        Options.block_cache_size: %zd",
+          block_cache->GetCapacity());
+    }
+    if (block_cache_compressed) {
+      Log(log,"Options.block_cache_compressed_size: %zd",
+          block_cache_compressed->GetCapacity());
+    }
+    Log(log,"              Options.block_size: %zd", block_size);
+    Log(log,"  Options.block_restart_interval: %d", block_restart_interval);
+    // Either one compression type per level, or a single global setting.
+    if (!compression_per_level.empty()) {
+      for (unsigned int i = 0; i < compression_per_level.size(); i++) {
+          Log(log,"       Options.compression[%d]: %d",
+              i, compression_per_level[i]);
+       }
+    } else {
+      Log(log,"         Options.compression: %d", compression);
+    }
+    Log(log,"         Options.filter_policy: %s",
+        filter_policy == nullptr ? "nullptr" : filter_policy->Name());
+    Log(log,"      Options.prefix_extractor: %s",
+        prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name());
+    Log(log,"   Options.whole_key_filtering: %d", whole_key_filtering);
+    Log(log,"            Options.num_levels: %d", num_levels);
+    Log(log,"       Options.min_write_buffer_number_to_merge: %d",
+        min_write_buffer_number_to_merge);
+    Log(log,"        Options.purge_redundant_kvs_while_flush: %d",
+         purge_redundant_kvs_while_flush);
+    Log(log,"           Options.compression_opts.window_bits: %d",
+        compression_opts.window_bits);
+    Log(log,"                 Options.compression_opts.level: %d",
+        compression_opts.level);
+    Log(log,"              Options.compression_opts.strategy: %d",
+        compression_opts.strategy);
+    Log(log,"     Options.level0_file_num_compaction_trigger: %d",
+        level0_file_num_compaction_trigger);
+    Log(log,"         Options.level0_slowdown_writes_trigger: %d",
+        level0_slowdown_writes_trigger);
+    Log(log,"             Options.level0_stop_writes_trigger: %d",
+        level0_stop_writes_trigger);
+    Log(log,"               Options.max_mem_compaction_level: %d",
+        max_mem_compaction_level);
+    Log(log,"                  Options.target_file_size_base: %d",
+        target_file_size_base);
+    Log(log,"            Options.target_file_size_multiplier: %d",
+        target_file_size_multiplier);
+    Log(log,"               Options.max_bytes_for_level_base: %lu",
+        (unsigned long)max_bytes_for_level_base);
+    Log(log,"         Options.max_bytes_for_level_multiplier: %d",
+        max_bytes_for_level_multiplier);
+    // One additional multiplier per level.
+    for (int i = 0; i < num_levels; i++) {
+      Log(log,"Options.max_bytes_for_level_multiplier_addtl[%d]: %d",
+          i, max_bytes_for_level_multiplier_additional[i]);
+    }
+    Log(log,"      Options.max_sequential_skip_in_iterations: %lu",
+        (unsigned long)max_sequential_skip_in_iterations);
+    Log(log,"             Options.expanded_compaction_factor: %d",
+        expanded_compaction_factor);
+    Log(log,"               Options.source_compaction_factor: %d",
+        source_compaction_factor);
+    Log(log,"         Options.max_grandparent_overlap_factor: %d",
+        max_grandparent_overlap_factor);
+    Log(log,"                Options.disable_seek_compaction: %d",
+        disable_seek_compaction);
+    Log(log,"                         Options.no_block_cache: %d",
+        no_block_cache);
+    Log(log,"                       Options.arena_block_size: %zu",
+        arena_block_size);
+    Log(log,"                      Options.soft_rate_limit: %.2f",
+        soft_rate_limit);
+    Log(log,"                      Options.hard_rate_limit: %.2f",
+        hard_rate_limit);
+    Log(log,"      Options.rate_limit_delay_max_milliseconds: %u",
+        rate_limit_delay_max_milliseconds);
+    Log(log,"               Options.disable_auto_compactions: %d",
+        disable_auto_compactions);
+    Log(log,"         Options.purge_redundant_kvs_while_flush: %d",
+        purge_redundant_kvs_while_flush);
+    Log(log,"                    Options.block_size_deviation: %d",
+        block_size_deviation);
+    Log(log,"                          Options.filter_deletes: %d",
+        filter_deletes);
+    Log(log, "          Options.verify_checksums_in_compaction: %d",
+        verify_checksums_in_compaction);
+    Log(log,"                        Options.compaction_style: %d",
+        compaction_style);
+    Log(log," Options.compaction_options_universal.size_ratio: %u",
+        compaction_options_universal.size_ratio);
+    Log(log,"Options.compaction_options_universal.min_merge_width: %u",
+        compaction_options_universal.min_merge_width);
+    Log(log,"Options.compaction_options_universal.max_merge_width: %u",
+        compaction_options_universal.max_merge_width);
+    Log(log,"Options.compaction_options_universal."
+            "max_size_amplification_percent: %u",
+        compaction_options_universal.max_size_amplification_percent);
+    Log(log,
+        "Options.compaction_options_universal.compression_size_percent: %u",
+        compaction_options_universal.compression_size_percent);
+    // Concatenate the names of all property collectors into one line.
+    std::string collector_names;
+    for (auto collector : table_properties_collectors) {
+      collector_names.append(collector->Name());
+      collector_names.append("; ");
+    }
+    Log(log, "                  Options.table_properties_collectors: %s",
+        collector_names.c_str());
+    Log(log, "                  Options.inplace_update_support: %d",
+        inplace_update_support);
+    Log(log, "                Options.inplace_update_num_locks: %zd",
+        inplace_update_num_locks);
+    Log(log, "              Options.min_partial_merge_operands: %u",
+        min_partial_merge_operands);
+    // TODO: easier config for bloom (maybe based on avg key/value size)
+    Log(log, "              Options.memtable_prefix_bloom_bits: %d",
+        memtable_prefix_bloom_bits);
+    Log(log, "            Options.memtable_prefix_bloom_probes: %d",
+        memtable_prefix_bloom_probes);
+    Log(log, "                          Options.bloom_locality: %d",
+        bloom_locality);
+    Log(log, "                   Options.max_successive_merges: %zd",
+        max_successive_merges);
+}  // ColumnFamilyOptions::Dump
+
+// Dump the complete configuration.  Options combines DBOptions and
+// ColumnFamilyOptions, so dumping both parts covers every field.
+void Options::Dump(Logger* log) const {
+  DBOptions::Dump(log);
+  ColumnFamilyOptions::Dump(log);
+}   // Options::Dump
+
+//
+// The goal of this method is to create a configuration that
+// allows an application to write all files into L0 and
+// then do a single compaction to output all files into L1.
+// Mutates this Options object in place and returns 'this' so the call
+// can be chained, e.g. DB::Open(*options.PrepareForBulkLoad(), ...).
+Options*
+Options::PrepareForBulkLoad()
+{
+  // never slowdown ingest.
+  level0_file_num_compaction_trigger = (1<<30);
+  level0_slowdown_writes_trigger = (1<<30);
+  level0_stop_writes_trigger = (1<<30);
+
+  // no auto compactions please. The application should issue a
+  // manual compaction after all data is loaded into L0.
+  disable_auto_compactions = true;
+  disable_seek_compaction = true;
+  disableDataSync = true;
+
+  // A manual compaction run should pick all files in L0 in
+  // a single compaction run.
+  source_compaction_factor = (1<<30);
+
+  // It is better to have only 2 levels, otherwise a manual
+  // compaction would compact at every possible level, thereby
+  // increasing the total time needed for compactions.
+  num_levels = 2;
+
+  // Prevent a memtable flush to automatically promote files
+  // to L1. This is helpful so that all files that are
+  // input to the manual compaction are all at L0.
+  // NOTE(review): it is not obvious how a value of 2 achieves the
+  // behavior described above; confirm against the flush scheduler.
+  max_background_compactions = 2;
+
+  // The compaction would create large files in L1.
+  target_file_size_base = 256 * 1024 * 1024;
+  return this;
+}
+
+}  // namespace rocksdb
diff --git a/util/perf_context.cc b/util/perf_context.cc
new file mode 100644 (file)
index 0000000..264b10d
--- /dev/null
@@ -0,0 +1,86 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#include <sstream>
+#include "util/perf_context_imp.h"
+
+namespace rocksdb {
+
+// Perf-context state.  Counters live in thread-local storage so that
+// concurrent threads never contend on them.  When the perf context is
+// compiled out (NPERF_CONTEXT) or TLS is unavailable (IOS_CROSS_COMPILE),
+// a single shared instance is kept so that references still link.
+#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
+PerfLevel perf_level = kEnableCount;
+// This is a dummy variable since some place references it
+PerfContext perf_context;
+#else
+__thread PerfLevel perf_level = kEnableCount;
+__thread PerfContext perf_context;
+#endif
+
+// Set the measurement level (e.g. counters only vs. counters + timers)
+// for the calling thread (process-wide in the fallback build above).
+void SetPerfLevel(PerfLevel level) {
+  perf_level = level;
+}
+
+// Zero every counter and accumulated timer.  Compiles to a no-op when
+// the perf context is disabled at build time.
+void PerfContext::Reset() {
+#if !defined(NPERF_CONTEXT) && !defined(IOS_CROSS_COMPILE)
+  user_key_comparison_count = 0;
+  block_cache_hit_count = 0;
+  block_read_count = 0;
+  block_read_byte = 0;
+  block_read_time = 0;
+  block_checksum_time = 0;
+  block_decompress_time = 0;
+  internal_key_skipped_count = 0;
+  internal_delete_skipped_count = 0;
+  write_wal_time = 0;
+
+  get_snapshot_time = 0;
+  get_from_memtable_time = 0;
+  get_from_memtable_count = 0;
+  get_post_process_time = 0;
+  get_from_output_files_time = 0;
+  seek_child_seek_time = 0;
+  seek_child_seek_count = 0;
+  seek_min_heap_time = 0;
+  seek_internal_seek_time = 0;
+  find_next_user_entry_time = 0;
+  write_pre_and_post_process_time = 0;
+  write_memtable_time = 0;
+#endif
+}
+
+// Stringify a counter's name and append "name = value, " to the stream.
+#define OUTPUT(counter) #counter << " = " << counter << ", "
+
+// Render all counters as a single "name = value, " list; returns the
+// empty string when the perf context is compiled out.
+std::string PerfContext::ToString() const {
+#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
+  return "";
+#else
+  std::ostringstream ss;
+  ss << OUTPUT(user_key_comparison_count)
+     << OUTPUT(block_cache_hit_count)
+     << OUTPUT(block_read_count)
+     << OUTPUT(block_read_byte)
+     << OUTPUT(block_read_time)
+     << OUTPUT(block_checksum_time)
+     << OUTPUT(block_decompress_time)
+     << OUTPUT(internal_key_skipped_count)
+     << OUTPUT(internal_delete_skipped_count)
+     << OUTPUT(write_wal_time)
+     << OUTPUT(get_snapshot_time)
+     << OUTPUT(get_from_memtable_time)
+     << OUTPUT(get_from_memtable_count)
+     << OUTPUT(get_post_process_time)
+     << OUTPUT(get_from_output_files_time)
+     << OUTPUT(seek_child_seek_time)
+     << OUTPUT(seek_child_seek_count)
+     << OUTPUT(seek_min_heap_time)
+     << OUTPUT(seek_internal_seek_time)
+     << OUTPUT(find_next_user_entry_time)
+     << OUTPUT(write_pre_and_post_process_time)
+     << OUTPUT(write_memtable_time);
+  return ss.str();
+#endif
+}
+
+}
diff --git a/util/perf_context_imp.h b/util/perf_context_imp.h
new file mode 100644 (file)
index 0000000..dc4ae95
--- /dev/null
@@ -0,0 +1,88 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "rocksdb/perf_context.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
+
+// Perf context is compiled out: every macro expands to nothing, so the
+// instrumentation disappears from call sites at zero runtime cost.
+#define PERF_TIMER_DECLARE()
+#define PERF_TIMER_START(metric)
+#define PERF_TIMER_AUTO(metric)
+#define PERF_TIMER_MEASURE(metric)
+#define PERF_TIMER_STOP(metric)
+#define PERF_COUNTER_ADD(metric, value)
+
+#else
+
+// Defined in util/perf_context.cc; one instance per thread.
+extern __thread PerfLevel perf_level;
+
+// Lightweight stopwatch used by the PERF_TIMER_* macros below.  Whether
+// timing is enabled is decided once, at construction, from the
+// thread-local perf_level; when disabled every method is a cheap no-op.
+class PerfStepTimer {
+ public:
+  PerfStepTimer()
+    : enabled_(perf_level >= PerfLevel::kEnableTime),
+      env_(enabled_ ? Env::Default() : nullptr),
+      start_(0) {
+  }
+
+  // Record the starting timestamp (no-op when timing is disabled).
+  void Start() {
+    if (enabled_) {
+      start_ = env_->NowNanos();
+    }
+  }
+
+  // Add the time elapsed since the last Start()/Measure() to *metric and
+  // restart the clock.  start_ == 0 doubles as "timer not running".
+  void Measure(uint64_t* metric) {
+    if (start_) {
+      uint64_t now = env_->NowNanos();
+      *metric += now - start_;
+      start_ = now;
+    }
+  }
+
+  // Add the elapsed time to *metric and stop the timer.
+  void Stop(uint64_t* metric) {
+    if (start_) {
+      *metric += env_->NowNanos() - start_;
+      start_ = 0;
+    }
+  }
+
+ private:
+  const bool enabled_;  // snapshot of perf_level at construction
+  Env* const env_;      // clock source; nullptr when disabled
+  uint64_t start_;      // NowNanos() at last Start(); 0 == not running
+};
+
+// The macros below expect a local `perf_step_timer` (introduced by
+// PERF_TIMER_DECLARE or PERF_TIMER_AUTO) and the thread-local
+// `perf_context` to be in scope at the call site.
+
+// Declare the local timer object to be used later on
+#define PERF_TIMER_DECLARE()           \
+  PerfStepTimer perf_step_timer;
+
+// Set start time of the timer
+#define PERF_TIMER_START(metric)          \
+  perf_step_timer.Start();
+
+// Declare and set start time of the timer
+#define PERF_TIMER_AUTO(metric)           \
+  PerfStepTimer perf_step_timer;          \
+  perf_step_timer.Start();
+
+// Update metric with time elapsed since last START. start time is reset
+// to current timestamp.
+#define PERF_TIMER_MEASURE(metric)        \
+  perf_step_timer.Measure(&(perf_context.metric));
+
+// Update metric with time elapsed since last START. But start time is not set.
+#define PERF_TIMER_STOP(metric)        \
+  perf_step_timer.Stop(&(perf_context.metric));
+
+// Increase metric value
+#define PERF_COUNTER_ADD(metric, value)     \
+  perf_context.metric += value;
+
+#endif
+
+}
diff --git a/util/posix_logger.h b/util/posix_logger.h
new file mode 100644 (file)
index 0000000..6aba769
--- /dev/null
@@ -0,0 +1,161 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Logger implementation that can be shared by all environments
+// where enough posix functionality is available.
+
+#pragma once
+#include <algorithm>
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include <fcntl.h>
+#include <unistd.h>
+#ifdef OS_LINUX
+#include <linux/falloc.h>
+#endif
+#include "rocksdb/env.h"
+#include <atomic>
+
+namespace rocksdb {
+
+const int kDebugLogChunkSize = 128 * 1024;
+
+// Logger that writes to a stdio FILE*, shared by posix-style Envs.  Each
+// record is prefixed with a timestamp and the writing thread's id.
+// Output is buffered through stdio; fflush() happens on an explicit
+// Flush() call or at most every flush_every_seconds_ seconds from Logv().
+class PosixLogger : public Logger {
+ private:
+  FILE* file_;
+  uint64_t (*gettid_)();  // Return the thread id for the current thread
+  std::atomic_size_t log_size_;   // bytes written so far (see GetLogFileSize)
+  int fd_;                        // fd of file_, used only for fallocate()
+  const static uint64_t flush_every_seconds_ = 5;
+  std::atomic_uint_fast64_t last_flush_micros_;
+  Env* env_;
+  bool flush_pending_;  // true iff data was written since the last fflush
+ public:
+  PosixLogger(FILE* f, uint64_t (*gettid)(), Env* env,
+              const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL)
+      : Logger(log_level),
+        file_(f),
+        gettid_(gettid),
+        log_size_(0),
+        fd_(fileno(f)),
+        last_flush_micros_(0),
+        env_(env),
+        flush_pending_(false) {}
+  // Takes ownership of the FILE* and closes it on destruction.
+  virtual ~PosixLogger() {
+    fclose(file_);
+  }
+  virtual void Flush() {
+    if (flush_pending_) {
+      flush_pending_ = false;
+      fflush(file_);
+    }
+    last_flush_micros_ = env_->NowMicros();
+  }
+  virtual void Logv(const char* format, va_list ap) {
+    const uint64_t thread_id = (*gettid_)();
+
+    // We try twice: the first time with a fixed-size stack allocated buffer,
+    // and the second time with a much larger dynamically allocated buffer.
+    char buffer[500];
+    for (int iter = 0; iter < 2; iter++) {
+      char* base;
+      int bufsize;
+      if (iter == 0) {
+        bufsize = sizeof(buffer);
+        base = buffer;
+      } else {
+        bufsize = 30000;
+        base = new char[bufsize];
+      }
+      char* p = base;
+      char* limit = base + bufsize;
+
+      // Header: "YYYY/MM/DD-HH:MM:SS.uuuuuu <thread-id-hex> ".
+      struct timeval now_tv;
+      gettimeofday(&now_tv, nullptr);
+      const time_t seconds = now_tv.tv_sec;
+      struct tm t;
+      localtime_r(&seconds, &t);
+      p += snprintf(p, limit - p,
+                    "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
+                    t.tm_year + 1900,
+                    t.tm_mon + 1,
+                    t.tm_mday,
+                    t.tm_hour,
+                    t.tm_min,
+                    t.tm_sec,
+                    static_cast<int>(now_tv.tv_usec),
+                    static_cast<long long unsigned int>(thread_id));
+
+      // Print the message
+      if (p < limit) {
+        va_list backup_ap;
+        // ap must be copied: vsnprintf consumes it, and we may format
+        // again on the second (larger-buffer) iteration.
+        va_copy(backup_ap, ap);
+        // NOTE(review): a negative vsnprintf return (encoding error) is
+        // not checked and would move p backwards -- confirm acceptable.
+        p += vsnprintf(p, limit - p, format, backup_ap);
+        va_end(backup_ap);
+      }
+
+      // Truncate to available space if necessary
+      if (p >= limit) {
+        if (iter == 0) {
+          continue;       // Try again with larger buffer
+        } else {
+          p = limit - 1;
+        }
+      }
+
+      // Add newline if necessary
+      if (p == base || p[-1] != '\n') {
+        *p++ = '\n';
+      }
+
+      assert(p <= limit);
+      const size_t write_size = p - base;
+
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+      // If this write would cross a boundary of kDebugLogChunkSize
+      // space, pre-allocate more space to avoid overly large
+      // allocations from filesystem allocsize options.
+      const size_t log_size = log_size_;
+      const int last_allocation_chunk =
+        ((kDebugLogChunkSize - 1 + log_size) / kDebugLogChunkSize);
+      const int desired_allocation_chunk =
+        ((kDebugLogChunkSize - 1 + log_size + write_size) /
+           kDebugLogChunkSize);
+      if (last_allocation_chunk != desired_allocation_chunk) {
+        fallocate(fd_, FALLOC_FL_KEEP_SIZE, 0,
+                  desired_allocation_chunk * kDebugLogChunkSize);
+      }
+#endif
+
+      size_t sz = fwrite(base, 1, write_size, file_);
+      flush_pending_ = true;
+      assert(sz == write_size);
+      if (sz > 0) {
+        log_size_ += write_size;
+      }
+      // Periodic time-based flush so records are not held in the stdio
+      // buffer indefinitely on a quiet logger.
+      uint64_t now_micros = static_cast<uint64_t>(now_tv.tv_sec) * 1000000 +
+        now_tv.tv_usec;
+      if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) {
+        flush_pending_ = false;
+        fflush(file_);
+        last_flush_micros_ = now_micros;
+      }
+      if (base != buffer) {
+        delete[] base;
+      }
+      break;
+    }
+  }
+  // Total number of bytes successfully written so far.
+  size_t GetLogFileSize() const {
+    return log_size_;
+  }
+};
+
+}  // namespace rocksdb
diff --git a/util/random.h b/util/random.h
new file mode 100644 (file)
index 0000000..e5b3315
--- /dev/null
@@ -0,0 +1,90 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <random>
+#include <stdint.h>
+
+namespace rocksdb {
+
+// A very simple random number generator.  Not especially good at
+// generating truly random bits, but good enough for our needs in this
+// package.
// A very simple pseudo-random number generator: the Lehmer / "minimal
// standard" linear congruential generator over the prime modulus 2^31-1.
// Not a source of high-quality randomness, but cheap, tiny, and good
// enough for this package's internal needs.
class Random {
 private:
  uint32_t seed_;  // LCG state; Next() keeps it in [1, 2^31-2]

 public:
  explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { }

  // Advance the generator and return the new state.
  uint32_t Next() {
    static const uint32_t M = 2147483647L;   // 2^31-1
    static const uint64_t A = 16807;  // bits 14, 8, 7, 5, 2, 1, 0
    // Computes seed_ = (seed_ * A) % M.  The seed must never be 0 or M,
    // or every subsequent value would be 0 or M respectively; any other
    // seed cycles through all of [1, M-1].
    const uint64_t prod = seed_ * A;

    // Reduce modulo M using the identity ((x << 31) % M) == x.
    uint32_t reduced = static_cast<uint32_t>((prod >> 31) + (prod & M));
    // The first reduction can exceed M by at most one bit's worth, so a
    // single conditional subtraction finishes the job.  reduced == M is
    // impossible, which is why '>' (the faster sign-bit test) suffices.
    if (reduced > M) {
      reduced -= M;
    }
    seed_ = reduced;
    return seed_;
  }

  // Uniformly distributed value in [0, n-1].  REQUIRES: n > 0.
  uint32_t Uniform(int n) { return Next() % n; }

  // True with probability ~1/n.  REQUIRES: n > 0.
  bool OneIn(int n) { return Uniform(n) == 0; }

  // Exponentially biased towards small values: picks "base" uniformly in
  // [0, max_log], then returns "base" random bits, i.e. a value in
  // [0, 2^max_log - 1] skewed towards smaller numbers.
  uint32_t Skewed(int max_log) {
    const int bits = Uniform(max_log + 1);
    return Uniform(1 << bits);
  }
};
+
+// A simple 64bit random number generator based on std::mt19937_64
+class Random64 {
+ private:
+  std::mt19937_64 generator_;
+
+ public:
+  explicit Random64(uint64_t s) : generator_(s) { }
+
+  // Generates the next random number
+  uint64_t Next() { return generator_(); }
+
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint64_t Uniform(uint64_t n) {
+    return std::uniform_int_distribution<uint64_t>(0, n - 1)(generator_);
+  }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(uint64_t n) { return Uniform(n) == 0; }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits.  The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint64_t Skewed(int max_log) {
+    return Uniform(1 << Uniform(max_log + 1));
+  }
+};
+
+}  // namespace rocksdb
diff --git a/util/signal_test.cc b/util/signal_test.cc
new file mode 100644 (file)
index 0000000..f51fa54
--- /dev/null
@@ -0,0 +1,34 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "port/stack_trace.h"
+#include <assert.h>
+
+namespace {
+// f0..f3 build a four-frame call chain; the innermost frame writes
+// through a null pointer on purpose, raising SIGSEGV so the installed
+// stack-trace handler can be exercised end to end.
+void f0() {
+  char *p = nullptr;
+  *p = 10;  /* SIGSEGV here!! */
+}
+
+void f1() {
+  f0();
+}
+
+void f2() {
+  f1();
+}
+
+void f3() {
+  f2();
+}
+}  // namespace
+
+// Manual test: install the stack-trace signal handler, then crash through
+// the f3 -> f2 -> f1 -> f0 chain.  The expected outcome is a printed
+// stack trace rather than a silent crash; the return is never reached.
+int main() {
+  rocksdb::port::InstallStackTraceHandler();
+
+  f3();
+
+  return 0;
+}
diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc
new file mode 100644 (file)
index 0000000..93f7134
--- /dev/null
@@ -0,0 +1,123 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "rocksdb/memtablerep.h"
+#include "db/memtable.h"
+#include "db/skiplist.h"
+
+namespace rocksdb {
+namespace {
+// MemTableRep backed by the SkipList from db/skiplist.h.  All node
+// memory comes from the arena supplied at construction, which is why
+// ApproximateMemoryUsage() has nothing of its own to report.
+class SkipListRep : public MemTableRep {
+  SkipList<const char*, const MemTableRep::KeyComparator&> skip_list_;
+public:
+  explicit SkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena)
+    : MemTableRep(arena), skip_list_(compare, arena) {
+  }
+
+  // Insert key into the list.
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  virtual void Insert(KeyHandle handle) override {
+    skip_list_.Insert(static_cast<char*>(handle));
+  }
+
+  // Returns true iff an entry that compares equal to key is in the list.
+  virtual bool Contains(const char* key) const override {
+    return skip_list_.Contains(key);
+  }
+
+  virtual size_t ApproximateMemoryUsage() override {
+    // All memory is allocated through arena; nothing to report here
+    return 0;
+  }
+
+  // Invoke callback_func on every entry at or after k's position, in
+  // order, stopping as soon as the callback returns false or the list
+  // is exhausted.
+  virtual void Get(const LookupKey& k, void* callback_args,
+                   bool (*callback_func)(void* arg,
+                                         const char* entry)) override {
+    SkipListRep::Iterator iter(&skip_list_);
+    Slice dummy_slice;
+    for (iter.Seek(dummy_slice, k.memtable_key().data());
+         iter.Valid() && callback_func(callback_args, iter.key());
+         iter.Next()) {
+    }
+  }
+
+  virtual ~SkipListRep() override { }
+
+  // Iteration over the contents of a skip list
+  class Iterator : public MemTableRep::Iterator {
+    SkipList<const char*, const MemTableRep::KeyComparator&>::Iterator iter_;
+   public:
+    // Initialize an iterator over the specified list.
+    // The returned iterator is not valid.
+    explicit Iterator(
+      const SkipList<const char*, const MemTableRep::KeyComparator&>* list
+    ) : iter_(list) { }
+
+    virtual ~Iterator() override { }
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const override {
+      return iter_.Valid();
+    }
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const override {
+      return iter_.key();
+    }
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() override {
+      iter_.Next();
+    }
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() override {
+      iter_.Prev();
+    }
+
+    // Advance to the first entry with a key >= target.  A pre-encoded
+    // memtable_key, when available, avoids re-encoding the user key.
+    virtual void Seek(const Slice& user_key, const char* memtable_key)
+        override {
+      if (memtable_key != nullptr) {
+        iter_.Seek(memtable_key);
+      } else {
+        iter_.Seek(EncodeKey(&tmp_, user_key));
+      }
+    }
+
+    // Position at the first entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    virtual void SeekToFirst() override {
+      iter_.SeekToFirst();
+    }
+
+    // Position at the last entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    virtual void SeekToLast() override {
+      iter_.SeekToLast();
+    }
+   protected:
+    std::string tmp_;       // For passing to EncodeKey
+  };
+
+  // Unhide default implementations of GetIterator
+  using MemTableRep::GetIterator;
+
+  virtual MemTableRep::Iterator* GetIterator() override {
+    return new SkipListRep::Iterator(&skip_list_);
+  }
+};
+}
+
+// Factory hook: build a skiplist-backed memtable rep.  The SliceTransform
+// argument is not used by this rep.
+MemTableRep* SkipListFactory::CreateMemTableRep(
+    const MemTableRep::KeyComparator& compare, Arena* arena,
+    const SliceTransform*) {
+  return new SkipListRep(compare, arena);
+}
+
+} // namespace rocksdb
diff --git a/util/slice.cc b/util/slice.cc
new file mode 100644 (file)
index 0000000..55f561f
--- /dev/null
@@ -0,0 +1,73 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+namespace {
+
+class FixedPrefixTransform : public SliceTransform {  // maps a key to its first prefix_len_ bytes
+ private:
+  size_t prefix_len_;  // fixed prefix length, set once at construction
+
+ public:
+  explicit FixedPrefixTransform(size_t prefix_len) : prefix_len_(prefix_len) { }
+
+  virtual const char* Name() const {  // stable identifier; may be persisted in options/metadata
+    return "rocksdb.FixedPrefix";
+  }
+
+  virtual Slice Transform(const Slice& src) const {  // REQUIRES: InDomain(src); returns a view into src, no copy
+    assert(InDomain(src));
+    return Slice(src.data(), prefix_len_);
+  }
+
+  virtual bool InDomain(const Slice& src) const {  // transformable iff the key is long enough
+    return (src.size() >= prefix_len_);
+  }
+
+  virtual bool InRange(const Slice& dst) const {  // a valid prefix has exactly prefix_len_ bytes
+    return (dst.size() == prefix_len_);
+  }
+};
+
+class NoopTransform : public SliceTransform {  // identity transform: every key is its own prefix
+ public:
+  explicit NoopTransform() { }  // NOTE(review): 'explicit' is a no-op on a zero-argument constructor
+
+  virtual const char* Name() const {
+    return "rocksdb.Noop";
+  }
+
+  virtual Slice Transform(const Slice& src) const {  // returns src unchanged (still a non-owning view)
+    return src;
+  }
+
+  virtual bool InDomain(const Slice& src) const {
+    return true;
+  }
+
+  virtual bool InRange(const Slice& dst) const {
+    return true;
+  }
+};
+
+}
+
+const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) {  // caller owns the returned object
+  return new FixedPrefixTransform(prefix_len);
+}
+
+const SliceTransform* NewNoopTransform() {  // caller owns the returned object
+  return new NoopTransform;
+}
+
+}  // namespace rocksdb
diff --git a/util/statistics.cc b/util/statistics.cc
new file mode 100644 (file)
index 0000000..4fc2400
--- /dev/null
@@ -0,0 +1,94 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "util/statistics.h"
+#include "rocksdb/statistics.h"
+#include <algorithm>
+#include <cstdio>
+
+namespace rocksdb {
+
+std::shared_ptr<Statistics> CreateDBStatistics() {  // public factory for the concrete stats collector
+  return std::make_shared<StatisticsImpl>();
+}
+
+StatisticsImpl::StatisticsImpl() {}
+
+StatisticsImpl::~StatisticsImpl() {}
+
+long StatisticsImpl::getTickerCount(Tickers tickerType) {  // NOTE(review): returns long but tickers are uint64 -- truncates where long is 32-bit
+  assert(tickerType < TICKER_ENUM_MAX);
+  return tickers_[tickerType].value;  // atomic load (default seq_cst)
+}
+
+void StatisticsImpl::setTickerCount(Tickers tickerType, uint64_t count) {  // overwrite, not add
+  assert(tickerType < TICKER_ENUM_MAX);
+  tickers_[tickerType].value = count;
+}
+
+void StatisticsImpl::recordTick(Tickers tickerType, uint64_t count) {  // atomic increment by count
+  assert(tickerType < TICKER_ENUM_MAX);
+  tickers_[tickerType].value += count;
+}
+
+void StatisticsImpl::measureTime(Histograms histogramType, uint64_t value) {  // add one sample to the histogram
+  assert(histogramType < HISTOGRAM_ENUM_MAX);
+  histograms_[histogramType].Add(value);
+}
+
+void StatisticsImpl::histogramData(Histograms histogramType,
+                                   HistogramData* const data) {  // copy percentile summary out into *data
+  assert(histogramType < HISTOGRAM_ENUM_MAX);
+  histograms_[histogramType].Data(data);
+}
+
+namespace {
+
+// a buffer size used for temp string buffers
+const int kBufferSize = 200;
+
+std::string HistogramToString (
+    Statistics* dbstats,
+    const Histograms& histogram_type,
+    const std::string& name) {
+
+  char buffer[kBufferSize];
+  HistogramData histogramData;
+  dbstats->histogramData(histogram_type, &histogramData);
+  snprintf(
+      buffer,
+      kBufferSize,
+      "%s statistics Percentiles :=> 50 : %f 95 : %f 99 : %f\n",
+      name.c_str(),
+      histogramData.median,
+      histogramData.percentile95,
+      histogramData.percentile99
+  );
+  return std::string(buffer);
+};  // NOTE(review): stray ';' after function body (harmless empty declaration)
+
+std::string TickerToString(Statistics* dbstats, const Tickers& ticker,
+                           const std::string& name) {
+  char buffer[kBufferSize];
+  snprintf(buffer, kBufferSize, "%s COUNT : %ld\n",
+            name.c_str(), dbstats->getTickerCount(ticker));  // %ld matches getTickerCount's long return
+  return std::string(buffer);
+};  // NOTE(review): stray ';' after function body
+} // namespace
+
+std::string Statistics::ToString() {  // dump every ticker, then every histogram, one line each
+  std::string res;
+  res.reserve(20000);  // preallocate; a full dump is typically several KB
+  for (const auto& t : TickersNameMap) {
+    res.append(TickerToString(this, t.first, t.second));
+  }
+  for (const auto& h : HistogramsNameMap) {
+    res.append(HistogramToString(this, h.first, h.second));
+  }
+  res.shrink_to_fit();
+  return res;
+}
+
+} // namespace rocksdb
diff --git a/util/statistics.h b/util/statistics.h
new file mode 100644 (file)
index 0000000..d57a1dd
--- /dev/null
@@ -0,0 +1,66 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "rocksdb/statistics.h"
+#include "util/histogram.h"
+#include "util/mutexlock.h"
+#include "port/likely.h"
+
+#include <vector>
+#include <atomic>
+
+
+namespace rocksdb {
+
+class StatisticsImpl : public Statistics {  // lock-free stats collector: one cache-line-padded atomic per ticker
+ public:
+  StatisticsImpl();
+  virtual ~StatisticsImpl();
+
+  virtual long getTickerCount(Tickers tickerType);  // NOTE(review): long return may truncate uint64 counts on LP32
+  virtual void setTickerCount(Tickers tickerType, uint64_t count);
+  virtual void recordTick(Tickers tickerType, uint64_t count);
+  virtual void measureTime(Histograms histogramType, uint64_t value);
+  virtual void histogramData(Histograms histogramType,
+                             HistogramData* const data);
+
+ private:
+  struct Ticker {
+    Ticker() : value(uint_fast64_t()) {}  // zero-initialize the atomic
+
+    std::atomic_uint_fast64_t value;
+    // Pad the structure to make it size of 64 bytes. A plain array of
+    // std::atomic_uint_fast64_t results in huge performance degradation
+    // due to false sharing.
+    char padding[64 - sizeof(std::atomic_uint_fast64_t)];  // NOTE(review): ill-formed (zero-size array) if the atomic ever reaches 64 bytes
+  };
+
+  Ticker tickers_[TICKER_ENUM_MAX] __attribute__((aligned(64)));  // cache-line aligned; gcc/clang-specific attribute
+  HistogramImpl histograms_[HISTOGRAM_ENUM_MAX] __attribute__((aligned(64)));
+};
+
+// Utility functions
+inline void MeasureTime(Statistics* statistics, Histograms histogramType,
+                        uint64_t value) {  // null-safe wrapper: no-op when stats collection is disabled
+  if (statistics) {
+    statistics->measureTime(histogramType, value);
+  }
+}
+
+inline void RecordTick(Statistics* statistics, Tickers ticker,
+                       uint64_t count = 1) {  // null-safe; default increment of 1
+  if (statistics) {
+    statistics->recordTick(ticker, count);
+  }
+}
+
+inline void SetTickerCount(Statistics* statistics, Tickers ticker,
+                           uint64_t count) {  // null-safe absolute set
+  if (statistics) {
+    statistics->setTickerCount(ticker, count);
+  }
+}
+}
diff --git a/util/stats_logger.h b/util/stats_logger.h
new file mode 100644 (file)
index 0000000..f0b4540
--- /dev/null
@@ -0,0 +1,26 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+
+namespace rocksdb {
+
+class StatsLogger {  // abstract sink for periodic deployment/LSM statistics
+
+ public:
+
+  virtual void Log_Deploy_Stats(const std::string& db_version,
+                                const std::string& machine_info,
+                                const std::string& data_dir,
+                                const uint64_t data_size,
+                                const uint32_t file_number,
+                                const std::string& data_size_per_level,
+                                const std::string& file_number_per_level,
+                                const int64_t& ts_unix) = 0;  // ts_unix: presumably a unix timestamp -- confirm against implementors
+  virtual ~StatsLogger() {}
+
+};
+
+}
diff --git a/util/status.cc b/util/status.cc
new file mode 100644 (file)
index 0000000..2a5f05a
--- /dev/null
@@ -0,0 +1,86 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+#include "port/port.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+const char* Status::CopyState(const char* state) {  // deep-copies the [4-byte length][message] state blob
+  uint32_t size;
+  memcpy(&size, state, sizeof(size));  // length prefix may be unaligned; memcpy avoids UB
+  char* result = new char[size + 4];
+  memcpy(result, state, size + 4);
+  return result;  // caller (Status copy machinery) owns the allocation
+}
+
+Status::Status(Code code, const Slice& msg, const Slice& msg2) :
+    code_(code) {  // encodes state_ as [size:4][msg][": "][msg2]
+  assert(code != kOk);  // kOk is represented by state_ == nullptr, never via this ctor
+  const uint32_t len1 = msg.size();
+  const uint32_t len2 = msg2.size();
+  const uint32_t size = len1 + (len2 ? (2 + len2) : 0);  // 2 extra bytes for the ": " separator
+  char* result = new char[size + 4];
+  memcpy(result, &size, sizeof(size));
+  memcpy(result + 4, msg.data(), len1);
+  if (len2) {
+    result[4 + len1] = ':';
+    result[5 + len1] = ' ';
+    memcpy(result + 6 + len1, msg2.data(), len2);
+  }
+  state_ = result;
+}
+
+std::string Status::ToString() const {  // human-readable form: "<code name><message>"
+  char tmp[30];
+  const char* type;
+  switch (code_) {
+    case kOk:
+      return "OK";
+    case kNotFound:
+      type = "NotFound: ";
+      break;
+    case kCorruption:
+      type = "Corruption: ";
+      break;
+    case kNotSupported:
+      type = "Not implemented: ";
+      break;
+    case kInvalidArgument:
+      type = "Invalid argument: ";
+      break;
+    case kIOError:
+      type = "IO error: ";
+      break;
+    case kMergeInProgress:
+      type = "Merge in progress: ";
+      break;
+    case kIncomplete:
+      type = "Result incomplete: ";
+      break;
+    case kShutdownInProgress:
+      type = "Shutdown in progress: ";
+      break;
+    default:
+      snprintf(tmp, sizeof(tmp), "Unknown code(%d): ",
+               static_cast<int>(code()));
+      type = tmp;
+      break;
+  }
+  std::string result(type);
+  if (state_ != nullptr) {
+    uint32_t length;
+    memcpy(&length, state_, sizeof(length));
+    result.append(state_ + 4, length);  // skip the 4-byte length prefix
+  }
+  return result;
+}
+
+}  // namespace rocksdb
diff --git a/util/stl_wrappers.h b/util/stl_wrappers.h
new file mode 100644 (file)
index 0000000..b4c14b4
--- /dev/null
@@ -0,0 +1,32 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+
+#include "util/murmurhash.h"
+#include "util/coding.h"
+
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+namespace stl_wrappers {
+  class Base {  // holds a reference to the memtable key comparator for derived functors
+   protected:
+    const MemTableRep::KeyComparator& compare_;  // reference member: comparator must outlive this object
+    explicit Base(const MemTableRep::KeyComparator& compare)
+      : compare_(compare) { }
+  };
+
+  struct Compare : private Base {  // strict-weak-ordering adapter for STL containers over encoded keys
+    explicit Compare(const MemTableRep::KeyComparator& compare)
+      : Base(compare) { }
+    inline bool operator()(const char* a, const char* b) const {
+      return compare_(a, b) < 0;  // delegate to the three-way comparator
+    }
+  };
+
+}
+}
diff --git a/util/stop_watch.h b/util/stop_watch.h
new file mode 100644 (file)
index 0000000..48e1b01
--- /dev/null
@@ -0,0 +1,67 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "rocksdb/env.h"
+#include "util/statistics.h"
+
+namespace rocksdb {
+// Auto-scoped.
+// Records the statistic into the corresponding histogram.
+class StopWatch {  // scoped timer: measures microseconds and reports them to a histogram on destruction
+ public:
+  explicit StopWatch(
+    Env * const env,
+    Statistics* statistics = nullptr,
+    const Histograms histogram_name = DB_GET,
+    bool auto_start = true) :
+      env_(env),
+      start_time_((!auto_start && !statistics) ? 0 : env->NowMicros()),  // NOTE(review): clock starts whenever statistics are present, even with auto_start=false -- confirm intended
+      statistics_(statistics),
+      histogram_name_(histogram_name) {}
+
+
+
+  uint64_t ElapsedMicros() {  // microseconds since construction (or since 0 when never started)
+    return env_->NowMicros() - start_time_;
+  }
+
+  ~StopWatch() { MeasureTime(statistics_, histogram_name_, ElapsedMicros()); }  // records on scope exit; no-op when statistics_ is null
+
+ private:
+  Env* const env_;
+  const uint64_t start_time_;  // micros at construction; immutable for the watch's lifetime
+  Statistics* statistics_;
+  const Histograms histogram_name_;
+
+};
+
+// a nano second precision stopwatch
+class StopWatchNano {
+ public:
+  explicit StopWatchNano(Env* const env, bool auto_start = false)
+      : env_(env), start_(0) {
+    if (auto_start) {
+      Start();
+    }
+  }
+
+  void Start() { start_ = env_->NowNanos(); }
+
+  uint64_t ElapsedNanos(bool reset = false) {  // ns since Start(); optionally restarts the clock
+    auto now = env_->NowNanos();
+    auto elapsed = now - start_;
+    if (reset) {
+      start_ = now;
+    }
+    return elapsed;
+  }
+
+ private:
+  Env* const env_;
+  uint64_t start_;  // ns timestamp of the last Start(); 0 until started
+};
+
+} // namespace rocksdb
diff --git a/util/string_util.cc b/util/string_util.cc
new file mode 100644 (file)
index 0000000..97b7f9d
--- /dev/null
@@ -0,0 +1,23 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <sstream>
+#include <string>
+#include <vector>
+#include "util/string_util.h"
+
+namespace rocksdb {
+
+std::vector<std::string> stringSplit(std::string arg, char delim) {  // NOTE(review): arg copied by value; trailing empty fields are dropped by getline
+  std::vector<std::string> splits;
+  std::stringstream ss(arg);
+  std::string item;
+  while (std::getline(ss, item, delim)) {
+    splits.push_back(item);  // interior empty fields ("a,,b") are preserved
+  }
+  return splits;  // empty input yields an empty vector
+}
+
+}  // namespace rocksdb
diff --git a/util/string_util.h b/util/string_util.h
new file mode 100644 (file)
index 0000000..676f4aa
--- /dev/null
@@ -0,0 +1,15 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <sstream>
+#include <string>
+#include <vector>
+
+#pragma once
+namespace rocksdb {
+
+extern std::vector<std::string> stringSplit(std::string arg, char delim);
+
+}  // namespace rocksdb
diff --git a/util/sync_point.cc b/util/sync_point.cc
new file mode 100644 (file)
index 0000000..4e4c46a
--- /dev/null
@@ -0,0 +1,64 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "util/sync_point.h"
+
+#ifndef NDEBUG
+namespace rocksdb {
+
+SyncPoint* SyncPoint::GetInstance() {  // function-local static: thread-safe lazy init in C++11
+  static SyncPoint sync_point;
+  return &sync_point;
+}
+
+void SyncPoint::LoadDependency(const std::vector<Dependency>& dependencies) {  // NOTE(review): mutates the maps without holding mutex_ -- call only before EnableProcessing()
+  successors_.clear();
+  predecessors_.clear();
+  cleared_points_.clear();
+  for (const auto& dependency : dependencies) {
+    successors_[dependency.predecessor].push_back(dependency.successor);
+    predecessors_[dependency.successor].push_back(dependency.predecessor);
+  }
+}
+
+bool SyncPoint::PredecessorsAllCleared(const std::string& point) {  // REQUIRES: mutex_ held by caller (Process)
+  for (const auto& pred : predecessors_[point]) {
+    if (cleared_points_.count(pred) == 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+void SyncPoint::EnableProcessing() {  // turn Process() from a no-op into a real rendezvous
+  std::unique_lock<std::mutex> lock(mutex_);
+  enabled_ = true;
+}
+
+void SyncPoint::DisableProcessing() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  enabled_ = false;
+}
+
+void SyncPoint::ClearTrace() {  // forget which points have executed; dependency maps are kept
+  std::unique_lock<std::mutex> lock(mutex_);
+  cleared_points_.clear();
+}
+
+void SyncPoint::Process(const std::string& point) {  // block until every predecessor of 'point' has executed
+  std::unique_lock<std::mutex> lock(mutex_);
+
+  if (!enabled_) return;
+
+  while (!PredecessorsAllCleared(point)) {
+    cv_.wait(lock);  // woken each time another thread clears a point
+  }
+
+  cleared_points_.insert(point);
+  cv_.notify_all();  // let waiters re-check their predecessors
+}
+
+}  // namespace rocksdb
+#endif  // NDEBUG
diff --git a/util/sync_point.h b/util/sync_point.h
new file mode 100644 (file)
index 0000000..b4b61a9
--- /dev/null
@@ -0,0 +1,80 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <condition_variable>
+#include <mutex>
+#include <string>
+#include <unordered_set>
+#include <unordered_map>
+#include <vector>
+
+#ifdef NDEBUG
+#define TEST_SYNC_POINT(x)
+#else
+
+namespace rocksdb {
+
+// This class provides facility to reproduce race conditions deterministically
+// in unit tests.
+// Developer could specify sync points in the codebase via TEST_SYNC_POINT.
+// Each sync point represents a position in the execution stream of a thread.
+// In the unit test, 'Happens After' relationship among sync points could be
+// setup via SyncPoint::LoadDependency, to reproduce a desired interleave of
+// threads execution.
+// Refer to (DBTest,TransactionLogIteratorRace) for an example use case.
+
+class SyncPoint {  // process-wide singleton coordinating happens-after ordering between test threads
+ public:
+  static SyncPoint* GetInstance();
+
+  struct Dependency {  // "predecessor must execute before successor"
+    std::string predecessor;
+    std::string successor;
+  };
+  // call once at the beginning of a test to setup the dependency between
+  // sync points
+  void LoadDependency(const std::vector<Dependency>& dependencies);
+
+  // enable sync point processing (disabled on startup)
+  void EnableProcessing();
+
+  // disable sync point processing
+  void DisableProcessing();
+
+  // remove the execution trace of all sync points
+  void ClearTrace();
+
+  // triggered by TEST_SYNC_POINT, blocking execution until all predecessors
+  // are executed.
+  void Process(const std::string& point);
+
+  // TODO: it might be useful to provide a function that blocks until all
+  // sync points are cleared.
+
+ private:
+  bool PredecessorsAllCleared(const std::string& point);  // REQUIRES: mutex_ held
+
+  // successor/predecessor map loaded from LoadDependency
+  std::unordered_map<std::string, std::vector<std::string>> successors_;
+  std::unordered_map<std::string, std::vector<std::string>> predecessors_;
+
+  std::mutex mutex_;  // guards enabled_ and cleared_points_
+  std::condition_variable cv_;  // signaled whenever a point is cleared
+  // sync points that have been passed through
+  std::unordered_set<std::string> cleared_points_;
+  bool enabled_ = false;
+};
+
+}  // namespace rocksdb
+
+// Use TEST_SYNC_POINT to specify sync points inside code base.
+// Sync points can have happens-after dependency on other sync points,
+// configured at runtime via SyncPoint::LoadDependency. This could be
+// utilized to re-produce race conditions between threads.
+// See TransactionLogIteratorRace in db_test.cc for an example use case.
+// TEST_SYNC_POINT is no op in release build.
+#define TEST_SYNC_POINT(x) rocksdb::SyncPoint::GetInstance()->Process(x)
+#endif  // NDEBUG
diff --git a/util/testharness.cc b/util/testharness.cc
new file mode 100644 (file)
index 0000000..4208d2c
--- /dev/null
@@ -0,0 +1,84 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/testharness.h"
+#include <string>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "port/stack_trace.h"
+
+namespace rocksdb {
+namespace test {
+
+namespace {
+struct Test {  // one registered TEST(base, name) entry
+  const char* base;
+  const char* name;
+  void (*func)();
+};
+std::vector<Test>* tests;  // lazily allocated registry; never freed (lives for the process lifetime)
+}
+
+bool RegisterTest(const char* base, const char* name, void (*func)()) {  // called from static initializers expanded by the TEST macro
+  if (tests == nullptr) {
+    tests = new std::vector<Test>;
+  }
+  Test t;
+  t.base = base;
+  t.name = name;
+  t.func = func;
+  tests->push_back(t);
+  return true;  // return value exists only so the macro can initialize a bool
+}
+
+int RunAllTests() {  // run tests whose "base.name" contains $ROCKSDB_TESTS (all when unset); failures exit(1) inside Tester
+  port::InstallStackTraceHandler();
+
+  const char* matcher = getenv("ROCKSDB_TESTS");
+
+  int num = 0;
+  if (tests != nullptr) {
+    for (unsigned int i = 0; i < tests->size(); i++) {
+      const Test& t = (*tests)[i];
+      if (matcher != nullptr) {
+        std::string name = t.base;
+        name.push_back('.');
+        name.append(t.name);
+        if (strstr(name.c_str(), matcher) == nullptr) {  // NOTE(review): strstr relies on a transitive include; <cstring> is not included directly
+          continue;  // substring match, not a regex
+        }
+      }
+      fprintf(stderr, "==== Test %s.%s\n", t.base, t.name);
+      (*t.func)();
+      ++num;
+    }
+  }
+  fprintf(stderr, "==== PASSED %d tests\n", num);
+  return 0;  // only reached if no assertion failed (failures exit the process)
+}
+
+std::string TmpDir() {  // test scratch directory provided by the default Env
+  std::string dir;
+  Status s = Env::Default()->GetTestDirectory(&dir);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  return dir;
+}
+
+int RandomSeed() {  // $TEST_RANDOM_SEED when positive, else the fixed default 301
+  const char* env = getenv("TEST_RANDOM_SEED");
+  int result = (env != nullptr ? atoi(env) : 301);
+  if (result <= 0) {
+    result = 301;  // guard against non-numeric or negative env values
+  }
+  return result;
+}
+
+}  // namespace test
+}  // namespace rocksdb
diff --git a/util/testharness.h b/util/testharness.h
new file mode 100644 (file)
index 0000000..52c2984
--- /dev/null
@@ -0,0 +1,142 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sstream>
+#include "port/stack_trace.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "util/random.h"
+
+namespace rocksdb {
+namespace test {
+
+// Run some of the tests registered by the TEST() macro.  If the
+// environment variable "ROCKSDB_TESTS" is not set, runs all tests.
+// Otherwise, runs only the tests whose name contains the value of
+// "ROCKSDB_TESTS" as a substring.  E.g., suppose the tests are:
+//    TEST(Foo, Hello) { ... }
+//    TEST(Foo, World) { ... }
+// ROCKSDB_TESTS=Hello will run the first test
+// ROCKSDB_TESTS=o     will run both tests
+// ROCKSDB_TESTS=Junk  will run no tests
+//
+// Returns 0 if all tests pass.
+// Dies or returns a non-zero value if some test fails.
+extern int RunAllTests();
+
+// Return the directory to use for temporary storage.
+extern std::string TmpDir();
+
+// Return a randomization seed for this run.  Typically returns the
+// same number on repeated invocations of this binary, but automated
+// runs may be able to vary the seed.
+extern int RandomSeed();
+
+// An instance of Tester is allocated to hold temporary state during
+// the execution of an assertion.
+class Tester {  // accumulates one assertion's failure message; reports and exits in the destructor
+ private:
+  bool ok_;  // flips to false on the first failed check
+  const char* fname_;  // source file of the assertion (__FILE__ via the ASSERT_* macros)
+  int line_;
+  std::stringstream ss_;  // accumulated failure message
+
+ public:
+  Tester(const char* f, int l)
+      : ok_(true), fname_(f), line_(l) {
+  }
+
+  ~Tester() {
+    if (!ok_) {
+      fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str());
+      port::PrintStack(2);
+      exit(1);  // first failed assertion aborts the test binary
+    }
+  }
+
+  Tester& Is(bool b, const char* msg) {
+    if (!b) {
+      ss_ << " Assertion failure " << msg;
+      ok_ = false;
+    }
+    return *this;  // returns *this so callers can stream extra context via operator<<
+  }
+
+  Tester& IsOk(const Status& s) {
+    if (!s.ok()) {
+      ss_ << " " << s.ToString();
+      ok_ = false;
+    }
+    return *this;
+  }
+
+#define BINARY_OP(name,op)                              \
+  template <class X, class Y>                           \
+  Tester& name(const X& x, const Y& y) {                \
+    if (! (x op y)) {                                   \
+      ss_ << " failed: " << x << (" " #op " ") << y;    \
+      ok_ = false;                                      \
+    }                                                   \
+    return *this;                                       \
+  }
+
+  BINARY_OP(IsEq, ==)
+  BINARY_OP(IsNe, !=)
+  BINARY_OP(IsGe, >=)
+  BINARY_OP(IsGt, >)
+  BINARY_OP(IsLe, <=)
+  BINARY_OP(IsLt, <)
+#undef BINARY_OP
+
+  // Attach the specified value to the error message if an error has occurred
+  template <class V>
+  Tester& operator<<(const V& value) {
+    if (!ok_) {
+      ss_ << " " << value;
+    }
+    return *this;
+  }
+};
+
+#define ASSERT_TRUE(c) ::rocksdb::test::Tester(__FILE__, __LINE__).Is((c), #c)
+#define ASSERT_OK(s) ::rocksdb::test::Tester(__FILE__, __LINE__).IsOk((s))
+#define ASSERT_EQ(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsEq((a),(b))
+#define ASSERT_NE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsNe((a),(b))
+#define ASSERT_GE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsGe((a),(b))
+#define ASSERT_GT(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsGt((a),(b))
+#define ASSERT_LE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsLe((a),(b))
+#define ASSERT_LT(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsLt((a),(b))
+
+#define TCONCAT(a,b) TCONCAT1(a,b)
+#define TCONCAT1(a,b) a##b
+
+#define TEST(base,name)                                                 \
+class TCONCAT(_Test_,name) : public base {                              \
+ public:                                                                \
+  void _Run();                                                          \
+  static void _RunIt() {                                                \
+    TCONCAT(_Test_,name) t;                                             \
+    t._Run();                                                           \
+  }                                                                     \
+};                                                                      \
+bool TCONCAT(_Test_ignored_,name) =                                     \
+  ::rocksdb::test::RegisterTest(#base, #name, &TCONCAT(_Test_,name)::_RunIt); \
+void TCONCAT(_Test_,name)::_Run()
+
+// Register the specified test.  Typically not used directly, but
+// invoked via the macro expansion of TEST.
+extern bool RegisterTest(const char* base, const char* name, void (*func)());
+
+
+}  // namespace test
+}  // namespace rocksdb
diff --git a/util/testutil.cc b/util/testutil.cc
new file mode 100644 (file)
index 0000000..13e781e
--- /dev/null
@@ -0,0 +1,56 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/testutil.h"
+
+#include "util/random.h"
+
+namespace rocksdb {
+namespace test {
+
+Slice RandomString(Random* rnd, int len, std::string* dst) {  // fills *dst with len printable chars; returned Slice views *dst
+  dst->resize(len);
+  for (int i = 0; i < len; i++) {
+    (*dst)[i] = static_cast<char>(' ' + rnd->Uniform(95));   // ' ' .. '~'
+  }
+  return Slice(*dst);
+}
+
+std::string RandomKey(Random* rnd, int len) {
+  // Make sure to generate a wide variety of characters so we
+  // test the boundary conditions for short-key optimizations.
+  static const char kTestChars[] = {
+    '\0', '\1', 'a', 'b', 'c', 'd', 'e', '\xfd', '\xfe', '\xff'
+  };
+  std::string result;
+  for (int i = 0; i < len; i++) {
+    result += kTestChars[rnd->Uniform(sizeof(kTestChars))];
+  }
+  return result;
+}
+
+
+extern Slice CompressibleString(Random* rnd, double compressed_fraction,
+                                int len, std::string* dst) {  // repeats a small random block so *dst compresses to roughly compressed_fraction * len
+  int raw = static_cast<int>(len * compressed_fraction);
+  if (raw < 1) raw = 1;  // always at least one byte of source data
+  std::string raw_data;
+  RandomString(rnd, raw, &raw_data);
+
+  // Duplicate the random data until we have filled "len" bytes
+  dst->clear();
+  while (dst->size() < (unsigned int)len) {
+    dst->append(raw_data);
+  }
+  dst->resize(len);  // trim the final partial repetition to exactly len
+  return Slice(*dst);
+}
+
+}  // namespace test
+}  // namespace rocksdb
diff --git a/util/testutil.h b/util/testutil.h
new file mode 100644 (file)
index 0000000..4fc8c0f
--- /dev/null
@@ -0,0 +1,80 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <string>
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "util/random.h"
+
+namespace rocksdb {
+namespace test {
+
+// Store in *dst a random string of length "len" and return a Slice that
+// references the generated data.
+extern Slice RandomString(Random* rnd, int len, std::string* dst);
+
+// Return a random key with the specified length that may contain interesting
+// characters (e.g. \x00, \xff, etc.).
+extern std::string RandomKey(Random* rnd, int len);
+
+// Store in *dst a string of length "len" that will compress to
+// "N*compressed_fraction" bytes and return a Slice that references
+// the generated data.
+extern Slice CompressibleString(Random* rnd, double compressed_fraction,
+                                int len, std::string* dst);
+
+// A wrapper that allows injection of errors.
+class ErrorEnv : public EnvWrapper {  // fault-injection Env: can make every writable-file creation fail
+ public:
+  bool writable_file_error_;  // when true, NewWritableFile returns an IOError
+  int num_writable_file_errors_;  // count of injected failures
+
+  ErrorEnv() : EnvWrapper(Env::Default()),
+               writable_file_error_(false),
+               num_writable_file_errors_(0) { }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& soptions) {
+    result->reset();  // clear any previous file before possibly failing
+    if (writable_file_error_) {
+      ++num_writable_file_errors_;
+      return Status::IOError(fname, "fake error");
+    }
+    return target()->NewWritableFile(fname, result, soptions);  // delegate to the wrapped default Env
+  }
+};
+
+// An internal comparator that just forward comparing results from the
+// user comparator in it. Can be used to test entities that have no dependency
+// on internal key structure but consumes InternalKeyComparator, like
+// BlockBasedTable.
+class PlainInternalKeyComparator : public InternalKeyComparator {  // forwards every operation straight to the user comparator
+ public:
+  explicit PlainInternalKeyComparator(const Comparator* c)
+      : InternalKeyComparator(c) {}
+
+  virtual ~PlainInternalKeyComparator() {}
+
+  virtual int Compare(const Slice& a, const Slice& b) const override {  // bypasses internal-key decoding entirely
+    return user_comparator()->Compare(a, b);
+  }
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const override {
+    user_comparator()->FindShortestSeparator(start, limit);
+  }
+  virtual void FindShortSuccessor(std::string* key) const override {
+    user_comparator()->FindShortSuccessor(key);
+  }
+};
+
+}  // namespace test
+}  // namespace rocksdb
diff --git a/util/thread_local.cc b/util/thread_local.cc
new file mode 100644 (file)
index 0000000..bc8a4c7
--- /dev/null
@@ -0,0 +1,243 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/thread_local.h"
+#include "util/mutexlock.h"
+#include "port/likely.h"
+
+
+namespace rocksdb {
+
+port::Mutex ThreadLocalPtr::StaticMeta::mutex_;
+#if !defined(OS_MACOSX)
+__thread ThreadLocalPtr::ThreadData* ThreadLocalPtr::StaticMeta::tls_ = nullptr;
+#endif
+
+ThreadLocalPtr::StaticMeta* ThreadLocalPtr::Instance() {
+  static ThreadLocalPtr::StaticMeta inst;
+  return &inst;
+}
+
+void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) {
+  auto* tls = static_cast<ThreadData*>(ptr);
+  assert(tls != nullptr);
+
+  auto* inst = Instance();
+  pthread_setspecific(inst->pthread_key_, nullptr);
+
+  MutexLock l(&mutex_);
+  inst->RemoveThreadData(tls);
+  // Unref stored pointers of current thread from all instances
+  uint32_t id = 0;
+  for (auto& e : tls->entries) {
+    void* raw = e.ptr.load(std::memory_order_relaxed);
+    if (raw != nullptr) {
+      auto unref = inst->GetHandler(id);
+      if (unref != nullptr) {
+        unref(raw);
+      }
+    }
+    ++id;
+  }
+  // Delete the thread-local structure regardless of platform (Mac or not)
+  delete tls;
+}
+
+ThreadLocalPtr::StaticMeta::StaticMeta() : next_instance_id_(0) {
+  if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) {
+    throw std::runtime_error("pthread_key_create failed");
+  }
+  head_.next = &head_;
+  head_.prev = &head_;
+}
+
+void ThreadLocalPtr::StaticMeta::AddThreadData(ThreadLocalPtr::ThreadData* d) {
+  mutex_.AssertHeld();
+  d->next = &head_;
+  d->prev = head_.prev;
+  head_.prev->next = d;
+  head_.prev = d;
+}
+
+void ThreadLocalPtr::StaticMeta::RemoveThreadData(
+    ThreadLocalPtr::ThreadData* d) {
+  mutex_.AssertHeld();
+  d->next->prev = d->prev;
+  d->prev->next = d->next;
+  d->next = d->prev = d;
+}
+
+ThreadLocalPtr::ThreadData* ThreadLocalPtr::StaticMeta::GetThreadLocal() {
+#if defined(OS_MACOSX)
+  // Make this local variable name look like a member variable so that we
+  // can share all the code below
+  ThreadData* tls_ =
+      static_cast<ThreadData*>(pthread_getspecific(Instance()->pthread_key_));
+#endif
+
+  if (UNLIKELY(tls_ == nullptr)) {
+    auto* inst = Instance();
+    tls_ = new ThreadData();
+    {
+      // Register it in the global chain, needs to be done before thread exit
+      // handler registration
+      MutexLock l(&mutex_);
+      inst->AddThreadData(tls_);
+    }
+    // Even if it is not OS_MACOSX, we need to register a value for
+    // pthread_key_ so that its exit handler will be triggered.
+    if (pthread_setspecific(inst->pthread_key_, tls_) != 0) {
+      {
+        MutexLock l(&mutex_);
+        inst->RemoveThreadData(tls_);
+      }
+      delete tls_;
+      throw std::runtime_error("pthread_setspecific failed");
+    }
+  }
+  return tls_;
+}
+
+void* ThreadLocalPtr::StaticMeta::Get(uint32_t id) const {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    return nullptr;
+  }
+  return tls->entries[id].ptr.load(std::memory_order_relaxed);
+}
+
+void ThreadLocalPtr::StaticMeta::Reset(uint32_t id, void* ptr) {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    // Need mutex to protect entries access within ReclaimId
+    MutexLock l(&mutex_);
+    tls->entries.resize(id + 1);
+  }
+  tls->entries[id].ptr.store(ptr, std::memory_order_relaxed);
+}
+
+void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    // Need mutex to protect entries access within ReclaimId
+    MutexLock l(&mutex_);
+    tls->entries.resize(id + 1);
+  }
+  return tls->entries[id].ptr.exchange(ptr, std::memory_order_relaxed);
+}
+
+bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr,
+    void*& expected) {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    // Need mutex to protect entries access within ReclaimId
+    MutexLock l(&mutex_);
+    tls->entries.resize(id + 1);
+  }
+  return tls->entries[id].ptr.compare_exchange_strong(expected, ptr,
+      std::memory_order_relaxed, std::memory_order_relaxed);
+}
+
+void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector<void*>* ptrs,
+    void* const replacement) {
+  MutexLock l(&mutex_);
+  for (ThreadData* t = head_.next; t != &head_; t = t->next) {
+    if (id < t->entries.size()) {
+      void* ptr =
+          t->entries[id].ptr.exchange(replacement, std::memory_order_relaxed);
+      if (ptr != nullptr) {
+        ptrs->push_back(ptr);
+      }
+    }
+  }
+}
+
+void ThreadLocalPtr::StaticMeta::SetHandler(uint32_t id, UnrefHandler handler) {
+  MutexLock l(&mutex_);
+  handler_map_[id] = handler;
+}
+
+UnrefHandler ThreadLocalPtr::StaticMeta::GetHandler(uint32_t id) {
+  mutex_.AssertHeld();
+  auto iter = handler_map_.find(id);
+  if (iter == handler_map_.end()) {
+    return nullptr;
+  }
+  return iter->second;
+}
+
+uint32_t ThreadLocalPtr::StaticMeta::GetId() {
+  MutexLock l(&mutex_);
+  if (free_instance_ids_.empty()) {
+    return next_instance_id_++;
+  }
+
+  uint32_t id = free_instance_ids_.back();
+  free_instance_ids_.pop_back();
+  return id;
+}
+
+uint32_t ThreadLocalPtr::StaticMeta::PeekId() const {
+  MutexLock l(&mutex_);
+  if (!free_instance_ids_.empty()) {
+    return free_instance_ids_.back();
+  }
+  return next_instance_id_;
+}
+
+void ThreadLocalPtr::StaticMeta::ReclaimId(uint32_t id) {
+  // This id is not used, go through all thread local data and release
+  // corresponding value
+  MutexLock l(&mutex_);
+  auto unref = GetHandler(id);
+  for (ThreadData* t = head_.next; t != &head_; t = t->next) {
+    if (id < t->entries.size()) {
+      void* ptr =
+          t->entries[id].ptr.exchange(nullptr, std::memory_order_relaxed);
+      if (ptr != nullptr && unref != nullptr) {
+        unref(ptr);
+      }
+    }
+  }
+  handler_map_[id] = nullptr;
+  free_instance_ids_.push_back(id);
+}
+
+ThreadLocalPtr::ThreadLocalPtr(UnrefHandler handler)
+    : id_(Instance()->GetId()) {
+  if (handler != nullptr) {
+    Instance()->SetHandler(id_, handler);
+  }
+}
+
+ThreadLocalPtr::~ThreadLocalPtr() {
+  Instance()->ReclaimId(id_);
+}
+
+void* ThreadLocalPtr::Get() const {
+  return Instance()->Get(id_);
+}
+
+void ThreadLocalPtr::Reset(void* ptr) {
+  Instance()->Reset(id_, ptr);
+}
+
+void* ThreadLocalPtr::Swap(void* ptr) {
+  return Instance()->Swap(id_, ptr);
+}
+
+bool ThreadLocalPtr::CompareAndSwap(void* ptr, void*& expected) {
+  return Instance()->CompareAndSwap(id_, ptr, expected);
+}
+
+void ThreadLocalPtr::Scrape(autovector<void*>* ptrs, void* const replacement) {
+  Instance()->Scrape(id_, ptrs, replacement);
+}
+
+}  // namespace rocksdb
diff --git a/util/thread_local.h b/util/thread_local.h
new file mode 100644 (file)
index 0000000..a037a9c
--- /dev/null
@@ -0,0 +1,166 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "util/autovector.h"
+#include "port/port_posix.h"
+#include "util/thread_local.h"
+
+namespace rocksdb {
+
+// Cleanup function that will be called for a stored thread local
+// pointer (if not NULL) when one of the following happens:
+// (1) a thread terminates
+// (2) a ThreadLocalPtr is destroyed
+typedef void (*UnrefHandler)(void* ptr);
+
+// Thread local storage that only stores values of pointer type. The storage
+// distinguishes data coming from different threads and different
+// ThreadLocalPtr instances. For example, if a regular thread_local variable
+// A is declared in DBImpl, two DBImpl objects would share the same A.
+// ThreadLocalPtr avoids that conflict. The total storage size equals
+// # of threads * # of ThreadLocalPtr instances. It is not space-efficient,
+// but it should serve most of our use cases well and keep the code simple.
+class ThreadLocalPtr {
+ public:
+  explicit ThreadLocalPtr(UnrefHandler handler = nullptr);
+
+  ~ThreadLocalPtr();
+
+  // Return the current pointer stored in thread local
+  void* Get() const;
+
+  // Set a new pointer value to the thread local storage.
+  void Reset(void* ptr);
+
+  // Atomically swap the supplied ptr and return the previous value
+  void* Swap(void* ptr);
+
+  // Atomically compare the stored value with expected. Set the new
+  // pointer value to thread local only if the comparison is true.
+  // Otherwise, expected returns the stored value.
+  // Return true on success, false on failure
+  bool CompareAndSwap(void* ptr, void*& expected);
+
+  // Reset all thread local data to replacement, and return non-nullptr
+  // data for all existing threads
+  void Scrape(autovector<void*>* ptrs, void* const replacement);
+
+ protected:
+  struct Entry {
+    Entry() : ptr(nullptr) {}
+    Entry(const Entry& e) : ptr(e.ptr.load(std::memory_order_relaxed)) {}
+    std::atomic<void*> ptr;
+  };
+
+  // This is the structure that is declared as "thread_local" storage.
+  // The vector keep list of atomic pointer for all instances for "current"
+  // thread. The vector is indexed by an Id that is unique in process and
+  // associated with one ThreadLocalPtr instance. The Id is assigned by a
+  // global StaticMeta singleton. So if we instantiated 3 ThreadLocalPtr
+  // instances, each thread will have a ThreadData with a vector of size 3:
+  //     ---------------------------------------------------
+  //     |          | instance 1 | instance 2 | instance 3 |
+  //     ---------------------------------------------------
+  //     | thread 1 |    void*   |    void*   |    void*   | <- ThreadData
+  //     ---------------------------------------------------
+  //     | thread 2 |    void*   |    void*   |    void*   | <- ThreadData
+  //     ---------------------------------------------------
+  //     | thread 3 |    void*   |    void*   |    void*   | <- ThreadData
+  //     ---------------------------------------------------
+  struct ThreadData {
+    ThreadData() : entries() {}
+    std::vector<Entry> entries;
+    ThreadData* next;
+    ThreadData* prev;
+  };
+
+  class StaticMeta {
+   public:
+    StaticMeta();
+
+    // Return the next available Id
+    uint32_t GetId();
+    // Return the next available Id without claiming it
+    uint32_t PeekId() const;
+    // Return the given Id back to the free pool. This also triggers
+    // UnrefHandler for associated pointer value (if not NULL) for all threads.
+    void ReclaimId(uint32_t id);
+
+    // Return the pointer value for the given id for the current thread.
+    void* Get(uint32_t id) const;
+    // Reset the pointer value for the given id for the current thread.
+    // It triggers UnrefHandler if the id has an existing pointer value.
+    void Reset(uint32_t id, void* ptr);
+    // Atomically swap the supplied ptr and return the previous value
+    void* Swap(uint32_t id, void* ptr);
+    // Atomically compare and swap the provided value only if it equals
+    // to expected value.
+    bool CompareAndSwap(uint32_t id, void* ptr, void*& expected);
+    // Reset all thread local data to replacement, and return non-nullptr
+    // data for all existing threads
+    void Scrape(uint32_t id, autovector<void*>* ptrs, void* const replacement);
+
+    // Register the UnrefHandler for id
+    void SetHandler(uint32_t id, UnrefHandler handler);
+
+   private:
+    // Get UnrefHandler for id with acquiring mutex
+    // REQUIRES: mutex locked
+    UnrefHandler GetHandler(uint32_t id);
+
+    // Triggered before a thread terminates
+    static void OnThreadExit(void* ptr);
+
+    // Add current thread's ThreadData to the global chain
+    // REQUIRES: mutex locked
+    void AddThreadData(ThreadData* d);
+
+    // Remove current thread's ThreadData from the global chain
+    // REQUIRES: mutex locked
+    void RemoveThreadData(ThreadData* d);
+
+    static ThreadData* GetThreadLocal();
+
+    uint32_t next_instance_id_;
+    // Used to recycle Ids in case ThreadLocalPtr is instantiated and destroyed
+    // frequently. This also prevents it from blowing up the vector space.
+    autovector<uint32_t> free_instance_ids_;
+    // Chain all thread local structure together. This is necessary since
+    // when one ThreadLocalPtr gets destroyed, we need to loop over each
+    // thread's version of pointer corresponding to that instance and
+    // call UnrefHandler for it.
+    ThreadData head_;
+
+    std::unordered_map<uint32_t, UnrefHandler> handler_map_;
+
+    // protect inst, next_instance_id_, free_instance_ids_, head_,
+    // ThreadData.entries
+    static port::Mutex mutex_;
+#if !defined(OS_MACOSX)
+    // Thread local storage
+    static __thread ThreadData* tls_;
+#endif
+    // Used to make thread exit trigger possible if !defined(OS_MACOSX).
+    // Otherwise, used to retrieve thread data.
+    pthread_key_t pthread_key_;
+  };
+
+  static StaticMeta* Instance();
+
+  const uint32_t id_;
+};
+
+}  // namespace rocksdb
diff --git a/util/thread_local_test.cc b/util/thread_local_test.cc
new file mode 100644 (file)
index 0000000..70dfa95
--- /dev/null
@@ -0,0 +1,472 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <atomic>
+
+#include "rocksdb/env.h"
+#include "port/port_posix.h"
+#include "util/autovector.h"
+#include "util/thread_local.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class ThreadLocalTest {
+ public:
+  ThreadLocalTest() : env_(Env::Default()) {}
+
+  Env* env_;
+};
+
+namespace {
+
+struct Params {
+  Params(port::Mutex* m, port::CondVar* c, int* unref, int n,
+         UnrefHandler handler = nullptr)
+      : mu(m),
+        cv(c),
+        unref(unref),
+        total(n),
+        started(0),
+        completed(0),
+        doWrite(false),
+        tls1(handler),
+        tls2(nullptr) {}
+
+  port::Mutex* mu;
+  port::CondVar* cv;
+  int* unref;
+  int total;
+  int started;
+  int completed;
+  bool doWrite;
+  ThreadLocalPtr tls1;
+  ThreadLocalPtr* tls2;
+};
+
+class IDChecker : public ThreadLocalPtr {
+ public:
+  static uint32_t PeekId() { return Instance()->PeekId(); }
+};
+
+}  // anonymous namespace
+
+TEST(ThreadLocalTest, UniqueIdTest) {
+  port::Mutex mu;
+  port::CondVar cv(&mu);
+
+  ASSERT_EQ(IDChecker::PeekId(), 0u);
+  // New ThreadLocal instance bumps id by 1
+  {
+    // Id used 0
+    Params p1(&mu, &cv, nullptr, 1u);
+    ASSERT_EQ(IDChecker::PeekId(), 1u);
+    // Id used 1
+    Params p2(&mu, &cv, nullptr, 1u);
+    ASSERT_EQ(IDChecker::PeekId(), 2u);
+    // Id used 2
+    Params p3(&mu, &cv, nullptr, 1u);
+    ASSERT_EQ(IDChecker::PeekId(), 3u);
+    // Id used 3
+    Params p4(&mu, &cv, nullptr, 1u);
+    ASSERT_EQ(IDChecker::PeekId(), 4u);
+  }
+  // id 3, 2, 1, 0 are in the free queue in order
+  ASSERT_EQ(IDChecker::PeekId(), 0u);
+
+  // pick up 0
+  Params p1(&mu, &cv, nullptr, 1u);
+  ASSERT_EQ(IDChecker::PeekId(), 1u);
+  // pick up 1
+  Params* p2 = new Params(&mu, &cv, nullptr, 1u);
+  ASSERT_EQ(IDChecker::PeekId(), 2u);
+  // pick up 2
+  Params p3(&mu, &cv, nullptr, 1u);
+  ASSERT_EQ(IDChecker::PeekId(), 3u);
+  // return id 1 back to the free queue
+  delete p2;
+  ASSERT_EQ(IDChecker::PeekId(), 1u);
+  // Now we have 3, 1 in queue
+  // pick up 1
+  Params p4(&mu, &cv, nullptr, 1u);
+  ASSERT_EQ(IDChecker::PeekId(), 3u);
+  // pick up 3
+  Params p5(&mu, &cv, nullptr, 1u);
+  // next new id
+  ASSERT_EQ(IDChecker::PeekId(), 4u);
+  // After exit, id sequence in queue:
+  // 3, 1, 2, 0
+}
+
+TEST(ThreadLocalTest, SequentialReadWriteTest) {
+  // global id list carries over 3, 1, 2, 0
+  ASSERT_EQ(IDChecker::PeekId(), 0u);
+
+  port::Mutex mu;
+  port::CondVar cv(&mu);
+  Params p(&mu, &cv, nullptr, 1);
+  ThreadLocalPtr tls2;
+  p.tls2 = &tls2;
+
+  auto func = [](void* ptr) {
+    auto& p = *static_cast<Params*>(ptr);
+
+    ASSERT_TRUE(p.tls1.Get() == nullptr);
+    p.tls1.Reset(reinterpret_cast<int*>(1));
+    ASSERT_TRUE(p.tls1.Get() == reinterpret_cast<int*>(1));
+    p.tls1.Reset(reinterpret_cast<int*>(2));
+    ASSERT_TRUE(p.tls1.Get() == reinterpret_cast<int*>(2));
+
+    ASSERT_TRUE(p.tls2->Get() == nullptr);
+    p.tls2->Reset(reinterpret_cast<int*>(1));
+    ASSERT_TRUE(p.tls2->Get() == reinterpret_cast<int*>(1));
+    p.tls2->Reset(reinterpret_cast<int*>(2));
+    ASSERT_TRUE(p.tls2->Get() == reinterpret_cast<int*>(2));
+
+    p.mu->Lock();
+    ++(p.completed);
+    p.cv->SignalAll();
+    p.mu->Unlock();
+  };
+
+  for (int iter = 0; iter < 1024; ++iter) {
+    ASSERT_EQ(IDChecker::PeekId(), 1u);
+    // Another new thread, read/write should not see value from previous thread
+    env_->StartThread(func, static_cast<void*>(&p));
+    mu.Lock();
+    while (p.completed != iter + 1) {
+      cv.Wait();
+    }
+    mu.Unlock();
+    ASSERT_EQ(IDChecker::PeekId(), 1u);
+  }
+}
+
+TEST(ThreadLocalTest, ConcurrentReadWriteTest) {
+  // global id list carries over 3, 1, 2, 0
+  ASSERT_EQ(IDChecker::PeekId(), 0u);
+
+  ThreadLocalPtr tls2;
+  port::Mutex mu1;
+  port::CondVar cv1(&mu1);
+  Params p1(&mu1, &cv1, nullptr, 16);
+  p1.tls2 = &tls2;
+
+  port::Mutex mu2;
+  port::CondVar cv2(&mu2);
+  Params p2(&mu2, &cv2, nullptr, 16);
+  p2.doWrite = true;
+  p2.tls2 = &tls2;
+
+  auto func = [](void* ptr) {
+    auto& p = *static_cast<Params*>(ptr);
+
+    p.mu->Lock();
+    int own = ++(p.started);
+    p.cv->SignalAll();
+    while (p.started != p.total) {
+      p.cv->Wait();
+    }
+    p.mu->Unlock();
+
+    // Let write threads write a different value from the read threads
+    if (p.doWrite) {
+      own += 8192;
+    }
+
+    ASSERT_TRUE(p.tls1.Get() == nullptr);
+    ASSERT_TRUE(p.tls2->Get() == nullptr);
+
+    auto* env = Env::Default();
+    auto start = env->NowMicros();
+
+    p.tls1.Reset(reinterpret_cast<int*>(own));
+    p.tls2->Reset(reinterpret_cast<int*>(own + 1));
+    // Loop for 1 second
+    while (env->NowMicros() - start < 1000 * 1000) {
+      for (int iter = 0; iter < 100000; ++iter) {
+        ASSERT_TRUE(p.tls1.Get() == reinterpret_cast<int*>(own));
+        ASSERT_TRUE(p.tls2->Get() == reinterpret_cast<int*>(own + 1));
+        if (p.doWrite) {
+          p.tls1.Reset(reinterpret_cast<int*>(own));
+          p.tls2->Reset(reinterpret_cast<int*>(own + 1));
+        }
+      }
+    }
+
+    p.mu->Lock();
+    ++(p.completed);
+    p.cv->SignalAll();
+    p.mu->Unlock();
+  };
+
+  // Initiate 2 instances: one keeps writing and one keeps reading.
+  // The read instance should not see data from the write instance.
+  // Each thread local copy of the value are also different from each
+  // other.
+  for (int th = 0; th < p1.total; ++th) {
+    env_->StartThread(func, static_cast<void*>(&p1));
+  }
+  for (int th = 0; th < p2.total; ++th) {
+    env_->StartThread(func, static_cast<void*>(&p2));
+  }
+
+  mu1.Lock();
+  while (p1.completed != p1.total) {
+    cv1.Wait();
+  }
+  mu1.Unlock();
+
+  mu2.Lock();
+  while (p2.completed != p2.total) {
+    cv2.Wait();
+  }
+  mu2.Unlock();
+
+  ASSERT_EQ(IDChecker::PeekId(), 3u);
+}
+
+TEST(ThreadLocalTest, Unref) {
+  ASSERT_EQ(IDChecker::PeekId(), 0u);
+
+  auto unref = [](void* ptr) {
+    auto& p = *static_cast<Params*>(ptr);
+    p.mu->Lock();
+    ++(*p.unref);
+    p.mu->Unlock();
+  };
+
+  // Case 0: no unref triggered if ThreadLocalPtr is never accessed
+  auto func0 = [](void* ptr) {
+    auto& p = *static_cast<Params*>(ptr);
+
+    p.mu->Lock();
+    ++(p.started);
+    p.cv->SignalAll();
+    while (p.started != p.total) {
+      p.cv->Wait();
+    }
+    p.mu->Unlock();
+  };
+
+  for (int th = 1; th <= 128; th += th) {
+    port::Mutex mu;
+    port::CondVar cv(&mu);
+    int unref_count = 0;
+    Params p(&mu, &cv, &unref_count, th, unref);
+
+    for (int i = 0; i < p.total; ++i) {
+      env_->StartThread(func0, static_cast<void*>(&p));
+    }
+    env_->WaitForJoin();
+    ASSERT_EQ(unref_count, 0);
+  }
+
+  // Case 1: unref triggered by thread exit
+  auto func1 = [](void* ptr) {
+    auto& p = *static_cast<Params*>(ptr);
+
+    p.mu->Lock();
+    ++(p.started);
+    p.cv->SignalAll();
+    while (p.started != p.total) {
+      p.cv->Wait();
+    }
+    p.mu->Unlock();
+
+    ASSERT_TRUE(p.tls1.Get() == nullptr);
+    ASSERT_TRUE(p.tls2->Get() == nullptr);
+
+    p.tls1.Reset(ptr);
+    p.tls2->Reset(ptr);
+
+    p.tls1.Reset(ptr);
+    p.tls2->Reset(ptr);
+  };
+
+  for (int th = 1; th <= 128; th += th) {
+    port::Mutex mu;
+    port::CondVar cv(&mu);
+    int unref_count = 0;
+    ThreadLocalPtr tls2(unref);
+    Params p(&mu, &cv, &unref_count, th, unref);
+    p.tls2 = &tls2;
+
+    for (int i = 0; i < p.total; ++i) {
+      env_->StartThread(func1, static_cast<void*>(&p));
+    }
+
+    env_->WaitForJoin();
+
+    // N threads x 2 ThreadLocal instance cleanup on thread exit
+    ASSERT_EQ(unref_count, 2 * p.total);
+  }
+
+  // Case 2: unref triggered by ThreadLocal instance destruction
+  auto func2 = [](void* ptr) {
+    auto& p = *static_cast<Params*>(ptr);
+
+    p.mu->Lock();
+    ++(p.started);
+    p.cv->SignalAll();
+    while (p.started != p.total) {
+      p.cv->Wait();
+    }
+    p.mu->Unlock();
+
+    ASSERT_TRUE(p.tls1.Get() == nullptr);
+    ASSERT_TRUE(p.tls2->Get() == nullptr);
+
+    p.tls1.Reset(ptr);
+    p.tls2->Reset(ptr);
+
+    p.tls1.Reset(ptr);
+    p.tls2->Reset(ptr);
+
+    p.mu->Lock();
+    ++(p.completed);
+    p.cv->SignalAll();
+
+    // Waiting for instruction to exit thread
+    while (p.completed != 0) {
+      p.cv->Wait();
+    }
+    p.mu->Unlock();
+  };
+
+  for (int th = 1; th <= 128; th += th) {
+    port::Mutex mu;
+    port::CondVar cv(&mu);
+    int unref_count = 0;
+    Params p(&mu, &cv, &unref_count, th, unref);
+    p.tls2 = new ThreadLocalPtr(unref);
+
+    for (int i = 0; i < p.total; ++i) {
+      env_->StartThread(func2, static_cast<void*>(&p));
+    }
+
+    // Wait for all threads to finish using Params
+    mu.Lock();
+    while (p.completed != p.total) {
+      cv.Wait();
+    }
+    mu.Unlock();
+
+    // Now destroy one ThreadLocal instance
+    delete p.tls2;
+    p.tls2 = nullptr;
+    // instance destroy for N threads
+    ASSERT_EQ(unref_count, p.total);
+
+    // Signal to exit
+    mu.Lock();
+    p.completed = 0;
+    cv.SignalAll();
+    mu.Unlock();
+    env_->WaitForJoin();
+    // additional N threads exit unref for the left instance
+    ASSERT_EQ(unref_count, 2 * p.total);
+  }
+}
+
+TEST(ThreadLocalTest, Swap) {
+  ThreadLocalPtr tls;
+  tls.Reset(reinterpret_cast<void*>(1));
+  ASSERT_EQ(reinterpret_cast<int64_t>(tls.Swap(nullptr)), 1);
+  ASSERT_TRUE(tls.Swap(reinterpret_cast<void*>(2)) == nullptr);
+  ASSERT_EQ(reinterpret_cast<int64_t>(tls.Get()), 2);
+  ASSERT_EQ(reinterpret_cast<int64_t>(tls.Swap(reinterpret_cast<void*>(3))), 2);
+}
+
+TEST(ThreadLocalTest, Scrape) {
+  auto unref = [](void* ptr) {
+    auto& p = *static_cast<Params*>(ptr);
+    p.mu->Lock();
+    ++(*p.unref);
+    p.mu->Unlock();
+  };
+
+  auto func = [](void* ptr) {
+    auto& p = *static_cast<Params*>(ptr);
+
+    ASSERT_TRUE(p.tls1.Get() == nullptr);
+    ASSERT_TRUE(p.tls2->Get() == nullptr);
+
+    p.tls1.Reset(ptr);
+    p.tls2->Reset(ptr);
+
+    p.tls1.Reset(ptr);
+    p.tls2->Reset(ptr);
+
+    p.mu->Lock();
+    ++(p.completed);
+    p.cv->SignalAll();
+
+    // Waiting for instruction to exit thread
+    while (p.completed != 0) {
+      p.cv->Wait();
+    }
+    p.mu->Unlock();
+  };
+
+  for (int th = 1; th <= 128; th += th) {
+    port::Mutex mu;
+    port::CondVar cv(&mu);
+    int unref_count = 0;
+    Params p(&mu, &cv, &unref_count, th, unref);
+    p.tls2 = new ThreadLocalPtr(unref);
+
+    for (int i = 0; i < p.total; ++i) {
+      env_->StartThread(func, static_cast<void*>(&p));
+    }
+
+    // Wait for all threads to finish using Params
+    mu.Lock();
+    while (p.completed != p.total) {
+      cv.Wait();
+    }
+    mu.Unlock();
+
+    ASSERT_EQ(unref_count, 0);
+
+    // Scrape all thread local data. No unref at thread
+    // exit or ThreadLocalPtr destruction
+    autovector<void*> ptrs;
+    p.tls1.Scrape(&ptrs, nullptr);
+    p.tls2->Scrape(&ptrs, nullptr);
+    delete p.tls2;
+    // Signal to exit
+    mu.Lock();
+    p.completed = 0;
+    cv.SignalAll();
+    mu.Unlock();
+    env_->WaitForJoin();
+
+    ASSERT_EQ(unref_count, 0);
+  }
+}
+
+TEST(ThreadLocalTest, CompareAndSwap) {
+  ThreadLocalPtr tls;
+  ASSERT_TRUE(tls.Swap(reinterpret_cast<void*>(1)) == nullptr);
+  void* expected = reinterpret_cast<void*>(1);
+  // Swap in 2
+  ASSERT_TRUE(tls.CompareAndSwap(reinterpret_cast<void*>(2), expected));
+  expected = reinterpret_cast<void*>(100);
+  // Fail Swap, still 2
+  ASSERT_TRUE(!tls.CompareAndSwap(reinterpret_cast<void*>(2), expected));
+  ASSERT_EQ(expected, reinterpret_cast<void*>(2));
+  // Swap in 3
+  expected = reinterpret_cast<void*>(2);
+  ASSERT_TRUE(tls.CompareAndSwap(reinterpret_cast<void*>(3), expected));
+  ASSERT_EQ(tls.Get(), reinterpret_cast<void*>(3));
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/vectorrep.cc b/util/vectorrep.cc
new file mode 100644 (file)
index 0000000..c7f9cca
--- /dev/null
@@ -0,0 +1,282 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef ROCKSDB_LITE
+#include "rocksdb/memtablerep.h"
+
+#include <unordered_set>
+#include <set>
+#include <memory>
+#include <algorithm>
+#include <type_traits>
+
+#include "util/arena.h"
+#include "db/memtable.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+#include "util/stl_wrappers.h"
+
+namespace rocksdb {
+namespace {
+
+using namespace stl_wrappers;
+
+class VectorRep : public MemTableRep {
+ public:
+  VectorRep(const KeyComparator& compare, Arena* arena, size_t count);
+
+  // Insert key into the collection. (The caller will pack key and value into a
+  // single buffer and pass that in as the parameter to Insert)
+  // REQUIRES: nothing that compares equal to key is currently in the
+  // collection.
+  virtual void Insert(KeyHandle handle) override;
+
+  // Returns true iff an entry that compares equal to key is in the collection.
+  virtual bool Contains(const char* key) const override;
+
+  virtual void MarkReadOnly() override;
+
+  virtual size_t ApproximateMemoryUsage() override;
+
+  virtual void Get(const LookupKey& k, void* callback_args,
+                   bool (*callback_func)(void* arg,
+                                         const char* entry)) override;
+
+  virtual ~VectorRep() override { }
+
+  class Iterator : public MemTableRep::Iterator {
+    class VectorRep* vrep_;
+    std::shared_ptr<std::vector<const char*>> bucket_;
+    typename std::vector<const char*>::const_iterator mutable cit_;
+    const KeyComparator& compare_;
+    std::string tmp_;       // For passing to EncodeKey
+    bool mutable sorted_;
+    void DoSort() const;
+   public:
+    explicit Iterator(class VectorRep* vrep,
+      std::shared_ptr<std::vector<const char*>> bucket,
+      const KeyComparator& compare);
+
+    // Initialize an iterator over the specified collection.
+    // The returned iterator is not valid.
+    // explicit Iterator(const MemTableRep* collection);
+    virtual ~Iterator() override { };
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const override;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const override;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() override;
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() override;
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& user_key, const char* memtable_key) override;
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() override;
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() override;
+  };
+
+  // Unhide default implementations of GetIterator()
+  using MemTableRep::GetIterator;
+
+  // Return an iterator over the keys in this representation.
+  virtual MemTableRep::Iterator* GetIterator() override;
+
+ private:
+  friend class Iterator;
+  typedef std::vector<const char*> Bucket;
+  std::shared_ptr<Bucket> bucket_;
+  mutable port::RWMutex rwlock_;
+  bool immutable_;
+  bool sorted_;
+  const KeyComparator& compare_;
+};
+
+void VectorRep::Insert(KeyHandle handle) {
+  auto* key = static_cast<char*>(handle);
+  assert(!Contains(key));
+  WriteLock l(&rwlock_);
+  assert(!immutable_);
+  bucket_->push_back(key);
+}
+
+// Returns true iff an entry that compares equal to key is in the collection.
+bool VectorRep::Contains(const char* key) const {
+  ReadLock l(&rwlock_);
+  return std::find(bucket_->begin(), bucket_->end(), key) != bucket_->end();
+}
+
+void VectorRep::MarkReadOnly() {
+  WriteLock l(&rwlock_);
+  immutable_ = true;
+}
+
+size_t VectorRep::ApproximateMemoryUsage() {
+  return
+    sizeof(bucket_) + sizeof(*bucket_) +
+    bucket_->size() *
+    sizeof(
+      std::remove_reference<decltype(*bucket_)>::type::value_type
+    );
+}
+
+VectorRep::VectorRep(const KeyComparator& compare, Arena* arena, size_t count)
+  : MemTableRep(arena),
+    bucket_(new Bucket()),
+    immutable_(false),
+    sorted_(false),
+    compare_(compare) { bucket_.get()->reserve(count); }
+
+VectorRep::Iterator::Iterator(class VectorRep* vrep,
+                   std::shared_ptr<std::vector<const char*>> bucket,
+                   const KeyComparator& compare)
+: vrep_(vrep),
+  bucket_(bucket),
+  cit_(bucket_->end()),
+  compare_(compare),
+  sorted_(false) { }
+
+void VectorRep::Iterator::DoSort() const {
+  // A non-null vrep means that we are working on an immutable memtable
+  if (!sorted_ && vrep_ != nullptr) {
+    WriteLock l(&vrep_->rwlock_);
+    if (!vrep_->sorted_) {
+      std::sort(bucket_->begin(), bucket_->end(), Compare(compare_));
+      cit_ = bucket_->begin();
+      vrep_->sorted_ = true;
+    }
+    sorted_ = true;
+  }
+  if (!sorted_) {
+    std::sort(bucket_->begin(), bucket_->end(), Compare(compare_));
+    cit_ = bucket_->begin();
+    sorted_ = true;
+  }
+  assert(sorted_);
+  assert(vrep_ == nullptr || vrep_->sorted_);
+}
+
+// Returns true iff the iterator is positioned at a valid node.
+bool VectorRep::Iterator::Valid() const {
+  DoSort();
+  return cit_ != bucket_->end();
+}
+
+// Returns the key at the current position.
+// REQUIRES: Valid()
+const char* VectorRep::Iterator::key() const {
+  assert(Valid());
+  return *cit_;
+}
+
+// Advances to the next position.
+// REQUIRES: Valid()
+void VectorRep::Iterator::Next() {
+  assert(Valid());
+  if (cit_ == bucket_->end()) {
+    return;
+  }
+  ++cit_;
+}
+
+// Advances to the previous position.
+// REQUIRES: Valid()
+void VectorRep::Iterator::Prev() {
+  assert(Valid());
+  if (cit_ == bucket_->begin()) {
+    // If you try to go back from the first element, the iterator should be
+    // invalidated. So we set it to past-the-end. This means that you can
+    // treat the container circularly.
+    cit_ = bucket_->end();
+  } else {
+    --cit_;
+  }
+}
+
+// Advance to the first entry with a key >= target
+void VectorRep::Iterator::Seek(const Slice& user_key,
+                               const char* memtable_key) {
+  DoSort();
+  // Do binary search to find first value not less than the target
+  const char* encoded_key =
+      (memtable_key != nullptr) ? memtable_key : EncodeKey(&tmp_, user_key);
+  cit_ = std::equal_range(bucket_->begin(),
+                          bucket_->end(),
+                          encoded_key,
+                          [this] (const char* a, const char* b) {
+                            return compare_(a, b) < 0;
+                          }).first;
+}
+
+// Position at the first entry in collection.
+// Final state of iterator is Valid() iff collection is not empty.
+void VectorRep::Iterator::SeekToFirst() {
+  DoSort();
+  cit_ = bucket_->begin();
+}
+
+// Position at the last entry in collection.
+// Final state of iterator is Valid() iff collection is not empty.
+void VectorRep::Iterator::SeekToLast() {
+  DoSort();
+  cit_ = bucket_->end();
+  if (bucket_->size() != 0) {
+    --cit_;
+  }
+}
+
+void VectorRep::Get(const LookupKey& k, void* callback_args,
+                    bool (*callback_func)(void* arg, const char* entry)) {
+  rwlock_.ReadLock();
+  VectorRep* vector_rep;
+  std::shared_ptr<Bucket> bucket;
+  if (immutable_) {
+    vector_rep = this;
+  } else {
+    vector_rep = nullptr;
+    bucket.reset(new Bucket(*bucket_));  // make a copy
+  }
+  VectorRep::Iterator iter(vector_rep, immutable_ ? bucket_ : bucket, compare_);
+  rwlock_.Unlock();
+
+  for (iter.Seek(k.user_key(), k.memtable_key().data());
+       iter.Valid() && callback_func(callback_args, iter.key()); iter.Next()) {
+  }
+}
+
+MemTableRep::Iterator* VectorRep::GetIterator() {
+  ReadLock l(&rwlock_);
+  // Do not sort here. The sorting would be done the first time
+  // a Seek is performed on the iterator.
+  if (immutable_) {
+    return new Iterator(this, bucket_, compare_);
+  } else {
+    std::shared_ptr<Bucket> tmp;
+    tmp.reset(new Bucket(*bucket_)); // make a copy
+    return new Iterator(nullptr, tmp, compare_);
+  }
+}
+} // anon namespace
+
+MemTableRep* VectorRepFactory::CreateMemTableRep(
+    const MemTableRep::KeyComparator& compare, Arena* arena,
+    const SliceTransform*) {
+  return new VectorRep(compare, arena, count_);
+}
+} // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/util/xxhash.cc b/util/xxhash.cc
new file mode 100644 (file)
index 0000000..6dfd4b2
--- /dev/null
@@ -0,0 +1,475 @@
+/*
+xxHash - Fast Hash algorithm
+Copyright (C) 2012-2014, Yann Collet.
+BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+You can contact the author at :
+- xxHash source repository : http://code.google.com/p/xxhash/
+*/
+
+
+//**************************************
+// Tuning parameters
+//**************************************
+// Unaligned memory access is automatically enabled for "common" CPU, such as x86.
+// For other CPUs, the compiler will be more cautious, and insert extra code to ensure aligned access is respected.
+// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance.
+// You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32).
+#if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#  define XXH_USE_UNALIGNED_ACCESS 1
+#endif
+
+// XXH_ACCEPT_NULL_INPUT_POINTER :
+// If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
+// When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
+// This option has a very small performance cost (only measurable on small inputs).
+// By default, this option is disabled. To enable it, uncomment below define :
+//#define XXH_ACCEPT_NULL_INPUT_POINTER 1
+
+// XXH_FORCE_NATIVE_FORMAT :
+// By default, xxHash library provides endian-independent Hash values, based on little-endian convention.
+// Results are therefore identical for little-endian and big-endian CPU.
+// This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
+// Should endian-independence be of no importance for your application, you may set the #define below to 1.
+// It will improve speed for big-endian CPUs.
+// This option has no impact on little-endian CPUs.
+#define XXH_FORCE_NATIVE_FORMAT 0
+
+
+//**************************************
+// Compiler Specific Options
+//**************************************
+// Disable some Visual warning messages
+#ifdef _MSC_VER  // Visual Studio
+#  pragma warning(disable : 4127)      // disable: C4127: conditional expression is constant
+#endif
+
+#ifdef _MSC_VER    // Visual Studio
+#  define FORCE_INLINE static __forceinline
+#else
+#  ifdef __GNUC__
+#    define FORCE_INLINE static inline __attribute__((always_inline))
+#  else
+#    define FORCE_INLINE static inline
+#  endif
+#endif
+
+
+//**************************************
+// Includes & Memory related functions
+//**************************************
+#include "xxhash.h"
+// Modify the local functions below should you wish to use some other memory related routines
+// for malloc(), free()
+#include <stdlib.h>
+FORCE_INLINE void* XXH_malloc(size_t s) { return malloc(s); }
+FORCE_INLINE void  XXH_free  (void* p)  { free(p); }
+// for memcpy()
+#include <string.h>
+FORCE_INLINE void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+
+
+//**************************************
+// Basic Types
+//**************************************
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   // C99
+# include <stdint.h>
+  typedef uint8_t  BYTE;
+  typedef uint16_t U16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+#else
+  typedef unsigned char      BYTE;
+  typedef unsigned short     U16;
+  typedef unsigned int       U32;
+  typedef   signed int       S32;
+  typedef unsigned long long U64;
+#endif
+
+#if defined(__GNUC__)  && !defined(XXH_USE_UNALIGNED_ACCESS)
+#  define _PACKED __attribute__ ((packed))
+#else
+#  define _PACKED
+#endif
+
+#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+#  ifdef __IBMC__
+#    pragma pack(1)
+#  else
+#    pragma pack(push, 1)
+#  endif
+#endif
+
+typedef struct _U32_S { U32 v; } _PACKED U32_S;
+
+#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+#  pragma pack(pop)
+#endif
+
+#define A32(x) (((U32_S *)(x))->v)
+
+
+//***************************************
+// Compiler-specific Functions and Macros
+//***************************************
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+// Note : although _rotl exists for minGW (GCC under windows), performance seems poor
+#if defined(_MSC_VER)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#else
+#  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+#endif
+
+#if defined(_MSC_VER)     // Visual Studio
+#  define XXH_swap32 _byteswap_ulong
+#elif GCC_VERSION >= 403
+#  define XXH_swap32 __builtin_bswap32
+#else
+static inline U32 XXH_swap32 (U32 x) {
+    return  ((x << 24) & 0xff000000 ) |
+        ((x <<  8) & 0x00ff0000 ) |
+        ((x >>  8) & 0x0000ff00 ) |
+        ((x >> 24) & 0x000000ff );}
+#endif
+
+
+//**************************************
+// Constants
+//**************************************
+#define PRIME32_1   2654435761U
+#define PRIME32_2   2246822519U
+#define PRIME32_3   3266489917U
+#define PRIME32_4    668265263U
+#define PRIME32_5    374761393U
+
+
+//**************************************
+// Architecture Macros
+//**************************************
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+#ifndef XXH_CPU_LITTLE_ENDIAN   // It is possible to define XXH_CPU_LITTLE_ENDIAN externally, for example using a compiler switch
+    static const int one = 1;
+#   define XXH_CPU_LITTLE_ENDIAN   (*(char*)(&one))
+#endif
+
+
+//**************************************
+// Macros
+//**************************************
+#define XXH_STATIC_ASSERT(c)   { enum { XXH_static_assert = 1/(!!(c)) }; }    // use only *after* variable declarations
+
+
+//****************************
+// Memory reads
+//****************************
+typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+FORCE_INLINE U32 XXH_readLE32_align(const U32* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return endian==XXH_littleEndian ? A32(ptr) : XXH_swap32(A32(ptr));
+    else
+        return endian==XXH_littleEndian ? *ptr : XXH_swap32(*ptr);
+}
+
+FORCE_INLINE U32 XXH_readLE32(const U32* ptr, XXH_endianess endian) { return XXH_readLE32_align(ptr, endian, XXH_unaligned); }
+
+
+//****************************
+// Simple Hash Functions
+//****************************
+FORCE_INLINE U32 XXH32_endian_align(const void* input, int len, U32 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+    U32 h32;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL) { len=0; p=(const BYTE*)(size_t)16; }
+#endif
+
+    if (len>=16)
+    {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = seed + PRIME32_1 + PRIME32_2;
+        U32 v2 = seed + PRIME32_2;
+        U32 v3 = seed + 0;
+        U32 v4 = seed - PRIME32_1;
+
+        do
+        {
+            v1 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
+            v2 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
+            v3 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
+            v4 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
+        } while (p<=limit);
+
+        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    }
+    else
+    {
+        h32  = seed + PRIME32_5;
+    }
+
+    h32 += (U32) len;
+
+    while (p<=bEnd-4)
+    {
+        h32 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+U32 XXH32(const void* input, int len, U32 seed)
+{
+#if 0
+    // Simple version, good for code maintenance, but unfortunately slow for small inputs
+    void* state = XXH32_init(seed);
+    XXH32_update(state, input, len);
+    return XXH32_digest(state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+#  if !defined(XXH_USE_UNALIGNED_ACCESS)
+    if ((((size_t)input) & 3))   // Input is aligned, let's leverage the speed advantage
+    {
+        if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+            return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+        else
+            return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }
+#  endif
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+//****************************
+// Advanced Hash Functions
+//****************************
+
+struct XXH_state32_t
+{
+    U64 total_len;
+    U32 seed;
+    U32 v1;
+    U32 v2;
+    U32 v3;
+    U32 v4;
+    int memsize;
+    char memory[16];
+};
+
+
+int XXH32_sizeofState()
+{
+    XXH_STATIC_ASSERT(XXH32_SIZEOFSTATE >= sizeof(struct XXH_state32_t));   // A compilation error here means XXH32_SIZEOFSTATE is not large enough
+    return sizeof(struct XXH_state32_t);
+}
+
+
+XXH_errorcode XXH32_resetState(void* state_in, U32 seed)
+{
+    struct XXH_state32_t * state = (struct XXH_state32_t *) state_in;
+    state->seed = seed;
+    state->v1 = seed + PRIME32_1 + PRIME32_2;
+    state->v2 = seed + PRIME32_2;
+    state->v3 = seed + 0;
+    state->v4 = seed - PRIME32_1;
+    state->total_len = 0;
+    state->memsize = 0;
+    return XXH_OK;
+}
+
+
+void* XXH32_init (U32 seed)
+{
+    void* state = XXH_malloc (sizeof(struct XXH_state32_t));
+    XXH32_resetState(state, seed);
+    return state;
+}
+
+
+FORCE_INLINE XXH_errorcode XXH32_update_endian (void* state_in, const void* input, int len, XXH_endianess endian)
+{
+    struct XXH_state32_t * state = (struct XXH_state32_t *) state_in;
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
+#endif
+
+    state->total_len += len;
+
+    if (state->memsize + len < 16)   // fill in tmp buffer
+    {
+        XXH_memcpy(state->memory + state->memsize, input, len);
+        state->memsize +=  len;
+        return XXH_OK;
+    }
+
+    if (state->memsize)   // some data left from previous update
+    {
+        XXH_memcpy(state->memory + state->memsize, input, 16-state->memsize);
+        {
+            const U32* p32 = (const U32*)state->memory;
+            state->v1 += XXH_readLE32(p32, endian) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; p32++;
+            state->v2 += XXH_readLE32(p32, endian) * PRIME32_2; state->v2 = XXH_rotl32(state->v2, 13); state->v2 *= PRIME32_1; p32++;
+            state->v3 += XXH_readLE32(p32, endian) * PRIME32_2; state->v3 = XXH_rotl32(state->v3, 13); state->v3 *= PRIME32_1; p32++;
+            state->v4 += XXH_readLE32(p32, endian) * PRIME32_2; state->v4 = XXH_rotl32(state->v4, 13); state->v4 *= PRIME32_1; p32++;
+        }
+        p += 16-state->memsize;
+        state->memsize = 0;
+    }
+
+    if (p <= bEnd-16)
+    {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = state->v1;
+        U32 v2 = state->v2;
+        U32 v3 = state->v3;
+        U32 v4 = state->v4;
+
+        do
+        {
+            v1 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
+            v2 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
+            v3 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
+            v4 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
+        } while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd)
+    {
+        XXH_memcpy(state->memory, p, bEnd-p);
+        state->memsize = (int)(bEnd-p);
+    }
+
+    return XXH_OK;
+}
+
+XXH_errorcode XXH32_update (void* state_in, const void* input, int len)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
+    else
+        return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
+}
+
+
+
+FORCE_INLINE U32 XXH32_intermediateDigest_endian (void* state_in, XXH_endianess endian)
+{
+    struct XXH_state32_t * state = (struct XXH_state32_t *) state_in;
+    const BYTE * p = (const BYTE*)state->memory;
+    BYTE* bEnd = (BYTE*)state->memory + state->memsize;
+    U32 h32;
+
+    if (state->total_len >= 16)
+    {
+        h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+    }
+    else
+    {
+        h32  = state->seed + PRIME32_5;
+    }
+
+    h32 += (U32) state->total_len;
+
+    while (p<=bEnd-4)
+    {
+        h32 += XXH_readLE32((const U32*)p, endian) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+U32 XXH32_intermediateDigest (void* state_in)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_intermediateDigest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH32_intermediateDigest_endian(state_in, XXH_bigEndian);
+}
+
+
+U32 XXH32_digest (void* state_in)
+{
+    U32 h32 = XXH32_intermediateDigest(state_in);
+
+    XXH_free(state_in);
+
+    return h32;
+}
diff --git a/util/xxhash.h b/util/xxhash.h
new file mode 100644 (file)
index 0000000..ceff066
--- /dev/null
@@ -0,0 +1,164 @@
+/*
+   xxHash - Fast Hash algorithm
+   Header File
+   Copyright (C) 2012-2014, Yann Collet.
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : http://code.google.com/p/xxhash/
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+*/
+
+#pragma once
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+//****************************
+// Type
+//****************************
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+
+//****************************
+// Simple Hash Functions
+//****************************
+
+unsigned int XXH32 (const void* input, int len, unsigned int seed);
+
+/*
+XXH32() :
+    Calculate the 32-bits hash of sequence of length "len" stored at memory address "input".
+    The memory between input & input+len must be valid (allocated and read-accessible).
+    "seed" can be used to alter the result predictably.
+    This function successfully passes all SMHasher tests.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+    Note that "len" is type "int", which means it is limited to 2^31-1.
+    If your data is larger, use the advanced functions below.
+*/
+
+
+
+//****************************
+// Advanced Hash Functions
+//****************************
+
+void*         XXH32_init   (unsigned int seed);
+XXH_errorcode XXH32_update (void* state, const void* input, int len);
+unsigned int  XXH32_digest (void* state);
+
+/*
+These functions calculate the xxhash of an input provided in several small packets,
+as opposed to an input provided as a single block.
+
+It must be started with :
+void* XXH32_init()
+The function returns a pointer which holds the state of calculation.
+
+This pointer must be provided as "void* state" parameter for XXH32_update().
+XXH32_update() can be called as many times as necessary.
+The user must provide a valid (allocated) input.
+The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+Note that "len" is type "int", which means it is limited to 2^31-1.
+If your data is larger, it is recommended to chunk your data into blocks
+of size for example 2^30 (1GB) to avoid any "int" overflow issue.
+
+Finally, you can end the calculation anytime, by using XXH32_digest().
+This function returns the final 32-bits hash.
+You must provide the same "void* state" parameter created by XXH32_init().
+Memory will be freed by XXH32_digest().
+*/
+
+
+int           XXH32_sizeofState();
+XXH_errorcode XXH32_resetState(void* state, unsigned int seed);
+
+#define       XXH32_SIZEOFSTATE 48
+typedef struct { long long ll[(XXH32_SIZEOFSTATE+(sizeof(long long)-1))/sizeof(long long)]; } XXH32_stateSpace_t;
+/*
+These functions allow user application to make its own allocation for state.
+
+XXH32_sizeofState() is used to know how much space must be allocated for the xxHash 32-bits state.
+Note that the state must be aligned to access 'long long' fields. Memory must be allocated and referenced by a pointer.
+This pointer must then be provided as 'state' into XXH32_resetState(), which initializes the state.
+
+For static allocation purposes (such as allocation on stack, or freestanding systems without malloc()),
+use the structure XXH32_stateSpace_t, which will ensure that memory space is large enough and correctly aligned to access 'long long' fields.
+*/
+
+
+unsigned int XXH32_intermediateDigest (void* state);
+/*
+This function does the same as XXH32_digest(), generating a 32-bit hash,
+but preserve memory context.
+This way, it becomes possible to generate intermediate hashes, and then continue feeding data with XXH32_update().
+To free memory context, use XXH32_digest(), or free().
+*/
+
+
+
+//****************************
+// Deprecated function names
+//****************************
+// The following translations are provided to ease code transition
+// You are encouraged to no longer use these function names
+#define XXH32_feed   XXH32_update
+#define XXH32_result XXH32_digest
+#define XXH32_getIntermediateResult XXH32_intermediateDigest
+
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc
new file mode 100644 (file)
index 0000000..87901e0
--- /dev/null
@@ -0,0 +1,1306 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/backupable_db.h"
+#include "db/filename.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "rocksdb/transaction_log.h"
+
+#define __STDC_FORMAT_MACROS
+
+#include <inttypes.h>
+#include <algorithm>
+#include <vector>
+#include <map>
+#include <string>
+#include <limits>
+#include <atomic>
+#include <unordered_map>
+
+namespace rocksdb {
+
+namespace {
+class RateLimiter {
+ public:
+  RateLimiter(Env* env, uint64_t max_bytes_per_second, uint64_t bytes_per_check)
+      : env_(env),
+        max_bytes_per_second_(max_bytes_per_second),
+        bytes_per_check_(bytes_per_check),
+        micros_start_time_(env->NowMicros()),
+        bytes_since_start_(0) {}
+
+  void ReportAndWait(uint64_t bytes_since_last_call) {
+    bytes_since_start_ += bytes_since_last_call;
+    if (bytes_since_start_ < bytes_per_check_) {
+      // not enough bytes to be rate-limited
+      return;
+    }
+
+    uint64_t now = env_->NowMicros();
+    uint64_t interval = now - micros_start_time_;
+    uint64_t should_take_micros =
+        (bytes_since_start_ * kMicrosInSecond) / max_bytes_per_second_;
+
+    if (should_take_micros > interval) {
+      env_->SleepForMicroseconds(should_take_micros - interval);
+      now = env_->NowMicros();
+    }
+    // reset interval
+    micros_start_time_ = now;
+    bytes_since_start_ = 0;
+  }
+
+ private:
+  Env* env_;
+  uint64_t max_bytes_per_second_;
+  uint64_t bytes_per_check_;
+  uint64_t micros_start_time_;
+  uint64_t bytes_since_start_;
+  static const uint64_t kMicrosInSecond = 1000 * 1000LL;
+};
+}  // namespace
+
+void BackupableDBOptions::Dump(Logger* logger) const {
+  Log(logger, "        Options.backup_dir: %s", backup_dir.c_str());
+  Log(logger, "        Options.backup_env: %p", backup_env);
+  Log(logger, " Options.share_table_files: %d",
+      static_cast<int>(share_table_files));
+  Log(logger, "          Options.info_log: %p", info_log);
+  Log(logger, "              Options.sync: %d", static_cast<int>(sync));
+  Log(logger, "  Options.destroy_old_data: %d",
+      static_cast<int>(destroy_old_data));
+  Log(logger, "  Options.backup_log_files: %d",
+      static_cast<int>(backup_log_files));
+  Log(logger, " Options.backup_rate_limit: %" PRIu64, backup_rate_limit);
+  Log(logger, "Options.restore_rate_limit: %" PRIu64, restore_rate_limit);
+}
+
+// -------- BackupEngineImpl class ---------
+class BackupEngineImpl : public BackupEngine {
+ public:
+  BackupEngineImpl(Env* db_env, const BackupableDBOptions& options,
+                   bool read_only = false);
+  ~BackupEngineImpl();
+  Status CreateNewBackup(DB* db, bool flush_before_backup = false);
+  Status PurgeOldBackups(uint32_t num_backups_to_keep);
+  Status DeleteBackup(BackupID backup_id);
+  void StopBackup() {
+    stop_backup_.store(true, std::memory_order_release);
+  }
+
+  void GetBackupInfo(std::vector<BackupInfo>* backup_info);
+  Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir,
+                             const std::string& wal_dir,
+                             const RestoreOptions& restore_options =
+                                 RestoreOptions());
+  Status RestoreDBFromLatestBackup(const std::string& db_dir,
+                                   const std::string& wal_dir,
+                                   const RestoreOptions& restore_options =
+                                       RestoreOptions()) {
+    return RestoreDBFromBackup(latest_backup_id_, db_dir, wal_dir,
+                               restore_options);
+  }
+
+ private:
+  void DeleteChildren(const std::string& dir, uint32_t file_type_filter = 0);
+
+  struct FileInfo {
+    FileInfo(const std::string& fname, uint64_t sz, uint32_t checksum)
+      : refs(0), filename(fname), size(sz), checksum_value(checksum) {}
+
+    int refs;
+    const std::string filename;
+    const uint64_t size;
+    uint32_t checksum_value;
+  };
+
+  class BackupMeta {
+   public:
+    BackupMeta(const std::string& meta_filename,
+        std::unordered_map<std::string, FileInfo>* file_infos, Env* env)
+      : timestamp_(0), size_(0), meta_filename_(meta_filename),
+        file_infos_(file_infos), env_(env) {}
+
+    ~BackupMeta() {}
+
+    void RecordTimestamp() {
+      env_->GetCurrentTime(&timestamp_);
+    }
+    int64_t GetTimestamp() const {
+      return timestamp_;
+    }
+    uint64_t GetSize() const {
+      return size_;
+    }
+    void SetSequenceNumber(uint64_t sequence_number) {
+      sequence_number_ = sequence_number;
+    }
+    uint64_t GetSequenceNumber() {
+      return sequence_number_;
+    }
+
+    Status AddFile(const FileInfo& file_info);
+
+    void Delete(bool delete_meta = true);
+
+    bool Empty() {
+      return files_.empty();
+    }
+
+    const std::vector<std::string>& GetFiles() {
+      return files_;
+    }
+
+    Status LoadFromFile(const std::string& backup_dir);
+    Status StoreToFile(bool sync);
+
+   private:
+    int64_t timestamp_;
+    // sequence number is only approximate, should not be used
+    // by clients
+    uint64_t sequence_number_;
+    uint64_t size_;
+    std::string const meta_filename_;
+    // files with relative paths (without "/" prefix!!)
+    std::vector<std::string> files_;
+    std::unordered_map<std::string, FileInfo>* file_infos_;
+    Env* env_;
+
+    static const size_t max_backup_meta_file_size_ = 10 * 1024 * 1024;  // 10MB
+  };  // BackupMeta
+
+  inline std::string GetAbsolutePath(
+      const std::string &relative_path = "") const {
+    assert(relative_path.size() == 0 || relative_path[0] != '/');
+    return options_.backup_dir + "/" + relative_path;
+  }
+  inline std::string GetPrivateDirRel() const {
+    return "private";
+  }
+  inline std::string GetSharedChecksumDirRel() const {
+    return "shared_checksum";
+  }
+  inline std::string GetPrivateFileRel(BackupID backup_id,
+                                       bool tmp = false,
+                                       const std::string& file = "") const {
+    assert(file.size() == 0 || file[0] != '/');
+    return GetPrivateDirRel() + "/" + std::to_string(backup_id) +
+           (tmp ? ".tmp" : "") + "/" + file;
+  }
+  inline std::string GetSharedFileRel(const std::string& file = "",
+                                      bool tmp = false) const {
+    assert(file.size() == 0 || file[0] != '/');
+    return "shared/" + file + (tmp ? ".tmp" : "");
+  }
+  inline std::string GetSharedFileWithChecksumRel(const std::string& file = "",
+                                                  bool tmp = false) const {
+    assert(file.size() == 0 || file[0] != '/');
+    return GetSharedChecksumDirRel() + "/" + file + (tmp ? ".tmp" : "");
+  }
+  inline std::string GetSharedFileWithChecksum(const std::string& file,
+                                               const uint32_t checksum_value,
+                                               const uint64_t file_size) const {
+    assert(file.size() == 0 || file[0] != '/');
+    std::string file_copy = file;
+    return file_copy.insert(file_copy.find_last_of('.'),
+                            "_" + std::to_string(checksum_value)
+                              + "_" + std::to_string(file_size));
+  }
+  inline std::string GetFileFromChecksumFile(const std::string& file) const {
+    assert(file.size() == 0 || file[0] != '/');
+    std::string file_copy = file;
+    size_t first_underscore = file_copy.find_first_of('_');
+    return file_copy.erase(first_underscore,
+                           file_copy.find_last_of('.') - first_underscore);
+  }
+  inline std::string GetLatestBackupFile(bool tmp = false) const {
+    return GetAbsolutePath(std::string("LATEST_BACKUP") + (tmp ? ".tmp" : ""));
+  }
+  inline std::string GetBackupMetaDir() const {
+    return GetAbsolutePath("meta");
+  }
+  inline std::string GetBackupMetaFile(BackupID backup_id) const {
+    return GetBackupMetaDir() + "/" + std::to_string(backup_id);
+  }
+
+  // Read / atomically write the LATEST_BACKUP marker (ASCII backup id).
+  Status GetLatestBackupFileContents(uint32_t* latest_backup);
+  Status PutLatestBackupFileContents(uint32_t latest_backup);
+  // if size_limit == 0, there is no size limit, copy everything
+  Status CopyFile(const std::string& src,
+                  const std::string& dst,
+                  Env* src_env,
+                  Env* dst_env,
+                  bool sync,
+                  RateLimiter* rate_limiter,
+                  uint64_t* size = nullptr,
+                  uint32_t* checksum_value = nullptr,
+                  uint64_t size_limit = 0);
+  // if size_limit == 0, there is no size limit, copy everything
+  Status BackupFile(BackupID backup_id,
+                    BackupMeta* backup,
+                    bool shared,
+                    const std::string& src_dir,
+                    const std::string& src_fname,  // starts with "/"
+                    RateLimiter* rate_limiter,
+                    uint64_t size_limit = 0,
+                    bool shared_checksum = false);
+
+  // crc32 of the first size_limit bytes of src (whole file if 0).
+  Status CalculateChecksum(const std::string& src,
+                           Env* src_env,
+                           uint64_t size_limit,
+                           uint32_t* checksum_value);
+
+  // Will delete all the files we don't need anymore
+  // If full_scan == true, it will do the full scan of files/ directory
+  // and delete all the files that are not referenced from backuped_file_infos_
+  void GarbageCollection(bool full_scan);
+
+  // backup state data
+  BackupID latest_backup_id_;               // id installed in LATEST_BACKUP
+  std::map<BackupID, BackupMeta> backups_;  // ordered, oldest first
+  // refcounted metadata for every file referenced by any backup
+  std::unordered_map<std::string, FileInfo> backuped_file_infos_;
+  // ids whose private dirs still need cleanup by GarbageCollection()
+  std::vector<BackupID> obsolete_backups_;
+  std::atomic<bool> stop_backup_;           // set to abort an in-flight copy
+
+  // options data
+  BackupableDBOptions options_;
+  Env* db_env_;
+  Env* backup_env_;
+
+  // directories, kept open so we can fsync them after a backup
+  unique_ptr<Directory> backup_directory_;
+  unique_ptr<Directory> shared_directory_;
+  unique_ptr<Directory> meta_directory_;
+  unique_ptr<Directory> private_directory_;
+
+  static const size_t kDefaultCopyFileBufferSize = 5 * 1024 * 1024LL;  // 5MB
+  size_t copy_file_buffer_size_;
+  bool read_only_;
+};
+
+// Factory: the public BackupEngine interface hides BackupEngineImpl.
+// Caller owns the returned pointer.
+BackupEngine* BackupEngine::NewBackupEngine(
+    Env* db_env, const BackupableDBOptions& options) {
+  return new BackupEngineImpl(db_env, options);
+}
+
+// Construct the engine: create the on-disk directory layout (unless
+// read-only), load every backup's metadata, reconcile it with the
+// LATEST_BACKUP marker, and garbage-collect anything left over from
+// crashed or aborted runs.
+BackupEngineImpl::BackupEngineImpl(Env* db_env,
+                                   const BackupableDBOptions& options,
+                                   bool read_only)
+    : stop_backup_(false),
+      options_(options),
+      db_env_(db_env),
+      backup_env_(options.backup_env != nullptr ? options.backup_env : db_env_),
+      copy_file_buffer_size_(kDefaultCopyFileBufferSize),
+      read_only_(read_only) {
+  if (read_only_) {
+    Log(options_.info_log, "Starting read_only backup engine");
+  }
+  options_.Dump(options_.info_log);
+
+  if (!read_only_) {
+    // create all the dirs we need
+    backup_env_->CreateDirIfMissing(GetAbsolutePath());
+    backup_env_->NewDirectory(GetAbsolutePath(), &backup_directory_);
+    if (options_.share_table_files) {
+      if (options_.share_files_with_checksum) {
+        backup_env_->CreateDirIfMissing(GetAbsolutePath(
+            GetSharedFileWithChecksumRel()));
+        backup_env_->NewDirectory(GetAbsolutePath(
+            GetSharedFileWithChecksumRel()), &shared_directory_);
+      } else {
+        backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel()));
+        backup_env_->NewDirectory(GetAbsolutePath(GetSharedFileRel()),
+                                  &shared_directory_);
+      }
+    }
+    backup_env_->CreateDirIfMissing(GetAbsolutePath(GetPrivateDirRel()));
+    backup_env_->NewDirectory(GetAbsolutePath(GetPrivateDirRel()),
+                              &private_directory_);
+    backup_env_->CreateDirIfMissing(GetBackupMetaDir());
+    backup_env_->NewDirectory(GetBackupMetaDir(), &meta_directory_);
+  }
+
+  std::vector<std::string> backup_meta_files;
+  backup_env_->GetChildren(GetBackupMetaDir(), &backup_meta_files);
+  // create backups_ structure -- one entry per well-formed meta file name
+  for (auto& file : backup_meta_files) {
+    BackupID backup_id = 0;
+    sscanf(file.c_str(), "%u", &backup_id);
+    // the round-trip comparison rejects names like "12abc" that sscanf
+    // would otherwise half-parse
+    if (backup_id == 0 || file != std::to_string(backup_id)) {
+      if (!read_only_) {
+        // invalid file name, delete that
+        backup_env_->DeleteFile(GetBackupMetaDir() + "/" + file);
+      }
+      continue;
+    }
+    assert(backups_.find(backup_id) == backups_.end());
+    backups_.insert(std::make_pair(
+        backup_id, BackupMeta(GetBackupMetaFile(backup_id),
+                              &backuped_file_infos_, backup_env_)));
+  }
+
+  if (options_.destroy_old_data) {  // Destroy old data
+    assert(!read_only_);
+    for (auto& backup : backups_) {
+      backup.second.Delete();
+      obsolete_backups_.push_back(backup.first);
+    }
+    backups_.clear();
+    // start from beginning
+    latest_backup_id_ = 0;
+    // GarbageCollection() will do the actual deletion
+  } else {  // Load data from storage
+    // load the backups if any
+    for (auto& backup : backups_) {
+      Status s = backup.second.LoadFromFile(options_.backup_dir);
+      if (!s.ok()) {
+        Log(options_.info_log, "Backup %u corrupted -- %s", backup.first,
+            s.ToString().c_str());
+        if (!read_only_) {
+          Log(options_.info_log, "-> Deleting backup %u", backup.first);
+        }
+        // in read-only mode only drop the in-memory state; keep the files
+        backup.second.Delete(!read_only_);
+        obsolete_backups_.push_back(backup.first);
+      }
+    }
+    // delete obsolete backups from the structure
+    for (auto ob : obsolete_backups_) {
+      backups_.erase(ob);
+    }
+
+    Status s = GetLatestBackupFileContents(&latest_backup_id_);
+
+    // If latest backup file is corrupted or non-existent
+    // set latest backup as the biggest backup we have
+    // or 0 if we have no backups
+    if (!s.ok() ||
+        backups_.find(latest_backup_id_) == backups_.end()) {
+      auto itr = backups_.end();
+      latest_backup_id_ = (itr == backups_.begin()) ? 0 : (--itr)->first;
+    }
+  }
+
+  // delete any backups that claim to be later than latest -- they were
+  // never installed in LATEST_BACKUP, so they may be incomplete
+  for (auto itr = backups_.upper_bound(latest_backup_id_);
+       itr != backups_.end();) {
+    itr->second.Delete();
+    obsolete_backups_.push_back(itr->first);
+    itr = backups_.erase(itr);
+  }
+
+  if (!read_only_) {
+    PutLatestBackupFileContents(latest_backup_id_);  // Ignore errors
+    GarbageCollection(true);
+  }
+  Log(options_.info_log, "Initialized BackupEngine, the latest backup is %u.",
+      latest_backup_id_);
+}
+
+// Flush any buffered log output before the engine goes away.
+BackupEngineImpl::~BackupEngineImpl() { LogFlush(options_.info_log); }
+
+Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) {
+  assert(!read_only_);
+  Status s;
+  std::vector<std::string> live_files;
+  VectorLogPtr live_wal_files;
+  uint64_t manifest_file_size = 0;
+  uint64_t sequence_number = db->GetLatestSequenceNumber();
+
+  s = db->DisableFileDeletions();
+  if (s.ok()) {
+    // this will return live_files prefixed with "/"
+    s = db->GetLiveFiles(live_files, &manifest_file_size, flush_before_backup);
+  }
+  // if we didn't flush before backup, we need to also get WAL files
+  if (s.ok() && !flush_before_backup && options_.backup_log_files) {
+    // returns file names prefixed with "/"
+    s = db->GetSortedWalFiles(live_wal_files);
+  }
+  if (!s.ok()) {
+    db->EnableFileDeletions();
+    return s;
+  }
+
+  BackupID new_backup_id = latest_backup_id_ + 1;
+  assert(backups_.find(new_backup_id) == backups_.end());
+  auto ret = backups_.insert(std::make_pair(
+      new_backup_id, BackupMeta(GetBackupMetaFile(new_backup_id),
+                                &backuped_file_infos_, backup_env_)));
+  assert(ret.second == true);
+  auto& new_backup = ret.first->second;
+  new_backup.RecordTimestamp();
+  new_backup.SetSequenceNumber(sequence_number);
+
+  Log(options_.info_log, "Started the backup process -- creating backup %u",
+      new_backup_id);
+
+  // create temporary private dir
+  s = backup_env_->CreateDir(
+      GetAbsolutePath(GetPrivateFileRel(new_backup_id, true)));
+
+  unique_ptr<RateLimiter> rate_limiter;
+  if (options_.backup_rate_limit > 0) {
+    copy_file_buffer_size_ = options_.backup_rate_limit / 10;
+    rate_limiter.reset(new RateLimiter(db_env_, options_.backup_rate_limit,
+                                       copy_file_buffer_size_));
+  }
+
+  // copy live_files
+  for (size_t i = 0; s.ok() && i < live_files.size(); ++i) {
+    uint64_t number;
+    FileType type;
+    bool ok = ParseFileName(live_files[i], &number, &type);
+    if (!ok) {
+      assert(false);
+      return Status::Corruption("Can't parse file name. This is very bad");
+    }
+    // we should only get sst, manifest and current files here
+    assert(type == kTableFile || type == kDescriptorFile ||
+           type == kCurrentFile);
+
+    // rules:
+    // * if it's kTableFile, then it's shared
+    // * if it's kDescriptorFile, limit the size to manifest_file_size
+    s = BackupFile(new_backup_id,
+                   &new_backup,
+                   options_.share_table_files && type == kTableFile,
+                   db->GetName(),            /* src_dir */
+                   live_files[i],            /* src_fname */
+                   rate_limiter.get(),
+                   (type == kDescriptorFile) ? manifest_file_size : 0,
+                   options_.share_files_with_checksum && type == kTableFile);
+  }
+
+  // copy WAL files
+  for (size_t i = 0; s.ok() && i < live_wal_files.size(); ++i) {
+    if (live_wal_files[i]->Type() == kAliveLogFile) {
+      // we only care about live log files
+      // copy the file into backup_dir/files/<new backup>/
+      s = BackupFile(new_backup_id,
+                     &new_backup,
+                     false, /* not shared */
+                     db->GetOptions().wal_dir,
+                     live_wal_files[i]->PathName(),
+                     rate_limiter.get());
+    }
+  }
+
+  // we copied all the files, enable file deletions
+  db->EnableFileDeletions();
+
+  if (s.ok()) {
+    // move tmp private backup to real backup folder
+    s = backup_env_->RenameFile(
+        GetAbsolutePath(GetPrivateFileRel(new_backup_id, true)),  // tmp
+        GetAbsolutePath(GetPrivateFileRel(new_backup_id, false)));
+  }
+
+  if (s.ok()) {
+    // persist the backup metadata on the disk
+    s = new_backup.StoreToFile(options_.sync);
+  }
+  if (s.ok()) {
+    // install the newly created backup meta! (atomic)
+    s = PutLatestBackupFileContents(new_backup_id);
+  }
+  if (s.ok() && options_.sync) {
+    unique_ptr<Directory> backup_private_directory;
+    backup_env_->NewDirectory(
+        GetAbsolutePath(GetPrivateFileRel(new_backup_id, false)),
+        &backup_private_directory);
+    if (backup_private_directory != nullptr) {
+      backup_private_directory->Fsync();
+    }
+    if (private_directory_ != nullptr) {
+      private_directory_->Fsync();
+    }
+    if (meta_directory_ != nullptr) {
+      meta_directory_->Fsync();
+    }
+    if (shared_directory_ != nullptr) {
+      shared_directory_->Fsync();
+    }
+    if (backup_directory_ != nullptr) {
+      backup_directory_->Fsync();
+    }
+  }
+
+  if (!s.ok()) {
+    // clean all the files we might have created
+    Log(options_.info_log, "Backup failed -- %s", s.ToString().c_str());
+    backups_.erase(new_backup_id);
+    GarbageCollection(true);
+    return s;
+  }
+
+  // here we know that we succeeded and installed the new backup
+  // in the LATEST_BACKUP file
+  latest_backup_id_ = new_backup_id;
+  Log(options_.info_log, "Backup DONE. All is good");
+  return s;
+}
+
+// Delete the oldest backups until only num_backups_to_keep remain,
+// then reclaim any files no backup references anymore.
+Status BackupEngineImpl::PurgeOldBackups(uint32_t num_backups_to_keep) {
+  assert(!read_only_);
+  Log(options_.info_log, "Purging old backups, keeping %u",
+      num_backups_to_keep);
+  // backups_ is ordered by BackupID, so begin() is always the oldest.
+  while (backups_.size() > num_backups_to_keep) {
+    auto oldest = backups_.begin();
+    Log(options_.info_log, "Deleting backup %u", oldest->first);
+    oldest->second.Delete();
+    obsolete_backups_.push_back(oldest->first);
+    backups_.erase(oldest);
+  }
+  GarbageCollection(false);
+  return Status::OK();
+}
+
+// Delete one backup by id; NotFound if no such backup exists.
+Status BackupEngineImpl::DeleteBackup(BackupID backup_id) {
+  assert(!read_only_);
+  Log(options_.info_log, "Deleting backup %u", backup_id);
+  auto itr = backups_.find(backup_id);
+  if (itr == backups_.end()) {
+    return Status::NotFound("Backup not found");
+  }
+  // Unreference the backup's files, remember the id so GarbageCollection
+  // can remove its private dir, then drop it from the in-memory map.
+  itr->second.Delete();
+  obsolete_backups_.push_back(backup_id);
+  backups_.erase(itr);
+  GarbageCollection(false);
+  return Status::OK();
+}
+
+// Report id, creation timestamp and total size for every non-empty
+// backup, appending to *backup_info in ascending id order.
+void BackupEngineImpl::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
+  backup_info->reserve(backups_.size());
+  for (auto& entry : backups_) {
+    auto& meta = entry.second;
+    if (meta.Empty()) {
+      continue;  // skip corrupted/deleted placeholders
+    }
+    backup_info->push_back(
+        BackupInfo(entry.first, meta.GetTimestamp(), meta.GetSize()));
+  }
+}
+
+// Restore backup_id into db_dir/wal_dir: wipe the destination (keeping
+// log files if requested), then copy every file of the backup back,
+// verifying each copy's crc32 against the stored metadata.
+// NOTE: destructive -- existing contents of db_dir/wal_dir are deleted.
+Status BackupEngineImpl::RestoreDBFromBackup(
+    BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+    const RestoreOptions& restore_options) {
+  auto backup_itr = backups_.find(backup_id);
+  if (backup_itr == backups_.end()) {
+    return Status::NotFound("Backup not found");
+  }
+  auto& backup = backup_itr->second;
+  if (backup.Empty()) {
+    return Status::NotFound("Backup not found");
+  }
+
+  Log(options_.info_log, "Restoring backup id %u\n", backup_id);
+  Log(options_.info_log, "keep_log_files: %d\n",
+      static_cast<int>(restore_options.keep_log_files));
+
+  // just in case. Ignore errors
+  db_env_->CreateDirIfMissing(db_dir);
+  db_env_->CreateDirIfMissing(wal_dir);
+
+  if (restore_options.keep_log_files) {
+    // delete files in db_dir, but keep all the log files
+    DeleteChildren(db_dir, 1 << kLogFile);
+    // move all the files from archive dir to wal_dir
+    std::string archive_dir = ArchivalDirectory(wal_dir);
+    std::vector<std::string> archive_files;
+    db_env_->GetChildren(archive_dir, &archive_files);  // ignore errors
+    for (const auto& f : archive_files) {
+      uint64_t number;
+      FileType type;
+      bool ok = ParseFileName(f, &number, &type);
+      if (ok && type == kLogFile) {
+        Log(options_.info_log, "Moving log file from archive/ to wal_dir: %s",
+            f.c_str());
+        Status s =
+            db_env_->RenameFile(archive_dir + "/" + f, wal_dir + "/" + f);
+        if (!s.ok()) {
+          // if we can't move log file from archive_dir to wal_dir,
+          // we should fail, since it might mean data loss
+          return s;
+        }
+      }
+    }
+  } else {
+    DeleteChildren(wal_dir);
+    DeleteChildren(ArchivalDirectory(wal_dir));
+    DeleteChildren(db_dir);
+  }
+
+  unique_ptr<RateLimiter> rate_limiter;
+  if (options_.restore_rate_limit > 0) {
+    copy_file_buffer_size_ = options_.restore_rate_limit / 10;
+    rate_limiter.reset(new RateLimiter(db_env_, options_.restore_rate_limit,
+                                       copy_file_buffer_size_));
+  }
+  Status s;
+  for (auto& file : backup.GetFiles()) {
+    std::string dst;
+    // 1. extract the filename
+    size_t slash = file.find_last_of('/');
+    // file will either be shared/<file>, shared_checksum/<file_crc32_size>
+    // or private/<number>/<file>
+    assert(slash != std::string::npos);
+    dst = file.substr(slash + 1);
+
+    // if the file was in shared_checksum, extract the real file name
+    // in this case the file is <number>_<checksum>_<size>.<type>
+    if (file.substr(0, slash) == GetSharedChecksumDirRel()) {
+      dst = GetFileFromChecksumFile(dst);
+    }
+
+    // 2. find the filetype
+    uint64_t number;
+    FileType type;
+    bool ok = ParseFileName(dst, &number, &type);
+    if (!ok) {
+      return Status::Corruption("Backup corrupted");
+    }
+    // 3. Construct the final path
+    // kLogFile lives in wal_dir and all the rest live in db_dir
+    dst = ((type == kLogFile) ? wal_dir : db_dir) +
+      "/" + dst;
+
+    Log(options_.info_log, "Restoring %s to %s\n", file.c_str(), dst.c_str());
+    uint32_t checksum_value;
+    // copy without sync -- the restored DB will be opened and verified anyway
+    s = CopyFile(GetAbsolutePath(file), dst, backup_env_, db_env_, false,
+                 rate_limiter.get(), nullptr /* size */, &checksum_value);
+    if (!s.ok()) {
+      break;
+    }
+
+    // verify the copy against the checksum recorded at backup time
+    const auto iter = backuped_file_infos_.find(file);
+    assert(iter != backuped_file_infos_.end());
+    if (iter->second.checksum_value != checksum_value) {
+      s = Status::Corruption("Checksum check failed");
+      break;
+    }
+  }
+
+  Log(options_.info_log, "Restoring done -- %s\n", s.ToString().c_str());
+  return s;
+}
+
+// latest backup id is an ASCII representation of latest backup id
+Status BackupEngineImpl::GetLatestBackupFileContents(uint32_t* latest_backup) {
+  Status s;
+  unique_ptr<SequentialFile> file;
+  s = backup_env_->NewSequentialFile(GetLatestBackupFile(),
+                                     &file,
+                                     EnvOptions());
+  if (!s.ok()) {
+    return s;
+  }
+
+  char buf[11];
+  Slice data;
+  s = file->Read(10, &data, buf);
+  if (!s.ok() || data.size() == 0) {
+    return s.ok() ? Status::Corruption("Latest backup file corrupted") : s;
+  }
+  buf[data.size()] = 0;
+
+  *latest_backup = 0;
+  sscanf(data.data(), "%u", latest_backup);
+  if (backup_env_->FileExists(GetBackupMetaFile(*latest_backup)) == false) {
+    s = Status::Corruption("Latest backup file corrupted");
+  }
+  return Status::OK();
+}
+
+// this operation HAS to be atomic
+// writing 4 bytes to the file is atomic alright, but we should *never*
+// do something like 1. delete file, 2. write new file
+// We write to a tmp file and then atomically rename
+Status BackupEngineImpl::PutLatestBackupFileContents(uint32_t latest_backup) {
+  assert(!read_only_);
+  Status s;
+  unique_ptr<WritableFile> file;
+  EnvOptions env_options;
+  env_options.use_mmap_writes = false;
+  s = backup_env_->NewWritableFile(GetLatestBackupFile(true),
+                                   &file,
+                                   env_options);
+  if (!s.ok()) {
+    backup_env_->DeleteFile(GetLatestBackupFile(true));
+    return s;
+  }
+
+  char file_contents[10];
+  int len = sprintf(file_contents, "%u\n", latest_backup);
+  s = file->Append(Slice(file_contents, len));
+  if (s.ok() && options_.sync) {
+    file->Sync();
+  }
+  if (s.ok()) {
+    s = file->Close();
+  }
+  if (s.ok()) {
+    // atomically replace real file with new tmp
+    s = backup_env_->RenameFile(GetLatestBackupFile(true),
+                                GetLatestBackupFile(false));
+  }
+  return s;
+}
+
+// Stream-copy src (on src_env) to dst (on dst_env) in
+// copy_file_buffer_size_ chunks. Optionally fsyncs the destination,
+// rate-limits throughput, and accumulates the byte count / crc32 of the
+// copied data into *size / *checksum_value (either may be nullptr).
+// size_limit == 0 means "copy the whole file". Aborts with Incomplete
+// if StopBackup() sets stop_backup_ between chunks.
+Status BackupEngineImpl::CopyFile(const std::string& src,
+                                  const std::string& dst, Env* src_env,
+                                  Env* dst_env, bool sync,
+                                  RateLimiter* rate_limiter, uint64_t* size,
+                                  uint32_t* checksum_value,
+                                  uint64_t size_limit) {
+  Status s;
+  unique_ptr<WritableFile> dst_file;
+  unique_ptr<SequentialFile> src_file;
+  EnvOptions env_options;
+  env_options.use_mmap_writes = false;
+  env_options.use_os_buffer = false;
+  if (size != nullptr) {
+    *size = 0;
+  }
+  if (checksum_value != nullptr) {
+    *checksum_value = 0;
+  }
+
+  // Check if size limit is set. if not, set it to very big number
+  if (size_limit == 0) {
+    size_limit = std::numeric_limits<uint64_t>::max();
+  }
+
+  s = src_env->NewSequentialFile(src, &src_file, env_options);
+  if (s.ok()) {
+    s = dst_env->NewWritableFile(dst, &dst_file, env_options);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  unique_ptr<char[]> buf(new char[copy_file_buffer_size_]);
+  Slice data;
+
+  do {
+    // a concurrent StopBackup() aborts the copy between chunks
+    if (stop_backup_.load(std::memory_order_acquire)) {
+      return Status::Incomplete("Backup stopped");
+    }
+    // never read past the remaining size_limit
+    size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ?
+      copy_file_buffer_size_ : size_limit;
+    s = src_file->Read(buffer_to_read, &data, buf.get());
+    size_limit -= data.size();
+
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (size != nullptr) {
+      *size += data.size();
+    }
+    if (checksum_value != nullptr) {
+      *checksum_value = crc32c::Extend(*checksum_value, data.data(),
+                                       data.size());
+    }
+    s = dst_file->Append(data);
+    if (rate_limiter != nullptr) {
+      rate_limiter->ReportAndWait(data.size());
+    }
+  } while (s.ok() && data.size() > 0 && size_limit > 0);
+
+  if (s.ok() && sync) {
+    s = dst_file->Sync();
+  }
+
+  return s;
+}
+
+// src_fname will always start with "/"
+// Copy one DB file into the backup. Shared files go to shared/ (or
+// shared_checksum/ with "_<crc32>_<size>" appended to the name) and are
+// skipped if already present; private files are copied into the
+// backup's tmp private dir. On success the file is registered (and
+// refcounted) in *backup via AddFile.
+Status BackupEngineImpl::BackupFile(BackupID backup_id, BackupMeta* backup,
+                                    bool shared, const std::string& src_dir,
+                                    const std::string& src_fname,
+                                    RateLimiter* rate_limiter,
+                                    uint64_t size_limit,
+                                    bool shared_checksum) {
+
+  assert(src_fname.size() > 0 && src_fname[0] == '/');
+  std::string dst_relative = src_fname.substr(1);
+  std::string dst_relative_tmp;
+  Status s;
+  uint64_t size;
+  uint32_t checksum_value = 0;
+
+  if (shared && shared_checksum) {
+    // add checksum and file length to the file name
+    s = CalculateChecksum(src_dir + src_fname,
+                          db_env_,
+                          size_limit,
+                          &checksum_value);
+    if (s.ok()) {
+        s = db_env_->GetFileSize(src_dir + src_fname, &size);
+    }
+    if (!s.ok()) {
+         return s;
+    }
+    dst_relative = GetSharedFileWithChecksum(dst_relative, checksum_value,
+                                             size);
+    dst_relative_tmp = GetSharedFileWithChecksumRel(dst_relative, true);
+    dst_relative = GetSharedFileWithChecksumRel(dst_relative, false);
+  } else if (shared) {
+    dst_relative_tmp = GetSharedFileRel(dst_relative, true);
+    dst_relative = GetSharedFileRel(dst_relative, false);
+  } else {
+    dst_relative_tmp = GetPrivateFileRel(backup_id, true, dst_relative);
+    dst_relative = GetPrivateFileRel(backup_id, false, dst_relative);
+  }
+  std::string dst_path = GetAbsolutePath(dst_relative);
+  std::string dst_path_tmp = GetAbsolutePath(dst_relative_tmp);
+
+  // if it's shared, we also need to check if it exists -- if it does,
+  // no need to copy it again
+  if (shared && backup_env_->FileExists(dst_path)) {
+    if (shared_checksum) {
+      Log(options_.info_log,
+          "%s already present, with checksum %u and size %" PRIu64,
+          src_fname.c_str(), checksum_value, size);
+    } else {
+      // NOTE(review): if GetFileSize fails here, `size` stays
+      // uninitialized yet is later stored in FileInfo -- confirm callers
+      // tolerate this, or the error should be checked.
+      backup_env_->GetFileSize(dst_path, &size);  // Ignore error
+      Log(options_.info_log, "%s already present, calculate checksum",
+          src_fname.c_str());
+      s = CalculateChecksum(src_dir + src_fname,
+                            db_env_,
+                            size_limit,
+                            &checksum_value);
+    }
+  } else {
+    // copy to a tmp name first so a crashed copy never pollutes shared/
+    Log(options_.info_log, "Copying %s", src_fname.c_str());
+    s = CopyFile(src_dir + src_fname,
+                 dst_path_tmp,
+                 db_env_,
+                 backup_env_,
+                 options_.sync,
+                 rate_limiter,
+                 &size,
+                 &checksum_value,
+                 size_limit);
+    if (s.ok() && shared) {
+      s = backup_env_->RenameFile(dst_path_tmp, dst_path);
+    }
+  }
+  if (s.ok()) {
+    s = backup->AddFile(FileInfo(dst_relative, size, checksum_value));
+  }
+  return s;
+}
+
+// Compute crc32c over the first size_limit bytes of src (whole file if
+// size_limit == 0), reading in copy_file_buffer_size_ chunks. Aborts
+// with Incomplete if a concurrent StopBackup() is observed.
+Status BackupEngineImpl::CalculateChecksum(const std::string& src, Env* src_env,
+                                           uint64_t size_limit,
+                                           uint32_t* checksum_value) {
+  *checksum_value = 0;
+  if (size_limit == 0) {
+    size_limit = std::numeric_limits<uint64_t>::max();
+  }
+
+  EnvOptions env_options;
+  env_options.use_mmap_writes = false;
+  env_options.use_os_buffer = false;
+
+  std::unique_ptr<SequentialFile> src_file;
+  Status s = src_env->NewSequentialFile(src, &src_file, env_options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::unique_ptr<char[]> buf(new char[copy_file_buffer_size_]);
+  Slice data;
+
+  do {
+    if (stop_backup_.load(std::memory_order_acquire)) {
+      return Status::Incomplete("Backup stopped");
+    }
+    // never read past the remaining size_limit
+    size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ?
+      copy_file_buffer_size_ : size_limit;
+    s = src_file->Read(buffer_to_read, &data, buf.get());
+
+    if (!s.ok()) {
+      return s;
+    }
+
+    size_limit -= data.size();
+    *checksum_value = crc32c::Extend(*checksum_value, data.data(), data.size());
+  } while (data.size() > 0 && size_limit > 0);
+
+  return s;
+}
+
+void BackupEngineImpl::DeleteChildren(const std::string& dir,
+                                      uint32_t file_type_filter) {
+  std::vector<std::string> children;
+  db_env_->GetChildren(dir, &children);  // ignore errors
+
+  for (const auto& f : children) {
+    uint64_t number;
+    FileType type;
+    bool ok = ParseFileName(f, &number, &type);
+    if (ok && (file_type_filter & (1 << type))) {
+      // don't delete this file
+      continue;
+    }
+    db_env_->DeleteFile(dir + "/" + f);  // ignore errors
+  }
+}
+
+// Reclaim storage no live backup references: files whose refcount
+// dropped to zero, private dirs of obsolete backups, and -- when
+// full_scan is set -- anything on disk that is not in the in-memory
+// metadata (stale shared files, orphaned/tmp private dirs).
+void BackupEngineImpl::GarbageCollection(bool full_scan) {
+  assert(!read_only_);
+  Log(options_.info_log, "Starting garbage collection");
+  std::vector<std::string> to_delete;
+  for (auto& itr : backuped_file_infos_) {
+    if (itr.second.refs == 0) {
+      Status s = backup_env_->DeleteFile(GetAbsolutePath(itr.first));
+      Log(options_.info_log, "Deleting %s -- %s", itr.first.c_str(),
+          s.ToString().c_str());
+      // collect keys first; erasing inside the loop would invalidate itr
+      to_delete.push_back(itr.first);
+    }
+  }
+  for (auto& td : to_delete) {
+    backuped_file_infos_.erase(td);
+  }
+  if (!full_scan) {
+    // take care of private dirs -- if full_scan == true, then full_scan will
+    // take care of them
+    for (auto backup_id : obsolete_backups_) {
+      std::string private_dir = GetPrivateFileRel(backup_id);
+      Status s = backup_env_->DeleteDir(GetAbsolutePath(private_dir));
+      Log(options_.info_log, "Deleting private dir %s -- %s",
+          private_dir.c_str(), s.ToString().c_str());
+    }
+  }
+  obsolete_backups_.clear();
+
+  if (full_scan) {
+    Log(options_.info_log, "Starting full scan garbage collection");
+    // delete obsolete shared files
+    std::vector<std::string> shared_children;
+    backup_env_->GetChildren(GetAbsolutePath(GetSharedFileRel()),
+                             &shared_children);
+    for (auto& child : shared_children) {
+      std::string rel_fname = GetSharedFileRel(child);
+      // if it's not refcounted, delete it
+      if (backuped_file_infos_.find(rel_fname) == backuped_file_infos_.end()) {
+        // this might be a directory, but DeleteFile will just fail in that
+        // case, so we're good
+        Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname));
+        if (s.ok()) {
+          Log(options_.info_log, "Deleted %s", rel_fname.c_str());
+        }
+      }
+    }
+
+    // delete obsolete private files
+    std::vector<std::string> private_children;
+    backup_env_->GetChildren(GetAbsolutePath(GetPrivateDirRel()),
+                             &private_children);
+    for (auto& child : private_children) {
+      BackupID backup_id = 0;
+      bool tmp_dir = child.find(".tmp") != std::string::npos;
+      sscanf(child.c_str(), "%u", &backup_id);
+      if (!tmp_dir &&  // if it's tmp_dir, delete it
+          (backup_id == 0 || backups_.find(backup_id) != backups_.end())) {
+        // it's either not a number or it's still alive. continue
+        continue;
+      }
+      // here we have to delete the dir and all its children
+      std::string full_private_path =
+          GetAbsolutePath(GetPrivateFileRel(backup_id, tmp_dir));
+      std::vector<std::string> subchildren;
+      backup_env_->GetChildren(full_private_path, &subchildren);
+      for (auto& subchild : subchildren) {
+        Status s = backup_env_->DeleteFile(full_private_path + subchild);
+        if (s.ok()) {
+          Log(options_.info_log, "Deleted %s",
+              (full_private_path + subchild).c_str());
+        }
+      }
+      // finally delete the private dir
+      Status s = backup_env_->DeleteDir(full_private_path);
+      Log(options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(),
+          s.ToString().c_str());
+    }
+  }
+}
+
+// ------- BackupMeta class --------
+
+Status BackupEngineImpl::BackupMeta::AddFile(const FileInfo& file_info) {
+  size_ += file_info.size;
+  files_.push_back(file_info.filename);
+
+  auto itr = file_infos_->find(file_info.filename);
+  if (itr == file_infos_->end()) {
+    auto ret = file_infos_->insert({file_info.filename, file_info});
+    if (ret.second) {
+      ret.first->second.refs = 1;
+    } else {
+      // if this happens, something is seriously wrong
+      return Status::Corruption("In memory metadata insertion error");
+    }
+  } else {
+    if (itr->second.checksum_value != file_info.checksum_value) {
+      return Status::Corruption("Checksum mismatch for existing backup file");
+    }
+    ++itr->second.refs;  // increase refcount if already present
+  }
+
+  return Status::OK();
+}
+
+// Drop one reference for every file this backup owns and reset the
+// in-memory state; optionally also remove the on-disk meta file
+// (delete_meta == false is the read-only engine's "forget, don't touch
+// disk" mode).
+void BackupEngineImpl::BackupMeta::Delete(bool delete_meta) {
+  for (const auto& fname : files_) {
+    auto info = file_infos_->find(fname);
+    assert(info != file_infos_->end());
+    --(info->second.refs);  // unreference; GC deletes at refcount 0
+  }
+  files_.clear();
+  // delete meta file
+  if (delete_meta) {
+    env_->DeleteFile(meta_filename_);
+  }
+  timestamp_ = 0;
+}
+
+// each backup meta file is of the format:
+// <timestamp>
+// <seq number>
+// <number of files>
+// <file1> <crc32(literal string)> <crc32_value>
+// <file2> <crc32(literal string)> <crc32_value>
+// ...
+Status BackupEngineImpl::BackupMeta::LoadFromFile(
+    const std::string& backup_dir) {
+  assert(Empty());
+  Status s;
+  unique_ptr<SequentialFile> backup_meta_file;
+  s = env_->NewSequentialFile(meta_filename_, &backup_meta_file, EnvOptions());
+  if (!s.ok()) {
+    return s;
+  }
+
+  unique_ptr<char[]> buf(new char[max_backup_meta_file_size_ + 1]);
+  Slice data;
+  s = backup_meta_file->Read(max_backup_meta_file_size_, &data, buf.get());
+
+  if (!s.ok() || data.size() == max_backup_meta_file_size_) {
+    return s.ok() ? Status::Corruption("File size too big") : s;
+  }
+  buf[data.size()] = 0;
+
+  uint32_t num_files = 0;
+  int bytes_read = 0;
+  sscanf(data.data(), "%" PRId64 "%n", &timestamp_, &bytes_read);
+  data.remove_prefix(bytes_read + 1);  // +1 for '\n'
+  sscanf(data.data(), "%" PRIu64 "%n", &sequence_number_, &bytes_read);
+  data.remove_prefix(bytes_read + 1);  // +1 for '\n'
+  sscanf(data.data(), "%u%n", &num_files, &bytes_read);
+  data.remove_prefix(bytes_read + 1);  // +1 for '\n'
+
+  std::vector<FileInfo> files;
+
+  for (uint32_t i = 0; s.ok() && i < num_files; ++i) {
+    auto line = GetSliceUntil(&data, '\n');
+    std::string filename = GetSliceUntil(&line, ' ').ToString();
+
+    uint64_t size;
+    s = env_->GetFileSize(backup_dir + "/" + filename, &size);
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (line.empty()) {
+      return Status::Corruption("File checksum is missing");
+    }
+
+    uint32_t checksum_value = 0;
+    if (line.starts_with("crc32 ")) {
+      line.remove_prefix(6);
+      sscanf(line.data(), "%u", &checksum_value);
+      if (memcmp(line.data(), std::to_string(checksum_value).c_str(),
+                 line.size() - 1) != 0) {
+        return Status::Corruption("Invalid checksum value");
+      }
+    } else {
+      return Status::Corruption("Unknown checksum type");
+    }
+
+    files.emplace_back(filename, size, checksum_value);
+  }
+
+  if (s.ok() && data.size() > 0) {
+    // file has to be read completely. if not, we count it as corruption
+    s = Status::Corruption("Tailing data in backup meta file");
+  }
+
+  if (s.ok()) {
+    for (const auto& file_info : files) {
+      s = AddFile(file_info);
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+
+  return s;
+}
+
+// Serialize this backup's metadata (timestamp, sequence number, file
+// count, then one "<file> crc32 <value>" line per file) to
+// meta_filename_, writing a ".tmp" first and renaming into place so a
+// crash never leaves a half-written meta file.
+Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) {
+  Status s;
+  unique_ptr<WritableFile> backup_meta_file;
+  EnvOptions env_options;
+  env_options.use_mmap_writes = false;
+  s = env_->NewWritableFile(meta_filename_ + ".tmp", &backup_meta_file,
+                            env_options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  unique_ptr<char[]> buf(new char[max_backup_meta_file_size_]);
+  int len = 0, buf_size = max_backup_meta_file_size_;
+  len += snprintf(buf.get(), buf_size, "%" PRId64 "\n", timestamp_);
+  len += snprintf(buf.get() + len, buf_size - len, "%" PRIu64 "\n",
+                  sequence_number_);
+  len += snprintf(buf.get() + len, buf_size - len, "%zu\n", files_.size());
+  for (const auto& file : files_) {
+    const auto& iter = file_infos_->find(file);
+
+    assert(iter != file_infos_->end());
+    // use crc32 for now, switch to something else if needed
+    len += snprintf(buf.get() + len, buf_size - len, "%s crc32 %u\n",
+                    file.c_str(), iter->second.checksum_value);
+  }
+
+  s = backup_meta_file->Append(Slice(buf.get(), (size_t)len));
+  if (s.ok() && sync) {
+    s = backup_meta_file->Sync();
+  }
+  if (s.ok()) {
+    s = backup_meta_file->Close();
+  }
+  if (s.ok()) {
+    // atomically install the fully-written meta file
+    s = env_->RenameFile(meta_filename_ + ".tmp", meta_filename_);
+  }
+  return s;
+}
+
+// -------- BackupEngineReadOnlyImpl ---------
+// Read-only facade: owns a BackupEngineImpl constructed in read-only mode
+// (third ctor argument true) and forwards only the query/restore subset of
+// its interface. No mutating operations are exposed.
+class BackupEngineReadOnlyImpl : public BackupEngineReadOnly {
+ public:
+  BackupEngineReadOnlyImpl(Env* db_env, const BackupableDBOptions& options)
+      : backup_engine_(new BackupEngineImpl(db_env, options, true)) {}
+
+  virtual ~BackupEngineReadOnlyImpl() {}
+
+  // Lists metadata of all existing backups.
+  virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) {
+    backup_engine_->GetBackupInfo(backup_info);
+  }
+
+  // Restores the given backup into db_dir / wal_dir.
+  virtual Status RestoreDBFromBackup(
+      BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) {
+    return backup_engine_->RestoreDBFromBackup(backup_id, db_dir, wal_dir,
+                                               restore_options);
+  }
+
+  // Restores the most recent valid backup into db_dir / wal_dir.
+  virtual Status RestoreDBFromLatestBackup(
+      const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) {
+    return backup_engine_->RestoreDBFromLatestBackup(db_dir, wal_dir,
+                                                     restore_options);
+  }
+
+ private:
+  std::unique_ptr<BackupEngineImpl> backup_engine_;
+};
+
+// Factory for the read-only engine. Refuses (assert in debug, nullptr in
+// release) options that ask to destroy old data, since a read-only engine
+// must never delete existing backups.
+BackupEngineReadOnly* BackupEngineReadOnly::NewReadOnlyBackupEngine(
+    Env* db_env, const BackupableDBOptions& options) {
+  if (options.destroy_old_data) {
+    assert(false);
+    return nullptr;
+  }
+  return new BackupEngineReadOnlyImpl(db_env, options);
+}
+
+// --- BackupableDB methods --------
+
+// Wraps a live DB and forwards all backup operations to an owned
+// BackupEngineImpl that shares the DB's Env.
+BackupableDB::BackupableDB(DB* db, const BackupableDBOptions& options)
+    : StackableDB(db),
+      backup_engine_(new BackupEngineImpl(db->GetEnv(), options)) {}
+
+BackupableDB::~BackupableDB() {
+  delete backup_engine_;  // raw owning pointer; the engine dies with the wrapper
+}
+
+// Snapshots the current state of the wrapped DB into a new backup.
+Status BackupableDB::CreateNewBackup(bool flush_before_backup) {
+  return backup_engine_->CreateNewBackup(this, flush_before_backup);
+}
+
+// Lists metadata of all existing backups.
+void BackupableDB::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
+  backup_engine_->GetBackupInfo(backup_info);
+}
+
+// Keeps only the most recent num_backups_to_keep backups.
+Status BackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) {
+  return backup_engine_->PurgeOldBackups(num_backups_to_keep);
+}
+
+// Deletes one specific backup by id.
+Status BackupableDB::DeleteBackup(BackupID backup_id) {
+  return backup_engine_->DeleteBackup(backup_id);
+}
+
+// Requests that an in-flight CreateNewBackup abort early.
+void BackupableDB::StopBackup() {
+  backup_engine_->StopBackup();
+}
+
+// --- RestoreBackupableDB methods ------
+
+// Restore-side handle: owns a BackupEngineImpl but no database, so it can
+// inspect, restore, purge and delete backups without opening the DB itself.
+RestoreBackupableDB::RestoreBackupableDB(Env* db_env,
+                                         const BackupableDBOptions& options)
+    : backup_engine_(new BackupEngineImpl(db_env, options)) {}
+
+RestoreBackupableDB::~RestoreBackupableDB() {
+  delete backup_engine_;  // raw owning pointer, released here
+}
+
+// Lists metadata of all existing backups.
+void
+RestoreBackupableDB::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
+  backup_engine_->GetBackupInfo(backup_info);
+}
+
+// Restores the given backup into db_dir / wal_dir.
+Status RestoreBackupableDB::RestoreDBFromBackup(
+    BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+    const RestoreOptions& restore_options) {
+  return backup_engine_->RestoreDBFromBackup(backup_id, db_dir, wal_dir,
+                                             restore_options);
+}
+
+// Restores the most recent valid backup into db_dir / wal_dir.
+Status RestoreBackupableDB::RestoreDBFromLatestBackup(
+    const std::string& db_dir, const std::string& wal_dir,
+    const RestoreOptions& restore_options) {
+  return backup_engine_->RestoreDBFromLatestBackup(db_dir, wal_dir,
+                                                   restore_options);
+}
+
+// Keeps only the most recent num_backups_to_keep backups.
+Status RestoreBackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) {
+  return backup_engine_->PurgeOldBackups(num_backups_to_keep);
+}
+
+// Deletes one specific backup by id.
+Status RestoreBackupableDB::DeleteBackup(BackupID backup_id) {
+  return backup_engine_->DeleteBackup(backup_id);
+}
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc
new file mode 100644 (file)
index 0000000..ef34cf0
--- /dev/null
@@ -0,0 +1,963 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <string>
+#include <algorithm>
+#include <iostream>
+
+#include "rocksdb/types.h"
+#include "rocksdb/transaction_log.h"
+#include "utilities/utility_db.h"
+#include "utilities/backupable_db.h"
+#include "util/testharness.h"
+#include "util/random.h"
+#include "util/testutil.h"
+#include "util/auto_roll_logger.h"
+
+namespace rocksdb {
+
+namespace {
+
+using std::unique_ptr;
+
+// Minimal DB stand-in used to unit-test backup logic without a real DB.
+// It reports caller-provided live files / WAL files and asserts that file
+// deletions are disabled while the backup engine scans them.
+class DummyDB : public StackableDB {
+ public:
+  /* implicit */
+  DummyDB(const Options& options, const std::string& dbname)
+     : StackableDB(nullptr), options_(options), dbname_(dbname),
+       deletions_enabled_(true), sequence_number_(0) {}
+
+  // Returns a strictly increasing number on every call (the member is
+  // mutable so this const method can advance it).
+  virtual SequenceNumber GetLatestSequenceNumber() const {
+    return ++sequence_number_;
+  }
+
+  virtual const std::string& GetName() const override {
+    return dbname_;
+  }
+
+  virtual Env* GetEnv() const override {
+    return options_.env;
+  }
+
+  using DB::GetOptions;
+  virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const
+      override {
+    return options_;
+  }
+
+  // Enable/disable must arrive strictly alternated, mirroring how the
+  // backup engine is expected to bracket its file scans.
+  virtual Status EnableFileDeletions(bool force) override {
+    ASSERT_TRUE(!deletions_enabled_);
+    deletions_enabled_ = true;
+    return Status::OK();
+  }
+
+  virtual Status DisableFileDeletions() override {
+    ASSERT_TRUE(deletions_enabled_);
+    deletions_enabled_ = false;
+    return Status::OK();
+  }
+
+  // Must only be called while deletions are disabled; reports the canned
+  // live_files_ list and a fixed manifest size of 100 bytes.
+  virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
+                              bool flush_memtable = true) override {
+    ASSERT_TRUE(!deletions_enabled_);
+    vec = live_files_;
+    *mfs = 100;
+    return Status::OK();
+  }
+
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
+    return nullptr;
+  }
+
+  // WAL file stub: only PathName() and Type() are expected to be called;
+  // the other accessors assert.
+  class DummyLogFile : public LogFile {
+   public:
+    /* implicit */
+     DummyLogFile(const std::string& path, bool alive = true)
+         : path_(path), alive_(alive) {}
+
+    virtual std::string PathName() const override {
+      return path_;
+    }
+
+    virtual uint64_t LogNumber() const {
+      // what business do you have calling this method?
+      ASSERT_TRUE(false);
+      return 0;
+    }
+
+    virtual WalFileType Type() const override {
+      return alive_ ? kAliveLogFile : kArchivedLogFile;
+    }
+
+    virtual SequenceNumber StartSequence() const {
+      // backupabledb should not need this method
+      ASSERT_TRUE(false);
+      return 0;
+    }
+
+    virtual uint64_t SizeFileBytes() const {
+      // backupabledb should not need this method
+      ASSERT_TRUE(false);
+      return 0;
+    }
+
+   private:
+    std::string path_;
+    bool alive_;
+  }; // DummyLogFile
+
+  // Wraps the canned wal_files_ (path, alive?) pairs into DummyLogFiles.
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
+    ASSERT_TRUE(!deletions_enabled_);
+    files.resize(wal_files_.size());
+    for (size_t i = 0; i < files.size(); ++i) {
+      files[i].reset(
+          new DummyLogFile(wal_files_[i].first, wal_files_[i].second));
+    }
+    return Status::OK();
+  }
+
+  std::vector<std::string> live_files_;
+  // pair<filename, alive?>
+  std::vector<std::pair<std::string, bool>> wal_files_;
+ private:
+  Options options_;
+  std::string dbname_;
+  bool deletions_enabled_;
+  mutable SequenceNumber sequence_number_;  // advanced by the const getter above
+}; // DummyDB
+
+class TestEnv : public EnvWrapper {
+ public:
+  explicit TestEnv(Env* t) : EnvWrapper(t) {}
+
+  class DummySequentialFile : public SequentialFile {
+   public:
+    DummySequentialFile() : SequentialFile(), rnd_(5) {}
+    virtual Status Read(size_t n, Slice* result, char* scratch) {
+      size_t read_size = (n > size_left) ? size_left : n;
+      for (size_t i = 0; i < read_size; ++i) {
+        scratch[i] = rnd_.Next() & 255;
+      }
+      *result = Slice(scratch, read_size);
+      size_left -= read_size;
+      return Status::OK();
+    }
+
+    virtual Status Skip(uint64_t n) {
+      size_left = (n > size_left) ? size_left - n : 0;
+      return Status::OK();
+    }
+   private:
+    size_t size_left = 200;
+    Random rnd_;
+  };
+
+  Status NewSequentialFile(const std::string& f,
+                           unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) {
+    if (dummy_sequential_file_) {
+      r->reset(new TestEnv::DummySequentialFile());
+      return Status::OK();
+    } else {
+      return EnvWrapper::NewSequentialFile(f, r, options);
+    }
+  }
+
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) {
+    written_files_.push_back(f);
+    if (limit_written_files_ <= 0) {
+      return Status::NotSupported("Sorry, can't do this");
+    }
+    limit_written_files_--;
+    return EnvWrapper::NewWritableFile(f, r, options);
+  }
+
+  virtual Status DeleteFile(const std::string& fname) override {
+    ASSERT_GT(limit_delete_files_, 0U);
+    limit_delete_files_--;
+    return EnvWrapper::DeleteFile(fname);
+  }
+
+  void AssertWrittenFiles(std::vector<std::string>& should_have_written) {
+    sort(should_have_written.begin(), should_have_written.end());
+    sort(written_files_.begin(), written_files_.end());
+    ASSERT_TRUE(written_files_ == should_have_written);
+  }
+
+  void ClearWrittenFiles() {
+    written_files_.clear();
+  }
+
+  void SetLimitWrittenFiles(uint64_t limit) {
+    limit_written_files_ = limit;
+  }
+
+  void SetLimitDeleteFiles(uint64_t limit) { limit_delete_files_ = limit; }
+
+  void SetDummySequentialFile(bool dummy_sequential_file) {
+    dummy_sequential_file_ = dummy_sequential_file;
+  }
+
+ private:
+  bool dummy_sequential_file_ = false;
+  std::vector<std::string> written_files_;
+  uint64_t limit_written_files_ = 1000000;
+  uint64_t limit_delete_files_ = 1000000;
+};  // TestEnv
+
+// Env wrapper exposing destructive helpers used to simulate on-disk
+// corruption: delete a random file, flip random bytes, and tamper with the
+// crc32 checksum recorded in a backup meta file.
+class FileManager : public EnvWrapper {
+ public:
+  explicit FileManager(Env* t) : EnvWrapper(t), rnd_(5) {}
+
+  // Deletes one randomly chosen entry of `dir`, skipping "." and "..".
+  // NOTE(review): the size() <= 2 emptiness check assumes GetChildren always
+  // returns "." and ".." — confirm this holds on all platforms.
+  Status DeleteRandomFileInDir(const std::string dir) {
+    std::vector<std::string> children;
+    GetChildren(dir, &children);
+    if (children.size() <= 2) { // . and ..
+      return Status::NotFound("");
+    }
+    while (true) {
+      int i = rnd_.Next() % children.size();
+      if (children[i] != "." && children[i] != "..") {
+        return DeleteFile(dir + "/" + children[i]);
+      }
+    }
+    // should never get here
+    assert(false);
+    return Status::NotFound("");
+  }
+
+  // Overwrites `bytes_to_corrupt` random positions of `fname` with random
+  // bytes (positions may repeat).
+  Status CorruptFile(const std::string& fname, uint64_t bytes_to_corrupt) {
+    uint64_t size;
+    Status s = GetFileSize(fname, &size);
+    if (!s.ok()) {
+      return s;
+    }
+    unique_ptr<RandomRWFile> file;
+    EnvOptions env_options;
+    env_options.use_mmap_writes = false;
+    s = NewRandomRWFile(fname, &file, env_options);
+    if (!s.ok()) {
+      return s;
+    }
+
+    for (uint64_t i = 0; s.ok() && i < bytes_to_corrupt; ++i) {
+      std::string tmp;
+      // write one random byte to a random position
+      s = file->Write(rnd_.Next() % size, test::RandomString(&rnd_, 1, &tmp));
+    }
+    return s;
+  }
+
+  // Rewrites the first private-file crc32 entry in a backup meta file.
+  // appear_valid == true keeps the value numeric (so parsing succeeds but the
+  // checksum mismatches); false injects a non-digit so parsing itself fails.
+  Status CorruptChecksum(const std::string& fname, bool appear_valid) {
+    std::string metadata;
+    Status s = ReadFileToString(this, fname, &metadata);
+    if (!s.ok()) {
+      return s;
+    }
+    s = DeleteFile(fname);
+    if (!s.ok()) {
+      return s;
+    }
+
+    auto pos = metadata.find("private");
+    if (pos == std::string::npos) {
+      return Status::Corruption("private file is expected");
+    }
+    pos = metadata.find(" crc32 ", pos + 6);
+    if (pos == std::string::npos) {
+      return Status::Corruption("checksum not found");
+    }
+
+    // NOTE(review): the branches below index metadata[pos + 7] and
+    // metadata[pos + 8]; this bound looks like it should be pos + 9 — confirm.
+    if (metadata.size() < pos + 7) {
+      return Status::Corruption("bad CRC32 checksum value");
+    }
+
+    if (appear_valid) {
+      if (metadata[pos + 8] == '\n') {
+        // single digit value, safe to insert one more digit
+        metadata.insert(pos + 8, 1, '0');
+      } else {
+        metadata.erase(pos + 8, 1);
+      }
+    } else {
+      metadata[pos + 7] = 'a';
+    }
+
+    return WriteToFile(fname, metadata);
+  }
+
+  // Replaces `fname` with `data`. Relies on the WritableFile destructor to
+  // flush/close; no explicit Sync or Close is performed.
+  Status WriteToFile(const std::string& fname, const std::string& data) {
+    unique_ptr<WritableFile> file;
+    EnvOptions env_options;
+    env_options.use_mmap_writes = false;
+    Status s = EnvWrapper::NewWritableFile(fname, &file, env_options);
+    if (!s.ok()) {
+      return s;
+    }
+    return file->Append(Slice(data));
+  }
+
+ private:
+  Random rnd_;
+}; // FileManager
+
+// utility functions
+// Inserts keys "testkey<i>" -> "testvalue<i>" for i in [from, to) and
+// returns the total number of key + value bytes written.
+static size_t FillDB(DB* db, int from, int to) {
+  size_t bytes_written = 0;
+  for (int i = from; i < to; ++i) {
+    std::string key = "testkey" + std::to_string(i);
+    std::string value = "testvalue" + std::to_string(i);
+    bytes_written += key.size() + value.size();
+
+    ASSERT_OK(db->Put(WriteOptions(), Slice(key), Slice(value)));
+  }
+  return bytes_written;
+}
+
+// Asserts that every key in [from, to) exists in db with its expected value.
+static void AssertExists(DB* db, int from, int to) {
+  for (int i = from; i < to; ++i) {
+    std::string key = "testkey" + std::to_string(i);
+    std::string value;
+    Status s = db->Get(ReadOptions(), Slice(key), &value);
+    // BUGFIX: the status was fetched but never checked; a missing key left
+    // `value` empty and failed the ASSERT_EQ below with a confusing message.
+    ASSERT_OK(s);
+    ASSERT_EQ(value, "testvalue" + std::to_string(i));
+  }
+}
+
+// Asserts that no key in [from, to) is present in db.
+static void AssertEmpty(DB* db, int from, int to) {
+  for (int i = from; i < to; ++i) {
+    std::string key = "testkey" + std::to_string(i);
+    std::string value = "testvalue" + std::to_string(i);  // untouched on NotFound
+
+    Status s = db->Get(ReadOptions(), Slice(key), &value);
+    ASSERT_TRUE(s.IsNotFound());
+  }
+}
+
+// Shared fixture for all BackupableDB tests: a db dir and a backup dir under
+// TmpDir(), each wrapped in a TestEnv so tests can inject write/delete
+// failures, plus helpers to open/close the backup and restore handles and to
+// verify restored contents.
+class BackupableDBTest {
+ public:
+  BackupableDBTest() {
+    // set up files
+    dbname_ = test::TmpDir() + "/backupable_db";
+    backupdir_ = test::TmpDir() + "/backupable_db_backup";
+
+    // set up envs
+    env_ = Env::Default();
+    test_db_env_.reset(new TestEnv(env_));
+    test_backup_env_.reset(new TestEnv(env_));
+    file_manager_.reset(new FileManager(env_));
+
+    // set up db options
+    options_.create_if_missing = true;
+    options_.paranoid_checks = true;
+    options_.write_buffer_size = 1 << 17; // 128KB
+    options_.env = test_db_env_.get();
+    options_.wal_dir = dbname_;
+    // set up backup db options
+    CreateLoggerFromOptions(dbname_, backupdir_, env_,
+                            DBOptions(), &logger_);
+    backupable_options_.reset(new BackupableDBOptions(
+        backupdir_, test_backup_env_.get(), true, logger_.get(), true));
+
+    // delete old files in db
+    DestroyDB(dbname_, Options());
+  }
+
+  // Opens a plain (non-backupable) DB on dbname_; caller owns the pointer.
+  DB* OpenDB() {
+    DB* db;
+    ASSERT_OK(DB::Open(options_, dbname_, &db));
+    return db;
+  }
+
+  // Opens db_ as a BackupableDB. With dummy == true the underlying "DB" is a
+  // DummyDB (stored in dummy_db_, owned by the BackupableDB wrapper).
+  void OpenBackupableDB(bool destroy_old_data = false, bool dummy = false,
+                        bool share_table_files = true,
+                        bool share_with_checksums = false) {
+    // reset all the defaults
+    test_backup_env_->SetLimitWrittenFiles(1000000);
+    test_db_env_->SetLimitWrittenFiles(1000000);
+    test_db_env_->SetDummySequentialFile(dummy);
+
+    DB* db;
+    if (dummy) {
+      dummy_db_ = new DummyDB(options_, dbname_);
+      db = dummy_db_;
+    } else {
+      ASSERT_OK(DB::Open(options_, dbname_, &db));
+    }
+    backupable_options_->destroy_old_data = destroy_old_data;
+    backupable_options_->share_table_files = share_table_files;
+    backupable_options_->share_files_with_checksum = share_with_checksums;
+    db_.reset(new BackupableDB(db, *backupable_options_));
+  }
+
+  void CloseBackupableDB() {
+    db_.reset(nullptr);
+  }
+
+  // Opens restore_db_ without destroying existing backups.
+  void OpenRestoreDB() {
+    backupable_options_->destroy_old_data = false;
+    restore_db_.reset(
+        new RestoreBackupableDB(test_db_env_.get(), *backupable_options_));
+  }
+
+  void CloseRestoreDB() {
+    restore_db_.reset(nullptr);
+  }
+
+  // restores backup backup_id and asserts the existence of
+  // [start_exist, end_exist> and not-existence of
+  // [end_exist, end>
+  //
+  // if backup_id == 0, it means restore from latest
+  // if end == 0, don't check AssertEmpty
+  void AssertBackupConsistency(BackupID backup_id, uint32_t start_exist,
+                               uint32_t end_exist, uint32_t end = 0,
+                               bool keep_log_files = false) {
+    RestoreOptions restore_options(keep_log_files);
+    bool opened_restore = false;
+    if (restore_db_.get() == nullptr) {
+      // open a temporary restore handle and close it again at the end
+      opened_restore = true;
+      OpenRestoreDB();
+    }
+    if (backup_id > 0) {
+      ASSERT_OK(restore_db_->RestoreDBFromBackup(backup_id, dbname_, dbname_,
+                                                 restore_options));
+    } else {
+      ASSERT_OK(restore_db_->RestoreDBFromLatestBackup(dbname_, dbname_,
+                                                       restore_options));
+    }
+    DB* db = OpenDB();
+    AssertExists(db, start_exist, end_exist);
+    if (end != 0) {
+      AssertEmpty(db, end_exist, end);
+    }
+    delete db;
+    if (opened_restore) {
+      CloseRestoreDB();
+    }
+  }
+
+  // Deletes every WAL (kLogFile) file in the db directory.
+  void DeleteLogFiles() {
+    std::vector<std::string> delete_logs;
+    env_->GetChildren(dbname_, &delete_logs);
+    for (auto f : delete_logs) {
+      uint64_t number;
+      FileType type;
+      bool ok = ParseFileName(f, &number, &type);
+      if (ok && type == kLogFile) {
+        env_->DeleteFile(dbname_ + "/" + f);
+      }
+    }
+  }
+
+  // files
+  std::string dbname_;
+  std::string backupdir_;
+
+  // envs
+  Env* env_;
+  unique_ptr<TestEnv> test_db_env_;
+  unique_ptr<TestEnv> test_backup_env_;
+  unique_ptr<FileManager> file_manager_;
+
+  // all the dbs!
+  // NOTE(review): dummy_db_ stays uninitialized until OpenBackupableDB() is
+  // called with dummy == true; only read it after such a call.
+  DummyDB* dummy_db_; // BackupableDB owns dummy_db_
+  unique_ptr<BackupableDB> db_;
+  unique_ptr<RestoreBackupableDB> restore_db_;
+
+  // options
+  Options options_;
+  unique_ptr<BackupableDBOptions> backupable_options_;
+  std::shared_ptr<Logger> logger_;
+}; // BackupableDBTest
+
+// Prefixes every entry of v with `path`, modifying the vector in place.
+void AppendPath(const std::string& path, std::vector<std::string>& v) {
+  for (auto& f : v) {
+    f = path + f;
+  }
+}
+
+// this will make sure that backup does not copy the same file twice
+TEST(BackupableDBTest, NoDoubleCopy) {
+  OpenBackupableDB(true, true);  // destroy old data, back a DummyDB
+
+  // should write 5 DB files + LATEST_BACKUP + one meta file
+  test_backup_env_->SetLimitWrittenFiles(7);
+  test_backup_env_->ClearWrittenFiles();
+  test_db_env_->SetLimitWrittenFiles(0);  // backup must not write to the db dir
+  dummy_db_->live_files_ = { "/00010.sst", "/00011.sst",
+                             "/CURRENT",   "/MANIFEST-01" };
+  dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}};
+  ASSERT_OK(db_->CreateNewBackup(false));
+  std::vector<std::string> should_have_written = {
+    "/shared/00010.sst.tmp",
+    "/shared/00011.sst.tmp",
+    "/private/1.tmp/CURRENT",
+    "/private/1.tmp/MANIFEST-01",
+    "/private/1.tmp/00011.log",
+    "/meta/1.tmp",
+    "/LATEST_BACKUP.tmp"
+  };
+  // dbname_ + "_backup" is the same path as backupdir_
+  AppendPath(dbname_ + "_backup", should_have_written);
+  test_backup_env_->AssertWrittenFiles(should_have_written);
+
+  // should write 4 new DB files + LATEST_BACKUP + one meta file
+  // should not write/copy 00010.sst, since it's already there!
+  test_backup_env_->SetLimitWrittenFiles(6);
+  test_backup_env_->ClearWrittenFiles();
+  dummy_db_->live_files_ = { "/00010.sst", "/00015.sst",
+                             "/CURRENT",   "/MANIFEST-01" };
+  dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}};
+  ASSERT_OK(db_->CreateNewBackup(false));
+  // should not open 00010.sst - it's already there
+  should_have_written = {
+    "/shared/00015.sst.tmp",
+    "/private/2.tmp/CURRENT",
+    "/private/2.tmp/MANIFEST-01",
+    "/private/2.tmp/00011.log",
+    "/meta/2.tmp",
+    "/LATEST_BACKUP.tmp"
+  };
+  AppendPath(dbname_ + "_backup", should_have_written);
+  test_backup_env_->AssertWrittenFiles(should_have_written);
+
+  // shared files referenced by a surviving backup must stay alive
+  ASSERT_OK(db_->DeleteBackup(1));
+  ASSERT_EQ(true,
+            test_backup_env_->FileExists(backupdir_ + "/shared/00010.sst"));
+  // 00011.sst was only in backup 1, should be deleted
+  ASSERT_EQ(false,
+            test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst"));
+  ASSERT_EQ(true,
+            test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst"));
+
+  // MANIFEST file size should be only 100
+  // (sizes come from DummyDB's fixed manifest size and TestEnv's 200-byte
+  // DummySequentialFile)
+  uint64_t size;
+  test_backup_env_->GetFileSize(backupdir_ + "/private/2/MANIFEST-01", &size);
+  ASSERT_EQ(100UL, size);
+  test_backup_env_->GetFileSize(backupdir_ + "/shared/00015.sst", &size);
+  ASSERT_EQ(200UL, size);
+
+  CloseBackupableDB();
+}
+
+// test various kind of corruptions that may happen:
+// 1. Not able to write a file for backup - that backup should fail,
+//      everything else should work
+// 2. Corrupted/deleted LATEST_BACKUP - everything should work fine
+// 3. Corrupted backup meta file or missing backed-up file - we should
+//      not be able to open that backup, but all other backups should be
+//      fine
+// 4. Corrupted checksum value - if the checksum is not a valid uint32_t,
+//      db open should fail, otherwise, it aborts during the restore process.
+TEST(BackupableDBTest, CorruptionsTest) {
+  const int keys_iteration = 5000;
+  Random rnd(6);
+  Status s;
+
+  OpenBackupableDB(true);
+  // create five backups
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2)));
+  }
+
+  // ---------- case 1. - fail a write -----------
+  // try creating backup 6, but fail a write
+  FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6);
+  test_backup_env_->SetLimitWrittenFiles(2);
+  // should fail
+  s = db_->CreateNewBackup(!!(rnd.Next() % 2));
+  ASSERT_TRUE(!s.ok());
+  test_backup_env_->SetLimitWrittenFiles(1000000);
+  // latest backup should have all the keys
+  CloseBackupableDB();
+  AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6);
+
+  // ---------- case 2. - corrupt/delete latest backup -----------
+  ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/LATEST_BACKUP", 2));
+  AssertBackupConsistency(0, 0, keys_iteration * 5);
+  ASSERT_OK(file_manager_->DeleteFile(backupdir_ + "/LATEST_BACKUP"));
+  AssertBackupConsistency(0, 0, keys_iteration * 5);
+  // create backup 6, point LATEST_BACKUP to 5
+  OpenBackupableDB();
+  FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6);
+  ASSERT_OK(db_->CreateNewBackup(false));
+  CloseBackupableDB();
+  ASSERT_OK(file_manager_->WriteToFile(backupdir_ + "/LATEST_BACKUP", "5"));
+  AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6);
+  // assert that all 6 data is gone!
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/6") == false);
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/6") == false);
+
+  // --------- case 3. corrupted backup meta or missing backed-up file ----
+  ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/5", 3));
+  // since 5 meta is now corrupted, latest backup should be 4
+  AssertBackupConsistency(0, 0, keys_iteration * 4, keys_iteration * 5);
+  OpenRestoreDB();
+  s = restore_db_->RestoreDBFromBackup(5, dbname_, dbname_);
+  ASSERT_TRUE(!s.ok());
+  CloseRestoreDB();
+  ASSERT_OK(file_manager_->DeleteRandomFileInDir(backupdir_ + "/private/4"));
+  // 4 is corrupted, 3 is the latest backup now
+  AssertBackupConsistency(0, 0, keys_iteration * 3, keys_iteration * 5);
+  OpenRestoreDB();
+  s = restore_db_->RestoreDBFromBackup(4, dbname_, dbname_);
+  CloseRestoreDB();
+  ASSERT_TRUE(!s.ok());
+
+  // --------- case 4. corrupted checksum value ----
+  ASSERT_OK(file_manager_->CorruptChecksum(backupdir_ + "/meta/3", false));
+  // checksum of backup 3 is an invalid value, this can be detected at
+  // db open time, and it reverts to the previous backup automatically
+  AssertBackupConsistency(0, 0, keys_iteration * 2, keys_iteration * 5);
+  // checksum of the backup 2 appears to be valid, this can cause checksum
+  // mismatch and abort restore process
+  ASSERT_OK(file_manager_->CorruptChecksum(backupdir_ + "/meta/2", true));
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2"));
+  OpenRestoreDB();
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2"));
+  s = restore_db_->RestoreDBFromBackup(2, dbname_, dbname_);
+  ASSERT_TRUE(!s.ok());
+  ASSERT_OK(restore_db_->DeleteBackup(2));
+  CloseRestoreDB();
+  AssertBackupConsistency(0, 0, keys_iteration * 1, keys_iteration * 5);
+
+  // new backup should be 2!
+  OpenBackupableDB();
+  FillDB(db_.get(), keys_iteration * 1, keys_iteration * 2);
+  ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2)));
+  CloseBackupableDB();
+  AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * 5);
+}
+
+// open DB, write, close DB, backup, restore, repeat
+TEST(BackupableDBTest, OfflineIntegrationTest) {
+  // has to be a big number, so that it triggers the memtable flush
+  const int keys_iteration = 5000;
+  const int max_key = keys_iteration * 4 + 10;
+  // first iter -- flush before backup
+  // second iter -- don't flush before backup
+  for (int iter = 0; iter < 2; ++iter) {
+    // delete old data
+    DestroyDB(dbname_, Options());
+    bool destroy_data = true;
+
+    // every iteration --
+    // 1. insert new data in the DB
+    // 2. backup the DB
+    // 3. destroy the db
+    // 4. restore the db, check everything is still there
+    for (int i = 0; i < 5; ++i) {
+      // in last iteration, put smaller amount of data,
+      int fill_up_to = std::min(keys_iteration * (i + 1), max_key);
+      // ---- insert new data and back up ----
+      OpenBackupableDB(destroy_data);
+      destroy_data = false;  // only wipe old backups on the first round
+      FillDB(db_.get(), keys_iteration * i, fill_up_to);
+      ASSERT_OK(db_->CreateNewBackup(iter == 0));
+      CloseBackupableDB();
+      DestroyDB(dbname_, Options());
+
+      // ---- make sure it's empty ----
+      DB* db = OpenDB();
+      AssertEmpty(db, 0, fill_up_to);
+      delete db;
+
+      // ---- restore the DB ----
+      OpenRestoreDB();
+      if (i >= 3) { // test purge old backups
+        // when i == 4, purge to only 1 backup
+        // when i == 3, purge to 2 backups
+        ASSERT_OK(restore_db_->PurgeOldBackups(5 - i));
+      }
+      // ---- make sure the data is there ---
+      AssertBackupConsistency(0, 0, fill_up_to, max_key);
+      CloseRestoreDB();
+    }
+  }
+}
+
+// open DB, write, backup, write, backup, close, restore
+TEST(BackupableDBTest, OnlineIntegrationTest) {
+  // has to be a big number, so that it triggers the memtable flush
+  const int keys_iteration = 5000;
+  const int max_key = keys_iteration * 4 + 10;
+  Random rnd(7);
+  // delete old data
+  DestroyDB(dbname_, Options());
+
+  OpenBackupableDB(true);
+  // write some data, backup, repeat
+  for (int i = 0; i < 5; ++i) {
+    if (i == 4) {
+      // delete backup number 2, online delete!
+      // (restore handle operates while the backupable DB is still open)
+      OpenRestoreDB();
+      ASSERT_OK(restore_db_->DeleteBackup(2));
+      CloseRestoreDB();
+    }
+    // in last iteration, put smaller amount of data,
+    // so that backups can share sst files
+    int fill_up_to = std::min(keys_iteration * (i + 1), max_key);
+    FillDB(db_.get(), keys_iteration * i, fill_up_to);
+    // we should get consistent results with flush_before_backup
+    // set to both true and false
+    ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2)));
+  }
+  // close and destroy
+  CloseBackupableDB();
+  DestroyDB(dbname_, Options());
+
+  // ---- make sure it's empty ----
+  DB* db = OpenDB();
+  AssertEmpty(db, 0, max_key);
+  delete db;
+
+  // ---- restore every backup and verify all the data is there ----
+  OpenRestoreDB();
+  for (int i = 1; i <= 5; ++i) {
+    if (i == 2) {
+      // we deleted backup 2
+      Status s = restore_db_->RestoreDBFromBackup(2, dbname_, dbname_);
+      ASSERT_TRUE(!s.ok());
+    } else {
+      int fill_up_to = std::min(keys_iteration * i, max_key);
+      AssertBackupConsistency(i, 0, fill_up_to, max_key);
+    }
+  }
+
+  // delete some backups -- this should leave only backups 3 and 5 alive
+  ASSERT_OK(restore_db_->DeleteBackup(4));
+  ASSERT_OK(restore_db_->PurgeOldBackups(2));
+
+  std::vector<BackupInfo> backup_info;
+  restore_db_->GetBackupInfo(&backup_info);
+  ASSERT_EQ(2UL, backup_info.size());
+
+  // check backup 3
+  AssertBackupConsistency(3, 0, 3 * keys_iteration, max_key);
+  // check backup 5
+  AssertBackupConsistency(5, 0, max_key);
+
+  CloseRestoreDB();
+}
+
+// After restoring an older backup, re-generated sst files can collide with
+// files recorded by newer backups; such a backup must fail with Corruption
+// until the conflicting backups are deleted.
+TEST(BackupableDBTest, FailOverwritingBackups) {
+  options_.write_buffer_size = 1024 * 1024 * 1024;  // 1GB
+  // create backups 1, 2, 3, 4, 5
+  for (int i = 0; i < 5; ++i) {
+    CloseBackupableDB();
+    DeleteLogFiles();
+    OpenBackupableDB(false);
+    FillDB(db_.get(), 100 * i, 100 * (i + 1));
+    ASSERT_OK(db_->CreateNewBackup(true));
+  }
+  CloseBackupableDB();
+
+  // restore 3
+  OpenRestoreDB();
+  ASSERT_OK(restore_db_->RestoreDBFromBackup(3, dbname_, dbname_));
+  CloseRestoreDB();
+
+  OpenBackupableDB(false);
+  FillDB(db_.get(), 0, 300);
+  Status s = db_->CreateNewBackup(true);
+  // the new backup fails because new table files
+  // clash with old table files from backups 4 and 5
+  // (since write_buffer_size is huge, we can be sure that
+  // each backup will generate only one sst file and that
+  // a file generated by a new backup is the same as
+  // sst file generated by backup 4)
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_OK(db_->DeleteBackup(4));
+  ASSERT_OK(db_->DeleteBackup(5));
+  // now, the backup can succeed
+  ASSERT_OK(db_->CreateNewBackup(true));
+  CloseBackupableDB();
+}
+
+// Verify backup and restore still work when table-file sharing between
+// backups is disabled (share_table_files == false).
+TEST(BackupableDBTest, NoShareTableFiles) {
+  const int keys_iteration = 5000;
+  OpenBackupableDB(true, false, false);
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(db_->CreateNewBackup(!!(i % 2)));
+  }
+  CloseBackupableDB();
+
+  // backup i+1 must contain exactly keys [0, keys_iteration * (i + 1))
+  for (int i = 0; i < 5; ++i) {
+    AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+                            keys_iteration * 6);
+  }
+}
+
+// Verify that you can backup and restore with share_files_with_checksum on
+TEST(BackupableDBTest, ShareTableFilesWithChecksums) {
+  const int keys_iteration = 5000;
+  OpenBackupableDB(true, false, true, true);
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(db_->CreateNewBackup(!!(i % 2)));
+  }
+  CloseBackupableDB();
+
+  // backup i+1 must contain exactly keys [0, keys_iteration * (i + 1))
+  for (int i = 0; i < 5; ++i) {
+    AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+                            keys_iteration * 6);
+  }
+}
+
+// Verify that you can backup and restore using share_files_with_checksum set to
+// false and then transition this option to true
+TEST(BackupableDBTest, ShareTableFilesWithChecksumsTransition) {
+  const int keys_iteration = 5000;
+  // set share_files_with_checksum to false
+  OpenBackupableDB(true, false, true, false);
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(db_->CreateNewBackup(true));
+  }
+  CloseBackupableDB();
+
+  // verify the pre-transition backups
+  for (int i = 0; i < 5; ++i) {
+    AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+                            keys_iteration * 6);
+  }
+
+  // set share_files_with_checksum to true and do some more backups
+  OpenBackupableDB(true, false, true, true);
+  for (int i = 5; i < 10; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(db_->CreateNewBackup(true));
+  }
+  CloseBackupableDB();
+
+  // verify the post-transition backups (ids restart after destroy_old_data)
+  for (int i = 0; i < 5; ++i) {
+    AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 5 + 1),
+                            keys_iteration * 11);
+  }
+}
+
// Leftover *.tmp files from an interrupted backup must be removed the next
// time a BackupableDB is opened over the same backup directory.
TEST(BackupableDBTest, DeleteTmpFiles) {
  OpenBackupableDB();
  CloseBackupableDB();
  // plant fake temporary leftovers in both the shared and private areas
  std::string shared_tmp = backupdir_ + "/shared/00006.sst.tmp";
  std::string private_tmp_dir = backupdir_ + "/private/10.tmp";
  std::string private_tmp_file = private_tmp_dir + "/00003.sst";
  file_manager_->WriteToFile(shared_tmp, "tmp");
  file_manager_->CreateDir(private_tmp_dir);
  file_manager_->WriteToFile(private_tmp_file, "tmp");
  ASSERT_EQ(true, file_manager_->FileExists(private_tmp_dir));
  // re-opening the backup engine should clean all of them up
  OpenBackupableDB();
  CloseBackupableDB();
  ASSERT_EQ(false, file_manager_->FileExists(shared_tmp));
  ASSERT_EQ(false, file_manager_->FileExists(private_tmp_file));
  ASSERT_EQ(false, file_manager_->FileExists(private_tmp_dir));
}
+
// With backup_log_files disabled, a restore must still see all the data as
// long as it keeps the live WAL files (keep_log_files == true in the final
// AssertBackupConsistency call).
TEST(BackupableDBTest, KeepLogFiles) {
  backupable_options_->backup_log_files = false;
  // basically infinite
  options_.WAL_ttl_seconds = 24 * 60 * 60;
  OpenBackupableDB(true);
  FillDB(db_.get(), 0, 100);
  ASSERT_OK(db_->Flush(FlushOptions()));
  FillDB(db_.get(), 100, 200);
  ASSERT_OK(db_->CreateNewBackup(false));
  // everything written after the backup lives only in WAL / later flushes
  FillDB(db_.get(), 200, 300);
  ASSERT_OK(db_->Flush(FlushOptions()));
  FillDB(db_.get(), 300, 400);
  ASSERT_OK(db_->Flush(FlushOptions()));
  FillDB(db_.get(), 400, 500);
  ASSERT_OK(db_->Flush(FlushOptions()));
  CloseBackupableDB();

  // all data should be there if we call with keep_log_files = true
  AssertBackupConsistency(0, 0, 500, 600, true);
}
+
+TEST(BackupableDBTest, RateLimiting) {
+  uint64_t const KB = 1024 * 1024;
+  size_t const kMicrosPerSec = 1000 * 1000LL;
+
+  std::vector<std::pair<uint64_t, uint64_t>> limits(
+      {{KB, 5 * KB}, {2 * KB, 3 * KB}});
+
+  for (const auto& limit : limits) {
+    // destroy old data
+    DestroyDB(dbname_, Options());
+
+    backupable_options_->backup_rate_limit = limit.first;
+    backupable_options_->restore_rate_limit = limit.second;
+    options_.compression = kNoCompression;
+    OpenBackupableDB(true);
+    size_t bytes_written = FillDB(db_.get(), 0, 100000);
+
+    auto start_backup = env_->NowMicros();
+    ASSERT_OK(db_->CreateNewBackup(false));
+    auto backup_time = env_->NowMicros() - start_backup;
+    auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) /
+                                    backupable_options_->backup_rate_limit;
+    ASSERT_GT(backup_time, 0.9 * rate_limited_backup_time);
+    ASSERT_LT(backup_time, 2.5 * rate_limited_backup_time);
+
+    CloseBackupableDB();
+
+    OpenRestoreDB();
+    auto start_restore = env_->NowMicros();
+    ASSERT_OK(restore_db_->RestoreDBFromLatestBackup(dbname_, dbname_));
+    auto restore_time = env_->NowMicros() - start_restore;
+    CloseRestoreDB();
+    auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) /
+                                     backupable_options_->restore_rate_limit;
+    ASSERT_GT(restore_time, 0.9 * rate_limited_restore_time);
+    ASSERT_LT(restore_time, 2.5 * rate_limited_restore_time);
+
+    AssertBackupConsistency(0, 0, 100000, 100010);
+  }
+}
+
// A read-only backup engine must be able to list and restore backups
// without writing or deleting anything in the backup directory.
TEST(BackupableDBTest, ReadOnlyBackupEngine) {
  DestroyDB(dbname_, Options());
  OpenBackupableDB(true);
  FillDB(db_.get(), 0, 100);
  ASSERT_OK(db_->CreateNewBackup(true));
  FillDB(db_.get(), 100, 200);
  ASSERT_OK(db_->CreateNewBackup(true));
  CloseBackupableDB();
  DestroyDB(dbname_, Options());

  backupable_options_->destroy_old_data = false;
  test_backup_env_->ClearWrittenFiles();
  // with a delete limit of 0, any attempted delete by the engine would fail
  test_backup_env_->SetLimitDeleteFiles(0);
  auto read_only_backup_engine =
      BackupEngineReadOnly::NewReadOnlyBackupEngine(env_, *backupable_options_);
  std::vector<BackupInfo> backup_info;
  read_only_backup_engine->GetBackupInfo(&backup_info);
  ASSERT_EQ(backup_info.size(), 2U);

  RestoreOptions restore_options(false);
  ASSERT_OK(read_only_backup_engine->RestoreDBFromLatestBackup(
      dbname_, dbname_, restore_options));
  delete read_only_backup_engine;
  // the read-only engine must not have written a single file
  std::vector<std::string> should_have_written;
  test_backup_env_->AssertWrittenFiles(should_have_written);

  DB* db = OpenDB();
  AssertExists(db, 0, 200);
  delete db;
}
+
+}  // anon namespace
+
+} //  namespace rocksdb
+
// Entry point: runs every TEST registered above via the rocksdb harness.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
diff --git a/utilities/geodb/geodb_impl.cc b/utilities/geodb/geodb_impl.cc
new file mode 100644 (file)
index 0000000..065e5ca
--- /dev/null
@@ -0,0 +1,431 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef ROCKSDB_LITE
+
+#include "utilities/geodb/geodb_impl.h"
+
+#define __STDC_FORMAT_MACROS
+
+#include <vector>
+#include <map>
+#include <string>
+#include <limits>
+#include "db/filename.h"
+#include "util/coding.h"
+
+//
+// There are two types of keys. The first type of key-values
+// maps a geo location to the set of object ids and their values.
+// Table 1
+//   key     : p + : + $quadkey + : + $id +
+//             : + $latitude + : + $longitude
+//   value  :  value of the object
+// This table can be used to find all objects that reside near
+// a specified geolocation.
+//
+// Table 2
+//   key  : 'k' + : + $id
+//   value:  $quadkey
+
+namespace rocksdb {
+
// Wraps an already-open DB. The GeoDB base is handed the same `db` and
// options; a local copy of the handle and options is kept as well.
GeoDBImpl::GeoDBImpl(DB* db, const GeoDBOptions& options) :
  GeoDB(db, options), db_(db), options_(options) {
}
+
// Intentionally empty: db_ is not deleted here (ownership presumably stays
// with the creator or the GeoDB base — confirm against stackable_db.h).
GeoDBImpl::~GeoDBImpl() {
}
+
// Inserts `obj`, overwriting any object previously stored under the same
// id. Both table entries (see the key layout described at the top of this
// file) are written through one WriteBatch so they change atomically.
Status GeoDBImpl::Insert(const GeoObject& obj) {
  WriteBatch batch;

  // It is possible that this id is already associated with
  // with a different position. We first have to remove that
  // association before we can insert the new one.

  // remove existing object, if it exists
  GeoObject old;
  Status status = GetById(obj.id, &old);
  if (status.ok()) {
    assert(obj.id.compare(old.id) == 0);
    std::string quadkey = PositionToQuad(old.position, Detail);
    std::string key1 = MakeKey1(old.position, old.id, quadkey);
    std::string key2 = MakeKey2(old.id);
    batch.Delete(Slice(key1));
    batch.Delete(Slice(key2));
  } else if (status.IsNotFound()) {
    // What if another thread is trying to insert the same ID concurrently?
  } else {
    // propagate genuine read errors (anything other than ok / not-found)
    return status;
  }

  // insert new object
  std::string quadkey = PositionToQuad(obj.position, Detail);
  std::string key1 = MakeKey1(obj.position, obj.id, quadkey);
  std::string key2 = MakeKey2(obj.id);
  batch.Put(Slice(key1), Slice(obj.value));
  batch.Put(Slice(key2), Slice(quadkey));
  return db_->Write(woptions_, &batch);
}
+
+Status GeoDBImpl::GetByPosition(const GeoPosition& pos,
+                                const Slice& id,
+                                std::string* value) {
+  std::string quadkey = PositionToQuad(pos, Detail);
+  std::string key1 = MakeKey1(pos, id, quadkey);
+  return db_->Get(roptions_, Slice(key1), value);
+}
+
+Status GeoDBImpl::GetById(const Slice& id, GeoObject* object) {
+  Status status;
+  Slice quadkey;
+
+  // create an iterator so that we can get a consistent picture
+  // of the database.
+  Iterator* iter = db_->NewIterator(roptions_);
+
+  // create key for table2
+  std::string kt = MakeKey2(id);
+  Slice key2(kt);
+
+  iter->Seek(key2);
+  if (iter->Valid() && iter->status().ok()) {
+    if (iter->key().compare(key2) == 0) {
+      quadkey = iter->value();
+    }
+  }
+  if (quadkey.size() == 0) {
+    delete iter;
+    return Status::NotFound(key2);
+  }
+
+  //
+  // Seek to the quadkey + id prefix
+  //
+  std::string prefix = MakeKey1Prefix(quadkey.ToString(), id);
+  iter->Seek(Slice(prefix));
+  assert(iter->Valid());
+  if (!iter->Valid() || !iter->status().ok()) {
+    delete iter;
+    return Status::NotFound();
+  }
+
+  // split the key into p + quadkey + id + lat + lon
+  std::vector<std::string> parts;
+  Slice key = iter->key();
+  StringSplit(&parts, key.ToString(), ':');
+  assert(parts.size() == 5);
+  assert(parts[0] == "p");
+  assert(parts[1] == quadkey);
+  assert(parts[2] == id);
+
+  // fill up output parameters
+  object->position.latitude = atof(parts[3].c_str());
+  object->position.longitude = atof(parts[4].c_str());
+  object->id = id.ToString();  // this is redundant
+  object->value = iter->value().ToString();
+  delete iter;
+  return Status::OK();
+}
+
+
+Status GeoDBImpl::Remove(const Slice& id) {
+  // Read the object from the database
+  GeoObject obj;
+  Status status = GetById(id, &obj);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // remove the object by atomically deleting it from both tables
+  std::string quadkey = PositionToQuad(obj.position, Detail);
+  std::string key1 = MakeKey1(obj.position, obj.id, quadkey);
+  std::string key2 = MakeKey2(obj.id);
+  WriteBatch batch;
+  batch.Delete(Slice(key1));
+  batch.Delete(Slice(key2));
+  return db_->Write(woptions_, &batch);
+}
+
+Status GeoDBImpl::SearchRadial(const GeoPosition& pos,
+  double radius,
+  std::vector<GeoObject>* values,
+  int number_of_values) {
+  // Gather all bounding quadkeys
+  std::vector<std::string> qids;
+  Status s = searchQuadIds(pos, radius, &qids);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // create an iterator
+  Iterator* iter = db_->NewIterator(ReadOptions());
+
+  // Process each prospective quadkey
+  for (std::string qid : qids) {
+    // The user is interested in only these many objects.
+    if (number_of_values == 0) {
+      break;
+    }
+
+    // convert quadkey to db key prefix
+    std::string dbkey = MakeQuadKeyPrefix(qid);
+
+    for (iter->Seek(dbkey);
+         number_of_values > 0 && iter->Valid() && iter->status().ok();
+         iter->Next()) {
+      // split the key into p + quadkey + id + lat + lon
+      std::vector<std::string> parts;
+      Slice key = iter->key();
+      StringSplit(&parts, key.ToString(), ':');
+      assert(parts.size() == 5);
+      assert(parts[0] == "p");
+      std::string* quadkey = &parts[1];
+
+      // If the key we are looking for is a prefix of the key
+      // we found from the database, then this is one of the keys
+      // we are looking for.
+      auto res = std::mismatch(qid.begin(), qid.end(), quadkey->begin());
+      if (res.first == qid.end()) {
+        GeoPosition pos(atof(parts[3].c_str()), atof(parts[4].c_str()));
+        GeoObject obj(pos, parts[4], iter->value().ToString());
+        values->push_back(obj);
+        number_of_values--;
+      } else {
+        break;
+      }
+    }
+  }
+  delete iter;
+  return Status::OK();
+}
+
+std::string GeoDBImpl::MakeKey1(const GeoPosition& pos, Slice id,
+                                std::string quadkey) {
+  std::string lat = std::to_string(pos.latitude);
+  std::string lon = std::to_string(pos.longitude);
+  std::string key = "p:";
+  key.reserve(5 + quadkey.size() + id.size() + lat.size() + lon.size());
+  key.append(quadkey);
+  key.append(":");
+  key.append(id.ToString());
+  key.append(":");
+  key.append(lat);
+  key.append(":");
+  key.append(lon);
+  return key;
+}
+
+std::string GeoDBImpl::MakeKey2(Slice id) {
+  std::string key = "k:";
+  key.append(id.ToString());
+  return key;
+}
+
+std::string GeoDBImpl::MakeKey1Prefix(std::string quadkey,
+                                      Slice id) {
+  std::string key = "p:";
+  key.reserve(3 + quadkey.size() + id.size());
+  key.append(quadkey);
+  key.append(":");
+  key.append(id.ToString());
+  return key;
+}
+
+std::string GeoDBImpl::MakeQuadKeyPrefix(std::string quadkey) {
+  std::string key = "p:";
+  key.append(quadkey);
+  return key;
+}
+
+void GeoDBImpl::StringSplit(std::vector<std::string>* tokens,
+                            const std::string &text, char sep) {
+  std::size_t start = 0, end = 0;
+  while ((end = text.find(sep, start)) != std::string::npos) {
+    tokens->push_back(text.substr(start, end - start));
+    start = end + 1;
+  }
+  tokens->push_back(text.substr(start));
+}
+
// convert degrees to radians
// x: angle in degrees; returns the same angle in radians
double GeoDBImpl::radians(double x) {
  return (x * PI) / 180;
}
+
// convert radians to degrees
// x: angle in radians; returns the same angle in degrees
double GeoDBImpl::degrees(double x) {
  return (x * 180) / PI;
}
+
+// convert a gps location to quad coordinate
+std::string GeoDBImpl::PositionToQuad(const GeoPosition& pos,
+                                      int levelOfDetail) {
+  Pixel p = PositionToPixel(pos, levelOfDetail);
+  Tile tile = PixelToTile(p);
+  return TileToQuadKey(tile, levelOfDetail);
+}
+
// Returns the position reached by moving `deltay` meters north and
// `deltax` meters east of (lat, lon), using a small-offset spherical
// approximation (the east-west circle radius shrinks by cos(latitude)).
GeoPosition GeoDBImpl::displaceLatLon(double lat, double lon,
                                      double deltay, double deltax) {
  double dLat = deltay / EarthRadius;
  double dLon = deltax / (EarthRadius * cos(radians(lat)));
  return GeoPosition(lat + degrees(dLat),
                     lon + degrees(dLon));
}
+
//
// Return the distance between two positions on the earth (in meters),
// computed with the haversine formula on a sphere of radius EarthRadius.
// Inputs are in degrees.
//
double GeoDBImpl::distance(double lat1, double lon1,
                           double lat2, double lon2) {
  double lon = radians(lon2 - lon1);
  double lat = radians(lat2 - lat1);

  // haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
  double a = (sin(lat / 2) * sin(lat / 2)) +
              cos(radians(lat1)) * cos(radians(lat2)) *
              (sin(lon / 2) * sin(lon / 2));
  double angle = 2 * atan2(sqrt(a), sqrt(1 - a));
  return angle * EarthRadius;
}
+
+//
+// Returns all the quadkeys inside the search range
+//
+Status GeoDBImpl::searchQuadIds(const GeoPosition& position,
+                                double radius,
+                                std::vector<std::string>* quadKeys) {
+  // get the outline of the search square
+  GeoPosition topLeftPos = boundingTopLeft(position, radius);
+  GeoPosition bottomRightPos = boundingBottomRight(position, radius);
+
+  Pixel topLeft =  PositionToPixel(topLeftPos, Detail);
+  Pixel bottomRight =  PositionToPixel(bottomRightPos, Detail);
+
+  // how many level of details to look for
+  int numberOfTilesAtMaxDepth = floor((bottomRight.x - topLeft.x) / 256);
+  int zoomLevelsToRise = floor(log(numberOfTilesAtMaxDepth) / log(2));
+  zoomLevelsToRise++;
+  int levels = std::max(0, Detail - zoomLevelsToRise);
+
+  quadKeys->push_back(PositionToQuad(GeoPosition(topLeftPos.latitude,
+                                                 topLeftPos.longitude),
+                                     levels));
+  quadKeys->push_back(PositionToQuad(GeoPosition(topLeftPos.latitude,
+                                                 bottomRightPos.longitude),
+                                     levels));
+  quadKeys->push_back(PositionToQuad(GeoPosition(bottomRightPos.latitude,
+                                                 topLeftPos.longitude),
+                                     levels));
+  quadKeys->push_back(PositionToQuad(GeoPosition(bottomRightPos.latitude,
+                                                 bottomRightPos.longitude),
+                                     levels));
+  return Status::OK();
+}
+
// Determines the ground resolution (in meters per pixel) at a specified
// latitude and level of detail.
// latitude: in degrees; clipped to the valid Mercator range
//           [MinLatitude, MaxLatitude].
// levelOfDetail: from 1 (lowest detail) to 23 (highest detail).
// Returns the ground resolution, in meters per pixel.
double GeoDBImpl::GroundResolution(double latitude, int levelOfDetail) {
  latitude = clip(latitude, MinLatitude, MaxLatitude);
  // circumference of the parallel at this latitude divided by map width
  return cos(latitude * PI / 180) * 2 * PI * EarthRadius /
         MapSize(levelOfDetail);
}
+
// Converts a point from latitude/longitude WGS-84 coordinates (in degrees)
// into pixel XY coordinates at a specified level of detail, using the
// standard Web-Mercator projection.
GeoDBImpl::Pixel GeoDBImpl::PositionToPixel(const GeoPosition& pos,
                                            int levelOfDetail) {
  double latitude = clip(pos.latitude, MinLatitude, MaxLatitude);
  // normalize longitude to [0, 1) across the map width
  double x = (pos.longitude + 180) / 360;
  double sinLatitude = sin(latitude * PI / 180);
  // Mercator y in [0, 1], 0 at the north clipping latitude
  double y = 0.5 - log((1 + sinLatitude) / (1 - sinLatitude)) / (4 * PI);
  double mapSize = MapSize(levelOfDetail);
  // scale to pixels and clamp to the valid pixel range
  double X = floor(clip(x * mapSize + 0.5, 0, mapSize - 1));
  double Y = floor(clip(y * mapSize + 0.5, 0, mapSize - 1));
  return Pixel((unsigned int)X, (unsigned int)Y);
}
+
// Inverse of PositionToPixel: converts pixel XY coordinates at the given
// level of detail back to latitude/longitude (in degrees).
GeoPosition GeoDBImpl::PixelToPosition(const Pixel& pixel, int levelOfDetail) {
  double mapSize = MapSize(levelOfDetail);
  double x = (clip(pixel.x, 0, mapSize - 1) / mapSize) - 0.5;
  double y = 0.5 - (clip(pixel.y, 0, mapSize - 1) / mapSize);
  // inverse Mercator for latitude; longitude is linear in x
  double latitude = 90 - 360 * atan(exp(-y * 2 * PI)) / PI;
  double longitude = 360 * x;
  return GeoPosition(latitude, longitude);
}
+
+// Converts a Pixel to a Tile
+GeoDBImpl::Tile GeoDBImpl::PixelToTile(const Pixel& pixel) {
+  unsigned int tileX = floor(pixel.x / 256);
+  unsigned int tileY = floor(pixel.y / 256);
+  return Tile(tileX, tileY);
+}
+
+GeoDBImpl::Pixel GeoDBImpl::TileToPixel(const Tile& tile) {
+  unsigned int pixelX = tile.x * 256;
+  unsigned int pixelY = tile.y * 256;
+  return Pixel(pixelX, pixelY);
+}
+
+// Convert a Tile to a quadkey
+std::string GeoDBImpl::TileToQuadKey(const Tile& tile, int levelOfDetail) {
+  std::stringstream quadKey;
+  for (int i = levelOfDetail; i > 0; i--) {
+    char digit = '0';
+    int mask = 1 << (i - 1);
+    if ((tile.x & mask) != 0) {
+      digit++;
+    }
+    if ((tile.y & mask) != 0) {
+      digit++;
+      digit++;
+    }
+    quadKey << digit;
+  }
+  return quadKey.str();
+}
+
+//
+// Convert a quadkey to a tile and its level of detail
+//
+void GeoDBImpl::QuadKeyToTile(std::string quadkey, Tile* tile,
+                                     int *levelOfDetail) {
+  tile->x = tile->y = 0;
+  *levelOfDetail = quadkey.size();
+  const char* key = reinterpret_cast<const char *>(quadkey.c_str());
+  for (int i = *levelOfDetail; i > 0; i--) {
+    int mask = 1 << (i - 1);
+    switch (key[*levelOfDetail - i]) {
+      case '0':
+        break;
+
+      case '1':
+        tile->x |= mask;
+        break;
+
+      case '2':
+        tile->y |= mask;
+        break;
+
+      case '3':
+        tile->x |= mask;
+        tile->y |= mask;
+        break;
+
+      default:
+        std::stringstream msg;
+        msg << quadkey;
+        msg << " Invalid QuadKey.";
+        throw std::runtime_error(msg.str());
+    }
+  }
+}
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/utilities/geodb/geodb_impl.h b/utilities/geodb/geodb_impl.h
new file mode 100644 (file)
index 0000000..4ee42ad
--- /dev/null
@@ -0,0 +1,191 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#ifndef ROCKSDB_LITE
+
+#pragma once
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include "utilities/geo_db.h"
+#include "utilities/stackable_db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+// A specific implementation of GeoDB
+
class GeoDBImpl : public GeoDB {
 public:
  // `db` must already be open. The destructor defined in geodb_impl.cc is
  // empty, so `db` is not deleted by this class (ownership presumably
  // stays with the caller or the GeoDB base — confirm).
  GeoDBImpl(DB* db, const GeoDBOptions& options);
  ~GeoDBImpl();

  // Associate the GPS location with the identified by 'id'. The value
  // is a blob that is associated with this object.
  virtual Status Insert(const GeoObject& object);

  // Retrieve the value of the object located at the specified GPS
  // location and is identified by the 'id'.
  virtual Status GetByPosition(const GeoPosition& pos,
                               const Slice& id,
                               std::string* value);

  // Retrieve the value of the object identified by the 'id'. This method
  // could be potentially slower than GetByPosition
  virtual Status GetById(const Slice& id, GeoObject* object);

  // Delete the specified object
  virtual Status Remove(const Slice& id);

  // Returns a list of all items within a circular radius from the
  // specified gps location
  virtual Status SearchRadial(const GeoPosition& pos,
                              double radius,
                              std::vector<GeoObject>* values,
                              int number_of_values);

 private:
  DB* db_;  // underlying database handle (not deleted in ~GeoDBImpl)
  const GeoDBOptions options_;
  const WriteOptions woptions_;  // default write options used for all writes
  const ReadOptions roptions_;   // default read options used for all reads

  // The value of PI
  static constexpr double PI = 3.141592653589793;

  // convert degrees to radians
  static double radians(double x);

  // convert radians to degrees
  static double degrees(double x);

  // A pixel class that captures X and Y coordinates
  // (global pixel coordinates at a fixed level of detail)
  class Pixel {
   public:
    unsigned int x;
    unsigned int y;
    Pixel(unsigned int a, unsigned int b) :
     x(a), y(b) {
    }
  };

  // A Tile in the geoid (a 256x256-pixel square of the map)
  class Tile {
   public:
    unsigned int x;
    unsigned int y;
    Tile(unsigned int a, unsigned int b) :
     x(a), y(b) {
    }
  };

  // convert a gps location to quad coordinate
  static std::string PositionToQuad(const GeoPosition& pos, int levelOfDetail);

  // arbitrary constant use for WGS84 via
  // http://en.wikipedia.org/wiki/World_Geodetic_System
  // http://mathforum.org/library/drmath/view/51832.html
  // http://msdn.microsoft.com/en-us/library/bb259689.aspx
  // http://www.tuicool.com/articles/NBrE73
  //
  // Detail is the maximum zoom level used when storing quadkeys.
  const int Detail = 23;
  static constexpr double EarthRadius = 6378137;
  static constexpr double MinLatitude = -85.05112878;
  static constexpr double MaxLatitude = 85.05112878;
  static constexpr double MinLongitude = -180;
  static constexpr double MaxLongitude = 180;

  // clips a number to the specified minimum and maximum values.
  static double clip(double n, double minValue, double maxValue) {
    return fmin(fmax(n, minValue), maxValue);
  }

  // Determines the map width and height (in pixels) at a specified level
  // of detail, from 1 (lowest detail) to 23 (highest detail).
  // Returns the map width and height in pixels.
  static unsigned int MapSize(int levelOfDetail) {
    return (unsigned int)(256 << levelOfDetail);
  }

  // Determines the ground resolution (in meters per pixel) at a specified
  // latitude and level of detail.
  // Latitude (in degrees) at which to measure the ground resolution.
  // Level of detail, from 1 (lowest detail) to 23 (highest detail).
  // Returns the ground resolution, in meters per pixel.
  static double GroundResolution(double latitude, int levelOfDetail);

  // Converts a point from latitude/longitude WGS-84 coordinates (in degrees)
  // into pixel XY coordinates at a specified level of detail.
  static Pixel PositionToPixel(const GeoPosition& pos, int levelOfDetail);

  // Inverse of PositionToPixel.
  static GeoPosition PixelToPosition(const Pixel& pixel, int levelOfDetail);

  // Converts a Pixel to a Tile
  static Tile PixelToTile(const Pixel& pixel);

  // Returns the top-left pixel of a tile.
  static Pixel TileToPixel(const Tile& tile);

  // Convert a Tile to a quadkey
  static std::string TileToQuadKey(const Tile& tile, int levelOfDetail);

  // Convert a quadkey to a tile and its level of detail
  static void QuadKeyToTile(std::string quadkey, Tile* tile,
                            int *levelOfDetail);

  // Return the distance between two positions on the earth
  static double distance(double lat1, double lon1,
                         double lat2, double lon2);
  // Displace (lat, lon) by deltay meters north and deltax meters east.
  static GeoPosition displaceLatLon(double lat, double lon,
                                    double deltay, double deltax);

  //
  // Returns the top left position after applying the delta to
  // the specified position
  //
  static GeoPosition boundingTopLeft(const GeoPosition& in, double radius) {
    return displaceLatLon(in.latitude, in.longitude, -radius, -radius);
  }

  //
  // Returns the bottom right position after applying the delta to
  // the specified position
  static GeoPosition boundingBottomRight(const GeoPosition& in,
                                         double radius) {
    return displaceLatLon(in.latitude, in.longitude, radius, radius);
  }

  //
  // Get all quadkeys within a radius of a specified position
  //
  Status searchQuadIds(const GeoPosition& position,
                       double radius,
                       std::vector<std::string>* quadKeys);

  // splits a string into its components
  static void StringSplit(std::vector<std::string>* tokens,
                          const std::string &text,
                          char sep);

  //
  // Create keys for accessing rocksdb table(s)
  //
  static std::string MakeKey1(const GeoPosition& pos,
                              Slice id,
                              std::string quadkey);
  static std::string MakeKey2(Slice id);
  static std::string MakeKey1Prefix(std::string quadkey,
                                    Slice id);
  static std::string MakeQuadKeyPrefix(std::string quadkey);
};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/utilities/geodb/geodb_test.cc b/utilities/geodb/geodb_test.cc
new file mode 100644 (file)
index 0000000..1a42e32
--- /dev/null
@@ -0,0 +1,123 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+//
+#include "utilities/geodb/geodb_impl.h"
+
+#include <cctype>
+#include "util/testharness.h"
+
+namespace rocksdb {
+
// Test fixture: opens a fresh DB under kDefaultDbName and layers a
// GeoDBImpl on top of it for every test.
class GeoDBTest {
 public:
  static const std::string kDefaultDbName;
  static Options options;
  DB* db;        // raw rocksdb handle opened in the constructor
  GeoDB* geodb;  // GeoDB layered on top of `db`

  GeoDBTest() {
    GeoDBOptions geodb_options;
    // start every test from an empty database
    ASSERT_OK(DestroyDB(kDefaultDbName, options));
    options.create_if_missing = true;
    Status status = DB::Open(options, kDefaultDbName, &db);
    geodb =  new GeoDBImpl(db, geodb_options);
  }

  ~GeoDBTest() {
    // NOTE(review): only `geodb` is deleted here; whether `db` is released
    // as part of that depends on GeoDB/StackableDB ownership — confirm,
    // otherwise `db` leaks between tests.
    delete geodb;
  }

  GeoDB* getdb() {
    return geodb;
  }
};
+
// Definitions for the static members declared in GeoDBTest.
const std::string GeoDBTest::kDefaultDbName = "/tmp/geodefault";
Options GeoDBTest::options = Options();
+
// Insert, Get and Remove
// Exercises the end-to-end CRUD path: Insert two objects, read them back
// by position and by id, remove one, and verify the other is untouched.
TEST(GeoDBTest, SimpleTest) {
  GeoPosition pos1(100, 101);
  std::string id1("id1");
  std::string value1("value1");

  // insert first object into database
  GeoObject obj1(pos1, id1, value1);
  Status status = getdb()->Insert(obj1);
  ASSERT_TRUE(status.ok());

  // insert second object into database
  GeoPosition pos2(200, 201);
  std::string id2("id2");
  std::string value2 = "value2";
  GeoObject obj2(pos2, id2, value2);
  status = getdb()->Insert(obj2);
  ASSERT_TRUE(status.ok());

  // retrieve first object using position
  std::string value;
  status = getdb()->GetByPosition(pos1, Slice(id1), &value);
  ASSERT_TRUE(status.ok());
  ASSERT_EQ(value, value1);

  // retrieve first object using id
  GeoObject obj;
  status = getdb()->GetById(Slice(id1), &obj);
  ASSERT_TRUE(status.ok());
  ASSERT_EQ(obj.position.latitude, 100);
  ASSERT_EQ(obj.position.longitude, 101);
  ASSERT_EQ(obj.id.compare(id1), 0);
  ASSERT_EQ(obj.value, value1);

  // delete first object; both lookup paths must now miss
  status = getdb()->Remove(Slice(id1));
  ASSERT_TRUE(status.ok());
  status = getdb()->GetByPosition(pos1, Slice(id1), &value);
  ASSERT_TRUE(status.IsNotFound());
  status = getdb()->GetById(id1, &obj);
  ASSERT_TRUE(status.IsNotFound());

  // check that we can still find second object
  status = getdb()->GetByPosition(pos2, id2, &value);
  ASSERT_TRUE(status.ok());
  ASSERT_EQ(value, value2);
  status = getdb()->GetById(id2, &obj);
  ASSERT_TRUE(status.ok());
}
+
// Search.
// Verify distances via http://www.stevemorse.org/nearest/distance.php
// NOTE(review): SearchRadial is called with three arguments here while
// GeoDBImpl::SearchRadial takes four — the default for number_of_values
// presumably comes from the GeoDB base declaration; confirm.
TEST(GeoDBTest, Search) {
  GeoPosition pos1(45, 45);
  std::string id1("mid1");
  std::string value1 = "midvalue1";

  // insert object at 45 degree latitude
  GeoObject obj1(pos1, id1, value1);
  Status status = getdb()->Insert(obj1);
  ASSERT_TRUE(status.ok());

  // search all objects centered at 46 degree latitude with
  // a radius of 200 kilometers. We should find the one object that
  // we inserted earlier.
  std::vector<GeoObject> values;
  status = getdb()->SearchRadial(GeoPosition(46, 46), 200000, &values);
  ASSERT_TRUE(status.ok());
  ASSERT_EQ(values.size(), 1U);

  // search all objects centered at 46 degree latitude with
  // a radius of 2 kilometers. There should be none.
  values.clear();
  status = getdb()->SearchRadial(GeoPosition(46, 46), 2, &values);
  ASSERT_TRUE(status.ok());
  ASSERT_EQ(values.size(), 0U);
}
+
+}  // namespace rocksdb
+
// Entry point: runs every TEST registered above via the rocksdb harness.
int main(int argc, char* argv[]) {
  return rocksdb::test::RunAllTests();
}
diff --git a/utilities/merge_operators.h b/utilities/merge_operators.h
new file mode 100644 (file)
index 0000000..fdf0664
--- /dev/null
@@ -0,0 +1,45 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef MERGE_OPERATORS_H
+#define MERGE_OPERATORS_H
+
+#include <memory>
+#include <stdio.h>
+
+#include "rocksdb/merge_operator.h"
+
+namespace rocksdb {
+
+class MergeOperators {
+ public:
+  static std::shared_ptr<MergeOperator> CreatePutOperator();
+  static std::shared_ptr<MergeOperator> CreateUInt64AddOperator();
+  static std::shared_ptr<MergeOperator> CreateStringAppendOperator();
+  static std::shared_ptr<MergeOperator> CreateStringAppendTESTOperator();
+
+  // Will return a different merge operator depending on the string.
+  // TODO: Hook the "name" up to the actual Name() of the MergeOperators?
+  static std::shared_ptr<MergeOperator> CreateFromStringId(
+      const std::string& name) {
+    if (name == "put") {
+      return CreatePutOperator();
+    } else if ( name == "uint64add") {
+      return CreateUInt64AddOperator();
+    } else if (name == "stringappend") {
+      return CreateStringAppendOperator();
+    } else if (name == "stringappendtest") {
+      return CreateStringAppendTESTOperator();
+    } else {
+      // Empty or unknown, just return nullptr
+      return nullptr;
+    }
+  }
+
+};
+
+} // namespace rocksdb
+
+#endif
diff --git a/utilities/merge_operators/put.cc b/utilities/merge_operators/put.cc
new file mode 100644 (file)
index 0000000..3330843
--- /dev/null
@@ -0,0 +1,68 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <memory>
+#include "rocksdb/slice.h"
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+
+using namespace rocksdb;
+
+namespace { // anonymous namespace
+
+// A merge operator that mimics Put semantics
+// Since this merge-operator will not be used in production,
+// it is implemented as a non-associative merge operator to illustrate the
+// new interface and for testing purposes. (That is, we inherit from
+// the MergeOperator class rather than the AssociativeMergeOperator
+// which would be simpler in this case).
+//
+// From the client-perspective, semantics are the same.
+class PutOperator : public MergeOperator {
+ public:
+  virtual bool FullMerge(const Slice& key,
+                         const Slice* existing_value,
+                         const std::deque<std::string>& operand_sequence,
+                         std::string* new_value,
+                         Logger* logger) const override {
+    // Put basically only looks at the current/latest value
+    assert(!operand_sequence.empty());
+    assert(new_value != nullptr);
+    new_value->assign(operand_sequence.back());
+    return true;
+  }
+
+  virtual bool PartialMerge(const Slice& key,
+                            const Slice& left_operand,
+                            const Slice& right_operand,
+                            std::string* new_value,
+                            Logger* logger) const override {
+    new_value->assign(right_operand.data(), right_operand.size());
+    return true;
+  }
+
+  using MergeOperator::PartialMergeMulti;
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value, Logger* logger) const
+      override {
+    new_value->assign(operand_list.back().data(), operand_list.back().size());
+    return true;
+  }
+
+  virtual const char* Name() const override {
+    return "PutOperator";
+  }
+};
+
+} // end of anonymous namespace
+
+namespace rocksdb {
+
+std::shared_ptr<MergeOperator> MergeOperators::CreatePutOperator() {
+  return std::make_shared<PutOperator>();
+}
+
+}
diff --git a/utilities/merge_operators/string_append/stringappend.cc b/utilities/merge_operators/string_append/stringappend.cc
new file mode 100644 (file)
index 0000000..38cd22e
--- /dev/null
@@ -0,0 +1,60 @@
+/**
+ * A MergeOperator for rocksdb that implements string append.
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#include "stringappend.h"
+
+#include <memory>
+#include <assert.h>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+
+namespace rocksdb {
+
// Constructor: also specify the delimiter character.
// The delimiter is stored and later written between each pair of
// appended values by Merge().
StringAppendOperator::StringAppendOperator(char delim_char)
    : delim_(delim_char) {
}
+
+// Implementation for the merge operation (concatenates two strings)
+bool StringAppendOperator::Merge(const Slice& key,
+                                 const Slice* existing_value,
+                                 const Slice& value,
+                                 std::string* new_value,
+                                 Logger* logger) const {
+
+  // Clear the *new_value for writing.
+  assert(new_value);
+  new_value->clear();
+
+  if (!existing_value) {
+    // No existing_value. Set *new_value = value
+    new_value->assign(value.data(),value.size());
+  } else {
+    // Generic append (existing_value != null).
+    // Reserve *new_value to correct size, and apply concatenation.
+    new_value->reserve(existing_value->size() + 1 + value.size());
+    new_value->assign(existing_value->data(),existing_value->size());
+    new_value->append(1,delim_);
+    new_value->append(value.data(), value.size());
+  }
+
+  return true;
+}
+
// Identifier reported to rocksdb for this merge operator.
const char* StringAppendOperator::Name() const  {
  return "StringAppendOperator";
}
+
// Factory: builds a StringAppendOperator using ',' as the delimiter.
std::shared_ptr<MergeOperator> MergeOperators::CreateStringAppendOperator() {
  return std::make_shared<StringAppendOperator>(',');
}
+
+} // namespace rocksdb
+
+
+
diff --git a/utilities/merge_operators/string_append/stringappend.h b/utilities/merge_operators/string_append/stringappend.h
new file mode 100644 (file)
index 0000000..ca5b97e
--- /dev/null
@@ -0,0 +1,31 @@
+/**
+ * A MergeOperator for rocksdb that implements string append.
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#pragma once
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+class StringAppendOperator : public AssociativeMergeOperator {
+ public:
+  StringAppendOperator(char delim_char);    /// Constructor: specify delimiter
+
+  virtual bool Merge(const Slice& key,
+                     const Slice* existing_value,
+                     const Slice& value,
+                     std::string* new_value,
+                     Logger* logger) const override;
+
+  virtual const char* Name() const override;
+
+ private:
+  char delim_;         // The delimiter is inserted between elements
+
+};
+
+} // namespace rocksdb
+
diff --git a/utilities/merge_operators/string_append/stringappend2.cc b/utilities/merge_operators/string_append/stringappend2.cc
new file mode 100644 (file)
index 0000000..b2e0358
--- /dev/null
@@ -0,0 +1,113 @@
+/**
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#include "stringappend2.h"
+
+#include <memory>
+#include <string>
+#include <assert.h>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+
+namespace rocksdb {
+
// Constructor: also specify the delimiter character.
// The delimiter is stored and later written between concatenated values
// by FullMerge / _AssocPartialMergeMulti.
StringAppendTESTOperator::StringAppendTESTOperator(char delim_char)
    : delim_(delim_char) {
}
+
+// Implementation for the merge operation (concatenates two strings)
+bool StringAppendTESTOperator::FullMerge(
+    const Slice& key,
+    const Slice* existing_value,
+    const std::deque<std::string>& operands,
+    std::string* new_value,
+    Logger* logger) const {
+
+  // Clear the *new_value for writing.
+  assert(new_value);
+  new_value->clear();
+
+  // Compute the space needed for the final result.
+  int numBytes = 0;
+  for(auto it = operands.begin(); it != operands.end(); ++it) {
+    numBytes += it->size() + 1;   // Plus 1 for the delimiter
+  }
+
+  // Only print the delimiter after the first entry has been printed
+  bool printDelim = false;
+
+  // Prepend the *existing_value if one exists.
+  if (existing_value) {
+    new_value->reserve(numBytes + existing_value->size());
+    new_value->append(existing_value->data(), existing_value->size());
+    printDelim = true;
+  } else if (numBytes) {
+    new_value->reserve(numBytes-1); // Minus 1 since we have one less delimiter
+  }
+
+  // Concatenate the sequence of strings (and add a delimiter between each)
+  for(auto it = operands.begin(); it != operands.end(); ++it) {
+    if (printDelim) {
+      new_value->append(1,delim_);
+    }
+    new_value->append(*it);
+    printDelim = true;
+  }
+
+  return true;
+}
+
// Partial merging is deliberately declined: returning false presumably
// makes rocksdb keep the individual operands and call FullMerge later --
// the non-associative code path this TEST operator exists to exercise
// (see _AssocPartialMergeMulti below for the real implementation).
bool StringAppendTESTOperator::PartialMergeMulti(
    const Slice& key, const std::deque<Slice>& operand_list,
    std::string* new_value, Logger* logger) const {
  return false;
}
+
+// A version of PartialMerge that actually performs "partial merging".
+// Use this to simulate the exact behaviour of the StringAppendOperator.
+bool StringAppendTESTOperator::_AssocPartialMergeMulti(
+    const Slice& key, const std::deque<Slice>& operand_list,
+    std::string* new_value, Logger* logger) const {
+  // Clear the *new_value for writing
+  assert(new_value);
+  new_value->clear();
+  assert(operand_list.size() >= 2);
+
+  // Generic append
+  // Determine and reserve correct size for *new_value.
+  size_t size = 0;
+  for (const auto& operand : operand_list) {
+    size += operand.size();
+  }
+  size += operand_list.size() - 1;  // Delimiters
+  new_value->reserve(size);
+
+  // Apply concatenation
+  new_value->assign(operand_list.front().data(), operand_list.front().size());
+
+  for (std::deque<Slice>::const_iterator it = operand_list.begin() + 1;
+       it != operand_list.end(); ++it) {
+    new_value->append(1, delim_);
+    new_value->append(it->data(), it->size());
+  }
+
+  return true;
+}
+
// Identifier reported to rocksdb for this merge operator.
const char* StringAppendTESTOperator::Name() const  {
  return "StringAppendTESTOperator";
}
+
+
// Factory: builds a StringAppendTESTOperator with ',' as the delimiter.
std::shared_ptr<MergeOperator>
MergeOperators::CreateStringAppendTESTOperator() {
  return std::make_shared<StringAppendTESTOperator>(',');
}
+
+} // namespace rocksdb
+
diff --git a/utilities/merge_operators/string_append/stringappend2.h b/utilities/merge_operators/string_append/stringappend2.h
new file mode 100644 (file)
index 0000000..5e506ef
--- /dev/null
@@ -0,0 +1,51 @@
+/**
+ * A TEST MergeOperator for rocksdb that implements string append.
+ * It is built using the MergeOperator interface rather than the simpler
+ * AssociativeMergeOperator interface. This is useful for testing/benchmarking.
+ * While the two operators are semantically the same, all production code
+ * should use the StringAppendOperator defined in stringappend.{h,cc}. The
+ * operator defined in the present file is primarily for testing.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#pragma once
+#include <deque>
+#include <string>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
// String-append operator built on the full (non-associative) MergeOperator
// interface, for tests and benchmarks; see the file header comment.
class StringAppendTESTOperator : public MergeOperator {
 public:
  // Constructor with delimiter
  explicit StringAppendTESTOperator(char delim_char);

  // Concatenates the operand sequence onto the optional existing value,
  // delimiter-separated.
  virtual bool FullMerge(const Slice& key,
                         const Slice* existing_value,
                         const std::deque<std::string>& operand_sequence,
                         std::string* new_value,
                         Logger* logger) const override;

  // Always returns false in this TEST operator (see stringappend2.cc).
  virtual bool PartialMergeMulti(const Slice& key,
                                 const std::deque<Slice>& operand_list,
                                 std::string* new_value, Logger* logger) const
      override;

  virtual const char* Name() const override;

 private:
  // A version of PartialMerge that actually performs "partial merging".
  // Use this to simulate the exact behaviour of the StringAppendOperator.
  bool _AssocPartialMergeMulti(const Slice& key,
                               const std::deque<Slice>& operand_list,
                               std::string* new_value, Logger* logger) const;

  char delim_;         // The delimiter is inserted between elements

};
+
+} // namespace rocksdb
diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc
new file mode 100644 (file)
index 0000000..a68186a
--- /dev/null
@@ -0,0 +1,595 @@
+/**
+ * A persistent map : key -> (list of strings), using rocksdb merge.
+ * This file is a test-harness / use-case for the StringAppendOperator.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook, Inc.
+*/
+
+#include <iostream>
+#include <map>
+
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+#include "utilities/db_ttl.h"
+#include "util/testharness.h"
+#include "util/random.h"
+
+using namespace rocksdb;
+
+namespace rocksdb {
+
+// Path to the database on file system
+const std::string kDbName = "/tmp/mergetestdb";
+
+namespace {
+// OpenDb opens a (possibly new) rocksdb database with a StringAppendOperator
+std::shared_ptr<DB> OpenNormalDb(char delim_char) {
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator.reset(new StringAppendOperator(delim_char));
+  ASSERT_OK(DB::Open(options, kDbName,  &db));
+  return std::shared_ptr<DB>(db);
+}
+
+// Open a TtlDB with a non-associative StringAppendTESTOperator
+std::shared_ptr<DB> OpenTtlDb(char delim_char) {
+  DBWithTTL* db;
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator.reset(new StringAppendTESTOperator(delim_char));
+  ASSERT_OK(DBWithTTL::Open(options, kDbName, &db, 123456));
+  return std::shared_ptr<DB>(db);
+}
+}  // namespace
+
+/// StringLists represents a set of string-lists, each with a key-index.
+/// Supports Append(list, string) and Get(list)
+class StringLists {
+ public:
+
+  //Constructor: specifies the rocksdb db
+  /* implicit */
+  StringLists(std::shared_ptr<DB> db)
+      : db_(db),
+        merge_option_(),
+        get_option_() {
+    assert(db);
+  }
+
+  // Append string val onto the list defined by key; return true on success
+  bool Append(const std::string& key, const std::string& val){
+    Slice valSlice(val.data(), val.size());
+    auto s = db_->Merge(merge_option_, key, valSlice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << "ERROR " << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // Returns the list of strings associated with key (or "" if does not exist)
+  bool Get(const std::string& key, std::string* const result){
+    assert(result != nullptr); // we should have a place to store the result
+    auto s = db_->Get(get_option_, key, result);
+
+    if (s.ok()) {
+      return true;
+    }
+
+    // Either key does not exist, or there is some error.
+    *result = "";       // Always return empty string (just for convention)
+
+    //NotFound is okay; just return empty (similar to std::map)
+    //But network or db errors, etc, should fail the test (or at least yell)
+    if (!s.IsNotFound()) {
+      std::cerr << "ERROR " << s.ToString() << std::endl;
+    }
+
+    // Always return false if s.ok() was not true
+    return false;
+  }
+
+
+ private:
+  std::shared_ptr<DB> db_;
+  WriteOptions merge_option_;
+  ReadOptions get_option_;
+
+};
+
+
// The class for unit-testing
// Each test starts from a destroyed (fresh) database; the DB-opening
// strategy is injected via SetOpenDbFunction so the same suite can run
// against a plain DB or a TtlDB (see main()).
class StringAppendOperatorTest {
 public:
  StringAppendOperatorTest() {
    DestroyDB(kDbName, Options());    // Start each test with a fresh DB
  }

  typedef std::shared_ptr<DB> (* OpenFuncPtr)(char);

  // Allows user to open databases with different configurations.
  // e.g.: Can open a DB or a TtlDB, etc.
  static void SetOpenDbFunction(OpenFuncPtr func) {
    OpenDb = func;
  }

 protected:
  static OpenFuncPtr OpenDb;
};
// Out-of-class definition of the static member; main() must install a
// real open function before any test runs.
StringAppendOperatorTest::OpenFuncPtr StringAppendOperatorTest::OpenDb = nullptr;
+
+// THE TEST CASES BEGIN HERE
+
// Reads merged lists through a raw DB iterator and checks that an
// iterator created before later appends does not observe them.
TEST(StringAppendOperatorTest, IteratorTest) {
  auto db_ = OpenDb(',');
  StringLists slists(db_);

  slists.Append("k1", "v1");
  slists.Append("k1", "v2");
  slists.Append("k1", "v3");

  slists.Append("k2", "a1");
  slists.Append("k2", "a2");
  slists.Append("k2", "a3");

  std::string res;
  std::unique_ptr<rocksdb::Iterator> it(db_->NewIterator(ReadOptions()));
  std::string k1("k1");
  std::string k2("k2");
  bool first = true;
  // Seeking to k1 should yield the fully merged k1 list first, then k2's.
  for (it->Seek(k1); it->Valid(); it->Next()) {
    res = it->value().ToString();
    if (first) {
      ASSERT_EQ(res, "v1,v2,v3");
      first = false;
    } else {
      ASSERT_EQ(res, "a1,a2,a3");
    }
  }
  slists.Append("k2", "a4");
  slists.Append("k1", "v4");

  // Snapshot should still be the same. Should ignore a4 and v4.
  first = true;
  for (it->Seek(k1); it->Valid(); it->Next()) {
    res = it->value().ToString();
    if (first) {
      ASSERT_EQ(res, "v1,v2,v3");
      first = false;
    } else {
      ASSERT_EQ(res, "a1,a2,a3");
    }
  }


  // Should release the snapshot and be aware of the new stuff now
  it.reset(db_->NewIterator(ReadOptions()));
  first = true;
  for (it->Seek(k1); it->Valid(); it->Next()) {
    res = it->value().ToString();
    if (first) {
      ASSERT_EQ(res, "v1,v2,v3,v4");
      first = false;
    } else {
      ASSERT_EQ(res, "a1,a2,a3,a4");
    }
  }

  // start from k2 this time.
  // NOTE(review): `first` is still false from the loop above, so only the
  // else-branch ("a1,a2,a3,a4") is expected to fire here.
  for (it->Seek(k2); it->Valid(); it->Next()) {
    res = it->value().ToString();
    if (first) {
      ASSERT_EQ(res, "v1,v2,v3,v4");
      first = false;
    } else {
      ASSERT_EQ(res, "a1,a2,a3,a4");
    }
  }

  slists.Append("k3", "g1");

  it.reset(db_->NewIterator(ReadOptions()));
  first = true;
  std::string k3("k3");
  for(it->Seek(k2); it->Valid(); it->Next()) {
    res = it->value().ToString();
    if (first) {
      ASSERT_EQ(res, "a1,a2,a3,a4");
      first = false;
    } else {
      ASSERT_EQ(res, "g1");
    }
  }
  for(it->Seek(k3); it->Valid(); it->Next()) {
    res = it->value().ToString();
    if (first) {
      // should not be hit
      ASSERT_EQ(res, "a1,a2,a3,a4");
      first = false;
    } else {
      ASSERT_EQ(res, "g1");
    }
  }

}
+
+TEST(StringAppendOperatorTest, SimpleTest) {
+  auto db = OpenDb(',');
+  StringLists slists(db);
+
+  slists.Append("k1", "v1");
+  slists.Append("k1", "v2");
+  slists.Append("k1", "v3");
+
+  std::string res;
+  bool status = slists.Get("k1", &res);
+
+  ASSERT_TRUE(status);
+  ASSERT_EQ(res, "v1,v2,v3");
+}
+
+TEST(StringAppendOperatorTest, SimpleDelimiterTest) {
+  auto db = OpenDb('|');
+  StringLists slists(db);
+
+  slists.Append("k1", "v1");
+  slists.Append("k1", "v2");
+  slists.Append("k1", "v3");
+
+  std::string res;
+  slists.Get("k1", &res);
+  ASSERT_EQ(res, "v1|v2|v3");
+}
+
+TEST(StringAppendOperatorTest, OneValueNoDelimiterTest) {
+  auto db = OpenDb('!');
+  StringLists slists(db);
+
+  slists.Append("random_key", "single_val");
+
+  std::string res;
+  slists.Get("random_key", &res);
+  ASSERT_EQ(res, "single_val");
+}
+
+TEST(StringAppendOperatorTest, VariousKeys) {
+  auto db = OpenDb('\n');
+  StringLists slists(db);
+
+  slists.Append("c", "asdasd");
+  slists.Append("a", "x");
+  slists.Append("b", "y");
+  slists.Append("a", "t");
+  slists.Append("a", "r");
+  slists.Append("b", "2");
+  slists.Append("c", "asdasd");
+
+  std::string a, b, c;
+  bool sa, sb, sc;
+  sa = slists.Get("a", &a);
+  sb = slists.Get("b", &b);
+  sc = slists.Get("c", &c);
+
+  ASSERT_TRUE(sa && sb && sc); // All three keys should have been found
+
+  ASSERT_EQ(a, "x\nt\nr");
+  ASSERT_EQ(b, "y\n2");
+  ASSERT_EQ(c, "asdasd\nasdasd");
+}
+
// Generate semi random keys/words from a small distribution.
// Cross-checks the DB against an in-memory std::map mirror.
TEST(StringAppendOperatorTest, RandomMixGetAppend) {
  auto db = OpenDb(' ');
  StringLists slists(db);

  // Generate a list of random keys and values
  const int kWordCount = 15;
  std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf", "342839",
                         "dsuha", "mabuais", "sadajsid", "jf9834hf", "2d9j89",
                         "dj9823jd", "a", "dk02ed2dh", "$(jd4h984$(*", "mabz"};
  const int kKeyCount = 6;
  std::string keys[] = {"dhaiusdhu", "denidw", "daisda", "keykey", "muki",
                        "shzassdianmd"};

  // Will store a local copy of all data in order to verify correctness
  std::map<std::string, std::string> parallel_copy;

  // Generate a bunch of random queries (Append and Get)!
  enum query_t  { APPEND_OP, GET_OP, NUM_OPS };
  Random randomGen(1337);       //deterministic seed; always get same results!

  const int kNumQueries = 30;
  // NOTE: each iteration draws exactly three random numbers in this order;
  // reordering the Uniform() calls would change the deterministic stream.
  for (int q=0; q<kNumQueries; ++q) {
    // Generate a random query (Append or Get) and random parameters
    query_t query = (query_t)randomGen.Uniform((int)NUM_OPS);
    std::string key = keys[randomGen.Uniform((int)kKeyCount)];
    std::string word = words[randomGen.Uniform((int)kWordCount)];

    // Apply the query and any checks.
    if (query == APPEND_OP) {

      // Apply the rocksdb test-harness Append defined above
      slists.Append(key, word);  //apply the rocksdb append

      // Apply the similar "Append" to the parallel copy
      if (parallel_copy[key].size() > 0) {
        parallel_copy[key] += " " + word;
      } else {
        parallel_copy[key] = word;
      }

    } else if (query == GET_OP) {
      // Assumes that a non-existent key just returns <empty>
      std::string res;
      slists.Get(key, &res);
      ASSERT_EQ(res, parallel_copy[key]);
    }

  }

}
+
// Same scheme as RandomMixGetAppend, with a different seed and enough
// queries (1000) to push data through flushes/compactions.
TEST(StringAppendOperatorTest, BIGRandomMixGetAppend) {
  auto db = OpenDb(' ');
  StringLists slists(db);

  // Generate a list of random keys and values
  const int kWordCount = 15;
  std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf", "342839",
                         "dsuha", "mabuais", "sadajsid", "jf9834hf", "2d9j89",
                         "dj9823jd", "a", "dk02ed2dh", "$(jd4h984$(*", "mabz"};
  const int kKeyCount = 6;
  std::string keys[] = {"dhaiusdhu", "denidw", "daisda", "keykey", "muki",
                        "shzassdianmd"};

  // Will store a local copy of all data in order to verify correctness
  std::map<std::string, std::string> parallel_copy;

  // Generate a bunch of random queries (Append and Get)!
  enum query_t  { APPEND_OP, GET_OP, NUM_OPS };
  Random randomGen(9138204);       // deterministic seed

  const int kNumQueries = 1000;
  // NOTE: each iteration draws exactly three random numbers in this order;
  // reordering the Uniform() calls would change the deterministic stream.
  for (int q=0; q<kNumQueries; ++q) {
    // Generate a random query (Append or Get) and random parameters
    query_t query = (query_t)randomGen.Uniform((int)NUM_OPS);
    std::string key = keys[randomGen.Uniform((int)kKeyCount)];
    std::string word = words[randomGen.Uniform((int)kWordCount)];

    //Apply the query and any checks.
    if (query == APPEND_OP) {

      // Apply the rocksdb test-harness Append defined above
      slists.Append(key, word);  //apply the rocksdb append

      // Apply the similar "Append" to the parallel copy
      if (parallel_copy[key].size() > 0) {
        parallel_copy[key] += " " + word;
      } else {
        parallel_copy[key] = word;
      }

    } else if (query == GET_OP) {
      // Assumes that a non-existent key just returns <empty>
      std::string res;
      slists.Get(key, &res);
      ASSERT_EQ(res, parallel_copy[key]);
    }

  }

}
+
+
// Verifies that merged lists survive closing and reopening the DB; the
// three scopes each reopen the same database and build on prior state.
TEST(StringAppendOperatorTest, PersistentVariousKeys) {
  // Perform the following operations in limited scope
  {
    auto db = OpenDb('\n');
    StringLists slists(db);

    slists.Append("c", "asdasd");
    slists.Append("a", "x");
    slists.Append("b", "y");
    slists.Append("a", "t");
    slists.Append("a", "r");
    slists.Append("b", "2");
    slists.Append("c", "asdasd");

    std::string a, b, c;
    slists.Get("a", &a);
    slists.Get("b", &b);
    slists.Get("c", &c);

    ASSERT_EQ(a, "x\nt\nr");
    ASSERT_EQ(b, "y\n2");
    ASSERT_EQ(c, "asdasd\nasdasd");
  }

  // Reopen the database (the previous changes should persist / be remembered)
  {
    auto db = OpenDb('\n');
    StringLists slists(db);

    slists.Append("c", "bbnagnagsx");
    slists.Append("a", "sa");
    slists.Append("b", "df");
    slists.Append("a", "gh");
    slists.Append("a", "jk");
    slists.Append("b", "l;");
    slists.Append("c", "rogosh");

    // The previous changes should be on disk (L0)
    // The most recent changes should be in memory (MemTable)
    // Hence, this will test both Get() paths.
    std::string a, b, c;
    slists.Get("a", &a);
    slists.Get("b", &b);
    slists.Get("c", &c);

    ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
    ASSERT_EQ(b, "y\n2\ndf\nl;");
    ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");
  }

  // Reopen the database (the previous changes should persist / be remembered)
  {
    auto db = OpenDb('\n');
    StringLists slists(db);

    // All changes should be on disk. This will test VersionSet Get()
    std::string a, b, c;
    slists.Get("a", &a);
    slists.Get("b", &b);
    slists.Get("c", &c);

    ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
    ASSERT_EQ(b, "y\n2\ndf\nl;");
    ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");
  }
}
+
// Exercises merge results across explicit Flush() and CompactRange()
// calls, and across a DB reopen; each assertion depends on the exact
// sequence of preceding appends.
TEST(StringAppendOperatorTest, PersistentFlushAndCompaction) {
  // Perform the following operations in limited scope
  {
    auto db = OpenDb('\n');
    StringLists slists(db);
    std::string a, b, c;
    bool success;

    // Append, Flush, Get
    slists.Append("c", "asdasd");
    db->Flush(rocksdb::FlushOptions());
    success = slists.Get("c", &c);
    ASSERT_TRUE(success);
    ASSERT_EQ(c, "asdasd");

    // Append, Flush, Append, Get
    slists.Append("a", "x");
    slists.Append("b", "y");
    db->Flush(rocksdb::FlushOptions());
    slists.Append("a", "t");
    slists.Append("a", "r");
    slists.Append("b", "2");

    success = slists.Get("a", &a);
    assert(success == true);
    ASSERT_EQ(a, "x\nt\nr");

    success = slists.Get("b", &b);
    assert(success == true);
    ASSERT_EQ(b, "y\n2");

    // Append, Get
    success = slists.Append("c", "asdasd");
    assert(success);
    success = slists.Append("b", "monkey");
    assert(success);

    // I omit the "assert(success)" checks here.
    slists.Get("a", &a);
    slists.Get("b", &b);
    slists.Get("c", &c);

    ASSERT_EQ(a, "x\nt\nr");
    ASSERT_EQ(b, "y\n2\nmonkey");
    ASSERT_EQ(c, "asdasd\nasdasd");
  }

  // Reopen the database (the previous changes should persist / be remembered)
  {
    auto db = OpenDb('\n');
    StringLists slists(db);
    std::string a, b, c;

    // Get (Quick check for persistence of previous database)
    slists.Get("a", &a);
    ASSERT_EQ(a, "x\nt\nr");

    //Append, Compact, Get
    slists.Append("c", "bbnagnagsx");
    slists.Append("a", "sa");
    slists.Append("b", "df");
    db->CompactRange(nullptr, nullptr);
    slists.Get("a", &a);
    slists.Get("b", &b);
    slists.Get("c", &c);
    ASSERT_EQ(a, "x\nt\nr\nsa");
    ASSERT_EQ(b, "y\n2\nmonkey\ndf");
    ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx");

    // Append, Get
    slists.Append("a", "gh");
    slists.Append("a", "jk");
    slists.Append("b", "l;");
    slists.Append("c", "rogosh");
    slists.Get("a", &a);
    slists.Get("b", &b);
    slists.Get("c", &c);
    ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
    ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;");
    ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");

    // Compact, Get
    // NOTE(review): a/b/c are not re-fetched after this compaction, so
    // these three asserts re-check the strings already in hand.
    db->CompactRange(nullptr, nullptr);
    ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
    ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;");
    ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");

    // Append, Flush, Compact, Get
    slists.Append("b", "afcg");
    db->Flush(rocksdb::FlushOptions());
    db->CompactRange(nullptr, nullptr);
    slists.Get("b", &b);
    ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;\nafcg");
  }
}
+
+TEST(StringAppendOperatorTest, SimpleTestNullDelimiter) {
+  auto db = OpenDb('\0');
+  StringLists slists(db);
+
+  slists.Append("k1", "v1");
+  slists.Append("k1", "v2");
+  slists.Append("k1", "v3");
+
+  std::string res;
+  bool status = slists.Get("k1", &res);
+  ASSERT_TRUE(status);
+
+  // Construct the desired string. Default constructor doesn't like '\0' chars.
+  std::string checker("v1,v2,v3");    // Verify that the string is right size.
+  checker[2] = '\0';                  // Use null delimiter instead of comma.
+  checker[5] = '\0';
+  assert(checker.size() == 8);        // Verify it is still the correct size
+
+  // Check that the rocksdb result string matches the desired string
+  assert(res.size() == checker.size());
+  ASSERT_EQ(res, checker);
+}
+
+} // namespace rocksdb
+
+int main(int arc, char** argv) {
+  // Run with regular database
+  {
+    fprintf(stderr, "Running tests with regular db and operator.\n");
+    StringAppendOperatorTest::SetOpenDbFunction(&OpenNormalDb);
+    rocksdb::test::RunAllTests();
+  }
+
+  // Run with TTL
+  {
+    fprintf(stderr, "Running tests with ttl db and generic operator.\n");
+    StringAppendOperatorTest::SetOpenDbFunction(&OpenTtlDb);
+    rocksdb::test::RunAllTests();
+  }
+
+  return 0;
+}
diff --git a/utilities/merge_operators/uint64add.cc b/utilities/merge_operators/uint64add.cc
new file mode 100644 (file)
index 0000000..9d78651
--- /dev/null
@@ -0,0 +1,65 @@
+#include <memory>
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+using namespace rocksdb;
+
+namespace { // anonymous namespace
+
+// A 'model' merge operator with uint64 addition semantics
+// Implemented as an AssociativeMergeOperator for simplicity and example.
+class UInt64AddOperator : public AssociativeMergeOperator {
+ public:
+  virtual bool Merge(const Slice& key,
+                     const Slice* existing_value,
+                     const Slice& value,
+                     std::string* new_value,
+                     Logger* logger) const override {
+    uint64_t orig_value = 0;
+    if (existing_value){
+      orig_value = DecodeInteger(*existing_value, logger);
+    }
+    uint64_t operand = DecodeInteger(value, logger);
+
+    assert(new_value);
+    new_value->clear();
+    PutFixed64(new_value, orig_value + operand);
+
+    return true;  // Return true always since corruption will be treated as 0
+  }
+
+  virtual const char* Name() const override {
+    return "UInt64AddOperator";
+  }
+
+ private:
+  // Takes the string and decodes it into a uint64_t
+  // On error, prints a message and returns 0
+  uint64_t DecodeInteger(const Slice& value, Logger* logger) const {
+    uint64_t result = 0;
+
+    if (value.size() == sizeof(uint64_t)) {
+      result = DecodeFixed64(value.data());
+    } else if (logger != nullptr) {
+      // If value is corrupted, treat it as 0
+      Log(logger, "uint64 value corruption, size: %zu > %zu",
+          value.size(), sizeof(uint64_t));
+    }
+
+    return result;
+  }
+
+};
+
+}
+
+namespace rocksdb {
+
// Factory: builds the file-local UInt64AddOperator.
std::shared_ptr<MergeOperator> MergeOperators::CreateUInt64AddOperator() {
  return std::make_shared<UInt64AddOperator>();
}
+
+}
diff --git a/utilities/redis/README b/utilities/redis/README
new file mode 100644 (file)
index 0000000..8b17bc0
--- /dev/null
@@ -0,0 +1,14 @@
+This folder defines a REDIS-style interface for Rocksdb.
+Right now it is written as a simple tag-on in the rocksdb::RedisLists class.
+It implements Redis Lists, and supports only the "non-blocking operations".
+
+Internally, the set of lists are stored in a rocksdb database, mapping keys to
+values. Each "value" is the list itself, storing a sequence of "elements".
+Each element is stored as a 32-bit-integer, followed by a sequence of bytes.
+The 32-bit-integer represents the length of the element (that is, the number
+of bytes that follow). And then that many bytes follow.
+
+
+NOTE: This README file may be old. See the actual redis_lists.cc file for
+definitive details on the implementation. There should be a header at the top
+of that file, explaining a bit of the implementation details.
diff --git a/utilities/redis/redis_list_exception.h b/utilities/redis/redis_list_exception.h
new file mode 100644 (file)
index 0000000..0b0f376
--- /dev/null
@@ -0,0 +1,22 @@
+/**
+ * A simple structure for exceptions in RedisLists.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#ifndef ROCKSDB_LITE
+#pragma once
+#include <exception>
+
+namespace rocksdb {
+
// Exception thrown by RedisLists on an invalid operation or corrupt list
// data. Stateless: the message is fixed.
class RedisListException: public std::exception {
 public:
  // `override` added so the compiler verifies this really overrides
  // std::exception::what().
  virtual const char* what() const throw() override {
    return "Invalid operation or corrupt data in Redis List.";
  }
};
+
+} // namespace rocksdb
+#endif
diff --git a/utilities/redis/redis_list_iterator.h b/utilities/redis/redis_list_iterator.h
new file mode 100644 (file)
index 0000000..b776ada
--- /dev/null
@@ -0,0 +1,310 @@
+// Copyright 2013 Facebook
+/**
+ * RedisListIterator:
+ * An abstraction over the "list" concept (e.g.: for redis lists).
+ * Provides functionality to read, traverse, edit, and write these lists.
+ *
+ * Upon construction, the RedisListIterator is given a block of list data.
+ * Internally, it stores a pointer to the data and a pointer to current item.
+ * It also stores a "result" list that will be mutated over time.
+ *
+ * Traversal and mutation are done by "forward iteration".
+ * The Push() and Skip() methods will advance the iterator to the next item.
+ * However, Push() will also "write the current item to the result".
+ * Skip() will simply move to next item, causing current item to be dropped.
+ *
+ * Upon completion, the result (accessible by WriteResult()) will be saved.
+ * All "skipped" items will be gone; all "pushed" items will remain.
+ *
+ * @throws Any of the operations may throw a RedisListException if an invalid
+ *          operation is performed or if the data is found to be corrupt.
+ *
+ * @notes By default, if WriteResult() is called part-way through iteration,
+ *        it will automatically advance the iterator to the end, and Keep()
+ *        all items that haven't been traversed yet. This may be subject
+ *        to review.
+ *
+ * @notes Can access the "current" item via GetCurrent(), and other
+ *        list-specific information such as Length().
+ *
+ * @notes The internal representation is due to change at any time. Presently,
+ *        the list is represented as follows:
+ *          - 32-bit integer header: the number of items in the list
+ *          - For each item:
+ *              - 32-bit int (n): the number of bytes representing this item
+ *              - n bytes of data: the actual data.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ */
+
+#ifndef ROCKSDB_LITE
+#pragma once
+
+#include <string>
+
+#include "redis_list_exception.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+/// An abstraction over the "list" concept.
+/// All operations may throw a RedisListException
+class RedisListIterator {
+ public:
+  /// Construct a redis-list-iterator based on data.
+  /// If the data is non-empty, it must be formatted according to @notes above.
+  ///
+  /// If the data is valid, we can assume the following invariant(s):
+  ///  a) length_, num_bytes_ are set correctly.
+  ///  b) cur_byte_ always refers to the start of the current element,
+  ///       just before the bytes that specify element length.
+  ///  c) cur_elem_ is always the index of the current element.
+  ///  d) cur_elem_length_ is always the number of bytes in current element,
+  ///       excluding the 4-byte header itself.
+  ///  e) result_ will always contain data_[0..cur_byte_) and a header
+  ///  f) Whenever corrupt data is encountered or an invalid operation is
+  ///      attempted, a RedisListException will immediately be thrown.
+  ///
+  /// NOTE(review): data_ points INTO list_data; the caller must keep
+  /// list_data alive (and unmodified) for the lifetime of this iterator.
+  RedisListIterator(const std::string& list_data)
+      : data_(list_data.data()),
+        num_bytes_(list_data.size()),
+        cur_byte_(0),
+        cur_elem_(0),
+        cur_elem_length_(0),
+        length_(0),
+        result_() {
+
+    // Initialize the result_ (reserve enough space for header)
+    InitializeResult();
+
+    // Parse the data only if it is not empty.
+    if (num_bytes_ == 0) {
+      return;
+    }
+
+    // If non-empty, but less than 4 bytes, data must be corrupt
+    if (num_bytes_ < sizeof(length_)) {
+      ThrowError("Corrupt header.");    // Will break control flow
+    }
+
+    // Good. The first bytes specify the number of elements
+    length_ = DecodeFixed32(data_);
+    cur_byte_ = sizeof(length_);
+
+    // If we have at least one element, point to that element.
+    // Also, read the first integer of the element (specifying the size),
+    //   if possible.
+    if (length_ > 0) {
+      if (cur_byte_ + sizeof(cur_elem_length_) <= num_bytes_) {
+        cur_elem_length_ = DecodeFixed32(data_+cur_byte_);
+      } else {
+        ThrowError("Corrupt data for first element.");
+      }
+    }
+
+    // At this point, we are fully set-up.
+    // The invariants described in the header should now be true.
+  }
+
+  /// Reserve some space for the result_.
+  /// Equivalent to result_.reserve(bytes).
+  /// NOTE(review): takes int while sizes elsewhere are uint32_t; callers
+  /// pass Size()-derived values, so this is fine for lists < 2GB.
+  void Reserve(int bytes) {
+    result_.reserve(bytes);
+  }
+
+  /// Go to next element in data file.
+  /// Also writes the current element to result_.
+  RedisListIterator& Push() {
+    WriteCurrentElement();
+    MoveNext();
+    return *this;
+  }
+
+  /// Go to next element in data file.
+  /// Drops/skips the current element. It will not be written to result_.
+  RedisListIterator& Skip() {
+    MoveNext();
+    --length_;          // One less item
+    --cur_elem_;        // We moved one forward, but index did not change
+    return *this;
+  }
+
+  /// Insert elem into the result_ (just BEFORE the current element / byte)
+  /// Note: if Done() (i.e.: iterator points to end), this will append elem.
+  void InsertElement(const Slice& elem) {
+    // Ensure we are in a valid state
+    CheckErrors();
+
+    // NOTE(review): result_.size() narrows size_t -> int here; safe while
+    // the serialized list stays well under 2GB.
+    const int kOrigSize = result_.size();
+    result_.resize(kOrigSize + SizeOf(elem));
+    EncodeFixed32(result_.data() + kOrigSize, elem.size());
+    memcpy(result_.data() + kOrigSize + sizeof(uint32_t),
+           elem.data(),
+           elem.size());
+    ++length_;
+    ++cur_elem_;
+  }
+
+  /// Access the current element, and save the result into *curElem
+  void GetCurrent(Slice* curElem) {
+    // Ensure we are in a valid state
+    CheckErrors();
+
+    // Ensure that we are not past the last element.
+    if (Done()) {
+      ThrowError("Invalid dereferencing.");
+    }
+
+    // Dereference the element: skip the 4-byte length header, then view
+    // cur_elem_length_ bytes of payload (no copy; points into data_).
+    *curElem = Slice(data_+cur_byte_+sizeof(cur_elem_length_),
+                     cur_elem_length_);
+  }
+
+  // Number of elements
+  int Length() const {
+    return length_;
+  }
+
+  // Number of bytes in the final representation (i.e: WriteResult().size())
+  int Size() const {
+    // result_ holds the currently written data
+    // data_[cur_byte..num_bytes-1] is the remainder of the data
+    return result_.size() + (num_bytes_ - cur_byte_);
+  }
+
+  // Reached the end?
+  bool Done() const {
+    return cur_byte_ >= num_bytes_ || cur_elem_ >= length_;
+  }
+
+  /// Returns a string representing the final, edited, data.
+  /// Assumes that all bytes of data_ in the range [0,cur_byte_) have been read
+  ///  and that result_ contains this data.
+  /// The rest of the data must still be written.
+  /// So, this method ADVANCES THE ITERATOR TO THE END before writing.
+  /// NOTE(review): the returned Slice points into result_, so it is only
+  /// valid while this iterator is alive and unmodified.
+  Slice WriteResult() {
+    CheckErrors();
+
+    // The header should currently be filled with dummy data (0's)
+    // Correctly update the header.
+    // Note, this is safe since result_ is a vector (guaranteed contiguous)
+    EncodeFixed32(&result_[0],length_);
+
+    // Append the remainder of the data to the result.
+    result_.insert(result_.end(),data_+cur_byte_, data_ +num_bytes_);
+
+    // Seek to end of file
+    cur_byte_ = num_bytes_;
+    cur_elem_ = length_;
+    cur_elem_length_ = 0;
+
+    // Return the result
+    return Slice(result_.data(),result_.size());
+  }
+
+ public: // Static public functions
+
+  /// An upper-bound on the amount of bytes needed to store this element.
+  /// This is used to hide representation information from the client.
+  /// E.G. This can be used to compute the bytes we want to Reserve().
+  static uint32_t SizeOf(const Slice& elem) {
+    // [Integer Length . Data]
+    return sizeof(uint32_t) + elem.size();
+  }
+
+ private: // Private functions
+
+  /// Initializes the result_ string.
+  /// It will fill the first few bytes with 0's so that there is
+  ///  enough space for header information when we need to write later.
+  /// Currently, "header information" means: the length (number of elements)
+  /// Assumes that result_ is empty to begin with
+  void InitializeResult() {
+    assert(result_.empty());            // Should always be true.
+    result_.resize(sizeof(uint32_t),0); // Put a block of 0's as the header
+  }
+
+  /// Go to the next element (used in Push() and Skip())
+  void MoveNext() {
+    CheckErrors();
+
+    // Check to make sure we are not already in a finished state
+    if (Done()) {
+      ThrowError("Attempting to iterate past end of list.");
+    }
+
+    // Move forward one element.
+    cur_byte_ += sizeof(cur_elem_length_) + cur_elem_length_;
+    ++cur_elem_;
+
+    // If we are at the end, finish
+    if (Done()) {
+      cur_elem_length_ = 0;
+      return;
+    }
+
+    // Otherwise, we should be able to read the new element's length
+    if (cur_byte_ + sizeof(cur_elem_length_) > num_bytes_) {
+      ThrowError("Corrupt element data.");
+    }
+
+    // Set the new element's length
+    cur_elem_length_ = DecodeFixed32(data_+cur_byte_);
+
+    return;
+  }
+
+  /// Append the current element (pointed to by cur_byte_) to result_
+  /// Assumes result_ has already been reserved appropriately.
+  void WriteCurrentElement() {
+    // First verify that the iterator is still valid.
+    CheckErrors();
+    if (Done()) {
+      ThrowError("Attempting to write invalid element.");
+    }
+
+    // Append the cur element (4-byte length header plus payload).
+    result_.insert(result_.end(),
+                   data_+cur_byte_,
+                   data_+cur_byte_+ sizeof(uint32_t) + cur_elem_length_);
+  }
+
+  /// Will ThrowError() if necessary.
+  /// Checks for common/ubiquitous errors that can arise after most operations.
+  /// This method should be called before any reading operation.
+  /// If this function succeeds, then we are guaranteed to be in a valid state.
+  /// Other member functions should check for errors and ThrowError() also
+  ///  if an error occurs that is specific to it even while in a valid state.
+  void CheckErrors() {
+    // Check if any crazy thing has happened recently
+    if ((cur_elem_ > length_) ||                              // Bad index
+        (cur_byte_ > num_bytes_) ||                           // No more bytes
+        (cur_byte_ + cur_elem_length_ > num_bytes_) ||        // Item too large
+        (cur_byte_ == num_bytes_ && cur_elem_ != length_) ||  // Too many items
+        (cur_elem_ == length_ && cur_byte_ != num_bytes_)) {  // Too many bytes
+      ThrowError("Corrupt data.");
+    }
+  }
+
+  /// Will throw an exception based on the passed-in message.
+  /// This function is guaranteed to STOP THE CONTROL-FLOW.
+  /// (i.e.: you do not have to call "return" after calling ThrowError)
+  void ThrowError(const char* const msg = NULL) {
+    // TODO: For now we ignore the msg parameter. This can be expanded later.
+    throw RedisListException();
+  }
+
+ private:
+  const char* const data_;      // A pointer to the data (the first byte)
+  const uint32_t num_bytes_;    // The number of bytes in this list
+
+  uint32_t cur_byte_;           // The current byte being read
+  uint32_t cur_elem_;           // The current element being read
+  uint32_t cur_elem_length_;    // The number of bytes in current element
+
+  uint32_t length_;             // The number of elements in this list
+  std::vector<char> result_;    // The output data
+};
+
+} // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/utilities/redis/redis_lists.cc b/utilities/redis/redis_lists.cc
new file mode 100644 (file)
index 0000000..2b38a2d
--- /dev/null
@@ -0,0 +1,552 @@
+// Copyright 2013 Facebook
+/**
+ * A (persistent) Redis API built using the rocksdb backend.
+ * Implements Redis Lists as described on: http://redis.io/commands#list
+ *
+ * @throws All functions may throw a RedisListException on error/corruption.
+ *
+ * @notes Internally, the set of lists is stored in a rocksdb database,
+ *        mapping keys to values. Each "value" is the list itself, storing
+ *        some kind of internal representation of the data. All the
+ *        representation details are handled by the RedisListIterator class.
+ *        The present file should be oblivious to the representation details,
+ *        handling only the client (Redis) API, and the calls to rocksdb.
+ *
+ * @TODO  Presently, all operations take at least O(NV) time where
+ *        N is the number of elements in the list, and V is the average
+ *        number of bytes per value in the list. So maybe, with merge operator
+ *        we can improve this to an optimal O(V) amortized time, since we
+ *        wouldn't have to read and re-write the entire list.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ */
+
+#ifndef ROCKSDB_LITE
+#include "redis_lists.h"
+
+#include <iostream>
+#include <memory>
+#include <cmath>
+
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+
+namespace rocksdb
+{
+
+/// Constructors
+
+// Open (optionally after destroying) the backing rocksdb database at db_path.
+RedisLists::RedisLists(const std::string& db_path,
+                       Options options, bool destructive)
+    : put_option_(),
+      get_option_() {
+
+  // Store the name of the database
+  db_name_ = db_path;
+
+  // If destructive, destroy the DB before re-opening it.
+  if (destructive) {
+    DestroyDB(db_name_, Options());
+  }
+
+  // Now open and deal with the db
+  // NOTE(review): if Open fails in an NDEBUG build, assert(false) is a
+  // no-op and the uninitialized `db` pointer is wrapped below — consider
+  // aborting (or throwing) unconditionally on failure.
+  DB* db;
+  Status s = DB::Open(options, db_name_, &db);
+  if (!s.ok()) {
+    std::cerr << "ERROR " << s.ToString() << std::endl;
+    assert(false);
+  }
+
+  db_ = std::unique_ptr<DB>(db);
+}
+
+
+/// Accessors
+
+// Number of elements in the list associated with key
+//   : throws RedisListException
+int RedisLists::Length(const std::string& key) {
+  // Extract the string data representing the list.
+  // NOTE(review): the Get() status is ignored everywhere in this file; a
+  // missing key leaves `data` empty, which parses as an empty list, but a
+  // real read error is silently treated the same way.
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Return the length
+  RedisListIterator it(data);
+  return it.Length();
+}
+
+// Get the element at the specified index in the (list: key)
+// Returns <empty> ("") on out-of-bounds
+//   : throws RedisListException
+bool RedisLists::Index(const std::string& key, int32_t index,
+                       std::string* result) {
+  // Extract the string data representing the list.
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Handle REDIS negative indices (from the end); fast iff Length() takes O(1)
+  // NOTE(review): Length(key) re-reads the value from the DB, so a negative
+  // index costs a second Get. If index < -length it stays negative and the
+  // function correctly falls through to return false.
+  if (index < 0) {
+    index = Length(key) - (-index);  //replace (-i) with (N-i).
+  }
+
+  // Iterate through the list until the desired index is found.
+  int curIndex = 0;
+  RedisListIterator it(data);
+  while(curIndex < index && !it.Done()) {
+    ++curIndex;
+    it.Skip();
+  }
+
+  // If we actually found the index
+  if (curIndex == index && !it.Done()) {
+    Slice elem;
+    it.GetCurrent(&elem);
+    // result may be NULL when the caller only cares about existence.
+    if (result != NULL) {
+      *result = elem.ToString();
+    }
+
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Return a truncated version of the list.
+// First, negative values for first/last are interpreted as "end of list".
+// So, if first == -1, then it is re-set to index: (Length(key) - 1)
+// Then, return exactly those indices i such that first <= i <= last.
+//   : throws RedisListException
+std::vector<std::string> RedisLists::Range(const std::string& key,
+                                           int32_t first, int32_t last) {
+  // Extract the string data representing the list.
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Handle negative bounds (-1 means last element, etc.)
+  // NOTE(review): Length(key) performs a second DB read of the same value.
+  int listLen = Length(key);
+  if (first < 0) {
+    first = listLen - (-first);           // Replace (-x) with (N-x)
+  }
+  if (last < 0) {
+    last = listLen - (-last);
+  }
+
+  // Verify bounds (and truncate the range so that it is valid)
+  first = std::max(first, 0);
+  last = std::min(last, listLen-1);
+  int len = std::max(last-first+1, 0);
+
+  // Initialize the resulting list
+  std::vector<std::string> result(len);
+
+  // Traverse the list and update the vector
+  int curIdx = 0;
+  Slice elem;
+  for (RedisListIterator it(data); !it.Done() && curIdx<=last; it.Skip()) {
+    if (first <= curIdx && curIdx <= last) {
+      it.GetCurrent(&elem);
+      result[curIdx-first].assign(elem.data(),elem.size());
+    }
+
+    ++curIdx;
+  }
+
+  // Return the result. Might be empty
+  return result;
+}
+
+// Print the (list: key) out to stdout. For debugging mostly. Public for now.
+void RedisLists::Print(const std::string& key) {
+  // Extract the string data representing the list.
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Iterate through the list and print the items
+  Slice elem;
+  for (RedisListIterator it(data); !it.Done(); it.Skip()) {
+    it.GetCurrent(&elem);
+    std::cout << "ITEM " << elem.ToString() << std::endl;
+  }
+
+  //Now print the byte data
+  RedisListIterator it(data);
+  std::cout << "==Printing data==" << std::endl;
+  std::cout << data.size() << std::endl;
+  std::cout << it.Size() << " " << it.Length() << std::endl;
+  Slice result = it.WriteResult();
+  // NOTE(review): result.data() is NOT NUL-terminated (it views a
+  // std::vector<char>), so streaming it as const char* can read past the
+  // end. Debug-only code, but worth fixing via result.ToString().
+  std::cout << result.data() << std::endl;
+  if (true) {
+    std::cout << "size: " << result.size() << std::endl;
+    const char* val = result.data();
+    for(int i=0; i<(int)result.size(); ++i) {
+      std::cout << (int)val[i] << " " << (val[i]>=32?val[i]:' ') << std::endl;
+    }
+    std::cout << std::endl;
+  }
+}
+
+/// Insert/Update Functions
+/// Note: The "real" insert function is private. See below.
+
+// InsertBefore and InsertAfter are simply wrappers around the Insert function.
+// Thin wrapper: insert `value` immediately BEFORE the first occurrence of
+// `pivot` in (list: key). Returns the (possibly unchanged) list length.
+int RedisLists::InsertBefore(const std::string& key, const std::string& pivot,
+                             const std::string& value) {
+  return Insert(key, pivot, value, false);
+}
+
+// Thin wrapper: insert `value` immediately AFTER the first occurrence of
+// `pivot` in (list: key). Returns the (possibly unchanged) list length.
+int RedisLists::InsertAfter(const std::string& key, const std::string& pivot,
+                            const std::string& value) {
+  return Insert(key, pivot, value, true);
+}
+
+// Prepend value onto beginning of (list: key)
+//   : throws RedisListException
+int RedisLists::PushLeft(const std::string& key, const std::string& value) {
+  // Get the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Construct the result: since the iterator starts at position 0,
+  // InsertElement places `value` before all existing elements.
+  RedisListIterator it(data);
+  it.Reserve(it.Size() + it.SizeOf(value));
+  it.InsertElement(value);
+
+  // Push the data back to the db and return the length
+  // NOTE(review): Put() status is ignored here (unlike Set()/Trim()).
+  db_->Put(put_option_, key, it.WriteResult());
+  return it.Length();
+}
+
+// Append value onto end of (list: key)
+// TODO: Make this O(1) time. Might require MergeOperator.
+//   : throws RedisListException
+int RedisLists::PushRight(const std::string& key, const std::string& value) {
+  // Get the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Create an iterator to the data and seek to the end.
+  RedisListIterator it(data);
+  it.Reserve(it.Size() + it.SizeOf(value));
+  while (!it.Done()) {
+    it.Push();    // Write each element as we go
+  }
+
+  // Insert the new element at the current position (the end)
+  it.InsertElement(value);
+
+  // Push it back to the db, and return length
+  db_->Put(put_option_, key, it.WriteResult());
+  return it.Length();
+}
+
+// Set (list: key)[idx] = val. Return true on success, false on fail.
+//   : throws RedisListException
+bool RedisLists::Set(const std::string& key, int32_t index,
+                     const std::string& value) {
+  // Get the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Handle negative index for REDIS (meaning -index from end of list)
+  if (index < 0) {
+    index = Length(key) - (-index);
+  }
+
+  // Iterate through the list until we find the element we want
+  int curIndex = 0;
+  RedisListIterator it(data);
+  it.Reserve(it.Size() + it.SizeOf(value));  // Over-estimate is fine
+  while(curIndex < index && !it.Done()) {
+    it.Push();
+    ++curIndex;
+  }
+
+  // If not found, return false (this occurs when index was invalid)
+  if (it.Done() || curIndex != index) {
+    return false;
+  }
+
+  // Write the new element value, and drop the previous element value
+  // (insert-then-skip replaces the element in place).
+  it.InsertElement(value);
+  it.Skip();
+
+  // Write the data to the database
+  // Check status, since it needs to return true/false guarantee
+  Status s = db_->Put(put_option_, key, it.WriteResult());
+
+  // Success
+  return s.ok();
+}
+
+/// Delete / Remove / Pop functions
+
+// Trim (list: key) so that it will only contain the indices from start..stop
+//  Invalid indices will not generate an error, just empty,
+//  or the portion of the list that fits in this interval
+//   : throws RedisListException
+bool RedisLists::Trim(const std::string& key, int32_t start, int32_t stop) {
+  // Get the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Handle negative indices in REDIS
+  int listLen = Length(key);
+  if (start < 0) {
+    start = listLen - (-start);
+  }
+  if (stop < 0) {
+    stop = listLen - (-stop);
+  }
+
+  // Truncate bounds to only fit in the list
+  // (an empty/inverted range keeps nothing, which matches the contract).
+  start = std::max(start, 0);
+  stop = std::min(stop, listLen-1);
+
+  // Construct an iterator for the list. Drop all undesired elements.
+  int curIndex = 0;
+  RedisListIterator it(data);
+  it.Reserve(it.Size());          // Over-estimate
+  while(!it.Done()) {
+    // If not within the range, just skip the item (drop it).
+    // Otherwise, continue as usual.
+    if (start <= curIndex && curIndex <= stop) {
+      it.Push();
+    } else {
+      it.Skip();
+    }
+
+    // Increment the current index
+    ++curIndex;
+  }
+
+  // Write the (possibly empty) result to the database
+  Status s = db_->Put(put_option_, key, it.WriteResult());
+
+  // Return true as long as the write succeeded
+  return s.ok();
+}
+
+// Return and remove the first element in the list (or "" if empty)
+//   : throws RedisListException
+bool RedisLists::PopLeft(const std::string& key, std::string* result) {
+  // Get the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Point to first element in the list (if it exists), and get its value/size
+  RedisListIterator it(data);
+  if (it.Length() > 0) {            // Proceed only if list is non-empty
+    Slice elem;
+    it.GetCurrent(&elem);           // Store the value of the first element
+    it.Reserve(it.Size() - it.SizeOf(elem));
+    it.Skip();                      // DROP the first item and move to next
+
+    // Update the db
+    db_->Put(put_option_, key, it.WriteResult());
+
+    // Return the value. Note: `elem` views `data`, which is still alive,
+    // so the ToString() copy below is safe.
+    if (result != NULL) {
+      *result = elem.ToString();
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Remove and return the last element in the list (or "" if empty)
+// TODO: Make this O(1). Might require MergeOperator.
+//   : throws RedisListException
+bool RedisLists::PopRight(const std::string& key, std::string* result) {
+  // Extract the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Construct an iterator to the data and move to last element
+  RedisListIterator it(data);
+  it.Reserve(it.Size());
+  int len = it.Length();
+  int curIndex = 0;
+  while(curIndex < (len-1) && !it.Done()) {
+    it.Push();
+    ++curIndex;
+  }
+
+  // Extract and drop/skip the last element
+  if (curIndex == len-1) {
+    assert(!it.Done());         // Sanity check. Should not have ended here.
+
+    // Extract and pop the element
+    Slice elem;
+    it.GetCurrent(&elem);       // Save value of element.
+    it.Skip();                  // Skip the element
+
+    // Write the result to the database
+    db_->Put(put_option_, key, it.WriteResult());
+
+    // Return the value
+    if (result != NULL) {
+      *result = elem.ToString();
+    }
+    return true;
+  } else {
+    // Must have been an empty list
+    assert(it.Done() && len==0 && curIndex == 0);
+    return false;
+  }
+}
+
+// Remove the (first or last) "num" occurrences of value in (list: key)
+//   : throws RedisListException
+int RedisLists::Remove(const std::string& key, int32_t num,
+                       const std::string& value) {
+  // Negative num ==> RemoveLast; Positive num ==> Remove First
+  // num == 0 removes ALL occurrences (presumably to match Redis LREM
+  // count=0 semantics — verify against the Redis spec).
+  if (num < 0) {
+    return RemoveLast(key, -num, value);
+  } else if (num > 0) {
+    return RemoveFirst(key, num, value);
+  } else {
+    return RemoveFirst(key, Length(key), value);
+  }
+}
+
+// Remove the first "num" occurrences of value in (list: key).
+//   : throws RedisListException
+int RedisLists::RemoveFirst(const std::string& key, int32_t num,
+                            const std::string& value) {
+  // Ensure that the number is positive
+  assert(num >= 0);
+
+  // Extract the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Traverse the list, appending all but the desired occurrences of value
+  int numSkipped = 0;         // Keep track of the number of times value is seen
+  Slice elem;
+  RedisListIterator it(data);
+  it.Reserve(it.Size());
+  while (!it.Done()) {
+    it.GetCurrent(&elem);
+
+    if (elem == value && numSkipped < num) {
+      // Drop this item if desired
+      it.Skip();
+      ++numSkipped;
+    } else {
+      // Otherwise keep the item and proceed as normal
+      it.Push();
+    }
+  }
+
+  // Put the result back to the database
+  db_->Put(put_option_, key, it.WriteResult());
+
+  // Return the number of elements removed
+  return numSkipped;
+}
+
+
+// Remove the last "num" occurrences of value in (list: key).
+// TODO: I traverse the list 2x. Make faster. Might require MergeOperator.
+//   : throws RedisListException
+int RedisLists::RemoveLast(const std::string& key, int32_t num,
+                           const std::string& value) {
+  // Ensure that the number is positive
+  assert(num >= 0);
+
+  // Extract the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Temporary variable to hold the "current element" in the blocks below
+  Slice elem;
+
+  // First pass: count the total number of occurrences of value
+  int totalOccs = 0;
+  for (RedisListIterator it(data); !it.Done(); it.Skip()) {
+    it.GetCurrent(&elem);
+    if (elem == value) {
+      ++totalOccs;
+    }
+  }
+
+  // Construct an iterator to the data. Reserve enough space for the result.
+  RedisListIterator it(data);
+  int bytesRemoved = std::min(num,totalOccs)*it.SizeOf(value);
+  it.Reserve(it.Size() - bytesRemoved);
+
+  // Second pass: traverse the list, appending all but the desired
+  // occurrences of value.
+  // Note: "Drop the last k occurrences" is equivalent to
+  //  "keep only the first n-k occurrences", where n is total occurrences.
+  int numKept = 0;          // Keep track of the number of times value is kept
+  while(!it.Done()) {
+    it.GetCurrent(&elem);
+
+    // If we are within the deletion range and equal to value, drop it.
+    // Otherwise, append/keep/push it.
+    if (elem == value) {
+      if (numKept < totalOccs - num) {
+        it.Push();
+        ++numKept;
+      } else {
+        it.Skip();
+      }
+    } else {
+      // Always append the others
+      it.Push();
+    }
+  }
+
+  // Put the result back to the database
+  db_->Put(put_option_, key, it.WriteResult());
+
+  // Return the number of elements removed
+  return totalOccs - numKept;
+}
+
+/// Private functions
+
+// Insert element value into (list: key), right before/after
+//  the first occurrence of pivot
+//   : throws RedisListException
+int RedisLists::Insert(const std::string& key, const std::string& pivot,
+                       const std::string& value, bool insert_after) {
+  // Get the original list data
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Construct an iterator to the data and reserve enough space for result.
+  RedisListIterator it(data);
+  it.Reserve(it.Size() + it.SizeOf(value));
+
+  // Iterate through the list until we find the element we want
+  Slice elem;
+  bool found = false;
+  while(!it.Done() && !found) {
+    it.GetCurrent(&elem);
+
+    // When we find the element, insert the element and mark found
+    if (elem == pivot) {                // Found it!
+      found = true;
+      if (insert_after == true) {       // Skip one more, if inserting after it
+        it.Push();
+      }
+      it.InsertElement(value);
+    } else {
+      it.Push();
+    }
+
+  }
+
+  // Put the data (string) into the database only if the pivot was found;
+  // otherwise the list is left untouched (and its length is returned as-is).
+  if (found) {
+    db_->Put(put_option_, key, it.WriteResult());
+  }
+
+  // Returns the new (possibly unchanged) length of the list
+  return it.Length();
+}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/utilities/redis/redis_lists.h b/utilities/redis/redis_lists.h
new file mode 100644 (file)
index 0000000..6c8b955
--- /dev/null
@@ -0,0 +1,108 @@
+/**
+ * A (persistent) Redis API built using the rocksdb backend.
+ * Implements Redis Lists as described on: http://redis.io/commands#list
+ *
+ * @throws All functions may throw a RedisListException
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#ifndef ROCKSDB_LITE
+#pragma once
+
+#include <string>
+#include "rocksdb/db.h"
+#include "redis_list_iterator.h"
+#include "redis_list_exception.h"
+
+namespace rocksdb {
+
+/// The Redis functionality (see http://redis.io/commands#list)
+/// All functions may THROW a RedisListException
+class RedisLists {
+ public: // Constructors / Destructors
+  /// Construct a new RedisLists database, with name/path of db.
+  /// Will clear the database on open iff destructive is true (default false).
+  /// Otherwise, it will restore saved changes.
+  /// May throw RedisListException
+  RedisLists(const std::string& db_path,
+             Options options, bool destructive = false);
+
+ public:  // Accessors
+  /// The number of items in (list: key)
+  /// Reads the entire stored value, so this is O(list size), not O(1).
+  int Length(const std::string& key);
+
+  /// Search the list for the (index)'th item (0-based) in (list:key)
+  /// A negative index indicates: "from end-of-list"
+  /// If index is within range: return true, and return the value in *result.
+  /// If (index < -length OR index>=length), then index is out of range:
+  ///   return false (and *result is left unchanged)
+  /// May throw RedisListException
+  bool Index(const std::string& key, int32_t index,
+             std::string* result);
+
+  /// Return (list: key)[first..last] (inclusive)
+  /// May throw RedisListException
+  std::vector<std::string> Range(const std::string& key,
+                                 int32_t first, int32_t last);
+
+  /// Prints the entire (list: key), for debugging.
+  void Print(const std::string& key);
+
+ public: // Insert/Update
+  /// Insert value before/after pivot in (list: key). Return the length.
+  /// May throw RedisListException
+  int InsertBefore(const std::string& key, const std::string& pivot,
+                   const std::string& value);
+  int InsertAfter(const std::string& key, const std::string& pivot,
+                  const std::string& value);
+
+  /// Push / Insert value at beginning/end of the list. Return the length.
+  /// May throw RedisListException
+  int PushLeft(const std::string& key, const std::string& value);
+  int PushRight(const std::string& key, const std::string& value);
+
+  /// Set (list: key)[idx] = val. Return true on success, false on fail
+  /// May throw RedisListException
+  bool Set(const std::string& key, int32_t index, const std::string& value);
+
+ public: // Delete / Remove / Pop / Trim
+  /// Trim (list: key) so that it will only contain the indices from start..stop
+  /// Returns true on success
+  /// May throw RedisListException
+  bool Trim(const std::string& key, int32_t start, int32_t stop);
+
+  /// If list is empty, return false and leave *result unchanged.
+  /// Else, remove the first/last elem, store it in *result, and return true
+  bool PopLeft(const std::string& key, std::string* result);  // First
+  bool PopRight(const std::string& key, std::string* result); // Last
+
+  /// Remove the first (or last) num occurrences of value from the list (key)
+  /// Return the number of elements removed.
+  /// May throw RedisListException
+  int Remove(const std::string& key, int32_t num,
+             const std::string& value);
+  int RemoveFirst(const std::string& key, int32_t num,
+                  const std::string& value);
+  int RemoveLast(const std::string& key, int32_t num,
+                 const std::string& value);
+
+ private: // Private Functions
+  /// Calls InsertBefore or InsertAfter
+  int Insert(const std::string& key, const std::string& pivot,
+             const std::string& value, bool insert_after);
+ private:
+  std::string db_name_;       // The database path; also used for DestroyDB
+                              //  when opened with destructive == true.
+  WriteOptions put_option_;   // Default-constructed options for all writes.
+  ReadOptions get_option_;    // Default-constructed options for all reads.
+
+  /// The backend rocksdb database.
+  /// Map : key --> list
+  ///       where a list is a sequence of elements
+  ///       and an element is a 4-byte integer (n), followed by n bytes of data
+  std::unique_ptr<DB> db_;
+};
+
+} // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/utilities/redis/redis_lists_test.cc b/utilities/redis/redis_lists_test.cc
new file mode 100644 (file)
index 0000000..b05c6c7
--- /dev/null
@@ -0,0 +1,884 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+/**
+ * A test harness for the Redis API built on rocksdb.
+ *
+ * USAGE: Build with: "make redis_test" (in rocksdb directory).
+ *        Run unit tests with: "./redis_test"
+ *        Manual/Interactive user testing: "./redis_test -m"
+ *        Manual user testing + restart database: "./redis_test -m -d"
+ *
+ * TODO:  Add LARGE random test cases to verify efficiency and scalability
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ */
+
+
+#include <iostream>
+#include <cctype>
+
+#include "redis_lists.h"
+#include "util/testharness.h"
+#include "util/random.h"
+
+using namespace rocksdb;
+using namespace std;
+
+namespace rocksdb {
+
+// Shared fixture state for all tests below: every test opens a RedisLists
+// over the same on-disk path, passing destructive=true to start clean.
+class RedisListsTest {
+ public:
+  static const string kDefaultDbName;  // Database path shared by every test
+  static Options options;              // create_if_missing set by the ctor
+
+  RedisListsTest() {
+    options.create_if_missing = true;
+  }
+};
+
+const string RedisListsTest::kDefaultDbName = "/tmp/redisdefaultdb/";
+Options RedisListsTest::options = Options();
+
+// operator== and operator<< are defined below for vectors (lists)
+// Needed for ASSERT_EQ
+
+namespace {
+/// Asserts element-wise equality of two string vectors (sizes first).
+/// Used because ASSERT_EQ cannot compare vectors directly here.
+void AssertListEq(const std::vector<std::string>& result,
+                  const std::vector<std::string>& expected_result) {
+  ASSERT_EQ(result.size(), expected_result.size());
+  for (size_t i = 0; i < result.size(); ++i) {
+    ASSERT_EQ(result[i], expected_result[i]);
+  }
+}
+}  // namespace
+
+// PushRight, Length, Index, Range
+TEST(RedisListsTest, SimpleTest) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // Simple PushRight (should return the new length each time)
+  ASSERT_EQ(redis.PushRight("k1", "v1"), 1);
+  ASSERT_EQ(redis.PushRight("k1", "v2"), 2);
+  ASSERT_EQ(redis.PushRight("k1", "v3"), 3);
+
+  // Check Length and Index() functions
+  ASSERT_EQ(redis.Length("k1"), 3);        // Check length
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "v1");   // Check valid indices
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "v2");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "v3");
+
+  // Check range function and vectors (Range endpoints are inclusive)
+  std::vector<std::string> result = redis.Range("k1", 0, 2);   // Get the list
+  std::vector<std::string> expected_result(3);
+  expected_result[0] = "v1";
+  expected_result[1] = "v2";
+  expected_result[2] = "v3";
+  AssertListEq(result, expected_result);
+}
+
+// PushLeft, Length, Index, Range
+TEST(RedisListsTest, SimpleTest2) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // Simple PushLeft (prepends, so push in reverse to get [v1, v2, v3])
+  ASSERT_EQ(redis.PushLeft("k1", "v3"), 1);
+  ASSERT_EQ(redis.PushLeft("k1", "v2"), 2);
+  ASSERT_EQ(redis.PushLeft("k1", "v1"), 3);
+
+  // Check Length and Index() functions
+  ASSERT_EQ(redis.Length("k1"), 3);        // Check length
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "v1");   // Check valid indices
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "v2");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "v3");
+
+  // Check range function and vectors
+  std::vector<std::string> result = redis.Range("k1", 0, 2);   // Get the list
+  std::vector<std::string> expected_result(3);
+  expected_result[0] = "v1";
+  expected_result[1] = "v2";
+  expected_result[2] = "v3";
+  AssertListEq(result, expected_result);
+}
+
+// Exhaustive test of the Index() function
+TEST(RedisListsTest, IndexTest) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // Empty Index check (returns false and must not crash or edit tempv)
+  tempv = "yo";
+  ASSERT_TRUE(!redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "yo");
+  ASSERT_TRUE(!redis.Index("fda", 3, &tempv));
+  ASSERT_EQ(tempv, "yo");
+  ASSERT_TRUE(!redis.Index("random", -12391, &tempv));
+  ASSERT_EQ(tempv, "yo");
+
+  // Simple Pushes (will yield: [v6, v4, v4, v1, v2, v3])
+  redis.PushRight("k1", "v1");
+  redis.PushRight("k1", "v2");
+  redis.PushRight("k1", "v3");
+  redis.PushLeft("k1", "v4");
+  redis.PushLeft("k1", "v4");
+  redis.PushLeft("k1", "v6");
+
+  // Simple, non-negative indices
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "v6");
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "v4");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "v4");
+  ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+  ASSERT_EQ(tempv, "v1");
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "v2");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "v3");
+
+  // Negative indices (measured from the end: -1 is the last element)
+  ASSERT_TRUE(redis.Index("k1", -6, &tempv));
+  ASSERT_EQ(tempv, "v6");
+  ASSERT_TRUE(redis.Index("k1", -5, &tempv));
+  ASSERT_EQ(tempv, "v4");
+  ASSERT_TRUE(redis.Index("k1", -4, &tempv));
+  ASSERT_EQ(tempv, "v4");
+  ASSERT_TRUE(redis.Index("k1", -3, &tempv));
+  ASSERT_EQ(tempv, "v1");
+  ASSERT_TRUE(redis.Index("k1", -2, &tempv));
+  ASSERT_EQ(tempv, "v2");
+  ASSERT_TRUE(redis.Index("k1", -1, &tempv));
+  ASSERT_EQ(tempv, "v3");
+
+  // Out of bounds (Index() returns false; no crash)
+  ASSERT_TRUE(!redis.Index("k1", 6, &tempv));
+  ASSERT_TRUE(!redis.Index("k1", 123219, &tempv));
+  ASSERT_TRUE(!redis.Index("k1", -7, &tempv));
+  ASSERT_TRUE(!redis.Index("k1", -129, &tempv));
+}
+
+
+// Exhaustive test of the Range() function
+TEST(RedisListsTest, RangeTest) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // Simple Pushes (will yield: [v6, v4, v4, v1, v2, v3])
+  redis.PushRight("k1", "v1");
+  redis.PushRight("k1", "v2");
+  redis.PushRight("k1", "v3");
+  redis.PushLeft("k1", "v4");
+  redis.PushLeft("k1", "v4");
+  redis.PushLeft("k1", "v6");
+
+  // Sanity check (check the length;  make sure it's 6)
+  ASSERT_EQ(redis.Length("k1"), 6);
+
+  // Simple range (both endpoints inclusive)
+  std::vector<std::string> res = redis.Range("k1", 1, 4);
+  ASSERT_EQ((int)res.size(), 4);
+  ASSERT_EQ(res[0], "v4");
+  ASSERT_EQ(res[1], "v4");
+  ASSERT_EQ(res[2], "v1");
+  ASSERT_EQ(res[3], "v2");
+
+  // Negative indices (i.e.: measured from the end)
+  res = redis.Range("k1", 2, -1);
+  ASSERT_EQ((int)res.size(), 4);
+  ASSERT_EQ(res[0], "v4");
+  ASSERT_EQ(res[1], "v1");
+  ASSERT_EQ(res[2], "v2");
+  ASSERT_EQ(res[3], "v3");
+
+  res = redis.Range("k1", -6, -4);
+  ASSERT_EQ((int)res.size(), 3);
+  ASSERT_EQ(res[0], "v6");
+  ASSERT_EQ(res[1], "v4");
+  ASSERT_EQ(res[2], "v4");
+
+  res = redis.Range("k1", -1, 5);
+  ASSERT_EQ((int)res.size(), 1);
+  ASSERT_EQ(res[0], "v3");
+
+  // Partial / Broken indices (out-of-range bounds are clamped to the list)
+  res = redis.Range("k1", -3, 1000000);
+  ASSERT_EQ((int)res.size(), 3);
+  ASSERT_EQ(res[0], "v1");
+  ASSERT_EQ(res[1], "v2");
+  ASSERT_EQ(res[2], "v3");
+
+  res = redis.Range("k1", -1000000, 1);
+  ASSERT_EQ((int)res.size(), 2);
+  ASSERT_EQ(res[0], "v6");
+  ASSERT_EQ(res[1], "v4");
+
+  // Invalid indices (start past end, or start > stop: expect empty result)
+  res = redis.Range("k1", 7, 9);
+  ASSERT_EQ((int)res.size(), 0);
+
+  res = redis.Range("k1", -8, -7);
+  ASSERT_EQ((int)res.size(), 0);
+
+  res = redis.Range("k1", 3, 2);
+  ASSERT_EQ((int)res.size(), 0);
+
+  res = redis.Range("k1", 5, -2);
+  ASSERT_EQ((int)res.size(), 0);
+
+  // Range matches Index
+  res = redis.Range("k1", -6, -4);
+  ASSERT_TRUE(redis.Index("k1", -6, &tempv));
+  ASSERT_EQ(tempv, res[0]);
+  ASSERT_TRUE(redis.Index("k1", -5, &tempv));
+  ASSERT_EQ(tempv, res[1]);
+  ASSERT_TRUE(redis.Index("k1", -4, &tempv));
+  ASSERT_EQ(tempv, res[2]);
+
+  // Last check
+  res = redis.Range("k1", 0, -6);
+  ASSERT_EQ((int)res.size(), 1);
+  ASSERT_EQ(res[0], "v6");
+}
+
+// Exhaustive test for InsertBefore(), and InsertAfter()
+TEST(RedisListsTest, InsertTest) {
+  RedisLists redis(kDefaultDbName, options, true);
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // Insert on empty list (returns 0, i.e. unchanged length, and must not crash)
+  ASSERT_EQ(redis.InsertBefore("k1", "non-exist", "a"), 0);
+  ASSERT_EQ(redis.InsertAfter("k1", "other-non-exist", "c"), 0);
+  ASSERT_EQ(redis.Length("k1"), 0);
+
+  // Push some preliminary stuff [g, f, e, d, c, b, a]
+  redis.PushLeft("k1", "a");
+  redis.PushLeft("k1", "b");
+  redis.PushLeft("k1", "c");
+  redis.PushLeft("k1", "d");
+  redis.PushLeft("k1", "e");
+  redis.PushLeft("k1", "f");
+  redis.PushLeft("k1", "g");
+  ASSERT_EQ(redis.Length("k1"), 7);
+
+  // Test InsertBefore (new element lands at the pivot's old position)
+  int newLength = redis.InsertBefore("k1", "e", "hello");
+  ASSERT_EQ(newLength, 8);
+  ASSERT_EQ(redis.Length("k1"), newLength);
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "f");
+  ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+  ASSERT_EQ(tempv, "e");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "hello");
+
+  // Test InsertAfter
+  newLength =  redis.InsertAfter("k1", "c", "bye");
+  ASSERT_EQ(newLength, 9);
+  ASSERT_EQ(redis.Length("k1"), newLength);
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "bye");
+
+  // Test bad value on InsertBefore (missing pivot: length is unchanged)
+  newLength = redis.InsertBefore("k1", "yo", "x");
+  ASSERT_EQ(newLength, 9);
+  ASSERT_EQ(redis.Length("k1"), newLength);
+
+  // Test bad value on InsertAfter
+  newLength = redis.InsertAfter("k1", "xxxx", "y");
+  ASSERT_EQ(newLength, 9);
+  ASSERT_EQ(redis.Length("k1"), newLength);
+
+  // Test InsertBefore beginning
+  newLength = redis.InsertBefore("k1", "g", "begggggggggggggggg");
+  ASSERT_EQ(newLength, 10);
+  ASSERT_EQ(redis.Length("k1"), newLength);
+
+  // Test InsertAfter end
+  newLength = redis.InsertAfter("k1", "a", "enddd");
+  ASSERT_EQ(newLength, 11);
+  ASSERT_EQ(redis.Length("k1"), newLength);
+
+  // Make sure nothing weird happened.
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "begggggggggggggggg");
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "g");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "f");
+  ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+  ASSERT_EQ(tempv, "hello");
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "e");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "d");
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "c");
+  ASSERT_TRUE(redis.Index("k1", 7, &tempv));
+  ASSERT_EQ(tempv, "bye");
+  ASSERT_TRUE(redis.Index("k1", 8, &tempv));
+  ASSERT_EQ(tempv, "b");
+  ASSERT_TRUE(redis.Index("k1", 9, &tempv));
+  ASSERT_EQ(tempv, "a");
+  ASSERT_TRUE(redis.Index("k1", 10, &tempv));
+  ASSERT_EQ(tempv, "enddd");
+}
+
+// Exhaustive test of Set function
+TEST(RedisListsTest, SetTest) {
+  RedisLists redis(kDefaultDbName, options, true);
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // Set on empty list (returns false, and must not crash)
+  ASSERT_EQ(redis.Set("k1", 7, "a"), false);
+  ASSERT_EQ(redis.Set("k1", 0, "a"), false);
+  ASSERT_EQ(redis.Set("k1", -49, "cx"), false);
+  ASSERT_EQ(redis.Length("k1"), 0);
+
+  // Push some preliminary stuff [g, f, e, d, c, b, a]
+  redis.PushLeft("k1", "a");
+  redis.PushLeft("k1", "b");
+  redis.PushLeft("k1", "c");
+  redis.PushLeft("k1", "d");
+  redis.PushLeft("k1", "e");
+  redis.PushLeft("k1", "f");
+  redis.PushLeft("k1", "g");
+  ASSERT_EQ(redis.Length("k1"), 7);
+
+  // Test Regular Set (deliberately out of order to catch position bugs)
+  ASSERT_TRUE(redis.Set("k1", 0, "0"));
+  ASSERT_TRUE(redis.Set("k1", 3, "3"));
+  ASSERT_TRUE(redis.Set("k1", 6, "6"));
+  ASSERT_TRUE(redis.Set("k1", 2, "2"));
+  ASSERT_TRUE(redis.Set("k1", 5, "5"));
+  ASSERT_TRUE(redis.Set("k1", 1, "1"));
+  ASSERT_TRUE(redis.Set("k1", 4, "4"));
+
+  ASSERT_EQ(redis.Length("k1"), 7); // Size should not change
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "0");
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "1");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "2");
+  ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+  ASSERT_EQ(tempv, "3");
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "4");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "5");
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "6");
+
+  // Set with negative indices (-1 is the last element)
+  ASSERT_TRUE(redis.Set("k1", -7, "a"));
+  ASSERT_TRUE(redis.Set("k1", -4, "d"));
+  ASSERT_TRUE(redis.Set("k1", -1, "g"));
+  ASSERT_TRUE(redis.Set("k1", -5, "c"));
+  ASSERT_TRUE(redis.Set("k1", -2, "f"));
+  ASSERT_TRUE(redis.Set("k1", -6, "b"));
+  ASSERT_TRUE(redis.Set("k1", -3, "e"));
+
+  ASSERT_EQ(redis.Length("k1"), 7); // Size should not change
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "a");
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "b");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "c");
+  ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+  ASSERT_EQ(tempv, "d");
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "e");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "f");
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "g");
+
+  // Bad indices (just out-of-bounds / off-by-one check)
+  ASSERT_EQ(redis.Set("k1", -8, "off-by-one in negative index"), false);
+  ASSERT_EQ(redis.Set("k1", 7, "off-by-one-error in positive index"), false);
+  ASSERT_EQ(redis.Set("k1", 43892, "big random index should fail"), false);
+  ASSERT_EQ(redis.Set("k1", -21391, "large negative index should fail"), false);
+
+  // One last check (to make sure nothing weird happened)
+  ASSERT_EQ(redis.Length("k1"), 7); // Size should not change
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "a");
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "b");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "c");
+  ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+  ASSERT_EQ(tempv, "d");
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "e");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "f");
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "g");
+}
+
+// Testing Insert, Push, and Set, in a mixed environment
+TEST(RedisListsTest, InsertPushSetTest) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // A series of pushes and insertions
+  // Will result in [newbegin, z, a, aftera, x, newend]
+  // Also, check the return value sometimes (should return length)
+  int lengthCheck;
+  lengthCheck = redis.PushLeft("k1", "a");
+  ASSERT_EQ(lengthCheck, 1);
+  redis.PushLeft("k1", "z");
+  redis.PushRight("k1", "x");
+  lengthCheck = redis.InsertAfter("k1", "a", "aftera");
+  ASSERT_EQ(lengthCheck , 4);
+  redis.InsertBefore("k1", "z", "newbegin");  // InsertBefore beginning of list
+  redis.InsertAfter("k1", "x", "newend");     // InsertAfter end of list
+
+  // Check
+  std::vector<std::string> res = redis.Range("k1", 0, -1); // Get the list
+  ASSERT_EQ((int)res.size(), 6);
+  ASSERT_EQ(res[0], "newbegin");
+  ASSERT_EQ(res[5], "newend");
+  ASSERT_EQ(res[3], "aftera");
+
+  // Testing duplicate values/pivots (multiple occurrences of 'a'):
+  // insertion should act on the FIRST matching pivot.
+  ASSERT_TRUE(redis.Set("k1", 0, "a"));     // [a, z, a, aftera, x, newend]
+  redis.InsertAfter("k1", "a", "happy");    // [a, happy, z, a, aftera, ...]
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "happy");
+  redis.InsertBefore("k1", "a", "sad");     // [sad, a, happy, z, a, aftera, ...]
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "sad");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "happy");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "aftera");
+  redis.InsertAfter("k1", "a", "zz");         // [sad, a, zz, happy, z, a, aftera, ...]
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "zz");
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "aftera");
+  ASSERT_TRUE(redis.Set("k1", 1, "nota"));    // [sad, nota, zz, happy, z, a, ...]
+  redis.InsertBefore("k1", "a", "ba");        // [sad, nota, zz, happy, z, ba, a, ...]
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "z");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "ba");
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "a");
+
+  // We currently have: [sad, nota, zz, happy, z, ba, a, aftera, x, newend]
+  // redis.Print("k1");   // manually check
+
+  // Test Inserting before/after non-existent values
+  lengthCheck = redis.Length("k1"); // Ensure that the length doesn't change
+  ASSERT_EQ(lengthCheck, 10);
+  ASSERT_EQ(redis.InsertBefore("k1", "non-exist", "randval"), lengthCheck);
+  ASSERT_EQ(redis.InsertAfter("k1", "nothing", "a"), lengthCheck);
+  ASSERT_EQ(redis.InsertAfter("randKey", "randVal", "ranValue"), 0); // Empty
+  ASSERT_EQ(redis.Length("k1"), lengthCheck); // The length should not change
+
+  // Simply Test the Set() function
+  redis.Set("k1", 5, "ba2");
+  redis.InsertBefore("k1", "ba2", "beforeba2");
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "z");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "beforeba2");
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "ba2");
+  ASSERT_TRUE(redis.Index("k1", 7, &tempv));
+  ASSERT_EQ(tempv, "a");
+
+  // We have: [sad, nota, zz, happy, z, beforeba2, ba2, a, aftera, x, newend]
+
+  // Set() with negative indices
+  redis.Set("k1", -1, "endprank");
+  ASSERT_TRUE(!redis.Index("k1", 11, &tempv));
+  ASSERT_TRUE(redis.Index("k1", 10, &tempv));
+  ASSERT_EQ(tempv, "endprank"); // Ensure Set worked correctly
+  redis.Set("k1", -11, "t");
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "t");
+
+  // Test out of bounds Set (returns false, list untouched)
+  ASSERT_EQ(redis.Set("k1", -12, "ssd"), false);
+  ASSERT_EQ(redis.Set("k1", 11, "sasd"), false);
+  ASSERT_EQ(redis.Set("k1", 1200, "big"), false);
+}
+
+// Testing Trim, Pop
+TEST(RedisListsTest, TrimPopTest) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // A series of pushes and insertions
+  // Will result in [newbegin, z, a, aftera, x, newend]
+  redis.PushLeft("k1", "a");
+  redis.PushLeft("k1", "z");
+  redis.PushRight("k1", "x");
+  redis.InsertBefore("k1", "z", "newbegin");    // InsertBefore start of list
+  redis.InsertAfter("k1", "x", "newend");       // InsertAfter end of list
+  redis.InsertAfter("k1", "a", "aftera");
+
+  // Simple PopLeft/Right test (pop returns the removed element via tempv)
+  ASSERT_TRUE(redis.PopLeft("k1", &tempv));
+  ASSERT_EQ(tempv, "newbegin");
+  ASSERT_EQ(redis.Length("k1"), 5);
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "z");
+  ASSERT_TRUE(redis.PopRight("k1", &tempv));
+  ASSERT_EQ(tempv, "newend");
+  ASSERT_EQ(redis.Length("k1"), 4);
+  ASSERT_TRUE(redis.Index("k1", -1, &tempv));
+  ASSERT_EQ(tempv, "x");
+
+  // Now have: [z, a, aftera, x]
+
+  // Test Trim (keeps only the inclusive range start..stop)
+  ASSERT_TRUE(redis.Trim("k1", 0, -1));       // [z, a, aftera, x] (do nothing)
+  ASSERT_EQ(redis.Length("k1"), 4);
+  ASSERT_TRUE(redis.Trim("k1", 0, 2));                     // [z, a, aftera]
+  ASSERT_EQ(redis.Length("k1"), 3);
+  ASSERT_TRUE(redis.Index("k1", -1, &tempv));
+  ASSERT_EQ(tempv, "aftera");
+  ASSERT_TRUE(redis.Trim("k1", 1, 1));                     // [a]
+  ASSERT_EQ(redis.Length("k1"), 1);
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "a");
+
+  // Test out of bounds (empty) trim: start > stop empties the list
+  ASSERT_TRUE(redis.Trim("k1", 1, 0));
+  ASSERT_EQ(redis.Length("k1"), 0);
+
+  // Popping with empty list (returns false without error)
+  ASSERT_TRUE(!redis.PopLeft("k1", &tempv));
+  ASSERT_TRUE(!redis.PopRight("k1", &tempv));
+  ASSERT_TRUE(redis.Trim("k1", 0, 5));
+
+  // Exhaustive Trim test (negative and invalid indices)
+  // Will start in [newbegin, z, a, aftera, x, newend]
+  redis.PushLeft("k1", "a");
+  redis.PushLeft("k1", "z");
+  redis.PushRight("k1", "x");
+  redis.InsertBefore("k1", "z", "newbegin");    // InsertBefore start of list
+  redis.InsertAfter("k1", "x", "newend");       // InsertAfter end of list
+  redis.InsertAfter("k1", "a", "aftera");
+  ASSERT_TRUE(redis.Trim("k1", -6, -1));                     // Should do nothing
+  ASSERT_EQ(redis.Length("k1"), 6);
+  ASSERT_TRUE(redis.Trim("k1", 1, -2));
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "z");
+  ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+  ASSERT_EQ(tempv, "x");
+  ASSERT_EQ(redis.Length("k1"), 4);
+  ASSERT_TRUE(redis.Trim("k1", -3, -2));
+  ASSERT_EQ(redis.Length("k1"), 2);
+}
+
+// Testing Remove, RemoveFirst, RemoveLast
+TEST(RedisListsTest, RemoveTest) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // A series of pushes and insertions
+  // Will result in [newbegin, z, a, aftera, x, newend, a, a]
+  redis.PushLeft("k1", "a");
+  redis.PushLeft("k1", "z");
+  redis.PushRight("k1", "x");
+  redis.InsertBefore("k1", "z", "newbegin");    // InsertBefore start of list
+  redis.InsertAfter("k1", "x", "newend");       // InsertAfter end of list
+  redis.InsertAfter("k1", "a", "aftera");
+  redis.PushRight("k1", "a");
+  redis.PushRight("k1", "a");
+
+  // Verify
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "newbegin");
+  ASSERT_TRUE(redis.Index("k1", -1, &tempv));
+  ASSERT_EQ(tempv, "a");
+
+  // Remove the first two 'a' (positive count removes from the front)
+  // Results in [newbegin, z, aftera, x, newend, a]
+  int numRemoved = redis.Remove("k1", 2, "a");
+  ASSERT_EQ(numRemoved, 2);
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "newbegin");
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "z");
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "newend");
+  ASSERT_TRUE(redis.Index("k1", 5, &tempv));
+  ASSERT_EQ(tempv, "a");
+  ASSERT_EQ(redis.Length("k1"), 6);
+
+  // Repopulate some stuff
+  // Results in: [x, x, x, x, x, newbegin, z, x, aftera, x, newend, a, x]
+  redis.PushLeft("k1", "x");
+  redis.PushLeft("k1", "x");
+  redis.PushLeft("k1", "x");
+  redis.PushLeft("k1", "x");
+  redis.PushLeft("k1", "x");
+  redis.PushRight("k1", "x");
+  redis.InsertAfter("k1", "z", "x");
+
+  // Test removal from end (negative count removes from the back)
+  numRemoved = redis.Remove("k1", -2, "x");
+  ASSERT_EQ(numRemoved, 2);
+  ASSERT_TRUE(redis.Index("k1", 8, &tempv));
+  ASSERT_EQ(tempv, "aftera");
+  ASSERT_TRUE(redis.Index("k1", 9, &tempv));
+  ASSERT_EQ(tempv, "newend");
+  ASSERT_TRUE(redis.Index("k1", 10, &tempv));
+  ASSERT_EQ(tempv, "a");
+  ASSERT_TRUE(!redis.Index("k1", 11, &tempv));
+  numRemoved = redis.Remove("k1", -2, "x");
+  ASSERT_EQ(numRemoved, 2);
+  ASSERT_TRUE(redis.Index("k1", 4, &tempv));
+  ASSERT_EQ(tempv, "newbegin");
+  ASSERT_TRUE(redis.Index("k1", 6, &tempv));
+  ASSERT_EQ(tempv, "aftera");
+
+  // We now have: [x, x, x, x, newbegin, z, aftera, newend, a]
+  ASSERT_EQ(redis.Length("k1"), 9);
+  ASSERT_TRUE(redis.Index("k1", -1, &tempv));
+  ASSERT_EQ(tempv, "a");
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "x");
+
+  // Test over-shooting (removing more than there exists)
+  numRemoved = redis.Remove("k1", -9000, "x");
+  ASSERT_EQ(numRemoved , 4);    // Only really removed 4
+  ASSERT_EQ(redis.Length("k1"), 5);
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "newbegin");
+  numRemoved = redis.Remove("k1", 1, "x");
+  ASSERT_EQ(numRemoved, 0);
+
+  // Try removing ALL!
+  numRemoved = redis.Remove("k1", 0, "newbegin");   // REMOVE 0 will remove all!
+  ASSERT_EQ(numRemoved, 1);
+
+  // Removal from an empty-list (Trim 1..0 empties the list first)
+  ASSERT_TRUE(redis.Trim("k1", 1, 0));
+  numRemoved = redis.Remove("k1", 1, "z");
+  ASSERT_EQ(numRemoved, 0);
+}
+
+
+// Test Multiple keys and Persistence
+TEST(RedisListsTest, PersistenceMultiKeyTest) {
+
+  string tempv; // Used below for all Index(), PopRight(), PopLeft()
+
+  // Each block scopes its own RedisLists, so the DB is closed at the end of
+  // a block and reopened by the next one -- this is what tests persistence.
+
+  // Block one: populate a single key in the database
+  {
+    RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+    // A series of pushes and insertions
+    // Will result in [newbegin, z, a, aftera, x, newend, a, a]
+    redis.PushLeft("k1", "a");
+    redis.PushLeft("k1", "z");
+    redis.PushRight("k1", "x");
+    redis.InsertBefore("k1", "z", "newbegin");    // InsertBefore start of list
+    redis.InsertAfter("k1", "x", "newend");       // InsertAfter end of list
+    redis.InsertAfter("k1", "a", "aftera");
+    redis.PushRight("k1", "a");
+    redis.PushRight("k1", "a");
+
+    ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+    ASSERT_EQ(tempv, "aftera");
+  }
+
+  // Block two: make sure changes were saved and add some other key
+  {
+    RedisLists redis(kDefaultDbName, options, false); // Persistent, non-destructive
+
+    // Check
+    ASSERT_EQ(redis.Length("k1"), 8);
+    ASSERT_TRUE(redis.Index("k1", 3, &tempv));
+    ASSERT_EQ(tempv, "aftera");
+
+    redis.PushRight("k2", "randomkey");
+    redis.PushLeft("k2", "sas");
+
+    redis.PopLeft("k1", &tempv);
+  }
+
+  // Block three: Verify the changes from block 2
+  {
+    RedisLists redis(kDefaultDbName, options, false); // Persistent, non-destructive
+
+    // Check
+    ASSERT_EQ(redis.Length("k1"), 7);
+    ASSERT_EQ(redis.Length("k2"), 2);
+    ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+    ASSERT_EQ(tempv, "z");
+    ASSERT_TRUE(redis.Index("k2", -2, &tempv));
+    ASSERT_EQ(tempv, "sas");
+  }
+}
+
+/// THE manual REDIS TEST begins here
+/// THIS WILL ONLY OCCUR IF YOU RUN: ./redis_test -m
+
+namespace {
+/// Uppercases *s in place (ASCII, via toupper from <cctype>).
+/// Note: the argument to toupper must be representable as unsigned char;
+/// passing a plain (possibly negative) char is undefined behavior, so we
+/// cast through unsigned char first. The loop index is size_t to match
+/// std::string::length() and avoid a signed/unsigned conversion.
+void MakeUpper(std::string* const s) {
+  for (size_t i = 0; i < s->length(); ++i) {
+    (*s)[i] = static_cast<char>(toupper(static_cast<unsigned char>((*s)[i])));
+  }
+}
+
+/// Allows the user to enter in REDIS commands into the command-line.
+/// This is useful for manual / interactive testing / debugging.
+///  Use destructive=true to clean the database before use.
+///  Use destructive=false to remember the previous state (i.e.: persistent)
+/// Should be called from main function.
+/// Interactive REPL over RedisLists: reads one redis-style command per loop
+/// (LPUSH, RPUSH, LPOP, RPOP, LREM, LLEN, LRANGE, LTRIM, LSET, LINDEX,
+/// LINSERT, PRINT, QUIT) from stdin and echoes the result to stdout.
+/// Returns 0 only via QUIT. Input tokens are whitespace-separated; command
+/// names are case-insensitive (uppercased via MakeUpper before dispatch).
+int manual_redis_test(bool destructive){
+  RedisLists redis(RedisListsTest::kDefaultDbName,
+                   RedisListsTest::options,
+                   destructive);
+
+  // TODO: Right now, please use spaces to separate each word.
+  //  In actual redis, you can use quotes to specify compound values
+  //  Example: RPUSH mylist "this is a compound value"
+
+  std::string command;
+  while(true) {
+    cin >> command;
+    MakeUpper(&command);
+
+    if (command == "LINSERT") {
+      // LINSERT <key> BEFORE|AFTER <pivot> <value>
+      std::string k, t, p, v;
+      cin >> k >> t >> p >> v;
+      MakeUpper(&t);
+      if (t=="BEFORE") {
+        std::cout << redis.InsertBefore(k, p, v) << std::endl;
+      } else if (t=="AFTER") {
+        std::cout << redis.InsertAfter(k, p, v) << std::endl;
+      }
+    } else if (command == "LPUSH") {
+      std::string k, v;
+      std::cin >> k >> v;
+      redis.PushLeft(k, v);
+    } else if (command == "RPUSH") {
+      std::string k, v;
+      std::cin >> k >> v;
+      redis.PushRight(k, v);
+    } else if (command == "LPOP") {
+      std::string k;
+      std::cin >> k;
+      string res;
+      redis.PopLeft(k, &res);
+      std::cout << res << std::endl;
+    } else if (command == "RPOP") {
+      std::string k;
+      std::cin >> k;
+      string res;
+      redis.PopRight(k, &res);
+      std::cout << res << std::endl;
+    } else if (command == "LREM") {
+      // LREM <key> <count> <value>
+      std::string k;
+      int amt;
+      std::string v;
+
+      std::cin >> k >> amt >> v;
+      std::cout << redis.Remove(k, amt, v) << std::endl;
+    } else if (command == "LLEN") {
+      std::string k;
+      std::cin >> k;
+      std::cout << redis.Length(k) << std::endl;
+    } else if (command == "LRANGE") {
+      // LRANGE <key> <start> <stop>
+      std::string k;
+      int i, j;
+      std::cin >> k >> i >> j;
+      std::vector<std::string> res = redis.Range(k, i, j);
+      for (auto it = res.begin(); it != res.end(); ++it) {
+        std::cout << " " << (*it);
+      }
+      std::cout << std::endl;
+    } else if (command == "LTRIM") {
+      std::string k;
+      int i, j;
+      std::cin >> k >> i >> j;
+      redis.Trim(k, i, j);
+    } else if (command == "LSET") {
+      std::string k;
+      int idx;
+      std::string v;
+      cin >> k >> idx >> v;
+      redis.Set(k, idx, v);
+    } else if (command == "LINDEX") {
+      std::string k;
+      int idx;
+      std::cin >> k >> idx;
+      string res;
+      redis.Index(k, idx, &res);
+      std::cout << res << std::endl;
+    } else if (command == "PRINT") {      // Added by Deon
+      std::string k;
+      cin >> k;
+      redis.Print(k);
+    } else if (command == "QUIT") {
+      return 0;
+    } else {
+      std::cout << "unknown command: " << command << std::endl;
+    }
+  }
+}
+}  // namespace
+
+} // namespace rocksdb
+
+
+// USAGE: "./redis_test" for default (unit tests)
+//        "./redis_test -m" for manual testing (redis command api)
+//        "./redis_test -m -d" for destructive manual test (erase db before use)
+
+
+namespace {
+// Check for "want" argument in the argument list.
+// Returns true iff `want` appears verbatim among argv[1..argc-1];
+// argv[0] (the program name) is intentionally skipped.
+// NOTE(review): uses strcmp but no <cstring> include is visible in this
+// file -- presumably pulled in transitively; confirm.
+bool found_arg(int argc, char* argv[], const char* want){
+  for(int i=1; i<argc; ++i){
+    if (strcmp(argv[i], want) == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+}  // namespace
+
+// Will run unit tests.
+// However, if -m is specified, it will do user manual/interactive testing
+// -m -d is manual and destructive (will clear the database before use)
+int main(int argc, char* argv[]) {
+  if (found_arg(argc, argv, "-m")) {
+    bool destructive = found_arg(argc, argv, "-d");  // -d only honored with -m
+    return rocksdb::manual_redis_test(destructive);
+  } else {
+    return rocksdb::test::RunAllTests();
+  }
+}
+
diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc
new file mode 100644 (file)
index 0000000..f7a697f
--- /dev/null
@@ -0,0 +1,284 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#ifndef ROCKSDB_LITE
+
+#include "utilities/ttl/db_ttl_impl.h"
+
+#include "utilities/db_ttl.h"
+#include "db/filename.h"
+#include "db/write_batch_internal.h"
+#include "util/coding.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+
+namespace rocksdb {
+
+void DBWithTTLImpl::SanitizeOptions(int32_t ttl, ColumnFamilyOptions* options,
+                                    Env* env) {
+  if (options->compaction_filter) {
+    options->compaction_filter =
+        new TtlCompactionFilter(ttl, env, options->compaction_filter);
+  } else {
+    options->compaction_filter_factory =
+        std::shared_ptr<CompactionFilterFactory>(new TtlCompactionFilterFactory(
+            ttl, env, options->compaction_filter_factory));
+  }
+
+  if (options->merge_operator) {
+    options->merge_operator.reset(
+        new TtlMergeOperator(options->merge_operator, env));
+  }
+}
+
+// Open the db inside DBWithTTLImpl because options needs pointer to its ttl
+DBWithTTLImpl::DBWithTTLImpl(DB* db) : DBWithTTL(db) {}
+
+DBWithTTLImpl::~DBWithTTLImpl() { delete GetOptions().compaction_filter; }
+
+Status UtilityDB::OpenTtlDB(const Options& options, const std::string& dbname,
+                            StackableDB** dbptr, int32_t ttl, bool read_only) {
+  DBWithTTL* db;
+  Status s = DBWithTTL::Open(options, dbname, &db, ttl, read_only);
+  if (s.ok()) {
+    *dbptr = db;
+  } else {
+    *dbptr = nullptr;
+  }
+  return s;
+}
+
+Status DBWithTTL::Open(const Options& options, const std::string& dbname,
+                       DBWithTTL** dbptr, int32_t ttl, bool read_only) {
+
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = DBWithTTL::Open(db_options, dbname, column_families, &handles,
+                             dbptr, {ttl}, read_only);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // i can delete the handle since DBImpl is always holding a reference to
+    // default column family
+    delete handles[0];
+  }
+  return s;
+}
+
+Status DBWithTTL::Open(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, DBWithTTL** dbptr,
+    std::vector<int32_t> ttls, bool read_only) {
+
+  if (ttls.size() != column_families.size()) {
+    return Status::InvalidArgument(
+        "ttls size has to be the same as number of column families");
+  }
+
+  std::vector<ColumnFamilyDescriptor> column_families_sanitized =
+      column_families;
+  for (size_t i = 0; i < column_families_sanitized.size(); ++i) {
+    DBWithTTLImpl::SanitizeOptions(
+        ttls[i], &column_families_sanitized[i].options,
+        db_options.env == nullptr ? Env::Default() : db_options.env);
+  }
+  DB* db;
+
+  Status st;
+  if (read_only) {
+    st = DB::OpenForReadOnly(db_options, dbname, column_families_sanitized,
+                             handles, &db);
+  } else {
+    st = DB::Open(db_options, dbname, column_families_sanitized, handles, &db);
+  }
+  if (st.ok()) {
+    *dbptr = new DBWithTTLImpl(db);
+  } else {
+    *dbptr = nullptr;
+  }
+  return st;
+}
+
+Status DBWithTTLImpl::CreateColumnFamilyWithTtl(
+    const ColumnFamilyOptions& options, const std::string& column_family_name,
+    ColumnFamilyHandle** handle, int ttl) {
+  ColumnFamilyOptions sanitized_options = options;
+  DBWithTTLImpl::SanitizeOptions(ttl, &sanitized_options, GetEnv());
+
+  return DBWithTTL::CreateColumnFamily(sanitized_options, column_family_name,
+                                       handle);
+}
+
+Status DBWithTTLImpl::CreateColumnFamily(const ColumnFamilyOptions& options,
+                                         const std::string& column_family_name,
+                                         ColumnFamilyHandle** handle) {
+  return CreateColumnFamilyWithTtl(options, column_family_name, handle, 0);
+}
+
+// Appends the current timestamp to the string.
+// Returns false if could not get the current_time, true if append succeeds
+Status DBWithTTLImpl::AppendTS(const Slice& val, std::string* val_with_ts,
+                               Env* env) {
+  val_with_ts->reserve(kTSLength + val.size());
+  char ts_string[kTSLength];
+  int64_t curtime;
+  Status st = env->GetCurrentTime(&curtime);
+  if (!st.ok()) {
+    return st;
+  }
+  EncodeFixed32(ts_string, (int32_t)curtime);
+  val_with_ts->append(val.data(), val.size());
+  val_with_ts->append(ts_string, kTSLength);
+  return st;
+}
+
+// Returns corruption if the length of the string is lesser than timestamp, or
+// timestamp refers to a time lesser than ttl-feature release time
+Status DBWithTTLImpl::SanityCheckTimestamp(const Slice& str) {
+  if (str.size() < kTSLength) {
+    return Status::Corruption("Error: value's length less than timestamp's\n");
+  }
+  // Checks that TS is not lesser than kMinTimestamp
+  // Gaurds against corruption & normal database opened incorrectly in ttl mode
+  int32_t timestamp_value = DecodeFixed32(str.data() + str.size() - kTSLength);
+  if (timestamp_value < kMinTimestamp) {
+    return Status::Corruption("Error: Timestamp < ttl feature release time!\n");
+  }
+  return Status::OK();
+}
+
+// Checks if the string is stale or not according to TTl provided
+bool DBWithTTLImpl::IsStale(const Slice& value, int32_t ttl, Env* env) {
+  if (ttl <= 0) {  // Data is fresh if TTL is non-positive
+    return false;
+  }
+  int64_t curtime;
+  if (!env->GetCurrentTime(&curtime).ok()) {
+    return false;  // Treat the data as fresh if could not get current time
+  }
+  int32_t timestamp_value =
+      DecodeFixed32(value.data() + value.size() - kTSLength);
+  return (timestamp_value + ttl) < curtime;
+}
+
+// Strips the TS from the end of the string
+Status DBWithTTLImpl::StripTS(std::string* str) {
+  Status st;
+  if (str->length() < kTSLength) {
+    return Status::Corruption("Bad timestamp in key-value");
+  }
+  // Erasing characters which hold the TS
+  str->erase(str->length() - kTSLength, kTSLength);
+  return st;
+}
+
+Status DBWithTTLImpl::Put(const WriteOptions& options,
+                          ColumnFamilyHandle* column_family, const Slice& key,
+                          const Slice& val) {
+  WriteBatch batch;
+  batch.Put(column_family, key, val);
+  return Write(options, &batch);
+}
+
+Status DBWithTTLImpl::Get(const ReadOptions& options,
+                          ColumnFamilyHandle* column_family, const Slice& key,
+                          std::string* value) {
+  Status st = db_->Get(options, column_family, key, value);
+  if (!st.ok()) {
+    return st;
+  }
+  st = SanityCheckTimestamp(*value);
+  if (!st.ok()) {
+    return st;
+  }
+  return StripTS(value);
+}
+
+std::vector<Status> DBWithTTLImpl::MultiGet(
+    const ReadOptions& options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  return std::vector<Status>(
+      keys.size(), Status::NotSupported("MultiGet not supported with TTL"));
+}
+
+bool DBWithTTLImpl::KeyMayExist(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family,
+                                const Slice& key, std::string* value,
+                                bool* value_found) {
+  bool ret = db_->KeyMayExist(options, column_family, key, value, value_found);
+  if (ret && value != nullptr && value_found != nullptr && *value_found) {
+    if (!SanityCheckTimestamp(*value).ok() || !StripTS(value).ok()) {
+      return false;
+    }
+  }
+  return ret;
+}
+
+Status DBWithTTLImpl::Merge(const WriteOptions& options,
+                            ColumnFamilyHandle* column_family, const Slice& key,
+                            const Slice& value) {
+  WriteBatch batch;
+  batch.Merge(column_family, key, value);
+  return Write(options, &batch);
+}
+
+Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
+  class Handler : public WriteBatch::Handler {
+   public:
+    explicit Handler(Env* env) : env_(env) {}
+    WriteBatch updates_ttl;
+    Status batch_rewrite_status;
+    virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+                         const Slice& value) {
+      std::string value_with_ts;
+      Status st = AppendTS(value, &value_with_ts, env_);
+      if (!st.ok()) {
+        batch_rewrite_status = st;
+      } else {
+        WriteBatchInternal::Put(&updates_ttl, column_family_id, key,
+                                value_with_ts);
+      }
+      return Status::OK();
+    }
+    virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
+                           const Slice& value) {
+      std::string value_with_ts;
+      Status st = AppendTS(value, &value_with_ts, env_);
+      if (!st.ok()) {
+        batch_rewrite_status = st;
+      } else {
+        WriteBatchInternal::Merge(&updates_ttl, column_family_id, key,
+                                  value_with_ts);
+      }
+      return Status::OK();
+    }
+    virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+      WriteBatchInternal::Delete(&updates_ttl, column_family_id, key);
+      return Status::OK();
+    }
+    virtual void LogData(const Slice& blob) { updates_ttl.PutLogData(blob); }
+
+   private:
+    Env* env_;
+  };
+  Handler handler(GetEnv());
+  updates->Iterate(&handler);
+  if (!handler.batch_rewrite_status.ok()) {
+    return handler.batch_rewrite_status;
+  } else {
+    return db_->Write(opts, &(handler.updates_ttl));
+  }
+}
+
+Iterator* DBWithTTLImpl::NewIterator(const ReadOptions& opts,
+                                     ColumnFamilyHandle* column_family) {
+  return new TtlIterator(db_->NewIterator(opts, column_family));
+}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h
new file mode 100644 (file)
index 0000000..a5c8fc8
--- /dev/null
@@ -0,0 +1,314 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/merge_operator.h"
+#include "utilities/utility_db.h"
+#include "utilities/db_ttl.h"
+#include "db/db_impl.h"
+
+namespace rocksdb {
+
+class DBWithTTLImpl : public DBWithTTL {
+ public:
+  static void SanitizeOptions(int32_t ttl, ColumnFamilyOptions* options,
+                              Env* env);
+
+  explicit DBWithTTLImpl(DB* db);
+
+  virtual ~DBWithTTLImpl();
+
+  Status CreateColumnFamilyWithTtl(const ColumnFamilyOptions& options,
+                                   const std::string& column_family_name,
+                                   ColumnFamilyHandle** handle,
+                                   int ttl) override;
+
+  Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                            const std::string& column_family_name,
+                            ColumnFamilyHandle** handle) override;
+
+  using StackableDB::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& val) override;
+
+  using StackableDB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     std::string* value) override;
+
+  using StackableDB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  using StackableDB::KeyMayExist;
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value,
+                           bool* value_found = nullptr) override;
+
+  using StackableDB::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) override;
+
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+
+  using StackableDB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& opts,
+                                ColumnFamilyHandle* column_family) override;
+
+  virtual DB* GetBaseDB() { return db_; }
+
+  static bool IsStale(const Slice& value, int32_t ttl, Env* env);
+
+  static Status AppendTS(const Slice& val, std::string* val_with_ts, Env* env);
+
+  static Status SanityCheckTimestamp(const Slice& str);
+
+  static Status StripTS(std::string* str);
+
+  static const uint32_t kTSLength = sizeof(int32_t);  // size of timestamp
+
+  static const int32_t kMinTimestamp = 1368146402;  // 05/09/2013:5:40PM GMT-8
+
+  static const int32_t kMaxTimestamp = 2147483647;  // 01/18/2038:7:14PM GMT-8
+};
+
+class TtlIterator : public Iterator {
+
+ public:
+  explicit TtlIterator(Iterator* iter) : iter_(iter) { assert(iter_); }
+
+  ~TtlIterator() { delete iter_; }
+
+  bool Valid() const { return iter_->Valid(); }
+
+  void SeekToFirst() { iter_->SeekToFirst(); }
+
+  void SeekToLast() { iter_->SeekToLast(); }
+
+  void Seek(const Slice& target) { iter_->Seek(target); }
+
+  void Next() { iter_->Next(); }
+
+  void Prev() { iter_->Prev(); }
+
+  Slice key() const { return iter_->key(); }
+
+  int32_t timestamp() const {
+    return DecodeFixed32(iter_->value().data() + iter_->value().size() -
+                         DBWithTTLImpl::kTSLength);
+  }
+
+  Slice value() const {
+    // TODO: handle timestamp corruption like in general iterator semantics
+    assert(DBWithTTLImpl::SanityCheckTimestamp(iter_->value()).ok());
+    Slice trimmed_value = iter_->value();
+    trimmed_value.size_ -= DBWithTTLImpl::kTSLength;
+    return trimmed_value;
+  }
+
+  Status status() const { return iter_->status(); }
+
+ private:
+  Iterator* iter_;
+};
+
+class TtlCompactionFilter : public CompactionFilter {
+ public:
+  TtlCompactionFilter(
+      int32_t ttl, Env* env, const CompactionFilter* user_comp_filter,
+      std::unique_ptr<const CompactionFilter> user_comp_filter_from_factory =
+          nullptr)
+      : ttl_(ttl),
+        env_(env),
+        user_comp_filter_(user_comp_filter),
+        user_comp_filter_from_factory_(
+            std::move(user_comp_filter_from_factory)) {
+    // Unlike the merge operator, compaction filter is necessary for TTL, hence
+    // this would be called even if user doesn't specify any compaction-filter
+    if (!user_comp_filter_) {
+      user_comp_filter_ = user_comp_filter_from_factory_.get();
+    }
+  }
+
+  virtual bool Filter(int level, const Slice& key, const Slice& old_val,
+                      std::string* new_val, bool* value_changed) const
+      override {
+    if (DBWithTTLImpl::IsStale(old_val, ttl_, env_)) {
+      return true;
+    }
+    if (user_comp_filter_ == nullptr) {
+      return false;
+    }
+    assert(old_val.size() >= DBWithTTLImpl::kTSLength);
+    Slice old_val_without_ts(old_val.data(),
+                             old_val.size() - DBWithTTLImpl::kTSLength);
+    if (user_comp_filter_->Filter(level, key, old_val_without_ts, new_val,
+                                  value_changed)) {
+      return true;
+    }
+    if (*value_changed) {
+      new_val->append(
+          old_val.data() + old_val.size() - DBWithTTLImpl::kTSLength,
+          DBWithTTLImpl::kTSLength);
+    }
+    return false;
+  }
+
+  virtual const char* Name() const override { return "Delete By TTL"; }
+
+ private:
+  int32_t ttl_;
+  Env* env_;
+  const CompactionFilter* user_comp_filter_;
+  std::unique_ptr<const CompactionFilter> user_comp_filter_from_factory_;
+};
+
+class TtlCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+  TtlCompactionFilterFactory(
+      int32_t ttl, Env* env,
+      std::shared_ptr<CompactionFilterFactory> comp_filter_factory)
+      : ttl_(ttl), env_(env), user_comp_filter_factory_(comp_filter_factory) {}
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) {
+    return std::unique_ptr<TtlCompactionFilter>(new TtlCompactionFilter(
+        ttl_, env_, nullptr,
+        std::move(user_comp_filter_factory_->CreateCompactionFilter(context))));
+  }
+
+  virtual const char* Name() const override {
+    return "TtlCompactionFilterFactory";
+  }
+
+ private:
+  int32_t ttl_;
+  Env* env_;
+  std::shared_ptr<CompactionFilterFactory> user_comp_filter_factory_;
+};
+
+class TtlMergeOperator : public MergeOperator {
+
+ public:
+  explicit TtlMergeOperator(const std::shared_ptr<MergeOperator> merge_op,
+                            Env* env)
+      : user_merge_op_(merge_op), env_(env) {
+    assert(merge_op);
+    assert(env);
+  }
+
+  virtual bool FullMerge(const Slice& key, const Slice* existing_value,
+                         const std::deque<std::string>& operands,
+                         std::string* new_value, Logger* logger) const
+      override {
+    const uint32_t ts_len = DBWithTTLImpl::kTSLength;
+    if (existing_value && existing_value->size() < ts_len) {
+      Log(logger, "Error: Could not remove timestamp from existing value.");
+      return false;
+    }
+
+    // Extract time-stamp from each operand to be passed to user_merge_op_
+    std::deque<std::string> operands_without_ts;
+    for (const auto& operand : operands) {
+      if (operand.size() < ts_len) {
+        Log(logger, "Error: Could not remove timestamp from operand value.");
+        return false;
+      }
+      operands_without_ts.push_back(operand.substr(0, operand.size() - ts_len));
+    }
+
+    // Apply the user merge operator (store result in *new_value)
+    bool good = true;
+    if (existing_value) {
+      Slice existing_value_without_ts(existing_value->data(),
+                                      existing_value->size() - ts_len);
+      good = user_merge_op_->FullMerge(key, &existing_value_without_ts,
+                                       operands_without_ts, new_value, logger);
+    } else {
+      good = user_merge_op_->FullMerge(key, nullptr, operands_without_ts,
+                                       new_value, logger);
+    }
+
+    // Return false if the user merge operator returned false
+    if (!good) {
+      return false;
+    }
+
+    // Augment the *new_value with the ttl time-stamp
+    int64_t curtime;
+    if (!env_->GetCurrentTime(&curtime).ok()) {
+      Log(logger,
+          "Error: Could not get current time to be attached internally "
+          "to the new value.");
+      return false;
+    } else {
+      char ts_string[ts_len];
+      EncodeFixed32(ts_string, (int32_t)curtime);
+      new_value->append(ts_string, ts_len);
+      return true;
+    }
+  }
+
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value, Logger* logger) const
+      override {
+    const uint32_t ts_len = DBWithTTLImpl::kTSLength;
+    std::deque<Slice> operands_without_ts;
+
+    for (const auto& operand : operand_list) {
+      if (operand.size() < ts_len) {
+        Log(logger, "Error: Could not remove timestamp from value.");
+        return false;
+      }
+
+      operands_without_ts.push_back(
+          Slice(operand.data(), operand.size() - ts_len));
+    }
+
+    // Apply the user partial-merge operator (store result in *new_value)
+    assert(new_value);
+    if (!user_merge_op_->PartialMergeMulti(key, operands_without_ts, new_value,
+                                           logger)) {
+      return false;
+    }
+
+    // Augment the *new_value with the ttl time-stamp
+    int64_t curtime;
+    if (!env_->GetCurrentTime(&curtime).ok()) {
+      Log(logger,
+          "Error: Could not get current time to be attached internally "
+          "to the new value.");
+      return false;
+    } else {
+      char ts_string[ts_len];
+      EncodeFixed32(ts_string, (int32_t)curtime);
+      new_value->append(ts_string, ts_len);
+      return true;
+    }
+  }
+
+  virtual const char* Name() const override { return "Merge By TTL"; }
+
+ private:
+  std::shared_ptr<MergeOperator> user_merge_op_;
+  Env* env_;
+};
+}
+#endif  // ROCKSDB_LITE
diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc
new file mode 100644 (file)
index 0000000..4791a2a
--- /dev/null
@@ -0,0 +1,595 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <memory>
+#include "rocksdb/compaction_filter.h"
+#include "utilities/db_ttl.h"
+#include "util/testharness.h"
+#include "util/logging.h"
+#include <map>
+#include <unistd.h>
+
+namespace rocksdb {
+
namespace {

// Ordered key -> value map that drives every test below; map order doubles
// as the expected iteration order of the database.
typedef std::map<std::string, std::string> KVMap;

// Operation tags consumed by TtlTest::MakePutWriteBatch.
enum BatchOperation {
  PUT = 0,
  DELETE = 1
};

}  // namespace
+
+class SpecialTimeEnv : public EnvWrapper {
+ public:
+  explicit SpecialTimeEnv(Env* base) : EnvWrapper(base) {
+    base->GetCurrentTime(&current_time_);
+  }
+
+  void Sleep(int64_t sleep_time) { current_time_ += sleep_time; }
+  virtual Status GetCurrentTime(int64_t* current_time) {
+    *current_time = current_time_;
+    return Status::OK();
+  }
+
+ private:
+  int64_t current_time_;
+};
+
+class TtlTest {
+ public:
+  TtlTest() {
+    env_.reset(new SpecialTimeEnv(Env::Default()));
+    dbname_ = test::TmpDir() + "/db_ttl";
+    options_.create_if_missing = true;
+    options_.env = env_.get();
+    // ensure that compaction is kicked in to always strip timestamp from kvs
+    options_.max_grandparent_overlap_factor = 0;
+    // compaction should take place always from level0 for determinism
+    options_.max_mem_compaction_level = 0;
+    db_ttl_ = nullptr;
+    DestroyDB(dbname_, Options());
+  }
+
+  ~TtlTest() {
+    CloseTtl();
+    DestroyDB(dbname_, Options());
+  }
+
+  // Open database with TTL support when TTL not provided with db_ttl_ pointer
+  void OpenTtl() {
+    ASSERT_TRUE(db_ttl_ ==
+                nullptr);  //  db should be closed before opening again
+    ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_));
+  }
+
+  // Open database with TTL support when TTL provided with db_ttl_ pointer
+  void OpenTtl(int32_t ttl) {
+    ASSERT_TRUE(db_ttl_ == nullptr);
+    ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_, ttl));
+  }
+
+  // Open with TestFilter compaction filter
+  void OpenTtlWithTestCompaction(int32_t ttl) {
+    options_.compaction_filter_factory =
+      std::shared_ptr<CompactionFilterFactory>(
+          new TestFilterFactory(kSampleSize_, kNewValue_));
+    OpenTtl(ttl);
+  }
+
+  // Open database with TTL support in read_only mode
+  void OpenReadOnlyTtl(int32_t ttl) {
+    ASSERT_TRUE(db_ttl_ == nullptr);
+    ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_, ttl, true));
+  }
+
+  void CloseTtl() {
+    delete db_ttl_;
+    db_ttl_ = nullptr;
+  }
+
+  // Populates and returns a kv-map
+  void MakeKVMap(int64_t num_entries) {
+    kvmap_.clear();
+    int digits = 1;
+    for (int dummy = num_entries; dummy /= 10 ; ++digits);
+    int digits_in_i = 1;
+    for (int64_t i = 0; i < num_entries; i++) {
+      std::string key = "key";
+      std::string value = "value";
+      if (i % 10 == 0) {
+        digits_in_i++;
+      }
+      for(int j = digits_in_i; j < digits; j++) {
+        key.append("0");
+        value.append("0");
+      }
+      AppendNumberTo(&key, i);
+      AppendNumberTo(&value, i);
+      kvmap_[key] = value;
+    }
+    ASSERT_EQ((int)kvmap_.size(), num_entries);//check all insertions done
+  }
+
+  // Makes a write-batch with key-vals from kvmap_ and 'Write''s it
+  void MakePutWriteBatch(const BatchOperation* batch_ops, int num_ops) {
+    ASSERT_LE(num_ops, (int)kvmap_.size());
+    static WriteOptions wopts;
+    static FlushOptions flush_opts;
+    WriteBatch batch;
+    kv_it_ = kvmap_.begin();
+    for (int i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, kv_it_++) {
+      switch (batch_ops[i]) {
+        case PUT:
+          batch.Put(kv_it_->first, kv_it_->second);
+          break;
+        case DELETE:
+          batch.Delete(kv_it_->first);
+          break;
+        default:
+          ASSERT_TRUE(false);
+      }
+    }
+    db_ttl_->Write(wopts, &batch);
+    db_ttl_->Flush(flush_opts);
+  }
+
+  // Puts num_entries starting from start_pos_map from kvmap_ into the database
+  void PutValues(int start_pos_map, int num_entries, bool flush = true,
+                 ColumnFamilyHandle* cf = nullptr) {
+    ASSERT_TRUE(db_ttl_);
+    ASSERT_LE(start_pos_map + num_entries, (int)kvmap_.size());
+    static WriteOptions wopts;
+    static FlushOptions flush_opts;
+    kv_it_ = kvmap_.begin();
+    advance(kv_it_, start_pos_map);
+    for (int i = 0; kv_it_ != kvmap_.end() && i < num_entries; i++, kv_it_++) {
+      ASSERT_OK(cf == nullptr
+                    ? db_ttl_->Put(wopts, kv_it_->first, kv_it_->second)
+                    : db_ttl_->Put(wopts, cf, kv_it_->first, kv_it_->second));
+    }
+    // Put a mock kv at the end because CompactionFilter doesn't delete last key
+    ASSERT_OK(cf == nullptr ? db_ttl_->Put(wopts, "keymock", "valuemock")
+                            : db_ttl_->Put(wopts, cf, "keymock", "valuemock"));
+    if (flush) {
+      if (cf == nullptr) {
+        db_ttl_->Flush(flush_opts);
+      } else {
+        db_ttl_->Flush(flush_opts, cf);
+      }
+    }
+  }
+
+  // Runs a manual compaction
+  void ManualCompact(ColumnFamilyHandle* cf = nullptr) {
+    if (cf == nullptr) {
+      db_ttl_->CompactRange(nullptr, nullptr);
+    } else {
+      db_ttl_->CompactRange(cf, nullptr, nullptr);
+    }
+  }
+
+  // checks the whole kvmap_ to return correct values using KeyMayExist
+  void SimpleKeyMayExistCheck() {
+    static ReadOptions ropts;
+    bool value_found;
+    std::string val;
+    for(auto &kv : kvmap_) {
+      bool ret = db_ttl_->KeyMayExist(ropts, kv.first, &val, &value_found);
+      if (ret == false || value_found == false) {
+        fprintf(stderr, "KeyMayExist could not find key=%s in the database but"
+                        " should have\n", kv.first.c_str());
+        ASSERT_TRUE(false);
+      } else if (val.compare(kv.second) != 0) {
+        fprintf(stderr, " value for key=%s present in database is %s but"
+                        " should be %s\n", kv.first.c_str(), val.c_str(),
+                        kv.second.c_str());
+        ASSERT_TRUE(false);
+      }
+    }
+  }
+
+  // Sleeps for slp_tim then runs a manual compaction
+  // Checks span starting from st_pos from kvmap_ in the db and
+  // Gets should return true if check is true and false otherwise
+  // Also checks that value that we got is the same as inserted; and =kNewValue
+  //   if test_compaction_change is true
+  void SleepCompactCheck(int slp_tim, int st_pos, int span, bool check = true,
+                         bool test_compaction_change = false,
+                         ColumnFamilyHandle* cf = nullptr) {
+    ASSERT_TRUE(db_ttl_);
+
+    env_->Sleep(slp_tim);
+    ManualCompact(cf);
+    static ReadOptions ropts;
+    kv_it_ = kvmap_.begin();
+    advance(kv_it_, st_pos);
+    std::string v;
+    for (int i = 0; kv_it_ != kvmap_.end() && i < span; i++, kv_it_++) {
+      Status s = (cf == nullptr) ? db_ttl_->Get(ropts, kv_it_->first, &v)
+                                 : db_ttl_->Get(ropts, cf, kv_it_->first, &v);
+      if (s.ok() != check) {
+        fprintf(stderr, "key=%s ", kv_it_->first.c_str());
+        if (!s.ok()) {
+          fprintf(stderr, "is absent from db but was expected to be present\n");
+        } else {
+          fprintf(stderr, "is present in db but was expected to be absent\n");
+        }
+        ASSERT_TRUE(false);
+      } else if (s.ok()) {
+          if (test_compaction_change && v.compare(kNewValue_) != 0) {
+            fprintf(stderr, " value for key=%s present in database is %s but "
+                            " should be %s\n", kv_it_->first.c_str(), v.c_str(),
+                            kNewValue_.c_str());
+            ASSERT_TRUE(false);
+          } else if (!test_compaction_change && v.compare(kv_it_->second) !=0) {
+            fprintf(stderr, " value for key=%s present in database is %s but "
+                            " should be %s\n", kv_it_->first.c_str(), v.c_str(),
+                            kv_it_->second.c_str());
+            ASSERT_TRUE(false);
+          }
+      }
+    }
+  }
+
+  // Similar as SleepCompactCheck but uses TtlIterator to read from db
+  void SleepCompactCheckIter(int slp, int st_pos, int span, bool check=true) {
+    ASSERT_TRUE(db_ttl_);
+    env_->Sleep(slp);
+    ManualCompact();
+    static ReadOptions ropts;
+    Iterator *dbiter = db_ttl_->NewIterator(ropts);
+    kv_it_ = kvmap_.begin();
+    advance(kv_it_, st_pos);
+
+    dbiter->Seek(kv_it_->first);
+    if (!check) {
+      if (dbiter->Valid()) {
+        ASSERT_NE(dbiter->value().compare(kv_it_->second), 0);
+      }
+    } else {  // dbiter should have found out kvmap_[st_pos]
+      for (int i = st_pos;
+           kv_it_ != kvmap_.end() && i < st_pos + span;
+           i++, kv_it_++)  {
+        ASSERT_TRUE(dbiter->Valid());
+        ASSERT_EQ(dbiter->value().compare(kv_it_->second), 0);
+        dbiter->Next();
+      }
+    }
+    delete dbiter;
+  }
+
+  class TestFilter : public CompactionFilter {
+   public:
+    TestFilter(const int64_t kSampleSize, const std::string kNewValue)
+      : kSampleSize_(kSampleSize),
+        kNewValue_(kNewValue) {
+    }
+
+    // Works on keys of the form "key<number>"
+    // Drops key if number at the end of key is in [0, kSampleSize_/3),
+    // Keeps key if it is in [kSampleSize_/3, 2*kSampleSize_/3),
+    // Change value if it is in [2*kSampleSize_/3, kSampleSize_)
+    // Eg. kSampleSize_=6. Drop:key0-1...Keep:key2-3...Change:key4-5...
+    virtual bool Filter(int level, const Slice& key,
+                        const Slice& value, std::string* new_value,
+                        bool* value_changed) const override {
+      assert(new_value != nullptr);
+
+      std::string search_str = "0123456789";
+      std::string key_string = key.ToString();
+      size_t pos = key_string.find_first_of(search_str);
+      int num_key_end;
+      if (pos != std::string::npos) {
+        num_key_end = stoi(key_string.substr(pos, key.size() - pos));
+      } else {
+        return false; // Keep keys not matching the format "key<NUMBER>"
+      }
+
+      int partition = kSampleSize_ / 3;
+      if (num_key_end < partition) {
+        return true;
+      } else if (num_key_end < partition * 2) {
+        return false;
+      } else {
+        *new_value = kNewValue_;
+        *value_changed = true;
+        return false;
+      }
+    }
+
+    virtual const char* Name() const override {
+      return "TestFilter";
+    }
+
+   private:
+    const int64_t kSampleSize_;
+    const std::string kNewValue_;
+  };
+
+  class TestFilterFactory : public CompactionFilterFactory {
+    public:
+      TestFilterFactory(const int64_t kSampleSize, const std::string kNewValue)
+        : kSampleSize_(kSampleSize),
+          kNewValue_(kNewValue) {
+      }
+
+      virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+          const CompactionFilter::Context& context) override {
+        return std::unique_ptr<CompactionFilter>(
+            new TestFilter(kSampleSize_, kNewValue_));
+      }
+
+      virtual const char* Name() const override {
+        return "TestFilterFactory";
+      }
+
+    private:
+      const int64_t kSampleSize_;
+      const std::string kNewValue_;
+  };
+
+
+  // Choose carefully so that Put, Gets & Compaction complete in 1 second buffer
+  const int64_t kSampleSize_ = 100;
+  std::string dbname_;
+  DBWithTTL* db_ttl_;
+  unique_ptr<SpecialTimeEnv> env_;
+
+ private:
+  Options options_;
+  KVMap kvmap_;
+  KVMap::iterator kv_it_;
+  const std::string kNewValue_ = "new_value";
+  unique_ptr<CompactionFilter> test_comp_filter_;
+}; // class TtlTest
+
+// If TTL is non positive or not provided, the behaviour is TTL = infinity
+// This test opens the db 3 times with such default behavior and inserts a
+// bunch of kvs each time. All kvs should accumulate in the db till the end
+// Partitions the sample-size provided into 3 sets over boundary1 and boundary2
+TEST(TtlTest, NoEffect) {
+  MakeKVMap(kSampleSize_);
+  int boundary1 = kSampleSize_ / 3;
+  int boundary2 = 2 * boundary1;
+
+  OpenTtl();
+  PutValues(0, boundary1);                       //T=0: Set1 never deleted
+  SleepCompactCheck(1, 0, boundary1);            //T=1: Set1 still there
+  CloseTtl();
+
+  OpenTtl(0);
+  PutValues(boundary1, boundary2 - boundary1);   //T=1: Set2 never deleted
+  SleepCompactCheck(1, 0, boundary2);            //T=2: Sets1 & 2 still there
+  CloseTtl();
+
+  OpenTtl(-1);
+  PutValues(boundary2, kSampleSize_ - boundary2); //T=3: Set3 never deleted
+  SleepCompactCheck(1, 0, kSampleSize_, true);    //T=4: Sets 1,2,3 still there
+  CloseTtl();
+}
+
+// Puts a set of values and checks its presence using Get during ttl:
+// values written at T=0 with ttl=2 must still be readable at T=1.
+TEST(TtlTest, PresentDuringTTL) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(2);                                 // T=0:Open the db with ttl = 2
+  PutValues(0, kSampleSize_);                  // T=0:Insert Set1. Delete at t=2
+  SleepCompactCheck(1, 0, kSampleSize_, true); // T=1:Set1 should still be there
+  CloseTtl();
+}
+
+// Puts a set of values and checks its absence using Get after ttl
+// (comments previously said ttl = 2; the db is actually opened with ttl = 1)
+TEST(TtlTest, AbsentAfterTTL) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);                                  // T=0:Open the db with ttl = 1
+  PutValues(0, kSampleSize_);                  // T=0:Insert Set1. Delete at t=1
+  SleepCompactCheck(2, 0, kSampleSize_, false); // T=2:Set1 should not be there
+  CloseTtl();
+}
+
+// Resets the timestamp of a set of kvs by updating them and checks that they
+// are not deleted according to the old timestamp: re-Putting at T=2 pushes
+// the expiry of Set1 from t=3 out to t=5, so it must survive at T=4.
+TEST(TtlTest, ResetTimestamp) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(3);
+  PutValues(0, kSampleSize_);            // T=0: Insert Set1. Delete at t=3
+  env_->Sleep(2);                        // T=2
+  PutValues(0, kSampleSize_);            // T=2: Re-insert Set1. Delete at t=5
+  SleepCompactCheck(2, 0, kSampleSize_); // T=4: Set1 should still be there
+  CloseTtl();
+}
+
+// Similar to PresentDuringTTL but verifies presence via an Iterator scan
+// instead of point Gets.
+TEST(TtlTest, IterPresentDuringTTL) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(2);
+  PutValues(0, kSampleSize_);                 // T=0: Insert. Delete at t=2
+  SleepCompactCheckIter(1, 0, kSampleSize_);  // T=1: Set should be there
+  CloseTtl();
+}
+
+// Similar to AbsentAfterTTL but verifies absence via an Iterator scan
+// instead of point Gets.
+TEST(TtlTest, IterAbsentAfterTTL) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);
+  PutValues(0, kSampleSize_);                      // T=0: Insert. Delete at t=1
+  SleepCompactCheckIter(2, 0, kSampleSize_, false); // T=2: Should not be there
+  CloseTtl();
+}
+
+// Checks presence while opening the same db more than once with the same ttl
+// Note: The second open will open the same db; the values written during the
+// first session must still be visible within their ttl window.
+TEST(TtlTest, MultiOpenSamePresent) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(2);
+  PutValues(0, kSampleSize_);                   // T=0: Insert. Delete at t=2
+  CloseTtl();
+
+  OpenTtl(2);                                  // T=0: reopen, delete at t=2
+  SleepCompactCheck(1, 0, kSampleSize_);        // T=1: Set should be there
+  CloseTtl();
+}
+
+// Checks absence while opening the same db more than once with the same ttl
+// Note: The second open will open the same db; the values written during the
+// first session must have expired by T=2.
+TEST(TtlTest, MultiOpenSameAbsent) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);
+  PutValues(0, kSampleSize_);                   // T=0: Insert. Delete at t=1
+  CloseTtl();
+
+  OpenTtl(1);                                  // T=0: reopen, delete at t=1
+  SleepCompactCheck(2, 0, kSampleSize_, false); // T=2: Set should not be there
+  CloseTtl();
+}
+
+// Checks presence while opening the same db more than once with bigger ttl:
+// the ttl in force at check time (3) governs expiry, not the ttl (1) the
+// values were originally written under.
+TEST(TtlTest, MultiOpenDifferent) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);
+  PutValues(0, kSampleSize_);            // T=0: Insert. Delete at t=1
+  CloseTtl();
+
+  OpenTtl(3);                           // T=0: Set now deleted at t=3
+  SleepCompactCheck(2, 0, kSampleSize_); // T=2: Set should be there
+  CloseTtl();
+}
+
+// Checks presence during ttl in read_only mode
+// NOTE(review): the check expects the values past their ttl (T=2 > t=1),
+// presumably because a read-only instance never compacts and so never purges
+// stale entries — confirm against OpenReadOnlyTtl's implementation.
+TEST(TtlTest, ReadOnlyPresentForever) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);                                 // T=0:Open the db normally
+  PutValues(0, kSampleSize_);                  // T=0:Insert Set1. Delete at t=1
+  CloseTtl();
+
+  OpenReadOnlyTtl(1);                          // reopen the same db read-only
+  SleepCompactCheck(2, 0, kSampleSize_);       // T=2:Set1 should still be there
+  CloseTtl();
+}
+
+// Checks whether WriteBatch works well with TTL
+// Puts all kvs in kvmap_ in a batch and writes first, then deletes first half
+TEST(TtlTest, WriteBatchTest) {
+  MakeKVMap(kSampleSize_);
+  BatchOperation batch_ops[kSampleSize_];
+  for (int i = 0; i < kSampleSize_; i++) {
+    batch_ops[i] = PUT;
+  }
+
+  OpenTtl(2);
+  MakePutWriteBatch(batch_ops, kSampleSize_);
+  for (int i = 0; i < kSampleSize_ / 2; i++) {
+    batch_ops[i] = DELETE;
+  }
+  MakePutWriteBatch(batch_ops, kSampleSize_ / 2);
+  SleepCompactCheck(0, 0, kSampleSize_ / 2, false);
+  SleepCompactCheck(0, kSampleSize_ / 2, kSampleSize_ - kSampleSize_ / 2);
+  CloseTtl();
+}
+
+// Checks user's compaction filter for correctness with TTL logic
+TEST(TtlTest, CompactionFilter) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtlWithTestCompaction(1);
+  PutValues(0, kSampleSize_);                  // T=0:Insert Set1. Delete at t=1
+  // T=2: TTL logic takes precedence over TestFilter:-Set1 should not be there
+  SleepCompactCheck(2, 0, kSampleSize_, false);
+  CloseTtl();
+
+  OpenTtlWithTestCompaction(3);
+  PutValues(0, kSampleSize_);                   // T=0:Insert Set1.
+  int partition = kSampleSize_ / 3;
+  SleepCompactCheck(1, 0, partition, false);   // Part dropped
+  SleepCompactCheck(0, partition, partition);  // Part kept
+  SleepCompactCheck(0, 2 * partition, partition, true, true); // Part changed
+  CloseTtl();
+}
+
+// Insert some key-values which KeyMayExist should be able to get and check that
+// values returned are fine
+TEST(TtlTest, KeyMayExist) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl();                          // no ttl argument => nothing expires
+  // NOTE(review): third arg false presumably suppresses part of PutValues'
+  // own verification/flush — confirm against PutValues' definition.
+  PutValues(0, kSampleSize_, false);
+
+  SimpleKeyMayExistCheck();
+
+  CloseTtl();
+}
+
+TEST(TtlTest, ColumnFamiliesTest) {
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+
+  DB::Open(options, dbname_, &db);
+  ColumnFamilyHandle* handle;
+  ASSERT_OK(db->CreateColumnFamily(ColumnFamilyOptions(options),
+                                   "ttl_column_family", &handle));
+
+  delete handle;
+  delete db;
+
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(ColumnFamilyDescriptor(
+      kDefaultColumnFamilyName, ColumnFamilyOptions(options)));
+  column_families.push_back(ColumnFamilyDescriptor(
+      "ttl_column_family", ColumnFamilyOptions(options)));
+
+  std::vector<ColumnFamilyHandle*> handles;
+
+  ASSERT_OK(DBWithTTL::Open(DBOptions(options), dbname_, column_families,
+                            &handles, &db_ttl_, {3, 5}, false));
+  ASSERT_EQ(handles.size(), 2U);
+  ColumnFamilyHandle* new_handle;
+  ASSERT_OK(db_ttl_->CreateColumnFamilyWithTtl(options, "ttl_column_family_2",
+                                               &new_handle, 2));
+  handles.push_back(new_handle);
+
+  MakeKVMap(kSampleSize_);
+  PutValues(0, kSampleSize_, false, handles[0]);
+  PutValues(0, kSampleSize_, false, handles[1]);
+  PutValues(0, kSampleSize_, false, handles[2]);
+
+  // everything should be there after 1 second
+  SleepCompactCheck(1, 0, kSampleSize_, true, false, handles[0]);
+  SleepCompactCheck(0, 0, kSampleSize_, true, false, handles[1]);
+  SleepCompactCheck(0, 0, kSampleSize_, true, false, handles[2]);
+
+  // only column family 1 should be alive after 4 seconds
+  SleepCompactCheck(3, 0, kSampleSize_, false, false, handles[0]);
+  SleepCompactCheck(0, 0, kSampleSize_, true, false, handles[1]);
+  SleepCompactCheck(0, 0, kSampleSize_, false, false, handles[2]);
+
+  // nothing should be there after 6 seconds
+  SleepCompactCheck(2, 0, kSampleSize_, false, false, handles[0]);
+  SleepCompactCheck(0, 0, kSampleSize_, false, false, handles[1]);
+  SleepCompactCheck(0, 0, kSampleSize_, false, false, handles[2]);
+
+  for (auto h : handles) {
+    delete h;
+  }
+  delete db_ttl_;
+  db_ttl_ = nullptr;
+}
+
+} //  namespace rocksdb
+
+// A black-box test for the ttl wrapper around rocksdb
+int main(int argc, char** argv) {
+  // Dispatch to the rocksdb test harness, which runs every TEST(TtlTest, ...)
+  // registered above; argc/argv are currently unused.
+  return rocksdb::test::RunAllTests();
+}