From: Igor Canadi Date: Thu, 16 Jan 2014 00:18:04 +0000 (-0800) Subject: Move functions from VersionSet to Version X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2f4eda78906e5922c519f3ba49e7a3fe1bdd1403;p=rocksdb.git Move functions from VersionSet to Version Summary: There were some functions in VersionSet that had no reason to be there instead of Version. Moving them to Version will make column families implementation easier. The functions moved are: * NumLevelBytes * LevelSummary * LevelFileSummary * MaxNextLevelOverlappingBytes * AddLiveFiles (previously AddLiveFilesCurrentVersion()) * NeedSlowdownForNumLevel0Files The diff continues on (and depends on) D15171 Test Plan: make check Reviewers: dhruba, haobo, kailiu, sdong, emayanke Reviewed By: sdong CC: leveldb Differential Revision: https://reviews.facebook.net/D15183 --- 2f4eda78906e5922c519f3ba49e7a3fe1bdd1403 diff --git a/.arcconfig b/.arcconfig new file mode 100644 index 00000000..82d17715 --- /dev/null +++ b/.arcconfig @@ -0,0 +1,10 @@ +{ + "project_id" : "leveldb", + "conduit_uri" : "https://reviews.facebook.net/", + "copyright_holder" : "", + "load" : [ + "linters/src/" + ], + "lint.engine" : "FacebookFbcodeLintEngine", + "lint.engine.single.linter" : "FbcodeCppLinter" +} diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..7c279811 --- /dev/null +++ b/.clang-format @@ -0,0 +1,5 @@ +# Complete list of style options can be found at: +# http://clang.llvm.org/docs/ClangFormatStyleOptions.html +--- +BasedOnStyle: Google +... 
diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..03a5f176 --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +build_config.mk + +*.a +*.arc +*.d +*.dylib* +*.gcda +*.gcno +*.o +*.so +*.so.* +*_test +*_bench +*_stress + +ldb +manifest_dump +sst_dump +util/build_version.cc +build_tools/VALGRIND_LOGS/ +coverage/COVERAGE_REPORT +.gdbhistory diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..3a17a883 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,16 @@ +# Contributing to RocksDB + +## Contributor License Agreement ("CLA") + +In order to accept your pull request, we need you to submit a CLA. You +only need to do this once, so if you've done this for another Facebook +open source project, you're good to go. If you are submitting a pull +request for the first time, just let us know that you have completed +the CLA and we can cross-check with your GitHub username. + +Complete your CLA here: + +## License + +By contributing to RocksDB, you agree that your contributions will be +licensed under the [BSD License](LICENSE). diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 00000000..ab046034 --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,50 @@ +## Dependencies + +RocksDB is developed on Linux (CentOS release 5.2), with gcc 4.8.1. +It depends on gcc with C++11 support. + +* RocksDB depends on the following libraries: + - [zlib](http://www.zlib.net/) - a library for data compression. + - [bzip2](http://www.bzip.org/) - a library for data compression. + - [snappy](https://code.google.com/p/snappy/) - a library for fast + data compression. + - [gflags](https://code.google.com/p/gflags/) - a library that handles + command line flags processing. + +RocksDB will successfully compile without the compression libraries included, +but some things may fail. We do not support releases without the compression +libraries. You are on your own. 
+ +## Supported platforms + +* **Linux** + * Upgrade your gcc to version at least 4.7 to get C++11 support. + * Install gflags. First, try: `sudo apt-get install libgflags-dev`. + If this doesn't work and you're using Ubuntu, here's a nice tutorial: + (http://askubuntu.com/questions/312173/installing-gflags-12-04) + * Install snappy. This is usually as easy as: + `sudo apt-get install libsnappy-dev`. + * Install zlib. Try: `sudo apt-get install zlib1g-dev`. + * Install bzip2: `sudo apt-get install libbz2-dev`. +* **OS X**: + * Install latest C++ compiler that supports C++ 11: + * Update XCode: run `xcode-select --install` (or install it from XCode App's setting). + * Install via [homebrew](http://brew.sh/). + * If you're a first-time developer on MacOS, you still need to run: `xcode-select --install` in your command line. + * run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher). + * Install zlib, bzip2 and snappy libraries for compression. + * Install gflags. We have included a script + `build_tools/mac-install-gflags.sh`, which should automatically install it. + If you installed gflags by other means (for example, `brew install gflags`), + please set `LIBRARY_PATH` and `CPATH` accordingly. + * Please note that some of the optimizations/features are disabled in OSX. + We did not run any production workloads on it. + +## Compilation +`make clean; make` will compile librocksdb.a (RocksDB static library) and all +the unit tests. You can run all unit tests with `make check`. + +For shared library builds, exec `make librocksdb.so` instead. + +If you followed the above steps and your compile or unit tests fail, +please submit an issue: (https://github.com/facebook/rocksdb/issues) diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..716ad9e7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,35 @@ +BSD License + +For rocksdb software + +Copyright (c) 2013, Facebook, Inc. +All rights reserved. 
+--------------------------------------------------------------------- + +Copyright (c) 2011 The LevelDB Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..572e42e9 --- /dev/null +++ b/Makefile @@ -0,0 +1,427 @@ +# Copyright (c) 2011 The LevelDB Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +# Inherit some settings from environment variables, if available +INSTALL_PATH ?= $(CURDIR) + +#----------------------------------------------- +# Uncomment exactly one of the lines labelled (A), (B), and (C) below +# to switch between compilation modes. + +# OPT ?= -DNDEBUG # (A) Production use (optimized mode) +OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer +#----------------------------------------------- + +# detect what platform we're building on +$(shell (export ROCKSDB_ROOT=$(CURDIR); $(CURDIR)/build_tools/build_detect_platform $(CURDIR)/build_config.mk)) +# this file is generated by the previous line to set build flags and sources +include build_config.mk + +# ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc. +ifdef COMPILE_WITH_ASAN + # ASAN compile flags + EXEC_LDFLAGS += -fsanitize=address + PLATFORM_CCFLAGS += -fsanitize=address + PLATFORM_CXXFLAGS += -fsanitize=address +else + # if we're not compiling with ASAN, use jemalloc + EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS) + PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC + PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC +endif + +WARNING_FLAGS = -Wall -Werror +CFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) +CXXFLAGS += -g $(WARNING_FLAGS) -I. 
-I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual + +LDFLAGS += $(PLATFORM_LDFLAGS) + +LIBOBJECTS = $(SOURCES:.cc=.o) +LIBOBJECTS += $(SOURCESCPP:.cpp=.o) +MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o) + +TESTUTIL = ./util/testutil.o +TESTHARNESS = ./util/testharness.o $(TESTUTIL) +VALGRIND_ERROR = 2 +VALGRIND_DIR = build_tools/VALGRIND_LOGS +VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) +VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full + +TESTS = \ + db_test \ + autovector_test \ + table_properties_collector_test \ + arena_test \ + auto_roll_logger_test \ + block_test \ + bloom_test \ + c_test \ + cache_test \ + coding_test \ + corruption_test \ + crc32c_test \ + dbformat_test \ + env_test \ + blob_store_test \ + filelock_test \ + filename_test \ + filter_block_test \ + histogram_test \ + log_test \ + manual_compaction_test \ + memenv_test \ + merge_test \ + redis_test \ + reduce_levels_test \ + simple_table_db_test \ + skiplist_test \ + stringappend_test \ + ttl_test \ + backupable_db_test \ + version_edit_test \ + version_set_test \ + write_batch_test\ + deletefile_test \ + table_test + +TOOLS = \ + sst_dump \ + db_stress \ + ldb \ + db_repl_stress \ + blob_store_bench + +PROGRAMS = db_bench signal_test $(TESTS) $(TOOLS) +BENCHMARKS = db_bench_sqlite3 db_bench_tree_db table_reader_bench + +# The library name is configurable since we are maintaining libraries of both +# debug/release mode. +LIBNAME = librocksdb +LIBRARY = ${LIBNAME}.a +MEMENVLIBRARY = libmemenv.a + +default: all + +#----------------------------------------------- +# Create platform independent shared libraries. +#----------------------------------------------- +ifneq ($(PLATFORM_SHARED_EXT),) + +ifneq ($(PLATFORM_SHARED_VERSIONED),true) +SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT) +SHARED2 = $(SHARED1) +SHARED3 = $(SHARED1) +SHARED = $(SHARED1) +else +# Update db.h if you change these. 
+SHARED_MAJOR = 2 +SHARED_MINOR = 0 +SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT) +SHARED2 = $(SHARED1).$(SHARED_MAJOR) +SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) +SHARED = $(SHARED1) $(SHARED2) $(SHARED3) +$(SHARED1): $(SHARED3) + ln -fs $(SHARED3) $(SHARED1) +$(SHARED2): $(SHARED3) + ln -fs $(SHARED3) $(SHARED2) +endif + +$(SHARED3): $(LIBOBJECTS) + $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LDFLAGS) $(SOURCES)-o $@ + +endif # PLATFORM_SHARED_EXT + +all: $(LIBRARY) $(PROGRAMS) + +.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ + release tags valgrind_check whitebox_crash_test format + +# Will also generate shared libraries. +release: + $(MAKE) clean + OPT=-DNDEBUG $(MAKE) all -j32 + OPT=-DNDEBUG $(MAKE) $(SHARED) -j32 + +coverage: + $(MAKE) clean + COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check + (cd coverage; ./coverage_test.sh) + # Delete intermediate files + find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; + +check: all $(PROGRAMS) $(TESTS) $(TOOLS) + for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done + python tools/ldb_test.py + +ldb_tests: all $(PROGRAMS) $(TOOLS) + python tools/ldb_test.py + +crash_test: blackbox_crash_test whitebox_crash_test + +blackbox_crash_test: db_stress + python -u tools/db_crashtest.py + +whitebox_crash_test: db_stress + python -u tools/db_crashtest2.py + +asan_check: + $(MAKE) clean + COMPILE_WITH_ASAN=1 $(MAKE) check -j32 + $(MAKE) clean + +asan_crash_test: + $(MAKE) clean + COMPILE_WITH_ASAN=1 $(MAKE) crash_test -j32 + $(MAKE) clean + +valgrind_check: all $(PROGRAMS) $(TESTS) + mkdir -p $(VALGRIND_DIR) + echo TESTS THAT HAVE VALGRIND ERRORS > $(VALGRIND_DIR)/valgrind_failed_tests; \ + echo TIMES in seconds TAKEN BY TESTS ON VALGRIND > $(VALGRIND_DIR)/valgrind_tests_times; \ + for t in $(filter-out skiplist_test,$(TESTS)); do \ + stime=`date '+%s'`; \ + $(VALGRIND_VER) $(VALGRIND_OPTS) 
./$$t; \ + if [ $$? -eq $(VALGRIND_ERROR) ] ; then \ + echo $$t >> $(VALGRIND_DIR)/valgrind_failed_tests; \ + fi; \ + etime=`date '+%s'`; \ + echo $$t $$((etime - stime)) >> $(VALGRIND_DIR)/valgrind_tests_times; \ + done + +clean: + -rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) build_config.mk + -rm -rf ios-x86/* ios-arm/* + -find . -name "*.[od]" -exec rm {} \; + -find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; +tags: + ctags * -R + cscope -b `find . -name '*.cc'` `find . -name '*.h'` + +format: + build_tools/format-diff.sh + +# --------------------------------------------------------------------------- +# Unit tests and tools +# --------------------------------------------------------------------------- +$(LIBRARY): $(LIBOBJECTS) + rm -f $@ + $(AR) -rs $@ $(LIBOBJECTS) + +db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +blob_store_bench: tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) -lsqlite3 $(COVERAGEFLAGS) + +db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) -lkyotocabinet $(COVERAGEFLAGS) + +signal_test: util/signal_test.o $(LIBOBJECTS) + $(CXX) util/signal_test.o $(LIBOBJECTS) 
$(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +blob_store_test: util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(TESTUTIL) + $(CXX) util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS) + +stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +redis_test: utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +histogram_test: util/histogram_test.o $(LIBOBJECTS) 
$(TESTHARNESS) + $(CXX) util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS) + +corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) + +prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) + +backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/env_test.o 
$(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ 
$(LDFLAGS) $(COVERAGEFLAGS) + +deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) + +$(MEMENVLIBRARY) : $(MEMENVOBJECTS) + rm -f $@ + $(AR) -rs $@ $(MEMENVOBJECTS) + +memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) + $(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +rocksdb_shell: tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o tools/shell/ShellContext.h tools/shell/ShellState.h tools/shell/DBClientProxy.h $(LIBOBJECTS) + $(CXX) tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +DBClientProxy_test: tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY) + $(CXX) tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY) $(EXEC_LDFLAGS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +sst_dump: tools/sst_dump.o $(LIBOBJECTS) + $(CXX) tools/sst_dump.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +ldb: tools/ldb.o $(LIBOBJECTS) + $(CXX) tools/ldb.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +# 
--------------------------------------------------------------------------- +# Platform-specific compilation +# --------------------------------------------------------------------------- + +ifeq ($(PLATFORM), IOS) +# For iOS, create universal object files to be used on both the simulator and +# a device. +PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms +SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer +DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer +IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/versionCFBundleShortVersionString) + +.cc.o: + mkdir -p ios-x86/$(dir $@) + $(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ $(COVERAGEFLAGS) + mkdir -p ios-arm/$(dir $@) + $(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ $(COVERAGEFLAGS) + lipo ios-x86/$@ ios-arm/$@ -create -output $@ + +.c.o: + mkdir -p ios-x86/$(dir $@) + $(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ + mkdir -p ios-arm/$(dir $@) + $(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ + lipo ios-x86/$@ ios-arm/$@ -create -output $@ + +else +.cc.o: + $(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS) + +.c.o: + $(CC) $(CFLAGS) -c $< -o $@ +endif + +# --------------------------------------------------------------------------- +# Source files dependencies detection +# --------------------------------------------------------------------------- + +# Add proper dependency support so changing a .h file forces a .cc file to +# rebuild. + +# The .d file indicates .cc file's dependencies on .h files. We generate such +# dependency by g++'s -MM option, whose output is a make dependency rule. 
+# The sed command makes sure the "target" file in the generated .d file has +# the correct path prefix. +%.d: %.cc + $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM $< -o $@ +ifeq ($(PLATFORM), OS_MACOSX) + @sed -i '' -e 's,.*:,$*.o:,' $@ +else + @sed -i -e 's,.*:,$*.o:,' $@ +endif + +DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d)) + +depend: $(DEPFILES) + +# if the make goal is either "clean" or "format", we shouldn't +# try to import the *.d files. +# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly +# working solution. +ifneq ($(MAKECMDGOALS),clean) +ifneq ($(MAKECMDGOALS),format) +-include $(DEPFILES) +endif +endif diff --git a/PATENTS b/PATENTS new file mode 100644 index 00000000..8a6fca4d --- /dev/null +++ b/PATENTS @@ -0,0 +1,23 @@ +Additional Grant of Patent Rights + +“Software” means the rocksdb software distributed by Facebook, Inc. + +Facebook hereby grants you a perpetual, worldwide, royalty-free, +non-exclusive, irrevocable (subject to the termination provision below) +license under any rights in any patent claims owned by Facebook, to make, +have made, use, sell, offer to sell, import, and otherwise transfer the +Software. For avoidance of doubt, no license is granted under Facebook’s +rights in any patent claims that are infringed by (i) modifications to the +Software made by you or a third party, or (ii) the Software in combination +with any software or other technology provided by you or a third party. 
+ +The license granted hereunder will terminate, automatically and without +notice, for anyone that makes any claim (including by filing any lawsuit, +assertion or other action) alleging (a) direct, indirect, or contributory +infringement or inducement to infringe any patent: (i) by Facebook or any +of its subsidiaries or affiliates, whether or not such claim is related +to the Software, (ii) by any party if such claim arises in whole or in +part from any software, product or service of Facebook or any of its +subsidiaries or affiliates, whether or not such claim is related to the +Software, or (iii) by any party relating to the Software; or (b) that +any right in any patent claim of Facebook is invalid or unenforceable. diff --git a/README b/README new file mode 100644 index 00000000..473e4145 --- /dev/null +++ b/README @@ -0,0 +1,82 @@ +rocksdb: A persistent key-value store for flash storage +Authors: * The Facebook Database Engineering Team + * Built on earlier work on leveldb by Sanjay Ghemawat + (sanjay@google.com) and Jeff Dean (jeff@google.com) + +This code is a library that forms the core building block for a fast +key value server, especially suited for storing data on flash drives. +It has a Log-Structured-Merge-Database (LSM) design with flexible tradeoffs +between Write-Amplification-Factor (WAF), Read-Amplification-Factor (RAF) +and Space-Amplification-Factor (SAF). It has multi-threaded compactions, +making it especially suitable for storing multiple terabytes of data in a +single database. + +The core of this code has been derived from open-source leveldb. + +The code under this directory implements a system for maintaining a +persistent key/value store. + +See doc/index.html and github wiki (https://github.com/facebook/rocksdb/wiki) +for more explanation. + +The public interface is in include/*. Callers should not include or +rely on the details of any other header files in this package. Those +internal APIs may be changed without warning. 
+ +Guide to header files: + +include/rocksdb/db.h + Main interface to the DB: Start here + +include/rocksdb/options.h + Control over the behavior of an entire database, and also + control over the behavior of individual reads and writes. + +include/rocksdb/comparator.h + Abstraction for user-specified comparison function. If you want + just bytewise comparison of keys, you can use the default comparator, + but clients can write their own comparator implementations if they + want custom ordering (e.g. to handle different character + encodings, etc.) + +include/rocksdb/iterator.h + Interface for iterating over data. You can get an iterator + from a DB object. + +include/rocksdb/write_batch.h + Interface for atomically applying multiple updates to a database. + +include/rocksdb/slice.h + A simple module for maintaining a pointer and a length into some + other byte array. + +include/rocksdb/status.h + Status is returned from many of the public interfaces and is used + to report success and various kinds of errors. + +include/rocksdb/env.h + Abstraction of the OS environment. A posix implementation of + this interface is in util/env_posix.cc + +include/rocksdb/table_builder.h + Lower-level modules that most clients probably won't use directly + +include/rocksdb/cache.h + An API for the block cache. + +include/rocksdb/compaction_filter.h + An API for an application filter invoked on every compaction. + +include/rocksdb/filter_policy.h + An API for configuring a bloom filter. + +include/rocksdb/memtablerep.h + An API for implementing a memtable. + +include/rocksdb/statistics.h + An API to retrieve various database statistics. + +include/rocksdb/transaction_log.h + An API to retrieve transaction logs from a database. 
+ +Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/ diff --git a/README.fb b/README.fb new file mode 100644 index 00000000..d3cc4110 --- /dev/null +++ b/README.fb @@ -0,0 +1,3 @@ +* Detailed instructions on how to compile using fbcode and jemalloc + +* Latest release is 2.7.fb diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform new file mode 100755 index 00000000..8e83ae49 --- /dev/null +++ b/build_tools/build_detect_platform @@ -0,0 +1,292 @@ +#!/bin/sh +# +# Detects OS we're compiling on and outputs a file specified by the first +# argument, which in turn gets read while processing Makefile. +# +# The output will set the following variables: +# CC C Compiler path +# CXX C++ Compiler path +# PLATFORM_LDFLAGS Linker flags +# PLATFORM_SHARED_EXT Extension for shared libraries +# PLATFORM_SHARED_LDFLAGS Flags for building shared library +# PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library +# PLATFORM_CCFLAGS C compiler flags +# PLATFORM_CXXFLAGS C++ compiler flags. Will contain: +# PLATFORM_SHARED_VERSIONED Set to 'true' if platform supports versioned +# shared libraries, empty otherwise. +# +# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following: +# +# -DLEVELDB_PLATFORM_POSIX if cstdatomic is present +# -DLEVELDB_PLATFORM_NOATOMIC if it is not +# -DSNAPPY if the Snappy library is present +# +# Using gflags in rocksdb: +# Our project depends on gflags, which requires users to take some extra steps +# before they can compile the whole repository: +# 1. Install gflags. You may download it from here: +# https://code.google.com/p/gflags/ +# 2. Once install, add the include path/lib path for gflags to CPATH and +# LIBRARY_PATH respectively. 
If installed with default mode, the +# lib and include path will be /usr/local/lib and /usr/local/include +# Mac user can do this by running build_tools/mac-install-gflags.sh + +OUTPUT=$1 +if test -z "$OUTPUT"; then + echo "usage: $0 " >&2 + exit 1 +fi + +# we depend on C++11 +PLATFORM_CXXFLAGS="-std=gnu++11" +# we currently depend on POSIX platform +COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX" + +# Default to fbcode gcc on internal fb machines +if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then + FBCODE_BUILD="true" + if [ -z "$USE_CLANG" ]; then + CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \ + $(rpm -q --whatprovides redhat-release)` + if [ "$CENTOS_VERSION" = "6" ]; then + source $PWD/build_tools/fbcode.gcc481.sh + else + source $PWD/build_tools/fbcode.gcc471.sh + fi + else + source $PWD/build_tools/fbcode.clang31.sh + fi +fi + +# Delete existing output, if it exists +rm -f $OUTPUT +touch $OUTPUT + +if test -z "$CC"; then + CC=cc +fi + +if test -z "$CXX"; then + CXX=g++ +fi + +# Detect OS +if test -z "$TARGET_OS"; then + TARGET_OS=`uname -s` +fi + +COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}" +CROSS_COMPILE= +PLATFORM_CCFLAGS= +PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}" +PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS" +PLATFORM_SHARED_EXT="so" +PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl," +PLATFORM_SHARED_CFLAGS="-fPIC" +PLATFORM_SHARED_VERSIONED=false + +# generic port files (working on all platform by #ifdef) go directly in /port +GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "` + +# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp +case "$TARGET_OS" in + Darwin) + PLATFORM=OS_MACOSX + COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX" + PLATFORM_SHARED_EXT=dylib + PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name " + # PORT_FILES=port/darwin/darwin_specific.cc + ;; + Linux) + PLATFORM=OS_LINUX + COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX" + if [ -z "$USE_CLANG" ]; then + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp" + fi + 
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" + # PORT_FILES=port/linux/linux_specific.cc + ;; + SunOS) + PLATFORM=OS_SOLARIS + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" + # PORT_FILES=port/sunos/sunos_specific.cc + ;; + FreeBSD) + PLATFORM=OS_FREEBSD + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread" + # PORT_FILES=port/freebsd/freebsd_specific.cc + ;; + NetBSD) + PLATFORM=OS_NETBSD + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lgcc_s" + # PORT_FILES=port/netbsd/netbsd_specific.cc + ;; + OpenBSD) + PLATFORM=OS_OPENBSD + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread" + # PORT_FILES=port/openbsd/openbsd_specific.cc + ;; + DragonFly) + PLATFORM=OS_DRAGONFLYBSD + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread" + # PORT_FILES=port/dragonfly/dragonfly_specific.cc + ;; + OS_ANDROID_CROSSCOMPILE) + PLATFORM=OS_ANDROID + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS " # All pthread features are in the Android C library + # PORT_FILES=port/android/android.cc + CROSS_COMPILE=true + ;; + *) + echo "Unknown platform!" >&2 + exit 1 +esac + +$PWD/build_tools/build_detect_version + +# We want to make a list of all cc files within util, db, table, and helpers +# except for the test and benchmark files. By default, find will output a list +# of all files matching either rule, so we need to append -print to make the +# prune take effect. 
+DIRS="util db table utilities" + +set -f # temporarily disable globbing so that our patterns arent expanded +PRUNE_TEST="-name *test*.cc -prune" +PRUNE_BENCH="-name *_bench.cc -prune" +PORTABLE_FILES=`cd $ROCKSDB_ROOT; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "` +PORTABLE_CPP=`cd $ROCKSDB_ROOT; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cpp' -print | sort | tr "\n" " "` +set +f # re-enable globbing + +# The sources consist of the portable files, plus the platform-specific port +# file. +echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> $OUTPUT +echo "SOURCESCPP=$PORTABLE_CPP" >> $OUTPUT +echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT + +if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then + # Cross-compiling; do not try any compilation tests. + # Also don't need any compilation tests if compiling on fbcode + true +else + # do fPIC on 64 bit in non-fbcode environment + case "$TARGET_OS" in + x86_64) + PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS -fPIC" + esac + + # If -std=c++0x works, use . Otherwise use port_posix.h. + $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null < + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT" + fi + + # Test whether fallocate is available + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() { + int fd = open("/dev/null", 0); + fallocate(fd, 0, 0, 1024); + } +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT" + fi + + # Test whether Snappy library is installed + # http://code.google.com/p/snappy/ + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy" + fi + + + # Test whether gflags library is installed + # http://code.google.com/p/gflags/ + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() {} +EOF + if [ "$?" 
= 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" + fi + + # Test whether zlib library is installed + $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DZLIB" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz" + fi + + # Test whether bzip library is installed + $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DBZIP2" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2" + fi + + # Test whether tcmalloc is available + $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null <> $OUTPUT +echo "CXX=$CXX" >> $OUTPUT +echo "PLATFORM=$PLATFORM" >> $OUTPUT +echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT +echo "VALGRIND_VER=$VALGRIND_VER" >> $OUTPUT +echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT +echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT +echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT +echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT +echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT +echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT +echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> $OUTPUT +echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> $OUTPUT +echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> $OUTPUT diff --git a/build_tools/build_detect_version b/build_tools/build_detect_version new file mode 100755 index 00000000..f7d711f0 --- /dev/null +++ b/build_tools/build_detect_version @@ -0,0 +1,22 @@ +#!/bin/sh +# +# Record the version of the source that we are compiling. +# We keep a record of the git revision in util/version.cc. This source file +# is then built as a regular source file as part of the compilation process. +# One can run "strings executable_filename | grep _build_" to find the version of +# the source that we used to build the executable file. 
+ +OUTFILE="$PWD/util/build_version.cc" + +GIT_SHA="" +if command -v git >/dev/null 2>&1; then + GIT_SHA=$(git rev-parse HEAD 2>/dev/null) +fi + +cat > "${OUTFILE}" < /dev/null +then + echo "You didn't have clang-format-diff.py available in your computer!" + echo "You can download it by running: " + echo " curl https://fburl.com/clang-format-diff" + exit 128 +fi + +# Check argparse, a library that clang-format-diff.py requires. +python 2>/dev/null << EOF +import argparse +EOF + +if [ "$?" != 0 ] +then + echo "To run clang-format-diff.py, we'll need the library "argparse" to be" + echo "installed. You can try either of the follow ways to install it:" + echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse" + echo " 2. easy_install argparse (if you have easy_install)" + echo " 3. pip install argparse (if you have pip)" + exit 129 +fi + +# TODO(kailiu) following work is not complete since we still need to figure +# out how to add the modified files done pre-commit hook to git's commit index. +# +# Check if this script has already been added to pre-commit hook. +# Will suggest user to add this script to pre-commit hook if their pre-commit +# is empty. +# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit" +# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null +# then +# echo "Would you like to add this script to pre-commit hook, which will do " +# echo -n "the format check for all the affected lines before you check in (y/n):" +# read add_to_hook +# if [ "$add_to_hook" == "y" ] +# then +# ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH +# fi +# fi + +# Check the format of recently changed lines, +diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1) + +if [ -z "$diffs" ] +then + echo "Nothing needs to be reformatted!" 
+  exit 0
+fi
+
+# Highlight the insertion/deletion from the clang-format-diff.py's output
+COLOR_END="\033[0m"
+COLOR_RED="\033[0;31m"
+COLOR_GREEN="\033[0;32m"
+
+echo -e "Detect lines that doesn't follow the format rules:\r"
+# Add the color to the diff. lines added will be green; lines removed will be red.
+echo "$diffs" |
+  sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" |
+  sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/"
+echo -e "Would you like to fix the format automatically (y/n): \c"
+
+# Make sure under any mode, we can read user input.
+exec < /dev/tty
+read to_fix
+
+if [ "$to_fix" != "y" ]
+then
+  exit 1
+fi
+
+# Do in-place format adjustment.
+git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1
diff --git a/build_tools/mac-install-gflags.sh b/build_tools/mac-install-gflags.sh
new file mode 100755
index 00000000..ef0339c3
--- /dev/null
+++ b/build_tools/mac-install-gflags.sh
@@ -0,0 +1,25 @@
+#!/bin/sh
+# Install gflags for mac developers.
+
+set -e
+
+DIR=`mktemp -d /tmp/rocksdb_gflags_XXXX`
+
+cd $DIR
+wget https://gflags.googlecode.com/files/gflags-2.0.tar.gz
+tar xvfz gflags-2.0.tar.gz
+cd gflags-2.0
+
+./configure
+make
+make install
+
+# Add include/lib path for g++
+echo 'export LIBRARY_PATH+=":/usr/local/lib"' >> ~/.bash_profile
+echo 'export CPATH+=":/usr/local/include"' >> ~/.bash_profile
+
+echo ""
+echo "-----------------------------------------------------------------------------"
+echo "| Installation Completed |"
+echo "-----------------------------------------------------------------------------"
+# Single quotes keep the backticks literal. Inside double quotes the shell would
+# run the quoted command as a command substitution (and abort under `set -e` if
+# it failed) instead of printing the hint. The profile written above is
+# ~/.bash_profile, so that is the file the hint must name.
+echo 'Please run `. ~/.bash_profile` to be able to compile with gflags'
diff --git a/build_tools/make_new_version.sh b/build_tools/make_new_version.sh
new file mode 100755
index 00000000..ca8a2126
--- /dev/null
+++ b/build_tools/make_new_version.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# Copyright (c) 2013, Facebook, Inc. All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+
+set -e
+# Print out the colored progress info so that it can be brainlessly
+# distinguished by users.
+function title() {
+  echo -e "\033[1;32m$*\033[0m"
+}
+
+usage="Create new rocksdb version and prepare it for the release process\n"
+usage+="USAGE: ./make_new_version.sh <version>"
+
+# -- Pre-check
+# -lt compares numerically; `<` inside [[ ]] is a string comparison.
+if [[ $# -lt 1 ]]; then
+  echo -e "$usage"
+  exit 1
+fi
+
+ROCKSDB_VERSION=$1
+
+GIT_BRANCH=`git rev-parse --abbrev-ref HEAD`
+if [ "$GIT_BRANCH" != "master" ]; then
+  echo "Error: Current branch is '$GIT_BRANCH', Please switch to master branch."
+  # Stop here: everything below tags and pushes from the current branch.
+  exit 1
+fi
+
+# --Step 1: cutting new tag
+title "Adding new tag for this release ..."
+git tag -a "$ROCKSDB_VERSION.fb" -m "Rocksdb $ROCKSDB_VERSION"
+
+# Setting up the proxy for remote repo access
+export http_proxy=http://172.31.255.99:8080
+export https_proxy="$http_proxy";
+
+title "Pushing new tag to remote repo ..."
+proxycmd.sh git push origin --tags
+
+# --Step 2: Update README.fb
+title "Updating the latest version info in README.fb ..."
+# The dots are escaped so that they only match a literal '.'.
+sed -i "s/Latest release is [0-9]\+\.[0-9]\+\.fb/Latest release is $ROCKSDB_VERSION.fb/" README.fb
+git commit README.fb -m "update the latest version in README.fb to $ROCKSDB_VERSION"
+proxycmd.sh git push
+
+# --Step 3: Prepare this repo for 3rd release
+title "Cleaning up repo ..."
+make clean
+git clean -fxd
+
+title "Generating the build info ..."
+# Comment out the call of `build_detection_version` so that the SHA number and build date of this
+# release will remain constant. Otherwise everytime we run "make" util/build_version.cc will be
+# overridden.
+sed -i 's/^\$PWD\/build_tools\/build_detect_version$//' build_tools/build_detect_platform
+
+# Generate util/build_version.cc
+build_tools/build_detect_version
+
+title "Done!"
diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh new file mode 100755 index 00000000..d38b67c3 --- /dev/null +++ b/build_tools/regression_build_test.sh @@ -0,0 +1,308 @@ +#!/bin/bash + +set -e + +NUM=10000000 + +if [ $# -eq 1 ];then + DATA_DIR=$1 +elif [ $# -eq 2 ];then + DATA_DIR=$1 + STAT_FILE=$2 +fi + +# On the production build servers, set data and stat +# files/directories not in /tmp or else the tempdir cleaning +# scripts will make you very unhappy. +DATA_DIR=${DATA_DIR:-$(mktemp -t -d rocksdb_XXXX)} +STAT_FILE=${STAT_FILE:-$(mktemp -t -u rocksdb_test_stats_XXXX)} + +function cleanup { + rm -rf $DATA_DIR + rm -f $STAT_FILE.fillseq + rm -f $STAT_FILE.readrandom + rm -f $STAT_FILE.overwrite + rm -f $STAT_FILE.memtablefillreadrandom +} + +trap cleanup EXIT + +if [ -z $GIT_BRANCH ]; then + git_br=`git rev-parse --abbrev-ref HEAD` +else + git_br=$(basename $GIT_BRANCH) +fi + +if [ $git_br == "master" ]; then + git_br="" +else + git_br="."$git_br +fi + +make release + +# measure fillseq + fill up the DB for overwrite benchmark +./db_bench \ + --benchmarks=fillseq \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --bloom_bits=10 \ + --num=$NUM \ + --writes=$NUM \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 > ${STAT_FILE}.fillseq + +# measure overwrite performance +./db_bench \ + --benchmarks=overwrite \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --writes=$((NUM / 10)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=8 > ${STAT_FILE}.overwrite + +# fill up the db for readrandom benchmark (1GB total size) +./db_bench \ + --benchmarks=fillseq \ + --db=$DATA_DIR \ + 
--use_existing_db=0 \ + --bloom_bits=10 \ + --num=$NUM \ + --writes=$NUM \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=1 > /dev/null + +# measure readrandom with 6GB block cache +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --reads=$((NUM / 5)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandom + +# measure readrandom with 100MB block cache +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --reads=$((NUM / 5)) \ + --cache_size=104857600 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandomsmallblockcache + +# measure readrandom with 8k data in memtable +./db_bench \ + --benchmarks=overwrite,readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --reads=$((NUM / 5)) \ + --writes=512 \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --write_buffer_size=1000000000 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandom_mem_sst + + +# fill up the db for readrandom benchmark with filluniquerandom (1GB total size) +./db_bench \ + --benchmarks=filluniquerandom \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + 
--bloom_bits=10 \ + --num=$((NUM / 4)) \ + --writes=$((NUM / 4)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=1 > /dev/null + +# dummy test just to compact the data +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 1000)) \ + --reads=$((NUM / 1000)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > /dev/null + +# measure readrandom after load with filluniquerandom with 6GB block cache +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 4)) \ + --reads=$((NUM / 4)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --disable_auto_compactions=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandom_filluniquerandom + +# measure readwhilewriting after load with filluniquerandom with 6GB block cache +./db_bench \ + --benchmarks=readwhilewriting \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 4)) \ + --reads=$((NUM / 4)) \ + --writes_per_second=1000 \ + --write_buffer_size=100000000 \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readwhilewriting + +# measure memtable performance -- none of the data gets flushed to disk +./db_bench \ + 
--benchmarks=fillrandom,readrandom, \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --num=$((NUM / 10)) \ + --reads=$NUM \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --write_buffer_size=1000000000 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --value_size=10 \ + --threads=16 > ${STAT_FILE}.memtablefillreadrandom + +# send data to ods +function send_to_ods { + key="$1" + value="$2" + + if [ -z $JENKINS_HOME ]; then + # running on devbox, just print out the values + echo $1 $2 + return + fi + + if [ -z "$value" ];then + echo >&2 "ERROR: Key $key doesn't have a value." + return + fi + curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_br&key=$key&value=$value" \ + --connect-timeout 60 +} + +function send_benchmark_to_ods { + bench="$1" + bench_key="$2" + file="$3" + + QPS=$(grep $bench $file | awk '{print $5}') + P50_MICROS=$(grep $bench $file -A 4 | tail -n1 | awk '{print $3}' ) + P75_MICROS=$(grep $bench $file -A 4 | tail -n1 | awk '{print $5}' ) + P99_MICROS=$(grep $bench $file -A 4 | tail -n1 | awk '{print $7}' ) + + send_to_ods rocksdb.build.$bench_key.qps $QPS + send_to_ods rocksdb.build.$bench_key.p50_micros $P50_MICROS + send_to_ods rocksdb.build.$bench_key.p75_micros $P75_MICROS + send_to_ods rocksdb.build.$bench_key.p99_micros $P99_MICROS +} + +send_benchmark_to_ods overwrite overwrite $STAT_FILE.overwrite +send_benchmark_to_ods fillseq fillseq $STAT_FILE.fillseq +send_benchmark_to_ods readrandom readrandom $STAT_FILE.readrandom +send_benchmark_to_ods readrandom readrandom_smallblockcache $STAT_FILE.readrandomsmallblockcache +send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_mem_sst +send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom +send_benchmark_to_ods fillrandom memtablefillrandom 
$STAT_FILE.memtablefillreadrandom
+send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom
+send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting
diff --git a/build_tools/valgrind_test.sh b/build_tools/valgrind_test.sh
new file mode 100755
index 00000000..8c7e5213
--- /dev/null
+++ b/build_tools/valgrind_test.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# A shell script for Jenkins to run valgrind on rocksdb tests.
+# Returns 0 only when the failed-tests report exists and lists no failures.
+
+VALGRIND_DIR=build_tools/VALGRIND_LOGS
+FAILED_TESTS_FILE=$VALGRIND_DIR/valgrind_failed_tests
+
+make clean
+# Drop any report left over from an earlier run so it cannot be mistaken for
+# the result of this one.
+rm -f "$FAILED_TESTS_FILE"
+make -j$(nproc) valgrind_check
+
+# No report means valgrind_check stopped before writing one (for example the
+# build failed); without this check the arithmetic below yields -1 and the
+# script would claim success.
+if [ ! -f "$FAILED_TESTS_FILE" ]; then
+  echo "Error: $FAILED_TESTS_FILE was not generated" >&2
+  exit 1
+fi
+
+# One line is subtracted as in the original script -- presumably the report
+# starts with a header line; confirm against the valgrind_check make target.
+NUM_FAILED_TESTS=$(($(wc -l < "$FAILED_TESTS_FILE") - 1))
+if [ $NUM_FAILED_TESTS -lt 1 ]; then
+  echo No tests have valgrind errors
+  exit 0
+else
+  cat "$FAILED_TESTS_FILE"
+  exit 1
+fi
diff --git a/coverage/coverage_test.sh b/coverage/coverage_test.sh
new file mode 100755
index 00000000..7a8b5e0f
--- /dev/null
+++ b/coverage/coverage_test.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+
+# Exit on error.
+set -e
+
+if [ -n "$USE_CLANG" ]; then
+  echo "Error: Coverage test is supported only for gcc."
+  exit 1
+fi
+
+ROOT=".."
+# Fetch right version of gcov
+if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
+  source $ROOT/build_tools/fbcode.gcc471.sh
+  GCOV=$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1/cc6c9dc/bin/gcov
+else
+  GCOV=$(which gcov)
+fi
+
+COVERAGE_DIR="$PWD/COVERAGE_REPORT"
+mkdir -p $COVERAGE_DIR
+
+# Find all gcno files to generate the coverage report
+
+GCNO_FILES=`find $ROOT -name "*.gcno"`
+$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
+  # Parse the raw gcov report to more human readable form.
+  python $ROOT/coverage/parse_gcov_output.py |
+  # Write the output to both stdout and report file.
+  tee $COVERAGE_DIR/coverage_report_all.txt &&
+echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n"
+
+# TODO: we also need to get the files of the latest commits.
+# Get the most recently committed files.
+LATEST_FILES=`
+  git show --pretty="format:" --name-only HEAD |
+  grep -v "^$" |
+  paste -s -d,`
+RECENT_REPORT=$COVERAGE_DIR/coverage_report_recent.txt
+
+echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT
+$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
+  python $ROOT/coverage/parse_gcov_output.py --interested-files "$LATEST_FILES" |
+  tee -a $RECENT_REPORT &&
+echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n"
+
+# Generate the html report. If we cannot find lcov in this machine, we'll simply
+# skip this step.
+echo "Generating the html coverage report..."
+
+LCOV=$(which lcov 2>/dev/null || true)
+if [ -z "$LCOV" ]
+then
+  echo "Skip: Cannot find lcov to generate the html report."
+  exit 0
+fi
+
+LCOV_VERSION=$(lcov -v | grep 1.1 || true)
+if [ -z "$LCOV_VERSION" ]
+then
+  echo "Not supported lcov version. Expect lcov 1.1."
+  exit 0
+fi
+
+(cd $ROOT; lcov --no-external \
+  --capture \
+  --directory $PWD \
+  --gcov-tool $GCOV \
+  --output-file $COVERAGE_DIR/coverage.info)
+
+genhtml $COVERAGE_DIR/coverage.info -o $COVERAGE_DIR
+
+echo "HTML Coverage report is generated in $COVERAGE_DIR"
diff --git a/coverage/parse_gcov_output.py b/coverage/parse_gcov_output.py
new file mode 100644
index 00000000..72e8b072
--- /dev/null
+++ b/coverage/parse_gcov_output.py
@@ -0,0 +1,134 @@
+import optparse
+import re
+import sys
+
+from optparse import OptionParser
+
+# the gcov report follows certain pattern. Each file will have two lines
+# of report, from which we can extract the file name, total lines and coverage
+# percentage.
+def parse_gcov_report(gcov_input):
+    """Parse raw gcov output read from `gcov_input` (an iterable of lines).
+
+    Returns (per_file_coverage, total_coverage): a dict mapping each file name
+    to a (coverage percentage, line count) tuple, and the same kind of tuple
+    for the overall summary, or None when the input has no summary line.
+    """
+    per_file_coverage = {}
+    total_coverage = None
+    # Name from the most recent "File '...'" line; None until one is seen, so a
+    # leading "Lines executed" line is treated as the summary instead of
+    # raising UnboundLocalError.
+    current_file = None
+
+    for line in gcov_input:
+        line = line.strip()
+
+        # --First line of the coverage report (with file name in it)?
+        match_obj = re.match("^File '(.*)'$", line)
+        if match_obj:
+            # fetch the file name from the first line of the report.
+            current_file = match_obj.group(1)
+            continue
+
+        # -- Second line of the file report (with coverage percentage)
+        match_obj = re.match("^Lines executed:(.*)% of (.*)", line)
+
+        if match_obj:
+            coverage = float(match_obj.group(1))
+            lines = int(match_obj.group(2))
+
+            if current_file is not None:
+                per_file_coverage[current_file] = (coverage, lines)
+                current_file = None
+            else:
+                # If current_file is not set, we reach the last line of report,
+                # which contains the summarized coverage percentage.
+                total_coverage = (coverage, lines)
+            continue
+
+        # If the line's pattern doesn't fall into the above categories. We
+        # can simply ignore them since they're either empty line or doesn't
+        # find executable lines of the given file.
+        current_file = None
+
+    return per_file_coverage, total_coverage
+
+def get_option_parser():
+    """Build the command-line parser (only --interested-files / -i)."""
+    usage = "Parse the gcov output and generate more human-readable code " +\
+            "coverage report."
+    parser = OptionParser(usage)
+
+    parser.add_option(
+        "--interested-files", "-i",
+        dest="filenames",
+        help="Comma separated files names. if specified, we will display " +
+             "the coverage report only for interested source files. " +
+             "Otherwise we will display the coverage report for all " +
+             "source files."
+    )
+    return parser
+
+def display_file_coverage(per_file_coverage, total_coverage):
+    """Print the per-file table, plus a "Total" row if total_coverage is set.
+
+    per_file_coverage must not be empty (max() below would raise).
+    """
+    # To print out auto-adjustable column, we need to know the longest
+    # length of file names.
+    max_file_name_length = max(
+        len(fname) for fname in per_file_coverage.keys()
+    )
+
+    # -- Print header
+    # size of separator is determined by 3 column sizes:
+    # file name, coverage percentage and lines.
+    header_template = \
+        "%" + str(max_file_name_length) + "s\t%s\t%s"
+    separator = "-" * (max_file_name_length + 10 + 20)
+    print(header_template % ("Filename", "Coverage", "Lines"))
+    print(separator)
+
+    # -- Print body
+    # template for printing coverage report for each file.
+    record_template = "%" + str(max_file_name_length) + "s\t%5.2f%%\t%10d"
+
+    for fname, coverage_info in per_file_coverage.items():
+        coverage, lines = coverage_info
+        print(record_template % (fname, coverage, lines))
+
+    # -- Print footer
+    if total_coverage:
+        print(separator)
+        print(record_template % ("Total", total_coverage[0], total_coverage[1]))
+
+def report_coverage():
+    """Entry point: read the gcov report from stdin and print the summary."""
+    parser = get_option_parser()
+    (options, args) = parser.parse_args()
+
+    interested_files = set()
+    if options.filenames is not None:
+        interested_files = set(f.strip() for f in options.filenames.split(','))
+
+    # To make things simple, right now we only read gcov report from the input
+    per_file_coverage, total_coverage = parse_gcov_report(sys.stdin)
+
+    # Check if we need to display coverage info for interested files.
+    if len(interested_files):
+        per_file_coverage = dict(
+            (fname, per_file_coverage[fname]) for fname in interested_files
+            if fname in per_file_coverage
+        )
+        # If we only interested in several files, it makes no sense to report
+        # the total_coverage
+        total_coverage = None
+
+    if not len(per_file_coverage):
+        sys.stderr.write("Cannot find coverage info for the given files.\n")
+        return
+    display_file_coverage(per_file_coverage, total_coverage)
+
+if __name__ == "__main__":
+    report_coverage()
diff --git a/db/builder.cc b/db/builder.cc
new file mode 100644
index 00000000..61671db0
--- /dev/null
+++ b/db/builder.cc
@@ -0,0 +1,227 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "db/merge_helper.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "rocksdb/db.h"
+#include "rocksdb/table.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/block_based_table_builder.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+class TableFactory;
+
+namespace {
+
+// Widens [meta->smallest_seqno, meta->largest_seqno] so that it also covers
+// the sequence number stored in internal_key.
+void ExtendSeqnoRange(FileMetaData* meta, const Slice& internal_key) {
+  const SequenceNumber seqno = GetInternalKeySeqno(internal_key);
+  meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
+  meta->largest_seqno = std::max(meta->largest_seqno, seqno);
+}
+
+}  // namespace
+
+// Returns a new TableBuilder obtained from options.table_factory that writes
+// to *file using compression_type. BuildTable() below deletes the builder it
+// obtains from here, so the caller owns the returned pointer.
+TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
+                              CompressionType compression_type) {
+  return options.table_factory->GetTableBuilder(options, file,
+                                                compression_type);
+}
+
+// Writes the entries of *iter into the table file named after meta->number.
+//
+// On return meta->file_size is the size of the generated file, or 0 if *iter
+// was empty; meta->smallest/largest are the first and last keys written, and
+// meta->smallest_seqno/largest_seqno are widened for every key written.
+//
+// When options.purge_redundant_kvs_while_flush is set and every entry is newer
+// than newest_snapshot, only the first (newest) entry of each user key is
+// written, and merge operands are combined through MergeHelper.
+//
+// The file is synced unless options.disableDataSync is set, closed, and then
+// opened once through table_cache to verify that it is readable. If anything
+// fails, or nothing was written, the file is deleted and the error (if any)
+// is returned.
+Status BuildTable(const std::string& dbname,
+                  Env* env,
+                  const Options& options,
+                  const EnvOptions& soptions,
+                  TableCache* table_cache,
+                  Iterator* iter,
+                  FileMetaData* meta,
+                  const Comparator* user_comparator,
+                  const SequenceNumber newest_snapshot,
+                  const SequenceNumber earliest_seqno_in_memtable,
+                  const CompressionType compression) {
+  Status s;
+  meta->file_size = 0;
+  meta->smallest_seqno = meta->largest_seqno = 0;
+  iter->SeekToFirst();
+
+  // If the sequence number of the smallest entry in the memtable is
+  // smaller than the most recent snapshot, then we do not trigger
+  // removal of duplicate/deleted keys as part of this builder.
+  bool purge = options.purge_redundant_kvs_while_flush;
+  if (earliest_seqno_in_memtable <= newest_snapshot) {
+    purge = false;
+  }
+
+  std::string fname = TableFileName(dbname, meta->number);
+  if (iter->Valid()) {
+    unique_ptr<WritableFile> file;
+    s = env->NewWritableFile(fname, &file, soptions);
+    if (!s.ok()) {
+      return s;
+    }
+
+    TableBuilder* builder = GetTableBuilder(options, file.get(),
+                                            compression);
+
+    // the first key is the smallest key
+    Slice key = iter->key();
+    meta->smallest.DecodeFrom(key);
+    meta->smallest_seqno = GetInternalKeySeqno(key);
+    meta->largest_seqno = meta->smallest_seqno;
+
+    MergeHelper merge(user_comparator, options.merge_operator.get(),
+                      options.info_log.get(),
+                      true /* internal key corruption is not ok */);
+
+    if (purge) {
+      // Ugly workaround to avoid compiler error for release build
+      bool ok __attribute__((unused)) = true;
+
+      // Will write to builder if current key != prev key
+      ParsedInternalKey prev_ikey;
+      std::string prev_key;
+      bool is_first_key = true;  // Also write if this is the very first key
+
+      while (iter->Valid()) {
+        bool iterator_at_next = false;
+
+        // Get current key
+        ParsedInternalKey this_ikey;
+        Slice key = iter->key();
+        Slice value = iter->value();
+
+        // In-memory key corruption is not ok;
+        // TODO: find a clean way to treat in memory key corruption
+        ok = ParseInternalKey(key, &this_ikey);
+        assert(ok);
+        assert(this_ikey.sequence >= earliest_seqno_in_memtable);
+
+        // If the key is the same as the previous key (and it is not the
+        // first key), then we skip it, since it is an older version.
+        // Otherwise we output the key and mark it as the "new" previous key.
+        if (!is_first_key &&
+            user_comparator->Compare(prev_ikey.user_key,
+                                     this_ikey.user_key) == 0) {
+          // seqno within the same key are in decreasing order
+          assert(this_ikey.sequence < prev_ikey.sequence);
+        } else {
+          is_first_key = false;
+
+          if (this_ikey.type == kTypeMerge) {
+            // Handle merge-type keys using the MergeHelper
+            // TODO: pass statistics to MergeUntil
+            merge.MergeUntil(iter, 0 /* don't worry about snapshot */);
+            iterator_at_next = true;
+            if (merge.IsSuccess()) {
+              // Merge completed correctly.
+              // Add the resulting merge key/value and continue to next
+              builder->Add(merge.key(), merge.value());
+              ExtendSeqnoRange(meta, merge.key());
+              prev_key.assign(merge.key().data(), merge.key().size());
+              ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
+              assert(ok);
+            } else {
+              // Merge did not find a Put/Delete.
+              // Can not compact these merges into a kValueType.
+              // Write them out one-by-one. (Proceed back() to front())
+              const std::deque<std::string>& keys = merge.keys();
+              const std::deque<std::string>& values = merge.values();
+              assert(keys.size() == values.size() && keys.size() >= 1);
+              std::deque<std::string>::const_reverse_iterator key_iter;
+              std::deque<std::string>::const_reverse_iterator value_iter;
+              for (key_iter = keys.rbegin(), value_iter = values.rbegin();
+                   key_iter != keys.rend() && value_iter != values.rend();
+                   ++key_iter, ++value_iter) {
+                builder->Add(Slice(*key_iter), Slice(*value_iter));
+                ExtendSeqnoRange(meta, Slice(*key_iter));
+              }
+
+              // Sanity check. Both iterators should end at the same time
+              assert(key_iter == keys.rend() && value_iter == values.rend());
+
+              prev_key.assign(keys.front());
+              ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
+              assert(ok);
+            }
+          } else {
+            // Handle Put/Delete-type keys by simply writing them
+            builder->Add(key, value);
+            ExtendSeqnoRange(meta, key);
+            prev_key.assign(key.data(), key.size());
+            ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
+            assert(ok);
+          }
+        }
+
+        if (!iterator_at_next) iter->Next();
+      }
+
+      // The last key is the largest key. Its sequence number, like that of
+      // every other key written above, was already folded into the seqno
+      // range when it was added; looking only at the first and last key would
+      // miss larger or smaller sequence numbers in between.
+      meta->largest.DecodeFrom(Slice(prev_key));
+
+    } else {
+      for (; iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        meta->largest.DecodeFrom(key);
+        builder->Add(key, iter->value());
+        ExtendSeqnoRange(meta, key);
+      }
+    }
+
+    // Finish and check for builder errors
+    if (s.ok()) {
+      s = builder->Finish();
+      if (s.ok()) {
+        meta->file_size = builder->FileSize();
+        assert(meta->file_size > 0);
+      }
+    } else {
+      builder->Abandon();
+    }
+    delete builder;
+
+    // Finish and check for file errors
+    if (s.ok() && !options.disableDataSync) {
+      if (options.use_fsync) {
+        StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
+        s = file->Fsync();
+      } else {
+        StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
+        s = file->Sync();
+      }
+    }
+    if (s.ok()) {
+      s = file->Close();
+    }
+
+    if (s.ok()) {
+      // Verify that the table is usable
+      Iterator* it = table_cache->NewIterator(ReadOptions(),
+                                              soptions,
+                                              meta->number,
+                                              meta->file_size);
+      s = it->status();
+      delete it;
+    }
+  }
+
+  // Check for input iterator errors
+  if (!iter->status().ok()) {
+    s = iter->status();
+  }
+
+  if (s.ok() && meta->file_size > 0) {
+    // Keep it
+  } else {
+    // Best effort: the result of the delete is intentionally not checked.
+    env->DeleteFile(fname);
+  }
+  return s;
+}
+
+}  // namespace rocksdb
diff --git a/db/builder.h b/db/builder.h
new file mode 100644
index 00000000..2600dc24
--- /dev/null
+++ b/db/builder.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include "rocksdb/comparator.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+struct Options;
+struct FileMetaData;
+
+class Env;
+struct EnvOptions;
+class Iterator;
+class TableCache;
+class VersionEdit;
+class TableBuilder;
+class WritableFile;
+
+
+extern TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
+                                     CompressionType compression_type);
+
+// Build a Table file from the contents of *iter. The generated file
+// will be named according to meta->number. On success, the rest of
+// *meta will be filled with metadata about the generated table.
+// If no data is present in *iter, meta->file_size will be set to
+// zero, and no Table file will be produced.
+extern Status BuildTable(const std::string& dbname, + Env* env, + const Options& options, + const EnvOptions& soptions, + TableCache* table_cache, + Iterator* iter, + FileMetaData* meta, + const Comparator* user_comparator, + const SequenceNumber newest_snapshot, + const SequenceNumber earliest_seqno_in_memtable, + const CompressionType compression); + +} // namespace rocksdb diff --git a/db/c.cc b/db/c.cc new file mode 100644 index 00000000..68f36133 --- /dev/null +++ b/db/c.cc @@ -0,0 +1,842 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#include "rocksdb/c.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/universal_compaction.h"
+
+using rocksdb::Cache;
+using rocksdb::Comparator;
+using rocksdb::CompressionType;
+using rocksdb::DB;
+using rocksdb::Env;
+using rocksdb::FileLock;
+using rocksdb::FilterPolicy;
+using rocksdb::Iterator;
+using rocksdb::Logger;
+using rocksdb::NewBloomFilterPolicy;
+using rocksdb::NewLRUCache;
+using rocksdb::Options;
+using rocksdb::RandomAccessFile;
+using rocksdb::Range;
+using rocksdb::ReadOptions;
+using rocksdb::SequentialFile;
+using rocksdb::Slice;
+using rocksdb::Snapshot;
+using rocksdb::Status;
+using rocksdb::WritableFile;
+using rocksdb::WriteBatch;
+using rocksdb::WriteOptions;
+
+using std::shared_ptr;
+
+extern "C" {
+
+// Handle types used by the C API. Each one wraps the corresponding C++
+// object (or a pointer / shared_ptr to it) in a single `rep` member.
+struct rocksdb_t              { DB*               rep; };
+struct rocksdb_iterator_t     { Iterator*         rep; };
+struct rocksdb_writebatch_t   { WriteBatch        rep; };
+struct rocksdb_snapshot_t     { const Snapshot*   rep; };
+struct rocksdb_readoptions_t  { ReadOptions       rep; };
+struct rocksdb_writeoptions_t { WriteOptions      rep; };
+struct rocksdb_options_t      { Options           rep; };
+struct rocksdb_seqfile_t      { SequentialFile*   rep; };
+struct rocksdb_randomfile_t   { RandomAccessFile* rep; };
+struct rocksdb_writablefile_t { WritableFile*     rep; };
+struct rocksdb_filelock_t     { FileLock*         rep; };
+struct rocksdb_logger_t       { shared_ptr<Logger>  rep; };
+struct rocksdb_cache_t        { shared_ptr<Cache>   rep; };
+
+// Comparator implemented by forwarding to caller-supplied C callbacks.
+// `state_` is passed back to every callback; `destructor_` is invoked on it
+// when the comparator is destroyed.
+struct rocksdb_comparator_t : public Comparator {
+  void* state_;
+  void (*destructor_)(void*);
+  int (*compare_)(
+      void*,
+      const char* a, size_t alen,
+      const char* b, size_t blen);
+  const char* (*name_)(void*);
+
+  virtual ~rocksdb_comparator_t() {
+    (*destructor_)(state_);
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
+  }
+
+  virtual const char* Name() const {
+    return (*name_)(state_);
+  }
+
+  // No-ops since the C binding does not support key shortening methods.
+  virtual void FindShortestSeparator(std::string*, const Slice&) const { }
+  virtual void FindShortSuccessor(std::string* key) const { }
+};
+
+// FilterPolicy implemented by forwarding to caller-supplied C callbacks.
+struct rocksdb_filterpolicy_t : public FilterPolicy {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+  char* (*create_)(
+      void*,
+      const char* const* key_array, const size_t* key_length_array,
+      int num_keys,
+      size_t* filter_length);
+  unsigned char (*key_match_)(
+      void*,
+      const char* key, size_t length,
+      const char* filter, size_t filter_length);
+
+  virtual ~rocksdb_filterpolicy_t() {
+    (*destructor_)(state_);
+  }
+
+  virtual const char* Name() const {
+    return (*name_)(state_);
+  }
+
+  // Flattens `keys` into parallel pointer/length arrays, asks the create_
+  // callback for a filter, appends it to *dst and free()s the callback's
+  // buffer.
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+    std::vector<const char*> key_pointers(n);
+    std::vector<size_t> key_sizes(n);
+    for (int i = 0; i < n; i++) {
+      key_pointers[i] = keys[i].data();
+      key_sizes[i] = keys[i].size();
+    }
+    size_t len;
+    // data() instead of &v[0]: indexing element 0 of an empty vector is
+    // undefined when n == 0.
+    char* filter =
+        (*create_)(state_, key_pointers.data(), key_sizes.data(), n, &len);
+    dst->append(filter, len);
+    free(filter);
+  }
+
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+    return (*key_match_)(state_, key.data(), key.size(),
+                         filter.data(), filter.size());
+  }
+};
+
+struct rocksdb_env_t {
+  Env* rep;
+  bool is_default;
+};
+
+struct rocksdb_universal_compaction_options_t {
+  rocksdb::CompactionOptionsUniversal *rep;
+};
+
+
+// Records a failed status in *errptr as a strdup()'d message, freeing any
+// message already stored there. Returns true iff `s` is not ok.
+static bool SaveError(char** errptr, const Status& s) {
+  assert(errptr != NULL);
+  if (s.ok()) {
+    return false;
+  } else if (*errptr == NULL) {
+    *errptr = strdup(s.ToString().c_str());
+  } else {
+    // TODO(sanjay): Merge with existing error?
+    free(*errptr);
+    *errptr = strdup(s.ToString().c_str());
+  }
+  return true;
+}
+
+// Returns a malloc()'d copy of the bytes of `str`. The copy is NOT
+// NUL-terminated (exactly str.size() bytes are allocated and copied), so
+// callers must track the length separately.
+static char* CopyString(const std::string& str) {
+  char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
+  memcpy(result, str.data(), sizeof(char) * str.size());
+  return result;
+}
+
+// Opens the database `name`; returns NULL and sets *errptr on failure.
+rocksdb_t* rocksdb_open(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr) {
+  DB* db;
+  if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
+    return NULL;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+void rocksdb_close(rocksdb_t* db) {
+  delete db->rep;
+  delete db;
+}
+
+void rocksdb_put(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr) {
+  SaveError(errptr,
+            db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_delete(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    char** errptr) {
+  SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
+}
+
+
+void rocksdb_write(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_t* batch,
+    char** errptr) {
+  SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+// Returns a malloc()'d copy of the value (length stored in *vallen), or NULL
+// with *vallen == 0 when the lookup fails. A NotFound status is not reported
+// through errptr; every other failure is.
+char* rocksdb_get(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr) {
+  char* result = NULL;
+  std::string tmp;
+  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+rocksdb_iterator_t* rocksdb_create_iterator(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = db->rep->NewIterator(options->rep);
+  return result;
+}
+
+const rocksdb_snapshot_t* rocksdb_create_snapshot(
+    rocksdb_t* db) {
+  rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+  result->rep = db->rep->GetSnapshot();
+  return result;
+}
+
+// Releases the underlying snapshot and deletes the handle itself.
+void rocksdb_release_snapshot(
+    rocksdb_t* db,
+    const rocksdb_snapshot_t* snapshot) {
+  db->rep->ReleaseSnapshot(snapshot->rep);
+  delete snapshot;
+}
+
+// Returns a strdup()'d, NUL-terminated property value, or NULL if
+// GetProperty() reports failure for `propname`.
+char* rocksdb_property_value(
+    rocksdb_t* db,
+    const char* propname) {
+  std::string tmp;
+  if (db->rep->GetProperty(Slice(propname), &tmp)) {
+    // We use strdup() since we expect human readable output.
+    return strdup(tmp.c_str());
+  } else {
+    return NULL;
+  }
+}
+
+// Builds `num_ranges` Range objects from the parallel start/limit arrays and
+// stores the per-range results of GetApproximateSizes() in `sizes`.
+void rocksdb_approximate_sizes(
+    rocksdb_t* db,
+    int num_ranges,
+    const char* const* range_start_key, const size_t* range_start_key_len,
+    const char* const* range_limit_key, const size_t* range_limit_key_len,
+    uint64_t* sizes) {
+  Range* ranges = new Range[num_ranges];
+  for (int i = 0; i < num_ranges; i++) {
+    ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+    ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+  }
+  db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
+  delete[] ranges;
+}
+
+void rocksdb_compact_range(
+    rocksdb_t* db,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      // Pass NULL Slice if corresponding "const char*" is NULL
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : NULL),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : NULL));
+}
+
+void rocksdb_destroy_db(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr) {
+  SaveError(errptr, DestroyDB(name, options->rep));
+}
+
+void rocksdb_repair_db(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr) {
+  SaveError(errptr, RepairDB(name, options->rep));
+}
+
+void rocksdb_iter_destroy(rocksdb_iterator_t* iter) {
+  delete iter->rep;
+  delete iter;
+}
+
+unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) {
+  return iter->rep->Valid();
+}
+
+void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) {
+  iter->rep->SeekToFirst();
+}
+
+void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) {
+  iter->rep->SeekToLast();
+}
+
+void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) {
+  iter->rep->Seek(Slice(k, klen));
+}
+
+void rocksdb_iter_next(rocksdb_iterator_t* iter) {
+  iter->rep->Next();
+}
+
+void rocksdb_iter_prev(rocksdb_iterator_t* iter) {
+  iter->rep->Prev();
+}
+
+// Returns a pointer to the iterator's current key (not a copy) and stores
+// its length in *klen.
+const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) {
+  Slice s = iter->rep->key();
+  *klen = s.size();
+  return s.data();
+}
+
+// Returns a pointer to the iterator's current value (not a copy) and stores
+// its length in *vlen.
+const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) {
+  Slice s = iter->rep->value();
+  *vlen = s.size();
+  return s.data();
+}
+
+void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) {
+  SaveError(errptr, iter->rep->status());
+}
+
+rocksdb_writebatch_t* rocksdb_writebatch_create() {
+  return new rocksdb_writebatch_t;
+}
+
+void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) {
+  delete b;
+}
+
+void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) {
+  b->rep.Clear();
+}
+
+void rocksdb_writebatch_put(
+    rocksdb_writebatch_t* b,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep.Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_delete(
+    rocksdb_writebatch_t* b,
+    const char* key, size_t klen) {
+  b->rep.Delete(Slice(key, klen));
+}
+
+// Replays the batch through a local WriteBatch::Handler that forwards each
+// Put/Delete to the `put` / `deleted` callbacks together with `state`.
+void rocksdb_writebatch_iterate(
+    rocksdb_writebatch_t* b,
+    void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen)) {
+  class H : public WriteBatch::Handler {
+   public:
+    void* state_;
+    void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+    void (*deleted_)(void*, const char* k, size_t klen);
+    virtual void Put(const Slice& key, const Slice& value) {
+      (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+    }
+    virtual void Delete(const Slice& key) {
+      (*deleted_)(state_, key.data(), key.size());
+    }
+  };
+  H handler;
+  handler.state_ = state;
+  handler.put_ = put;
+  handler.deleted_ = deleted;
+  b->rep.Iterate(&handler);
+}
+
+rocksdb_options_t* rocksdb_options_create() {
+  return new rocksdb_options_t;
+}
+
+void rocksdb_options_destroy(rocksdb_options_t* options) {
+  delete options;
+}
+
+// The setters below each copy one argument into the matching field of the
+// wrapped Options object.
+
+void rocksdb_options_set_comparator(
+    rocksdb_options_t* opt,
+    rocksdb_comparator_t* cmp) {
+  opt->rep.comparator = cmp;
+}
+
+void rocksdb_options_set_filter_policy(
+    rocksdb_options_t* opt,
+    rocksdb_filterpolicy_t* policy) {
+  opt->rep.filter_policy = policy;
+}
+
+void rocksdb_options_set_create_if_missing(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.create_if_missing = v;
+}
+
+void rocksdb_options_set_error_if_exists(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.error_if_exists = v;
+}
+
+void rocksdb_options_set_paranoid_checks(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.paranoid_checks = v;
+}
+
+void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) {
+  opt->rep.env = (env ? env->rep : NULL);
+}
+
+// A NULL logger leaves the current info_log untouched.
+void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) {
+  if (l) {
+    opt->rep.info_log = l->rep;
+  }
+}
+
+void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
+  opt->rep.write_buffer_size = s;
+}
+
+void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
+  opt->rep.max_open_files = n;
+}
+
+// A NULL cache leaves the current block_cache untouched.
+void rocksdb_options_set_cache(rocksdb_options_t* opt, rocksdb_cache_t* c) {
+  if (c) {
+    opt->rep.block_cache = c->rep;
+  }
+}
+
+void rocksdb_options_set_block_size(rocksdb_options_t* opt, size_t s) {
+  opt->rep.block_size = s;
+}
+
+void rocksdb_options_set_block_restart_interval(rocksdb_options_t* opt, int n) {
+  opt->rep.block_restart_interval = n;
+}
+
+void rocksdb_options_set_target_file_size_base(
+    rocksdb_options_t* opt, uint64_t n) {
+  opt->rep.target_file_size_base = n;
+}
+
+void rocksdb_options_set_target_file_size_multiplier(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.target_file_size_multiplier = n;
+}
+
+void rocksdb_options_set_max_bytes_for_level_base(
+    rocksdb_options_t* opt, uint64_t n) {
+  opt->rep.max_bytes_for_level_base = n;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.max_bytes_for_level_multiplier = n;
+}
+
+void rocksdb_options_set_expanded_compaction_factor(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.expanded_compaction_factor = n;
+}
+
+void rocksdb_options_set_max_grandparent_overlap_factor(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.max_grandparent_overlap_factor = n;
+}
+
+void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
+  opt->rep.num_levels = n;
+}
+
+void rocksdb_options_set_level0_file_num_compaction_trigger(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.level0_file_num_compaction_trigger = n;
+}
+
+void rocksdb_options_set_level0_slowdown_writes_trigger(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.level0_slowdown_writes_trigger = n;
+}
+
+void rocksdb_options_set_level0_stop_writes_trigger(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.level0_stop_writes_trigger = n;
+}
+
+void rocksdb_options_set_max_mem_compaction_level(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.max_mem_compaction_level = n;
+}
+
+void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) {
+  opt->rep.compression = static_cast<CompressionType>(t);
+}
+
+// Replaces compression_per_level with `num_levels` entries converted from
+// `level_values`.
+void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt,
+                                               int* level_values,
+                                               size_t num_levels) {
+  opt->rep.compression_per_level.resize(num_levels);
+  for (size_t i = 0; i < num_levels; ++i) {
+    opt->rep.compression_per_level[i] =
+        static_cast<CompressionType>(level_values[i]);
+  }
+}
+
+void rocksdb_options_set_compression_options(
+    rocksdb_options_t* opt, int w_bits, int level, int strategy) {
+  opt->rep.compression_opts.window_bits = w_bits;
+  opt->rep.compression_opts.level = level;
+  opt->rep.compression_opts.strategy = strategy;
+}
+
+void rocksdb_options_set_disable_data_sync(
+    rocksdb_options_t* opt, int disable_data_sync) {
+  opt->rep.disableDataSync = disable_data_sync;
+}
+
+void rocksdb_options_set_use_fsync(
+    rocksdb_options_t* opt, int use_fsync) {
+  opt->rep.use_fsync = use_fsync;
+}
+
+void rocksdb_options_set_db_stats_log_interval(
+    rocksdb_options_t* opt, int db_stats_log_interval) {
+  opt->rep.db_stats_log_interval = db_stats_log_interval;
+}
+
+void rocksdb_options_set_db_log_dir(
+    rocksdb_options_t* opt, const char* db_log_dir) {
+  opt->rep.db_log_dir = db_log_dir;
+}
+
+void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) {
+  opt->rep.WAL_ttl_seconds = ttl;
+}
+
+void rocksdb_options_set_WAL_size_limit_MB(
+    rocksdb_options_t* opt, uint64_t limit) {
+  opt->rep.WAL_size_limit_MB = limit;
+}
+
+void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, int n) {
+  opt->rep.max_write_buffer_number = n;
+}
+
+void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t* opt, int n) {
+  opt->rep.min_write_buffer_number_to_merge = n;
+}
+
+void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) {
+  opt->rep.max_background_compactions = n;
+}
+
+void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) {
+  opt->rep.max_background_flushes = n;
+}
+
+void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int disable) {
+  opt->rep.disable_auto_compactions = disable;
+}
+
+void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t* opt, int disable) {
+  opt->rep.disable_seek_compaction = disable;
+}
+
+void rocksdb_options_set_source_compaction_factor(
+    rocksdb_options_t* opt, int n) {
+  // Set the option this setter is named after; it previously overwrote
+  // expanded_compaction_factor, which has its own setter above.
+  opt->rep.source_compaction_factor = n;
+}
+
+void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) {
+  opt->rep.PrepareForBulkLoad();
+}
+
+void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t *opt) {
+  // reset() hands ownership of the raw pointer to memtable_factory, so each
+  // options object must receive its own factory. Sharing one function-static
+  // instance between several owners would delete it more than once.
+  opt->rep.memtable_factory.reset(new rocksdb::VectorRepFactory);
+}
+
+void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) {
+  opt->rep.compaction_style = static_cast<rocksdb::CompactionStyle>(style);
+}
+
+// Copies the universal-compaction settings held by `uco` into the options.
+void rocksdb_options_set_universal_compaction_options(rocksdb_options_t *opt, rocksdb_universal_compaction_options_t *uco) {
+  opt->rep.compaction_options_universal = *(uco->rep);
+}
+
+/*
+TODO:
+merge_operator
+compaction_filter
+prefix_extractor
+whole_key_filtering
+max_bytes_for_level_multiplier_additional
+delete_obsolete_files_period_micros
+max_log_file_size
+log_file_time_to_roll
+keep_log_file_num
+soft_rate_limit
+hard_rate_limit
+rate_limit_delay_max_milliseconds
+max_manifest_file_size
+no_block_cache
+table_cache_numshardbits
+table_cache_remove_scan_count_limit
+arena_block_size
+manifest_preallocation_size
+purge_redundant_kvs_while_flush
+allow_os_buffer
+allow_mmap_reads
+allow_mmap_writes
+is_fd_close_on_exec
+skip_log_error_on_recovery
+stats_dump_period_sec
+block_size_deviation
+advise_random_on_open
+access_hint_on_compaction_start
+use_adaptive_mutex
+bytes_per_sync
+filter_deletes
+max_sequential_skip_in_iterations
+table_factory
+table_properties_collectors
+inplace_update_support
+inplace_update_num_locks
+*/
+
+// Creates a comparator backed by the given callbacks. `state` is passed to
+// every callback and to `destructor` when the comparator is destroyed.
+rocksdb_comparator_t* rocksdb_comparator_create(
+    void* state,
+    void (*destructor)(void*),
+    int (*compare)(
+        void*,
+        const char* a, size_t alen,
+        const char* b, size_t blen),
+    const char* (*name)(void*)) {
+  rocksdb_comparator_t* result = new rocksdb_comparator_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->compare_ = compare;
+  result->name_ = name;
+  return result;
+}
+
+void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) {
+  delete cmp;
+}
+
+// Creates a filter policy backed by the given callbacks. The buffer returned
+// by `create_filter` is released with free() by the wrapper after use.
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*create_filter)(
+        void*,
+        const char* const* key_array, const size_t* key_length_array,
+        int num_keys,
+        size_t* filter_length),
+    unsigned char (*key_may_match)(
+        void*,
+        const char* key, size_t length,
+        const char* filter, size_t filter_length),
+    const char* (*name)(void*)) {
+  rocksdb_filterpolicy_t* result = new rocksdb_filterpolicy_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->create_ = create_filter;
+  result->key_match_ = key_may_match;
+  result->name_ = name;
+  return result;
+}
+
+void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) {
+  delete filter;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) {
+  // Make a rocksdb_filterpolicy_t, but override all of its methods so
+  // they delegate to a NewBloomFilterPolicy() instead of user
+  // supplied C functions.
+  struct Wrapper : public rocksdb_filterpolicy_t {
+    const FilterPolicy* rep_;
+    ~Wrapper() { delete rep_; }
+    const char* Name() const { return rep_->Name(); }
+    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+      return rep_->CreateFilter(keys, n, dst);
+    }
+    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+      return rep_->KeyMayMatch(key, filter);
+    }
+    static void DoNothing(void*) { }
+  };
+  Wrapper* wrapper = new Wrapper;
+  wrapper->rep_ = NewBloomFilterPolicy(bits_per_key);
+  // The base-class destructor still calls destructor_(state_), so give it a
+  // harmless callback and a NULL state.
+  wrapper->state_ = NULL;
+  wrapper->destructor_ = &Wrapper::DoNothing;
+  return wrapper;
+}
+
+rocksdb_readoptions_t* rocksdb_readoptions_create() {
+  return new rocksdb_readoptions_t;
+}
+
+void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) {
+  delete opt;
+}
+
+void rocksdb_readoptions_set_verify_checksums(
+    rocksdb_readoptions_t* opt,
+    unsigned char v) {
+  opt->rep.verify_checksums = v;
+}
+
+void rocksdb_readoptions_set_fill_cache(
+    rocksdb_readoptions_t* opt, unsigned char v) {
+  opt->rep.fill_cache = v;
+}
+
+// Passing a NULL `snap` clears the snapshot on the read options.
+void rocksdb_readoptions_set_snapshot(
+    rocksdb_readoptions_t* opt,
+    const rocksdb_snapshot_t* snap) {
+  opt->rep.snapshot = (snap ? snap->rep : NULL);
+}
+
+rocksdb_writeoptions_t* rocksdb_writeoptions_create() {
+  return new rocksdb_writeoptions_t;
+}
+
+void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) {
+  delete opt;
+}
+
+void rocksdb_writeoptions_set_sync(
+    rocksdb_writeoptions_t* opt, unsigned char v) {
+  opt->rep.sync = v;
+}
+
+void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable) {
+  opt->rep.disableWAL = disable;
+}
+
+
+rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) {
+  rocksdb_cache_t* c = new rocksdb_cache_t;
+  c->rep = NewLRUCache(capacity);
+  return c;
+}
+
+void rocksdb_cache_destroy(rocksdb_cache_t* cache) {
+  delete cache;
+}
+
+rocksdb_env_t* rocksdb_create_default_env() {
+  rocksdb_env_t* result = new rocksdb_env_t;
+  result->rep = Env::Default();
+  result->is_default = true;
+  return result;
+}
+
+void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) {
+  env->rep->SetBackgroundThreads(n);
+}
+
+void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) {
+  env->rep->SetBackgroundThreads(n, Env::HIGH);
+}
+
+// Deletes the handle; the wrapped Env is deleted too unless it is the one
+// obtained from Env::Default().
+void rocksdb_env_destroy(rocksdb_env_t* env) {
+  if (!env->is_default) delete env->rep;
+  delete env;
+}
+
+rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() {
+  rocksdb_universal_compaction_options_t* result = new rocksdb_universal_compaction_options_t;
+  result->rep = new rocksdb::CompactionOptionsUniversal;
+  return result;
+}
+
+void rocksdb_universal_compaction_options_set_size_ratio(
+  rocksdb_universal_compaction_options_t* uco, int ratio) {
+  uco->rep->size_ratio = ratio;
+}
+
+void rocksdb_universal_compaction_options_set_min_merge_width(
+  rocksdb_universal_compaction_options_t* uco, int w) {
+  uco->rep->min_merge_width = w;
+}
+
+void rocksdb_universal_compaction_options_set_max_merge_width(
+  rocksdb_universal_compaction_options_t* uco, int w) {
+  uco->rep->max_merge_width = w;
+}
+
+void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+  rocksdb_universal_compaction_options_t* uco, int p) {
+  uco->rep->max_size_amplification_percent = p;
+}
+
+void rocksdb_universal_compaction_options_set_compression_size_percent(
+  rocksdb_universal_compaction_options_t* uco, int p) {
+  uco->rep->compression_size_percent = p;
+}
+
+void rocksdb_universal_compaction_options_set_stop_style(
+  rocksdb_universal_compaction_options_t* uco, int style) {
+  uco->rep->stop_style = static_cast<rocksdb::CompactionStopStyle>(style);
+}
+
+void rocksdb_universal_compaction_options_destroy(
+  rocksdb_universal_compaction_options_t* uco) {
+  delete uco->rep;
+  delete uco;
+}
+
+}  // end extern "C"
diff --git a/db/c_test.c b/db/c_test.c
new file mode 100644
index 00000000..8c5e8e53
--- /dev/null
+++ b/db/c_test.c
@@ -0,0 +1,390 @@
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+   Use of this source code is governed by a BSD-style license that can be
+   found in the LICENSE file. See the AUTHORS file for names of contributors.
*/
+
+#include "rocksdb/c.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+/* Name of the test phase currently running; printed in failure messages. */
+const char* phase = "";
+static char dbname[200];
+
+static void StartPhase(const char* name) {
+  fprintf(stderr, "=== Test %s\n", name);
+  phase = name;
+}
+
+/* Returns $TEST_TMPDIR when it is set and non-empty, otherwise "/tmp". */
+static const char* GetTempDir(void) {
+    const char* ret = getenv("TEST_TMPDIR");
+    if (ret == NULL || ret[0] == '\0')
+        ret = "/tmp";
+    return ret;
+}
+
+/* Aborts with file/line/phase context when `err` is non-NULL. */
+#define CheckNoError(err)                                               \
+  if ((err) != NULL) {                                                  \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
+    abort();                                                            \
+  }
+
+/* Aborts with file/line/phase context when `cond` is false. */
+#define CheckCondition(cond)                                            \
+  if (!(cond)) {                                                        \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
+    abort();                                                            \
+  }
+
+/* Aborts unless the n bytes at `v` equal the C string `expected` (or both
+   pointers are NULL). `v` is not required to be NUL-terminated. */
+static void CheckEqual(const char* expected, const char* v, size_t n) {
+  if (expected == NULL && v == NULL) {
+    // ok
+  } else if (expected != NULL && v != NULL && n == strlen(expected) &&
+             memcmp(expected, v, n) == 0) {
+    // ok
+    return;
+  } else {
+    // Print at most n bytes of v: buffers returned by rocksdb_get() and the
+    // iterator accessors carry an explicit length and no terminating NUL.
+    fprintf(stderr, "%s: expected '%s', got '%.*s'\n",
+            phase,
+            (expected ? expected : "(null)"),
+            (v ? (int) n : 6),
+            (v ? v : "(null)"));
+    abort();
+  }
+}
+
+/* free()s *ptr if it is non-NULL and resets it to NULL. */
+static void Free(char** ptr) {
+  if (*ptr) {
+    free(*ptr);
+    *ptr = NULL;
+  }
+}
+
+/* Looks up `key` and checks the result against `expected`
+   (NULL means "no value"). */
+static void CheckGet(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    const char* key,
+    const char* expected) {
+  char* err = NULL;
+  size_t val_len;
+  char* val;
+  val = rocksdb_get(db, options, key, strlen(key), &val_len, &err);
+  CheckNoError(err);
+  CheckEqual(expected, val, val_len);
+  Free(&val);
+}
+
+/* Checks that the iterator's current entry is (key, val). */
+static void CheckIter(rocksdb_iterator_t* iter,
+                      const char* key, const char* val) {
+  size_t len;
+  const char* str;
+  str = rocksdb_iter_key(iter, &len);
+  CheckEqual(key, str, len);
+  str = rocksdb_iter_value(iter, &len);
+  CheckEqual(val, str, len);
+}
+
+// Callback from rocksdb_writebatch_iterate()
+static void CheckPut(void* ptr,
+                     const char* k, size_t klen,
+                     const char* v, size_t vlen) {
+  int* state = (int*) ptr;
+  CheckCondition(*state < 2);
+  switch (*state) {
+    case 0:
+      CheckEqual("bar", k, klen);
+      CheckEqual("b", v, vlen);
+      break;
+    case 1:
+      CheckEqual("box", k, klen);
+      CheckEqual("c", v, vlen);
+      break;
+  }
+  (*state)++;
+}
+
+// Callback from rocksdb_writebatch_iterate()
+static void CheckDel(void* ptr, const char* k, size_t klen) {
+  int* state = (int*) ptr;
+  CheckCondition(*state == 2);
+  CheckEqual("bar", k, klen);
+  (*state)++;
+}
+
+static void CmpDestroy(void* arg) { }
+
+/* Bytewise comparison; when one key is a prefix of the other, the shorter
+   key sorts first. */
+static int CmpCompare(void* arg, const char* a, size_t alen,
+                      const char* b, size_t blen) {
+  // Keep the common length in size_t: storing it in an int would narrow it.
+  size_t n = (alen < blen) ? alen : blen;
+  int r = memcmp(a, b, n);
+  if (r == 0) {
+    if (alen < blen) r = -1;
+    else if (alen > blen) r = +1;
+  }
+  return r;
+}
+
+static const char* CmpName(void* arg) {
+  return "foo";
+}
+
+// Custom filter policy
+static unsigned char fake_filter_result = 1;
+static void FilterDestroy(void* arg) { }
+static const char* FilterName(void* arg) {
+  return "TestFilter";
+}
+/* Always produces the 4-byte filter "fake", whatever the keys are. */
+static char* FilterCreate(
+    void* arg,
+    const char* const* key_array, const size_t* key_length_array,
+    int num_keys,
+    size_t* filter_length) {
+  *filter_length = 4;
+  char* result = malloc(4);
+  memcpy(result, "fake", 4);
+  return result;
+}
+/* Verifies the filter is the one FilterCreate() made, then answers with the
+   current value of fake_filter_result. */
+unsigned char FilterKeyMatch(
+    void* arg,
+    const char* key, size_t length,
+    const char* filter, size_t filter_length) {
+  CheckCondition(filter_length == 4);
+  CheckCondition(memcmp(filter, "fake", 4) == 0);
+  return fake_filter_result;
+}
+
+int main(int argc, char** argv) {
+  rocksdb_t* db;
+  rocksdb_comparator_t* cmp;
+  rocksdb_cache_t* cache;
+  rocksdb_env_t* env;
+  rocksdb_options_t* options;
+  rocksdb_readoptions_t* roptions;
+  rocksdb_writeoptions_t* woptions;
+  char* err = NULL;
+  int run = -1;
+
+  snprintf(dbname, sizeof(dbname),
+           "%s/rocksdb_c_test-%d",
+           GetTempDir(),
+           ((int) geteuid()));
+
+  StartPhase("create_objects");
+  cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
+  env = rocksdb_create_default_env();
+  cache = rocksdb_cache_create_lru(100000);
+
+  options = rocksdb_options_create();
+  rocksdb_options_set_comparator(options, cmp);
+  rocksdb_options_set_error_if_exists(options, 1);
+  rocksdb_options_set_cache(options, cache);
+  rocksdb_options_set_env(options, env);
+  rocksdb_options_set_info_log(options, NULL);
+  rocksdb_options_set_write_buffer_size(options, 100000);
+  rocksdb_options_set_paranoid_checks(options, 1);
+  rocksdb_options_set_max_open_files(options, 10);
+  rocksdb_options_set_block_size(options, 1024);
+  rocksdb_options_set_block_restart_interval(options, 8);
+  rocksdb_options_set_compression(options, rocksdb_no_compression);
+  rocksdb_options_set_compression_options(options, -14, -1, 0);
+  int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression,
+                              rocksdb_no_compression, rocksdb_no_compression};
+  rocksdb_options_set_compression_per_level(options, compression_levels, 4);
+
+  roptions = rocksdb_readoptions_create();
+  rocksdb_readoptions_set_verify_checksums(roptions, 1);
+  rocksdb_readoptions_set_fill_cache(roptions, 0);
+
+  woptions = rocksdb_writeoptions_create();
+  rocksdb_writeoptions_set_sync(woptions, 1);
+
+  StartPhase("destroy");
+  rocksdb_destroy_db(options, dbname, &err);
+  Free(&err);
+
+  StartPhase("open_error");
+  db = rocksdb_open(options, dbname, &err);
+  CheckCondition(err != NULL);
+  Free(&err);
+
+  StartPhase("open");
+  rocksdb_options_set_create_if_missing(options, 1);
+  db = rocksdb_open(options, dbname, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "foo", NULL);
+
+  StartPhase("put");
+  rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("compactall");
+  rocksdb_compact_range(db, NULL, 0, NULL, 0);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("compactrange");
+  rocksdb_compact_range(db, "a", 1, "z", 1);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("writebatch");
+  {
+    rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+    rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
+    rocksdb_writebatch_clear(wb);
+    rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+    rocksdb_writebatch_put(wb, "box", 3, "c", 1);
+    rocksdb_writebatch_delete(wb, "bar", 3);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", "hello");
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "box", "c");
+    int pos = 0;
+    rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
+    CheckCondition(pos == 3);
+    rocksdb_writebatch_destroy(wb);
+  }
+
+  StartPhase("iter");
+  {
+    rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_next(iter);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_prev(iter);
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_prev(iter);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_last(iter);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_seek(iter, "b", 1);
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+  }
+
+  StartPhase("approximate_sizes");
+  {
+    int i;
+    int n = 20000;
+    char keybuf[100];
+    char valbuf[100];
+    uint64_t sizes[2];
+    const char* start[2] = { "a", "k00000000000000010000" };
+    size_t start_len[2] = { 1, 21 };
+    const char* limit[2] = { "k00000000000000010000", "z" };
+    size_t limit_len[2] = { 21, 1 };
+    rocksdb_writeoptions_set_sync(woptions, 0);
+    for (i = 0; i < n; i++) {
+      snprintf(keybuf, sizeof(keybuf), "k%020d", i);
+      snprintf(valbuf, sizeof(valbuf), "v%020d", i);
+      rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
+                  &err);
+      CheckNoError(err);
+    }
+    rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
+    CheckCondition(sizes[0] > 0);
+    CheckCondition(sizes[1] > 0);
+  }
+
+  StartPhase("property");
+  {
+    char* prop = rocksdb_property_value(db, "nosuchprop");
+    CheckCondition(prop == NULL);
+    prop = rocksdb_property_value(db, "rocksdb.stats");
+    CheckCondition(prop != NULL);
+    Free(&prop);
+  }
+
+  StartPhase("snapshot");
+  {
+    const rocksdb_snapshot_t* snap;
+    snap = rocksdb_create_snapshot(db);
+    rocksdb_delete(db, woptions, "foo", 3, &err);
+    CheckNoError(err);
+    rocksdb_readoptions_set_snapshot(roptions, snap);
+    CheckGet(db, roptions, "foo", "hello");
+    rocksdb_readoptions_set_snapshot(roptions, NULL);
+    CheckGet(db, roptions, "foo", NULL);
+    rocksdb_release_snapshot(db, snap);
+  }
+
+  StartPhase("repair");
+  {
+    // If we do not compact here, then the lazy deletion of
+    // files (https://reviews.facebook.net/D6123) would leave
+    // around deleted files and the repair process will find
+    // those files and put them back into the database.
+    rocksdb_compact_range(db, NULL, 0, NULL, 0);
+    rocksdb_close(db);
+    rocksdb_options_set_create_if_missing(options, 0);
+    rocksdb_options_set_error_if_exists(options, 0);
+    rocksdb_repair_db(options, dbname, &err);
+    CheckNoError(err);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", NULL);
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "box", "c");
+    rocksdb_options_set_create_if_missing(options, 1);
+    rocksdb_options_set_error_if_exists(options, 1);
+  }
+
+  StartPhase("filter");
+  for (run = 0; run < 2; run++) {
+    // First run uses custom filter, second run uses bloom filter
+    CheckNoError(err);
+    rocksdb_filterpolicy_t* policy;
+    if (run == 0) {
+      policy = rocksdb_filterpolicy_create(
+          NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName);
+    } else {
+      policy = rocksdb_filterpolicy_create_bloom(10);
+    }
+
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    rocksdb_options_set_filter_policy(options, policy);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+    rocksdb_compact_range(db, NULL, 0, NULL, 0);
+
+    fake_filter_result = 1;
+    CheckGet(db, roptions, "foo", "foovalue");
+    CheckGet(db, roptions, "bar", "barvalue");
+    // Select on `run`, not `phase`: `phase` is the non-NULL phase name, so
+    // comparing it with 0 never ran this check.
+    if (run == 0) {
+      // Must not find value when custom filter returns false
+      fake_filter_result = 0;
+      CheckGet(db, roptions, "foo", NULL);
+      CheckGet(db, roptions, "bar", NULL);
+      fake_filter_result = 1;
+
+      CheckGet(db, roptions, "foo", "foovalue");
+      CheckGet(db, roptions, "bar", "barvalue");
+    }
+    rocksdb_options_set_filter_policy(options, NULL);
+    rocksdb_filterpolicy_destroy(policy);
+  }
+
+  StartPhase("cleanup");
+  rocksdb_close(db);
+  rocksdb_options_destroy(options);
+  rocksdb_readoptions_destroy(roptions);
+  rocksdb_writeoptions_destroy(woptions);
+  rocksdb_cache_destroy(cache);
+  rocksdb_comparator_destroy(cmp);
+  rocksdb_env_destroy(env);
+
+  fprintf(stderr, "PASS\n");
+  return 0;
+}
diff --git a/db/corruption_test.cc b/db/corruption_test.cc
new file mode 100644
index 00000000..e7b7b4c8
--- /dev/null
+++ b/db/corruption_test.cc
@@ -0,0 +1,378 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "rocksdb/db.h" + +#include +#include +#include +#include +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "rocksdb/write_batch.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/log_format.h" +#include "db/version_set.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +static const int kValueSize = 1000; + +class CorruptionTest { + public: + test::ErrorEnv env_; + std::string dbname_; + shared_ptr tiny_cache_; + Options options_; + DB* db_; + + CorruptionTest() { + tiny_cache_ = NewLRUCache(100); + options_.env = &env_; + dbname_ = test::TmpDir() + "/db_test"; + DestroyDB(dbname_, options_); + + db_ = nullptr; + options_.create_if_missing = true; + options_.block_size_deviation = 0; // make unit test pass for now + Reopen(); + options_.create_if_missing = false; + } + + ~CorruptionTest() { + delete db_; + DestroyDB(dbname_, Options()); + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opt = (options ? 
*options : options_); + opt.env = &env_; + opt.block_cache = tiny_cache_; + opt.block_size_deviation = 0; + opt.arena_block_size = 4096; + return DB::Open(opt, dbname_, &db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void RepairDB() { + delete db_; + db_ = nullptr; + ASSERT_OK(::rocksdb::RepairDB(dbname_, options_)); + } + + void Build(int n) { + std::string key_space, value_space; + WriteBatch batch; + for (int i = 0; i < n; i++) { + //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); + Slice key = Key(i, &key_space); + batch.Clear(); + batch.Put(key, Value(i, &value_space)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + } + } + + void Check(int min_expected, int max_expected) { + unsigned int next_expected = 0; + int missed = 0; + int bad_keys = 0; + int bad_values = 0; + int correct = 0; + std::string value_space; + Iterator* iter = db_->NewIterator(ReadOptions()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + uint64_t key; + Slice in(iter->key()); + if (!ConsumeDecimalNumber(&in, &key) || + !in.empty() || + key < next_expected) { + bad_keys++; + continue; + } + missed += (key - next_expected); + next_expected = key + 1; + if (iter->value() != Value(key, &value_space)) { + bad_values++; + } else { + correct++; + } + } + delete iter; + + fprintf(stderr, + "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n", + min_expected, max_expected, correct, bad_keys, bad_values, missed); + ASSERT_LE(min_expected, correct); + ASSERT_GE(max_expected, correct); + } + + void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { + // Pick file to corrupt + std::vector filenames; + ASSERT_OK(env_.GetChildren(dbname_, &filenames)); + uint64_t number; + FileType type; + std::string fname; + int picked_number = -1; + for (unsigned int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && + type == filetype && + int(number) > picked_number) { // 
Pick latest file + fname = dbname_ + "/" + filenames[i]; + picked_number = number; + } + } + ASSERT_TRUE(!fname.empty()) << filetype; + + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + const char* msg = strerror(errno); + ASSERT_TRUE(false) << fname << ": " << msg; + } + + if (offset < 0) { + // Relative to end of file; make it absolute + if (-offset > sbuf.st_size) { + offset = 0; + } else { + offset = sbuf.st_size + offset; + } + } + if (offset > sbuf.st_size) { + offset = sbuf.st_size; + } + if (offset + bytes_to_corrupt > sbuf.st_size) { + bytes_to_corrupt = sbuf.st_size - offset; + } + + // Do it + std::string contents; + Status s = ReadFileToString(Env::Default(), fname, &contents); + ASSERT_TRUE(s.ok()) << s.ToString(); + for (int i = 0; i < bytes_to_corrupt; i++) { + contents[i + offset] ^= 0x80; + } + s = WriteStringToFile(Env::Default(), contents, fname); + ASSERT_TRUE(s.ok()) << s.ToString(); + } + + int Property(const std::string& name) { + std::string property; + int result; + if (db_->GetProperty(name, &property) && + sscanf(property.c_str(), "%d", &result) == 1) { + return result; + } else { + return -1; + } + } + + // Return the ith key + Slice Key(int i, std::string* storage) { + char buf[100]; + snprintf(buf, sizeof(buf), "%016d", i); + storage->assign(buf, strlen(buf)); + return Slice(*storage); + } + + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) { + Random r(k); + return test::RandomString(&r, kValueSize, storage); + } +}; + +TEST(CorruptionTest, Recovery) { + Build(100); + Check(100, 100); + Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record + Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block + Reopen(); + + // The 64 records in the first two log blocks are completely lost. 
+ Check(36, 36); +} + +TEST(CorruptionTest, RecoverWriteError) { + env_.writable_file_error_ = true; + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); +} + +TEST(CorruptionTest, NewFileErrorDuringWrite) { + // Do enough writing to force minor compaction + env_.writable_file_error_ = true; + const int num = 3 + (Options().write_buffer_size / kValueSize); + std::string value_storage; + Status s; + bool failed = false; + for (int i = 0; i < num; i++) { + WriteBatch batch; + batch.Put("a", Value(100, &value_storage)); + s = db_->Write(WriteOptions(), &batch); + if (!s.ok()) { + failed = true; + } + ASSERT_TRUE(!failed || !s.ok()); + } + ASSERT_TRUE(!s.ok()); + ASSERT_GE(env_.num_writable_file_errors_, 1); + env_.writable_file_error_ = false; + Reopen(); +} + +TEST(CorruptionTest, TableFile) { + Build(100); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + dbi->TEST_CompactRange(0, nullptr, nullptr); + dbi->TEST_CompactRange(1, nullptr, nullptr); + + Corrupt(kTableFile, 100, 1); + Check(99, 99); +} + +TEST(CorruptionTest, TableFileIndexData) { + Build(10000); // Enough to build multiple Tables + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + + Corrupt(kTableFile, -2000, 500); + Reopen(); + Check(5000, 9999); +} + +TEST(CorruptionTest, MissingDescriptor) { + Build(1000); + RepairDB(); + Reopen(); + Check(1000, 1000); +} + +TEST(CorruptionTest, SequenceNumberRecovery) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5")); + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v5", v); + // Write something. If sequence number was not recovered properly, + // it will be hidden by an earlier write. 
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6")); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); + Reopen(); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); +} + +TEST(CorruptionTest, CorruptedDescriptor) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + dbi->TEST_CompactRange(0, nullptr, nullptr); + + Corrupt(kDescriptorFile, 0, 1000); + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); + + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("hello", v); +} + +TEST(CorruptionTest, CompactionInputError) { + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + const int last = dbi->MaxMemCompactionLevel(); + ASSERT_EQ(1, Property("rocksdb.num-files-at-level" + NumberToString(last))); + + Corrupt(kTableFile, 100, 1); + Check(9, 9); + + // Force compactions by writing lots of values + Build(10000); + Check(10000, 10000); +} + +TEST(CorruptionTest, CompactionInputErrorParanoid) { + Options options; + options.paranoid_checks = true; + options.write_buffer_size = 1048576; + Reopen(&options); + DBImpl* dbi = reinterpret_cast(db_); + + // Fill levels >= 1 so memtable compaction outputs to level 1 + for (int level = 1; level < dbi->NumberLevels(); level++) { + dbi->Put(WriteOptions(), "", "begin"); + dbi->Put(WriteOptions(), "~", "end"); + dbi->TEST_FlushMemTable(); + } + + Build(10); + dbi->TEST_FlushMemTable(); + dbi->TEST_WaitForCompact(); + ASSERT_EQ(1, Property("rocksdb.num-files-at-level0")); + + Corrupt(kTableFile, 100, 1); + Check(9, 9); + + // Write must eventually fail because of corrupted table + Status s; + std::string tmp1, tmp2; + bool failed = false; + for (int i = 0; i < 10000 && s.ok(); i++) { + s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2)); + if (!s.ok()) { + failed = true; + } + // if one write failed, every subsequent write must fail, too + 
ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db"; + } + ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; +} + +TEST(CorruptionTest, UnrelatedKeys) { + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + Corrupt(kTableFile, 100, 1); + + std::string tmp1, tmp2; + ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); + dbi->TEST_FlushMemTable(); + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/db_bench.cc b/db/db_bench.cc new file mode 100644 index 00000000..e0ba5828 --- /dev/null +++ b/db/db_bench.cc @@ -0,0 +1,2585 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include +#include +#include +#include +#include "db/db_impl.h" +#include "db/version_set.h" +#include "db/db_statistics.h" +#include "rocksdb/options.h" +#include "rocksdb/cache.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/statistics.h" +#include "port/port.h" +#include "util/bit_set.h" +#include "util/crc32c.h" +#include "util/histogram.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/stack_trace.h" +#include "util/string_util.h" +#include "util/testutil.h" +#include "hdfs/env_hdfs.h" +#include "utilities/merge_operators.h" + + +DEFINE_string(benchmarks, + + "fillseq," + "fillsync," + "fillrandom," + "overwrite," + "readrandom," + "readrandom," + "readseq," + "readreverse," + "compact," + "readrandom," + "readseq," + "readtocache," + "readreverse," + "readwhilewriting," + "readrandomwriterandom," + "updaterandom," + "randomwithverify," + "fill100K," + "crc32c," + "snappycomp," + "snappyuncomp," + "acquireload," + "fillfromstdin,", + + "Comma-separated list of operations to run in the specified order" + "Actual benchmarks:\n" + "\tfillseq -- write N values in sequential key" + " order in async mode\n" + "\tfillrandom -- write N values in random key order in async" + " mode\n" + "\toverwrite -- overwrite N values in random key order in" + " async mode\n" + "\tfillsync -- write N/100 values in random key order in " + "sync mode\n" + "\tfill100K -- write N/1000 100K values in random order in" + " async mode\n" + "\tdeleteseq -- delete N keys in sequential order\n" + "\tdeleterandom -- delete N keys in random order\n" + "\treadseq -- read N times sequentially\n" + "\treadtocache -- 1 thread reading database sequentially\n" + "\treadreverse -- read N times in reverse order\n" + "\treadrandom -- read N times in random order\n" + "\treadmissing -- read N missing keys in random order\n" + "\treadhot -- read N times in random order from 1% 
section " + "of DB\n" + "\treadwhilewriting -- 1 writer, N threads doing random " + "reads\n" + "\treadrandomwriterandom -- N threads doing random-read, " + "random-write\n" + "\tprefixscanrandom -- prefix scan N times in random order\n" + "\tupdaterandom -- N threads doing read-modify-write for random " + "keys\n" + "\tappendrandom -- N threads doing read-modify-write with " + "growing values\n" + "\tmergerandom -- same as updaterandom/appendrandom using merge" + " operator. " + "Must be used with merge_operator\n" + "\treadrandommergerandom -- perform N random read-or-merge " + "operations. Must be used with merge_operator\n" + "\tseekrandom -- N random seeks\n" + "\tcrc32c -- repeated crc32c of 4K of data\n" + "\tacquireload -- load N*1000 times\n" + "Meta operations:\n" + "\tcompact -- Compact the entire DB\n" + "\tstats -- Print DB stats\n" + "\tlevelstats -- Print the number of files and bytes per level\n" + "\tsstables -- Print sstable info\n" + "\theapprofile -- Dump a heap profile (if supported by this" + " port)\n"); + +DEFINE_int64(num, 1000000, "Number of key/values to place in database"); + +DEFINE_int64(numdistinct, 1000, + "Number of distinct keys to use. Used in RandomWithVerify to " + "read/write on fewer keys so that gets are more likely to find the" + " key and puts are more likely to update the same key"); + +DEFINE_int64(merge_keys, -1, + "Number of distinct keys to use for MergeRandom and " + "ReadRandomMergeRandom. " + "If negative, there will be FLAGS_num keys."); + +DEFINE_int64(reads, -1, "Number of read operations to do. " + "If negative, do FLAGS_num reads."); + +DEFINE_int64(read_range, 1, "When ==1 reads use ::Get, when >1 reads use" + " an iterator"); + +DEFINE_bool(use_prefix_blooms, false, "Whether to place prefixes in blooms"); + +DEFINE_bool(use_prefix_api, false, "Whether to set ReadOptions.prefix for" + " prefixscanrandom. 
If true, use_prefix_blooms must also be true.");
+
+DEFINE_int64(seed, 0, "Seed base for random number generators. "
+             "When 0 it is deterministic.");
+
+DEFINE_int32(threads, 1, "Number of concurrent threads to run.");
+
+DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run."
+             " When 0 then num & reads determine the test duration");
+
+DEFINE_int32(value_size, 100, "Size of each value");
+
+
+// The maximum size of a key in bytes; a key size equal to this is allowed.
+static const int kMaxKeySize = 128;
+// Flag validator: rejects values above kMaxKeySize (kMaxKeySize itself is
+// accepted) and reports the offending flag name and value on stderr.
+static bool ValidateKeySize(const char* flagname, int32_t value) {
+  if (value > kMaxKeySize) {
+    fprintf(stderr, "Invalid value for --%s: %d, must be <= %d\n",
+            flagname, value, kMaxKeySize);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(key_size, 16, "size of each key");
+
+DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink"
+              " to this fraction of their original size after compression");
+
+DEFINE_bool(histogram, false, "Print histogram of operation timings");
+
+DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size,
+             "Number of bytes to buffer in memtable before compacting");
+
+DEFINE_int32(max_write_buffer_number,
+             rocksdb::Options().max_write_buffer_number,
+             "The number of in-memory memtables. Each memtable is of size "
+             "write_buffer_size.");
+
+DEFINE_int32(min_write_buffer_number_to_merge,
+             rocksdb::Options().min_write_buffer_number_to_merge,
+             "The minimum number of write buffers that will be merged together "
+             "before writing to storage. This is cheap because it is an "
+             "in-memory merge. If this feature is not enabled, then all these "
+             "write buffers are flushed to L0 as separate files and this "
+             "increases read amplification because a get request has to check"
+             " in all of these files.
Also, an in-memory merge may result in" + " writing less data to storage if there are duplicate records " + " in each of these individual write buffers."); + +DEFINE_int32(max_background_compactions, + rocksdb::Options().max_background_compactions, + "The maximum number of concurrent background compactions" + " that can occur in parallel."); + +static rocksdb::CompactionStyle FLAGS_compaction_style_e; +DEFINE_int32(compaction_style, (int32_t) rocksdb::Options().compaction_style, + "style of compaction: level-based vs universal"); + +DEFINE_int32(universal_size_ratio, 0, + "Percentage flexibility while comparing file size" + " (for universal compaction only)."); + +DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files in a" + " single compaction run (for universal compaction only)."); + +DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact" + " in universal style compaction"); + +DEFINE_int32(universal_max_size_amplification_percent, 0, + "The max size amplification for universal style compaction"); + +DEFINE_int32(universal_compression_size_percent, -1, + "The percentage of the database to compress for universal " + "compaction. -1 means compress everything."); + +DEFINE_int64(cache_size, -1, "Number of bytes to use as a cache of uncompressed" + "data. Negative means use default settings."); + +DEFINE_int32(block_size, rocksdb::Options().block_size, + "Number of bytes in a block."); + +DEFINE_int64(compressed_cache_size, -1, + "Number of bytes to use as a cache of compressed data."); + +DEFINE_int32(open_files, rocksdb::Options().max_open_files, + "Maximum number of files to keep open at the same time" + " (use default if == 0)"); + +DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means" + " use default settings."); + +DEFINE_bool(use_existing_db, false, "If true, do not destroy the existing" + " database. 
If you set this flag and also specify a benchmark that" + " wants a fresh database, that benchmark will fail."); + +DEFINE_string(db, "", "Use the db with the following name."); + +static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) { + if (value >= 20) { + fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n", + flagname, value); + return false; + } + return true; +} +DEFINE_int32(cache_numshardbits, -1, "Number of shards for the block cache" + " is 2 ** cache_numshardbits. Negative means use default settings." + " This is applied only if FLAGS_cache_size is non-negative."); + +DEFINE_int32(cache_remove_scan_count_limit, 32, ""); + +DEFINE_bool(verify_checksum, false, "Verify checksum for every block read" + " from storage"); + +DEFINE_bool(statistics, false, "Database statistics"); +static class std::shared_ptr dbstats; + +DEFINE_int64(writes, -1, "Number of write operations to do. If negative, do" + " --num reads."); + +DEFINE_int32(writes_per_second, 0, "Per-thread rate limit on writes per second." + " No limit when <= 0. 
Only for the readwhilewriting test."); + +DEFINE_bool(sync, false, "Sync all writes to disk"); + +DEFINE_bool(disable_data_sync, false, "If true, do not wait until data is" + " synced to disk."); + +DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync"); + +DEFINE_bool(disable_wal, false, "If true, do not write WAL for write."); + +DEFINE_bool(use_snapshot, false, "If true, create a snapshot per query when" + " randomread benchmark is used"); + +DEFINE_bool(get_approx, false, "If true, call GetApproximateSizes per query" + " when read_range is > 1 and randomread benchmark is used"); + +DEFINE_int32(num_levels, 7, "The total number of levels"); + +DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1"); + +DEFINE_int32(target_file_size_multiplier, 1, + "A multiplier to compute target level-N file size (N >= 2)"); + +DEFINE_uint64(max_bytes_for_level_base, 10 * 1048576, "Max bytes for level-1"); + +DEFINE_int32(max_bytes_for_level_multiplier, 10, + "A multiplier to compute max bytes for level-N (N >= 2)"); + +static std::vector FLAGS_max_bytes_for_level_multiplier_additional_v; +DEFINE_string(max_bytes_for_level_multiplier_additional, "", + "A vector that specifies additional fanout per level"); + +DEFINE_int32(level0_stop_writes_trigger, 12, "Number of files in level-0" + " that will trigger put stop."); + +DEFINE_int32(level0_slowdown_writes_trigger, 8, "Number of files in level-0" + " that will slow down writes."); + +DEFINE_int32(level0_file_num_compaction_trigger, 4, "Number of files in level-0" + " when compactions start"); + +static bool ValidateInt32Percent(const char* flagname, int32_t value) { + if (value <= 0 || value>=100) { + fprintf(stderr, "Invalid value for --%s: %d, 0< pct <100 \n", + flagname, value); + return false; + } + return true; +} +DEFINE_int32(readwritepercent, 90, "Ratio of reads to reads/writes (expressed" + " as percentage) for the ReadRandomWriteRandom workload. 
The " + "default value 90 means 90% operations out of all reads and writes" + " operations are reads. In other words, 9 gets for every 1 put."); + +DEFINE_int32(mergereadpercent, 70, "Ratio of merges to merges&reads (expressed" + " as percentage) for the ReadRandomMergeRandom workload. The" + " default value 70 means 70% out of all read and merge operations" + " are merges. In other words, 7 merges for every 3 gets."); + +DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/" + "deletes (used in RandomWithVerify only). RandomWithVerify " + "calculates writepercent as (100 - FLAGS_readwritepercent - " + "deletepercent), so deletepercent must be smaller than (100 - " + "FLAGS_readwritepercent)"); + +DEFINE_int32(disable_seek_compaction, false, "Option to disable compaction" + " triggered by read."); + +DEFINE_uint64(delete_obsolete_files_period_micros, 0, "Option to delete " + "obsolete files periodically. 0 means that obsolete files are" + " deleted after every compaction run."); + +enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { + assert(ctype); + + if (!strcasecmp(ctype, "none")) + return rocksdb::kNoCompression; + else if (!strcasecmp(ctype, "snappy")) + return rocksdb::kSnappyCompression; + else if (!strcasecmp(ctype, "zlib")) + return rocksdb::kZlibCompression; + else if (!strcasecmp(ctype, "bzip2")) + return rocksdb::kBZip2Compression; + + fprintf(stdout, "Cannot parse compression type '%s'\n", ctype); + return rocksdb::kSnappyCompression; //default value +} +DEFINE_string(compression_type, "snappy", + "Algorithm to use to compress the database"); +static enum rocksdb::CompressionType FLAGS_compression_type_e = + rocksdb::kSnappyCompression; + +DEFINE_int32(compression_level, -1, + "Compression level. 
For zlib this should be -1 for the " + "default level, or between 0 and 9."); + +static bool ValidateCompressionLevel(const char* flagname, int32_t value) { + if (value < -1 || value > 9) { + fprintf(stderr, "Invalid value for --%s: %d, must be between -1 and 9\n", + flagname, value); + return false; + } + return true; +} + +static const bool FLAGS_compression_level_dummy = + google::RegisterFlagValidator(&FLAGS_compression_level, + &ValidateCompressionLevel); + +DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts" + " from this level. Levels with number < min_level_to_compress are" + " not compressed. Otherwise, apply compression_type to " + "all levels."); + +static bool ValidateTableCacheNumshardbits(const char* flagname, + int32_t value) { + if (0 >= value || value > 20) { + fprintf(stderr, "Invalid value for --%s: %d, must be 0 < val <= 20\n", + flagname, value); + return false; + } + return true; +} +DEFINE_int32(table_cache_numshardbits, 4, ""); + +DEFINE_string(hdfs, "", "Name of hdfs environment"); +// posix or hdfs environment +static rocksdb::Env* FLAGS_env = rocksdb::Env::Default(); + +DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when " + "this is greater than zero. 
When 0 the interval grows over time."); + +DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when" + " this is greater than 0."); + +static bool ValidateRateLimit(const char* flagname, double value) { + static constexpr double EPSILON = 1e-10; + if ( value < -EPSILON ) { + fprintf(stderr, "Invalid value for --%s: %12.6f, must be >= 0.0\n", + flagname, value); + return false; + } + return true; +} +DEFINE_double(soft_rate_limit, 0.0, ""); + +DEFINE_double(hard_rate_limit, 0.0, "When not equal to 0 this make threads " + "sleep at each stats reporting interval until the compaction" + " score for all levels is less than or equal to this value."); + +DEFINE_int32(rate_limit_delay_max_milliseconds, 1000, + "When hard_rate_limit is set then this is the max time a put will" + " be stalled."); + +DEFINE_int32(max_grandparent_overlap_factor, 10, "Control maximum bytes of " + "overlaps in grandparent (i.e., level+2) before we stop building a" + " single file in a level->level+1 compaction."); + +DEFINE_bool(readonly, false, "Run read only benchmarks."); + +DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions"); + +DEFINE_int32(source_compaction_factor, 1, "Cap the size of data in level-K for" + " a compaction run that compacts Level-K with Level-(K+1) (for" + " K >= 1)"); + +DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds."); +DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files" + " in MB."); + +DEFINE_bool(bufferedio, rocksdb::EnvOptions().use_os_buffer, + "Allow buffered io using OS buffers"); + +DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads, + "Allow reads to occur via mmap-ing files"); + +DEFINE_bool(mmap_write, rocksdb::EnvOptions().use_mmap_writes, + "Allow writes to occur via mmap-ing files"); + +DEFINE_bool(advise_random_on_open, rocksdb::Options().advise_random_on_open, + "Advise random access on table file open"); + +DEFINE_string(compaction_fadvice, 
"NORMAL", + "Access pattern advice when a file is compacted"); +static auto FLAGS_compaction_fadvice_e = + rocksdb::Options().access_hint_on_compaction_start; + +DEFINE_bool(use_multiget, false, + "Use multiget to access a series of keys instead of get"); + +DEFINE_int64(keys_per_multiget, 90, "If use_multiget is true, determines number" + " of keys to group per call Arbitrary default is good because it" + " agrees with readwritepercent"); + +// TODO: Apply this flag to generic Get calls too. Currently only with Multiget +DEFINE_bool(warn_missing_keys, true, "Print a message to user when a key is" + " missing in a Get/MultiGet call"); + +DEFINE_bool(use_adaptive_mutex, rocksdb::Options().use_adaptive_mutex, + "Use adaptive mutex"); + +DEFINE_uint64(bytes_per_sync, rocksdb::Options().bytes_per_sync, + "Allows OS to incrementally sync files to disk while they are" + " being written, in the background. Issue one request for every" + " bytes_per_sync written. 0 turns it off."); +DEFINE_bool(filter_deletes, false, " On true, deletes use bloom-filter and drop" + " the delete if key not present"); + +DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge" + " operations on a key in the memtable"); + +static bool ValidatePrefixSize(const char* flagname, int32_t value) { + if (value < 0 || value>=2000000000) { + fprintf(stderr, "Invalid value for --%s: %d. 
0<= PrefixSize <2000000000\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(prefix_size, 0, "Control the prefix size for HashSkipList");
+
+enum RepFactory {
+  kSkipList,
+  kPrefixHash,
+  kVectorRep
+};
+// Maps a memtable representation name to its RepFactory value. Matching is
+// case-insensitive; an unrecognized name prints a message and falls back to
+// kSkipList.
+enum RepFactory StringToRepFactory(const char* ctype) {
+  assert(ctype);
+
+  if (!strcasecmp(ctype, "skip_list"))
+    return kSkipList;
+  else if (!strcasecmp(ctype, "prefix_hash"))
+    return kPrefixHash;
+  else if (!strcasecmp(ctype, "vector"))
+    return kVectorRep;
+
+  fprintf(stdout, "Cannot parse memtablerep %s\n", ctype);
+  return kSkipList;
+}
+static enum RepFactory FLAGS_rep_factory;
+DEFINE_string(memtablerep, "skip_list", "");
+
+DEFINE_string(merge_operator, "", "The merge operator to use with the database. "
+              "If a new merge operator is specified, be sure to use fresh"
+              " database. The possible merge operators are defined in"
+              " utilities/merge_operators.h");
+
+static const bool FLAGS_soft_rate_limit_dummy __attribute__((unused)) =
+  google::RegisterFlagValidator(&FLAGS_soft_rate_limit,
+                                &ValidateRateLimit);
+
+static const bool FLAGS_hard_rate_limit_dummy __attribute__((unused)) =
+  google::RegisterFlagValidator(&FLAGS_hard_rate_limit, &ValidateRateLimit);
+
+static const bool FLAGS_prefix_size_dummy __attribute__((unused)) =
+  google::RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
+
+static const bool FLAGS_key_size_dummy __attribute__((unused)) =
+  google::RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);
+
+static const bool FLAGS_cache_numshardbits_dummy __attribute__((unused)) =
+  google::RegisterFlagValidator(&FLAGS_cache_numshardbits,
+                                &ValidateCacheNumshardbits);
+
+static const bool FLAGS_readwritepercent_dummy __attribute__((unused)) =
+  google::RegisterFlagValidator(&FLAGS_readwritepercent,
+                                &ValidateInt32Percent);
+
+static const bool FLAGS_deletepercent_dummy __attribute__((unused)) =
+  google::RegisterFlagValidator(&FLAGS_deletepercent,
+                                &ValidateInt32Percent);
+static const bool
+
FLAGS_table_cache_numshardbits_dummy __attribute__((unused)) =
+  google::RegisterFlagValidator(&FLAGS_table_cache_numshardbits,
+                                &ValidateTableCacheNumshardbits);
+
+namespace rocksdb {
+
+// Helper for quickly generating random data.
+// Builds one large buffer up front and hands out consecutive windows of it,
+// wrapping back to the beginning when the next window would not fit.
+class RandomGenerator {
+ private:
+  std::string data_;
+  unsigned int pos_;
+
+ public:
+  RandomGenerator() {
+    // We use a limited amount of data over and over again and ensure
+    // that it is larger than the compression window (32KB), and also
+    // large enough to serve all typical value sizes we want to write.
+    Random rnd(301);
+    std::string piece;
+    while (data_.size() < (unsigned)std::max(1048576, FLAGS_value_size)) {
+      // Add a short fragment that is as compressible as specified
+      // by FLAGS_compression_ratio.
+      test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
+      data_.append(piece);
+    }
+    pos_ = 0;
+  }
+
+  // Returns a Slice of `len` bytes that points into the internal buffer.
+  // `len` must be smaller than the buffer size (asserted on wrap-around).
+  Slice Generate(unsigned int len) {
+    if (pos_ + len > data_.size()) {
+      pos_ = 0;
+      assert(len < data_.size());
+    }
+    pos_ += len;
+    return Slice(data_.data() + pos_ - len, len);
+  }
+};
+
+// Appends `msg` to `*str`, inserting a single space first when `*str` is
+// non-empty. An empty `msg` leaves `*str` untouched.
+static void AppendWithSpace(std::string* str, Slice msg) {
+  if (msg.empty()) return;
+  if (!str->empty()) {
+    str->push_back(' ');
+  }
+  str->append(msg.data(), msg.size());
+}
+
+// Accumulates operation counts, byte counts, timings and an optional latency
+// histogram for one benchmark thread, and prints progress / final reports.
+class Stats {
+ private:
+  int id_;
+  double start_;
+  double finish_;
+  double seconds_;
+  long long done_;
+  long long last_report_done_;
+  long long next_report_;
+  int64_t bytes_;
+  double last_op_finish_;
+  double last_report_finish_;
+  HistogramImpl hist_;
+  std::string message_;
+  bool exclude_from_merge_;
+
+ public:
+  Stats() { Start(-1); }
+
+  // Resets every counter and restarts the clock for thread `id`.
+  void Start(int id) {
+    id_ = id;
+    next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
+    hist_.Clear();
+    done_ = 0;
+    last_report_done_ = 0;
+    bytes_ = 0;
+    seconds_ = 0;
+    // Take the timestamp first: every "last ..." marker below is derived from
+    // it. Reading start_ before this assignment would use an indeterminate
+    // value on construction and a stale one on restart.
+    start_ = FLAGS_env->NowMicros();
+    finish_ = start_;
+    last_op_finish_ = start_;
+    last_report_finish_ = start_;
+    message_.clear();
+    // When set, stats from this thread won't be merged with others.
+    exclude_from_merge_ = false;
+  }
+
+  // Folds `other` into this object unless `other` asked to be excluded.
+  void Merge(const Stats& other) {
+    if (other.exclude_from_merge_)
+      return;
+
+    hist_.Merge(other.hist_);
+    done_ += other.done_;
+    bytes_ += other.bytes_;
+    seconds_ += other.seconds_;
+    if (other.start_ < start_) start_ = other.start_;
+    if (other.finish_ > finish_) finish_ = other.finish_;
+
+    // Just keep the messages from one thread
+    if (message_.empty()) message_ = other.message_;
+  }
+
+  // Records the finish time and the elapsed seconds since Start().
+  void Stop() {
+    finish_ = FLAGS_env->NowMicros();
+    seconds_ = (finish_ - start_) * 1e-6;
+  }
+
+  void AddMessage(Slice msg) {
+    AppendWithSpace(&message_, msg);
+  }
+
+  void SetId(int id) { id_ = id; }
+  void SetExcludeFromMerge() { exclude_from_merge_ = true; }
+
+  // Counts one finished operation; feeds the histogram when enabled and
+  // prints a progress line each time done_ reaches next_report_.
+  void FinishedSingleOp(DB* db) {
+    if (FLAGS_histogram) {
+      double now = FLAGS_env->NowMicros();
+      double micros = now - last_op_finish_;
+      hist_.Add(micros);
+      if (micros > 20000 && !FLAGS_stats_interval) {
+        fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
+        fflush(stderr);
+      }
+      last_op_finish_ = now;
+    }
+
+    done_++;
+    if (done_ >= next_report_) {
+      if (!FLAGS_stats_interval) {
+        if (next_report_ < 1000) next_report_ += 100;
+        else if (next_report_ < 5000) next_report_ += 500;
+        else if (next_report_ < 10000) next_report_ += 1000;
+        else if (next_report_ < 50000) next_report_ += 5000;
+        else if (next_report_ < 100000) next_report_ += 10000;
+        else if (next_report_ < 500000) next_report_ += 50000;
+        else next_report_ += 100000;
+        fprintf(stderr, "... finished %lld ops%30s\r", done_, "");
+        fflush(stderr);
+      } else {
+        double now = FLAGS_env->NowMicros();
+        fprintf(stderr,
+                "%s ... thread %d: (%lld,%lld) ops and "
+                "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
+                FLAGS_env->TimeToString((uint64_t) now/1000000).c_str(),
+                id_,
+                done_ - last_report_done_, done_,
+                (done_ - last_report_done_) /
+                ((now - last_report_finish_) / 1000000.0),
+                done_ / ((now - start_) / 1000000.0),
+                (now - last_report_finish_) / 1000000.0,
+                (now - start_) / 1000000.0);
+
+        if (FLAGS_stats_per_interval) {
+          std::string stats;
+          if (db && db->GetProperty("rocksdb.stats", &stats))
+            fprintf(stderr, "%s\n", stats.c_str());
+        }
+
+        fflush(stderr);
+        next_report_ += FLAGS_stats_interval;
+        last_report_finish_ = now;
+        last_report_done_ = done_;
+      }
+    }
+  }
+
+  void AddBytes(int64_t n) {
+    bytes_ += n;
+  }
+
+  // Prints the summary line for benchmark `name` to stdout, followed by the
+  // histogram when FLAGS_histogram is set.
+  void Report(const Slice& name) {
+    // Pretend at least one op was done in case we are running a benchmark
+    // that does not call FinishedSingleOp().
+    if (done_ < 1) done_ = 1;
+
+    std::string extra;
+    if (bytes_ > 0) {
+      // Rate is computed on actual elapsed time, not the sum of per-thread
+      // elapsed times.
+      double elapsed = (finish_ - start_) * 1e-6;
+      char rate[100];
+      snprintf(rate, sizeof(rate), "%6.1f MB/s",
+               (bytes_ / 1048576.0) / elapsed);
+      extra = rate;
+    }
+    AppendWithSpace(&extra, message_);
+    double elapsed = (finish_ - start_) * 1e-6;
+    double throughput = (double)done_/elapsed;
+
+    fprintf(stdout, "%-12s : %11.3f micros/op %ld ops/sec;%s%s\n",
+            name.ToString().c_str(),
+            elapsed * 1e6 / done_,
+            (long)throughput,
+            (extra.empty() ? "" : " "),
+            extra.c_str());
+    if (FLAGS_histogram) {
+      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
+    }
+    fflush(stdout);
+  }
+};
+
+// State shared by all concurrent executions of the same benchmark.
+struct SharedState {
+  port::Mutex mu;
+  port::CondVar cv;
+  int total;
+
+  // Each thread goes through the following states:
+  // (1) initializing
+  // (2) waiting for others to be initialized
+  // (3) running
+  // (4) done
+
+  long num_initialized;
+  long num_done;
+  bool start;
+
+  // Counters start from zero so the struct is never observed with
+  // indeterminate values; initializers follow declaration order.
+  SharedState()
+      : cv(&mu), total(0), num_initialized(0), num_done(0), start(false) { }
+};
+
+// Per-thread state for concurrent executions of the same benchmark.
+struct ThreadState {
+  int tid; // 0..n-1 when running in n threads
+  Random64 rand; // Has different seeds for different threads
+  Stats stats;
+  SharedState* shared;
+
+  // `shared` starts out null; the code that creates the thread is expected
+  // to point it at the common SharedState afterwards.
+  /* implicit */ ThreadState(int index)
+      : tid(index),
+        rand((FLAGS_seed ? FLAGS_seed : 1000) + index),
+        shared(nullptr) {
+  }
+};
+
+// Stop condition for a benchmark loop: bounded by wall-clock seconds when
+// max_seconds is non-zero, otherwise by the number of operations.
+class Duration {
+ public:
+  Duration(int max_seconds, long long max_ops) {
+    max_seconds_ = max_seconds;
+    max_ops_= max_ops;
+    ops_ = 0;
+    start_at_ = FLAGS_env->NowMicros();
+  }
+
+  // Accounts for `increment` more operations and returns true once the
+  // configured limit has been reached.
+  bool Done(int increment) {
+    if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops
+    ops_ += increment;
+
+    if (max_seconds_) {
+      // Recheck every appx 1000 ops (exact iff increment is factor of 1000)
+      if ((ops_/1000) != ((ops_-increment)/1000)) {
+        double now = FLAGS_env->NowMicros();
+        return ((now - start_at_) / 1000000.0) >= max_seconds_;
+      } else {
+        return false;
+      }
+    } else {
+      return ops_ > max_ops_;
+    }
+  }
+
+ private:
+  int max_seconds_;
+  long long max_ops_;
+  long long ops_;
+  double start_at_;
+};
+
+class Benchmark {
+ private:
+  shared_ptr cache_;
+  shared_ptr compressed_cache_;
+  const FilterPolicy* filter_policy_;
+  const SliceTransform* prefix_extractor_;
+  DB* db_;
+  long long num_;
+  int value_size_;
+  int key_size_;
+  int entries_per_batch_;
+  WriteOptions write_options_;
+  long long reads_;
+  long long writes_;
+  long long readwrites_;
+  long long merge_keys_;
+  int heap_counter_;
+  char keyFormat_[100]; // will contain the format of key.
e.g "%016d" + void PrintHeader() { + PrintEnvironment(); + fprintf(stdout, "Keys: %d bytes each\n", FLAGS_key_size); + fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n", + FLAGS_value_size, + static_cast(FLAGS_value_size * FLAGS_compression_ratio + 0.5)); + fprintf(stdout, "Entries: %lld\n", num_); + fprintf(stdout, "RawSize: %.1f MB (estimated)\n", + ((static_cast(FLAGS_key_size + FLAGS_value_size) * num_) + / 1048576.0)); + fprintf(stdout, "FileSize: %.1f MB (estimated)\n", + (((FLAGS_key_size + FLAGS_value_size * FLAGS_compression_ratio) + * num_) + / 1048576.0)); + fprintf(stdout, "Write rate limit: %d\n", FLAGS_writes_per_second); + switch (FLAGS_compression_type_e) { + case rocksdb::kNoCompression: + fprintf(stdout, "Compression: none\n"); + break; + case rocksdb::kSnappyCompression: + fprintf(stdout, "Compression: snappy\n"); + break; + case rocksdb::kZlibCompression: + fprintf(stdout, "Compression: zlib\n"); + break; + case rocksdb::kBZip2Compression: + fprintf(stdout, "Compression: bzip2\n"); + break; + } + + switch (FLAGS_rep_factory) { + case kPrefixHash: + fprintf(stdout, "Memtablerep: prefix_hash\n"); + break; + case kSkipList: + fprintf(stdout, "Memtablerep: skip_list\n"); + break; + case kVectorRep: + fprintf(stdout, "Memtablerep: vector\n"); + break; + } + + PrintWarnings(); + fprintf(stdout, "------------------------------------------------\n"); + } + + void PrintWarnings() { +#if defined(__GNUC__) && !defined(__OPTIMIZE__) + fprintf(stdout, + "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" + ); +#endif +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); +#endif + if (FLAGS_compression_type_e != rocksdb::kNoCompression) { + // The test string should not be too small. 
+ const int len = FLAGS_block_size; + char* text = (char*) malloc(len+1); + bool result = true; + const char* name = nullptr; + std::string compressed; + + memset(text, (int) 'y', len); + text[len] = '\0'; + switch (FLAGS_compression_type_e) { + case kSnappyCompression: + result = port::Snappy_Compress(Options().compression_opts, text, + strlen(text), &compressed); + name = "Snappy"; + break; + case kZlibCompression: + result = port::Zlib_Compress(Options().compression_opts, text, + strlen(text), &compressed); + name = "Zlib"; + break; + case kBZip2Compression: + result = port::BZip2_Compress(Options().compression_opts, text, + strlen(text), &compressed); + name = "BZip2"; + break; + case kNoCompression: + assert(false); // cannot happen + break; + } + + if (!result) { + fprintf(stdout, "WARNING: %s compression is not enabled\n", name); + } else if (name && compressed.size() >= strlen(text)) { + fprintf(stdout, "WARNING: %s compression is not effective\n", name); + } + + free(text); + } + } + +// Current the following isn't equivalent to OS_LINUX. 
+#if defined(__linux)
+  // Returns the sub-slice of `s` with leading and trailing whitespace
+  // removed. The result points into the same storage as `s`.
+  static Slice TrimSpace(Slice s) {
+    unsigned int start = 0;
+    // isspace() is only defined for values representable as unsigned char
+    // (or EOF), so convert before calling it.
+    while (start < s.size() &&
+           isspace(static_cast<unsigned char>(s[start]))) {
+      start++;
+    }
+    unsigned int limit = s.size();
+    while (limit > start &&
+           isspace(static_cast<unsigned char>(s[limit-1]))) {
+      limit--;
+    }
+    return Slice(s.data() + start, limit - start);
+  }
+#endif
+
+  // Prints the library version to stderr and, on Linux, the current date
+  // plus the CPU model and cache size parsed from /proc/cpuinfo.
+  void PrintEnvironment() {
+    fprintf(stderr, "LevelDB: version %d.%d\n",
+            kMajorVersion, kMinorVersion);
+
+#if defined(__linux)
+    time_t now = time(nullptr);
+    fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline
+
+    FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
+    if (cpuinfo != nullptr) {
+      char line[1000];
+      int num_cpus = 0;
+      std::string cpu_type;
+      std::string cache_size;
+      while (fgets(line, sizeof(line), cpuinfo) != nullptr) {
+        const char* sep = strchr(line, ':');
+        // Skip lines without a ':' and lines that start with one: the key
+        // length below is (sep - 1 - line), which would be negative.
+        if (sep == nullptr || sep == line) {
+          continue;
+        }
+        Slice key = TrimSpace(Slice(line, sep - 1 - line));
+        Slice val = TrimSpace(Slice(sep + 1));
+        if (key == "model name") {
+          ++num_cpus;
+          cpu_type = val.ToString();
+        } else if (key == "cache size") {
+          cache_size = val.ToString();
+        }
+      }
+      fclose(cpuinfo);
+      fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str());
+      fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
+    }
+#endif
+  }
+
+ public:
+  Benchmark()
+  : cache_(FLAGS_cache_size >= 0 ?
+           (FLAGS_cache_numshardbits >= 1 ?
+            NewLRUCache(FLAGS_cache_size, FLAGS_cache_numshardbits,
+                        FLAGS_cache_remove_scan_count_limit) :
+            NewLRUCache(FLAGS_cache_size)) : nullptr),
+    compressed_cache_(FLAGS_compressed_cache_size >= 0 ?
+           (FLAGS_cache_numshardbits >= 1 ?
+            NewLRUCache(FLAGS_compressed_cache_size, FLAGS_cache_numshardbits) :
+            NewLRUCache(FLAGS_compressed_cache_size)) : nullptr),
+    filter_policy_(FLAGS_bloom_bits >= 0
+                   ?
NewBloomFilterPolicy(FLAGS_bloom_bits) + : nullptr), + prefix_extractor_(NewFixedPrefixTransform(FLAGS_key_size-1)), + db_(nullptr), + num_(FLAGS_num), + value_size_(FLAGS_value_size), + key_size_(FLAGS_key_size), + entries_per_batch_(1), + reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads), + writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes), + readwrites_((FLAGS_writes < 0 && FLAGS_reads < 0)? FLAGS_num : + ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads) + ), + merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys), + heap_counter_(0) { + std::vector files; + FLAGS_env->GetChildren(FLAGS_db, &files); + for (unsigned int i = 0; i < files.size(); i++) { + if (Slice(files[i]).starts_with("heap-")) { + FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]); + } + } + if (!FLAGS_use_existing_db) { + DestroyDB(FLAGS_db, Options()); + } + } + + ~Benchmark() { + delete db_; + delete filter_policy_; + delete prefix_extractor_; + } + + //this function will construct string format for key. e.g "%016lld" + void ConstructStrFormatForKey(char* str, int keySize) { + str[0] = '%'; + str[1] = '0'; + sprintf(str+2, "%dlld%s", keySize, "%s"); + } + + unique_ptr GenerateKeyFromInt(long long v, const char* suffix = "") { + unique_ptr keyInStr(new char[kMaxKeySize + 1]); + snprintf(keyInStr.get(), kMaxKeySize + 1, keyFormat_, v, suffix); + return keyInStr; + } + + void Run() { + PrintHeader(); + Open(); + const char* benchmarks = FLAGS_benchmarks.c_str(); + while (benchmarks != nullptr) { + const char* sep = strchr(benchmarks, ','); + Slice name; + if (sep == nullptr) { + name = benchmarks; + benchmarks = nullptr; + } else { + name = Slice(benchmarks, sep - benchmarks); + benchmarks = sep + 1; + } + + // Sanitize parameters + num_ = FLAGS_num; + reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads); + writes_ = (FLAGS_writes < 0 ? 
FLAGS_num : FLAGS_writes); + value_size_ = FLAGS_value_size; + key_size_ = FLAGS_key_size; + ConstructStrFormatForKey(keyFormat_, key_size_); + entries_per_batch_ = 1; + write_options_ = WriteOptions(); + if (FLAGS_sync) { + write_options_.sync = true; + } + write_options_.disableWAL = FLAGS_disable_wal; + + void (Benchmark::*method)(ThreadState*) = nullptr; + bool fresh_db = false; + int num_threads = FLAGS_threads; + + if (name == Slice("fillseq")) { + fresh_db = true; + method = &Benchmark::WriteSeq; + } else if (name == Slice("fillbatch")) { + fresh_db = true; + entries_per_batch_ = 1000; + method = &Benchmark::WriteSeq; + } else if (name == Slice("fillrandom")) { + fresh_db = true; + method = &Benchmark::WriteRandom; + } else if (name == Slice("fillfromstdin")) { + fresh_db = true; + method = &Benchmark::WriteFromStdin; + } else if (name == Slice("filluniquerandom")) { + fresh_db = true; + if (num_threads > 1) { + fprintf(stderr, "filluniquerandom multithreaded not supported" + " set --threads=1"); + exit(1); + } + method = &Benchmark::WriteUniqueRandom; + } else if (name == Slice("overwrite")) { + fresh_db = false; + method = &Benchmark::WriteRandom; + } else if (name == Slice("fillsync")) { + fresh_db = true; + num_ /= 1000; + write_options_.sync = true; + method = &Benchmark::WriteRandom; + } else if (name == Slice("fill100K")) { + fresh_db = true; + num_ /= 1000; + value_size_ = 100 * 1000; + method = &Benchmark::WriteRandom; + } else if (name == Slice("readseq")) { + method = &Benchmark::ReadSequential; + } else if (name == Slice("readtocache")) { + method = &Benchmark::ReadSequential; + num_threads = 1; + reads_ = num_; + } else if (name == Slice("readreverse")) { + method = &Benchmark::ReadReverse; + } else if (name == Slice("readrandom")) { + method = &Benchmark::ReadRandom; + } else if (name == Slice("readmissing")) { + method = &Benchmark::ReadMissing; + } else if (name == Slice("seekrandom")) { + method = &Benchmark::SeekRandom; + } else if (name == 
Slice("readhot")) { + method = &Benchmark::ReadHot; + } else if (name == Slice("readrandomsmall")) { + reads_ /= 1000; + method = &Benchmark::ReadRandom; + } else if (name == Slice("prefixscanrandom")) { + method = &Benchmark::PrefixScanRandom; + } else if (name == Slice("deleteseq")) { + method = &Benchmark::DeleteSeq; + } else if (name == Slice("deleterandom")) { + method = &Benchmark::DeleteRandom; + } else if (name == Slice("readwhilewriting")) { + num_threads++; // Add extra thread for writing + method = &Benchmark::ReadWhileWriting; + } else if (name == Slice("readrandomwriterandom")) { + method = &Benchmark::ReadRandomWriteRandom; + } else if (name == Slice("readrandommergerandom")) { + if (FLAGS_merge_operator.empty()) { + fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", + name.ToString().c_str()); + method = nullptr; + } else { + method = &Benchmark::ReadRandomMergeRandom; + } + } else if (name == Slice("updaterandom")) { + method = &Benchmark::UpdateRandom; + } else if (name == Slice("appendrandom")) { + method = &Benchmark::AppendRandom; + } else if (name == Slice("mergerandom")) { + if (FLAGS_merge_operator.empty()) { + fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", + name.ToString().c_str()); + method = nullptr; + } else { + method = &Benchmark::MergeRandom; + } + } else if (name == Slice("randomwithverify")) { + method = &Benchmark::RandomWithVerify; + } else if (name == Slice("compact")) { + method = &Benchmark::Compact; + } else if (name == Slice("crc32c")) { + method = &Benchmark::Crc32c; + } else if (name == Slice("acquireload")) { + method = &Benchmark::AcquireLoad; + } else if (name == Slice("snappycomp")) { + method = &Benchmark::SnappyCompress; + } else if (name == Slice("snappyuncomp")) { + method = &Benchmark::SnappyUncompress; + } else if (name == Slice("heapprofile")) { + HeapProfile(); + } else if (name == Slice("stats")) { + PrintStats("rocksdb.stats"); + } else if (name == Slice("levelstats")) { + 
PrintStats("rocksdb.levelstats"); + } else if (name == Slice("sstables")) { + PrintStats("rocksdb.sstables"); + } else { + if (name != Slice()) { // No error message for empty name + fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); + } + } + + if (fresh_db) { + if (FLAGS_use_existing_db) { + fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n", + name.ToString().c_str()); + method = nullptr; + } else { + delete db_; + db_ = nullptr; + DestroyDB(FLAGS_db, Options()); + Open(); + } + } + + if (method != nullptr) { + fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); + RunBenchmark(num_threads, name, method); + } + } + if (FLAGS_statistics) { + fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str()); + } + } + + private: + struct ThreadArg { + Benchmark* bm; + SharedState* shared; + ThreadState* thread; + void (Benchmark::*method)(ThreadState*); + }; + + static void ThreadBody(void* v) { + ThreadArg* arg = reinterpret_cast(v); + SharedState* shared = arg->shared; + ThreadState* thread = arg->thread; + { + MutexLock l(&shared->mu); + shared->num_initialized++; + if (shared->num_initialized >= shared->total) { + shared->cv.SignalAll(); + } + while (!shared->start) { + shared->cv.Wait(); + } + } + + thread->stats.Start(thread->tid); + (arg->bm->*(arg->method))(thread); + thread->stats.Stop(); + + { + MutexLock l(&shared->mu); + shared->num_done++; + if (shared->num_done >= shared->total) { + shared->cv.SignalAll(); + } + } + } + + void RunBenchmark(int n, Slice name, + void (Benchmark::*method)(ThreadState*)) { + SharedState shared; + shared.total = n; + shared.num_initialized = 0; + shared.num_done = 0; + shared.start = false; + + ThreadArg* arg = new ThreadArg[n]; + for (int i = 0; i < n; i++) { + arg[i].bm = this; + arg[i].method = method; + arg[i].shared = &shared; + arg[i].thread = new ThreadState(i); + arg[i].thread->shared = &shared; + FLAGS_env->StartThread(ThreadBody, &arg[i]); + } + + shared.mu.Lock(); + while 
(shared.num_initialized < n) { + shared.cv.Wait(); + } + + shared.start = true; + shared.cv.SignalAll(); + while (shared.num_done < n) { + shared.cv.Wait(); + } + shared.mu.Unlock(); + + // Stats for some threads can be excluded. + Stats merge_stats; + for (int i = 0; i < n; i++) { + merge_stats.Merge(arg[i].thread->stats); + } + merge_stats.Report(name); + + for (int i = 0; i < n; i++) { + delete arg[i].thread; + } + delete[] arg; + } + + void Crc32c(ThreadState* thread) { + // Checksum about 500MB of data total + const int size = 4096; + const char* label = "(4K per op)"; + std::string data(size, 'x'); + int64_t bytes = 0; + uint32_t crc = 0; + while (bytes < 500 * 1048576) { + crc = crc32c::Value(data.data(), size); + thread->stats.FinishedSingleOp(nullptr); + bytes += size; + } + // Print so result is not dead + fprintf(stderr, "... crc=0x%x\r", static_cast(crc)); + + thread->stats.AddBytes(bytes); + thread->stats.AddMessage(label); + } + + void AcquireLoad(ThreadState* thread) { + int dummy; + port::AtomicPointer ap(&dummy); + int count = 0; + void *ptr = nullptr; + thread->stats.AddMessage("(each op is 1000 loads)"); + while (count < 100000) { + for (int i = 0; i < 1000; i++) { + ptr = ap.Acquire_Load(); + } + count++; + thread->stats.FinishedSingleOp(nullptr); + } + if (ptr == nullptr) exit(1); // Disable unused variable warning. 
+ } + + void SnappyCompress(ThreadState* thread) { + RandomGenerator gen; + Slice input = gen.Generate(Options().block_size); + int64_t bytes = 0; + int64_t produced = 0; + bool ok = true; + std::string compressed; + while (ok && bytes < 1024 * 1048576) { // Compress 1G + ok = port::Snappy_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + produced += compressed.size(); + bytes += input.size(); + thread->stats.FinishedSingleOp(nullptr); + } + + if (!ok) { + thread->stats.AddMessage("(snappy failure)"); + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "(output: %.1f%%)", + (produced * 100.0) / bytes); + thread->stats.AddMessage(buf); + thread->stats.AddBytes(bytes); + } + } + + void SnappyUncompress(ThreadState* thread) { + RandomGenerator gen; + Slice input = gen.Generate(Options().block_size); + std::string compressed; + bool ok = port::Snappy_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + int64_t bytes = 0; + char* uncompressed = new char[input.size()]; + while (ok && bytes < 1024 * 1048576) { // Compress 1G + ok = port::Snappy_Uncompress(compressed.data(), compressed.size(), + uncompressed); + bytes += input.size(); + thread->stats.FinishedSingleOp(nullptr); + } + delete[] uncompressed; + + if (!ok) { + thread->stats.AddMessage("(snappy failure)"); + } else { + thread->stats.AddBytes(bytes); + } + } + + void Open() { + assert(db_ == nullptr); + Options options; + options.create_if_missing = !FLAGS_use_existing_db; + options.block_cache = cache_; + options.block_cache_compressed = compressed_cache_; + if (cache_ == nullptr) { + options.no_block_cache = true; + } + options.write_buffer_size = FLAGS_write_buffer_size; + options.max_write_buffer_number = FLAGS_max_write_buffer_number; + options.min_write_buffer_number_to_merge = + FLAGS_min_write_buffer_number_to_merge; + options.max_background_compactions = FLAGS_max_background_compactions; + options.compaction_style = 
FLAGS_compaction_style_e; + options.block_size = FLAGS_block_size; + options.filter_policy = filter_policy_; + options.prefix_extractor = FLAGS_use_prefix_blooms ? prefix_extractor_ + : nullptr; + options.max_open_files = FLAGS_open_files; + options.statistics = dbstats; + options.env = FLAGS_env; + options.disableDataSync = FLAGS_disable_data_sync; + options.use_fsync = FLAGS_use_fsync; + options.num_levels = FLAGS_num_levels; + options.target_file_size_base = FLAGS_target_file_size_base; + options.target_file_size_multiplier = FLAGS_target_file_size_multiplier; + options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; + options.max_bytes_for_level_multiplier = + FLAGS_max_bytes_for_level_multiplier; + options.filter_deletes = FLAGS_filter_deletes; + if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kPrefixHash)) { + fprintf(stderr, + "prefix_size should be non-zero iff memtablerep == prefix_hash\n"); + exit(1); + } + switch (FLAGS_rep_factory) { + case kPrefixHash: + options.memtable_factory.reset(NewHashSkipListRepFactory( + NewFixedPrefixTransform(FLAGS_prefix_size))); + break; + case kSkipList: + // no need to do anything + break; + case kVectorRep: + options.memtable_factory.reset( + new VectorRepFactory + ); + break; + } + if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) { + if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() != + (unsigned int)FLAGS_num_levels) { + fprintf(stderr, "Insufficient number of fanouts specified %d\n", + (int)FLAGS_max_bytes_for_level_multiplier_additional_v.size()); + exit(1); + } + options.max_bytes_for_level_multiplier_additional = + FLAGS_max_bytes_for_level_multiplier_additional_v; + } + options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger; + options.level0_file_num_compaction_trigger = + FLAGS_level0_file_num_compaction_trigger; + options.level0_slowdown_writes_trigger = + FLAGS_level0_slowdown_writes_trigger; + options.compression = FLAGS_compression_type_e; + 
options.compression_opts.level = FLAGS_compression_level; + options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds; + options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB; + if (FLAGS_min_level_to_compress >= 0) { + assert(FLAGS_min_level_to_compress <= FLAGS_num_levels); + options.compression_per_level.resize(FLAGS_num_levels); + for (int i = 0; i < FLAGS_min_level_to_compress; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = FLAGS_min_level_to_compress; + i < FLAGS_num_levels; i++) { + options.compression_per_level[i] = FLAGS_compression_type_e; + } + } + options.disable_seek_compaction = FLAGS_disable_seek_compaction; + options.delete_obsolete_files_period_micros = + FLAGS_delete_obsolete_files_period_micros; + options.soft_rate_limit = FLAGS_soft_rate_limit; + options.hard_rate_limit = FLAGS_hard_rate_limit; + options.rate_limit_delay_max_milliseconds = + FLAGS_rate_limit_delay_max_milliseconds; + options.table_cache_numshardbits = FLAGS_table_cache_numshardbits; + options.max_grandparent_overlap_factor = + FLAGS_max_grandparent_overlap_factor; + options.disable_auto_compactions = FLAGS_disable_auto_compactions; + options.source_compaction_factor = FLAGS_source_compaction_factor; + + // fill storage options + options.allow_os_buffer = FLAGS_bufferedio; + options.allow_mmap_reads = FLAGS_mmap_read; + options.allow_mmap_writes = FLAGS_mmap_write; + options.advise_random_on_open = FLAGS_advise_random_on_open; + options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e; + options.use_adaptive_mutex = FLAGS_use_adaptive_mutex; + options.bytes_per_sync = FLAGS_bytes_per_sync; + + // merge operator options + options.merge_operator = MergeOperators::CreateFromStringId( + FLAGS_merge_operator); + if (options.merge_operator == nullptr && !FLAGS_merge_operator.empty()) { + fprintf(stderr, "invalid merge operator: %s\n", + FLAGS_merge_operator.c_str()); + exit(1); + } + options.max_successive_merges = FLAGS_max_successive_merges; + + // set 
universal style compaction configurations, if applicable
+    if (FLAGS_universal_size_ratio != 0) {
+      options.compaction_options_universal.size_ratio =
+        FLAGS_universal_size_ratio;
+    }
+    if (FLAGS_universal_min_merge_width != 0) {
+      options.compaction_options_universal.min_merge_width =
+        FLAGS_universal_min_merge_width;
+    }
+    if (FLAGS_universal_max_merge_width != 0) {
+      options.compaction_options_universal.max_merge_width =
+        FLAGS_universal_max_merge_width;
+    }
+    if (FLAGS_universal_max_size_amplification_percent != 0) {
+      options.compaction_options_universal.max_size_amplification_percent =
+        FLAGS_universal_max_size_amplification_percent;
+    }
+    if (FLAGS_universal_compression_size_percent != -1) {
+      options.compaction_options_universal.compression_size_percent =
+        FLAGS_universal_compression_size_percent;
+    }
+
+    Status s;
+    if(FLAGS_readonly) {
+      s = DB::OpenForReadOnly(options, FLAGS_db, &db_);
+    } else {
+      s = DB::Open(options, FLAGS_db, &db_);
+    }
+    if (!s.ok()) {
+      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+    if (FLAGS_min_level_to_compress >= 0) {
+      options.compression_per_level.clear();
+    }
+  }
+
+  enum WriteMode {
+    RANDOM, SEQUENTIAL, UNIQUE_RANDOM
+  };
+
+  void WriteSeq(ThreadState* thread) {
+    DoWrite(thread, SEQUENTIAL);
+  }
+
+  void WriteRandom(ThreadState* thread) {
+    DoWrite(thread, RANDOM);
+  }
+
+  void WriteUniqueRandom(ThreadState* thread) {
+    DoWrite(thread, UNIQUE_RANDOM);
+  }
+
+  // Applies `batch` to the DB; prints the error and exits on failure.
+  void writeOrFail(WriteBatch& batch) {
+    Status s = db_->Write(write_options_, &batch);
+    if (!s.ok()) {
+      fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+  }
+
+  // Reads records of the form  key TAB value NEWLINE  from stdin and writes
+  // them to the DB in batches. Lines without a TAB or without a terminating
+  // newline are reported on stderr and skipped.
+  void WriteFromStdin(ThreadState* thread) {
+    size_t count = 0;
+    WriteBatch batch;
+    const size_t bufferLen = 32 << 20;
+    unique_ptr<char[]> line(new char[bufferLen]);
+    char* linep = line.get();
+    const int batchSize = 100 << 10;
+    const char columnSeparator = '\t';
+    const char lineSeparator = '\n';
+
+    while (fgets(linep, bufferLen, stdin) != nullptr) {
+      ++count;
+      // Search only the bytes fgets() just stored. Everything past the
+      // terminating NUL is uninitialized or left over from an earlier,
+      // longer line and must not be mistaken for a separator.
+      char* const lineEnd = linep + strlen(linep);
+      char* tab = std::find(linep, lineEnd, columnSeparator);
+      if (tab == lineEnd) {
+        fprintf(stderr, "[Error] No Key delimiter TAB at line %zu\n", count);
+        continue;
+      }
+      Slice key(linep, tab - linep);
+      tab++;
+      char* endLine = std::find(tab, lineEnd, lineSeparator);
+      if (endLine == lineEnd) {
+        fprintf(stderr, "[Error] No ENTER at end of line # %zu\n", count);
+        continue;
+      }
+      Slice value(tab, endLine - tab);
+      thread->stats.FinishedSingleOp(db_);
+      thread->stats.AddBytes(endLine - linep - 1);
+
+      // Always queue the current record first, then flush once the batch is
+      // full; flushing before queuing would drop the record that triggered
+      // the flush.
+      batch.Put(key, value);
+      if (batch.Count() >= batchSize) {
+        writeOrFail(batch);
+        batch.Clear();
+      }
+    }
+    if (batch.Count() > 0) {
+      writeOrFail(batch);
+    }
+  }
+
+  void DoWrite(ThreadState* thread, WriteMode write_mode) {
+    const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
+    const int num_ops = writes_ == 0 ? num_ : writes_ ;
+    Duration duration(test_duration, num_ops);
+    unique_ptr bit_set;
+
+    if (write_mode == UNIQUE_RANDOM) {
+      bit_set.reset(new BitSet(num_ops));
+    }
+
+    if (num_ != FLAGS_num) {
+      char msg[100];
+      snprintf(msg, sizeof(msg), "(%lld ops)", num_);
+      thread->stats.AddMessage(msg);
+    }
+
+    RandomGenerator gen;
+    WriteBatch batch;
+    Status s;
+    int64_t bytes = 0;
+    int i = 0;
+    while (!duration.Done(entries_per_batch_)) {
+      batch.Clear();
+      for (int j = 0; j < entries_per_batch_; j++) {
+        long long k = 0;
+        switch(write_mode) {
+          case SEQUENTIAL:
+            k = i +j;
+            break;
+          case RANDOM:
+            k = thread->rand.Next() % FLAGS_num;
+            break;
+          case UNIQUE_RANDOM:
+            {
+              const long long t = thread->rand.Next() % FLAGS_num;
+              if (!bit_set->test(t)) {
+                // best case
+                k = t;
+              } else {
+                bool found = false;
+                // look forward
+                for (size_t i = t + 1; i < bit_set->size(); ++i) {
+                  if (!bit_set->test(i)) {
+                    found = true;
+                    k = i;
+                    break;
+                  }
+                }
+                if (!found) {
+                  for (size_t i = t; i-- > 0;) {
+                    if (!bit_set->test(i)) {
+                      found = true;
+                      k = i;
+                      break;
+                    }
+                  }
+                }
+              }
+
bit_set->set(k); + break; + } + }; + unique_ptr key = GenerateKeyFromInt(k); + batch.Put(key.get(), gen.Generate(value_size_)); + bytes += value_size_ + strlen(key.get()); + thread->stats.FinishedSingleOp(db_); + } + s = db_->Write(write_options_, &batch); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + i += entries_per_batch_; + } + thread->stats.AddBytes(bytes); + } + + void ReadSequential(ThreadState* thread) { + Iterator* iter = db_->NewIterator(ReadOptions(FLAGS_verify_checksum, true)); + long long i = 0; + int64_t bytes = 0; + for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) { + bytes += iter->key().size() + iter->value().size(); + thread->stats.FinishedSingleOp(db_); + ++i; + } + delete iter; + thread->stats.AddBytes(bytes); + } + + void ReadReverse(ThreadState* thread) { + Iterator* iter = db_->NewIterator(ReadOptions(FLAGS_verify_checksum, true)); + long long i = 0; + int64_t bytes = 0; + for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) { + bytes += iter->key().size() + iter->value().size(); + thread->stats.FinishedSingleOp(db_); + ++i; + } + delete iter; + thread->stats.AddBytes(bytes); + } + + // Calls MultiGet over a list of keys from a random distribution. + // Returns the total number of keys found. + long MultiGetRandom(ReadOptions& options, int num_keys, + Random64& rand, long long range, const char* suffix) { + assert(num_keys > 0); + std::vector keys(num_keys); + std::vector values(num_keys); + std::vector > gen_keys(num_keys); + + int i; + long long k; + + // Fill the keys vector + for(i=0; iGetSnapshot(); + } + + // Apply the operation + std::vector statuses = db_->MultiGet(options, keys, &values); + assert((long)statuses.size() == num_keys); + assert((long)keys.size() == num_keys); // Should always be the case. 
+ assert((long)values.size() == num_keys); + + if (FLAGS_use_snapshot) { + db_->ReleaseSnapshot(options.snapshot); + options.snapshot = nullptr; + } + + // Count number found + long found = 0; + for(i=0; irand, FLAGS_num, ""); + thread->stats.FinishedSingleOp(db_); + keys_left -= num_keys; + } + } else { // Regular case. Do one "get" at a time Get + Iterator* iter = db_->NewIterator(options); + std::string value; + while (!duration.Done(1)) { + const long long k = thread->rand.Next() % FLAGS_num; + unique_ptr key = GenerateKeyFromInt(k); + if (FLAGS_use_snapshot) { + options.snapshot = db_->GetSnapshot(); + } + + if (FLAGS_read_range < 2) { + if (db_->Get(options, key.get(), &value).ok()) { + found++; + } + } else { + Slice skey(key.get()); + int count = 1; + + if (FLAGS_get_approx) { + unique_ptr key2 = + GenerateKeyFromInt(k + (int) FLAGS_read_range); + Slice skey2(key2.get()); + Range range(skey, skey2); + uint64_t sizes; + db_->GetApproximateSizes(&range, 1, &sizes); + } + + for (iter->Seek(skey); + iter->Valid() && count <= FLAGS_read_range; + ++count, iter->Next()) { + found++; + } + } + + if (FLAGS_use_snapshot) { + db_->ReleaseSnapshot(options.snapshot); + options.snapshot = nullptr; + } + + thread->stats.FinishedSingleOp(db_); + } + + delete iter; + } + + char msg[100]; + snprintf(msg, sizeof(msg), "(%lld of %lld found)", found, reads_); + thread->stats.AddMessage(msg); + } + + void PrefixScanRandom(ThreadState* thread) { + if (FLAGS_use_prefix_api) { + assert(FLAGS_use_prefix_blooms); + assert(FLAGS_bloom_bits >= 1); + } + + ReadOptions options(FLAGS_verify_checksum, true); + Duration duration(FLAGS_duration, reads_); + + long long found = 0; + + while (!duration.Done(1)) { + std::string value; + const int k = thread->rand.Next() % FLAGS_num; + unique_ptr key = GenerateKeyFromInt(k); + Slice skey(key.get()); + Slice prefix = prefix_extractor_->Transform(skey); + options.prefix = FLAGS_use_prefix_api ? 
&prefix : nullptr; + + Iterator* iter = db_->NewIterator(options); + for (iter->Seek(skey); + iter->Valid() && iter->key().starts_with(prefix); + iter->Next()) { + found++; + } + delete iter; + + thread->stats.FinishedSingleOp(db_); + } + + char msg[100]; + snprintf(msg, sizeof(msg), "(%lld of %lld found)", found, reads_); + thread->stats.AddMessage(msg); + } + + void ReadMissing(ThreadState* thread) { + FLAGS_warn_missing_keys = false; // Never warn about missing keys + + Duration duration(FLAGS_duration, reads_); + ReadOptions options(FLAGS_verify_checksum, true); + + if (FLAGS_use_multiget) { + const long& kpg = FLAGS_keys_per_multiget; // keys per multiget group + long keys_left = reads_; + + // Recalculate number of keys per group, and call MultiGet until done + long num_keys; + long found; + while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) { + found = MultiGetRandom(options, num_keys, thread->rand, FLAGS_num, "."); + + // We should not find any key since the key we try to get has a + // different suffix + if (found) { + assert(false); + } + + thread->stats.FinishedSingleOp(db_); + keys_left -= num_keys; + } + } else { // Regular case (not MultiGet) + std::string value; + Status s; + while (!duration.Done(1)) { + const long long k = thread->rand.Next() % FLAGS_num; + unique_ptr key = GenerateKeyFromInt(k, "."); + s = db_->Get(options, key.get(), &value); + assert(!s.ok() && s.IsNotFound()); + thread->stats.FinishedSingleOp(db_); + } + } + } + + void ReadHot(ThreadState* thread) { + Duration duration(FLAGS_duration, reads_); + ReadOptions options(FLAGS_verify_checksum, true); + const long long range = (FLAGS_num + 99) / 100; + long long found = 0; + + if (FLAGS_use_multiget) { + const long long kpg = FLAGS_keys_per_multiget; // keys per multiget group + long long keys_left = reads_; + + // Recalculate number of keys per group, and call MultiGet until done + long num_keys; + while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) 
{ + found += MultiGetRandom(options, num_keys, thread->rand, range, ""); + thread->stats.FinishedSingleOp(db_); + keys_left -= num_keys; + } + } else { + std::string value; + while (!duration.Done(1)) { + const long long k = thread->rand.Next() % range; + unique_ptr key = GenerateKeyFromInt(k); + if (db_->Get(options, key.get(), &value).ok()){ + ++found; + } + thread->stats.FinishedSingleOp(db_); + } + } + + char msg[100]; + snprintf(msg, sizeof(msg), "(%lld of %lld found)", found, reads_); + thread->stats.AddMessage(msg); + } + + void SeekRandom(ThreadState* thread) { + Duration duration(FLAGS_duration, reads_); + ReadOptions options(FLAGS_verify_checksum, true); + std::string value; + long long found = 0; + while (!duration.Done(1)) { + Iterator* iter = db_->NewIterator(options); + const long long k = thread->rand.Next() % FLAGS_num; + unique_ptr key = GenerateKeyFromInt(k); + iter->Seek(key.get()); + if (iter->Valid() && iter->key() == key.get()) found++; + delete iter; + thread->stats.FinishedSingleOp(db_); + } + char msg[100]; + snprintf(msg, sizeof(msg), "(%lld of %lld found)", found, num_); + thread->stats.AddMessage(msg); + } + + void DoDelete(ThreadState* thread, bool seq) { + WriteBatch batch; + Status s; + Duration duration(seq ? 0 : FLAGS_duration, num_); + long i = 0; + while (!duration.Done(entries_per_batch_)) { + batch.Clear(); + for (int j = 0; j < entries_per_batch_; j++) { + const long long k = seq ? 
i+j : (thread->rand.Next() % FLAGS_num); + unique_ptr key = GenerateKeyFromInt(k); + batch.Delete(key.get()); + thread->stats.FinishedSingleOp(db_); + } + s = db_->Write(write_options_, &batch); + if (!s.ok()) { + fprintf(stderr, "del error: %s\n", s.ToString().c_str()); + exit(1); + } + ++i; + } + } + + void DeleteSeq(ThreadState* thread) { + DoDelete(thread, true); + } + + void DeleteRandom(ThreadState* thread) { + DoDelete(thread, false); + } + + void ReadWhileWriting(ThreadState* thread) { + if (thread->tid > 0) { + ReadRandom(thread); + } else { + // Special thread that keeps writing until other threads are done. + RandomGenerator gen; + double last = FLAGS_env->NowMicros(); + int writes_per_second_by_10 = 0; + int num_writes = 0; + + // --writes_per_second rate limit is enforced per 100 milliseconds + // intervals to avoid a burst of writes at the start of each second. + + if (FLAGS_writes_per_second > 0) + writes_per_second_by_10 = FLAGS_writes_per_second / 10; + + // Don't merge stats from this thread with the readers. 
+ thread->stats.SetExcludeFromMerge(); + + while (true) { + { + MutexLock l(&thread->shared->mu); + if (thread->shared->num_done + 1 >= thread->shared->num_initialized) { + // Other threads have finished + break; + } + } + + const long long k = thread->rand.Next() % FLAGS_num; + unique_ptr key = GenerateKeyFromInt(k); + Status s = db_->Put(write_options_, key.get(), + gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + thread->stats.FinishedSingleOp(db_); + + ++num_writes; + if (writes_per_second_by_10 && num_writes >= writes_per_second_by_10) { + double now = FLAGS_env->NowMicros(); + double usecs_since_last = now - last; + + num_writes = 0; + last = now; + + if (usecs_since_last < 100000.0) { + FLAGS_env->SleepForMicroseconds(100000.0 - usecs_since_last); + last = FLAGS_env->NowMicros(); + } + } + } + } + } + + // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V) + // in DB atomically i.e in a single batch. Also refer GetMany. + Status PutMany(const WriteOptions& writeoptions, + const Slice& key, const Slice& value) { + std::string suffixes[3] = {"2", "1", "0"}; + std::string keys[3]; + + WriteBatch batch; + Status s; + for (int i = 0; i < 3; i++) { + keys[i] = key.ToString() + suffixes[i]; + batch.Put(keys[i], value); + } + + s = db_->Write(writeoptions, &batch); + return s; + } + + + // Given a key K, this deletes (K+"0", V), (K+"1", V), (K+"2", V) + // in DB atomically i.e in a single batch. Also refer GetMany. 
+ Status DeleteMany(const WriteOptions& writeoptions, + const Slice& key) { + std::string suffixes[3] = {"1", "2", "0"}; + std::string keys[3]; + + WriteBatch batch; + Status s; + for (int i = 0; i < 3; i++) { + keys[i] = key.ToString() + suffixes[i]; + batch.Delete(keys[i]); + } + + s = db_->Write(writeoptions, &batch); + return s; + } + + // Given a key K and value V, this gets values for K+"0", K+"1" and K+"2" + // in the same snapshot, and verifies that all the values are identical. + // ASSUMES that PutMany was used to put (K, V) into the DB. + Status GetMany(const ReadOptions& readoptions, + const Slice& key, std::string* value) { + std::string suffixes[3] = {"0", "1", "2"}; + std::string keys[3]; + Slice key_slices[3]; + std::string values[3]; + ReadOptions readoptionscopy = readoptions; + readoptionscopy.snapshot = db_->GetSnapshot(); + Status s; + for (int i = 0; i < 3; i++) { + keys[i] = key.ToString() + suffixes[i]; + key_slices[i] = keys[i]; + s = db_->Get(readoptionscopy, key_slices[i], value); + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "get error: %s\n", s.ToString().c_str()); + values[i] = ""; + // we continue after error rather than exiting so that we can + // find more errors if any + } else if (s.IsNotFound()) { + values[i] = ""; + } else { + values[i] = *value; + } + } + db_->ReleaseSnapshot(readoptionscopy.snapshot); + + if ((values[0] != values[1]) || (values[1] != values[2])) { + fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n", + key.ToString().c_str(), values[0].c_str(), values[1].c_str(), + values[2].c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } + + return s; + } + + // Differs from readrandomwriterandom in the following ways: + // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs. 
+ // (b) Does deletes as well (per FLAGS_deletepercent) + // (c) In order to achieve high % of 'found' during lookups, and to do + // multiple writes (including puts and deletes) it uses upto + // FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys. + // (d) Does not have a MultiGet option. + void RandomWithVerify(ThreadState* thread) { + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + std::string value; + long long found = 0; + int get_weight = 0; + int put_weight = 0; + int delete_weight = 0; + long long gets_done = 0; + long long puts_done = 0; + long long deletes_done = 0; + + // the number of iterations is the larger of read_ or write_ + for (long long i = 0; i < readwrites_; i++) { + const long long k = thread->rand.Next() % (FLAGS_numdistinct); + unique_ptr key = GenerateKeyFromInt(k); + if (get_weight == 0 && put_weight == 0 && delete_weight == 0) { + // one batch completed, reinitialize for next batch + get_weight = FLAGS_readwritepercent; + delete_weight = FLAGS_deletepercent; + put_weight = 100 - get_weight - delete_weight; + } + if (get_weight > 0) { + // do all the gets first + Status s = GetMany(options, key.get(), &value); + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "getmany error: %s\n", s.ToString().c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } else if (!s.IsNotFound()) { + found++; + } + get_weight--; + gets_done++; + } else if (put_weight > 0) { + // then do all the corresponding number of puts + // for all the gets we have done earlier + Status s = PutMany(write_options_, key.get(), + gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "putmany error: %s\n", s.ToString().c_str()); + exit(1); + } + put_weight--; + puts_done++; + } else if (delete_weight > 0) { + Status s = DeleteMany(write_options_, key.get()); + if (!s.ok()) { + fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str()); + exit(1); + } + 
delete_weight--; + deletes_done++; + } + + thread->stats.FinishedSingleOp(db_); + } + char msg[100]; + snprintf(msg, sizeof(msg), + "( get:%lld put:%lld del:%lld total:%lld found:%lld)", + gets_done, puts_done, deletes_done, readwrites_, found); + thread->stats.AddMessage(msg); + } + + // This is different from ReadWhileWriting because it does not use + // an extra thread. + void ReadRandomWriteRandom(ThreadState* thread) { + if (FLAGS_use_multiget){ + // Separate function for multiget (for ease of reading) + ReadRandomWriteRandomMultiGet(thread); + return; + } + + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + std::string value; + long long found = 0; + int get_weight = 0; + int put_weight = 0; + long long reads_done = 0; + long long writes_done = 0; + Duration duration(FLAGS_duration, readwrites_); + + // the number of iterations is the larger of read_ or write_ + while (!duration.Done(1)) { + const long long k = thread->rand.Next() % FLAGS_num; + unique_ptr key = GenerateKeyFromInt(k); + if (get_weight == 0 && put_weight == 0) { + // one batch completed, reinitialize for next batch + get_weight = FLAGS_readwritepercent; + put_weight = 100 - get_weight; + } + if (get_weight > 0) { + + if (FLAGS_use_snapshot) { + options.snapshot = db_->GetSnapshot(); + } + + if (FLAGS_get_approx) { + char key2[100]; + snprintf(key2, sizeof(key2), "%016lld", k + 1); + Slice skey2(key2); + Slice skey(key2); + Range range(skey, skey2); + uint64_t sizes; + db_->GetApproximateSizes(&range, 1, &sizes); + } + + // do all the gets first + Status s = db_->Get(options, key.get(), &value); + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "get error: %s\n", s.ToString().c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } else if (!s.IsNotFound()) { + found++; + } + + get_weight--; + reads_done++; + + if (FLAGS_use_snapshot) { + db_->ReleaseSnapshot(options.snapshot); + } + + } else if (put_weight > 0) { + 
// then do all the corresponding number of puts + // for all the gets we have done earlier + Status s = db_->Put(write_options_, key.get(), + gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + put_weight--; + writes_done++; + } + thread->stats.FinishedSingleOp(db_); + } + char msg[100]; + snprintf(msg, sizeof(msg), + "( reads:%lld writes:%lld total:%lld found:%lld)", + reads_done, writes_done, readwrites_, found); + thread->stats.AddMessage(msg); + } + + // ReadRandomWriteRandom (with multiget) + // Does FLAGS_keys_per_multiget reads (per multiget), followed by some puts. + // FLAGS_readwritepercent will specify the ratio of gets to puts. + // e.g.: If FLAGS_keys_per_multiget == 100 and FLAGS_readwritepercent == 75 + // Then each block will do 100 multigets and 33 puts + // So there are 133 operations in-total: 100 of them (75%) are gets, and 33 + // of them (25%) are puts. + void ReadRandomWriteRandomMultiGet(ThreadState* thread) { + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + + // For multiget + const long& kpg = FLAGS_keys_per_multiget; // keys per multiget group + + long keys_left = readwrites_; // number of keys still left to read + long num_keys; // number of keys to read in current group + long num_put_keys; // number of keys to put in current group + + long found = 0; + long reads_done = 0; + long writes_done = 0; + long multigets_done = 0; + + // the number of iterations is the larger of read_ or write_ + Duration duration(FLAGS_duration, readwrites_); + while(true) { + // Read num_keys keys, then write num_put_keys keys. 
+ // The ratio of num_keys to num_put_keys is always FLAGS_readwritepercent + // And num_keys is set to be FLAGS_keys_per_multiget (kpg) + // num_put_keys is calculated accordingly (to maintain the ratio) + // Note: On the final iteration, num_keys and num_put_keys will be smaller + num_keys = std::min(keys_left*(FLAGS_readwritepercent + 99)/100, kpg); + num_put_keys = num_keys * (100-FLAGS_readwritepercent) + / FLAGS_readwritepercent; + + // This will break the loop when duration is complete + if (duration.Done(num_keys + num_put_keys)) { + break; + } + + // A quick check to make sure our formula doesn't break on edge cases + assert(num_keys >= 1); + assert(num_keys + num_put_keys <= keys_left); + + // Apply the MultiGet operations + found += MultiGetRandom(options, num_keys, thread->rand, FLAGS_num, ""); + ++multigets_done; + reads_done+=num_keys; + thread->stats.FinishedSingleOp(db_); + + // Now do the puts + int i; + long long k; + for(i=0; irand.Next() % FLAGS_num; + unique_ptr key = GenerateKeyFromInt(k); + Status s = db_->Put(write_options_, key.get(), + gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + writes_done++; + thread->stats.FinishedSingleOp(db_); + } + + keys_left -= (num_keys + num_put_keys); + } + char msg[100]; + snprintf(msg, sizeof(msg), + "( reads:%ld writes:%ld total:%lld multiget_ops:%ld found:%ld)", + reads_done, writes_done, readwrites_, multigets_done, found); + thread->stats.AddMessage(msg); + } + + // + // Read-modify-write for random keys + void UpdateRandom(ThreadState* thread) { + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + std::string value; + long long found = 0; + Duration duration(FLAGS_duration, readwrites_); + + // the number of iterations is the larger of read_ or write_ + while (!duration.Done(1)) { + const long long k = thread->rand.Next() % FLAGS_num; + unique_ptr key = GenerateKeyFromInt(k); + + if (FLAGS_use_snapshot) { 
+ options.snapshot = db_->GetSnapshot(); + } + + if (FLAGS_get_approx) { + char key2[100]; + snprintf(key2, sizeof(key2), "%016lld", k + 1); + Slice skey2(key2); + Slice skey(key2); + Range range(skey, skey2); + uint64_t sizes; + db_->GetApproximateSizes(&range, 1, &sizes); + } + + if (db_->Get(options, key.get(), &value).ok()) { + found++; + } + + if (FLAGS_use_snapshot) { + db_->ReleaseSnapshot(options.snapshot); + } + + Status s = db_->Put(write_options_, key.get(), gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + thread->stats.FinishedSingleOp(db_); + } + char msg[100]; + snprintf(msg, sizeof(msg), + "( updates:%lld found:%lld)", readwrites_, found); + thread->stats.AddMessage(msg); + } + + // Read-modify-write for random keys. + // Each operation causes the key grow by value_size (simulating an append). + // Generally used for benchmarking against merges of similar type + void AppendRandom(ThreadState* thread) { + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + std::string value; + long found = 0; + + // The number of iterations is the larger of read_ or write_ + Duration duration(FLAGS_duration, readwrites_); + while (!duration.Done(1)) { + const long long k = thread->rand.Next() % FLAGS_num; + unique_ptr key = GenerateKeyFromInt(k); + + if (FLAGS_use_snapshot) { + options.snapshot = db_->GetSnapshot(); + } + + if (FLAGS_get_approx) { + char key2[100]; + snprintf(key2, sizeof(key2), "%016lld", k + 1); + Slice skey2(key2); + Slice skey(key2); + Range range(skey, skey2); + uint64_t sizes; + db_->GetApproximateSizes(&range, 1, &sizes); + } + + // Get the existing value + if (db_->Get(options, key.get(), &value).ok()) { + found++; + } else { + // If not existing, then just assume an empty string of data + value.clear(); + } + + if (FLAGS_use_snapshot) { + db_->ReleaseSnapshot(options.snapshot); + } + + // Update the value (by appending data) + Slice operand = 
gen.Generate(value_size_); + if (value.size() > 0) { + // Use a delimeter to match the semantics for StringAppendOperator + value.append(1,','); + } + value.append(operand.data(), operand.size()); + + // Write back to the database + Status s = db_->Put(write_options_, key.get(), value); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + thread->stats.FinishedSingleOp(db_); + } + char msg[100]; + snprintf(msg, sizeof(msg), "( updates:%lld found:%ld)", readwrites_, found); + thread->stats.AddMessage(msg); + } + + // Read-modify-write for random keys (using MergeOperator) + // The merge operator to use should be defined by FLAGS_merge_operator + // Adjust FLAGS_value_size so that the keys are reasonable for this operator + // Assumes that the merge operator is non-null (i.e.: is well-defined) + // + // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8 + // to simulate random additions over 64-bit integers using merge. + // + // The number of merges on the same key can be controlled by adjusting + // FLAGS_merge_keys. + void MergeRandom(ThreadState* thread) { + RandomGenerator gen; + + // The number of iterations is the larger of read_ or write_ + Duration duration(FLAGS_duration, readwrites_); + while (!duration.Done(1)) { + const long long k = thread->rand.Next() % merge_keys_; + unique_ptr key = GenerateKeyFromInt(k); + + Status s = db_->Merge(write_options_, key.get(), + gen.Generate(value_size_)); + + if (!s.ok()) { + fprintf(stderr, "merge error: %s\n", s.ToString().c_str()); + exit(1); + } + thread->stats.FinishedSingleOp(db_); + } + + // Print some statistics + char msg[100]; + snprintf(msg, sizeof(msg), "( updates:%lld)", readwrites_); + thread->stats.AddMessage(msg); + } + + // Read and merge random keys. The amount of reads and merges are controlled + // by adjusting FLAGS_num and FLAGS_mergereadpercent. 
The number of distinct + // keys (and thus also the number of reads and merges on the same key) can be + // adjusted with FLAGS_merge_keys. + // + // As with MergeRandom, the merge operator to use should be defined by + // FLAGS_merge_operator. + void ReadRandomMergeRandom(ThreadState* thread) { + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + std::string value; + long long num_hits = 0; + long long num_gets = 0; + long long num_merges = 0; + size_t max_length = 0; + + // the number of iterations is the larger of read_ or write_ + Duration duration(FLAGS_duration, readwrites_); + + while (!duration.Done(1)) { + const long long k = thread->rand.Next() % merge_keys_; + unique_ptr key = GenerateKeyFromInt(k); + + bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent; + + if (do_merge) { + Status s = db_->Merge(write_options_, key.get(), + gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "merge error: %s\n", s.ToString().c_str()); + exit(1); + } + + num_merges++; + + } else { + Status s = db_->Get(options, key.get(), &value); + if (value.length() > max_length) + max_length = value.length(); + + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "get error: %s\n", s.ToString().c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } else if (!s.IsNotFound()) { + num_hits++; + } + + num_gets++; + + } + + thread->stats.FinishedSingleOp(db_); + } + char msg[100]; + snprintf(msg, sizeof(msg), + "(reads:%lld merges:%lld total:%lld hits:%lld maxlength:%zu)", + num_gets, num_merges, readwrites_, num_hits, max_length); + thread->stats.AddMessage(msg); + } + + + void Compact(ThreadState* thread) { + db_->CompactRange(nullptr, nullptr); + } + + void PrintStats(const char* key) { + std::string stats; + if (!db_->GetProperty(key, &stats)) { + stats = "(failed)"; + } + fprintf(stdout, "\n%s\n", stats.c_str()); + } + + static void WriteToFile(void* arg, const char* buf, int 
n) { + reinterpret_cast(arg)->Append(Slice(buf, n)); + } + + void HeapProfile() { + char fname[100]; + EnvOptions soptions; + snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db.c_str(), + ++heap_counter_); + unique_ptr file; + Status s = FLAGS_env->NewWritableFile(fname, &file, soptions); + if (!s.ok()) { + fprintf(stderr, "%s\n", s.ToString().c_str()); + return; + } + bool ok = port::GetHeapProfile(WriteToFile, file.get()); + if (!ok) { + fprintf(stderr, "heap profiling not supported\n"); + FLAGS_env->DeleteFile(fname); + } + } +}; + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::InstallStackTraceHandler(); + google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); + google::ParseCommandLineFlags(&argc, &argv, true); + + FLAGS_compaction_style_e = (rocksdb::CompactionStyle) FLAGS_compaction_style; + if (FLAGS_statistics) { + dbstats = rocksdb::CreateDBStatistics(); + } + + std::vector fanout = + rocksdb::stringSplit(FLAGS_max_bytes_for_level_multiplier_additional, ','); + for (unsigned int j= 0; j < fanout.size(); j++) { + FLAGS_max_bytes_for_level_multiplier_additional_v.push_back( + std::stoi(fanout[j])); + } + + FLAGS_compression_type_e = + StringToCompressionType(FLAGS_compression_type.c_str()); + + if (!FLAGS_hdfs.empty()) { + FLAGS_env = new rocksdb::HdfsEnv(FLAGS_hdfs); + } + + if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE")) + FLAGS_compaction_fadvice_e = rocksdb::Options::NONE; + else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL")) + FLAGS_compaction_fadvice_e = rocksdb::Options::NORMAL; + else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL")) + FLAGS_compaction_fadvice_e = rocksdb::Options::SEQUENTIAL; + else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED")) + FLAGS_compaction_fadvice_e = rocksdb::Options::WILLNEED; + else { + fprintf(stdout, "Unknown compaction fadvice:%s\n", + FLAGS_compaction_fadvice.c_str()); + } + + 
FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); + + // The number of background threads should be at least as much the + // max number of concurrent compactions. + FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions); + // Choose a location for the test database if none given with --db= + if (FLAGS_db.empty()) { + std::string default_db_path; + rocksdb::Env::Default()->GetTestDirectory(&default_db_path); + default_db_path += "/dbbench"; + FLAGS_db = default_db_path; + } + + rocksdb::Benchmark benchmark; + benchmark.Run(); + return 0; +} diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc new file mode 100644 index 00000000..04d6d0e1 --- /dev/null +++ b/db/db_filesnapshot.cc @@ -0,0 +1,114 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 Facebook. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include +#include +#include +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "port/port.h" +#include "util/mutexlock.h" + +namespace rocksdb { + +Status DBImpl::DisableFileDeletions() { + MutexLock l(&mutex_); + ++disable_delete_obsolete_files_; + if (disable_delete_obsolete_files_ == 1) { + // if not, it has already been disabled, so don't log anything + Log(options_.info_log, "File Deletions Disabled"); + } + return Status::OK(); +} + +Status DBImpl::EnableFileDeletions(bool force) { + DeletionState deletion_state; + bool should_purge_files = false; + { + MutexLock l(&mutex_); + if (force) { + // if force, we need to enable file deletions right away + disable_delete_obsolete_files_ = 0; + } else if (disable_delete_obsolete_files_ > 0) { + --disable_delete_obsolete_files_; + } + if (disable_delete_obsolete_files_ == 0) { + Log(options_.info_log, "File Deletions Enabled"); + should_purge_files = true; + FindObsoleteFiles(deletion_state, true); + } + } + if (should_purge_files) { + PurgeObsoleteFiles(deletion_state); + } + LogFlush(options_.info_log); + return Status::OK(); +} + +Status DBImpl::GetLiveFiles(std::vector& ret, + uint64_t* manifest_file_size, + bool flush_memtable) { + + *manifest_file_size = 0; + + if (flush_memtable) { + // flush all dirty data to disk. + Status status = Flush(FlushOptions()); + if (!status.ok()) { + Log(options_.info_log, "Cannot Flush data %s\n", + status.ToString().c_str()); + return status; + } + } + + MutexLock l(&mutex_); + + // Make a set of all of the live *.sst files + std::set live; + versions_->current()->AddLiveFiles(&live); + + ret.clear(); + ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST + + // create names of the live files. 
The names are not absolute + // paths, instead they are relative to dbname_; + for (auto live_file : live) { + ret.push_back(TableFileName("", live_file)); + } + + ret.push_back(CurrentFileName("")); + ret.push_back(DescriptorFileName("", versions_->ManifestFileNumber())); + + // find length of manifest file while holding the mutex lock + *manifest_file_size = versions_->ManifestFileSize(); + + return Status::OK(); +} + +Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { + // First get sorted files in archive dir, then append sorted files from main + // dir to maintain sorted order + + // list wal files in archive dir. + Status s; + std::string archivedir = ArchivalDirectory(options_.wal_dir); + if (env_->FileExists(archivedir)) { + s = AppendSortedWalsOfType(archivedir, files, kArchivedLogFile); + if (!s.ok()) { + return s; + } + } + // list wal files in main db dir. + return AppendSortedWalsOfType(options_.wal_dir, files, kAliveLogFile); +} + +} diff --git a/db/db_impl.cc b/db/db_impl.cc new file mode 100644 index 00000000..e84817b9 --- /dev/null +++ b/db/db_impl.cc @@ -0,0 +1,3960 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/db_impl.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/builder.h" +#include "db/dbformat.h" +#include "db/db_iter.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/memtablelist.h" +#include "db/merge_context.h" +#include "db/merge_helper.h" +#include "db/prefix_filter_iterator.h" +#include "db/table_cache.h" +#include "db/table_properties_collector.h" +#include "db/transaction_log_impl.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "port/port.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "port/port.h" +#include "table/block.h" +#include "table/block_based_table_factory.h" +#include "table/merger.h" +#include "table/two_level_iterator.h" +#include "util/auto_roll_logger.h" +#include "util/build_version.h" +#include "util/coding.h" +#include "util/hash_skiplist_rep.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/perf_context_imp.h" +#include "util/stop_watch.h" +#include "util/autovector.h" + +namespace rocksdb { + +void dumpLeveldbBuildVersion(Logger * log); + +// Information kept for every waiting writer +struct DBImpl::Writer { + Status status; + WriteBatch* batch; + bool sync; + bool disableWAL; + bool done; + port::CondVar cv; + + explicit Writer(port::Mutex* mu) : cv(mu) { } +}; + +struct DBImpl::CompactionState { + Compaction* const compaction; + + // If there were two snapshots with seq numbers s1 and + // s2 and s1 < s2, and if we find two instances of a key k1 then lies + // entirely within s1 and s2, then the earlier version of k1 can be safely + // deleted because that version is not visible in any snapshot. 
+ std::vector existing_snapshots; + + // Files produced by compaction + struct Output { + uint64_t number; + uint64_t file_size; + InternalKey smallest, largest; + SequenceNumber smallest_seqno, largest_seqno; + }; + std::vector outputs; + std::list allocated_file_numbers; + + // State kept for output being generated + unique_ptr outfile; + unique_ptr builder; + + uint64_t total_bytes; + + Output* current_output() { return &outputs[outputs.size()-1]; } + + explicit CompactionState(Compaction* c) + : compaction(c), + total_bytes(0) { + } + + // Create a client visible context of this compaction + CompactionFilter::Context GetFilterContext() { + CompactionFilter::Context context; + context.is_full_compaction = compaction->IsFullCompaction(); + return context; + } +}; + +// Fix user-supplied options to be reasonable +template +static void ClipToRange(T* ptr, V minvalue, V maxvalue) { + if (static_cast(*ptr) > maxvalue) *ptr = maxvalue; + if (static_cast(*ptr) < minvalue) *ptr = minvalue; +} +Options SanitizeOptions(const std::string& dbname, + const InternalKeyComparator* icmp, + const InternalFilterPolicy* ipolicy, + const Options& src) { + Options result = src; + result.comparator = icmp; + result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr; + ClipToRange(&result.max_open_files, 20, 1000000); + ClipToRange(&result.write_buffer_size, ((size_t)64)<<10, + ((size_t)64)<<30); + ClipToRange(&result.block_size, 1<<10, 4<<20); + + // if user sets arena_block_size, we trust user to use this value. 
Otherwise, + // calculate a proper value from writer_buffer_size; + if (result.arena_block_size <= 0) { + result.arena_block_size = result.write_buffer_size / 10; + } + + result.min_write_buffer_number_to_merge = std::min( + result.min_write_buffer_number_to_merge, result.max_write_buffer_number-1); + if (result.info_log == nullptr) { + Status s = CreateLoggerFromOptions(dbname, result.db_log_dir, src.env, + result, &result.info_log); + if (!s.ok()) { + // No place suitable for logging + result.info_log = nullptr; + } + } + if (result.block_cache == nullptr && !result.no_block_cache) { + result.block_cache = NewLRUCache(8 << 20); + } + result.compression_per_level = src.compression_per_level; + if (result.block_size_deviation < 0 || result.block_size_deviation > 100) { + result.block_size_deviation = 0; + } + if (result.max_mem_compaction_level >= result.num_levels) { + result.max_mem_compaction_level = result.num_levels - 1; + } + if (result.soft_rate_limit > result.hard_rate_limit) { + result.soft_rate_limit = result.hard_rate_limit; + } + if (result.compaction_filter) { + Log(result.info_log, "Compaction filter specified, ignore factory"); + } + if (result.prefix_extractor) { + // If a prefix extractor has been supplied and a HashSkipListRepFactory is + // being used, make sure that the latter uses the former as its transform + // function. + auto factory = dynamic_cast( + result.memtable_factory.get()); + if (factory && + factory->GetTransform() != result.prefix_extractor) { + Log(result.info_log, "A prefix hash representation factory was supplied " + "whose prefix extractor does not match options.prefix_extractor. 
" + "Falling back to skip list representation factory"); + result.memtable_factory = std::make_shared(); + } else if (factory) { + Log(result.info_log, "Prefix hash memtable rep is in use."); + } + } + + if (result.wal_dir.empty()) { + // Use dbname as default + result.wal_dir = dbname; + } + + // -- Sanitize the table properties collector + // All user defined properties collectors will be wrapped by + // UserKeyTablePropertiesCollector since for them they only have the + // knowledge of the user keys; internal keys are invisible to them. + auto& collectors = result.table_properties_collectors; + for (size_t i = 0; i < result.table_properties_collectors.size(); ++i) { + assert(collectors[i]); + collectors[i] = + std::make_shared(collectors[i]); + } + + // Add collector to collect internal key statistics + collectors.push_back( + std::make_shared() + ); + + return result; +} + +CompressionType GetCompressionType(const Options& options, int level, + const bool enable_compression) { + if (!enable_compression) { + // disable compression + return kNoCompression; + } + // If the use has specified a different compression level for each level, + // then pick the compresison for that level. + if (!options.compression_per_level.empty()) { + const int n = options.compression_per_level.size() - 1; + // It is possible for level_ to be -1; in that case, we use level + // 0's compression. This occurs mostly in backwards compatibility + // situations when the builder doesn't know what level the file + // belongs to. Likewise, if level_ is beyond the end of the + // specified compression levels, use the last value. + return options.compression_per_level[std::max(0, std::min(level, n))]; + } else { + return options.compression; + } +} + +CompressionType GetCompressionFlush(const Options& options) { + // Compressing memtable flushes might not help unless the sequential load + // optimization is used for leveled compaction. 
Otherwise the CPU and + // latency overhead is not offset by saving much space. + + bool can_compress; + + if (options.compaction_style == kCompactionStyleUniversal) { + can_compress = + (options.compaction_options_universal.compression_size_percent < 0); + } else { + // For leveled compress when min_level_to_compress == 0. + can_compress = (GetCompressionType(options, 0, true) != kNoCompression); + } + + if (can_compress) { + return options.compression; + } else { + return kNoCompression; + } +} + +DBImpl::DBImpl(const Options& options, const std::string& dbname) + : env_(options.env), + dbname_(dbname), + internal_comparator_(options.comparator), + options_(SanitizeOptions(dbname, &internal_comparator_, + &internal_filter_policy_, options)), + internal_filter_policy_(options.filter_policy), + owns_info_log_(options_.info_log != options.info_log), + db_lock_(nullptr), + mutex_(options.use_adaptive_mutex), + shutting_down_(nullptr), + bg_cv_(&mutex_), + mem_rep_factory_(options_.memtable_factory.get()), + mem_(new MemTable(internal_comparator_, options_)), + logfile_number_(0), + super_version_(nullptr), + tmp_batch_(), + bg_compaction_scheduled_(0), + bg_manual_only_(0), + bg_flush_scheduled_(0), + bg_logstats_scheduled_(false), + manual_compaction_(nullptr), + logger_(nullptr), + disable_delete_obsolete_files_(0), + delete_obsolete_files_last_run_(options.env->NowMicros()), + purge_wal_files_last_run_(0), + last_stats_dump_time_microsec_(0), + default_interval_to_delete_obsolete_WAL_(600), + stall_level0_slowdown_(0), + stall_memtable_compaction_(0), + stall_level0_num_files_(0), + stall_level0_slowdown_count_(0), + stall_memtable_compaction_count_(0), + stall_level0_num_files_count_(0), + started_at_(options.env->NowMicros()), + flush_on_destroy_(false), + stats_(options.num_levels), + delayed_writes_(0), + storage_options_(options), + bg_work_gate_closed_(false), + refitting_level_(false) { + + mem_->Ref(); + + env_->GetAbsolutePath(dbname, &db_absolute_path_); 
+ + stall_leveln_slowdown_.resize(options.num_levels); + stall_leveln_slowdown_count_.resize(options.num_levels); + for (int i = 0; i < options.num_levels; ++i) { + stall_leveln_slowdown_[i] = 0; + stall_leveln_slowdown_count_[i] = 0; + } + + // Reserve ten files or so for other uses and give the rest to TableCache. + const int table_cache_size = options_.max_open_files - 10; + table_cache_.reset(new TableCache(dbname_, &options_, + storage_options_, table_cache_size)); + + versions_.reset(new VersionSet(dbname_, &options_, storage_options_, + table_cache_.get(), &internal_comparator_)); + + dumpLeveldbBuildVersion(options_.info_log.get()); + options_.Dump(options_.info_log.get()); + + char name[100]; + Status st = env_->GetHostName(name, 100L); + if (st.ok()) { + host_name_ = name; + } else { + Log(options_.info_log, "Can't get hostname, use localhost as host name."); + host_name_ = "localhost"; + } + last_log_ts = 0; + + LogFlush(options_.info_log); +} + +DBImpl::~DBImpl() { + std::vector to_delete; + to_delete.reserve(options_.max_write_buffer_number); + + // Wait for background work to finish + if (flush_on_destroy_ && mem_->GetFirstSequenceNumber() != 0) { + FlushMemTable(FlushOptions()); + } + mutex_.Lock(); + shutting_down_.Release_Store(this); // Any non-nullptr value is ok + while (bg_compaction_scheduled_ || + bg_flush_scheduled_ || + bg_logstats_scheduled_) { + bg_cv_.Wait(); + } + if (super_version_ != nullptr) { + bool is_last_reference __attribute__((unused)); + is_last_reference = super_version_->Unref(); + assert(is_last_reference); + super_version_->Cleanup(); + delete super_version_; + } + mutex_.Unlock(); + + if (db_lock_ != nullptr) { + env_->UnlockFile(db_lock_); + } + + if (mem_ != nullptr) { + delete mem_->Unref(); + } + + imm_.UnrefAll(&to_delete); + for (MemTable* m: to_delete) { + delete m; + } + LogFlush(options_.info_log); +} + +// Do not flush and close database elegantly. Simulate a crash. 
+void DBImpl::TEST_Destroy_DBImpl() { + // ensure that no new memtable flushes can occur + flush_on_destroy_ = false; + + // wait till all background compactions are done. + mutex_.Lock(); + while (bg_compaction_scheduled_ || + bg_flush_scheduled_ || + bg_logstats_scheduled_) { + bg_cv_.Wait(); + } + if (super_version_ != nullptr) { + bool is_last_reference __attribute__((unused)); + is_last_reference = super_version_->Unref(); + assert(is_last_reference); + super_version_->Cleanup(); + delete super_version_; + } + + // Prevent new compactions from occuring. + bg_work_gate_closed_ = true; + const int LargeNumber = 10000000; + bg_compaction_scheduled_ += LargeNumber; + + mutex_.Unlock(); + LogFlush(options_.info_log); + + // force release the lock file. + if (db_lock_ != nullptr) { + env_->UnlockFile(db_lock_); + } + + log_.reset(); + versions_.reset(); + table_cache_.reset(); +} + +uint64_t DBImpl::TEST_Current_Manifest_FileNo() { + return versions_->ManifestFileNumber(); +} + +Status DBImpl::NewDB() { + VersionEdit new_db; + new_db.SetComparatorName(user_comparator()->Name()); + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + const std::string manifest = DescriptorFileName(dbname_, 1); + unique_ptr file; + Status s = env_->NewWritableFile(manifest, &file, storage_options_); + if (!s.ok()) { + return s; + } + file->SetPreallocationBlockSize(options_.manifest_preallocation_size); + { + log::Writer log(std::move(file)); + std::string record; + new_db.EncodeTo(&record); + s = log.AddRecord(record); + } + if (s.ok()) { + // Make "CURRENT" file that points to the new manifest file. 
+ s = SetCurrentFile(env_, dbname_, 1); + } else { + env_->DeleteFile(manifest); + } + return s; +} + +void DBImpl::MaybeIgnoreError(Status* s) const { + if (s->ok() || options_.paranoid_checks) { + // No change needed + } else { + Log(options_.info_log, "Ignoring error %s", s->ToString().c_str()); + *s = Status::OK(); + } +} + +const Status DBImpl::CreateArchivalDirectory() { + if (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0) { + std::string archivalPath = ArchivalDirectory(options_.wal_dir); + return env_->CreateDirIfMissing(archivalPath); + } + return Status::OK(); +} + +void DBImpl::PrintStatistics() { + auto dbstats = options_.statistics.get(); + if (dbstats) { + Log(options_.info_log, + "STATISTCS:\n %s", + dbstats->ToString().c_str()); + } +} + +void DBImpl::MaybeDumpStats() { + if (options_.stats_dump_period_sec == 0) return; + + const uint64_t now_micros = env_->NowMicros(); + + if (last_stats_dump_time_microsec_ + + options_.stats_dump_period_sec * 1000000 + <= now_micros) { + // Multiple threads could race in here simultaneously. + // However, the last one will update last_stats_dump_time_microsec_ + // atomically. We could see more than one dump during one dump + // period in rare cases. 
+ last_stats_dump_time_microsec_ = now_micros; + std::string stats; + GetProperty("rocksdb.stats", &stats); + Log(options_.info_log, "%s", stats.c_str()); + PrintStatistics(); + } +} + +// DBImpl::SuperVersion methods +DBImpl::SuperVersion::SuperVersion(const int num_memtables) { + to_delete.resize(num_memtables); +} + +DBImpl::SuperVersion::~SuperVersion() { + for (auto td : to_delete) { + delete td; + } +} + +DBImpl::SuperVersion* DBImpl::SuperVersion::Ref() { + refs.fetch_add(1, std::memory_order_relaxed); + return this; +} + +bool DBImpl::SuperVersion::Unref() { + assert(refs > 0); + // fetch_sub returns the previous value of ref + return refs.fetch_sub(1, std::memory_order_relaxed) == 1; +} + +void DBImpl::SuperVersion::Cleanup() { + assert(refs.load(std::memory_order_relaxed) == 0); + imm.UnrefAll(&to_delete); + MemTable* m = mem->Unref(); + if (m != nullptr) { + to_delete.push_back(m); + } + current->Unref(); +} + +void DBImpl::SuperVersion::Init(MemTable* new_mem, const MemTableList& new_imm, + Version* new_current) { + mem = new_mem; + imm = new_imm; + current = new_current; + mem->Ref(); + imm.RefAll(); + current->Ref(); + refs.store(1, std::memory_order_relaxed); +} + +// Returns the list of live files in 'sst_live' and the list +// of all files in the filesystem in 'all_files'. 
+// no_full_scan = true -- never do the full scan using GetChildren() +// force = false -- don't force the full scan, except every +// options_.delete_obsolete_files_period_micros +// force = true -- force the full scan +void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, + bool force, + bool no_full_scan) { + mutex_.AssertHeld(); + + // if deletion is disabled, do nothing + if (disable_delete_obsolete_files_ > 0) { + return; + } + + bool doing_the_full_scan = false; + + // logic for figurint out if we're doing the full scan + if (no_full_scan) { + doing_the_full_scan = false; + } else if (force || options_.delete_obsolete_files_period_micros == 0) { + doing_the_full_scan = true; + } else { + const uint64_t now_micros = env_->NowMicros(); + if (delete_obsolete_files_last_run_ + + options_.delete_obsolete_files_period_micros < now_micros) { + doing_the_full_scan = true; + delete_obsolete_files_last_run_ = now_micros; + } + } + + // get obsolete files + versions_->GetObsoleteFiles(&deletion_state.sst_delete_files); + + // store the current filenum, lognum, etc + deletion_state.manifest_file_number = versions_->ManifestFileNumber(); + deletion_state.log_number = versions_->LogNumber(); + deletion_state.prev_log_number = versions_->PrevLogNumber(); + + if (!doing_the_full_scan && !deletion_state.HaveSomethingToDelete()) { + // avoid filling up sst_live if we're sure that we + // are not going to do the full scan and that we don't have + // anything to delete at the moment + return; + } + + // don't delete live files + deletion_state.sst_live.assign(pending_outputs_.begin(), + pending_outputs_.end()); + versions_->AddLiveFiles(&deletion_state.sst_live); + + if (doing_the_full_scan) { + // set of all files in the directory + env_->GetChildren(dbname_, &deletion_state.all_files); // Ignore errors + + //Add log files in wal_dir + if (options_.wal_dir != dbname_) { + std::vector log_files; + env_->GetChildren(options_.wal_dir, &log_files); // Ignore errors + 
deletion_state.all_files.insert( + deletion_state.all_files.end(), + log_files.begin(), + log_files.end() + ); + } + } +} + +// Diffs the files listed in filenames and those that do not +// belong to live files are posibly removed. Also, removes all the +// files in sst_delete_files and log_delete_files. +// It is not necessary to hold the mutex when invoking this method. +void DBImpl::PurgeObsoleteFiles(DeletionState& state) { + + // check if there is anything to do + if (!state.all_files.size() && + !state.sst_delete_files.size() && + !state.log_delete_files.size()) { + return; + } + + // this checks if FindObsoleteFiles() was run before. If not, don't do + // PurgeObsoleteFiles(). If FindObsoleteFiles() was run, we need to also + // run PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true + if (state.manifest_file_number == 0) { + return; + } + + uint64_t number; + FileType type; + std::vector old_log_files; + + // Now, convert live list to an unordered set, WITHOUT mutex held; + // set is slow. 
+ std::unordered_set live_set(state.sst_live.begin(), + state.sst_live.end()); + + state.all_files.reserve(state.all_files.size() + + state.sst_delete_files.size()); + for (auto file : state.sst_delete_files) { + state.all_files.push_back(TableFileName("", file->number).substr(1)); + delete file; + } + + state.all_files.reserve(state.all_files.size() + + state.log_delete_files.size()); + for (auto filenum : state.log_delete_files) { + if (filenum > 0) { + state.all_files.push_back(LogFileName("", filenum).substr(1)); + } + } + + // dedup state.all_files so we don't try to delete the same + // file twice + sort(state.all_files.begin(), state.all_files.end()); + auto unique_end = unique(state.all_files.begin(), state.all_files.end()); + + for (size_t i = 0; state.all_files.begin() + i < unique_end; i++) { + if (ParseFileName(state.all_files[i], &number, &type)) { + bool keep = true; + switch (type) { + case kLogFile: + keep = ((number >= state.log_number) || + (number == state.prev_log_number)); + break; + case kDescriptorFile: + // Keep my manifest file, and any newer incarnations' + // (in case there is a race that allows other incarnations) + keep = (number >= state.manifest_file_number); + break; + case kTableFile: + keep = (live_set.find(number) != live_set.end()); + break; + case kTempFile: + // Any temp files that are currently being written to must + // be recorded in pending_outputs_, which is inserted into "live" + keep = (live_set.find(number) != live_set.end()); + break; + case kInfoLogFile: + keep = true; + if (number != 0) { + old_log_files.push_back(state.all_files[i]); + } + break; + case kCurrentFile: + case kDBLockFile: + case kIdentityFile: + case kMetaDatabase: + keep = true; + break; + } + + if (!keep) { + if (type == kTableFile) { + // evict from cache + table_cache_->Evict(number); + } + std::string fname = ((type == kLogFile) ? 
options_.wal_dir : dbname_) + + "/" + state.all_files[i]; + Log(options_.info_log, + "Delete type=%d #%lu", + int(type), + (unsigned long)number); + + Status st; + if (type == kLogFile && (options_.WAL_ttl_seconds > 0 || + options_.WAL_size_limit_MB > 0)) { + st = env_->RenameFile(fname, + ArchivedLogFileName(options_.wal_dir, number)); + if (!st.ok()) { + Log(options_.info_log, + "RenameFile logfile #%lu FAILED -- %s\n", + (unsigned long)number, st.ToString().c_str()); + } + } else { + st = env_->DeleteFile(fname); + if (!st.ok()) { + Log(options_.info_log, "Delete type=%d #%lu FAILED -- %s\n", + int(type), (unsigned long)number, st.ToString().c_str()); + } + } + } + } + } + + // Delete old info log files. + size_t old_log_file_count = old_log_files.size(); + // NOTE: Currently we only support log purge when options_.db_log_dir is + // located in `dbname` directory. + if (old_log_file_count >= options_.keep_log_file_num && + options_.db_log_dir.empty()) { + std::sort(old_log_files.begin(), old_log_files.end()); + size_t end = old_log_file_count - options_.keep_log_file_num; + for (unsigned int i = 0; i <= end; i++) { + std::string& to_delete = old_log_files.at(i); + // Log(options_.info_log, "Delete type=%d %s\n", + // int(kInfoLogFile), to_delete.c_str()); + env_->DeleteFile(dbname_ + "/" + to_delete); + } + } + PurgeObsoleteWALFiles(); + LogFlush(options_.info_log); +} + +void DBImpl::DeleteObsoleteFiles() { + mutex_.AssertHeld(); + DeletionState deletion_state; + FindObsoleteFiles(deletion_state, true); + PurgeObsoleteFiles(deletion_state); +} + +// 1. Go through all archived files and +// a. if ttl is enabled, delete outdated files +// b. if archive size limit is enabled, delete empty files, +// compute file number and size. +// 2. If size limit is enabled: +// a. compute how many files should be deleted +// b. get sorted non-empty archived logs +// c. 
delete what should be deleted +void DBImpl::PurgeObsoleteWALFiles() { + bool const ttl_enabled = options_.WAL_ttl_seconds > 0; + bool const size_limit_enabled = options_.WAL_size_limit_MB > 0; + if (!ttl_enabled && !size_limit_enabled) { + return; + } + + int64_t current_time; + Status s = env_->GetCurrentTime(¤t_time); + if (!s.ok()) { + Log(options_.info_log, "Can't get current time: %s", s.ToString().c_str()); + assert(false); + return; + } + uint64_t const now_seconds = static_cast(current_time); + uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) ? + options_.WAL_ttl_seconds / 2 : default_interval_to_delete_obsolete_WAL_; + + if (purge_wal_files_last_run_ + time_to_check > now_seconds) { + return; + } + + purge_wal_files_last_run_ = now_seconds; + + std::string archival_dir = ArchivalDirectory(options_.wal_dir); + std::vector files; + s = env_->GetChildren(archival_dir, &files); + if (!s.ok()) { + Log(options_.info_log, "Can't get archive files: %s", s.ToString().c_str()); + assert(false); + return; + } + + size_t log_files_num = 0; + uint64_t log_file_size = 0; + + for (auto& f : files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kLogFile) { + std::string const file_path = archival_dir + "/" + f; + if (ttl_enabled) { + uint64_t file_m_time; + Status const s = env_->GetFileModificationTime(file_path, + &file_m_time); + if (!s.ok()) { + Log(options_.info_log, "Can't get file mod time: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } + if (now_seconds - file_m_time > options_.WAL_ttl_seconds) { + Status const s = env_->DeleteFile(file_path); + if (!s.ok()) { + Log(options_.info_log, "Can't delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } + continue; + } + } + + if (size_limit_enabled) { + uint64_t file_size; + Status const s = env_->GetFileSize(file_path, &file_size); + if (!s.ok()) { + Log(options_.info_log, "Can't get file size: %s: %s", + 
file_path.c_str(), s.ToString().c_str()); + return; + } else { + if (file_size > 0) { + log_file_size = std::max(log_file_size, file_size); + ++log_files_num; + } else { + Status s = env_->DeleteFile(file_path); + if (!s.ok()) { + Log(options_.info_log, "Can't delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } + } + } + } + } + } + + if (0 == log_files_num || !size_limit_enabled) { + return; + } + + size_t const files_keep_num = options_.WAL_size_limit_MB * + 1024 * 1024 / log_file_size; + if (log_files_num <= files_keep_num) { + return; + } + + size_t files_del_num = log_files_num - files_keep_num; + VectorLogPtr archived_logs; + AppendSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); + + if (files_del_num > archived_logs.size()) { + Log(options_.info_log, "Trying to delete more archived log files than " + "exist. Deleting all"); + files_del_num = archived_logs.size(); + } + + for (size_t i = 0; i < files_del_num; ++i) { + std::string const file_path = archived_logs[i]->PathName(); + Status const s = DeleteFile(file_path); + if (!s.ok()) { + Log(options_.info_log, "Can't delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } + } +} + +// If externalTable is set, then apply recovered transactions +// to that table. This is used for readonly mode. +Status DBImpl::Recover(VersionEdit* edit, MemTable* external_table, + bool error_if_log_file_exist) { + mutex_.AssertHeld(); + + assert(db_lock_ == nullptr); + if (!external_table) { + // We call CreateDirIfMissing() as the directory may already exist (if we + // are reopening a DB), when this happens we don't want creating the + // directory to cause an error. However, we need to check if creating the + // directory fails or else we may get an obscure message about the lock + // file not existing. One real-world example of this occurring is if + // env->CreateDirIfMissing() doesn't create intermediate directories, e.g. 
+ // when dbname_ is "dir/db" but when "dir" doesn't exist. + Status s = env_->CreateDirIfMissing(dbname_); + if (!s.ok()) { + return s; + } + + s = env_->LockFile(LockFileName(dbname_), &db_lock_); + if (!s.ok()) { + return s; + } + + if (!env_->FileExists(CurrentFileName(dbname_))) { + if (options_.create_if_missing) { + // TODO: add merge_operator name check + s = NewDB(); + if (!s.ok()) { + return s; + } + } else { + return Status::InvalidArgument( + dbname_, "does not exist (create_if_missing is false)"); + } + } else { + if (options_.error_if_exists) { + return Status::InvalidArgument( + dbname_, "exists (error_if_exists is true)"); + } + } + // Check for the IDENTITY file and create it if not there + if (!env_->FileExists(IdentityFileName(dbname_))) { + s = SetIdentityFile(env_, dbname_); + if (!s.ok()) { + return s; + } + } + } + + Status s = versions_->Recover(); + if (s.ok()) { + SequenceNumber max_sequence(0); + + // Recover from all newer log files than the ones named in the + // descriptor (new log files may have been added by the previous + // incarnation without registering them in the descriptor). + // + // Note that PrevLogNumber() is no longer used, but we pay + // attention to it in case we are recovering a database + // produced by an older version of rocksdb. 
+ const uint64_t min_log = versions_->LogNumber(); + const uint64_t prev_log = versions_->PrevLogNumber(); + std::vector filenames; + s = env_->GetChildren(options_.wal_dir, &filenames); + if (!s.ok()) { + return s; + } + uint64_t number; + FileType type; + std::vector logs; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) + && type == kLogFile + && ((number >= min_log) || (number == prev_log))) { + logs.push_back(number); + } + } + + if (logs.size() > 0 && error_if_log_file_exist) { + return Status::Corruption("" + "The db was opened in readonly mode with error_if_log_file_exist" + "flag but a log file already exists"); + } + + // Recover in the order in which the logs were generated + std::sort(logs.begin(), logs.end()); + for (size_t i = 0; i < logs.size(); i++) { + s = RecoverLogFile(logs[i], edit, &max_sequence, external_table); + // The previous incarnation may not have written any MANIFEST + // records after allocating this log number. So we manually + // update the file number allocation counter in VersionSet. + versions_->MarkFileNumberUsed(logs[i]); + } + + if (s.ok()) { + if (versions_->LastSequence() < max_sequence) { + versions_->SetLastSequence(max_sequence); + } + SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER, + versions_->LastSequence()); + } + } + + return s; +} + +Status DBImpl::RecoverLogFile(uint64_t log_number, + VersionEdit* edit, + SequenceNumber* max_sequence, + MemTable* external_table) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + const char* fname; + Status* status; // nullptr if options_.paranoid_checks==false or + // options_.skip_log_error_on_recovery==true + virtual void Corruption(size_t bytes, const Status& s) { + Log(info_log, "%s%s: dropping %d bytes; %s", + (this->status == nullptr ? 
"(ignoring error) " : ""), + fname, static_cast(bytes), s.ToString().c_str()); + if (this->status != nullptr && this->status->ok()) *this->status = s; + } + }; + + mutex_.AssertHeld(); + + // Open the log file + std::string fname = LogFileName(options_.wal_dir, log_number); + unique_ptr file; + Status status = env_->NewSequentialFile(fname, &file, storage_options_); + if (!status.ok()) { + MaybeIgnoreError(&status); + return status; + } + + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = options_.info_log.get(); + reporter.fname = fname.c_str(); + reporter.status = (options_.paranoid_checks && + !options_.skip_log_error_on_recovery ? &status : nullptr); + // We intentially make log::Reader do checksumming even if + // paranoid_checks==false so that corruptions cause entire commits + // to be skipped instead of propagating bad information (like overly + // large sequence numbers). + log::Reader reader(std::move(file), &reporter, true/*checksum*/, + 0/*initial_offset*/); + Log(options_.info_log, "Recovering log #%lu", + (unsigned long) log_number); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + MemTable* mem = nullptr; + if (external_table) { + mem = external_table; + } + while (reader.ReadRecord(&record, &scratch) && status.ok()) { + if (record.size() < 12) { + reporter.Corruption( + record.size(), Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); + + if (mem == nullptr) { + mem = new MemTable(internal_comparator_, options_); + mem->Ref(); + } + status = WriteBatchInternal::InsertInto(&batch, mem, &options_); + MaybeIgnoreError(&status); + if (!status.ok()) { + break; + } + const SequenceNumber last_seq = + WriteBatchInternal::Sequence(&batch) + + WriteBatchInternal::Count(&batch) - 1; + if (last_seq > *max_sequence) { + *max_sequence = last_seq; + } + + if (!external_table && + 
mem->ApproximateMemoryUsage() > options_.write_buffer_size) { + status = WriteLevel0TableForRecovery(mem, edit); + if (!status.ok()) { + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + break; + } + delete mem->Unref(); + mem = nullptr; + } + } + + if (status.ok() && mem != nullptr && !external_table) { + status = WriteLevel0TableForRecovery(mem, edit); + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + } + + if (mem != nullptr && !external_table) { + delete mem->Unref(); + } + return status; +} + +Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) { + mutex_.AssertHeld(); + const uint64_t start_micros = env_->NowMicros(); + FileMetaData meta; + meta.number = versions_->NewFileNumber(); + pending_outputs_.insert(meta.number); + Iterator* iter = mem->NewIterator(); + const SequenceNumber newest_snapshot = snapshots_.GetNewest(); + const SequenceNumber earliest_seqno_in_memtable = + mem->GetFirstSequenceNumber(); + Log(options_.info_log, "Level-0 table #%lu: started", + (unsigned long) meta.number); + + Status s; + { + mutex_.Unlock(); + s = BuildTable(dbname_, env_, options_, storage_options_, + table_cache_.get(), iter, &meta, + user_comparator(), newest_snapshot, + earliest_seqno_in_memtable, + GetCompressionFlush(options_)); + LogFlush(options_.info_log); + mutex_.Lock(); + } + + Log(options_.info_log, "Level-0 table #%lu: %lu bytes %s", + (unsigned long) meta.number, + (unsigned long) meta.file_size, + s.ToString().c_str()); + delete iter; + + pending_outputs_.erase(meta.number); + + // Note that if file_size is zero, the file has been deleted and + // should not be added to the manifest. 
+ int level = 0; + if (s.ok() && meta.file_size > 0) { + edit->AddFile(level, meta.number, meta.file_size, + meta.smallest, meta.largest, + meta.smallest_seqno, meta.largest_seqno); + } + + CompactionStats stats; + stats.micros = env_->NowMicros() - start_micros; + stats.bytes_written = meta.file_size; + stats.files_out_levelnp1 = 1; + stats_[level].Add(stats); + RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size); + return s; +} + + +Status DBImpl::WriteLevel0Table(std::vector &mems, VersionEdit* edit, + uint64_t* filenumber) { + mutex_.AssertHeld(); + const uint64_t start_micros = env_->NowMicros(); + FileMetaData meta; + meta.number = versions_->NewFileNumber(); + *filenumber = meta.number; + pending_outputs_.insert(meta.number); + + const SequenceNumber newest_snapshot = snapshots_.GetNewest(); + const SequenceNumber earliest_seqno_in_memtable = + mems[0]->GetFirstSequenceNumber(); + Version* base = versions_->current(); + base->Ref(); // it is likely that we do not need this reference + Status s; + { + mutex_.Unlock(); + std::vector list; + for (MemTable* m : mems) { + Log(options_.info_log, + "Flushing memtable with log file: %lu\n", + (unsigned long)m->GetLogNumber()); + list.push_back(m->NewIterator()); + } + Iterator* iter = NewMergingIterator(&internal_comparator_, &list[0], + list.size()); + Log(options_.info_log, + "Level-0 flush table #%lu: started", + (unsigned long)meta.number); + + s = BuildTable(dbname_, env_, options_, storage_options_, + table_cache_.get(), iter, &meta, + user_comparator(), newest_snapshot, + earliest_seqno_in_memtable, GetCompressionFlush(options_)); + LogFlush(options_.info_log); + delete iter; + Log(options_.info_log, "Level-0 flush table #%lu: %lu bytes %s", + (unsigned long) meta.number, + (unsigned long) meta.file_size, + s.ToString().c_str()); + mutex_.Lock(); + } + base->Unref(); + + + // re-acquire the most current version + base = versions_->current(); + + // There could be multiple threads 
writing to its own level-0 file. + // The pending_outputs cannot be cleared here, otherwise this newly + // created file might not be considered as a live-file by another + // compaction thread that is concurrently deleting obselete files. + // The pending_outputs can be cleared only after the new version is + // committed so that other threads can recognize this file as a + // valid one. + // pending_outputs_.erase(meta.number); + + // Note that if file_size is zero, the file has been deleted and + // should not be added to the manifest. + int level = 0; + if (s.ok() && meta.file_size > 0) { + const Slice min_user_key = meta.smallest.user_key(); + const Slice max_user_key = meta.largest.user_key(); + // if we have more than 1 background thread, then we cannot + // insert files directly into higher levels because some other + // threads could be concurrently producing compacted files for + // that key range. + if (base != nullptr && options_.max_background_compactions <= 1 && + options_.compaction_style == kCompactionStyleLevel) { + level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); + } + edit->AddFile(level, meta.number, meta.file_size, + meta.smallest, meta.largest, + meta.smallest_seqno, meta.largest_seqno); + } + + CompactionStats stats; + stats.micros = env_->NowMicros() - start_micros; + stats.bytes_written = meta.file_size; + stats_[level].Add(stats); + RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size); + return s; +} + +Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, + DeletionState& deletion_state) { + mutex_.AssertHeld(); + assert(imm_.size() != 0); + + if (!imm_.IsFlushPending(options_.min_write_buffer_number_to_merge)) { + Log(options_.info_log, "FlushMemTableToOutputFile already in progress"); + Status s = Status::IOError("FlushMemTableToOutputFile already in progress"); + return s; + } + + // Save the contents of the earliest memtable as a new Table + uint64_t file_number; + std::vector mems; + 
imm_.PickMemtablesToFlush(&mems); + if (mems.empty()) { + Log(options_.info_log, "Nothing in memstore to flush"); + Status s = Status::IOError("Nothing in memstore to flush"); + return s; + } + + // record the logfile_number_ before we release the mutex + // entries mems are (implicitly) sorted in ascending order by their created + // time. We will use the first memtable's `edit` to keep the meta info for + // this flush. + MemTable* m = mems[0]; + VersionEdit* edit = m->GetEdits(); + edit->SetPrevLogNumber(0); + // SetLogNumber(log_num) indicates logs with number smaller than log_num + // will no longer be picked up for recovery. + edit->SetLogNumber( + mems.back()->GetNextLogNumber() + ); + + std::vector logs_to_delete; + for (auto mem : mems) { + logs_to_delete.push_back(mem->GetLogNumber()); + } + + // This will release and re-acquire the mutex. + Status s = WriteLevel0Table(mems, edit, &file_number); + + if (s.ok() && shutting_down_.Acquire_Load()) { + s = Status::IOError( + "Database shutdown started during memtable compaction" + ); + } + + // Replace immutable memtable with the generated Table + s = imm_.InstallMemtableFlushResults( + mems, versions_.get(), s, &mutex_, options_.info_log.get(), + file_number, pending_outputs_, &deletion_state.memtables_to_free); + + if (s.ok()) { + InstallSuperVersion(deletion_state); + if (madeProgress) { + *madeProgress = 1; + } + + MaybeScheduleLogDBDeployStats(); + + if (disable_delete_obsolete_files_ == 0) { + // add to deletion state + deletion_state.log_delete_files.insert( + deletion_state.log_delete_files.end(), + logs_to_delete.begin(), + logs_to_delete.end()); + } + } + return s; +} + +void DBImpl::CompactRange(const Slice* begin, + const Slice* end, + bool reduce_level, + int target_level) { + FlushMemTable(FlushOptions()); + int max_level_with_files = 1; + { + MutexLock l(&mutex_); + Version* base = versions_->current(); + for (int level = 1; level < NumberLevels(); level++) { + if (base->OverlapInLevel(level, 
begin, end)) { + max_level_with_files = level; + } + } + } + for (int level = 0; level <= max_level_with_files; level++) { + // in case the compaction is unversal or if we're compacting the + // bottom-most level, the output level will be the same as input one + if (options_.compaction_style == kCompactionStyleUniversal || + level == max_level_with_files) { + RunManualCompaction(level, level, begin, end); + } else { + RunManualCompaction(level, level + 1, begin, end); + } + } + + if (reduce_level) { + ReFitLevel(max_level_with_files, target_level); + } + LogFlush(options_.info_log); +} + +// return the same level if it cannot be moved +int DBImpl::FindMinimumEmptyLevelFitting(int level) { + mutex_.AssertHeld(); + Version* current = versions_->current(); + int minimum_level = level; + for (int i = level - 1; i > 0; --i) { + // stop if level i is not empty + if (current->NumLevelFiles(i) > 0) break; + // stop if level i is too small (cannot fit the level files) + if (versions_->MaxBytesForLevel(i) < current->NumLevelBytes(level)) break; + + minimum_level = i; + } + return minimum_level; +} + +void DBImpl::ReFitLevel(int level, int target_level) { + assert(level < NumberLevels()); + + SuperVersion* superversion_to_free = nullptr; + SuperVersion* new_superversion = + new SuperVersion(options_.max_write_buffer_number); + + mutex_.Lock(); + + // only allow one thread refitting + if (refitting_level_) { + mutex_.Unlock(); + Log(options_.info_log, "ReFitLevel: another thread is refitting"); + delete new_superversion; + return; + } + refitting_level_ = true; + + // wait for all background threads to stop + bg_work_gate_closed_ = true; + while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_) { + Log(options_.info_log, + "RefitLevel: waiting for background threads to stop: %d %d", + bg_compaction_scheduled_, bg_flush_scheduled_); + bg_cv_.Wait(); + } + + // move to a smaller level + int to_level = target_level; + if (target_level < 0) { + to_level = 
FindMinimumEmptyLevelFitting(level); + } + + assert(to_level <= level); + + if (to_level < level) { + Log(options_.info_log, "Before refitting:\n%s", + versions_->current()->DebugString().data()); + + VersionEdit edit; + for (const auto& f : versions_->current()->files_[level]) { + edit.DeleteFile(level, f->number); + edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest, + f->smallest_seqno, f->largest_seqno); + } + Log(options_.info_log, "Apply version edit:\n%s", + edit.DebugString().data()); + + auto status = versions_->LogAndApply(&edit, &mutex_); + superversion_to_free = InstallSuperVersion(new_superversion); + new_superversion = nullptr; + + Log(options_.info_log, "LogAndApply: %s\n", status.ToString().data()); + + if (status.ok()) { + Log(options_.info_log, "After refitting:\n%s", + versions_->current()->DebugString().data()); + } + } + + refitting_level_ = false; + bg_work_gate_closed_ = false; + + mutex_.Unlock(); + delete superversion_to_free; + delete new_superversion; +} + +int DBImpl::NumberLevels() { + return options_.num_levels; +} + +int DBImpl::MaxMemCompactionLevel() { + return options_.max_mem_compaction_level; +} + +int DBImpl::Level0StopWriteTrigger() { + return options_.level0_stop_writes_trigger; +} + +Status DBImpl::Flush(const FlushOptions& options) { + Status status = FlushMemTable(options); + return status; +} + +SequenceNumber DBImpl::GetLatestSequenceNumber() const { + return versions_->LastSequence(); +} + +Status DBImpl::GetUpdatesSince(SequenceNumber seq, + unique_ptr* iter) { + + RecordTick(options_.statistics.get(), GET_UPDATES_SINCE_CALLS); + if (seq > versions_->LastSequence()) { + return Status::IOError("Requested sequence not yet written in the db"); + } + // Get all sorted Wal Files. + // Do binary search and open files and find the seq number. 
+ + std::unique_ptr wal_files(new VectorLogPtr); + Status s = GetSortedWalFiles(*wal_files); + if (!s.ok()) { + return s; + } + + s = RetainProbableWalFiles(*wal_files, seq); + if (!s.ok()) { + return s; + } + iter->reset( + new TransactionLogIteratorImpl(options_.wal_dir, + &options_, + storage_options_, + seq, + std::move(wal_files), + this)); + return (*iter)->status(); +} + +Status DBImpl::RetainProbableWalFiles(VectorLogPtr& all_logs, + const SequenceNumber target) { + long start = 0; // signed to avoid overflow when target is < first file. + long end = static_cast(all_logs.size()) - 1; + // Binary Search. avoid opening all files. + while (end >= start) { + long mid = start + (end - start) / 2; // Avoid overflow. + SequenceNumber current_seq_num = all_logs.at(mid)->StartSequence(); + if (current_seq_num == target) { + end = mid; + break; + } else if (current_seq_num < target) { + start = mid + 1; + } else { + end = mid - 1; + } + } + size_t start_index = std::max(0l, end); // end could be -ve. + // The last wal file is always included + all_logs.erase(all_logs.begin(), all_logs.begin() + start_index); + return Status::OK(); +} + +bool DBImpl::CheckWalFileExistsAndEmpty(const WalFileType type, + const uint64_t number) { + const std::string fname = (type == kAliveLogFile) ? + LogFileName(options_.wal_dir, number) : + ArchivedLogFileName(options_.wal_dir, number); + uint64_t file_size; + Status s = env_->GetFileSize(fname, &file_size); + return (s.ok() && (file_size == 0)); +} + +Status DBImpl::ReadFirstRecord(const WalFileType type, const uint64_t number, + WriteBatch* const result) { + + if (type == kAliveLogFile) { + std::string fname = LogFileName(options_.wal_dir, number); + Status status = ReadFirstLine(fname, result); + if (!status.ok()) { + // check if the file got moved to archive. 
+ std::string archived_file = + ArchivedLogFileName(options_.wal_dir, number); + Status s = ReadFirstLine(archived_file, result); + if (!s.ok()) { + return Status::IOError("Log File has been deleted: " + archived_file); + } + } + return Status::OK(); + } else if (type == kArchivedLogFile) { + std::string fname = ArchivedLogFileName(options_.wal_dir, number); + Status status = ReadFirstLine(fname, result); + return status; + } + return Status::NotSupported("File Type Not Known: " + std::to_string(type)); +} + +Status DBImpl::ReadFirstLine(const std::string& fname, + WriteBatch* const batch) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + const char* fname; + Status* status; // nullptr if options_.paranoid_checks==false + virtual void Corruption(size_t bytes, const Status& s) { + Log(info_log, "%s%s: dropping %d bytes; %s", + (this->status == nullptr ? "(ignoring error) " : ""), + fname, static_cast(bytes), s.ToString().c_str()); + if (this->status != nullptr && this->status->ok()) *this->status = s; + } + }; + + unique_ptr file; + Status status = env_->NewSequentialFile(fname, &file, storage_options_); + + if (!status.ok()) { + return status; + } + + + LogReporter reporter; + reporter.env = env_; + reporter.info_log = options_.info_log.get(); + reporter.fname = fname.c_str(); + reporter.status = (options_.paranoid_checks ? &status : nullptr); + log::Reader reader(std::move(file), &reporter, true/*checksum*/, + 0/*initial_offset*/); + std::string scratch; + Slice record; + + if (reader.ReadRecord(&record, &scratch) && status.ok()) { + if (record.size() < 12) { + reporter.Corruption( + record.size(), Status::Corruption("log record too small")); + return Status::IOError("Corruption noted"); + // TODO read record's till the first no corrupt entry? 
+ } + WriteBatchInternal::SetContents(batch, record); + return Status::OK(); + } + return Status::IOError("Error reading from file " + fname); +} + +struct CompareLogByPointer { + bool operator() (const unique_ptr& a, + const unique_ptr& b) { + LogFileImpl* a_impl = dynamic_cast(a.get()); + LogFileImpl* b_impl = dynamic_cast(b.get()); + return *a_impl < *b_impl; + } +}; + +Status DBImpl::AppendSortedWalsOfType(const std::string& path, + VectorLogPtr& log_files, WalFileType log_type) { + std::vector all_files; + const Status status = env_->GetChildren(path, &all_files); + if (!status.ok()) { + return status; + } + log_files.reserve(log_files.size() + all_files.size()); + VectorLogPtr::iterator pos_start; + if (!log_files.empty()) { + pos_start = log_files.end() - 1; + } else { + pos_start = log_files.begin(); + } + for (const auto& f : all_files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kLogFile){ + + WriteBatch batch; + Status s = ReadFirstRecord(log_type, number, &batch); + if (!s.ok()) { + if (CheckWalFileExistsAndEmpty(log_type, number)) { + continue; + } + return s; + } + + uint64_t size_bytes; + s = env_->GetFileSize(LogFileName(path, number), &size_bytes); + if (!s.ok()) { + return s; + } + + log_files.push_back(std::move(unique_ptr(new LogFileImpl( + number, log_type, WriteBatchInternal::Sequence(&batch), size_bytes)))); + } + } + CompareLogByPointer compare_log_files; + std::sort(pos_start, log_files.end(), compare_log_files); + return status; +} + +void DBImpl::RunManualCompaction(int input_level, + int output_level, + const Slice* begin, + const Slice* end) { + assert(input_level >= 0); + + InternalKey begin_storage, end_storage; + + ManualCompaction manual; + manual.input_level = input_level; + manual.output_level = output_level; + manual.done = false; + manual.in_progress = false; + // For universal compaction, we enforce every manual compaction to compact + // all files. 
+ if (begin == nullptr || + options_.compaction_style == kCompactionStyleUniversal) { + manual.begin = nullptr; + } else { + begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek); + manual.begin = &begin_storage; + } + if (end == nullptr || + options_.compaction_style == kCompactionStyleUniversal) { + manual.end = nullptr; + } else { + end_storage = InternalKey(*end, 0, static_cast(0)); + manual.end = &end_storage; + } + + MutexLock l(&mutex_); + + // When a manual compaction arrives, temporarily disable scheduling of + // non-manual compactions and wait until the number of scheduled compaction + // jobs drops to zero. This is needed to ensure that this manual compaction + // can compact any range of keys/files. + // + // bg_manual_only_ is non-zero when at least one thread is inside + // RunManualCompaction(), i.e. during that time no other compaction will + // get scheduled (see MaybeScheduleFlushOrCompaction). + // + // Note that the following loop doesn't stop more that one thread calling + // RunManualCompaction() from getting to the second while loop below. + // However, only one of them will actually schedule compaction, while + // others will wait on a condition variable until it completes. 
+ + ++bg_manual_only_; + while (bg_compaction_scheduled_ > 0) { + Log(options_.info_log, + "Manual compaction waiting for all other scheduled background " + "compactions to finish"); + bg_cv_.Wait(); + } + + Log(options_.info_log, "Manual compaction starting"); + + while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) { + assert(bg_manual_only_ > 0); + if (manual_compaction_ != nullptr) { + // Running either this or some other manual compaction + bg_cv_.Wait(); + } else { + manual_compaction_ = &manual; + MaybeScheduleFlushOrCompaction(); + } + } + + assert(!manual.in_progress); + assert(bg_manual_only_ > 0); + --bg_manual_only_; +} + +void DBImpl::TEST_CompactRange(int level, + const Slice* begin, + const Slice* end) { + int output_level = (options_.compaction_style == kCompactionStyleUniversal) + ? level + : level + 1; + RunManualCompaction(level, output_level, begin, end); +} + +Status DBImpl::FlushMemTable(const FlushOptions& options) { + // nullptr batch means just wait for earlier writes to be done + Status s = Write(WriteOptions(), nullptr); + if (s.ok() && options.wait) { + // Wait until the compaction completes + s = WaitForFlushMemTable(); + } + return s; +} + +Status DBImpl::WaitForFlushMemTable() { + Status s; + // Wait until the compaction completes + MutexLock l(&mutex_); + while (imm_.size() > 0 && bg_error_.ok()) { + bg_cv_.Wait(); + } + if (imm_.size() != 0) { + s = bg_error_; + } + return s; +} + +Status DBImpl::TEST_FlushMemTable() { + return FlushMemTable(FlushOptions()); +} + +Status DBImpl::TEST_WaitForFlushMemTable() { + return WaitForFlushMemTable(); +} + +Status DBImpl::TEST_WaitForCompact() { + // Wait until the compaction completes + + // TODO: a bug here. This function actually does not necessarily + // wait for compact. It actually waits for scheduled compaction + // OR flush to finish. 
+ + MutexLock l(&mutex_); + while ((bg_compaction_scheduled_ || bg_flush_scheduled_) && + bg_error_.ok()) { + bg_cv_.Wait(); + } + return bg_error_; +} + +void DBImpl::MaybeScheduleFlushOrCompaction() { + mutex_.AssertHeld(); + if (bg_work_gate_closed_) { + // gate closed for backgrond work + } else if (shutting_down_.Acquire_Load()) { + // DB is being deleted; no more background compactions + } else { + bool is_flush_pending = + imm_.IsFlushPending(options_.min_write_buffer_number_to_merge); + if (is_flush_pending && + (bg_flush_scheduled_ < options_.max_background_flushes)) { + // memtable flush needed + bg_flush_scheduled_++; + env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH); + } + + // Schedule BGWorkCompaction if there's a compaction pending (or a memtable + // flush, but the HIGH pool is not enabled). Do it only if + // max_background_compactions hasn't been reached and, in case + // bg_manual_only_ > 0, if it's a manual compaction. + if ((manual_compaction_ || + versions_->NeedsCompaction() || + (is_flush_pending && (options_.max_background_flushes <= 0))) && + bg_compaction_scheduled_ < options_.max_background_compactions && + (!bg_manual_only_ || manual_compaction_)) { + + bg_compaction_scheduled_++; + env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); + } + } +} + +void DBImpl::BGWorkFlush(void* db) { + reinterpret_cast(db)->BackgroundCallFlush(); +} + +void DBImpl::BGWorkCompaction(void* db) { + reinterpret_cast(db)->BackgroundCallCompaction(); +} + +Status DBImpl::BackgroundFlush(bool* madeProgress, + DeletionState& deletion_state) { + Status stat; + while (stat.ok() && + imm_.IsFlushPending(options_.min_write_buffer_number_to_merge)) { + Log(options_.info_log, + "BackgroundCallFlush doing FlushMemTableToOutputFile, flush slots available %d", + options_.max_background_flushes - bg_flush_scheduled_); + stat = FlushMemTableToOutputFile(madeProgress, deletion_state); + } + return stat; +} + +void 
DBImpl::BackgroundCallFlush() { + bool madeProgress = false; + DeletionState deletion_state(options_.max_write_buffer_number, true); + assert(bg_flush_scheduled_); + MutexLock l(&mutex_); + + Status s; + if (!shutting_down_.Acquire_Load()) { + s = BackgroundFlush(&madeProgress, deletion_state); + if (!s.ok()) { + // Wait a little bit before retrying background compaction in + // case this is an environmental problem and we do not want to + // chew up resources for failed compactions for the duration of + // the problem. + bg_cv_.SignalAll(); // In case a waiter can proceed despite the error + Log(options_.info_log, "Waiting after background flush error: %s", + s.ToString().c_str()); + mutex_.Unlock(); + LogFlush(options_.info_log); + env_->SleepForMicroseconds(1000000); + mutex_.Lock(); + } + } + + // If !s.ok(), this means that Flush failed. In that case, we want + // to delete all obsolete files and we force FindObsoleteFiles() + FindObsoleteFiles(deletion_state, !s.ok()); + // delete unnecessary files if any, this is done outside the mutex + if (deletion_state.HaveSomethingToDelete()) { + mutex_.Unlock(); + PurgeObsoleteFiles(deletion_state); + mutex_.Lock(); + } + + bg_flush_scheduled_--; + if (madeProgress) { + MaybeScheduleFlushOrCompaction(); + } + bg_cv_.SignalAll(); +} + + +void DBImpl::TEST_PurgeObsoleteteWAL() { + PurgeObsoleteWALFiles(); +} + +uint64_t DBImpl::TEST_GetLevel0TotalSize() { + MutexLock l(&mutex_); + return versions_->current()->NumLevelBytes(0); +} + +void DBImpl::BackgroundCallCompaction() { + bool madeProgress = false; + DeletionState deletion_state(options_.max_write_buffer_number, true); + + MaybeDumpStats(); + + MutexLock l(&mutex_); + // Log(options_.info_log, "XXX BG Thread %llx process new work item", pthread_self()); + assert(bg_compaction_scheduled_); + Status s; + if (!shutting_down_.Acquire_Load()) { + s = BackgroundCompaction(&madeProgress, deletion_state); + if (!s.ok()) { + // Wait a little bit before retrying background 
compaction in + // case this is an environmental problem and we do not want to + // chew up resources for failed compactions for the duration of + // the problem. + bg_cv_.SignalAll(); // In case a waiter can proceed despite the error + Log(options_.info_log, "Waiting after background compaction error: %s", + s.ToString().c_str()); + mutex_.Unlock(); + LogFlush(options_.info_log); + env_->SleepForMicroseconds(1000000); + mutex_.Lock(); + } + } + + // If !s.ok(), this means that Compaction failed. In that case, we want + // to delete all obsolete files we might have created and we force + // FindObsoleteFiles(). This is because deletion_state does not catch + // all created files if compaction failed. + FindObsoleteFiles(deletion_state, !s.ok()); + + // delete unnecessary files if any, this is done outside the mutex + if (deletion_state.HaveSomethingToDelete()) { + mutex_.Unlock(); + PurgeObsoleteFiles(deletion_state); + mutex_.Lock(); + } + + bg_compaction_scheduled_--; + + MaybeScheduleLogDBDeployStats(); + + // Previous compaction may have produced too many files in a level, + // So reschedule another compaction if we made progress in the + // last compaction. 
+ if (madeProgress) { + MaybeScheduleFlushOrCompaction(); + } + bg_cv_.SignalAll(); + +} + +Status DBImpl::BackgroundCompaction(bool* madeProgress, + DeletionState& deletion_state) { + *madeProgress = false; + mutex_.AssertHeld(); + + // TODO: remove memtable flush from formal compaction + while (imm_.IsFlushPending(options_.min_write_buffer_number_to_merge)) { + Log(options_.info_log, + "BackgroundCompaction doing FlushMemTableToOutputFile, compaction slots " + "available %d", + options_.max_background_compactions - bg_compaction_scheduled_); + Status stat = FlushMemTableToOutputFile(madeProgress, deletion_state); + if (!stat.ok()) { + return stat; + } + } + + unique_ptr c; + bool is_manual = (manual_compaction_ != nullptr) && + (manual_compaction_->in_progress == false); + InternalKey manual_end_storage; + InternalKey* manual_end = &manual_end_storage; + if (is_manual) { + ManualCompaction* m = manual_compaction_; + assert(!m->in_progress); + m->in_progress = true; // another thread cannot pick up the same work + c.reset(versions_->CompactRange( + m->input_level, m->output_level, m->begin, m->end, &manual_end)); + if (!c) { + m->done = true; + } + Log(options_.info_log, + "Manual compaction from level-%d to level-%d from %s .. %s; will stop " + "at %s\n", + m->input_level, + m->output_level, + (m->begin ? m->begin->DebugString().c_str() : "(begin)"), + (m->end ? m->end->DebugString().c_str() : "(end)"), + ((m->done || manual_end == nullptr) + ? 
"(end)" + : manual_end->DebugString().c_str())); + } else if (!options_.disable_auto_compactions) { + c.reset(versions_->PickCompaction()); + } + + Status status; + if (!c) { + // Nothing to do + Log(options_.info_log, "Compaction nothing to do"); + } else if (!is_manual && c->IsTrivialMove()) { + // Move file to next level + assert(c->num_input_files(0) == 1); + FileMetaData* f = c->input(0, 0); + c->edit()->DeleteFile(c->level(), f->number); + c->edit()->AddFile(c->level() + 1, f->number, f->file_size, + f->smallest, f->largest, + f->smallest_seqno, f->largest_seqno); + status = versions_->LogAndApply(c->edit(), &mutex_); + InstallSuperVersion(deletion_state); + Version::LevelSummaryStorage tmp; + Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", + static_cast(f->number), c->level() + 1, + static_cast(f->file_size), + status.ToString().c_str(), versions_->current()->LevelSummary(&tmp)); + versions_->ReleaseCompactionFiles(c.get(), status); + *madeProgress = true; + } else { + MaybeScheduleFlushOrCompaction(); // do more compaction work in parallel. + CompactionState* compact = new CompactionState(c.get()); + status = DoCompactionWork(compact, deletion_state); + CleanupCompaction(compact, status); + versions_->ReleaseCompactionFiles(c.get(), status); + c->ReleaseInputs(); + *madeProgress = true; + } + c.reset(); + + if (status.ok()) { + // Done + } else if (shutting_down_.Acquire_Load()) { + // Ignore compaction errors found during shutting down + } else { + Log(options_.info_log, + "Compaction error: %s", status.ToString().c_str()); + if (options_.paranoid_checks && bg_error_.ok()) { + bg_error_ = status; + } + } + + if (is_manual) { + ManualCompaction* m = manual_compaction_; + if (!status.ok()) { + m->done = true; + } + // For universal compaction: + // Because universal compaction always happens at level 0, so one + // compaction will pick up all overlapped files. 
No files will be + // filtered out due to size limit and left for a successive compaction. + // So we can safely conclude the current compaction. + // + // Also note that, if we don't stop here, then the current compaction + // writes a new file back to level 0, which will be used in successive + // compaction. Hence the manual compaction will never finish. + // + // Stop the compaction if manual_end points to nullptr -- this means + // that we compacted the whole range. manual_end should always point + // to nullptr in case of universal compaction + if (manual_end == nullptr) { + m->done = true; + } + if (!m->done) { + // We only compacted part of the requested range. Update *m + // to the range that is left to be compacted. + // Universal compaction should always compact the whole range + assert(options_.compaction_style != kCompactionStyleUniversal); + m->tmp_storage = *manual_end; + m->begin = &m->tmp_storage; + } + m->in_progress = false; // not being processed anymore + manual_compaction_ = nullptr; + } + return status; +} + +void DBImpl::CleanupCompaction(CompactionState* compact, Status status) { + mutex_.AssertHeld(); + if (compact->builder != nullptr) { + // May happen if we get a shutdown call in the middle of compaction + compact->builder->Abandon(); + compact->builder.reset(); + } else { + assert(compact->outfile == nullptr); + } + for (size_t i = 0; i < compact->outputs.size(); i++) { + const CompactionState::Output& out = compact->outputs[i]; + pending_outputs_.erase(out.number); + + // If this file was inserted into the table cache then remove + // them here because this compaction was not committed. + if (!status.ok()) { + table_cache_->Evict(out.number); + } + } + delete compact; +} + +// Allocate the file numbers for the output file. We allocate as +// many output file numbers as there are files in level+1 (at least one) +// Insert them into pending_outputs so that they do not get deleted. 
+void DBImpl::AllocateCompactionOutputFileNumbers(CompactionState* compact) { + mutex_.AssertHeld(); + assert(compact != nullptr); + assert(compact->builder == nullptr); + int filesNeeded = compact->compaction->num_input_files(1); + for (int i = 0; i < std::max(filesNeeded, 1); i++) { + uint64_t file_number = versions_->NewFileNumber(); + pending_outputs_.insert(file_number); + compact->allocated_file_numbers.push_back(file_number); + } +} + +// Frees up unused file number. +void DBImpl::ReleaseCompactionUnusedFileNumbers(CompactionState* compact) { + mutex_.AssertHeld(); + for (const auto file_number : compact->allocated_file_numbers) { + pending_outputs_.erase(file_number); + // Log(options_.info_log, "XXX releasing unused file num %d", file_number); + } +} + +Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { + assert(compact != nullptr); + assert(compact->builder == nullptr); + uint64_t file_number; + // If we have not yet exhausted the pre-allocated file numbers, + // then use the one from the front. Otherwise, we have to acquire + // the heavyweight lock and allocate a new file number. + if (!compact->allocated_file_numbers.empty()) { + file_number = compact->allocated_file_numbers.front(); + compact->allocated_file_numbers.pop_front(); + } else { + mutex_.Lock(); + file_number = versions_->NewFileNumber(); + pending_outputs_.insert(file_number); + mutex_.Unlock(); + } + CompactionState::Output out; + out.number = file_number; + out.smallest.Clear(); + out.largest.Clear(); + out.smallest_seqno = out.largest_seqno = 0; + compact->outputs.push_back(out); + + // Make the output file + std::string fname = TableFileName(dbname_, file_number); + Status s = env_->NewWritableFile(fname, &compact->outfile, storage_options_); + + if (s.ok()) { + // Over-estimate slightly so we don't end up just barely crossing + // the threshold. 
+ compact->outfile->SetPreallocationBlockSize( + 1.1 * versions_->MaxFileSizeForLevel(compact->compaction->output_level())); + + CompressionType compression_type = GetCompressionType( + options_, compact->compaction->output_level(), + compact->compaction->enable_compression()); + + compact->builder.reset( + GetTableBuilder(options_, compact->outfile.get(), compression_type)); + } + LogFlush(options_.info_log); + return s; +} + +Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, + Iterator* input) { + assert(compact != nullptr); + assert(compact->outfile); + assert(compact->builder != nullptr); + + const uint64_t output_number = compact->current_output()->number; + assert(output_number != 0); + + // Check for iterator errors + Status s = input->status(); + const uint64_t current_entries = compact->builder->NumEntries(); + if (s.ok()) { + s = compact->builder->Finish(); + } else { + compact->builder->Abandon(); + } + const uint64_t current_bytes = compact->builder->FileSize(); + compact->current_output()->file_size = current_bytes; + compact->total_bytes += current_bytes; + compact->builder.reset(); + + // Finish and check for file errors + if (s.ok() && !options_.disableDataSync) { + if (options_.use_fsync) { + StopWatch sw(env_, options_.statistics.get(), + COMPACTION_OUTFILE_SYNC_MICROS, false); + s = compact->outfile->Fsync(); + } else { + StopWatch sw(env_, options_.statistics.get(), + COMPACTION_OUTFILE_SYNC_MICROS, false); + s = compact->outfile->Sync(); + } + } + if (s.ok()) { + s = compact->outfile->Close(); + } + compact->outfile.reset(); + + if (s.ok() && current_entries > 0) { + // Verify that the table is usable + Iterator* iter = table_cache_->NewIterator(ReadOptions(), + storage_options_, + output_number, + current_bytes); + s = iter->status(); + delete iter; + if (s.ok()) { + Log(options_.info_log, + "Generated table #%lu: %lu keys, %lu bytes", + (unsigned long) output_number, + (unsigned long) current_entries, + (unsigned long) 
current_bytes); + } + } + return s; +} + + +Status DBImpl::InstallCompactionResults(CompactionState* compact) { + mutex_.AssertHeld(); + + // paranoia: verify that the files that we started with + // still exist in the current version and in the same original level. + // This ensures that a concurrent compaction did not erroneously + // pick the same files to compact. + if (!versions_->VerifyCompactionFileConsistency(compact->compaction)) { + Log(options_.info_log, "Compaction %d@%d + %d@%d files aborted", + compact->compaction->num_input_files(0), + compact->compaction->level(), + compact->compaction->num_input_files(1), + compact->compaction->level() + 1); + return Status::IOError("Compaction input files inconsistent"); + } + + Log(options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes", + compact->compaction->num_input_files(0), + compact->compaction->level(), + compact->compaction->num_input_files(1), + compact->compaction->level() + 1, + static_cast(compact->total_bytes)); + + // Add compaction outputs + compact->compaction->AddInputDeletions(compact->compaction->edit()); + for (size_t i = 0; i < compact->outputs.size(); i++) { + const CompactionState::Output& out = compact->outputs[i]; + compact->compaction->edit()->AddFile( + compact->compaction->output_level(), out.number, out.file_size, + out.smallest, out.largest, out.smallest_seqno, out.largest_seqno); + } + return versions_->LogAndApply(compact->compaction->edit(), &mutex_); +} + +// +// Given a sequence number, return the sequence number of the +// earliest snapshot that this sequence number is visible in. +// The snapshots themselves are arranged in ascending order of +// sequence numbers. +// Employ a sequential search because the total number of +// snapshots are typically small. 
+inline SequenceNumber DBImpl::findEarliestVisibleSnapshot( + SequenceNumber in, std::vector& snapshots, + SequenceNumber* prev_snapshot) { + SequenceNumber prev __attribute__((unused)) = 0; + for (const auto cur : snapshots) { + assert(prev <= cur); + if (cur >= in) { + *prev_snapshot = prev; + return cur; + } + prev = cur; // assignment + assert(prev); + } + Log(options_.info_log, + "Looking for seqid %lu but maxseqid is %lu", + (unsigned long)in, + (unsigned long)snapshots[snapshots.size()-1]); + assert(0); + return 0; +} + +Status DBImpl::DoCompactionWork(CompactionState* compact, + DeletionState& deletion_state) { + assert(compact); + int64_t imm_micros = 0; // Micros spent doing imm_ compactions + Log(options_.info_log, + "Compacting %d@%d + %d@%d files, score %.2f slots available %d", + compact->compaction->num_input_files(0), + compact->compaction->level(), + compact->compaction->num_input_files(1), + compact->compaction->output_level(), + compact->compaction->score(), + options_.max_background_compactions - bg_compaction_scheduled_); + char scratch[256]; + compact->compaction->Summary(scratch, sizeof(scratch)); + Log(options_.info_log, "Compaction start summary: %s\n", scratch); + + assert(versions_->current()->NumLevelFiles(compact->compaction->level()) > 0); + assert(compact->builder == nullptr); + assert(!compact->outfile); + + SequenceNumber visible_at_tip = 0; + SequenceNumber earliest_snapshot; + SequenceNumber latest_snapshot = 0; + snapshots_.getAll(compact->existing_snapshots); + if (compact->existing_snapshots.size() == 0) { + // optimize for fast path if there are no snapshots + visible_at_tip = versions_->LastSequence(); + earliest_snapshot = visible_at_tip; + } else { + latest_snapshot = compact->existing_snapshots.back(); + // Add the current seqno as the 'latest' virtual + // snapshot to the end of this list. 
+ compact->existing_snapshots.push_back(versions_->LastSequence()); + earliest_snapshot = compact->existing_snapshots[0]; + } + + // Is this compaction producing files at the bottommost level? + bool bottommost_level = compact->compaction->BottomMostLevel(); + + // Allocate the output file numbers before we release the lock + AllocateCompactionOutputFileNumbers(compact); + + // Release mutex while we're actually doing the compaction work + mutex_.Unlock(); + + const uint64_t start_micros = env_->NowMicros(); + unique_ptr input(versions_->MakeInputIterator(compact->compaction)); + input->SeekToFirst(); + Status status; + ParsedInternalKey ikey; + std::string current_user_key; + bool has_current_user_key = false; + SequenceNumber last_sequence_for_key __attribute__((unused)) = + kMaxSequenceNumber; + SequenceNumber visible_in_snapshot = kMaxSequenceNumber; + std::string compaction_filter_value; + std::vector delete_key; // for compaction filter + MergeHelper merge(user_comparator(), options_.merge_operator.get(), + options_.info_log.get(), + false /* internal key corruption is expected */); + auto compaction_filter = options_.compaction_filter; + std::unique_ptr compaction_filter_from_factory = nullptr; + if (!compaction_filter) { + auto context = compact->GetFilterContext(); + compaction_filter_from_factory = + options_.compaction_filter_factory->CreateCompactionFilter(context); + compaction_filter = compaction_filter_from_factory.get(); + } + + for (; input->Valid() && !shutting_down_.Acquire_Load(); ) { + // Prioritize immutable compaction work + // TODO: remove memtable flush from normal compaction work + if (imm_.imm_flush_needed.NoBarrier_Load() != nullptr) { + const uint64_t imm_start = env_->NowMicros(); + LogFlush(options_.info_log); + mutex_.Lock(); + if (imm_.IsFlushPending(options_.min_write_buffer_number_to_merge)) { + FlushMemTableToOutputFile(nullptr, deletion_state); + bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary + } + 
mutex_.Unlock(); + imm_micros += (env_->NowMicros() - imm_start); + } + + Slice key = input->key(); + Slice value = input->value(); + + if (compact->compaction->ShouldStopBefore(key) && + compact->builder != nullptr) { + status = FinishCompactionOutputFile(compact, input.get()); + if (!status.ok()) { + break; + } + } + + // Handle key/value, add to state, etc. + bool drop = false; + bool current_entry_is_merging = false; + if (!ParseInternalKey(key, &ikey)) { + // Do not hide error keys + // TODO: error key stays in db forever? Figure out the intention/rationale + // v10 error v8 : we cannot hide v8 even though it's pretty obvious. + current_user_key.clear(); + has_current_user_key = false; + last_sequence_for_key = kMaxSequenceNumber; + visible_in_snapshot = kMaxSequenceNumber; + } else { + if (!has_current_user_key || + user_comparator()->Compare(ikey.user_key, + Slice(current_user_key)) != 0) { + // First occurrence of this user key + current_user_key.assign(ikey.user_key.data(), ikey.user_key.size()); + has_current_user_key = true; + last_sequence_for_key = kMaxSequenceNumber; + visible_in_snapshot = kMaxSequenceNumber; + + // apply the compaction filter to the first occurrence of the user key + if (compaction_filter && + ikey.type == kTypeValue && + (visible_at_tip || ikey.sequence > latest_snapshot)) { + // If the user has specified a compaction filter and the sequence + // number is greater than any external snapshot, then invoke the + // filter. + // If the return value of the compaction filter is true, replace + // the entry with a delete marker. 
+ bool value_changed = false; + compaction_filter_value.clear(); + bool to_delete = + compaction_filter->Filter(compact->compaction->level(), + ikey.user_key, value, + &compaction_filter_value, + &value_changed); + if (to_delete) { + // make a copy of the original key + delete_key.assign(key.data(), key.data() + key.size()); + // convert it to a delete + UpdateInternalKey(&delete_key[0], delete_key.size(), + ikey.sequence, kTypeDeletion); + // anchor the key again + key = Slice(&delete_key[0], delete_key.size()); + // needed because ikey is backed by key + ParseInternalKey(key, &ikey); + // no value associated with delete + value.clear(); + RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_USER); + } else if (value_changed) { + value = compaction_filter_value; + } + } + + } + + // If there are no snapshots, then this kv affect visibility at tip. + // Otherwise, search though all existing snapshots to find + // the earlist snapshot that is affected by this kv. + SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot + SequenceNumber visible = visible_at_tip ? + visible_at_tip : + findEarliestVisibleSnapshot(ikey.sequence, + compact->existing_snapshots, + &prev_snapshot); + + if (visible_in_snapshot == visible) { + // If the earliest snapshot is which this key is visible in + // is the same as the visibily of a previous instance of the + // same key, then this kv is not visible in any snapshot. + // Hidden by an newer entry for same user key + // TODO: why not > ? 
+ assert(last_sequence_for_key >= ikey.sequence); + drop = true; // (A) + RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_NEWER_ENTRY); + } else if (ikey.type == kTypeDeletion && + ikey.sequence <= earliest_snapshot && + compact->compaction->IsBaseLevelForKey(ikey.user_key)) { + // For this user key: + // (1) there is no data in higher levels + // (2) data in lower levels will have larger sequence numbers + // (3) data in layers that are being compacted here and have + // smaller sequence numbers will be dropped in the next + // few iterations of this loop (by rule (A) above). + // Therefore this deletion marker is obsolete and can be dropped. + drop = true; + RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_OBSOLETE); + } else if (ikey.type == kTypeMerge) { + // We know the merge type entry is not hidden, otherwise we would + // have hit (A) + // We encapsulate the merge related state machine in a different + // object to minimize change to the existing flow. Turn out this + // logic could also be nicely re-used for memtable flush purge + // optimization in BuildTable. + merge.MergeUntil(input.get(), prev_snapshot, bottommost_level, + options_.statistics.get()); + current_entry_is_merging = true; + if (merge.IsSuccess()) { + // Successfully found Put/Delete/(end-of-key-range) while merging + // Get the merge result + key = merge.key(); + ParseInternalKey(key, &ikey); + value = merge.value(); + } else { + // Did not find a Put/Delete/(end-of-key-range) while merging + // We now have some stack of merge operands to write out. + // NOTE: key,value, and ikey are now referring to old entries. + // These will be correctly set below. 
+ assert(!merge.keys().empty()); + assert(merge.keys().size() == merge.values().size()); + + // Hack to make sure last_sequence_for_key is correct + ParseInternalKey(merge.keys().front(), &ikey); + } + } + + last_sequence_for_key = ikey.sequence; + visible_in_snapshot = visible; + } +#if 0 + Log(options_.info_log, + " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, " + "%d smallest_snapshot: %d level: %d bottommost %d", + ikey.user_key.ToString().c_str(), + (int)ikey.sequence, ikey.type, kTypeValue, drop, + compact->compaction->IsBaseLevelForKey(ikey.user_key), + (int)last_sequence_for_key, (int)earliest_snapshot, + compact->compaction->level(), bottommost_level); +#endif + + if (!drop) { + // We may write a single key (e.g.: for Put/Delete or successful merge). + // Or we may instead have to write a sequence/list of keys. + // We have to write a sequence iff we have an unsuccessful merge + bool has_merge_list = current_entry_is_merging && !merge.IsSuccess(); + const std::deque* keys = nullptr; + const std::deque* values = nullptr; + std::deque::const_reverse_iterator key_iter; + std::deque::const_reverse_iterator value_iter; + if (has_merge_list) { + keys = &merge.keys(); + values = &merge.values(); + key_iter = keys->rbegin(); // The back (*rbegin()) is the first key + value_iter = values->rbegin(); + + key = Slice(*key_iter); + value = Slice(*value_iter); + } + + // If we have a list of keys to write, traverse the list. + // If we have a single key to write, simply write that key. + while (true) { + // Invariant: key,value,ikey will always be the next entry to write + char* kptr = (char*)key.data(); + std::string kstr; + + // Zeroing out the sequence number leads to better compression. + // If this is the bottommost level (no files in lower levels) + // and the earliest snapshot is larger than this seqno + // then we can squash the seqno to zero. 
+ if (options_.compaction_style == kCompactionStyleLevel && + bottommost_level && ikey.sequence < earliest_snapshot && + ikey.type != kTypeMerge) { + assert(ikey.type != kTypeDeletion); + // make a copy because updating in place would cause problems + // with the priority queue that is managing the input key iterator + kstr.assign(key.data(), key.size()); + kptr = (char *)kstr.c_str(); + UpdateInternalKey(kptr, key.size(), (uint64_t)0, ikey.type); + } + + Slice newkey(kptr, key.size()); + assert((key.clear(), 1)); // we do not need 'key' anymore + + // Open output file if necessary + if (compact->builder == nullptr) { + status = OpenCompactionOutputFile(compact); + if (!status.ok()) { + break; + } + } + + SequenceNumber seqno = GetInternalKeySeqno(newkey); + if (compact->builder->NumEntries() == 0) { + compact->current_output()->smallest.DecodeFrom(newkey); + compact->current_output()->smallest_seqno = seqno; + } else { + compact->current_output()->smallest_seqno = + std::min(compact->current_output()->smallest_seqno, seqno); + } + compact->current_output()->largest.DecodeFrom(newkey); + compact->builder->Add(newkey, value); + compact->current_output()->largest_seqno = + std::max(compact->current_output()->largest_seqno, seqno); + + // Close output file if it is big enough + if (compact->builder->FileSize() >= + compact->compaction->MaxOutputFileSize()) { + status = FinishCompactionOutputFile(compact, input.get()); + if (!status.ok()) { + break; + } + } + + // If we have a list of entries, move to next element + // If we only had one entry, then break the loop. + if (has_merge_list) { + ++key_iter; + ++value_iter; + + // If at end of list + if (key_iter == keys->rend() || value_iter == values->rend()) { + // Sanity Check: if one ends, then both end + assert(key_iter == keys->rend() && value_iter == values->rend()); + break; + } + + // Otherwise not at end of list. Update key, value, and ikey. 
+ key = Slice(*key_iter); + value = Slice(*value_iter); + ParseInternalKey(key, &ikey); + + } else{ + // Only had one item to begin with (Put/Delete) + break; + } + } + } + + // MergeUntil has moved input to the next entry + if (!current_entry_is_merging) { + input->Next(); + } + } + + if (status.ok() && shutting_down_.Acquire_Load()) { + status = Status::IOError("Database shutdown started during compaction"); + } + if (status.ok() && compact->builder != nullptr) { + status = FinishCompactionOutputFile(compact, input.get()); + } + if (status.ok()) { + status = input->status(); + } + input.reset(); + + CompactionStats stats; + stats.micros = env_->NowMicros() - start_micros - imm_micros; + if (options_.statistics.get()) { + options_.statistics.get()->measureTime(COMPACTION_TIME, stats.micros); + } + stats.files_in_leveln = compact->compaction->num_input_files(0); + stats.files_in_levelnp1 = compact->compaction->num_input_files(1); + + int num_output_files = compact->outputs.size(); + if (compact->builder != nullptr) { + // An error occurred so ignore the last output. 
+ assert(num_output_files > 0); + --num_output_files; + } + stats.files_out_levelnp1 = num_output_files; + + for (int i = 0; i < compact->compaction->num_input_files(0); i++) { + stats.bytes_readn += compact->compaction->input(0, i)->file_size; + RecordTick(options_.statistics.get(), COMPACT_READ_BYTES, + compact->compaction->input(0, i)->file_size); + } + + for (int i = 0; i < compact->compaction->num_input_files(1); i++) { + stats.bytes_readnp1 += compact->compaction->input(1, i)->file_size; + RecordTick(options_.statistics.get(), COMPACT_READ_BYTES, + compact->compaction->input(1, i)->file_size); + } + + for (int i = 0; i < num_output_files; i++) { + stats.bytes_written += compact->outputs[i].file_size; + RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, + compact->outputs[i].file_size); + } + + LogFlush(options_.info_log); + mutex_.Lock(); + stats_[compact->compaction->output_level()].Add(stats); + + // if there were any unused file number (mostly in case of + // compaction error), free up the entry from pending_putputs + ReleaseCompactionUnusedFileNumbers(compact); + + if (status.ok()) { + status = InstallCompactionResults(compact); + InstallSuperVersion(deletion_state); + } + Version::LevelSummaryStorage tmp; + Log(options_.info_log, + "compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) " + "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " + "write-amplify(%.1f) %s\n", + versions_->current()->LevelSummary(&tmp), + (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) / + (double)stats.micros, + compact->compaction->output_level(), stats.files_in_leveln, + stats.files_in_levelnp1, stats.files_out_levelnp1, + stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0, + stats.bytes_written / 1048576.0, + (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / + (double)stats.bytes_readn, + stats.bytes_written / (double)stats.bytes_readn, + status.ToString().c_str()); + + return status; +} + +namespace { 
+struct IterState { + port::Mutex* mu; + Version* version; + std::vector mem; // includes both mem_ and imm_ + DBImpl *db; +}; + +static void CleanupIteratorState(void* arg1, void* arg2) { + IterState* state = reinterpret_cast(arg1); + DBImpl::DeletionState deletion_state(state->db->GetOptions(). + max_write_buffer_number); + state->mu->Lock(); + for (unsigned int i = 0; i < state->mem.size(); i++) { + MemTable* m = state->mem[i]->Unref(); + if (m != nullptr) { + deletion_state.memtables_to_free.push_back(m); + } + } + state->version->Unref(); + // fast path FindObsoleteFiles + state->db->FindObsoleteFiles(deletion_state, false, true); + state->mu->Unlock(); + state->db->PurgeObsoleteFiles(deletion_state); + delete state; +} +} // namespace + +Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, + SequenceNumber* latest_snapshot) { + IterState* cleanup = new IterState; + MemTable* mutable_mem; + std::vector immutables; + Version* version; + + // Collect together all needed child iterators for mem + mutex_.Lock(); + *latest_snapshot = versions_->LastSequence(); + mem_->Ref(); + mutable_mem = mem_; + // Collect together all needed child iterators for imm_ + imm_.GetMemTables(&immutables); + for (unsigned int i = 0; i < immutables.size(); i++) { + immutables[i]->Ref(); + } + // Collect iterators for files in L0 - Ln + versions_->current()->Ref(); + version = versions_->current(); + mutex_.Unlock(); + + std::vector list; + list.push_back(mutable_mem->NewIterator(options)); + cleanup->mem.push_back(mutable_mem); + for (MemTable* m : immutables) { + list.push_back(m->NewIterator(options)); + cleanup->mem.push_back(m); + } + version->AddIterators(options, storage_options_, &list); + Iterator* internal_iter = + NewMergingIterator(&internal_comparator_, &list[0], list.size()); + cleanup->version = version; + cleanup->mu = &mutex_; + cleanup->db = this; + internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); + + return internal_iter; +} + 
+// Test hook: internal iterator with default read options; the snapshot
+// sequence number reported by NewInternalIterator() is discarded.
+Iterator* DBImpl::TEST_NewInternalIterator() {
+  SequenceNumber ignored;
+  return NewInternalIterator(ReadOptions(), &ignored);
+}
+
+// Test hook: MaxNextLevelOverlappingBytes() of the current version, read
+// while holding the mutex.
+int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
+  MutexLock l(&mutex_);
+  return versions_->current()->MaxNextLevelOverlappingBytes();
+}
+
+// Point lookup; forwards to GetImpl().
+Status DBImpl::Get(const ReadOptions& options,
+                   const Slice& key,
+                   std::string* value) {
+  return GetImpl(options, key, value);
+}
+
+// DeletionState gets created and destructed outside of the lock -- we
+// use this conveniently to:
+// * malloc one SuperVersion() outside of the lock -- new_superversion
+// * delete one SuperVersion() outside of the lock -- superversion_to_free
+//
+// However, if InstallSuperVersion() gets called twice with the same
+// deletion_state, we can't reuse the SuperVersion() that got malloced because
+// first call already used it. In that rare case, we take a hit and create a
+// new SuperVersion() inside of the mutex. We do a similar thing
+// for superversion_to_free
+void DBImpl::InstallSuperVersion(DeletionState& deletion_state) {
+  // if new_superversion == nullptr, it means somebody already used it
+  SuperVersion* new_superversion =
+    (deletion_state.new_superversion != nullptr) ?
+    deletion_state.new_superversion : new SuperVersion();
+  SuperVersion* old_superversion = InstallSuperVersion(new_superversion);
+  deletion_state.new_superversion = nullptr;
+  if (deletion_state.superversion_to_free != nullptr) {
+    // somebody already put it there
+    delete old_superversion;
+  } else {
+    deletion_state.superversion_to_free = old_superversion;
+  }
+}
+
+// Makes new_superversion the current super version, initialised from the
+// current memtable, immutable memtables and version.
+// REQUIRES: mutex_ is held (asserted below).
+// Returns the previous super version if this call dropped its last
+// reference (the caller is expected to delete it), otherwise nullptr.
+DBImpl::SuperVersion* DBImpl::InstallSuperVersion(
+    SuperVersion* new_superversion) {
+  mutex_.AssertHeld();
+  new_superversion->Init(mem_, imm_, versions_->current());
+  SuperVersion* old_superversion = super_version_;
+  super_version_ = new_superversion;
+  if (old_superversion != nullptr && old_superversion->Unref()) {
+    old_superversion->Cleanup();
+    return old_superversion; // will let caller delete outside of mutex
+  }
+  return nullptr;
+}
+
+// Looks up `key` as of options.snapshot (or the latest sequence number when
+// no snapshot is given): first in the memtable, then in the immutable
+// memtables, then in the files of the referenced super version.
+// value_found is forwarded to the file lookup only.
+Status DBImpl::GetImpl(const ReadOptions& options,
+                       const Slice& key,
+                       std::string* value,
+                       bool* value_found) {
+  Status s;
+
+  StopWatch sw(env_, options_.statistics.get(), DB_GET, false);
+  SequenceNumber snapshot;
+  if (options.snapshot != nullptr) {
+    snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
+  } else {
+    snapshot = versions_->LastSequence();
+  }
+
+  // This can be replaced by using atomics and spinlock instead of big mutex
+  mutex_.Lock();
+  SuperVersion* get_version = super_version_->Ref();
+  mutex_.Unlock();
+
+  bool have_stat_update = false;
+  Version::GetStats stats;
+
+  // Prepare to store a list of merge operations if merge occurs.
+  MergeContext merge_context;
+
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+  LookupKey lkey(key, snapshot);
+  if (get_version->mem->Get(lkey, value, &s, merge_context, options_)) {
+    // Done
+    RecordTick(options_.statistics.get(), MEMTABLE_HIT);
+  } else if (get_version->imm.Get(lkey, value, &s, merge_context, options_)) {
+    // Done
+    RecordTick(options_.statistics.get(), MEMTABLE_HIT);
+  } else {
+    get_version->current->Get(options, lkey, value, &s, &merge_context, &stats,
+                              options_, value_found);
+    have_stat_update = true;
+    RecordTick(options_.statistics.get(), MEMTABLE_MISS);
+  }
+
+  // Release the super version. The mutex is only taken when it is needed:
+  // for the seek-compaction stats update, or to clean up the last reference.
+  bool delete_get_version = false;
+  if (!options_.disable_seek_compaction && have_stat_update) {
+    mutex_.Lock();
+    if (get_version->current->UpdateStats(stats)) {
+      MaybeScheduleFlushOrCompaction();
+    }
+    if (get_version->Unref()) {
+      get_version->Cleanup();
+      delete_get_version = true;
+    }
+    mutex_.Unlock();
+  } else {
+    if (get_version->Unref()) {
+      mutex_.Lock();
+      get_version->Cleanup();
+      mutex_.Unlock();
+      delete_get_version = true;
+    }
+  }
+  if (delete_get_version) {
+    delete get_version;
+  }
+
+  // Note, tickers are atomic now - no lock protection needed any more.
+  RecordTick(options_.statistics.get(), NUMBER_KEYS_READ);
+  RecordTick(options_.statistics.get(), BYTES_READ, value->size());
+  return s;
+}
+
+// Looks up every key in `keys` against one consistent view of the database.
+// `values` is resized to keys.size(); the returned vector holds one Status
+// per key, in the same order.
+std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
+                                     const std::vector<Slice>& keys,
+                                     std::vector<std::string>* values) {
+
+  StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET, false);
+  SequenceNumber snapshot;
+  std::vector<MemTable*> to_delete;
+
+  mutex_.Lock();
+  if (options.snapshot != nullptr) {
+    snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
+  } else {
+    snapshot = versions_->LastSequence();
+  }
+
+  MemTable* mem = mem_;
+  MemTableList imm = imm_;
+  Version* current = versions_->current();
+  mem->Ref();
+  imm.RefAll();
+  current->Ref();
+
+  // Unlock while reading from files and memtables
+
+  mutex_.Unlock();
+  bool have_stat_update = false;
+  Version::GetStats stats;
+
+  // Contain a list of merge operations if merge occurs.
+  MergeContext merge_context;
+
+  // Note: this always resizes the values array
+  int numKeys = keys.size();
+  std::vector<Status> statList(numKeys);
+  values->resize(numKeys);
+
+  // Keep track of bytes that we read for statistics-recording later
+  uint64_t bytesRead = 0;
+
+  // For each of the given keys, apply the entire "get" process as follows:
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+  // NOTE(review): the loop header and the per-key setup (merge_context
+  // reset, s, value, lkey) were missing from the text this was restored
+  // from and are reconstructed from the variables used below -- confirm
+  // against the upstream file.
+  for (int i=0; i<numKeys; ++i) {
+    merge_context.Clear();
+    Status& s = statList[i];
+    std::string* value = &(*values)[i];
+
+    LookupKey lkey(keys[i], snapshot);
+    if (mem->Get(lkey, value, &s, merge_context, options_)) {
+      // Done
+    } else if (imm.Get(lkey, value, &s, merge_context, options_)) {
+      // Done
+    } else {
+      current->Get(options, lkey, value, &s, &merge_context, &stats, options_);
+      have_stat_update = true;
+    }
+
+    if (s.ok()) {
+      bytesRead += value->size();
+    }
+  }
+
+  // Post processing (decrement reference counts and record statistics)
+  mutex_.Lock();
+  if (!options_.disable_seek_compaction &&
+      have_stat_update && current->UpdateStats(stats)) {
+    MaybeScheduleFlushOrCompaction();
+  }
+  MemTable* m = mem->Unref();
+  imm.UnrefAll(&to_delete);
+  current->Unref();
+  mutex_.Unlock();
+
+  // free up all obsolete memtables outside the mutex
+  delete m;
+  for (MemTable* v: to_delete) delete v;
+
+  RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS);
+  RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, numKeys);
+  RecordTick(options_.statistics.get(), NUMBER_MULTIGET_BYTES_READ, bytesRead);
+
+  return statList;
+}
+
+// Cheap existence probe: performs a GetImpl() restricted to the block cache
+// tier. Returns true when the key was found or when the lookup could not be
+// completed without further reads (Status::Incomplete).
+bool DBImpl::KeyMayExist(const ReadOptions& options,
+                         const Slice& key,
+                         std::string* value,
+                         bool* value_found) {
+  if (value_found != nullptr) {
+    // falsify later if key-may-exist but can't fetch value
+    *value_found = true;
+  }
+  ReadOptions roptions = options;
+  roptions.read_tier = kBlockCacheTier; // read from block cache only
+  auto s = GetImpl(roptions, key, value, value_found);
+
+  // If options.block_cache != nullptr and the index block of the table is
+  // not present in block_cache, the return value will be Status::Incomplete.
+  // In this case, key may still exist in the table.
+  return s.ok() || s.IsIncomplete();
+}
+
+// Creates a user-facing iterator positioned on the snapshot given in
+// `options`, or on the latest sequence number when none is given.
+Iterator* DBImpl::NewIterator(const ReadOptions& options) {
+  SequenceNumber latest_snapshot;
+  Iterator* iter = NewInternalIterator(options, &latest_snapshot);
+  iter = NewDBIterator(
+      &dbname_, env_, options_, user_comparator(), iter,
+      (options.snapshot != nullptr
+       ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
+       : latest_snapshot));
+  if (options.prefix) {
+    // use extra wrapper to exclude any keys from the results which
+    // don't begin with the prefix
+    iter = new PrefixFilterIterator(iter, *options.prefix,
+                                    options_.prefix_extractor);
+  }
+  return iter;
+}
+
+// Registers a snapshot at the current last sequence number.
+const Snapshot* DBImpl::GetSnapshot() {
+  MutexLock l(&mutex_);
+  return snapshots_.New(versions_->LastSequence());
+}
+
+// Removes a snapshot previously returned by GetSnapshot().
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+  MutexLock l(&mutex_);
+  snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s));
+}
+
+// Convenience methods
+Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
+  return DB::Put(o, key, val);
+}
+
+// Rejects the call with NotSupported when no merge operator is configured.
+Status DBImpl::Merge(const WriteOptions& o, const Slice& key,
+                     const Slice& val) {
+  if (!options_.merge_operator) {
+    return Status::NotSupported("Provide a merge_operator when opening DB");
+  } else {
+    return DB::Merge(o, key, val);
+  }
+}
+
+Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
+  return DB::Delete(options, key);
+}
+
+// Applies my_batch to the database. Writers queue up in writers_; the
+// writer at the front of the queue writes a whole group of queued batches
+// to the log and the memtable on behalf of the others, then wakes them.
+// A nullptr batch only makes room for writes (see MakeRoomForWrite).
+Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
+  Writer w(&mutex_);
+  w.batch = my_batch;
+  w.sync = options.sync;
+  w.disableWAL = options.disableWAL;
+  w.done = false;
+
+  StopWatch sw(env_, options_.statistics.get(), DB_WRITE, false);
+  mutex_.Lock();
+  writers_.push_back(&w);
+  while (!w.done && &w != writers_.front()) {
+    w.cv.Wait();
+  }
+
+  if (!options.disableWAL) {
+    RecordTick(options_.statistics.get(), WRITE_WITH_WAL, 1);
+  }
+
+  if (w.done) {
+    mutex_.Unlock();
+    RecordTick(options_.statistics.get(), WRITE_DONE_BY_OTHER, 1);
+    return w.status;
+  } else {
+    RecordTick(options_.statistics.get(), WRITE_DONE_BY_SELF, 1);
+  }
+
+  // May temporarily unlock and wait.
+  SuperVersion* superversion_to_free = nullptr;
+  Status status = MakeRoomForWrite(my_batch == nullptr, &superversion_to_free);
+  uint64_t last_sequence = versions_->LastSequence();
+  Writer* last_writer = &w;
+  if (status.ok() && my_batch != nullptr) {  // nullptr batch is for compactions
+    autovector<WriteBatch*> write_batch_group;
+    BuildBatchGroup(&last_writer, &write_batch_group);
+
+    // Add to log and apply to memtable.  We can release the lock
+    // during this phase since &w is currently responsible for logging
+    // and protects against concurrent loggers and concurrent writes
+    // into mem_.
+    {
+      mutex_.Unlock();
+      WriteBatch* updates = nullptr;
+      if (write_batch_group.size() == 1) {
+        updates = write_batch_group[0];
+      } else {
+        updates = &tmp_batch_;
+        for (size_t i = 0; i < write_batch_group.size(); ++i) {
+          WriteBatchInternal::Append(updates, write_batch_group[i]);
+        }
+      }
+
+      const SequenceNumber current_sequence = last_sequence + 1;
+      WriteBatchInternal::SetSequence(updates, current_sequence);
+      int my_batch_count = WriteBatchInternal::Count(updates);
+      last_sequence += my_batch_count;
+      // Record statistics
+      RecordTick(options_.statistics.get(),
+                 NUMBER_KEYS_WRITTEN, my_batch_count);
+      RecordTick(options_.statistics.get(),
+                 BYTES_WRITTEN,
+                 WriteBatchInternal::ByteSize(updates));
+      if (options.disableWAL) {
+        flush_on_destroy_ = true;
+      }
+
+      if (!options.disableWAL) {
+        StopWatchNano timer(env_);
+        StartPerfTimer(&timer);
+        Slice log_entry = WriteBatchInternal::Contents(updates);
+        status = log_->AddRecord(log_entry);
+        // NOTE(review): WAL_FILE_SYNCED is ticked for every appended
+        // record, not only when a sync is performed below -- confirm that
+        // this is the intended meaning of the counter.
+        RecordTick(options_.statistics.get(), WAL_FILE_SYNCED, 1);
+        RecordTick(options_.statistics.get(), WAL_FILE_BYTES, log_entry.size());
+        BumpPerfTime(&perf_context.wal_write_time, &timer);
+        if (status.ok() && options.sync) {
+          // The stop watch must be a named local: an unnamed temporary is
+          // destroyed at the end of its own statement, before the sync
+          // call runs, so the sync would never be timed.
+          if (options_.use_fsync) {
+            StopWatch sync_timer(env_, options_.statistics.get(),
+                                 WAL_FILE_SYNC_MICROS);
+            status = log_->file()->Fsync();
+          } else {
+            StopWatch sync_timer(env_, options_.statistics.get(),
+                                 WAL_FILE_SYNC_MICROS);
+            status = log_->file()->Sync();
+          }
+        }
+      }
+      if (status.ok()) {
+        status = WriteBatchInternal::InsertInto(updates, mem_, &options_, this,
+                                                options_.filter_deletes);
+        if (!status.ok()) {
+          // Panic for in-memory corruptions
+          // Note that existing logic was not sound. Any partial failure writing
+          // into the memtable would result in a state that some write ops might
+          // have succeeded in memtable but Status reports error for all writes.
+          throw std::runtime_error("In memory WriteBatch corruption!");
+        }
+        SetTickerCount(options_.statistics.get(),
+                       SEQUENCE_NUMBER, last_sequence);
+      }
+      if (updates == &tmp_batch_) tmp_batch_.Clear();
+      mutex_.Lock();
+      if (status.ok()) {
+        versions_->SetLastSequence(last_sequence);
+      }
+    }
+  }
+  if (options_.paranoid_checks && !status.ok() && bg_error_.ok()) {
+    bg_error_ = status; // stop compaction & fail any further writes
+  }
+
+  // Hand the result to every writer whose batch was part of this group and
+  // remove them from the queue.
+  while (true) {
+    Writer* ready = writers_.front();
+    writers_.pop_front();
+    if (ready != &w) {
+      ready->status = status;
+      ready->done = true;
+      ready->cv.Signal();
+    }
+    if (ready == last_writer) break;
+  }
+
+  // Notify new head of write queue
+  if (!writers_.empty()) {
+    writers_.front()->cv.Signal();
+  }
+  mutex_.Unlock();
+  delete superversion_to_free;
+  return status;
+}
+
+// Collects the batch of the first queued writer plus as many following
+// batches as fit under the size limit into *write_batch_group, and sets
+// *last_writer to the last writer covered by the group.
+// REQUIRES: Writer list must be non-empty
+// REQUIRES: First writer must have a non-nullptr batch
+void DBImpl::BuildBatchGroup(Writer** last_writer,
+                             autovector<WriteBatch*>* write_batch_group) {
+  assert(!writers_.empty());
+  Writer* first = writers_.front();
+  assert(first->batch != nullptr);
+
+  size_t size = WriteBatchInternal::ByteSize(first->batch);
+  write_batch_group->push_back(first->batch);
+
+  // Allow the group to grow up to a maximum size, but if the
+  // original write is small, limit the growth so we do not slow
+  // down the small write too much.
+  size_t max_size = 1 << 20;
+  if (size <= (128<<10)) {
+    max_size = size + (128<<10);
+  }
+
+  *last_writer = first;
+  std::deque<Writer*>::iterator iter = writers_.begin();
+  ++iter;  // Advance past "first"
+  for (; iter != writers_.end(); ++iter) {
+    Writer* w = *iter;
+    if (w->sync && !first->sync) {
+      // Do not include a sync write into a batch handled by a non-sync write.
+      break;
+    }
+
+    if (!w->disableWAL && first->disableWAL) {
+      // Do not include a write that needs WAL into a batch that has
+      // WAL disabled.
+      break;
+    }
+
+    if (w->batch != nullptr) {
+      size += WriteBatchInternal::ByteSize(w->batch);
+      if (size > max_size) {
+        // Do not make batch too big
+        break;
+      }
+
+      write_batch_group->push_back(w->batch);
+    }
+    *last_writer = w;
+  }
+}
+
+// This function computes the amount of time in microseconds by which a write
+// should be delayed based on the number of level-0 files according to the
+// following formula:
+// if n < bottom, return 0;
+// if n >= top, return 1000;
+// otherwise, let r = (n - bottom) /
+//                    (top - bottom)
+//  and return r^2 * 1000.
+// The goal of this formula is to gradually increase the rate at which writes
+// are slowed. We also tried linear delay (r * 1000), but it seemed to do
+// slightly worse. There is no other particular reason for choosing quadratic.
+//
+// The second argument is the lower threshold (`bottom`) and the third the
+// upper threshold (`top`): call sites pass the slowdown/soft limit second
+// and the stop/hard limit third. Binding them the other way round makes the
+// `n >= top` test succeed first and the function always return 1000.
+// NOTE(review): the thresholds are ints, so a call site that passes
+// fractional rate limits has them truncated before the comparison.
+uint64_t DBImpl::SlowdownAmount(int n, int bottom, int top) {
+  uint64_t delay;
+  if (n >= top) {
+    delay = 1000;
+  }
+  else if (n < bottom) {
+    delay = 0;
+  }
+  else {
+    // If we are here, we know that:
+    //   bottom <= n < top
+    // since the previous two conditions are false.
+    float how_much =
+      (float) (n - bottom) /
+              (top - bottom);
+    delay = how_much * how_much * 1000;
+  }
+  assert(delay <= 1000);
+  return delay;
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+Status DBImpl::MakeRoomForWrite(bool force,
+                                SuperVersion** superversion_to_free) {
+  mutex_.AssertHeld();
+  assert(!writers_.empty());
+  bool allow_delay = !force;
+  bool allow_hard_rate_limit_delay = !force;
+  bool allow_soft_rate_limit_delay = !force;
+  uint64_t rate_limit_delay_millis = 0;
+  Status s;
+  double score;
+  *superversion_to_free = nullptr;
+
+  while (true) {
+    if (!bg_error_.ok()) {
+      // Yield previous error
+      s = bg_error_;
+      break;
+    } else if (allow_delay && versions_->NeedSlowdownForNumLevel0Files()) {
+      // We are getting close to hitting a hard limit on the number of
+      // L0 files.  Rather than delaying a single write by several
+      // seconds when we hit the hard limit, start delaying each
+      // individual write by 0-1ms to reduce latency variance.  Also,
+      // this delay hands over some CPU to the compaction thread in
+      // case it is sharing the same core as the writer.
+ mutex_.Unlock(); + uint64_t delayed; + { + StopWatch sw(env_, options_.statistics.get(), STALL_L0_SLOWDOWN_COUNT); + env_->SleepForMicroseconds( + SlowdownAmount(versions_->current()->NumLevelFiles(0), + options_.level0_slowdown_writes_trigger, + options_.level0_stop_writes_trigger) + ); + delayed = sw.ElapsedMicros(); + } + RecordTick(options_.statistics.get(), STALL_L0_SLOWDOWN_MICROS, delayed); + stall_level0_slowdown_ += delayed; + stall_level0_slowdown_count_++; + allow_delay = false; // Do not delay a single write more than once + mutex_.Lock(); + delayed_writes_++; + } else if (!force && + (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) { + // There is room in current memtable + if (allow_delay) { + DelayLoggingAndReset(); + } + break; + } else if (imm_.size() == options_.max_write_buffer_number - 1) { + // We have filled up the current memtable, but the previous + // ones are still being compacted, so we wait. + DelayLoggingAndReset(); + Log(options_.info_log, "wait for memtable compaction...\n"); + uint64_t stall; + { + StopWatch sw(env_, options_.statistics.get(), + STALL_MEMTABLE_COMPACTION_COUNT); + bg_cv_.Wait(); + stall = sw.ElapsedMicros(); + } + RecordTick(options_.statistics.get(), + STALL_MEMTABLE_COMPACTION_MICROS, stall); + stall_memtable_compaction_ += stall; + stall_memtable_compaction_count_++; + } else if (versions_->current()->NumLevelFiles(0) >= + options_.level0_stop_writes_trigger) { + // There are too many level-0 files. 
+ DelayLoggingAndReset(); + Log(options_.info_log, "wait for fewer level0 files...\n"); + uint64_t stall; + { + StopWatch sw(env_, options_.statistics.get(), + STALL_L0_NUM_FILES_COUNT); + bg_cv_.Wait(); + stall = sw.ElapsedMicros(); + } + RecordTick(options_.statistics.get(), STALL_L0_NUM_FILES_MICROS, stall); + stall_level0_num_files_ += stall; + stall_level0_num_files_count_++; + } else if ( + allow_hard_rate_limit_delay && + options_.hard_rate_limit > 1.0 && + (score = versions_->MaxCompactionScore()) > options_.hard_rate_limit) { + // Delay a write when the compaction score for any level is too large. + int max_level = versions_->MaxCompactionScoreLevel(); + mutex_.Unlock(); + uint64_t delayed; + { + StopWatch sw(env_, options_.statistics.get(), + HARD_RATE_LIMIT_DELAY_COUNT); + env_->SleepForMicroseconds(1000); + delayed = sw.ElapsedMicros(); + } + stall_leveln_slowdown_[max_level] += delayed; + stall_leveln_slowdown_count_[max_level]++; + // Make sure the following value doesn't round to zero. + uint64_t rate_limit = std::max((delayed / 1000), (uint64_t) 1); + rate_limit_delay_millis += rate_limit; + RecordTick(options_.statistics.get(), + RATE_LIMIT_DELAY_MILLIS, rate_limit); + if (options_.rate_limit_delay_max_milliseconds > 0 && + rate_limit_delay_millis >= + (unsigned)options_.rate_limit_delay_max_milliseconds) { + allow_hard_rate_limit_delay = false; + } + mutex_.Lock(); + } else if ( + allow_soft_rate_limit_delay && + options_.soft_rate_limit > 0.0 && + (score = versions_->MaxCompactionScore()) > options_.soft_rate_limit) { + // Delay a write when the compaction score for any level is too large. 
+ // TODO: add statistics + mutex_.Unlock(); + { + StopWatch sw(env_, options_.statistics.get(), + SOFT_RATE_LIMIT_DELAY_COUNT); + env_->SleepForMicroseconds(SlowdownAmount( + score, + options_.soft_rate_limit, + options_.hard_rate_limit) + ); + rate_limit_delay_millis += sw.ElapsedMicros(); + } + allow_soft_rate_limit_delay = false; + mutex_.Lock(); + + } else { + unique_ptr lfile; + MemTable* memtmp = nullptr; + + // Attempt to switch to a new memtable and trigger compaction of old. + // Do this without holding the dbmutex lock. + assert(versions_->PrevLogNumber() == 0); + uint64_t new_log_number = versions_->NewFileNumber(); + SuperVersion* new_superversion = nullptr; + mutex_.Unlock(); + { + EnvOptions soptions(storage_options_); + soptions.use_mmap_writes = false; + DelayLoggingAndReset(); + s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), + &lfile, soptions); + if (s.ok()) { + // Our final size should be less than write_buffer_size + // (compression, etc) but err on the side of caution. + lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); + memtmp = new MemTable(internal_comparator_, options_); + new_superversion = new SuperVersion(options_.max_write_buffer_number); + } + } + mutex_.Lock(); + if (!s.ok()) { + // Avoid chewing through file number space in a tight loop. 
+ versions_->ReuseFileNumber(new_log_number); + assert (!memtmp); + break; + } + logfile_number_ = new_log_number; + log_.reset(new log::Writer(std::move(lfile))); + mem_->SetNextLogNumber(logfile_number_); + imm_.Add(mem_); + if (force) { + imm_.FlushRequested(); + } + mem_ = memtmp; + mem_->Ref(); + Log(options_.info_log, + "New memtable created with log file: #%lu\n", + (unsigned long)logfile_number_); + mem_->SetLogNumber(logfile_number_); + force = false; // Do not force another compaction if have room + MaybeScheduleFlushOrCompaction(); + *superversion_to_free = InstallSuperVersion(new_superversion); + } + } + return s; +} + +const std::string& DBImpl::GetName() const { + return dbname_; +} + +Env* DBImpl::GetEnv() const { + return env_; +} + +const Options& DBImpl::GetOptions() const { + return options_; +} + +bool DBImpl::GetProperty(const Slice& property, std::string* value) { + value->clear(); + + MutexLock l(&mutex_); + Version* current = versions_->current(); + Slice in = property; + Slice prefix("rocksdb."); + if (!in.starts_with(prefix)) return false; + in.remove_prefix(prefix.size()); + + if (in.starts_with("num-files-at-level")) { + in.remove_prefix(strlen("num-files-at-level")); + uint64_t level; + bool ok = ConsumeDecimalNumber(&in, &level) && in.empty(); + if (!ok || (int)level >= NumberLevels()) { + return false; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "%d", + current->NumLevelFiles(static_cast(level))); + *value = buf; + return true; + } + } else if (in == "levelstats") { + char buf[1000]; + snprintf(buf, sizeof(buf), + "Level Files Size(MB)\n" + "--------------------\n"); + value->append(buf); + + for (int level = 0; level < NumberLevels(); level++) { + snprintf(buf, sizeof(buf), + "%3d %8d %8.0f\n", + level, + current->NumLevelFiles(level), + current->NumLevelBytes(level) / 1048576.0); + value->append(buf); + } + return true; + + } else if (in == "stats") { + char buf[1000]; + + uint64_t wal_bytes = 0; + uint64_t wal_synced = 
0; + uint64_t user_bytes_written = 0; + uint64_t write_other = 0; + uint64_t write_self = 0; + uint64_t write_with_wal = 0; + uint64_t total_bytes_written = 0; + uint64_t total_bytes_read = 0; + uint64_t micros_up = env_->NowMicros() - started_at_; + // Add "+1" to make sure seconds_up is > 0 and avoid NaN later + double seconds_up = (micros_up + 1) / 1000000.0; + uint64_t total_slowdown = 0; + uint64_t total_slowdown_count = 0; + uint64_t interval_bytes_written = 0; + uint64_t interval_bytes_read = 0; + uint64_t interval_bytes_new = 0; + double interval_seconds_up = 0; + + Statistics* s = options_.statistics.get(); + if (s) { + wal_bytes = s->getTickerCount(WAL_FILE_BYTES); + wal_synced = s->getTickerCount(WAL_FILE_SYNCED); + user_bytes_written = s->getTickerCount(BYTES_WRITTEN); + write_other = s->getTickerCount(WRITE_DONE_BY_OTHER); + write_self = s->getTickerCount(WRITE_DONE_BY_SELF); + write_with_wal = s->getTickerCount(WRITE_WITH_WAL); + } + + // Pardon the long line but I think it is easier to read this way. + snprintf(buf, sizeof(buf), + " Compactions\n" + "Level Files Size(MB) Score Time(sec) Read(MB) Write(MB) Rn(MB) Rnp1(MB) Wnew(MB) RW-Amplify Read(MB/s) Write(MB/s) Rn Rnp1 Wnp1 NewW Count Ln-stall Stall-cnt\n" + "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n" + ); + value->append(buf); + for (int level = 0; level < current->NumberLevels(); level++) { + int files = current->NumLevelFiles(level); + if (stats_[level].micros > 0 || files > 0) { + int64_t bytes_read = stats_[level].bytes_readn + + stats_[level].bytes_readnp1; + int64_t bytes_new = stats_[level].bytes_written - + stats_[level].bytes_readnp1; + double amplify = (stats_[level].bytes_readn == 0) + ? 
0.0 + : (stats_[level].bytes_written + + stats_[level].bytes_readnp1 + + stats_[level].bytes_readn) / + (double) stats_[level].bytes_readn; + + total_bytes_read += bytes_read; + total_bytes_written += stats_[level].bytes_written; + + snprintf( + buf, sizeof(buf), + "%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f %10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %9.1f %9lu\n", + level, + files, + current->NumLevelBytes(level) / 1048576.0, + current->NumLevelBytes(level) / + versions_->MaxBytesForLevel(level), + stats_[level].micros / 1e6, + bytes_read / 1048576.0, + stats_[level].bytes_written / 1048576.0, + stats_[level].bytes_readn / 1048576.0, + stats_[level].bytes_readnp1 / 1048576.0, + bytes_new / 1048576.0, + amplify, + // +1 to avoid division by 0 + (bytes_read / 1048576.0) / ((stats_[level].micros+1) / 1000000.0), + (stats_[level].bytes_written / 1048576.0) / + ((stats_[level].micros+1) / 1000000.0), + stats_[level].files_in_leveln, + stats_[level].files_in_levelnp1, + stats_[level].files_out_levelnp1, + stats_[level].files_out_levelnp1 - stats_[level].files_in_levelnp1, + stats_[level].count, + stall_leveln_slowdown_[level] / 1000000.0, + (unsigned long) stall_leveln_slowdown_count_[level]); + total_slowdown += stall_leveln_slowdown_[level]; + total_slowdown_count += stall_leveln_slowdown_count_[level]; + value->append(buf); + } + } + + interval_bytes_new = user_bytes_written - last_stats_.ingest_bytes_; + interval_bytes_read = total_bytes_read - last_stats_.compaction_bytes_read_; + interval_bytes_written = + total_bytes_written - last_stats_.compaction_bytes_written_; + interval_seconds_up = seconds_up - last_stats_.seconds_up_; + + snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n", + seconds_up, interval_seconds_up); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Writes cumulative: %llu total, %llu batches, " + "%.1f per batch, %.2f ingest GB\n", + (unsigned long long) (write_other + write_self), + (unsigned long long) 
write_self, + (write_other + write_self) / (double) (write_self + 1), + user_bytes_written / (1048576.0 * 1024)); + value->append(buf); + + snprintf(buf, sizeof(buf), + "WAL cumulative: %llu WAL writes, %llu WAL syncs, " + "%.2f writes per sync, %.2f GB written\n", + (unsigned long long) write_with_wal, + (unsigned long long ) wal_synced, + write_with_wal / (double) (wal_synced + 1), + wal_bytes / (1048576.0 * 1024)); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Compaction IO cumulative (GB): " + "%.2f new, %.2f read, %.2f write, %.2f read+write\n", + user_bytes_written / (1048576.0 * 1024), + total_bytes_read / (1048576.0 * 1024), + total_bytes_written / (1048576.0 * 1024), + (total_bytes_read + total_bytes_written) / (1048576.0 * 1024)); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Compaction IO cumulative (MB/sec): " + "%.1f new, %.1f read, %.1f write, %.1f read+write\n", + user_bytes_written / 1048576.0 / seconds_up, + total_bytes_read / 1048576.0 / seconds_up, + total_bytes_written / 1048576.0 / seconds_up, + (total_bytes_read + total_bytes_written) / 1048576.0 / seconds_up); + value->append(buf); + + // +1 to avoid divide by 0 and NaN + snprintf(buf, sizeof(buf), + "Amplification cumulative: %.1f write, %.1f compaction\n", + (double) (total_bytes_written + wal_bytes) + / (user_bytes_written + 1), + (double) (total_bytes_written + total_bytes_read + wal_bytes) + / (user_bytes_written + 1)); + value->append(buf); + + uint64_t interval_write_other = write_other - last_stats_.write_other_; + uint64_t interval_write_self = write_self - last_stats_.write_self_; + + snprintf(buf, sizeof(buf), + "Writes interval: %llu total, %llu batches, " + "%.1f per batch, %.1f ingest MB\n", + (unsigned long long) (interval_write_other + interval_write_self), + (unsigned long long) interval_write_self, + (double) (interval_write_other + interval_write_self) + / (interval_write_self + 1), + (user_bytes_written - last_stats_.ingest_bytes_) / 1048576.0); + 
value->append(buf); + + uint64_t interval_write_with_wal = + write_with_wal - last_stats_.write_with_wal_; + + uint64_t interval_wal_synced = wal_synced - last_stats_.wal_synced_; + uint64_t interval_wal_bytes = wal_bytes - last_stats_.wal_bytes_; + + snprintf(buf, sizeof(buf), + "WAL interval: %llu WAL writes, %llu WAL syncs, " + "%.2f writes per sync, %.2f MB written\n", + (unsigned long long) interval_write_with_wal, + (unsigned long long ) interval_wal_synced, + interval_write_with_wal / (double) (interval_wal_synced + 1), + interval_wal_bytes / (1048576.0 * 1024)); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Compaction IO interval (MB): " + "%.2f new, %.2f read, %.2f write, %.2f read+write\n", + interval_bytes_new / 1048576.0, + interval_bytes_read/ 1048576.0, + interval_bytes_written / 1048576.0, + (interval_bytes_read + interval_bytes_written) / 1048576.0); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Compaction IO interval (MB/sec): " + "%.1f new, %.1f read, %.1f write, %.1f read+write\n", + interval_bytes_new / 1048576.0 / interval_seconds_up, + interval_bytes_read / 1048576.0 / interval_seconds_up, + interval_bytes_written / 1048576.0 / interval_seconds_up, + (interval_bytes_read + interval_bytes_written) + / 1048576.0 / interval_seconds_up); + value->append(buf); + + // +1 to avoid divide by 0 and NaN + snprintf(buf, sizeof(buf), + "Amplification interval: %.1f write, %.1f compaction\n", + (double) (interval_bytes_written + wal_bytes) + / (interval_bytes_new + 1), + (double) (interval_bytes_written + interval_bytes_read + wal_bytes) + / (interval_bytes_new + 1)); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Stalls(secs): %.3f level0_slowdown, %.3f level0_numfiles, " + "%.3f memtable_compaction, %.3f leveln_slowdown\n", + stall_level0_slowdown_ / 1000000.0, + stall_level0_num_files_ / 1000000.0, + stall_memtable_compaction_ / 1000000.0, + total_slowdown / 1000000.0); + value->append(buf); + + snprintf(buf, sizeof(buf), + 
"Stalls(count): %lu level0_slowdown, %lu level0_numfiles, " + "%lu memtable_compaction, %lu leveln_slowdown\n", + (unsigned long) stall_level0_slowdown_count_, + (unsigned long) stall_level0_num_files_count_, + (unsigned long) stall_memtable_compaction_count_, + (unsigned long) total_slowdown_count); + value->append(buf); + + last_stats_.compaction_bytes_read_ = total_bytes_read; + last_stats_.compaction_bytes_written_ = total_bytes_written; + last_stats_.ingest_bytes_ = user_bytes_written; + last_stats_.seconds_up_ = seconds_up; + last_stats_.wal_bytes_ = wal_bytes; + last_stats_.wal_synced_ = wal_synced; + last_stats_.write_with_wal_ = write_with_wal; + last_stats_.write_other_ = write_other; + last_stats_.write_self_ = write_self; + + return true; + } else if (in == "sstables") { + *value = versions_->current()->DebugString(); + return true; + } else if (in == "num-immutable-mem-table") { + *value = std::to_string(imm_.size()); + return true; + } + + return false; +} + +void DBImpl::GetApproximateSizes( + const Range* range, int n, + uint64_t* sizes) { + // TODO(opt): better implementation + Version* v; + { + MutexLock l(&mutex_); + versions_->current()->Ref(); + v = versions_->current(); + } + + for (int i = 0; i < n; i++) { + // Convert user_key into a corresponding internal key. + InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); + uint64_t start = versions_->ApproximateOffsetOf(v, k1); + uint64_t limit = versions_->ApproximateOffsetOf(v, k2); + sizes[i] = (limit >= start ? 
limit - start : 0); + } + + { + MutexLock l(&mutex_); + v->Unref(); + } +} + +inline void DBImpl::DelayLoggingAndReset() { + if (delayed_writes_ > 0) { + Log(options_.info_log, "delayed %d write...\n", delayed_writes_ ); + delayed_writes_ = 0; + } +} + +Status DBImpl::DeleteFile(std::string name) { + uint64_t number; + FileType type; + WalFileType log_type; + if (!ParseFileName(name, &number, &type, &log_type) || + (type != kTableFile && type != kLogFile)) { + Log(options_.info_log, "DeleteFile %s failed.\n", name.c_str()); + return Status::InvalidArgument("Invalid file name"); + } + + Status status; + if (type == kLogFile) { + // Only allow deleting archived log files + if (log_type != kArchivedLogFile) { + Log(options_.info_log, "DeleteFile %s failed.\n", name.c_str()); + return Status::NotSupported("Delete only supported for archived logs"); + } + status = env_->DeleteFile(options_.wal_dir + "/" + name.c_str()); + if (!status.ok()) { + Log(options_.info_log, "DeleteFile %s failed.\n", name.c_str()); + } + return status; + } + + int level; + FileMetaData metadata; + int maxlevel = NumberLevels(); + VersionEdit edit; + DeletionState deletion_state(0, true); + { + MutexLock l(&mutex_); + status = versions_->GetMetadataForFile(number, &level, &metadata); + if (!status.ok()) { + Log(options_.info_log, "DeleteFile %s failed. File not found\n", + name.c_str()); + return Status::InvalidArgument("File not found"); + } + assert((level > 0) && (level < maxlevel)); + + // If the file is being compacted no need to delete. + if (metadata.being_compacted) { + Log(options_.info_log, + "DeleteFile %s Skipped. File about to be compacted\n", name.c_str()); + return Status::OK(); + } + + // Only the files in the last level can be deleted externally. + // This is to make sure that any deletion tombstones are not + // lost. Check that the level passed is the last level. 
+ for (int i = level + 1; i < maxlevel; i++) { + if (versions_->current()->NumLevelFiles(i) != 0) { + Log(options_.info_log, + "DeleteFile %s FAILED. File not in last level\n", name.c_str()); + return Status::InvalidArgument("File not in last level"); + } + } + edit.DeleteFile(level, number); + status = versions_->LogAndApply(&edit, &mutex_); + if (status.ok()) { + InstallSuperVersion(deletion_state); + } + FindObsoleteFiles(deletion_state, false); + } // lock released here + LogFlush(options_.info_log); + // remove files outside the db-lock + PurgeObsoleteFiles(deletion_state); + return status; +} + +void DBImpl::GetLiveFilesMetaData(std::vector *metadata) { + MutexLock l(&mutex_); + return versions_->GetLiveFilesMetaData(metadata); +} + +Status DBImpl::GetDbIdentity(std::string& identity) { + std::string idfilename = IdentityFileName(dbname_); + unique_ptr idfile; + const EnvOptions soptions; + Status s = env_->NewSequentialFile(idfilename, &idfile, soptions); + if (!s.ok()) { + return s; + } + uint64_t file_size; + s = env_->GetFileSize(idfilename, &file_size); + if (!s.ok()) { + return s; + } + char buffer[file_size]; + Slice id; + s = idfile->Read(file_size, &id, buffer); + if (!s.ok()) { + return s; + } + identity.assign(id.ToString()); + // If last character is '\n' remove it from identity + if (identity.size() > 0 && identity.back() == '\n') { + identity.pop_back(); + } + return s; +} + +// Default implementations of convenience methods that subclasses of DB +// can call if they wish +Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) { + // Pre-allocate size of write batch conservatively. + // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, + // and we allocate 11 extra bytes for key length, as well as value length. 
+ WriteBatch batch(key.size() + value.size() + 24); + batch.Put(key, value); + return Write(opt, &batch); +} + +Status DB::Delete(const WriteOptions& opt, const Slice& key) { + WriteBatch batch; + batch.Delete(key); + return Write(opt, &batch); +} + +Status DB::Merge(const WriteOptions& opt, const Slice& key, + const Slice& value) { + WriteBatch batch; + batch.Merge(key, value); + return Write(opt, &batch); +} + +DB::~DB() { } + +Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { + *dbptr = nullptr; + EnvOptions soptions; + + if (options.block_cache != nullptr && options.no_block_cache) { + return Status::InvalidArgument( + "no_block_cache is true while block_cache is not nullptr"); + } + + DBImpl* impl = new DBImpl(options, dbname); + Status s = impl->env_->CreateDirIfMissing(impl->options_.wal_dir); + if (!s.ok()) { + delete impl; + return s; + } + + s = impl->CreateArchivalDirectory(); + if (!s.ok()) { + delete impl; + return s; + } + impl->mutex_.Lock(); + VersionEdit edit; + s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists + if (s.ok()) { + uint64_t new_log_number = impl->versions_->NewFileNumber(); + unique_ptr lfile; + soptions.use_mmap_writes = false; + s = impl->options_.env->NewWritableFile( + LogFileName(impl->options_.wal_dir, new_log_number), + &lfile, + soptions + ); + if (s.ok()) { + lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size); + edit.SetLogNumber(new_log_number); + impl->logfile_number_ = new_log_number; + impl->log_.reset(new log::Writer(std::move(lfile))); + s = impl->versions_->LogAndApply(&edit, &impl->mutex_); + } + if (s.ok()) { + delete impl->InstallSuperVersion(new DBImpl::SuperVersion()); + impl->mem_->SetLogNumber(impl->logfile_number_); + impl->DeleteObsoleteFiles(); + impl->MaybeScheduleFlushOrCompaction(); + impl->MaybeScheduleLogDBDeployStats(); + } + } + + if (s.ok() && impl->options_.compaction_style == kCompactionStyleUniversal) { + Version* current = 
impl->versions_->current(); + for (int i = 1; i < impl->NumberLevels(); i++) { + int num_files = current->NumLevelFiles(i); + if (num_files > 0) { + s = Status::InvalidArgument("Not all files are at level 0. Cannot " + "open with universal compaction style."); + break; + } + } + } + + impl->mutex_.Unlock(); + + if (s.ok()) { + *dbptr = impl; + } else { + delete impl; + } + return s; +} + +Snapshot::~Snapshot() { +} + +Status DestroyDB(const std::string& dbname, const Options& options) { + const InternalKeyComparator comparator(options.comparator); + const InternalFilterPolicy filter_policy(options.filter_policy); + const Options& soptions(SanitizeOptions( + dbname, &comparator, &filter_policy, options)); + Env* env = soptions.env; + std::vector filenames; + std::vector archiveFiles; + + std::string archivedir = ArchivalDirectory(dbname); + // Ignore error in case directory does not exist + env->GetChildren(dbname, &filenames); + + if (dbname != soptions.wal_dir) { + std::vector logfilenames; + env->GetChildren(soptions.wal_dir, &logfilenames); + filenames.insert(filenames.end(), logfilenames.begin(), logfilenames.end()); + archivedir = ArchivalDirectory(soptions.wal_dir); + } + + if (filenames.empty()) { + return Status::OK(); + } + + FileLock* lock; + const std::string lockname = LockFileName(dbname); + Status result = env->LockFile(lockname, &lock); + if (result.ok()) { + uint64_t number; + FileType type; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && + type != kDBLockFile) { // Lock file will be deleted at end + Status del; + if (type == kMetaDatabase) { + del = DestroyDB(dbname + "/" + filenames[i], options); + } else if (type == kLogFile) { + del = env->DeleteFile(soptions.wal_dir + "/" + filenames[i]); + } else { + del = env->DeleteFile(dbname + "/" + filenames[i]); + } + if (result.ok() && !del.ok()) { + result = del; + } + } + } + + env->GetChildren(archivedir, &archiveFiles); + // Delete archival 
files. + for (size_t i = 0; i < archiveFiles.size(); ++i) { + if (ParseFileName(archiveFiles[i], &number, &type) && + type == kLogFile) { + Status del = env->DeleteFile(archivedir + "/" + archiveFiles[i]); + if (result.ok() && !del.ok()) { + result = del; + } + } + } + // ignore case where no archival directory is present. + env->DeleteDir(archivedir); + + env->UnlockFile(lock); // Ignore error since state is already gone + env->DeleteFile(lockname); + env->DeleteDir(dbname); // Ignore error in case dir contains other files + env->DeleteDir(soptions.wal_dir); + } + return result; +} + +// +// A global method that can dump out the build version +void dumpLeveldbBuildVersion(Logger * log) { + Log(log, "Git sha %s", rocksdb_build_git_sha); + Log(log, "Compile time %s %s", + rocksdb_build_compile_time, rocksdb_build_compile_date); +} + +} // namespace rocksdb diff --git a/db/db_impl.h b/db/db_impl.h new file mode 100644 index 00000000..214affac --- /dev/null +++ b/db/db_impl.h @@ -0,0 +1,605 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#pragma once +#include +#include +#include +#include +#include "db/dbformat.h" +#include "db/log_writer.h" +#include "db/snapshot.h" +#include "db/version_edit.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/transaction_log.h" +#include "port/port.h" +#include "util/stats_logger.h" +#include "memtablelist.h" +#include "util/autovector.h" + +namespace rocksdb { + +class MemTable; +class TableCache; +class Version; +class VersionEdit; +class VersionSet; + +class DBImpl : public DB { + public: + DBImpl(const Options& options, const std::string& dbname); + virtual ~DBImpl(); + + // Implementations of the DB interface + virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value); + virtual Status Merge(const WriteOptions&, const Slice& key, + const Slice& value); + virtual Status Delete(const WriteOptions&, const Slice& key); + virtual Status Write(const WriteOptions& options, WriteBatch* updates); + virtual Status Get(const ReadOptions& options, + const Slice& key, + std::string* value); + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values); + + // Returns false if key doesn't exist in the database and true if it may. + // If value_found is not passed in as null, then return the value if found in + // memory. On return, if value was found, then value_found will be set to true + // , otherwise false. 
+ virtual bool KeyMayExist(const ReadOptions& options, + const Slice& key, + std::string* value, + bool* value_found = nullptr); + virtual Iterator* NewIterator(const ReadOptions&); + virtual const Snapshot* GetSnapshot(); + virtual void ReleaseSnapshot(const Snapshot* snapshot); + virtual bool GetProperty(const Slice& property, std::string* value); + virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes); + virtual void CompactRange(const Slice* begin, const Slice* end, + bool reduce_level = false, int target_level = -1); + virtual int NumberLevels(); + virtual int MaxMemCompactionLevel(); + virtual int Level0StopWriteTrigger(); + virtual const std::string& GetName() const; + virtual Env* GetEnv() const; + virtual const Options& GetOptions() const; + virtual Status Flush(const FlushOptions& options); + virtual Status DisableFileDeletions(); + virtual Status EnableFileDeletions(bool force); + // All the returned filenames start with "/" + virtual Status GetLiveFiles(std::vector&, + uint64_t* manifest_file_size, + bool flush_memtable = true); + virtual Status GetSortedWalFiles(VectorLogPtr& files); + virtual SequenceNumber GetLatestSequenceNumber() const; + virtual Status GetUpdatesSince(SequenceNumber seq_number, + unique_ptr* iter); + virtual Status DeleteFile(std::string name); + + virtual void GetLiveFilesMetaData( + std::vector *metadata); + + virtual Status GetDbIdentity(std::string& identity); + + void RunManualCompaction(int input_level, + int output_level, + const Slice* begin, + const Slice* end); + + // Extra methods (for testing) that are not in the public DB interface + + // Compact any files in the named level that overlap [*begin, *end] + void TEST_CompactRange(int level, + const Slice* begin, + const Slice* end); + + // Force current memtable contents to be flushed. 
+ Status TEST_FlushMemTable(); + + // Wait for memtable compaction + Status TEST_WaitForFlushMemTable(); + + // Wait for any compaction + Status TEST_WaitForCompact(); + + // Return an internal iterator over the current state of the database. + // The keys of this iterator are internal keys (see format.h). + // The returned iterator should be deleted when no longer needed. + Iterator* TEST_NewInternalIterator(); + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t TEST_MaxNextLevelOverlappingBytes(); + + // Simulate a db crash, no elegant closing of database. + void TEST_Destroy_DBImpl(); + + // Return the current manifest file no. + uint64_t TEST_Current_Manifest_FileNo(); + + // Triggers a background call for testing. + void TEST_PurgeObsoleteteWAL(); + + // get total level0 file size. Only for testing. + uint64_t TEST_GetLevel0TotalSize(); + + void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL) + { + default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL; + } + + // holds references to memtable, all immutable memtables and version + struct SuperVersion { + MemTable* mem; + MemTableList imm; + Version* current; + std::atomic refs; + // We need to_delete because during Cleanup(), imm.UnrefAll() returns + // all memtables that we need to free through this vector. We then + // delete all those memtables outside of mutex, during destruction + std::vector to_delete; + + // should be called outside the mutex + explicit SuperVersion(const int num_memtables = 0); + ~SuperVersion(); + SuperVersion* Ref(); + // Returns true if this was the last reference and caller should + // call Cleanup() and delete the object + bool Unref(); + + // call these two methods with db mutex held + // Cleanup unrefs mem, imm and current. Also, it stores all memtables + // that need to be deleted in to_delete vector.
Unrefing those + // objects needs to be done in the mutex + void Cleanup(); + void Init(MemTable* new_mem, const MemTableList& new_imm, + Version* new_current); + }; + + // needed for CleanupIteratorState + struct DeletionState { + inline bool HaveSomethingToDelete() const { + return all_files.size() || + sst_delete_files.size() || + log_delete_files.size(); + } + + // a list of all files that we'll consider deleting + // (every once in a while this is filled up with all files + // in the DB directory) + std::vector all_files; + + // the list of all live sst files that cannot be deleted + std::vector sst_live; + + // a list of sst files that we need to delete + std::vector sst_delete_files; + + // a list of log files that we need to delete + std::vector log_delete_files; + + // a list of memtables to be free + std::vector memtables_to_free; + + SuperVersion* superversion_to_free; // if nullptr nothing to free + + SuperVersion* new_superversion; // if nullptr no new superversion + + // the current manifest_file_number, log_number and prev_log_number + // that corresponds to the set of files in 'live'. + uint64_t manifest_file_number, log_number, prev_log_number; + + explicit DeletionState(const int num_memtables = 0, + bool create_superversion = false) { + manifest_file_number = 0; + log_number = 0; + prev_log_number = 0; + memtables_to_free.reserve(num_memtables); + superversion_to_free = nullptr; + new_superversion = + create_superversion ? new SuperVersion(num_memtables) : nullptr; + } + + ~DeletionState() { + // free pending memtables + for (auto m : memtables_to_free) { + delete m; + } + // free superversion. if nullptr, this will be noop + delete superversion_to_free; + // if new_superversion was not used, it will be non-nullptr and needs + // to be freed here + delete new_superversion; + } + }; + + // Returns the list of live files in 'live' and the list + // of all files in the filesystem in 'all_files'. 
+ // If force == false and the last call was less than + // options_.delete_obsolete_files_period_micros microseconds ago, + // it will not fill up the deletion_state + void FindObsoleteFiles(DeletionState& deletion_state, + bool force, + bool no_full_scan = false); + + // Diffs the files listed in filenames and those that do not + // belong to live files are possibly removed. Also, removes all the + // files in sst_delete_files and log_delete_files. + // It is not necessary to hold the mutex when invoking this method. + void PurgeObsoleteFiles(DeletionState& deletion_state); + + protected: + Env* const env_; + const std::string dbname_; + unique_ptr versions_; + const InternalKeyComparator internal_comparator_; + const Options options_; // options_.comparator == &internal_comparator_ + + const Comparator* user_comparator() const { + return internal_comparator_.user_comparator(); + } + + MemTable* GetMemTable() { + return mem_; + } + + Iterator* NewInternalIterator(const ReadOptions&, + SequenceNumber* latest_snapshot); + + private: + friend class DB; + struct CompactionState; + struct Writer; + + Status NewDB(); + + // Recover the descriptor from persistent storage. May do a significant + // amount of work to recover recently logged updates. Any changes to + // be made to the descriptor are added to *edit. + Status Recover(VersionEdit* edit, MemTable* external_table = nullptr, + bool error_if_log_file_exist = false); + + void MaybeIgnoreError(Status* s) const; + + const Status CreateArchivalDirectory(); + + // Delete any unneeded files and stale in-memory entries. + void DeleteObsoleteFiles(); + + // Flush the in-memory write buffer to storage. Switches to a new + // log-file/memtable and writes a new descriptor iff successful.
+ Status FlushMemTableToOutputFile(bool* madeProgress, + DeletionState& deletion_state); + + Status RecoverLogFile(uint64_t log_number, + VersionEdit* edit, + SequenceNumber* max_sequence, + MemTable* external_table); + + // The following two methods are used to flush a memtable to + // storage. The first one is used at database recovery time (when the + // database is opened) and is heavyweight because it holds the mutex + // for the entire period. The second method WriteLevel0Table supports + // concurrent flush memtables to storage. + Status WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit); + Status WriteLevel0Table(std::vector &mems, VersionEdit* edit, + uint64_t* filenumber); + + uint64_t SlowdownAmount(int n, int top, int bottom); + // MakeRoomForWrite will return superversion_to_free through an argument, + // which the caller needs to delete. We do it because caller can delete + // the superversion outside of mutex + Status MakeRoomForWrite(bool force /* compact even if there is room? */, + SuperVersion** superversion_to_free); + void BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group); + + // Force current memtable contents to be flushed. 
+ Status FlushMemTable(const FlushOptions& options); + + // Wait for memtable flushed + Status WaitForFlushMemTable(); + + void MaybeScheduleLogDBDeployStats(); + static void BGLogDBDeployStats(void* db); + void LogDBDeployStats(); + + void MaybeScheduleFlushOrCompaction(); + static void BGWorkCompaction(void* db); + static void BGWorkFlush(void* db); + void BackgroundCallCompaction(); + void BackgroundCallFlush(); + Status BackgroundCompaction(bool* madeProgress,DeletionState& deletion_state); + Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state); + void CleanupCompaction(CompactionState* compact, Status status); + Status DoCompactionWork(CompactionState* compact, + DeletionState& deletion_state); + + Status OpenCompactionOutputFile(CompactionState* compact); + Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); + Status InstallCompactionResults(CompactionState* compact); + void AllocateCompactionOutputFileNumbers(CompactionState* compact); + void ReleaseCompactionUnusedFileNumbers(CompactionState* compact); + + void PurgeObsoleteWALFiles(); + + Status AppendSortedWalsOfType(const std::string& path, + VectorLogPtr& log_files, + WalFileType type); + + // Requires: all_logs should be sorted with earliest log file first + // Retains all log files in all_logs which contain updates with seq no. + // Greater Than or Equal to the requested SequenceNumber. + Status RetainProbableWalFiles(VectorLogPtr& all_logs, + const SequenceNumber target); + // return true if + bool CheckWalFileExistsAndEmpty(const WalFileType type, + const uint64_t number); + + Status ReadFirstRecord(const WalFileType type, const uint64_t number, + WriteBatch* const result); + + Status ReadFirstLine(const std::string& fname, WriteBatch* const batch); + + void PrintStatistics(); + + // dump rocksdb.stats to LOG + void MaybeDumpStats(); + + // Return the minimum empty level that could hold the total data in the + // input level. 
Return the input level, if such level could not be found. + int FindMinimumEmptyLevelFitting(int level); + + // Move the files in the input level to the target level. + // If target_level < 0, automatically calculate the minimum level that could + // hold the data set. + void ReFitLevel(int level, int target_level = -1); + + // Constant after construction + const InternalFilterPolicy internal_filter_policy_; + bool owns_info_log_; + + // table_cache_ provides its own synchronization + unique_ptr table_cache_; + + // Lock over the persistent DB state. Non-nullptr iff successfully acquired. + FileLock* db_lock_; + + // State below is protected by mutex_ + port::Mutex mutex_; + port::AtomicPointer shutting_down_; + port::CondVar bg_cv_; // Signalled when background work finishes + MemTableRepFactory* mem_rep_factory_; + MemTable* mem_; + MemTableList imm_; // Memtable that are not changing + uint64_t logfile_number_; + unique_ptr log_; + + SuperVersion* super_version_; + + std::string host_name_; + + // Queue of writers. + std::deque writers_; + WriteBatch tmp_batch_; + + SnapshotList snapshots_; + + // Set of table files to protect from deletion because they are + // part of ongoing compactions. + std::set pending_outputs_; + + // count how many background compactions are running or have been scheduled + int bg_compaction_scheduled_; + + // If non-zero, MaybeScheduleFlushOrCompaction() will only schedule manual + // compactions (if manual_compaction_ is not null). This mechanism enables + // manual compactions to wait until all other compactions are finished. + int bg_manual_only_; + + // number of background memtable flush jobs, submitted to the HIGH pool + int bg_flush_scheduled_; + + // Has a background stats log thread scheduled? + bool bg_logstats_scheduled_; + + // Information for a manual compaction + struct ManualCompaction { + int input_level; + int output_level; + bool done; + bool in_progress; // compaction request being processed? 
+ const InternalKey* begin; // nullptr means beginning of key range + const InternalKey* end; // nullptr means end of key range + InternalKey tmp_storage; // Used to keep track of compaction progress + }; + ManualCompaction* manual_compaction_; + + // Have we encountered a background error in paranoid mode? + Status bg_error_; + + std::unique_ptr logger_; + + int64_t volatile last_log_ts; + + // shall we disable deletion of obsolete files + // if 0 the deletion is enabled. + // if non-zero, files will not be getting deleted + // This enables two different threads to call + // EnableFileDeletions() and DisableFileDeletions() + // without any synchronization + int disable_delete_obsolete_files_; + + // last time when DeleteObsoleteFiles was invoked + uint64_t delete_obsolete_files_last_run_; + + // last time when PurgeObsoleteWALFiles ran. + uint64_t purge_wal_files_last_run_; + + // last time stats were dumped to LOG + std::atomic last_stats_dump_time_microsec_; + + // obsolete files will be deleted every this seconds if ttl deletion is + // enabled and archive size_limit is disabled. + uint64_t default_interval_to_delete_obsolete_WAL_; + + // These count the number of microseconds for which MakeRoomForWrite stalls. + uint64_t stall_level0_slowdown_; + uint64_t stall_memtable_compaction_; + uint64_t stall_level0_num_files_; + std::vector stall_leveln_slowdown_; + uint64_t stall_level0_slowdown_count_; + uint64_t stall_memtable_compaction_count_; + uint64_t stall_level0_num_files_count_; + std::vector stall_leveln_slowdown_count_; + + // Time at which this instance was started. + const uint64_t started_at_; + + bool flush_on_destroy_; // Used when disableWAL is true. + + // Per level compaction stats. stats_[level] stores the stats for + // compactions that produced data for the specified "level". 
+ struct CompactionStats { + uint64_t micros; + + // Bytes read from level N during compaction between levels N and N+1 + int64_t bytes_readn; + + // Bytes read from level N+1 during compaction between levels N and N+1 + int64_t bytes_readnp1; + + // Total bytes written during compaction between levels N and N+1 + int64_t bytes_written; + + // Files read from level N during compaction between levels N and N+1 + int files_in_leveln; + + // Files read from level N+1 during compaction between levels N and N+1 + int files_in_levelnp1; + + // Files written during compaction between levels N and N+1 + int files_out_levelnp1; + + // Number of compactions done + int count; + + CompactionStats() : micros(0), bytes_readn(0), bytes_readnp1(0), + bytes_written(0), files_in_leveln(0), + files_in_levelnp1(0), files_out_levelnp1(0), + count(0) { } + + void Add(const CompactionStats& c) { + this->micros += c.micros; + this->bytes_readn += c.bytes_readn; + this->bytes_readnp1 += c.bytes_readnp1; + this->bytes_written += c.bytes_written; + this->files_in_leveln += c.files_in_leveln; + this->files_in_levelnp1 += c.files_in_levelnp1; + this->files_out_levelnp1 += c.files_out_levelnp1; + this->count += 1; + } + }; + + std::vector stats_; + + // Used to compute per-interval statistics + struct StatsSnapshot { + uint64_t compaction_bytes_read_; // Bytes read by compaction + uint64_t compaction_bytes_written_; // Bytes written by compaction + uint64_t ingest_bytes_; // Bytes written by user + uint64_t wal_bytes_; // Bytes written to WAL + uint64_t wal_synced_; // Number of times WAL is synced + uint64_t write_with_wal_; // Number of writes that request WAL + // These count the number of writes processed by the calling thread or + // another thread. 
+ uint64_t write_other_; + uint64_t write_self_; + double seconds_up_; + + StatsSnapshot() : compaction_bytes_read_(0), compaction_bytes_written_(0), + ingest_bytes_(0), wal_bytes_(0), wal_synced_(0), + write_with_wal_(0), write_other_(0), write_self_(0), + seconds_up_(0) {} + }; + + // Counters from the previous time per-interval stats were computed + StatsSnapshot last_stats_; + + static const int KEEP_LOG_FILE_NUM = 1000; + std::string db_absolute_path_; + + // count of the number of contiguous delaying writes + int delayed_writes_; + + // The options to access storage files + const EnvOptions storage_options_; + + // A value of true temporarily disables scheduling of background work + bool bg_work_gate_closed_; + + // Guard against multiple concurrent refitting + bool refitting_level_; + + // No copying allowed + DBImpl(const DBImpl&); + void operator=(const DBImpl&); + + // dump the delayed_writes_ to the log file and reset counter. + void DelayLoggingAndReset(); + + // Return the earliest snapshot where seqno is visible. + // Store the snapshot right before that, if any, in prev_snapshot + inline SequenceNumber findEarliestVisibleSnapshot( + SequenceNumber in, + std::vector& snapshots, + SequenceNumber* prev_snapshot); + + // will return a pointer to SuperVersion* if previous SuperVersion + // if its reference count is zero and needs deletion or nullptr if not + // As argument takes a pointer to allocated SuperVersion + // Foreground threads call this function directly (they don't carry + // deletion state and have to handle their own creation and deletion + // of SuperVersion) + SuperVersion* InstallSuperVersion(SuperVersion* new_superversion); + // Background threads call this function, which is just a wrapper around + // the InstallSuperVersion() function above. Background threads carry + // deletion_state which can have new_superversion already allocated. 
+ void InstallSuperVersion(DeletionState& deletion_state); + + // Function that Get and KeyMayExist call with no_io true or false + // Note: 'value_found' from KeyMayExist propagates here + Status GetImpl(const ReadOptions& options, + const Slice& key, + std::string* value, + bool* value_found = nullptr); +}; + +// Sanitize db options. The caller should delete result.info_log if +// it is not equal to src.info_log. +extern Options SanitizeOptions(const std::string& db, + const InternalKeyComparator* icmp, + const InternalFilterPolicy* ipolicy, + const Options& src); + + +// Determine compression type, based on user options, level of the output +// file and whether compression is disabled. +// If enable_compression is false, then compression is always disabled no +// matter what the values of the other two parameters are. +// Otherwise, the compression type is determined based on options and level. +CompressionType GetCompressionType(const Options& options, int level, + const bool enable_compression); + +// Determine compression type for L0 file written by memtable flush. +CompressionType GetCompressionFlush(const Options& options); + +} // namespace rocksdb diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc new file mode 100644 index 00000000..04033b2f --- /dev/null +++ b/db/db_impl_readonly.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 Facebook. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "db/db_impl_readonly.h" +#include "db/db_impl.h" + +#include +#include +#include +#include +#include +#include +#include +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/merge_context.h" +#include "db/table_cache.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/merge_operator.h" +#include "port/port.h" +#include "table/block.h" +#include "table/merger.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/logging.h" +#include "util/build_version.h" + +namespace rocksdb { + +DBImplReadOnly::DBImplReadOnly(const Options& options, + const std::string& dbname) + : DBImpl(options, dbname) { + Log(options_.info_log, "Opening the db in read only mode"); +} + +DBImplReadOnly::~DBImplReadOnly() { +} + +// Implementations of the DB interface +Status DBImplReadOnly::Get(const ReadOptions& options, + const Slice& key, + std::string* value) { + Status s; + MemTable* mem = GetMemTable(); + Version* current = versions_->current(); + SequenceNumber snapshot = versions_->LastSequence(); + MergeContext merge_context; + LookupKey lkey(key, snapshot); + if (mem->Get(lkey, value, &s, merge_context, options_)) { + } else { + Version::GetStats stats; + current->Get(options, lkey, value, &s, &merge_context, &stats, options_); + } + return s; +} + +Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options) { + SequenceNumber latest_snapshot; + Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); + return NewDBIterator( + &dbname_, env_, options_, user_comparator(),internal_iter, + (options.snapshot != nullptr + ? 
reinterpret_cast(options.snapshot)->number_ + : latest_snapshot)); +} + + +Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, + DB** dbptr, bool error_if_log_file_exist) { + *dbptr = nullptr; + + DBImplReadOnly* impl = new DBImplReadOnly(options, dbname); + impl->mutex_.Lock(); + VersionEdit edit; + Status s = impl->Recover(&edit, impl->GetMemTable(), + error_if_log_file_exist); + impl->mutex_.Unlock(); + if (s.ok()) { + *dbptr = impl; + } else { + delete impl; + } + return s; +} + +} diff --git a/db/db_impl_readonly.h b/db/db_impl_readonly.h new file mode 100644 index 00000000..4beaedd0 --- /dev/null +++ b/db/db_impl_readonly.h @@ -0,0 +1,78 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 Facebook. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#pragma once +#include "db/db_impl.h" + +#include +#include +#include "db/dbformat.h" +#include "db/log_writer.h" +#include "db/snapshot.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "port/port.h" +#include "util/stats_logger.h" + +namespace rocksdb { + +class DBImplReadOnly : public DBImpl { +public: + DBImplReadOnly(const Options& options, const std::string& dbname); + virtual ~DBImplReadOnly(); + + // Implementations of the DB interface + virtual Status Get(const ReadOptions& options, + const Slice& key, + std::string* value); + + // TODO: Implement ReadOnly MultiGet? 
+ + virtual Iterator* NewIterator(const ReadOptions&); + + virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status Merge(const WriteOptions&, const Slice& key, + const Slice& value) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status Delete(const WriteOptions&, const Slice& key) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status Write(const WriteOptions& options, WriteBatch* updates) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual void CompactRange(const Slice* begin, const Slice* end, + bool reduce_level = false, int target_level = -1) { + } + virtual Status DisableFileDeletions() { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status EnableFileDeletions(bool force) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status GetLiveFiles(std::vector&, + uint64_t* manifest_file_size, + bool flush_memtable = true) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status Flush(const FlushOptions& options) { + return Status::NotSupported("Not supported operation in read only mode."); + } + +private: + friend class DB; + + // No copying allowed + DBImplReadOnly(const DBImplReadOnly&); + void operator=(const DBImplReadOnly&); +}; + +} diff --git a/db/db_iter.cc b/db/db_iter.cc new file mode 100644 index 00000000..596a9f65 --- /dev/null +++ b/db/db_iter.cc @@ -0,0 +1,481 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_iter.h" +#include +#include + +#include "db/filename.h" +#include "db/dbformat.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/iterator.h" +#include "rocksdb/merge_operator.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/perf_context_imp.h" + +namespace rocksdb { + +#if 0 +static void DumpInternalIter(Iterator* iter) { + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey k; + if (!ParseInternalKey(iter->key(), &k)) { + fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str()); + } else { + fprintf(stderr, "@ '%s'\n", k.DebugString().c_str()); + } + } +} +#endif + +namespace { + +// Memtables and sstables that make the DB representation contain +// (userkey,seq,type) => uservalue entries. DBIter +// combines multiple entries for the same userkey found in the DB +// representation into a single entry while accounting for sequence +// numbers, deletion markers, overwrites, etc. +class DBIter: public Iterator { + public: + // The following is grossly complicated. TODO: clean it up + // Which direction is the iterator currently moving? + // (1) When moving forward, the internal iterator is positioned at + // the exact entry that yields this->key(), this->value() + // (2) When moving backwards, the internal iterator is positioned + // just before all entries whose user key == this->key(). 
+ enum Direction { + kForward, + kReverse + }; + + DBIter(const std::string* dbname, Env* env, const Options& options, + const Comparator* cmp, Iterator* iter, SequenceNumber s) + : dbname_(dbname), + env_(env), + logger_(options.info_log.get()), + user_comparator_(cmp), + user_merge_operator_(options.merge_operator.get()), + iter_(iter), + sequence_(s), + direction_(kForward), + valid_(false), + current_entry_is_merged_(false), + statistics_(options.statistics.get()) { + RecordTick(statistics_, NO_ITERATORS, 1); + max_skip_ = options.max_sequential_skip_in_iterations; + } + virtual ~DBIter() { + RecordTick(statistics_, NO_ITERATORS, -1); + delete iter_; + } + virtual bool Valid() const { return valid_; } + virtual Slice key() const { + assert(valid_); + return saved_key_; + } + virtual Slice value() const { + assert(valid_); + return (direction_ == kForward && !current_entry_is_merged_) ? + iter_->value() : saved_value_; + } + virtual Status status() const { + if (status_.ok()) { + return iter_->status(); + } else { + return status_; + } + } + + virtual void Next(); + virtual void Prev(); + virtual void Seek(const Slice& target); + virtual void SeekToFirst(); + virtual void SeekToLast(); + + private: + void FindNextUserEntry(bool skipping); + void FindPrevUserEntry(); + bool ParseKey(ParsedInternalKey* key); + void MergeValuesNewToOld(); + + inline void SaveKey(const Slice& k, std::string* dst) { + dst->assign(k.data(), k.size()); + } + + inline void ClearSavedValue() { + if (saved_value_.capacity() > 1048576) { + std::string empty; + swap(empty, saved_value_); + } else { + saved_value_.clear(); + } + } + + const std::string* const dbname_; + Env* const env_; + Logger* logger_; + const Comparator* const user_comparator_; + const MergeOperator* const user_merge_operator_; + Iterator* const iter_; + SequenceNumber const sequence_; + + Status status_; + std::string saved_key_; // == current key when direction_==kReverse + std::string saved_value_; // == current raw 
value when direction_==kReverse + std::string skip_key_; + Direction direction_; + bool valid_; + bool current_entry_is_merged_; + Statistics* statistics_; + uint64_t max_skip_; + + // No copying allowed + DBIter(const DBIter&); + void operator=(const DBIter&); +}; + +inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { + if (!ParseInternalKey(iter_->key(), ikey)) { + status_ = Status::Corruption("corrupted internal key in DBIter"); + Log(logger_, "corrupted internal key in DBIter: %s", + iter_->key().ToString(true).c_str()); + return false; + } else { + return true; + } +} + +void DBIter::Next() { + assert(valid_); + + if (direction_ == kReverse) { // Switch directions? + direction_ = kForward; + // iter_ is pointing just before the entries for this->key(), + // so advance into the range of entries for this->key() and then + // use the normal skipping code below. + if (!iter_->Valid()) { + iter_->SeekToFirst(); + } else { + iter_->Next(); + } + if (!iter_->Valid()) { + valid_ = false; + saved_key_.clear(); + return; + } + } + + // If the current value is merged, we might already hit end of iter_ + if (!iter_->Valid()) { + valid_ = false; + return; + } + FindNextUserEntry(true /* skipping the current user key */); +} + + +// PRE: saved_key_ has the current user key if skipping +// POST: saved_key_ should have the next user key if valid_, +// if the current entry is a result of merge +// current_entry_is_merged_ => true +// saved_value_ => the merged value +// +// NOTE: In between, saved_key_ can point to a user key that has +// a delete marker +void DBIter::FindNextUserEntry(bool skipping) { + // Loop until we hit an acceptable entry to yield + assert(iter_->Valid()); + assert(direction_ == kForward); + current_entry_is_merged_ = false; + uint64_t num_skipped = 0; + do { + ParsedInternalKey ikey; + if (ParseKey(&ikey) && ikey.sequence <= sequence_) { + if (skipping && + user_comparator_->Compare(ikey.user_key, saved_key_) <= 0) { + num_skipped++; // skip this 
entry + BumpPerfCount(&perf_context.internal_key_skipped_count); + } else { + skipping = false; + switch (ikey.type) { + case kTypeDeletion: + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. + SaveKey(ikey.user_key, &saved_key_); + skipping = true; + num_skipped = 0; + BumpPerfCount(&perf_context.internal_delete_skipped_count); + break; + case kTypeValue: + valid_ = true; + SaveKey(ikey.user_key, &saved_key_); + return; + case kTypeMerge: + // By now, we are sure the current ikey is going to yield a value + SaveKey(ikey.user_key, &saved_key_); + current_entry_is_merged_ = true; + valid_ = true; + MergeValuesNewToOld(); // Go to a different state machine + return; + case kTypeLogData: + assert(false); + break; + } + } + } + // If we have sequentially iterated via numerous keys and still not + // found the next user-key, then it is better to seek so that we can + // avoid too many key comparisons. We seek to the last occurrence of + // our current key by looking for sequence number 0. + if (skipping && num_skipped > max_skip_) { + num_skipped = 0; + std::string last_key; + AppendInternalKey(&last_key, + ParsedInternalKey(Slice(saved_key_), 0, kValueTypeForSeek)); + iter_->Seek(last_key); + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); + } else { + iter_->Next(); + } + } while (iter_->Valid()); + valid_ = false; +} + +// Merge values of the same user key starting from the current iter_ position +// Scan from the newer entries to older entries. 
+// PRE: iter_->key() points to the first merge type entry +// saved_key_ stores the user key +// POST: saved_value_ has the merged value for the user key +// iter_ points to the next entry (or invalid) +void DBIter::MergeValuesNewToOld() { + if (!user_merge_operator_) { + Log(logger_, "Options::merge_operator is null."); + throw std::logic_error("DBIter::MergeValuesNewToOld() with" + " Options::merge_operator null"); + } + + // Start the merge process by pushing the first operand + std::deque operands; + operands.push_front(iter_->value().ToString()); + + std::string merge_result; // Temporary string to hold merge result later + ParsedInternalKey ikey; + for (iter_->Next(); iter_->Valid(); iter_->Next()) { + if (!ParseKey(&ikey)) { + // skip corrupted key + continue; + } + + if (user_comparator_->Compare(ikey.user_key, saved_key_) != 0) { + // hit the next user key, stop right here + break; + } + + if (kTypeDeletion == ikey.type) { + // hit a delete with the same user key, stop right here + // iter_ is positioned after delete + iter_->Next(); + break; + } + + if (kTypeValue == ikey.type) { + // hit a put, merge the put value with operands and store the + // final result in saved_value_. We are done! + // ignore corruption if there is any. + const Slice value = iter_->value(); + user_merge_operator_->FullMerge(ikey.user_key, &value, operands, + &saved_value_, logger_); + // iter_ is positioned after put + iter_->Next(); + return; + } + + if (kTypeMerge == ikey.type) { + // hit a merge, add the value as an operand and run associative merge. + // when complete, add result to operands and continue. 
+ const Slice& value = iter_->value(); + operands.push_front(value.ToString()); + while(operands.size() >= 2) { + // Call user associative-merge until it returns false + if (user_merge_operator_->PartialMerge(ikey.user_key, + Slice(operands[0]), + Slice(operands[1]), + &merge_result, + logger_)) { + operands.pop_front(); + swap(operands.front(), merge_result); + } else { + // Associative merge returns false ==> stack the operands + break; + } + } + + } + } + + // we either exhausted all internal keys under this user key, or hit + // a deletion marker. + // feed null as the existing value to the merge operator, such that + // client can differentiate this scenario and do things accordingly. + user_merge_operator_->FullMerge(saved_key_, nullptr, operands, + &saved_value_, logger_); +} + +void DBIter::Prev() { + assert(valid_); + + // Throw an exception now if merge_operator is provided + // TODO: support backward iteration + if (user_merge_operator_) { + Log(logger_, "Prev not supported yet if merge_operator is provided"); + throw std::logic_error("DBIter::Prev backward iteration not supported" + " if merge_operator is provided"); + } + + if (direction_ == kForward) { // Switch directions? + // iter_ is pointing at the current entry. Scan backwards until + // the key changes so we can use the normal reverse scanning code. 
+ assert(iter_->Valid()); // Otherwise valid_ would have been false + SaveKey(ExtractUserKey(iter_->key()), &saved_key_); + while (true) { + iter_->Prev(); + if (!iter_->Valid()) { + valid_ = false; + saved_key_.clear(); + ClearSavedValue(); + return; + } + if (user_comparator_->Compare(ExtractUserKey(iter_->key()), + saved_key_) < 0) { + break; + } + } + direction_ = kReverse; + } + + FindPrevUserEntry(); +} + +void DBIter::FindPrevUserEntry() { + assert(direction_ == kReverse); + uint64_t num_skipped = 0; + + ValueType value_type = kTypeDeletion; + if (iter_->Valid()) { + do { + ParsedInternalKey ikey; + bool saved_key_cleared = false; + if (ParseKey(&ikey) && ikey.sequence <= sequence_) { + if ((value_type != kTypeDeletion) && + user_comparator_->Compare(ikey.user_key, saved_key_) < 0) { + // We encountered a non-deleted value in entries for previous keys, + break; + } + value_type = ikey.type; + if (value_type == kTypeDeletion) { + saved_key_.clear(); + ClearSavedValue(); + saved_key_cleared = true; + } else { + Slice raw_value = iter_->value(); + if (saved_value_.capacity() > raw_value.size() + 1048576) { + std::string empty; + swap(empty, saved_value_); + } + SaveKey(ExtractUserKey(iter_->key()), &saved_key_); + saved_value_.assign(raw_value.data(), raw_value.size()); + } + } + num_skipped++; + // If we have sequentially iterated via numerous keys and still not + // found the prev user-key, then it is better to seek so that we can + // avoid too many key comparisons. We seek to the first occurrence of + // our current key by looking for max sequence number. 
+ if (!saved_key_cleared && num_skipped > max_skip_) { + num_skipped = 0; + std::string last_key; + AppendInternalKey(&last_key, + ParsedInternalKey(Slice(saved_key_), kMaxSequenceNumber, + kValueTypeForSeek)); + iter_->Seek(last_key); + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); + } else { + iter_->Prev(); + } + } while (iter_->Valid()); + } + + if (value_type == kTypeDeletion) { + // End + valid_ = false; + saved_key_.clear(); + ClearSavedValue(); + direction_ = kForward; + } else { + valid_ = true; + } +} + +void DBIter::Seek(const Slice& target) { + direction_ = kForward; + ClearSavedValue(); + saved_key_.clear(); + AppendInternalKey( + &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); + iter_->Seek(saved_key_); + if (iter_->Valid()) { + FindNextUserEntry(false /*not skipping */); + } else { + valid_ = false; + } +} + +void DBIter::SeekToFirst() { + direction_ = kForward; + ClearSavedValue(); + iter_->SeekToFirst(); + if (iter_->Valid()) { + FindNextUserEntry(false /* not skipping */); + } else { + valid_ = false; + } +} + +void DBIter::SeekToLast() { + // Throw an exception for now if merge_operator is provided + // TODO: support backward iteration + if (user_merge_operator_) { + Log(logger_, "SeekToLast not supported yet if merge_operator is provided"); + throw std::logic_error("DBIter::SeekToLast: backward iteration not" + " supported if merge_operator is provided"); + } + + direction_ = kReverse; + ClearSavedValue(); + iter_->SeekToLast(); + FindPrevUserEntry(); +} + +} // anonymous namespace + +Iterator* NewDBIterator( + const std::string* dbname, + Env* env, + const Options& options, + const Comparator *user_key_comparator, + Iterator* internal_iter, + const SequenceNumber& sequence) { + return new DBIter(dbname, env, options, user_key_comparator, + internal_iter, sequence); +} + +} // namespace rocksdb diff --git a/db/db_iter.h b/db/db_iter.h new file mode 100644 index 00000000..b44e6745 --- /dev/null +++ b/db/db_iter.h 
@@ -0,0 +1,28 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include "rocksdb/db.h" +#include "db/dbformat.h" + +namespace rocksdb { + +// Return a new iterator that converts internal keys (yielded by +// "*internal_iter") that were live at the specified "sequence" number +// into appropriate user keys. +extern Iterator* NewDBIterator( + const std::string* dbname, + Env* env, + const Options& options, + const Comparator *user_key_comparator, + Iterator* internal_iter, + const SequenceNumber& sequence); + +} // namespace rocksdb diff --git a/db/db_statistics.cc b/db/db_statistics.cc new file mode 100644 index 00000000..f0cfd674 --- /dev/null +++ b/db/db_statistics.cc @@ -0,0 +1,14 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "db/db_statistics.h" + +namespace rocksdb { + +std::shared_ptr CreateDBStatistics() { + return std::make_shared(); +} + +} // namespace rocksdb diff --git a/db/db_statistics.h b/db/db_statistics.h new file mode 100644 index 00000000..ec71e168 --- /dev/null +++ b/db/db_statistics.h @@ -0,0 +1,63 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include +#include + +#include "rocksdb/statistics.h" +#include "util/histogram.h" +#include "port/port.h" +#include "util/mutexlock.h" + + +namespace rocksdb { + +class DBStatistics: public Statistics { + public: + DBStatistics() : allTickers_(TICKER_ENUM_MAX), + allHistograms_(HISTOGRAM_ENUM_MAX) { } + + virtual ~DBStatistics() {} + + virtual long getTickerCount(Tickers tickerType) { + assert(tickerType < TICKER_ENUM_MAX); + return allTickers_[tickerType].getCount(); + } + + virtual void setTickerCount(Tickers tickerType, uint64_t count) { + assert(tickerType < TICKER_ENUM_MAX); + allTickers_[tickerType].setTickerCount(count); + } + + virtual void recordTick(Tickers tickerType, uint64_t count) { + assert(tickerType < TICKER_ENUM_MAX); + allTickers_[tickerType].recordTick(count); + } + + virtual void measureTime(Histograms histogramType, uint64_t value) { + assert(histogramType < HISTOGRAM_ENUM_MAX); + allHistograms_[histogramType].Add(value); + } + + virtual void histogramData(Histograms histogramType, + HistogramData * const data) { + assert(histogramType < HISTOGRAM_ENUM_MAX); + allHistograms_[histogramType].Data(data); + } + + std::vector allTickers_; + std::vector allHistograms_; +}; + +std::shared_ptr CreateDBStatistics(); + +} // namespace rocksdb diff --git a/db/db_stats_logger.cc b/db/db_stats_logger.cc new file mode 100644 index 00000000..db86865c --- /dev/null +++ b/db/db_stats_logger.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_impl.h" +#include +#include +#include +#include "db/version_set.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "port/port.h" +#include "util/mutexlock.h" + +namespace rocksdb { + +void DBImpl::MaybeScheduleLogDBDeployStats() { + + // There is a lock in the actual logger. + if (!logger_ || options_.db_stats_log_interval < 0 + || host_name_.empty()) { + return; + } + + if(bg_logstats_scheduled_ || shutting_down_.Acquire_Load()) { + // Already scheduled + } else { + int64_t current_ts = 0; + Status st = env_->GetCurrentTime(¤t_ts); + if (!st.ok()) { + return; + } + if ((current_ts - last_log_ts) < options_.db_stats_log_interval) { + return; + } + last_log_ts = current_ts; + bg_logstats_scheduled_ = true; + env_->Schedule(&DBImpl::BGLogDBDeployStats, this); + } +} + +void DBImpl::BGLogDBDeployStats(void* db) { + DBImpl* db_inst = reinterpret_cast(db); + db_inst->LogDBDeployStats(); +} + +void DBImpl::LogDBDeployStats() { + mutex_.Lock(); + + if (shutting_down_.Acquire_Load()) { + bg_logstats_scheduled_ = false; + bg_cv_.SignalAll(); + mutex_.Unlock(); + return; + } + + char tmp_ver[100]; + sprintf(tmp_ver, "%d.%d", kMajorVersion, kMinorVersion); + std::string version_info(tmp_ver); + + uint64_t file_total_size = 0; + uint32_t file_total_num = 0; + Version* current = versions_->current(); + for (int i = 0; i < current->NumberLevels(); i++) { + file_total_num += current->NumLevelFiles(i); + file_total_size += current->NumLevelBytes(i); + } + + Version::LevelSummaryStorage scratch; + const char* file_num_summary = current->LevelSummary(&scratch); + std::string file_num_per_level(file_num_summary); + std::string 
data_size_per_level(file_num_summary); + + mutex_.Unlock(); + + int64_t unix_ts; + env_->GetCurrentTime(&unix_ts); + + logger_->Log_Deploy_Stats(version_info, host_name_, + db_absolute_path_, file_total_size, file_total_num, file_num_per_level, + data_size_per_level, unix_ts); + + mutex_.Lock(); + bg_logstats_scheduled_ = false; + bg_cv_.SignalAll(); + mutex_.Unlock(); +} + +} diff --git a/db/db_test.cc b/db/db_test.cc new file mode 100644 index 00000000..9c8a97f9 --- /dev/null +++ b/db/db_test.cc @@ -0,0 +1,4991 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/filter_policy.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "db/db_statistics.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "utilities/merge_operators.h" + +namespace rocksdb { + +static bool SnappyCompressionSupported(const CompressionOptions& options) { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Snappy_Compress(options, in.data(), in.size(), &out); +} + +static bool ZlibCompressionSupported(const CompressionOptions& options) { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Zlib_Compress(options, in.data(), in.size(), &out); +} + +static bool BZip2CompressionSupported(const CompressionOptions& options) { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::BZip2_Compress(options, in.data(), in.size(), &out); +} + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +namespace anon { +class AtomicCounter { + private: + port::Mutex mu_; + int count_; + public: + AtomicCounter() : count_(0) { } + void Increment() { + MutexLock l(&mu_); + count_++; + } + int Read() { + MutexLock l(&mu_); + return count_; + } + void Reset() { + MutexLock l(&mu_); + count_ = 0; + } +}; + +} + +// Special Env used to delay background operations +class SpecialEnv : public EnvWrapper { + public: + // sstable Sync() calls are blocked while this pointer is non-nullptr. + port::AtomicPointer delay_sstable_sync_; + + // Simulate no-space errors while this pointer is non-nullptr. 
+ port::AtomicPointer no_space_; + + // Simulate non-writable file system while this pointer is non-nullptr + port::AtomicPointer non_writable_; + + // Force sync of manifest files to fail while this pointer is non-nullptr + port::AtomicPointer manifest_sync_error_; + + // Force write to manifest files to fail while this pointer is non-nullptr + port::AtomicPointer manifest_write_error_; + + // Force write to log files to fail while this pointer is non-nullptr + port::AtomicPointer log_write_error_; + + bool count_random_reads_; + anon::AtomicCounter random_read_counter_; + + anon::AtomicCounter sleep_counter_; + + explicit SpecialEnv(Env* base) : EnvWrapper(base) { + delay_sstable_sync_.Release_Store(nullptr); + no_space_.Release_Store(nullptr); + non_writable_.Release_Store(nullptr); + count_random_reads_ = false; + manifest_sync_error_.Release_Store(nullptr); + manifest_write_error_.Release_Store(nullptr); + log_write_error_.Release_Store(nullptr); + } + + Status NewWritableFile(const std::string& f, unique_ptr* r, + const EnvOptions& soptions) { + class SSTableFile : public WritableFile { + private: + SpecialEnv* env_; + unique_ptr base_; + + public: + SSTableFile(SpecialEnv* env, unique_ptr&& base) + : env_(env), + base_(std::move(base)) { + } + Status Append(const Slice& data) { + if (env_->no_space_.Acquire_Load() != nullptr) { + // Drop writes on the floor + return Status::OK(); + } else { + return base_->Append(data); + } + } + Status Close() { return base_->Close(); } + Status Flush() { return base_->Flush(); } + Status Sync() { + while (env_->delay_sstable_sync_.Acquire_Load() != nullptr) { + env_->SleepForMicroseconds(100000); + } + return base_->Sync(); + } + }; + class ManifestFile : public WritableFile { + private: + SpecialEnv* env_; + unique_ptr base_; + public: + ManifestFile(SpecialEnv* env, unique_ptr&& b) + : env_(env), base_(std::move(b)) { } + Status Append(const Slice& data) { + if (env_->manifest_write_error_.Acquire_Load() != nullptr) { + 
return Status::IOError("simulated writer error"); + } else { + return base_->Append(data); + } + } + Status Close() { return base_->Close(); } + Status Flush() { return base_->Flush(); } + Status Sync() { + if (env_->manifest_sync_error_.Acquire_Load() != nullptr) { + return Status::IOError("simulated sync error"); + } else { + return base_->Sync(); + } + } + }; + class LogFile : public WritableFile { + private: + SpecialEnv* env_; + unique_ptr base_; + public: + LogFile(SpecialEnv* env, unique_ptr&& b) + : env_(env), base_(std::move(b)) { } + Status Append(const Slice& data) { + if (env_->log_write_error_.Acquire_Load() != nullptr) { + return Status::IOError("simulated writer error"); + } else { + return base_->Append(data); + } + } + Status Close() { return base_->Close(); } + Status Flush() { return base_->Flush(); } + Status Sync() { return base_->Sync(); } + }; + + if (non_writable_.Acquire_Load() != nullptr) { + return Status::IOError("simulated write error"); + } + + Status s = target()->NewWritableFile(f, r, soptions); + if (s.ok()) { + if (strstr(f.c_str(), ".sst") != nullptr) { + r->reset(new SSTableFile(this, std::move(*r))); + } else if (strstr(f.c_str(), "MANIFEST") != nullptr) { + r->reset(new ManifestFile(this, std::move(*r))); + } else if (strstr(f.c_str(), "log") != nullptr) { + r->reset(new LogFile(this, std::move(*r))); + } + } + return s; + } + + Status NewRandomAccessFile(const std::string& f, + unique_ptr* r, + const EnvOptions& soptions) { + class CountingFile : public RandomAccessFile { + private: + unique_ptr target_; + anon::AtomicCounter* counter_; + public: + CountingFile(unique_ptr&& target, + anon::AtomicCounter* counter) + : target_(std::move(target)), counter_(counter) { + } + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + counter_->Increment(); + return target_->Read(offset, n, result, scratch); + } + }; + + Status s = target()->NewRandomAccessFile(f, r, soptions); + if (s.ok() && 
count_random_reads_) { + r->reset(new CountingFile(std::move(*r), &random_read_counter_)); + } + return s; + } + + virtual void SleepForMicroseconds(int micros) { + sleep_counter_.Increment(); + target()->SleepForMicroseconds(micros); + } +}; + +class DBTest { + private: + const FilterPolicy* filter_policy_; + + protected: + // Sequence of option configurations to try + enum OptionConfig { + kDefault, + kVectorRep, + kMergePut, + kFilter, + kUncompressed, + kNumLevel_3, + kDBLogDir, + kWalDir, + kManifestFileSize, + kCompactOnFlush, + kPerfOptions, + kDeletesFilterFirst, + kHashSkipList, + kUniversalCompaction, + kCompressedBlockCache, + kEnd + }; + int option_config_; + + public: + std::string dbname_; + SpecialEnv* env_; + DB* db_; + + Options last_options_; + + // Skip some options, as they may not be applicable to a specific test. + // To add more skip constants, use values 4, 8, 16, etc. + enum OptionSkip { + kNoSkip = 0, + kSkipDeletesFilterFirst = 1, + kSkipUniversalCompaction = 2, + kSkipMergePut = 4 + }; + + DBTest() : option_config_(kDefault), + env_(new SpecialEnv(Env::Default())) { + filter_policy_ = NewBloomFilterPolicy(10); + dbname_ = test::TmpDir() + "/db_test"; + ASSERT_OK(DestroyDB(dbname_, Options())); + db_ = nullptr; + Reopen(); + } + + ~DBTest() { + delete db_; + ASSERT_OK(DestroyDB(dbname_, Options())); + delete env_; + delete filter_policy_; + } + + // Switch to a fresh database with the next option configuration to + // test. Return false if there are no more configurations to test. 
+ bool ChangeOptions(int skip_mask = kNoSkip) { + option_config_++; + + // skip some options + if (skip_mask & kSkipDeletesFilterFirst && + option_config_ == kDeletesFilterFirst) { + option_config_++; + } + if (skip_mask & kSkipUniversalCompaction && + option_config_ == kUniversalCompaction) { + option_config_++; + } + if (skip_mask & kSkipMergePut && option_config_ == kMergePut) { + option_config_++; + } + if (option_config_ >= kEnd) { + Destroy(&last_options_); + return false; + } else { + DestroyAndReopen(); + return true; + } + } + + // Switch between different compaction styles (we have only 2 now). + bool ChangeCompactOptions(Options* prev_options = nullptr) { + if (option_config_ == kDefault) { + option_config_ = kUniversalCompaction; + if (prev_options == nullptr) { + prev_options = &last_options_; + } + Destroy(prev_options); + TryReopen(); + return true; + } else { + return false; + } + } + + // Return the current option configuration. + Options CurrentOptions() { + Options options; + switch (option_config_) { + case kHashSkipList: + options.memtable_factory.reset( + NewHashSkipListRepFactory(NewFixedPrefixTransform(1))); + break; + case kMergePut: + options.merge_operator = MergeOperators::CreatePutOperator(); + break; + case kFilter: + options.filter_policy = filter_policy_; + break; + case kUncompressed: + options.compression = kNoCompression; + break; + case kNumLevel_3: + options.num_levels = 3; + break; + case kDBLogDir: + options.db_log_dir = test::TmpDir(); + break; + case kWalDir: + options.wal_dir = "/tmp/wal"; + break; + case kManifestFileSize: + options.max_manifest_file_size = 50; // 50 bytes + case kCompactOnFlush: + options.purge_redundant_kvs_while_flush = + !options.purge_redundant_kvs_while_flush; + break; + case kPerfOptions: + options.hard_rate_limit = 2.0; + options.rate_limit_delay_max_milliseconds = 2; + // TODO -- test more options + break; + case kDeletesFilterFirst: + options.filter_deletes = true; + break; + case kVectorRep: + 
options.memtable_factory.reset(new VectorRepFactory(100)); + break; + case kUniversalCompaction: + options.compaction_style = kCompactionStyleUniversal; + break; + case kCompressedBlockCache: + options.block_cache_compressed = NewLRUCache(8*1024*1024); + break; + default: + break; + } + return options; + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void Close() { + delete db_; + db_ = nullptr; + } + + void DestroyAndReopen(Options* options = nullptr) { + //Destroy using last options + Destroy(&last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(Options* options) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + } + + Status PureReopen(Options* options, DB** db) { + return DB::Open(*options, dbname_, db); + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opts; + if (options != nullptr) { + opts = *options; + } else { + opts = CurrentOptions(); + opts.create_if_missing = true; + } + last_options_ = opts; + + return DB::Open(opts, dbname_, &db_); + } + + Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) { + if (kMergePut == option_config_ ) { + return db_->Merge(wo, k, v); + } else { + return db_->Put(wo, k, v); + } + } + + Status Delete(const std::string& k) { + return db_->Delete(WriteOptions(), k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + // Return a string that contains all key,value pairs in order, + // formatted like "(k1->v1)(k2->v2)". 
+ std::string Contents() { + std::vector forward; + std::string result; + Iterator* iter = db_->NewIterator(ReadOptions()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + std::string s = IterStatus(iter); + result.push_back('('); + result.append(s); + result.push_back(')'); + forward.push_back(s); + } + + // Check reverse iteration results are the reverse of forward results + unsigned int matched = 0; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ASSERT_LT(matched, forward.size()); + ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]); + matched++; + } + ASSERT_EQ(matched, forward.size()); + + delete iter; + return result; + } + + std::string AllEntriesFor(const Slice& user_key) { + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); + iter->Seek(target.Encode()); + std::string result; + if (!iter->status().ok()) { + result = iter->status().ToString(); + } else { + result = "[ "; + bool first = true; + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + if (!ParseInternalKey(iter->key(), &ikey)) { + result += "CORRUPTED"; + } else { + if (last_options_.comparator->Compare(ikey.user_key, user_key) != 0) { + break; + } + if (!first) { + result += ", "; + } + first = false; + switch (ikey.type) { + case kTypeValue: + result += iter->value().ToString(); + break; + case kTypeMerge: + // keep it the same as kTypeValue for testing kMergePut + result += iter->value().ToString(); + break; + case kTypeDeletion: + result += "DEL"; + break; + case kTypeLogData: + assert(false); + break; + } + } + iter->Next(); + } + if (!first) { + result += " "; + } + result += "]"; + } + delete iter; + return result; + } + + int NumTableFilesAtLevel(int level) { + std::string property; + ASSERT_TRUE( + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), + &property)); + return atoi(property.c_str()); + } + + int TotalTableFiles() { + int 
result = 0; + for (int level = 0; level < db_->NumberLevels(); level++) { + result += NumTableFilesAtLevel(level); + } + return result; + } + + // Return spread of files per level + std::string FilesPerLevel() { + std::string result; + int last_non_zero_offset = 0; + for (int level = 0; level < db_->NumberLevels(); level++) { + int f = NumTableFilesAtLevel(level); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + int CountFiles() { + std::vector files; + env_->GetChildren(dbname_, &files); + + std::vector logfiles; + if (dbname_ != last_options_.wal_dir) { + env_->GetChildren(last_options_.wal_dir, &logfiles); + } + + return static_cast(files.size() + logfiles.size()); + } + + int CountLiveFiles() { + std::vector files; + uint64_t manifest_file_size; + db_->GetLiveFiles(files, &manifest_file_size); + return files.size(); + } + + uint64_t Size(const Slice& start, const Slice& limit) { + Range r(start, limit); + uint64_t size; + db_->GetApproximateSizes(&r, 1, &size); + return size; + } + + void Compact(const Slice& start, const Slice& limit) { + db_->CompactRange(&start, &limit); + } + + // Do n memtable compactions, each of which produces an sstable + // covering the range [small,large]. + void MakeTables(int n, const std::string& small, const std::string& large) { + for (int i = 0; i < n; i++) { + Put(small, "begin"); + Put(large, "end"); + dbfull()->TEST_FlushMemTable(); + } + } + + // Prevent pushing of new sstables into deeper levels by adding + // tables that cover a specified range to all levels. 
+ void FillLevels(const std::string& smallest, const std::string& largest) { + MakeTables(db_->NumberLevels(), smallest, largest); + } + + void DumpFileCounts(const char* label) { + fprintf(stderr, "---\n%s:\n", label); + fprintf(stderr, "maxoverlap: %lld\n", + static_cast( + dbfull()->TEST_MaxNextLevelOverlappingBytes())); + for (int level = 0; level < db_->NumberLevels(); level++) { + int num = NumTableFilesAtLevel(level); + if (num > 0) { + fprintf(stderr, " level %3d : %d files\n", level, num); + } + } + } + + std::string DumpSSTableList() { + std::string property; + db_->GetProperty("rocksdb.sstables", &property); + return property; + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } + + Options OptionsForLogIterTest() { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.WAL_ttl_seconds = 1000; + return options; + } + + std::unique_ptr OpenTransactionLogIter( + const SequenceNumber seq) { + unique_ptr iter; + Status status = dbfull()->GetUpdatesSince(seq, &iter); + ASSERT_OK(status); + ASSERT_TRUE(iter->Valid()); + return std::move(iter); + } + + std::string DummyString(size_t len, char c = 'a') { + return std::string(len, c); + } + + void VerifyIterLast(std::string expected_key) { + Iterator* iter = db_->NewIterator(ReadOptions()); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), expected_key); + delete iter; + } +}; + +static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key%06d", i); + return std::string(buf); +} + +TEST(DBTest, Empty) { + do { + ASSERT_TRUE(db_ != nullptr); + ASSERT_EQ("NOT_FOUND", Get("foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, ReadWrite) { + do { + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + ASSERT_EQ("v3", 
Get("foo")); + ASSERT_EQ("v2", Get("bar")); + } while (ChangeOptions()); +} + +// Make sure that when options.block_cache is set, after a new table is +// created its index/filter blocks are added to block cache. +TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { + Options options = CurrentOptions(); + std::unique_ptr filter_policy(NewBloomFilterPolicy(20)); + options.filter_policy = filter_policy.get(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(&options); + + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); + // Create a new talbe. + dbfull()->Flush(FlushOptions()); + + // index/filter blocks added to block cache right after table creation. + ASSERT_EQ(1, + options.statistics.get()->getTickerCount(BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(1, + options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(2, /* only index/filter were added */ + options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(0, + options.statistics.get()->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + // Make sure filter block is in cache. + std::string value; + ReadOptions ropt; + db_->KeyMayExist(ReadOptions(), "key", &value); + + // Miss count should remain the same. + ASSERT_EQ(1, + options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, + options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + + db_->KeyMayExist(ReadOptions(), "key", &value); + ASSERT_EQ(1, + options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(2, + options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + + // Make sure index block is in cache. 
+ auto index_block_hit = + options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT); + value = Get("key"); + ASSERT_EQ(1, + options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(index_block_hit + 1, + options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + + value = Get("key"); + ASSERT_EQ(1, + options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(index_block_hit + 2, + options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); +} + +TEST(DBTest, LevelLimitReopen) { + Options options = CurrentOptions(); + Reopen(&options); + + const std::string value(1024 * 1024, ' '); + int i = 0; + while (NumTableFilesAtLevel(2) == 0) { + ASSERT_OK(Put(Key(i++), value)); + } + + options.num_levels = 1; + options.max_bytes_for_level_multiplier_additional.resize(1, 1); + Status s = TryReopen(&options); + ASSERT_EQ(s.IsInvalidArgument(), true); + ASSERT_EQ(s.ToString(), + "Invalid argument: db has more levels than options.num_levels"); + + options.num_levels = 10; + options.max_bytes_for_level_multiplier_additional.resize(10, 1); + ASSERT_OK(TryReopen(&options)); +} + +TEST(DBTest, Preallocation) { + const std::string src = dbname_ + "/alloc_test"; + unique_ptr srcfile; + const EnvOptions soptions; + ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions)); + srcfile->SetPreallocationBlockSize(1024 * 1024); + + // No writes should mean no preallocation + size_t block_size, last_allocated_block; + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 0UL); + + // Small write should preallocate one block + srcfile->Append("test"); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 1UL); + + // Write an entire preallocation block, make sure we increased by two. 
+ std::string buf(block_size, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 2UL); + + // Write five more blocks at once, ensure we're where we need to be. + buf = std::string(block_size * 5, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 7UL); +} + +TEST(DBTest, PutDeleteGet) { + do { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_EQ("v2", Get("foo")); + ASSERT_OK(db_->Delete(WriteOptions(), "foo")); + ASSERT_EQ("NOT_FOUND", Get("foo")); + } while (ChangeOptions()); +} + + +TEST(DBTest, GetFromImmutableLayer) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + Reopen(&options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + + env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + Put("k1", std::string(100000, 'x')); // Fill memtable + Put("k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_EQ("v1", Get("foo")); + env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls + } while (ChangeOptions()); +} + +TEST(DBTest, GetFromVersions) { + do { + ASSERT_OK(Put("foo", "v1")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v1", Get("foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetSnapshot) { + do { + // Try with both a short key and a long key + for (int i = 0; i < 2; i++) { + std::string key = (i == 0) ? 
std::string("foo") : std::string(200, 'x'); + ASSERT_OK(Put(key, "v1")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK(Put(key, "v2")); + ASSERT_EQ("v2", Get(key)); + ASSERT_EQ("v1", Get(key, s1)); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get(key)); + ASSERT_EQ("v1", Get(key, s1)); + db_->ReleaseSnapshot(s1); + } + } while (ChangeOptions()); +} + +TEST(DBTest, GetLevel0Ordering) { + do { + // Check that we process level-0 files in correct order. The code + // below generates two level-0 files where the earlier one comes + // before the later one in the level-0 file list since the earlier + // one has a smaller "smallest" key. + ASSERT_OK(Put("bar", "b")); + ASSERT_OK(Put("foo", "v1")); + dbfull()->TEST_FlushMemTable(); + ASSERT_OK(Put("foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetOrderedByLevels) { + do { + ASSERT_OK(Put("foo", "v1")); + Compact("a", "z"); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(Put("foo", "v2")); + ASSERT_EQ("v2", Get("foo")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetPicksCorrectFile) { + do { + // Arrange to have multiple files in a non-level-0 level. + ASSERT_OK(Put("a", "va")); + Compact("a", "b"); + ASSERT_OK(Put("x", "vx")); + Compact("x", "y"); + ASSERT_OK(Put("f", "vf")); + Compact("f", "g"); + ASSERT_EQ("va", Get("a")); + ASSERT_EQ("vf", Get("f")); + ASSERT_EQ("vx", Get("x")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetEncountersEmptyLevel) { + do { + // Arrange for the following to happen: + // * sstable A in level 0 + // * nothing in level 1 + // * sstable B in level 2 + // Then do enough Get() calls to arrange for an automatic compaction + // of sstable A. A bug would cause the compaction to be marked as + // occuring at level 1 (instead of the correct level 0). 
+ + // Step 1: First place sstables in levels 0 and 2 + int compaction_count = 0; + while (NumTableFilesAtLevel(0) == 0 || + NumTableFilesAtLevel(2) == 0) { + ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2"; + compaction_count++; + Put("a", "begin"); + Put("z", "end"); + dbfull()->TEST_FlushMemTable(); + } + + // Step 2: clear level 1 if necessary. + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); + + // Step 3: read a bunch of times + for (int i = 0; i < 1000; i++) { + ASSERT_EQ("NOT_FOUND", Get("missing")); + } + + // Step 4: Wait for compaction to finish + env_->SleepForMicroseconds(1000000); + + ASSERT_EQ(NumTableFilesAtLevel(0), 1); // XXX + } while (ChangeOptions(kSkipUniversalCompaction)); +} + +// KeyMayExist can lead to a few false positives, but not false negatives. +// To make test deterministic, use a much larger number of bits per key-20 than +// bits in the key, so that false positives are eliminated +TEST(DBTest, KeyMayExist) { + do { + ReadOptions ropts; + std::string value; + Options options = CurrentOptions(); + options.filter_policy = NewBloomFilterPolicy(20); + options.statistics = rocksdb::CreateDBStatistics(); + Reopen(&options); + + ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); + + ASSERT_OK(db_->Put(WriteOptions(), "a", "b")); + bool value_found = false; + ASSERT_TRUE(db_->KeyMayExist(ropts, "a", &value, &value_found)); + ASSERT_TRUE(value_found); + ASSERT_EQ("b", value); + + dbfull()->Flush(FlushOptions()); + value.clear(); + + long numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); + long cache_added = + options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + ASSERT_TRUE(db_->KeyMayExist(ropts, "a", &value, &value_found)); + ASSERT_TRUE(!value_found); + // assert that no new files were opened and no new blocks were + // read into block cache. 
+ ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); + ASSERT_EQ(cache_added, + options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + + ASSERT_OK(db_->Delete(WriteOptions(), "a")); + + numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); + cache_added = + options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); + ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); + ASSERT_EQ(cache_added, + options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + + dbfull()->Flush(FlushOptions()); + dbfull()->CompactRange(nullptr, nullptr); + + numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); + cache_added = + options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); + ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); + ASSERT_EQ(cache_added, + options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + + ASSERT_OK(db_->Delete(WriteOptions(), "c")); + + numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); + cache_added = + options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, "c", &value)); + ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); + ASSERT_EQ(cache_added, + options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + + delete options.filter_policy; + } while (ChangeOptions()); +} + +TEST(DBTest, NonBlockingIteration) { + do { + ReadOptions non_blocking_opts, regular_opts; + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + non_blocking_opts.read_tier = kBlockCacheTier; + Reopen(&options); + // write one kv to the database. + ASSERT_OK(db_->Put(WriteOptions(), "a", "b")); + + // scan using non-blocking iterator. We should find it because + // it is in memtable. 
+ Iterator* iter = db_->NewIterator(non_blocking_opts); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 1); + delete iter; + + // flush memtable to storage. Now, the key should be neither in the + // memtable nor in the block cache. + dbfull()->Flush(FlushOptions()); + + // verify that a non-blocking iterator does not find any + // kvs. Neither does it do any IOs to storage. + long numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); + long cache_added = + options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + iter = db_->NewIterator(non_blocking_opts); + count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + count++; + } + ASSERT_EQ(count, 0); + ASSERT_TRUE(iter->status().IsIncomplete()); + ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); + ASSERT_EQ(cache_added, + options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + delete iter; + + // read in the specified block via a regular get + ASSERT_EQ(Get("a"), "b"); + + // verify that we can find it via a non-blocking scan + numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); + cache_added = + options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + iter = db_->NewIterator(non_blocking_opts); + count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 1); + ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); + ASSERT_EQ(cache_added, + options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + delete iter; + + } while (ChangeOptions()); +} + +// A delete is skipped for key if KeyMayExist(key) returns False +// Tests Writebatch consistency and proper delete behaviour +TEST(DBTest, FilterDeletes) { + do { + Options options = CurrentOptions(); + options.filter_policy = NewBloomFilterPolicy(20); + options.filter_deletes = true; + 
Reopen(&options); + WriteBatch batch; + + batch.Delete("a"); + dbfull()->Write(WriteOptions(), &batch); + ASSERT_EQ(AllEntriesFor("a"), "[ ]"); // Delete skipped + batch.Clear(); + + batch.Put("a", "b"); + batch.Delete("a"); + dbfull()->Write(WriteOptions(), &batch); + ASSERT_EQ(Get("a"), "NOT_FOUND"); + ASSERT_EQ(AllEntriesFor("a"), "[ DEL, b ]"); // Delete issued + batch.Clear(); + + batch.Delete("c"); + batch.Put("c", "d"); + dbfull()->Write(WriteOptions(), &batch); + ASSERT_EQ(Get("c"), "d"); + ASSERT_EQ(AllEntriesFor("c"), "[ d ]"); // Delete skipped + batch.Clear(); + + dbfull()->Flush(FlushOptions()); // A stray Flush + + batch.Delete("c"); + dbfull()->Write(WriteOptions(), &batch); + ASSERT_EQ(AllEntriesFor("c"), "[ DEL, d ]"); // Delete issued + batch.Clear(); + + delete options.filter_policy; + } while (ChangeCompactOptions()); +} + +TEST(DBTest, IterEmpty) { + do { + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("foo"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST(DBTest, IterSingle) { + do { + ASSERT_OK(Put("a", "va")); + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + 
iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST(DBTest, IterMulti) { + do { + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("b", "vb")); + ASSERT_OK(Put("c", "vc")); + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("ax"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Seek("z"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + // Switch from reverse to forward + iter->SeekToLast(); + iter->Prev(); + iter->Prev(); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Switch from forward to reverse + iter->SeekToFirst(); + iter->Next(); + iter->Next(); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Make sure iter stays at snapshot + ASSERT_OK(Put("a", "va2")); + ASSERT_OK(Put("a2", "va3")); + ASSERT_OK(Put("b", "vb2")); + ASSERT_OK(Put("c", "vc2")); + ASSERT_OK(Delete("b")); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + 
ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +// Check that we can skip over a run of user keys +// by using reseek rather than sequential scan +TEST(DBTest, IterReseek) { + Options options = CurrentOptions(); + options.max_sequential_skip_in_iterations = 3; + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(&options); + + // insert two keys with same userkey and verify that + // reseek is not invoked. For each of these test cases, + // verify that we can find the next key "b". + ASSERT_OK(Put("a", "one")); + ASSERT_OK(Put("a", "two")); + ASSERT_OK(Put("b", "bone")); + Iterator* iter = db_->NewIterator(ReadOptions()); + iter->SeekToFirst(); + ASSERT_EQ(options.statistics.get()->getTickerCount( + NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "a->two"); + iter->Next(); + ASSERT_EQ(options.statistics.get()->getTickerCount( + NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // insert a total of three keys with same userkey and verify + // that reseek is still not invoked. + ASSERT_OK(Put("a", "three")); + iter = db_->NewIterator(ReadOptions()); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->three"); + iter->Next(); + ASSERT_EQ(options.statistics.get()->getTickerCount( + NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // insert a total of four keys with same userkey and verify + // that reseek is invoked. 
+ ASSERT_OK(Put("a", "four")); + iter = db_->NewIterator(ReadOptions()); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->four"); + ASSERT_EQ(options.statistics.get()->getTickerCount( + NUMBER_OF_RESEEKS_IN_ITERATION), 0); + iter->Next(); + ASSERT_EQ(options.statistics.get()->getTickerCount( + NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // Testing reverse iterator + // At this point, we have three versions of "a" and one version of "b". + // The reseek statistics is already at 1. + int num_reseeks = (int)options.statistics.get()->getTickerCount( + NUMBER_OF_RESEEKS_IN_ITERATION); + + // Insert another version of b and assert that reseek is not invoked + ASSERT_OK(Put("b", "btwo")); + iter = db_->NewIterator(ReadOptions()); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "b->btwo"); + ASSERT_EQ(options.statistics.get()->getTickerCount( + NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks); + iter->Prev(); + ASSERT_EQ(options.statistics.get()->getTickerCount( + NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks+1); + ASSERT_EQ(IterStatus(iter), "a->four"); + delete iter; + + // insert two more versions of b. This makes a total of 4 versions + // of b and 4 versions of a. 
+ ASSERT_OK(Put("b", "bthree")); + ASSERT_OK(Put("b", "bfour")); + iter = db_->NewIterator(ReadOptions()); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "b->bfour"); + ASSERT_EQ(options.statistics.get()->getTickerCount( + NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 2); + iter->Prev(); + + // the previous Prev call should have invoked reseek + ASSERT_EQ(options.statistics.get()->getTickerCount( + NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 3); + ASSERT_EQ(IterStatus(iter), "a->four"); + delete iter; +} + +TEST(DBTest, IterSmallAndLargeMix) { + do { + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("b", std::string(100000, 'b'))); + ASSERT_OK(Put("c", "vc")); + ASSERT_OK(Put("d", std::string(100000, 'd'))); + ASSERT_OK(Put("e", std::string(100000, 'e'))); + + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST(DBTest, IterMultiWithDelete) { + do { + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("b", "vb")); + ASSERT_OK(Put("c", "vc")); + ASSERT_OK(Delete("b")); + ASSERT_EQ("NOT_FOUND", Get("b")); + + Iterator* iter = db_->NewIterator(ReadOptions()); + iter->Seek("c"); + 
ASSERT_EQ(IterStatus(iter), "c->vc"); + if (!CurrentOptions().merge_operator) { + // TODO: merge operator does not support backward iteration yet + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + } + delete iter; + } while (ChangeOptions()); +} + +TEST(DBTest, IterPrevMaxSkip) { + do { + for (int i = 0; i < 2; i++) { + db_->Put(WriteOptions(), "key1", "v1"); + db_->Put(WriteOptions(), "key2", "v2"); + db_->Put(WriteOptions(), "key3", "v3"); + db_->Put(WriteOptions(), "key4", "v4"); + db_->Put(WriteOptions(), "key5", "v5"); + } + + VerifyIterLast("key5->v5"); + + ASSERT_OK(db_->Delete(WriteOptions(), "key5")); + VerifyIterLast("key4->v4"); + + ASSERT_OK(db_->Delete(WriteOptions(), "key4")); + VerifyIterLast("key3->v3"); + + ASSERT_OK(db_->Delete(WriteOptions(), "key3")); + VerifyIterLast("key2->v2"); + + ASSERT_OK(db_->Delete(WriteOptions(), "key2")); + VerifyIterLast("key1->v1"); + + ASSERT_OK(db_->Delete(WriteOptions(), "key1")); + VerifyIterLast("(invalid)"); + } while (ChangeOptions(kSkipMergePut)); +} + +TEST(DBTest, IterWithSnapshot) { + do { + ASSERT_OK(Put("key1", "val1")); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Put("key3", "val3")); + ASSERT_OK(Put("key4", "val4")); + ASSERT_OK(Put("key5", "val5")); + + const Snapshot *snapshot = db_->GetSnapshot(); + ReadOptions options; + options.snapshot = snapshot; + Iterator* iter = db_->NewIterator(options); + + // Put more values after the snapshot + ASSERT_OK(Put("key100", "val100")); + ASSERT_OK(Put("key101", "val101")); + + iter->Seek("key5"); + ASSERT_EQ(IterStatus(iter), "key5->val5"); + if (!CurrentOptions().merge_operator) { + // TODO: merge operator does not support backward iteration yet + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key4->val4"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key3->val3"); + + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key4->val4"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key5->val5"); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + } + 
db_->ReleaseSnapshot(snapshot); + delete iter; + } while (ChangeOptions()); +} + +TEST(DBTest, Recover) { + do { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("baz", "v5")); + + Reopen(); + ASSERT_EQ("v1", Get("foo")); + + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v5", Get("baz")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + + Reopen(); + ASSERT_EQ("v3", Get("foo")); + ASSERT_OK(Put("foo", "v4")); + ASSERT_EQ("v4", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + ASSERT_EQ("v5", Get("baz")); + } while (ChangeOptions()); +} + +TEST(DBTest, RollLog) { + do { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("baz", "v5")); + + Reopen(); + for (int i = 0; i < 10; i++) { + Reopen(); + } + ASSERT_OK(Put("foo", "v4")); + for (int i = 0; i < 10; i++) { + Reopen(); + } + } while (ChangeOptions()); +} + +TEST(DBTest, WAL) { + do { + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1")); + + Reopen(); + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v1", Get("bar")); + + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v2")); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v2")); + + Reopen(); + // Both values should be present. + ASSERT_EQ("v2", Get("bar")); + ASSERT_EQ("v2", Get("foo")); + + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3")); + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v3")); + + Reopen(); + // again both values should be present. 
+ ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v3", Get("bar")); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, CheckLock) { + do { + DB* localdb; + Options options = CurrentOptions(); + ASSERT_OK(TryReopen(&options)); + + // second open should fail + ASSERT_TRUE(!(PureReopen(&options, &localdb)).ok()); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, FlushMultipleMemtable) { + do { + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + Reopen(&options); + ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1")); + + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v1", Get("bar")); + dbfull()->Flush(FlushOptions()); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, NumImmutableMemTable) { + do { + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.write_buffer_size = 1000000; + Reopen(&options); + + std::string big_value(1000000, 'x'); + std::string num; + + ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "0"); + + ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "1"); + + ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "2"); + + dbfull()->Flush(FlushOptions()); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "0"); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, FLUSH) { + do { + Options options = CurrentOptions(); + WriteOptions 
writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); + // this will now also flush the last 2 writes + dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1")); + + Reopen(); + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v1", Get("bar")); + + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v2")); + ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v2")); + dbfull()->Flush(FlushOptions()); + + Reopen(); + ASSERT_EQ("v2", Get("bar")); + ASSERT_EQ("v2", Get("foo")); + + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3")); + ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v3")); + dbfull()->Flush(FlushOptions()); + + Reopen(); + // 'foo' should be there because its put + // has WAL enabled. + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v3", Get("bar")); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, RecoveryWithEmptyLog) { + do { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("foo", "v2")); + Reopen(); + Reopen(); + ASSERT_OK(Put("foo", "v3")); + Reopen(); + ASSERT_EQ("v3", Get("foo")); + } while (ChangeOptions()); +} + +// Check that writes done during a memtable compaction are recovered +// if the database is shutdown during the memtable compaction. 
+TEST(DBTest, RecoverDuringMemtableCompaction) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 1000000; + Reopen(&options); + + // Trigger a long memtable compaction and reopen the database during it + ASSERT_OK(Put("foo", "v1")); // Goes to 1st log file + ASSERT_OK(Put("big1", std::string(10000000, 'x'))); // Fills memtable + ASSERT_OK(Put("big2", std::string(1000, 'y'))); // Triggers compaction + ASSERT_OK(Put("bar", "v2")); // Goes to new log file + + Reopen(&options); + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + ASSERT_EQ(std::string(10000000, 'x'), Get("big1")); + ASSERT_EQ(std::string(1000, 'y'), Get("big2")); + } while (ChangeOptions()); +} + +TEST(DBTest, MinorCompactionsHappen) { + do { + Options options = CurrentOptions(); + options.write_buffer_size = 10000; + Reopen(&options); + + const int N = 500; + + int starting_num_tables = TotalTableFiles(); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v'))); + } + int ending_num_tables = TotalTableFiles(); + ASSERT_GT(ending_num_tables, starting_num_tables); + + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); + } + + Reopen(); + + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); + } + } while (ChangeCompactOptions()); +} + +TEST(DBTest, ManifestRollOver) { + do { + Options options = CurrentOptions(); + options.max_manifest_file_size = 10 ; // 10 bytes + Reopen(&options); + { + ASSERT_OK(Put("manifest_key1", std::string(1000, '1'))); + ASSERT_OK(Put("manifest_key2", std::string(1000, '2'))); + ASSERT_OK(Put("manifest_key3", std::string(1000, '3'))); + uint64_t manifest_before_flush = + dbfull()->TEST_Current_Manifest_FileNo(); + dbfull()->Flush(FlushOptions()); // This should trigger LogAndApply. 
+ uint64_t manifest_after_flush = + dbfull()->TEST_Current_Manifest_FileNo(); + ASSERT_GT(manifest_after_flush, manifest_before_flush); + Reopen(&options); + ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), + manifest_after_flush); + // check if a new manifest file got inserted or not. + ASSERT_EQ(std::string(1000, '1'), Get("manifest_key1")); + ASSERT_EQ(std::string(1000, '2'), Get("manifest_key2")); + ASSERT_EQ(std::string(1000, '3'), Get("manifest_key3")); + } + } while (ChangeCompactOptions()); +} + +TEST(DBTest, IdentityAcrossRestarts) { + do { + std::string id1; + ASSERT_OK(db_->GetDbIdentity(id1)); + + Options options = CurrentOptions(); + Reopen(&options); + std::string id2; + ASSERT_OK(db_->GetDbIdentity(id2)); + // id1 should match id2 because identity was not regenerated + ASSERT_EQ(id1.compare(id2), 0); + + std::string idfilename = IdentityFileName(dbname_); + ASSERT_OK(env_->DeleteFile(idfilename)); + Reopen(&options); + std::string id3; + ASSERT_OK(db_->GetDbIdentity(id3)); + // id1 should NOT match id3 because identity was regenerated + ASSERT_NE(id1.compare(id3), 0); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, RecoverWithLargeLog) { + do { + { + Options options = CurrentOptions(); + Reopen(&options); + ASSERT_OK(Put("big1", std::string(200000, '1'))); + ASSERT_OK(Put("big2", std::string(200000, '2'))); + ASSERT_OK(Put("small3", std::string(10, '3'))); + ASSERT_OK(Put("small4", std::string(10, '4'))); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + } + + // Make sure that if we re-open with a small write buffer size that + // we flush table files in the middle of a large log file. 
+ Options options = CurrentOptions(); + options.write_buffer_size = 100000; + Reopen(&options); + ASSERT_EQ(NumTableFilesAtLevel(0), 3); + ASSERT_EQ(std::string(200000, '1'), Get("big1")); + ASSERT_EQ(std::string(200000, '2'), Get("big2")); + ASSERT_EQ(std::string(10, '3'), Get("small3")); + ASSERT_EQ(std::string(10, '4'), Get("small4")); + ASSERT_GT(NumTableFilesAtLevel(0), 1); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, CompactionsGenerateMultipleFiles) { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + Reopen(&options); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + std::vector<std::string> values; + for (int i = 0; i < 80; i++) { + values.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put(Key(i), values[i])); + } + + // Reopening moves updates to level-0 + Reopen(&options); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 1); + for (int i = 0; i < 80; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } +} + +TEST(DBTest, CompactionTrigger) { + Options options = CurrentOptions(); + options.write_buffer_size = 100<<10; //100KB + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.level0_file_num_compaction_trigger = 3; + Reopen(&options); + + Random rnd(301); + + for (int num = 0; + num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector<std::string> values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + + //generate one more file in level-0, and should trigger level-0 compaction + std::vector<std::string> values; + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + 
dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); +} + +TEST(DBTest, UniversalCompactionTrigger) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + // trigger compaction if there are >= 4 files + options.level0_file_num_compaction_trigger = 4; + Reopen(&options); + + Random rnd(301); + int key_idx = 0; + + // Stage 1: + // Generate a set of files at level 0, but don't trigger level-0 + // compaction. + for (int num = 0; + num < options.level0_file_num_compaction_trigger-1; + num++) { + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + + // Generate one more file at level-0, which should trigger level-0 + // compaction. + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // Suppose each file flushed from mem table has size 1. Now we compact + // (level0_file_num_compaction_trigger+1)=4 files and should have a big + // file of size 4. + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + for (int i = 1; i < options.num_levels ; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i), 0); + } + + // Stage 2: + // Now we have one file at level 0, with size 4. We also have some data in + // mem table. Let's continue generating new files at level 0, but don't + // trigger level-0 compaction. + // First, clean up memtable before inserting new data. This will generate + // a level-0 file, with size around 0.4 (according to previously written + // data amount). 
+ dbfull()->Flush(FlushOptions()); + for (int num = 0; + num < options.level0_file_num_compaction_trigger-3; + num++) { + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 3); + } + + // Generate one more file at level-0, which should trigger level-0 + // compaction. + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1. + // After compaction, we should have 2 files, with size 4, 2.4. + ASSERT_EQ(NumTableFilesAtLevel(0), 2); + for (int i = 1; i < options.num_levels ; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i), 0); + } + + // Stage 3: + // Now we have 2 files at level 0, with size 4 and 2.4. Continue + // generating new files at level 0. + for (int num = 0; + num < options.level0_file_num_compaction_trigger-3; + num++) { + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 3); + } + + // Generate one more file at level-0, which should trigger level-0 + // compaction. + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1. + // After compaction, we should have 3 files, with size 4, 2.4, 2. + ASSERT_EQ(NumTableFilesAtLevel(0), 3); + for (int i = 1; i < options.num_levels ; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i), 0); + } + + // Stage 4: + // Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a + // new file of size 1. 
+ for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // Level-0 compaction is triggered, but no file will be picked up. + ASSERT_EQ(NumTableFilesAtLevel(0), 4); + for (int i = 1; i < options.num_levels ; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i), 0); + } + + // Stage 5: + // Now we have 4 files at level 0, with size 4, 2.4, 2, 1. Let's generate + // a new file of size 1. + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // All files at level 0 will be compacted into a single one. + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + for (int i = 1; i < options.num_levels ; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i), 0); + } +} + +TEST(DBTest, UniversalCompactionSizeAmplification) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + options.level0_file_num_compaction_trigger = 3; + + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal. + max_size_amplification_percent = 110; + Reopen(&options); + + Random rnd(301); + int key_idx = 0; + + // Generate two files in Level 0. Both files are approx the same size. + for (int num = 0; + num < options.level0_file_num_compaction_trigger-1; + num++) { + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + ASSERT_EQ(NumTableFilesAtLevel(0), 2); + + // Flush whatever is remaining in memtable. This is typically + // small, which should not trigger size ratio based compaction + // but will instead trigger size amplification. 
+ dbfull()->Flush(FlushOptions()); + + dbfull()->TEST_WaitForCompact(); + + // Verify that size amplification did occur + ASSERT_EQ(NumTableFilesAtLevel(0), 1); +} + +TEST(DBTest, UniversalCompactionOptions) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + options.level0_file_num_compaction_trigger = 4; + options.num_levels = 1; + options.compaction_options_universal.compression_size_percent = -1; + Reopen(&options); + + Random rnd(301); + int key_idx = 0; + + for (int num = 0; + num < options.level0_file_num_compaction_trigger; + num++) { + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + + if (num < options.level0_file_num_compaction_trigger - 1) { + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + } + + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + for (int i = 1; i < options.num_levels ; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i), 0); + } +} + +#if defined(SNAPPY) && defined(ZLIB) && defined(BZIP2) +TEST(DBTest, CompressedCache) { + int num_iter = 80; + + // Run this test three iterations. 
+ // Iteration 1: only an uncompressed block cache + // Iteration 2: only a compressed block cache + // Iteration 3: both block cache and compressed cache + for (int iter = 0; iter < 3; iter++) { + Options options = CurrentOptions(); + options.write_buffer_size = 64*1024; // small write buffer + options.statistics = rocksdb::CreateDBStatistics(); + + switch (iter) { + case 0: + // only uncompressed block cache + options.block_cache = NewLRUCache(8*1024); + options.block_cache_compressed = nullptr; + break; + case 1: + // no block cache, only compressed cache + options.no_block_cache = true; + options.block_cache = nullptr; + options.block_cache_compressed = NewLRUCache(8*1024); + break; + case 2: + // both compressed and uncompressed block cache + options.block_cache = NewLRUCache(1024); + options.block_cache_compressed = NewLRUCache(8*1024); + break; + default: + ASSERT_TRUE(false); + } + Reopen(&options); + + Random rnd(301); + + // Write num_iter values of 1000 bytes each (a new random value every 4th key, reused for the following keys) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + std::vector values; + std::string str; + for (int i = 0; i < num_iter; i++) { + if (i % 4 == 0) { // high compression ratio + str = RandomString(&rnd, 1000); + } + values.push_back(str); + ASSERT_OK(Put(Key(i), values[i])); + } + + // flush all data from memtable so that reads are from block cache + dbfull()->Flush(FlushOptions()); + + for (int i = 0; i < num_iter; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + + // check that we triggered the appropriate code paths in the cache + switch (iter) { + case 0: + // only uncompressed block cache + ASSERT_GT(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS), + 0); + ASSERT_EQ(options.statistics.get()->getTickerCount + (BLOCK_CACHE_COMPRESSED_MISS), 0); + break; + case 1: + // no block cache, only compressed cache + ASSERT_EQ(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS), + 0); + ASSERT_GT(options.statistics.get()->getTickerCount + (BLOCK_CACHE_COMPRESSED_MISS), 0); + break; + case 2: + // 
both compressed and uncompressed block cache + ASSERT_GT(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS), + 0); + ASSERT_GT(options.statistics.get()->getTickerCount + (BLOCK_CACHE_COMPRESSED_MISS), 0); + break; + default: + ASSERT_TRUE(false); + } + } +} + +static std::string CompressibleString(Random* rnd, int len) { + std::string r; + test::CompressibleString(rnd, 0.8, len, &r); + return r; +} + +TEST(DBTest, UniversalCompactionCompressRatio1) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 1; + options.compaction_options_universal.compression_size_percent = 70; + Reopen(&options); + + Random rnd(301); + int key_idx = 0; + + // The first compaction (2) is compressed. + for (int num = 0; num < 2; num++) { + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_LT((int ) dbfull()->TEST_GetLevel0TotalSize(), 120000 * 2 * 0.9); + + // The second compaction (4) is compressed + for (int num = 0; num < 2; num++) { + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_LT((int ) dbfull()->TEST_GetLevel0TotalSize(), 120000 * 4 * 0.9); + + // The third compaction (2 4) is compressed since this time it is + // (1 1 3.2) and 3.2/5.2 doesn't reach ratio. 
+ for (int num = 0; num < 2; num++) { + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_LT((int ) dbfull()->TEST_GetLevel0TotalSize(), 120000 * 6 * 0.9); + + // When we start for the compaction up to (2 4 8), the latest + // compressed is not compressed. + for (int num = 0; num < 8; num++) { + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_GT((int) dbfull()->TEST_GetLevel0TotalSize(), + 120000 * 12 * 0.8 + 110000 * 2); +} + +TEST(DBTest, UniversalCompactionCompressRatio2) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 1; + options.compaction_options_universal.compression_size_percent = 95; + Reopen(&options); + + Random rnd(301); + int key_idx = 0; + + // When we start for the compaction up to (2 4 8), the latest + // compressed is compressed given the size ratio to compress. 
+ for (int num = 0; num < 14; num++) { + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_LT((int ) dbfull()->TEST_GetLevel0TotalSize(), + 120000 * 12 * 0.8 + 110000 * 2); +} +#endif + +TEST(DBTest, ConvertCompactionStyle) { + Random rnd(301); + int max_key_level_insert = 200; + int max_key_universal_insert = 600; + + // Stage 1: generate a db with level compaction + Options options = CurrentOptions(); + options.write_buffer_size = 100<<10; //100KB + options.num_levels = 4; + options.level0_file_num_compaction_trigger = 3; + options.max_bytes_for_level_base = 500<<10; // 500KB + options.max_bytes_for_level_multiplier = 1; + options.target_file_size_base = 200<<10; // 200KB + options.target_file_size_multiplier = 1; + Reopen(&options); + + for (int i = 0; i <= max_key_level_insert; i++) { + // each value is 10K + ASSERT_OK(Put(Key(i), RandomString(&rnd, 10000))); + } + dbfull()->Flush(FlushOptions()); + dbfull()->TEST_WaitForCompact(); + + ASSERT_GT(TotalTableFiles(), 1); + int non_level0_num_files = 0; + for (int i = 1; i < dbfull()->NumberLevels(); i++) { + non_level0_num_files += NumTableFilesAtLevel(i); + } + ASSERT_GT(non_level0_num_files, 0); + + // Stage 2: reopen with universal compaction - should fail + options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + Status s = TryReopen(&options); + ASSERT_TRUE(s.IsInvalidArgument()); + + // Stage 3: compact into a single file and move the file to level 0 + options = CurrentOptions(); + options.disable_auto_compactions = true; + options.target_file_size_base = INT_MAX; + options.target_file_size_multiplier = 1; + options.max_bytes_for_level_base = INT_MAX; + options.max_bytes_for_level_multiplier = 1; + Reopen(&options); + + dbfull()->CompactRange(nullptr, nullptr, + true /* reduce level */, + 0 
/* reduce to level 0 */); + + for (int i = 0; i < dbfull()->NumberLevels(); i++) { + int num = NumTableFilesAtLevel(i); + if (i == 0) { + ASSERT_EQ(num, 1); + } else { + ASSERT_EQ(num, 0); + } + } + + // Stage 4: re-open in universal compaction style and do some db operations + options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + options.level0_file_num_compaction_trigger = 3; + Reopen(&options); + + for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 10000))); + } + dbfull()->Flush(FlushOptions()); + dbfull()->TEST_WaitForCompact(); + + for (int i = 1; i < dbfull()->NumberLevels(); i++) { + ASSERT_EQ(NumTableFilesAtLevel(i), 0); + } + + // verify keys inserted in both level compaction style and universal + // compaction style + std::string keys_in_db; + Iterator* iter = dbfull()->NewIterator(ReadOptions()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + keys_in_db.append(iter->key().ToString()); + keys_in_db.push_back(','); + } + delete iter; + + std::string expected_keys; + for (int i = 0; i <= max_key_universal_insert; i++) { + expected_keys.append(Key(i)); + expected_keys.push_back(','); + } + + ASSERT_EQ(keys_in_db, expected_keys); +} + +void MinLevelHelper(DBTest* self, Options& options) { + Random rnd(301); + + for (int num = 0; + num < options.level0_file_num_compaction_trigger - 1; + num++) + { + std::vector values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(self->Put(Key(i), values[i])); + } + self->dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1); + } + + //generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(self->Put(Key(i), 
values[i])); + } + self->dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(self->NumTableFilesAtLevel(0), 0); + ASSERT_EQ(self->NumTableFilesAtLevel(1), 1); +} + +// returns false if the calling-Test should be skipped +bool MinLevelToCompress(CompressionType& type, Options& options, int wbits, + int lev, int strategy) { + fprintf(stderr, "Test with compression options : window_bits = %d, level = %d, strategy = %d}\n", wbits, lev, strategy); + options.write_buffer_size = 100<<10; //100KB + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.level0_file_num_compaction_trigger = 3; + options.create_if_missing = true; + + if (SnappyCompressionSupported(CompressionOptions(wbits, lev, strategy))) { + type = kSnappyCompression; + fprintf(stderr, "using snappy\n"); + } else if (ZlibCompressionSupported( + CompressionOptions(wbits, lev, strategy))) { + type = kZlibCompression; + fprintf(stderr, "using zlib\n"); + } else if (BZip2CompressionSupported( + CompressionOptions(wbits, lev, strategy))) { + type = kBZip2Compression; + fprintf(stderr, "using bzip2\n"); + } else { + fprintf(stderr, "skipping test, compression disabled\n"); + return false; + } + options.compression_per_level.resize(options.num_levels); + + // do not compress L0 + for (int i = 0; i < 1; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 1; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + return true; +} + +TEST(DBTest, MinLevelToCompress1) { + Options options = CurrentOptions(); + CompressionType type; + if (!MinLevelToCompress(type, options, -14, -1, 0)) { + return; + } + Reopen(&options); + MinLevelHelper(this, options); + + // do not compress L0 and L1 + for (int i = 0; i < 2; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 2; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + DestroyAndReopen(&options); + MinLevelHelper(this, options); +} + +TEST(DBTest, 
MinLevelToCompress2) { + Options options = CurrentOptions(); + CompressionType type; + if (!MinLevelToCompress(type, options, 15, -1, 0)) { + return; + } + Reopen(&options); + MinLevelHelper(this, options); + + // do not compress L0 and L1 + for (int i = 0; i < 2; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 2; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + DestroyAndReopen(&options); + MinLevelHelper(this, options); +} + +TEST(DBTest, RepeatedWritesToSameKey) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + Reopen(&options); + + // We must have at most one file per level except for level-0, + // which may have up to kL0_StopWritesTrigger files. + const int kMaxFiles = dbfull()->NumberLevels() + + dbfull()->Level0StopWriteTrigger(); + + Random rnd(301); + std::string value = RandomString(&rnd, 2 * options.write_buffer_size); + for (int i = 0; i < 5 * kMaxFiles; i++) { + Put("key", value); + ASSERT_LE(TotalTableFiles(), kMaxFiles); + } + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdate) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + options.env = env_; + options.write_buffer_size = 100000; + + // Update key with values of smaller size + Reopen(&options); + int numValues = 10; + for (int i = numValues; i > 0; i--) { + std::string value = DummyString(i, 'a'); + ASSERT_OK(Put("key", value)); + ASSERT_EQ(value, Get("key")); + } + + int count = 0; + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + iter->SeekToFirst(); + ASSERT_EQ(iter->status().ok(), true); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + count++; + // All updates with the same sequence number. 
+ ASSERT_EQ(ikey.sequence, (unsigned)1); + iter->Next(); + } + // Only 1 instance for that key. + ASSERT_EQ(count, 1); + delete iter; + + // Update key with values of larger size + DestroyAndReopen(&options); + numValues = 10; + for (int i = 0; i < numValues; i++) { + std::string value = DummyString(i, 'a'); + ASSERT_OK(Put("key", value)); + ASSERT_EQ(value, Get("key")); + } + + count = 0; + iter = dbfull()->TEST_NewInternalIterator(); + iter->SeekToFirst(); + ASSERT_EQ(iter->status().ok(), true); + int seq = numValues; + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + count++; + // No inplace updates. All updates are puts with new seq number + ASSERT_EQ(ikey.sequence, (unsigned)seq--); + iter->Next(); + } + // All 10 updates exist in the internal iterator + ASSERT_EQ(count, numValues); + delete iter; + + + } while (ChangeCompactOptions()); +} + +// This is a static filter used for filtering +// kvs during the compaction process. 
+static int cfilter_count; +static std::string NEW_VALUE = "NewValue"; + +class KeepFilter : public CompactionFilter { + public: + virtual bool Filter(int level, const Slice& key, + const Slice& value, std::string* new_value, + bool* value_changed) const override { + cfilter_count++; + return false; + } + + virtual const char* Name() const override { + return "KeepFilter"; + } + +}; + +class DeleteFilter : public CompactionFilter { + public: + virtual bool Filter(int level, const Slice& key, + const Slice& value, std::string* new_value, + bool* value_changed) const override { + cfilter_count++; + return true; + } + + virtual const char* Name() const override { + return "DeleteFilter"; + } +}; + +class ChangeFilter : public CompactionFilter { + public: + explicit ChangeFilter(int argv) { + assert(argv == 100); + } + + virtual bool Filter(int level, const Slice& key, + const Slice& value, std::string* new_value, + bool* value_changed) const override { + assert(new_value != nullptr); + *new_value = NEW_VALUE; + *value_changed = true; + return false; + } + + virtual const char* Name() const override { + return "ChangeFilter"; + } +}; + +class KeepFilterFactory : public CompactionFilterFactory { + public: + virtual std::unique_ptr + CreateCompactionFilter(const CompactionFilter::Context& context) override { + return std::unique_ptr(new KeepFilter()); + } + + virtual const char* Name() const override { + return "KeepFilterFactory"; + } +}; + +class DeleteFilterFactory : public CompactionFilterFactory { + public: + virtual std::unique_ptr + CreateCompactionFilter(const CompactionFilter::Context& context) override { + return std::unique_ptr(new DeleteFilter()); + } + + virtual const char* Name() const override { + return "DeleteFilterFactory"; + } +}; + +class ChangeFilterFactory : public CompactionFilterFactory { + public: + explicit ChangeFilterFactory(int argv) : argv_(argv) {} + + virtual std::unique_ptr + CreateCompactionFilter(const CompactionFilter::Context& 
context) override { + return std::unique_ptr(new ChangeFilter(argv_)); + } + + virtual const char* Name() const override { + return "ChangeFilterFactory"; + } + + private: + const int argv_; +}; + +TEST(DBTest, CompactionFilter) { + Options options = CurrentOptions(); + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.compaction_filter_factory = std::make_shared(); + Reopen(&options); + + // Write 100K keys, these are written to a few files in L0. + const std::string value(10, 'x'); + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + dbfull()->TEST_FlushMemTable(); + + // Push all files to the highest level L2. Verify that + // the compaction at each level invokes the filter for + // all the keys in that level. + cfilter_count = 0; + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_EQ(cfilter_count, 100000); + cfilter_count = 0; + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_EQ(cfilter_count, 100000); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_NE(NumTableFilesAtLevel(2), 0); + cfilter_count = 0; + + // All the files are in the lowest level. + // Verify that all but one record has sequence number + // zero: only 100000 keys were written above, so the checks + // below expect total == 100000 and count == 1. The remaining + // record is at the tip of this snapshot and cannot + // be zeroed out. + // TODO: figure out sequence number squash too + int count = 0; + int total = 0; + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); + } + ASSERT_EQ(total, 100000); + ASSERT_EQ(count, 1); + delete iter; + + // overwrite all the 100K keys once again. 
+ for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + dbfull()->TEST_FlushMemTable(); + + // push all files to the highest level L2. This + // means that all keys should pass at least once + // via the compaction filter + cfilter_count = 0; + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_EQ(cfilter_count, 100000); + cfilter_count = 0; + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_EQ(cfilter_count, 100000); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_NE(NumTableFilesAtLevel(2), 0); + + // create a new database with the compaction + // filter in such a way that it deletes all keys + options.compaction_filter_factory = std::make_shared(); + options.create_if_missing = true; + DestroyAndReopen(&options); + + // write all the keys once again. + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + dbfull()->TEST_FlushMemTable(); + ASSERT_NE(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + + // Push all files to the highest level L2. This + // triggers the compaction filter to delete all keys, + // verify that at the end of the compaction process, + // nothing is left. 
+ cfilter_count = 0; + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_EQ(cfilter_count, 100000); + cfilter_count = 0; + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_EQ(cfilter_count, 0); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + + // Scan the entire database to ensure that nothing is left + iter = db_->NewIterator(ReadOptions()); + iter->SeekToFirst(); + count = 0; + while (iter->Valid()) { + count++; + iter->Next(); + } + ASSERT_EQ(count, 0); + delete iter; + + // The sequence number of the remaining record + // is not zeroed out even though it is at the + // level Lmax because this record is at the tip + // NOTE(review): the loop below ends with ASSERT_EQ(count, 0), i.e. it + // expects no internal entry to remain, so the text above looks stale. + // TODO: remove the following or design a different + // test + count = 0; + iter = dbfull()->TEST_NewInternalIterator(); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + ASSERT_NE(ikey.sequence, (unsigned)0); + count++; + iter->Next(); + } + ASSERT_EQ(count, 0); + delete iter; +} + +TEST(DBTest, CompactionFilterWithValueChange) { + do { + Options options = CurrentOptions(); + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.compaction_filter_factory = + std::make_shared(100); + Reopen(&options); + + // Write 100K+1 keys, these are written to a few files + // in L0. We do this so that the current snapshot points + // to the 100001st key. The compaction filter is not invoked + // on keys that are visible via a snapshot because we + // cannot delete them anyway. 
+ const std::string value(10, 'x'); + for (int i = 0; i < 100001; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + + // push all files to lower levels + dbfull()->TEST_FlushMemTable(); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + + // re-write all data again + for (int i = 0; i < 100001; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + + // push all files to lower levels. This should + // invoke the compaction filter for all 100000 keys. + dbfull()->TEST_FlushMemTable(); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + + // verify that all keys now have the new value that + // was set by the compaction process. + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + std::string newvalue = Get(key); + ASSERT_EQ(newvalue.compare(NEW_VALUE), 0); + } + } while (ChangeCompactOptions()); +} + +TEST(DBTest, SparseMerge) { + do { + Options options = CurrentOptions(); + options.compression = kNoCompression; + Reopen(&options); + + FillLevels("A", "Z"); + + // Suppose there is: + // small amount of data with prefix A + // large amount of data with prefix B + // small amount of data with prefix C + // and that recent updates have made small changes to all three prefixes. + // Check that we do not do a compaction that merges all of B in one shot. 
+ const std::string value(1000, 'x'); + Put("A", "va"); + // Write approximately 100MB of "B" values + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + Put("C", "vc"); + dbfull()->TEST_FlushMemTable(); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + + // Make sparse update + Put("A", "va2"); + Put("B100", "bvalue2"); + Put("C", "vc2"); + dbfull()->TEST_FlushMemTable(); + + // Compactions should not cause us to create a situation where + // a file overlaps too much data at the next level. + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + } while (ChangeCompactOptions()); +} + +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), + (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} + +TEST(DBTest, ApproximateSizes) { + do { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + DestroyAndReopen(); + + ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + Reopen(&options); + ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + const int N = 80; + static const int S1 = 100000; + static const int S2 = 105000; // Allow some expansion from metadata + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, S1))); + } + + // 0 because GetApproximateSizes() does not account for memtable space + ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); + + // 
Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + Reopen(&options); + + for (int compact_start = 0; compact_start < N; compact_start += 10) { + for (int i = 0; i < N; i += 10) { + ASSERT_TRUE(Between(Size("", Key(i)), S1*i, S2*i)); + ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), S1*(i+1), S2*(i+1))); + ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), S1*10, S2*10)); + } + ASSERT_TRUE(Between(Size("", Key(50)), S1*50, S2*50)); + ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), S1*50, S2*50)); + + std::string cstart_str = Key(compact_start); + std::string cend_str = Key(compact_start + 9); + Slice cstart = cstart_str; + Slice cend = cend_str; + dbfull()->TEST_CompactRange(0, &cstart, &cend); + } + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + } + } while (ChangeOptions(kSkipUniversalCompaction)); +} + +TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { + do { + Options options = CurrentOptions(); + options.compression = kNoCompression; + Reopen(); + + Random rnd(301); + std::string big1 = RandomString(&rnd, 100000); + ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(2), big1)); + ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(4), big1)); + ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); + ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); + + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + Reopen(&options); + + ASSERT_TRUE(Between(Size("", Key(0)), 0, 0)); + ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000)); + ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000)); + ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000)); + ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000)); + ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000)); + ASSERT_TRUE(Between(Size("", Key(6)), 240000, 
241000)); + ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000)); + ASSERT_TRUE(Between(Size("", Key(8)), 550000, 560000)); + + ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); + + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + } + } while (ChangeOptions()); +} + +TEST(DBTest, IteratorPinsRef) { + do { + Put("foo", "hello"); + + // Get iterator that will yield the current contents of the DB. + Iterator* iter = db_->NewIterator(ReadOptions()); + + // Write to force compactions + Put("foo", "newvalue1"); + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values + } + Put("foo", "newvalue2"); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("hello", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + delete iter; + } while (ChangeCompactOptions()); +} + +TEST(DBTest, Snapshot) { + do { + Put("foo", "v1"); + const Snapshot* s1 = db_->GetSnapshot(); + Put("foo", "v2"); + const Snapshot* s2 = db_->GetSnapshot(); + Put("foo", "v3"); + const Snapshot* s3 = db_->GetSnapshot(); + + Put("foo", "v4"); + ASSERT_EQ("v1", Get("foo", s1)); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v3", Get("foo", s3)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s3); + ASSERT_EQ("v1", Get("foo", s1)); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s1); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s2); + ASSERT_EQ("v4", Get("foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, HiddenValuesAreRemoved) { + do { + Random rnd(301); + FillLevels("a", "z"); + + std::string big = RandomString(&rnd, 50000); + Put("foo", big); + Put("pastfoo", "v"); + const Snapshot* snapshot = db_->GetSnapshot(); + Put("foo", "tiny"); + Put("pastfoo2", "v2"); // Advance sequence number one more + + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + 
ASSERT_GT(NumTableFilesAtLevel(0), 0); + + ASSERT_EQ(big, Get("foo", snapshot)); + ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000)); + db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]"); + Slice x("x"); + dbfull()->TEST_CompactRange(0, nullptr, &x); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GE(NumTableFilesAtLevel(1), 1); + dbfull()->TEST_CompactRange(1, nullptr, &x); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); + + ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); + } while (ChangeOptions(kSkipUniversalCompaction)); +} + +TEST(DBTest, CompactBetweenSnapshots) { + do { + Random rnd(301); + FillLevels("a", "z"); + + Put("foo", "first"); + const Snapshot* snapshot1 = db_->GetSnapshot(); + Put("foo", "second"); + Put("foo", "third"); + Put("foo", "fourth"); + const Snapshot* snapshot2 = db_->GetSnapshot(); + Put("foo", "fifth"); + Put("foo", "sixth"); + + // All entries (including duplicates) exist + // before any compaction is triggered. + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ("sixth", Get("foo")); + ASSERT_EQ("fourth", Get("foo", snapshot2)); + ASSERT_EQ("first", Get("foo", snapshot1)); + ASSERT_EQ(AllEntriesFor("foo"), + "[ sixth, fifth, fourth, third, second, first ]"); + + // After a compaction, "second", "third" and "fifth" should + // be removed + FillLevels("a", "z"); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ("sixth", Get("foo")); + ASSERT_EQ("fourth", Get("foo", snapshot2)); + ASSERT_EQ("first", Get("foo", snapshot1)); + ASSERT_EQ(AllEntriesFor("foo"), "[ sixth, fourth, first ]"); + + // after we release the snapshot1, only two values left + db_->ReleaseSnapshot(snapshot1); + FillLevels("a", "z"); + dbfull()->CompactRange(nullptr, nullptr); + + // We have only one valid snapshot snapshot2. Since snapshot1 is + // not valid anymore, "first" should be removed by a compaction. 
+ ASSERT_EQ("sixth", Get("foo")); + ASSERT_EQ("fourth", Get("foo", snapshot2)); + ASSERT_EQ(AllEntriesFor("foo"), "[ sixth, fourth ]"); + + // after we release the snapshot2, only one value should be left + db_->ReleaseSnapshot(snapshot2); + FillLevels("a", "z"); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ("sixth", Get("foo")); + ASSERT_EQ(AllEntriesFor("foo"), "[ sixth ]"); + + } while (ChangeOptions()); +} + +TEST(DBTest, DeletionMarkers1) { + Put("foo", "v1"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + const int last = dbfull()->MaxMemCompactionLevel(); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level + + // Place a table at level last-1 to prevent merging with preceding mutation + Put("a", "begin"); + Put("z", "end"); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); + ASSERT_EQ(NumTableFilesAtLevel(last-1), 1); + + Delete("foo"); + Put("foo", "v2"); + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Moves to level last-2 + if (CurrentOptions().purge_redundant_kvs_while_flush) { + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); + } else { + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + } + Slice z("z"); + dbfull()->TEST_CompactRange(last-2, nullptr, &z); + // DEL eliminated, but v1 remains because we aren't compacting that level + // (DEL can be eliminated because v2 hides v1). + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); + dbfull()->TEST_CompactRange(last-1, nullptr, nullptr); + // Merging last-1 w/ last, so we are the base level for "foo", so + // DEL is removed. (as is v1). 
+ ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); +} + +TEST(DBTest, DeletionMarkers2) { + Put("foo", "v1"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + const int last = dbfull()->MaxMemCompactionLevel(); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level + + // Place a table at level last-1 to prevent merging with preceding mutation + Put("a", "begin"); + Put("z", "end"); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); + ASSERT_EQ(NumTableFilesAtLevel(last-1), 1); + + Delete("foo"); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Moves to level last-2 + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(last-2, nullptr, nullptr); + // DEL kept: "last" file overlaps + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(last-1, nullptr, nullptr); + // Merging last-1 w/ last, so we are the base level for "foo", so + // DEL is removed. (as is v1). + ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); +} + +TEST(DBTest, OverlapInLevel0) { + do { + int tmp = dbfull()->MaxMemCompactionLevel(); + ASSERT_EQ(tmp, 2) << "Fix test to match config"; + + //Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0. + ASSERT_OK(Put("100", "v100")); + ASSERT_OK(Put("999", "v999")); + dbfull()->TEST_FlushMemTable(); + ASSERT_OK(Delete("100")); + ASSERT_OK(Delete("999")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("0,1,1", FilesPerLevel()); + + // Make files spanning the following ranges in level-0: + // files[0] 200 .. 900 + // files[1] 300 .. 500 + // Note that files are sorted by smallest key. 
+ ASSERT_OK(Put("300", "v300")); + ASSERT_OK(Put("500", "v500")); + dbfull()->TEST_FlushMemTable(); + ASSERT_OK(Put("200", "v200")); + ASSERT_OK(Put("600", "v600")); + ASSERT_OK(Put("900", "v900")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("2,1,1", FilesPerLevel()); + + // Compact away the placeholder files we created initially + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + dbfull()->TEST_CompactRange(2, nullptr, nullptr); + ASSERT_EQ("2", FilesPerLevel()); + + // Do a memtable compaction. Before bug-fix, the compaction would + // not detect the overlap with level-0 files and would incorrectly place + // the deletion in a deeper level. + ASSERT_OK(Delete("600")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("3", FilesPerLevel()); + ASSERT_EQ("NOT_FOUND", Get("600")); + } while (ChangeOptions(kSkipUniversalCompaction)); +} + +TEST(DBTest, L0_CompactionBug_Issue44_a) { + do { + Reopen(); + ASSERT_OK(Put("b", "v")); + Reopen(); + ASSERT_OK(Delete("b")); + ASSERT_OK(Delete("a")); + Reopen(); + ASSERT_OK(Delete("a")); + Reopen(); + ASSERT_OK(Put("a", "v")); + Reopen(); + Reopen(); + ASSERT_EQ("(a->v)", Contents()); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + ASSERT_EQ("(a->v)", Contents()); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, L0_CompactionBug_Issue44_b) { + do { + Reopen(); + Put("",""); + Reopen(); + Delete("e"); + Put("",""); + Reopen(); + Put("c", "cv"); + Reopen(); + Put("",""); + Reopen(); + Put("",""); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + Reopen(); + Put("d","dv"); + Reopen(); + Put("",""); + Reopen(); + Delete("d"); + Delete("b"); + Reopen(); + ASSERT_EQ("(->)(c->cv)", Contents()); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + ASSERT_EQ("(->)(c->cv)", Contents()); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, ComparatorCheck) { + class NewComparator : public Comparator { + public: + virtual const char* Name() const { return 
"rocksdb.NewComparator"; } + virtual int Compare(const Slice& a, const Slice& b) const { + return BytewiseComparator()->Compare(a, b); + } + virtual void FindShortestSeparator(std::string* s, const Slice& l) const { + BytewiseComparator()->FindShortestSeparator(s, l); + } + virtual void FindShortSuccessor(std::string* key) const { + BytewiseComparator()->FindShortSuccessor(key); + } + }; + Options new_options; + NewComparator cmp; + do { + new_options = CurrentOptions(); + new_options.comparator = &cmp; + Status s = TryReopen(&new_options); + ASSERT_TRUE(!s.ok()); + ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) + << s.ToString(); + } while (ChangeCompactOptions(&new_options)); +} + +TEST(DBTest, CustomComparator) { + class NumberComparator : public Comparator { + public: + virtual const char* Name() const { return "test.NumberComparator"; } + virtual int Compare(const Slice& a, const Slice& b) const { + return ToNumber(a) - ToNumber(b); + } + virtual void FindShortestSeparator(std::string* s, const Slice& l) const { + ToNumber(*s); // Check format + ToNumber(l); // Check format + } + virtual void FindShortSuccessor(std::string* key) const { + ToNumber(*key); // Check format + } + private: + static int ToNumber(const Slice& x) { + // Check that there are no extra characters. 
+ ASSERT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size()-1] == ']') + << EscapeString(x); + int val; + char ignored; + ASSERT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1) + << EscapeString(x); + return val; + } + }; + Options new_options; + NumberComparator cmp; + do { + new_options = CurrentOptions(); + new_options.create_if_missing = true; + new_options.comparator = &cmp; + new_options.filter_policy = nullptr; // Cannot use bloom filters + new_options.write_buffer_size = 1000; // Compact more often + DestroyAndReopen(&new_options); + ASSERT_OK(Put("[10]", "ten")); + ASSERT_OK(Put("[0x14]", "twenty")); + for (int i = 0; i < 2; i++) { + ASSERT_EQ("ten", Get("[10]")); + ASSERT_EQ("ten", Get("[0xa]")); + ASSERT_EQ("twenty", Get("[20]")); + ASSERT_EQ("twenty", Get("[0x14]")); + ASSERT_EQ("NOT_FOUND", Get("[15]")); + ASSERT_EQ("NOT_FOUND", Get("[0xf]")); + Compact("[0]", "[9999]"); + } + + for (int run = 0; run < 2; run++) { + for (int i = 0; i < 1000; i++) { + char buf[100]; + snprintf(buf, sizeof(buf), "[%d]", i*10); + ASSERT_OK(Put(buf, buf)); + } + Compact("[0]", "[1000000]"); + } + } while (ChangeCompactOptions(&new_options)); +} + +TEST(DBTest, ManualCompaction) { + ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) + << "Need to update this test to match kMaxMemCompactLevel"; + + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + MakeTables(3, "p", "q"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range falls before files + Compact("", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range falls after files + Compact("r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range overlaps files + Compact("p1", "p9"); + ASSERT_EQ("0,0,1", FilesPerLevel()); + + // Populate a different range + MakeTables(3, "c", "e"); + ASSERT_EQ("1,1,2", FilesPerLevel()); + + // Compact just the new range + Compact("b", "f"); + ASSERT_EQ("0,0,2", FilesPerLevel()); + + // Compact 
all + MakeTables(1, "a", "z"); + ASSERT_EQ("0,1,2", FilesPerLevel()); + db_->CompactRange(nullptr, nullptr); + ASSERT_EQ("0,0,1", FilesPerLevel()); + + if (iter == 0) { + Options options = CurrentOptions(); + options.num_levels = 3; + options.create_if_missing = true; + DestroyAndReopen(&options); + } + } + +} + +TEST(DBTest, DBOpen_Options) { + std::string dbname = test::TmpDir() + "/db_options_test"; + ASSERT_OK(DestroyDB(dbname, Options())); + + // Does not exist, and create_if_missing == false: error + DB* db = nullptr; + Options opts; + opts.create_if_missing = false; + Status s = DB::Open(opts, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr); + ASSERT_TRUE(db == nullptr); + + // Does not exist, and create_if_missing == true: OK + opts.create_if_missing = true; + s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + delete db; + db = nullptr; + + // Does exist, and error_if_exists == true: error + opts.create_if_missing = false; + opts.error_if_exists = true; + s = DB::Open(opts, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr); + ASSERT_TRUE(db == nullptr); + + // Does exist, and error_if_exists == false: OK + opts.create_if_missing = true; + opts.error_if_exists = false; + s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + delete db; + db = nullptr; +} + +TEST(DBTest, DBOpen_Change_NumLevels) { + std::string dbname = test::TmpDir() + "/db_change_num_levels"; + ASSERT_OK(DestroyDB(dbname, Options())); + Options opts; + Status s; + DB* db = nullptr; + opts.create_if_missing = true; + s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + db->Put(WriteOptions(), "a", "123"); + db->Put(WriteOptions(), "b", "234"); + db->CompactRange(nullptr, nullptr); + delete db; + db = nullptr; + + opts.create_if_missing = false; + opts.num_levels = 2; + s = DB::Open(opts, dbname, &db); + 
ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); + ASSERT_TRUE(db == nullptr); +} + +TEST(DBTest, DestroyDBMetaDatabase) { + std::string dbname = test::TmpDir() + "/db_meta"; + std::string metadbname = MetaDatabaseName(dbname, 0); + std::string metametadbname = MetaDatabaseName(metadbname, 0); + + // Destroy previous versions if they exist. Using the long way. + ASSERT_OK(DestroyDB(metametadbname, Options())); + ASSERT_OK(DestroyDB(metadbname, Options())); + ASSERT_OK(DestroyDB(dbname, Options())); + + // Setup databases + Options opts; + opts.create_if_missing = true; + DB* db = nullptr; + ASSERT_OK(DB::Open(opts, dbname, &db)); + delete db; + db = nullptr; + ASSERT_OK(DB::Open(opts, metadbname, &db)); + delete db; + db = nullptr; + ASSERT_OK(DB::Open(opts, metametadbname, &db)); + delete db; + db = nullptr; + + // Delete databases + ASSERT_OK(DestroyDB(dbname, Options())); + + // Check if deletion worked. + opts.create_if_missing = false; + ASSERT_TRUE(!(DB::Open(opts, dbname, &db)).ok()); + ASSERT_TRUE(!(DB::Open(opts, metadbname, &db)).ok()); + ASSERT_TRUE(!(DB::Open(opts, metametadbname, &db)).ok()); +} + +// Check that number of files does not grow when we are out of space +TEST(DBTest, NoSpace) { + do { + Options options = CurrentOptions(); + options.env = env_; + Reopen(&options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + Compact("a", "z"); + const int num_files = CountFiles(); + env_->no_space_.Release_Store(env_); // Force out-of-space errors + env_->sleep_counter_.Reset(); + for (int i = 0; i < 5; i++) { + for (int level = 0; level < dbfull()->NumberLevels()-1; level++) { + dbfull()->TEST_CompactRange(level, nullptr, nullptr); + } + } + env_->no_space_.Release_Store(nullptr); + ASSERT_LT(CountFiles(), num_files + 3); + + // Check that compaction attempts slept after errors + ASSERT_GE(env_->sleep_counter_.Read(), 5); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, NonWritableFileSystem) { + do { + 
Options options = CurrentOptions(); + options.write_buffer_size = 1000; + options.env = env_; + Reopen(&options); + ASSERT_OK(Put("foo", "v1")); + env_->non_writable_.Release_Store(env_); // Force errors for new files + std::string big(100000, 'x'); + int errors = 0; + for (int i = 0; i < 20; i++) { + if (!Put("foo", big).ok()) { + errors++; + env_->SleepForMicroseconds(100000); + } + } + ASSERT_GT(errors, 0); + env_->non_writable_.Release_Store(nullptr); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, ManifestWriteError) { + // Test for the following problem: + // (a) Compaction produces file F + // (b) Log record containing F is written to MANIFEST file, but Sync() fails + // (c) GC deletes F + // (d) After reopening DB, reads fail since deleted F is named in log record + + // We iterate twice. In the second iteration, everything is the + // same except the log record never makes it to the MANIFEST file. + for (int iter = 0; iter < 2; iter++) { + port::AtomicPointer* error_type = (iter == 0) + ? 
&env_->manifest_sync_error_ + : &env_->manifest_write_error_; + + // Insert foo=>bar mapping + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + DestroyAndReopen(&options); + ASSERT_OK(Put("foo", "bar")); + ASSERT_EQ("bar", Get("foo")); + + // Memtable compaction (will succeed) + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("bar", Get("foo")); + const int last = dbfull()->MaxMemCompactionLevel(); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level + + // Merging compaction (will fail) + error_type->Release_Store(env_); + dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + ASSERT_EQ("bar", Get("foo")); + + // Recovery: should not lose data + error_type->Release_Store(nullptr); + Reopen(&options); + ASSERT_EQ("bar", Get("foo")); + } +} + +TEST(DBTest, PutFailsParanoid) { + // Test the following: + // (a) A random put fails in paranoid mode (simulate by sync fail) + // (b) All other puts have to fail, even if writes would succeed + // (c) All of that should happen ONLY if paranoid_checks = true + + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + DestroyAndReopen(&options); + Status s; + + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + // simulate error + env_->log_write_error_.Release_Store(env_); + s = Put("foo2", "bar2"); + ASSERT_TRUE(!s.ok()); + env_->log_write_error_.Release_Store(nullptr); + s = Put("foo3", "bar3"); + // the next put should fail, too + ASSERT_TRUE(!s.ok()); + // but we're still able to read + ASSERT_EQ("bar", Get("foo")); + + // do the same thing with paranoid checks off + options.paranoid_checks = false; + DestroyAndReopen(&options); + + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + // simulate error + env_->log_write_error_.Release_Store(env_); + s = Put("foo2", 
"bar2"); + ASSERT_TRUE(!s.ok()); + env_->log_write_error_.Release_Store(nullptr); + s = Put("foo3", "bar3"); + // the next put should NOT fail + ASSERT_TRUE(s.ok()); +} + +TEST(DBTest, FilesDeletedAfterCompaction) { + do { + ASSERT_OK(Put("foo", "v2")); + Compact("a", "z"); + const int num_files = CountLiveFiles(); + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put("foo", "v2")); + Compact("a", "z"); + } + ASSERT_EQ(CountLiveFiles(), num_files); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, BloomFilter) { + do { + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.no_block_cache = true; + options.filter_policy = NewBloomFilterPolicy(10); + Reopen(&options); + + // Populate multiple layers + const int N = 10000; + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + Compact("a", "z"); + for (int i = 0; i < N; i += 100) { + ASSERT_OK(Put(Key(i), Key(i))); + } + dbfull()->TEST_FlushMemTable(); + + // Prevent auto compactions triggered by seeks + env_->delay_sstable_sync_.Release_Store(env_); + + // Lookup present keys. Should rarely read from small sstable. + env_->random_read_counter_.Reset(); + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i), Get(Key(i))); + } + int reads = env_->random_read_counter_.Read(); + fprintf(stderr, "%d present => %d reads\n", N, reads); + ASSERT_GE(reads, N); + ASSERT_LE(reads, N + 2*N/100); + + // Lookup present keys. Should rarely read from either sstable. 
+ env_->random_read_counter_.Reset(); + for (int i = 0; i < N; i++) { + ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing")); + } + reads = env_->random_read_counter_.Read(); + fprintf(stderr, "%d missing => %d reads\n", N, reads); + ASSERT_LE(reads, 3*N/100); + + env_->delay_sstable_sync_.Release_Store(nullptr); + Close(); + delete options.filter_policy; + } while (ChangeCompactOptions()); +} + +TEST(DBTest, SnapshotFiles) { + do { + Options options = CurrentOptions(); + const EnvOptions soptions; + options.write_buffer_size = 100000000; // Large write buffer + Reopen(&options); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + std::vector values; + for (int i = 0; i < 80; i++) { + values.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put(Key(i), values[i])); + } + + // assert that nothing makes it to disk yet. + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + // get a file snapshot + uint64_t manifest_number = 0; + uint64_t manifest_size = 0; + std::vector files; + dbfull()->DisableFileDeletions(); + dbfull()->GetLiveFiles(files, &manifest_size); + + // CURRENT, MANIFEST, *.sst files + ASSERT_EQ(files.size(), 3U); + + uint64_t number = 0; + FileType type; + + // copy these files to a new snapshot directory + std::string snapdir = dbname_ + ".snapdir/"; + std::string mkdir = "mkdir -p " + snapdir; + ASSERT_EQ(system(mkdir.c_str()), 0); + + for (unsigned int i = 0; i < files.size(); i++) { + // our clients require that GetLiveFiles returns + // files with "/" as first character! 
+ ASSERT_EQ(files[i][0], '/'); + std::string src = dbname_ + files[i]; + std::string dest = snapdir + files[i]; + + uint64_t size; + ASSERT_OK(env_->GetFileSize(src, &size)); + + // record the number and the size of the + // latest manifest file + if (ParseFileName(files[i].substr(1), &number, &type)) { + if (type == kDescriptorFile) { + if (number > manifest_number) { + manifest_number = number; + ASSERT_GE(size, manifest_size); + size = manifest_size; // copy only valid MANIFEST data + } + } + } + unique_ptr srcfile; + ASSERT_OK(env_->NewSequentialFile(src, &srcfile, soptions)); + unique_ptr destfile; + ASSERT_OK(env_->NewWritableFile(dest, &destfile, soptions)); + + char buffer[4096]; + Slice slice; + while (size > 0) { + uint64_t one = std::min(uint64_t(sizeof(buffer)), size); + ASSERT_OK(srcfile->Read(one, &slice, buffer)); + ASSERT_OK(destfile->Append(slice)); + size -= slice.size(); + } + ASSERT_OK(destfile->Close()); + } + + // release file snapshot: every live file has been copied, so undo the + // DisableFileDeletions() issued before GetLiveFiles() + dbfull()->EnableFileDeletions(); + + // overwrite one key, this key should not appear in the snapshot + std::vector extras; + for (unsigned int i = 0; i < 1; i++) { + extras.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put(Key(i), extras[i])); + } + + // verify that data in the snapshot are correct + Options opts; + DB* snapdb; + opts.create_if_missing = false; + Status stat = DB::Open(opts, snapdir, &snapdb); + ASSERT_OK(stat); + + ReadOptions roptions; + std::string val; + for (unsigned int i = 0; i < 80; i++) { + stat = snapdb->Get(roptions, Key(i), &val); + ASSERT_EQ(values[i].compare(val), 0); + } + delete snapdb; + + // look at the new live files after we added an 'extra' key + // and after we took the first snapshot. + uint64_t new_manifest_number = 0; + uint64_t new_manifest_size = 0; + std::vector newfiles; + dbfull()->DisableFileDeletions(); + dbfull()->GetLiveFiles(newfiles, &new_manifest_size); + + // find the new manifest file. 
assert that this manifest file is + // the same one as in the previous snapshot. But its size should be + // larger because we added an extra key after taking the + // previous snapshot. + for (unsigned int i = 0; i < newfiles.size(); i++) { + std::string src = dbname_ + "/" + newfiles[i]; + // record the lognumber and the size of the + // latest manifest file + if (ParseFileName(newfiles[i].substr(1), &number, &type)) { + if (type == kDescriptorFile) { + if (number > new_manifest_number) { + uint64_t size; + new_manifest_number = number; + ASSERT_OK(env_->GetFileSize(src, &size)); + ASSERT_GE(size, new_manifest_size); + } + } + } + } + ASSERT_EQ(manifest_number, new_manifest_number); + ASSERT_GT(new_manifest_size, manifest_size); + + // release file snapshot: re-enable deletions so the next option + // configuration does not start with them still disabled + dbfull()->EnableFileDeletions(); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, CompactOnFlush) { + do { + Options options = CurrentOptions(); + options.purge_redundant_kvs_while_flush = true; + options.disable_auto_compactions = true; + Reopen(&options); + + Put("foo", "v1"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ v1 ]"); + + // Write two new keys + Put("a", "begin"); + Put("z", "end"); + dbfull()->TEST_FlushMemTable(); + + // Case1: Delete followed by a put + Delete("foo"); + Put("foo", "v2"); + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + + // After the current memtable is flushed, the DEL should + // have been removed + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); + + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); + + // Case 2: Delete followed by another delete + Delete("foo"); + Delete("foo"); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, DEL, v2 ]"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v2 ]"); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); + + // Case 3: Put followed by a 
delete + Put("foo", "v3"); + Delete("foo"); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v3 ]"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL ]"); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); + + // Case 4: Put followed by another Put + Put("foo", "v4"); + Put("foo", "v5"); + ASSERT_EQ(AllEntriesFor("foo"), "[ v5, v4 ]"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ v5 ]"); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo"), "[ v5 ]"); + + // clear database + Delete("foo"); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); + + // Case 5: Put followed by snapshot followed by another Put + // Both puts should remain. + Put("foo", "v6"); + const Snapshot* snapshot = db_->GetSnapshot(); + Put("foo", "v7"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ v7, v6 ]"); + db_->ReleaseSnapshot(snapshot); + + // clear database + Delete("foo"); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); + + // Case 6: snapshot followed by a put followed by another Put + // Only the last put should remain. 
+ const Snapshot* snapshot1 = db_->GetSnapshot(); + Put("foo", "v8"); + Put("foo", "v9"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ v9 ]"); + db_->ReleaseSnapshot(snapshot1); + } while (ChangeCompactOptions()); +} + +// Returns the file numbers of every entry directly under `path` whose name +// parses as a kLogFile. Entries that fail to parse, or that parse as another +// file type, are skipped. +std::vector ListLogFiles(Env* env, const std::string& path) { + std::vector files; + std::vector log_files; + env->GetChildren(path, &files); + uint64_t number; + FileType type; + for (size_t i = 0; i < files.size(); ++i) { + if (ParseFileName(files[i], &number, &type)) { + if (type == kLogFile) { + log_files.push_back(number); + } + } + } + // Return the local by name: wrapping it in std::move would inhibit + // copy elision. + return log_files; +} + +TEST(DBTest, WALArchivalTtl) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.WAL_ttl_seconds = 1000; + DestroyAndReopen(&options); + + // TEST : Create DB with a ttl and no size limit. + // Put some keys. Count the log files present in the DB just after insert. + // Re-open db. Causes deletion/archival to take place. + // Assert that the files moved under "/archive". + // Reopen db with small ttl. + // Assert that archive was removed. 
+ + std::string archiveDir = ArchivalDirectory(dbname_); + + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < 10; ++j) { + ASSERT_OK(Put(Key(10 * i + j), DummyString(1024))); + } + + std::vector log_files = ListLogFiles(env_, dbname_); + + options.create_if_missing = false; + Reopen(&options); + + std::vector logs = ListLogFiles(env_, archiveDir); + std::set archivedFiles(logs.begin(), logs.end()); + + for (auto& log : log_files) { + ASSERT_TRUE(archivedFiles.find(log) != archivedFiles.end()); + } + } + + std::vector log_files = ListLogFiles(env_, archiveDir); + ASSERT_TRUE(log_files.size() > 0); + + options.WAL_ttl_seconds = 1; + env_->SleepForMicroseconds(2 * 1000 * 1000); + Reopen(&options); + + log_files = ListLogFiles(env_, archiveDir); + ASSERT_TRUE(log_files.empty()); + } while (ChangeCompactOptions()); +} + +uint64_t GetLogDirSize(std::string dir_path, SpecialEnv* env) { + uint64_t dir_size = 0; + std::vector files; + env->GetChildren(dir_path, &files); + for (auto& f : files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kLogFile) { + std::string const file_path = dir_path + "/" + f; + uint64_t file_size; + env->GetFileSize(file_path, &file_size); + dir_size += file_size; + } + } + return dir_size; +} + +TEST(DBTest, WALArchivalSizeLimit) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.WAL_ttl_seconds = 0; + options.WAL_size_limit_MB = 1000; + + // TEST : Create DB with huge size limit and no ttl. + // Put some keys. Count the archived log files present in the DB + // just after insert. Assert that there are many enough. + // Change size limit. Re-open db. + // Assert that archive is not greater than WAL_size_limit_MB. + // Set ttl and time_to_check_ to small values. Re-open db. + // Assert that there are no archived logs left. 
+ + DestroyAndReopen(&options); + for (int i = 0; i < 128 * 128; ++i) { + ASSERT_OK(Put(Key(i), DummyString(1024))); + } + Reopen(&options); + + std::string archive_dir = ArchivalDirectory(dbname_); + std::vector log_files = ListLogFiles(env_, archive_dir); + ASSERT_TRUE(log_files.size() > 2); + + options.WAL_size_limit_MB = 8; + Reopen(&options); + dbfull()->TEST_PurgeObsoleteteWAL(); + + uint64_t archive_size = GetLogDirSize(archive_dir, env_); + ASSERT_TRUE(archive_size <= options.WAL_size_limit_MB * 1024 * 1024); + + options.WAL_ttl_seconds = 1; + dbfull()->TEST_SetDefaultTimeToCheck(1); + env_->SleepForMicroseconds(2 * 1000 * 1000); + Reopen(&options); + dbfull()->TEST_PurgeObsoleteteWAL(); + + log_files = ListLogFiles(env_, archive_dir); + ASSERT_TRUE(log_files.empty()); + } while (ChangeCompactOptions()); +} + +SequenceNumber ReadRecords( + std::unique_ptr& iter, + int& count) { + count = 0; + SequenceNumber lastSequence = 0; + BatchResult res; + while (iter->Valid()) { + res = iter->GetBatch(); + ASSERT_TRUE(res.sequence > lastSequence); + ++count; + lastSequence = res.sequence; + ASSERT_OK(iter->status()); + iter->Next(); + } + return res.sequence; +} + +void ExpectRecords( + const int expected_no_records, + std::unique_ptr& iter) { + int num_records; + ReadRecords(iter, num_records); + ASSERT_EQ(num_records, expected_no_records); +} + +TEST(DBTest, TransactionLogIterator) { + do { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + Put("key1", DummyString(1024)); + Put("key2", DummyString(1024)); + Put("key2", DummyString(1024)); + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U); + { + auto iter = OpenTransactionLogIter(0); + ExpectRecords(3, iter); + } + Reopen(&options); + env_->SleepForMicroseconds(2 * 1000 * 1000);{ + Put("key4", DummyString(1024)); + Put("key5", DummyString(1024)); + Put("key6", DummyString(1024)); + } + { + auto iter = OpenTransactionLogIter(0); + ExpectRecords(6, iter); + } + } while 
(ChangeCompactOptions()); +} + +TEST(DBTest, TransactionLogIteratorMoveOverZeroFiles) { + do { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + // Do a plain Reopen. + Put("key1", DummyString(1024)); + // Two reopens should create a zero record WAL file. + Reopen(&options); + Reopen(&options); + + Put("key2", DummyString(1024)); + + auto iter = OpenTransactionLogIter(0); + ExpectRecords(2, iter); + } while (ChangeCompactOptions()); +} + +// TODO(kailiu) disable the in non-linux platforms to temporarily solve +// // the unit test failure. +#ifdef OS_LINUX +TEST(DBTest, TransactionLogIteratorStallAtLastRecord) { + do { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + Put("key1", DummyString(1024)); + auto iter = OpenTransactionLogIter(0); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); + Put("key2", DummyString(1024)); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + } while (ChangeCompactOptions()); +} +#endif + +TEST(DBTest, TransactionLogIteratorJustEmptyFile) { + do { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + unique_ptr iter; + Status status = dbfull()->GetUpdatesSince(0, &iter); + // Check that an empty iterator is returned + ASSERT_TRUE(!iter->Valid()); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, TransactionLogIteratorCheckAfterRestart) { + do { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + Put("key1", DummyString(1024)); + Put("key2", DummyString(1023)); + dbfull()->Flush(FlushOptions()); + Reopen(&options); + auto iter = OpenTransactionLogIter(0); + ExpectRecords(2, iter); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, TransactionLogIteratorCorruptedLog) { + do { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + for (int i = 0; i < 1024; i++) { + Put("key"+std::to_string(i), 
DummyString(10)); + } + dbfull()->Flush(FlushOptions()); + // Corrupt this log to create a gap + rocksdb::VectorLogPtr wal_files; + ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); + const auto logfilePath = dbname_ + "/" + wal_files.front()->PathName(); + ASSERT_EQ( + 0, + truncate(logfilePath.c_str(), wal_files.front()->SizeFileBytes() / 2)); + // Insert a new entry to a new log file + Put("key1025", DummyString(10)); + // Try to read from the beginning. Should stop before the gap and read less + // than 1025 entries + auto iter = OpenTransactionLogIter(0); + int count; + int last_sequence_read = ReadRecords(iter, count); + ASSERT_LT(last_sequence_read, 1025); + // Try to read past the gap, should be able to seek to key1025 + auto iter2 = OpenTransactionLogIter(last_sequence_read + 1); + ExpectRecords(1, iter2); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, TransactionLogIteratorBatchOperations) { + do { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + WriteBatch batch; + batch.Put("key1", DummyString(1024)); + batch.Put("key2", DummyString(1024)); + batch.Put("key3", DummyString(1024)); + batch.Delete("key2"); + dbfull()->Write(WriteOptions(), &batch); + dbfull()->Flush(FlushOptions()); + Reopen(&options); + Put("key4", DummyString(1024)); + auto iter = OpenTransactionLogIter(3); + ExpectRecords(2, iter); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, TransactionLogIteratorBlobs) { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + { + WriteBatch batch; + batch.Put("key1", DummyString(1024)); + batch.Put("key2", DummyString(1024)); + batch.PutLogData(Slice("blob1")); + batch.Put("key3", DummyString(1024)); + batch.PutLogData(Slice("blob2")); + batch.Delete("key2"); + dbfull()->Write(WriteOptions(), &batch); + Reopen(&options); + } + + auto res = OpenTransactionLogIter(0)->GetBatch(); + struct Handler : public WriteBatch::Handler { + std::string seen; + virtual void Put(const Slice& key, 
const Slice& value) { + seen += "Put(" + key.ToString() + ", " + std::to_string(value.size()) + + ")"; + } + virtual void Merge(const Slice& key, const Slice& value) { + seen += "Merge(" + key.ToString() + ", " + std::to_string(value.size()) + + ")"; + } + virtual void LogData(const Slice& blob) { + seen += "LogData(" + blob.ToString() + ")"; + } + virtual void Delete(const Slice& key) { + seen += "Delete(" + key.ToString() + ")"; + } + } handler; + res.writeBatchPtr->Iterate(&handler); + ASSERT_EQ("Put(key1, 1024)" + "Put(key2, 1024)" + "LogData(blob1)" + "Put(key3, 1024)" + "LogData(blob2)" + "Delete(key2)", handler.seen); +} + +TEST(DBTest, ReadCompaction) { + std::string value(4096, '4'); // a string of size 4K + { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.max_open_files = 20; // only 10 file in file-cache + options.target_file_size_base = 512; + options.write_buffer_size = 64 * 1024; + options.filter_policy = nullptr; + options.block_size = 4096; + options.no_block_cache = true; + + Reopen(&options); + + // Write 8MB (2000 values, each 4K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + std::vector values; + for (int i = 0; i < 2000; i++) { + ASSERT_OK(Put(Key(i), value)); + } + + // clear level 0 and 1 if necessary. 
+ dbfull()->TEST_FlushMemTable(); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + + // write some new keys into level 0 + for (int i = 0; i < 2000; i = i + 16) { + ASSERT_OK(Put(Key(i), value)); + } + dbfull()->Flush(FlushOptions()); + + // Wait for any write compaction to finish + dbfull()->TEST_WaitForCompact(); + + // remember number of files in each level + int l1 = NumTableFilesAtLevel(0); + int l2 = NumTableFilesAtLevel(1); + int l3 = NumTableFilesAtLevel(2); // level 2: the level the final check compares l3 against + ASSERT_NE(NumTableFilesAtLevel(0), 0); + ASSERT_NE(NumTableFilesAtLevel(1), 0); + ASSERT_NE(NumTableFilesAtLevel(2), 0); + + // read a bunch of times, trigger read compaction + for (int j = 0; j < 100; j++) { + for (int i = 0; i < 2000; i++) { + Get(Key(i)); + } + } + // wait for read compaction to finish + env_->SleepForMicroseconds(1000000); + + // verify that the number of files have decreased + // in some level, indicating that there was a compaction + ASSERT_TRUE(NumTableFilesAtLevel(0) < l1 || + NumTableFilesAtLevel(1) < l2 || + NumTableFilesAtLevel(2) < l3); + } +} + +// Multi-threaded test: +namespace { + +static const int kNumThreads = 4; +static const int kTestSeconds = 10; +static const int kNumKeys = 1000; + +struct MTState { + DBTest* test; + port::AtomicPointer stop; + port::AtomicPointer counter[kNumThreads]; + port::AtomicPointer thread_done[kNumThreads]; +}; + +struct MTThread { + MTState* state; + int id; +}; + +static void MTThreadBody(void* arg) { + MTThread* t = reinterpret_cast<MTThread*>(arg); + int id = t->id; + DB* db = t->state->test->db_; + uintptr_t counter = 0; + fprintf(stderr, "... 
starting thread %d\n", id); + Random rnd(1000 + id); + std::string value; + char valbuf[1500]; + while (t->state->stop.Acquire_Load() == nullptr) { + t->state->counter[id].Release_Store(reinterpret_cast<void*>(counter)); + + int key = rnd.Uniform(kNumKeys); + char keybuf[20]; + snprintf(keybuf, sizeof(keybuf), "%016d", key); + + if (rnd.OneIn(2)) { + // Write values of the form <key, my id, counter>. + // We add some padding to force compactions. + snprintf(valbuf, sizeof(valbuf), "%d.%d.%-1000d", + key, id, static_cast<int>(counter)); + ASSERT_OK(t->state->test->Put(Slice(keybuf), Slice(valbuf))); + } else { + // Read a value and verify that it matches the pattern written above. + Status s = db->Get(ReadOptions(), Slice(keybuf), &value); + if (s.IsNotFound()) { + // Key has not yet been written + } else { + // Check that the writer thread counter is >= the counter in the value + ASSERT_OK(s); + int k, w, c; + ASSERT_EQ(3, sscanf(value.c_str(), "%d.%d.%d", &k, &w, &c)) << value; + ASSERT_EQ(k, key); + ASSERT_GE(w, 0); + ASSERT_LT(w, kNumThreads); + ASSERT_LE((unsigned int)c, reinterpret_cast<uintptr_t>( + t->state->counter[w].Acquire_Load())); + } + } + counter++; + } + t->state->thread_done[id].Release_Store(t); + fprintf(stderr, "... 
stopping thread %d after %d ops\n", id, int(counter)); +} + +} // namespace + +TEST(DBTest, MultiThreaded) { + do { + // Initialize state + MTState mt; + mt.test = this; + mt.stop.Release_Store(0); + for (int id = 0; id < kNumThreads; id++) { + mt.counter[id].Release_Store(0); + mt.thread_done[id].Release_Store(0); + } + + // Start threads + MTThread thread[kNumThreads]; + for (int id = 0; id < kNumThreads; id++) { + thread[id].state = &mt; + thread[id].id = id; + env_->StartThread(MTThreadBody, &thread[id]); + } + + // Let them run for a while + env_->SleepForMicroseconds(kTestSeconds * 1000000); + + // Stop the threads and wait for them to finish + mt.stop.Release_Store(&mt); + for (int id = 0; id < kNumThreads; id++) { + while (mt.thread_done[id].Acquire_Load() == nullptr) { + env_->SleepForMicroseconds(100000); + } + } + } while (ChangeOptions()); +} + +// Group commit test: +namespace { + +static const int kGCNumThreads = 4; +static const int kGCNumKeys = 1000; + +struct GCThread { + DB* db; + int id; + std::atomic<bool> done; +}; + +static void GCThreadBody(void* arg) { + GCThread* t = reinterpret_cast<GCThread*>(arg); + int id = t->id; + DB* db = t->db; + WriteOptions wo; + + for (int i = 0; i < kGCNumKeys; ++i) { + std::string kv(std::to_string(i + id * kGCNumKeys)); + ASSERT_OK(db->Put(wo, kv, kv)); + } + t->done = true; +} + +} // namespace + +TEST(DBTest, GroupCommitTest) { + do { + // Start threads + GCThread thread[kGCNumThreads]; + for (int id = 0; id < kGCNumThreads; id++) { + thread[id].id = id; + thread[id].db = db_; + thread[id].done = false; + env_->StartThread(GCThreadBody, &thread[id]); + } + + for (int id = 0; id < kGCNumThreads; id++) { + while (thread[id].done == false) { + env_->SleepForMicroseconds(100000); + } + } + + std::vector<std::string> expected_db; + for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) { + expected_db.push_back(std::to_string(i)); + } + sort(expected_db.begin(), expected_db.end()); + + Iterator* itr = db_->NewIterator(ReadOptions()); + 
itr->SeekToFirst(); + for (auto x : expected_db) { + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ(itr->key().ToString(), x); + ASSERT_EQ(itr->value().ToString(), x); + itr->Next(); + } + ASSERT_TRUE(!itr->Valid()); + delete itr; + + } while (ChangeOptions()); +} + +namespace { +typedef std::map KVMap; +} + +class ModelDB: public DB { + public: + class ModelSnapshot : public Snapshot { + public: + KVMap map_; + }; + + explicit ModelDB(const Options& options): options_(options) { } + virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) { + return DB::Put(o, k, v); + } + virtual Status Merge(const WriteOptions& o, const Slice& k, const Slice& v) { + return DB::Merge(o, k, v); + } + virtual Status Delete(const WriteOptions& o, const Slice& key) { + return DB::Delete(o, key); + } + virtual Status Get(const ReadOptions& options, + const Slice& key, std::string* value) { + return Status::NotSupported(key); + } + + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) { + std::vector s(keys.size(), + Status::NotSupported("Not implemented.")); + return s; + } + virtual bool KeyMayExist(const ReadOptions& options, + const Slice& key, + std::string* value, + bool* value_found = nullptr) { + if (value_found != nullptr) { + *value_found = false; + } + return true; // Not Supported directly + } + virtual Iterator* NewIterator(const ReadOptions& options) { + if (options.snapshot == nullptr) { + KVMap* saved = new KVMap; + *saved = map_; + return new ModelIter(saved, true); + } else { + const KVMap* snapshot_state = + &(reinterpret_cast(options.snapshot)->map_); + return new ModelIter(snapshot_state, false); + } + } + virtual const Snapshot* GetSnapshot() { + ModelSnapshot* snapshot = new ModelSnapshot; + snapshot->map_ = map_; + return snapshot; + } + + virtual void ReleaseSnapshot(const Snapshot* snapshot) { + delete reinterpret_cast(snapshot); + } + virtual Status Write(const WriteOptions& options, WriteBatch* 
batch) { + class Handler : public WriteBatch::Handler { + public: + KVMap* map_; + virtual void Put(const Slice& key, const Slice& value) { + (*map_)[key.ToString()] = value.ToString(); + } + virtual void Merge(const Slice& key, const Slice& value) { + // ignore merge for now + //(*map_)[key.ToString()] = value.ToString(); + } + virtual void Delete(const Slice& key) { + map_->erase(key.ToString()); + } + }; + Handler handler; + handler.map_ = &map_; + return batch->Iterate(&handler); + } + + virtual bool GetProperty(const Slice& property, std::string* value) { + return false; + } + virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) { + for (int i = 0; i < n; i++) { + sizes[i] = 0; + } + } + virtual void CompactRange(const Slice* start, const Slice* end, + bool reduce_level, int target_level) { + } + + virtual int NumberLevels() + { + return 1; + } + + virtual int MaxMemCompactionLevel() + { + return 1; + } + + virtual int Level0StopWriteTrigger() + { + return -1; + } + + virtual const std::string& GetName() const { + return name_; + } + + virtual Env* GetEnv() const { + return nullptr; + } + + virtual const Options& GetOptions() const { + return options_; + } + + virtual Status Flush(const rocksdb::FlushOptions& options) { + Status ret; + return ret; + } + + virtual Status DisableFileDeletions() { + return Status::OK(); + } + virtual Status EnableFileDeletions(bool force) { + return Status::OK(); + } + virtual Status GetLiveFiles(std::vector&, uint64_t* size, + bool flush_memtable = true) { + return Status::OK(); + } + + virtual Status GetSortedWalFiles(VectorLogPtr& files) { + return Status::OK(); + } + + virtual Status DeleteFile(std::string name) { + return Status::OK(); + } + + virtual Status GetDbIdentity(std::string& identity) { + return Status::OK(); + } + + virtual SequenceNumber GetLatestSequenceNumber() const { + return 0; + } + virtual Status GetUpdatesSince(rocksdb::SequenceNumber, + unique_ptr*) { + return 
Status::NotSupported("Not supported in Model DB"); + } + + private: + class ModelIter: public Iterator { + public: + ModelIter(const KVMap* map, bool owned) + : map_(map), owned_(owned), iter_(map_->end()) { + } + ~ModelIter() { + if (owned_) delete map_; + } + virtual bool Valid() const { return iter_ != map_->end(); } + virtual void SeekToFirst() { iter_ = map_->begin(); } + virtual void SeekToLast() { + if (map_->empty()) { + iter_ = map_->end(); + } else { + iter_ = map_->find(map_->rbegin()->first); + } + } + virtual void Seek(const Slice& k) { + iter_ = map_->lower_bound(k.ToString()); + } + virtual void Next() { ++iter_; } + virtual void Prev() { --iter_; } + virtual Slice key() const { return iter_->first; } + virtual Slice value() const { return iter_->second; } + virtual Status status() const { return Status::OK(); } + private: + const KVMap* const map_; + const bool owned_; // Do we own map_ + KVMap::const_iterator iter_; + }; + const Options options_; + KVMap map_; + std::string name_ = ""; +}; + +static std::string RandomKey(Random* rnd, int minimum = 0) { + int len; + do { + len = (rnd->OneIn(3) + ? 1 // Short sometimes to encourage collisions + : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10))); + } while (len < minimum); + return test::RandomKey(rnd, len); +} + +static bool CompareIterators(int step, + DB* model, + DB* db, + const Snapshot* model_snap, + const Snapshot* db_snap) { + ReadOptions options; + options.snapshot = model_snap; + Iterator* miter = model->NewIterator(options); + options.snapshot = db_snap; + Iterator* dbiter = db->NewIterator(options); + bool ok = true; + int count = 0; + for (miter->SeekToFirst(), dbiter->SeekToFirst(); + ok && miter->Valid() && dbiter->Valid(); + miter->Next(), dbiter->Next()) { + count++; + if (miter->key().compare(dbiter->key()) != 0) { + fprintf(stderr, "step %d: Key mismatch: '%s' vs. 
'%s'\n", + step, + EscapeString(miter->key()).c_str(), + EscapeString(dbiter->key()).c_str()); + ok = false; + break; + } + + if (miter->value().compare(dbiter->value()) != 0) { + fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", + step, + EscapeString(miter->key()).c_str(), + EscapeString(miter->value()).c_str(), + EscapeString(dbiter->value()).c_str()); // model value vs. DB value + ok = false; + } + } + + if (ok) { + if (miter->Valid() != dbiter->Valid()) { + fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n", + step, miter->Valid(), dbiter->Valid()); + ok = false; + } + } + delete miter; + delete dbiter; + return ok; +} + +TEST(DBTest, Randomized) { + Random rnd(test::RandomSeed()); + do { + ModelDB model(CurrentOptions()); + const int N = 10000; + const Snapshot* model_snap = nullptr; + const Snapshot* db_snap = nullptr; + std::string k, v; + for (int step = 0; step < N; step++) { + // TODO(sanjay): Test Get() works + int p = rnd.Uniform(100); + int minimum = 0; + if (option_config_ == kHashSkipList) { + minimum = 1; + } + if (p < 45) { // Put + k = RandomKey(&rnd, minimum); + v = RandomString(&rnd, + rnd.OneIn(20) + ? 
100 + rnd.Uniform(100) + : rnd.Uniform(8)); + ASSERT_OK(model.Put(WriteOptions(), k, v)); + ASSERT_OK(db_->Put(WriteOptions(), k, v)); + + } else if (p < 90) { // Delete + k = RandomKey(&rnd, minimum); + ASSERT_OK(model.Delete(WriteOptions(), k)); + ASSERT_OK(db_->Delete(WriteOptions(), k)); + + + } else { // Multi-element batch + WriteBatch b; + const int num = rnd.Uniform(8); + for (int i = 0; i < num; i++) { + if (i == 0 || !rnd.OneIn(10)) { + k = RandomKey(&rnd, minimum); + } else { + // Periodically re-use the same key from the previous iter, so + // we have multiple entries in the write batch for the same key + } + if (rnd.OneIn(2)) { + v = RandomString(&rnd, rnd.Uniform(10)); + b.Put(k, v); + } else { + b.Delete(k); + } + } + ASSERT_OK(model.Write(WriteOptions(), &b)); + ASSERT_OK(db_->Write(WriteOptions(), &b)); + } + + if ((step % 100) == 0) { + ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); + ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); + // Save a snapshot from each DB this time that we'll use next + // time we compare things, to make sure the current state is + // preserved with the snapshot + if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); + if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); + + Reopen(); + ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); + + model_snap = model.GetSnapshot(); + db_snap = db_->GetSnapshot(); + } + } + if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); + if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); + } while (ChangeOptions(kSkipDeletesFilterFirst)); +} + +TEST(DBTest, MultiGetSimple) { + do { + ASSERT_OK(db_->Put(WriteOptions(),"k1","v1")); + ASSERT_OK(db_->Put(WriteOptions(),"k2","v2")); + ASSERT_OK(db_->Put(WriteOptions(),"k3","v3")); + ASSERT_OK(db_->Put(WriteOptions(),"k4","v4")); + ASSERT_OK(db_->Delete(WriteOptions(),"k4")); + ASSERT_OK(db_->Put(WriteOptions(),"k5","v5")); + 
ASSERT_OK(db_->Delete(WriteOptions(),"no_key")); + + std::vector keys(6); + keys[0] = "k1"; + keys[1] = "k2"; + keys[2] = "k3"; + keys[3] = "k4"; + keys[4] = "k5"; + keys[5] = "no_key"; + + std::vector values(20,"Temporary data to be overwritten"); + + std::vector s = db_->MultiGet(ReadOptions(),keys,&values); + ASSERT_EQ(values.size(),keys.size()); + ASSERT_EQ(values[0], "v1"); + ASSERT_EQ(values[1], "v2"); + ASSERT_EQ(values[2], "v3"); + ASSERT_EQ(values[4], "v5"); + + ASSERT_OK(s[0]); + ASSERT_OK(s[1]); + ASSERT_OK(s[2]); + ASSERT_TRUE(s[3].IsNotFound()); + ASSERT_OK(s[4]); + ASSERT_TRUE(s[5].IsNotFound()); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, MultiGetEmpty) { + do { + // Empty Key Set + std::vector keys; + std::vector values; + std::vector s = db_->MultiGet(ReadOptions(),keys,&values); + ASSERT_EQ((int)s.size(),0); + + // Empty Database, Empty Key Set + DestroyAndReopen(); + s = db_->MultiGet(ReadOptions(), keys, &values); + ASSERT_EQ((int)s.size(),0); + + // Empty Database, Search for Keys + keys.resize(2); + keys[0] = "a"; + keys[1] = "b"; + s = db_->MultiGet(ReadOptions(),keys,&values); + ASSERT_EQ((int)s.size(), 2); + ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound()); + } while (ChangeCompactOptions()); +} + +void PrefixScanInit(DBTest *dbtest) { + char buf[100]; + std::string keystr; + const int small_range_sstfiles = 5; + const int big_range_sstfiles = 5; + + // Generate 11 sst files with the following prefix ranges. + // GROUP 0: [0,10] (level 1) + // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0) + // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0) + // + // A seek with the previous API would do 11 random I/Os (to all the + // files). With the new API and a prefix filter enabled, we should + // only do 2 random I/O, to the 2 files containing the key. 
+ + // GROUP 0 + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", 10); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->dbfull()->TEST_FlushMemTable(); + dbtest->dbfull()->CompactRange(nullptr, nullptr); // move to level 1 + + // GROUP 1 + for (int i = 1; i <= small_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", i); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", i+1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->dbfull()->TEST_FlushMemTable(); + } + + // GROUP 2 + for (int i = 1; i <= big_range_sstfiles; i++) { + std::string keystr; + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", + small_range_sstfiles+i+1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->dbfull()->TEST_FlushMemTable(); + } +} + +TEST(DBTest, PrefixScan) { + ReadOptions ro = ReadOptions(); + int count; + Slice prefix; + Slice key; + char buf[100]; + Iterator* iter; + snprintf(buf, sizeof(buf), "03______:"); + prefix = Slice(buf, 8); + key = Slice(buf, 9); + auto prefix_extractor = NewFixedPrefixTransform(8); + // db configs + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.no_block_cache = true; + options.filter_policy = NewBloomFilterPolicy(10); + options.prefix_extractor = prefix_extractor; + options.whole_key_filtering = false; + options.disable_auto_compactions = true; + options.max_background_compactions = 2; + options.create_if_missing = true; + options.disable_seek_compaction = true; + options.memtable_factory.reset(NewHashSkipListRepFactory(prefix_extractor)); + + // prefix specified, with blooms: 2 RAND 
I/Os + // SeekToFirst + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + ro.prefix = &prefix; + iter = db_->NewIterator(ro); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_TRUE(iter->key().starts_with(prefix)); // unlike assert(), still checked when NDEBUG is defined + count++; + } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + + // prefix specified, with blooms: 2 RAND I/Os + // Seek + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + ro.prefix = &prefix; + iter = db_->NewIterator(ro); + for (iter->Seek(key); iter->Valid(); iter->Next()) { + ASSERT_TRUE(iter->key().starts_with(prefix)); + count++; + } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + + // no prefix specified: 11 RAND I/Os + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + iter = db_->NewIterator(ReadOptions()); + for (iter->Seek(prefix); iter->Valid(); iter->Next()) { + if (! 
iter->key().starts_with(prefix)) { + break; + } + count++; + } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 11); + Close(); + delete options.filter_policy; +} + +std::string MakeKey(unsigned int num) { + char buf[30]; + snprintf(buf, sizeof(buf), "%016u", num); + return std::string(buf); +} + +void BM_LogAndApply(int iters, int num_base_files) { + std::string dbname = test::TmpDir() + "/rocksdb_test_benchmark"; + ASSERT_OK(DestroyDB(dbname, Options())); + + DB* db = nullptr; + Options opts; + opts.create_if_missing = true; + Status s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + delete db; + db = nullptr; + + Env* env = Env::Default(); + + port::Mutex mu; + MutexLock l(&mu); + + InternalKeyComparator cmp(BytewiseComparator()); + Options options; + EnvOptions sopt; + VersionSet vset(dbname, &options, sopt, nullptr, &cmp); + ASSERT_OK(vset.Recover()); + VersionEdit vbase; + uint64_t fnum = 1; + for (int i = 0; i < num_base_files; i++) { + InternalKey start(MakeKey(2*fnum), 1, kTypeValue); + InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); + vbase.AddFile(2, fnum++, 1 /* file size */, start, limit, 1, 1); + } + ASSERT_OK(vset.LogAndApply(&vbase, &mu)); + + uint64_t start_micros = env->NowMicros(); + + for (int i = 0; i < iters; i++) { + VersionEdit vedit; + vedit.DeleteFile(2, fnum); + InternalKey start(MakeKey(2*fnum), 1, kTypeValue); + InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); + vedit.AddFile(2, fnum++, 1 /* file size */, start, limit, 1, 1); + ASSERT_OK(vset.LogAndApply(&vedit, &mu)); // fail fast rather than time edits that did not apply + } + uint64_t stop_micros = env->NowMicros(); + unsigned int us = stop_micros - start_micros; + char buf[16]; + snprintf(buf, sizeof(buf), "%d", num_base_files); + fprintf(stderr, + "BM_LogAndApply/%-6s %8d iters : %9u us (%7.0f us / iter)\n", + buf, iters, us, ((float)us) / iters); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + if (argc > 1 && 
std::string(argv[1]) == "--benchmark") { + rocksdb::BM_LogAndApply(1000, 1); + rocksdb::BM_LogAndApply(1000, 100); + rocksdb::BM_LogAndApply(1000, 10000); + rocksdb::BM_LogAndApply(100, 100000); + return 0; + } + + return rocksdb::test::RunAllTests(); +} diff --git a/db/dbformat.cc b/db/dbformat.cc new file mode 100644 index 00000000..3d7e6101 --- /dev/null +++ b/db/dbformat.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include "db/dbformat.h" +#include "port/port.h" +#include "util/coding.h" +#include "util/perf_context_imp.h" + +namespace rocksdb { + +static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { + assert(seq <= kMaxSequenceNumber); + assert(t <= kValueTypeForSeek); + return (seq << 8) | t; +} + +void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { + result->append(key.user_key.data(), key.user_key.size()); + PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); +} + +std::string ParsedInternalKey::DebugString(bool hex) const { + char buf[50]; + snprintf(buf, sizeof(buf), "' @ %llu : %d", + (unsigned long long) sequence, + int(type)); + std::string result = "'"; + result += user_key.ToString(hex); + result += buf; + return result; +} + +std::string InternalKey::DebugString(bool hex) const { + std::string result; + ParsedInternalKey parsed; + if (ParseInternalKey(rep_, &parsed)) { + result = parsed.DebugString(hex); + } else { + result = "(bad)"; + result.append(EscapeString(rep_)); + } + return result; +} + +const 
char* InternalKeyComparator::Name() const { + return name_.c_str(); +} + +int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); + BumpPerfCount(&perf_context.user_key_comparison_count); + if (r == 0) { + const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); + const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + +void InternalKeyComparator::FindShortestSeparator( + std::string* start, + const Slice& limit) const { + // Attempt to shorten the user portion of the key + Slice user_start = ExtractUserKey(*start); + Slice user_limit = ExtractUserKey(limit); + std::string tmp(user_start.data(), user_start.size()); + user_comparator_->FindShortestSeparator(&tmp, user_limit); + if (tmp.size() < user_start.size() && + user_comparator_->Compare(user_start, tmp) < 0) { + // User key has become shorter physically, but larger logically. + // Tack on the earliest possible number to the shortened user key. + PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); + assert(this->Compare(*start, tmp) < 0); + assert(this->Compare(tmp, limit) < 0); + start->swap(tmp); + } +} + +void InternalKeyComparator::FindShortSuccessor(std::string* key) const { + Slice user_key = ExtractUserKey(*key); + std::string tmp(user_key.data(), user_key.size()); + user_comparator_->FindShortSuccessor(&tmp); + if (tmp.size() < user_key.size() && + user_comparator_->Compare(user_key, tmp) < 0) { + // User key has become shorter physically, but larger logically. + // Tack on the earliest possible number to the shortened user key. 
+ PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); + assert(this->Compare(*key, tmp) < 0); + key->swap(tmp); + } +} + +const char* InternalFilterPolicy::Name() const { + return user_policy_->Name(); +} + +void InternalFilterPolicy::CreateFilter(const Slice* keys, int n, + std::string* dst) const { + // We rely on the fact that the code in table.cc does not mind us + // adjusting keys[]. + Slice* mkey = const_cast<Slice*>(keys); + for (int i = 0; i < n; i++) { + mkey[i] = ExtractUserKey(keys[i]); + // TODO(sanjay): Suppress dups? + } + user_policy_->CreateFilter(keys, n, dst); +} + +bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const { + return user_policy_->KeyMayMatch(ExtractUserKey(key), f); +} + +LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) { + size_t usize = user_key.size(); + size_t needed = usize + 13; // A conservative estimate + char* dst; + if (needed <= sizeof(space_)) { + dst = space_; + } else { + dst = new char[needed]; + } + start_ = dst; + dst = EncodeVarint32(dst, usize + 8); + kstart_ = dst; + memcpy(dst, user_key.data(), usize); + dst += usize; + EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek)); + dst += 8; + end_ = dst; +} + +} // namespace rocksdb diff --git a/db/dbformat.h b/db/dbformat.h new file mode 100644 index 00000000..64a2c9f0 --- /dev/null +++ b/db/dbformat.h @@ -0,0 +1,229 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/slice.h" +#include "rocksdb/table.h" +#include "rocksdb/types.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace rocksdb { + +class InternalKey; + +// Value types encoded as the last component of internal keys. +// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk +// data structures. +enum ValueType { + kTypeDeletion = 0x0, + kTypeValue = 0x1, + kTypeMerge = 0x2, + kTypeLogData = 0x3 +}; +// kValueTypeForSeek defines the ValueType that should be passed when +// constructing a ParsedInternalKey object for seeking to a particular +// sequence number (since we sort sequence numbers in decreasing order +// and the value type is embedded as the low 8 bits in the sequence +// number in internal keys, we need to use the highest-numbered +// ValueType, not the lowest). +static const ValueType kValueTypeForSeek = kTypeMerge; + +// We leave eight bits empty at the bottom so a type and sequence# +// can be packed together into 64-bits. +static const SequenceNumber kMaxSequenceNumber = + ((0x1ull << 56) - 1); + +struct ParsedInternalKey { + Slice user_key; + SequenceNumber sequence; + ValueType type; + + ParsedInternalKey() { } // Intentionally left uninitialized (for speed) + ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) + : user_key(u), sequence(seq), type(t) { } + std::string DebugString(bool hex = false) const; +}; + +// Return the length of the encoding of "key". +inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { + return key.user_key.size() + 8; +} + +// Append the serialization of "key" to *result. +extern void AppendInternalKey(std::string* result, + const ParsedInternalKey& key); + +// Attempt to parse an internal key from "internal_key". On success, +// stores the parsed data in "*result", and returns true. 
+// +// On error, returns false, leaves "*result" in an undefined state. +extern bool ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result); + +// Returns the user key portion of an internal key. +inline Slice ExtractUserKey(const Slice& internal_key) { + assert(internal_key.size() >= 8); + return Slice(internal_key.data(), internal_key.size() - 8); +} + +inline ValueType ExtractValueType(const Slice& internal_key) { + assert(internal_key.size() >= 8); + const size_t n = internal_key.size(); + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + unsigned char c = num & 0xff; + return static_cast(c); +} + +// A comparator for internal keys that uses a specified comparator for +// the user key portion and breaks ties by decreasing sequence number. +class InternalKeyComparator : public Comparator { + private: + const Comparator* user_comparator_; + std::string name_; + public: + explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c), + name_("rocksdb.InternalKeyComparator:" + + std::string(user_comparator_->Name())) { + } + + virtual const char* Name() const; + virtual int Compare(const Slice& a, const Slice& b) const; + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const; + virtual void FindShortSuccessor(std::string* key) const; + + const Comparator* user_comparator() const { return user_comparator_; } + + int Compare(const InternalKey& a, const InternalKey& b) const; +}; + +// Filter policy wrapper that converts from internal keys to user keys +class InternalFilterPolicy : public FilterPolicy { + private: + const FilterPolicy* const user_policy_; + public: + explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { } + virtual const char* Name() const; + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const; + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const; +}; + +// Modules in this directory should keep internal keys wrapped 
inside +// the following class instead of plain strings so that we do not +// incorrectly use string comparisons instead of an InternalKeyComparator. +class InternalKey { + private: + std::string rep_; + public: + InternalKey() { } // Leave rep_ as empty to indicate it is invalid + InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { + AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); + } + + void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } + Slice Encode() const { + assert(!rep_.empty()); + return rep_; + } + + Slice user_key() const { return ExtractUserKey(rep_); } + + void SetFrom(const ParsedInternalKey& p) { + rep_.clear(); + AppendInternalKey(&rep_, p); + } + + void Clear() { rep_.clear(); } + + std::string DebugString(bool hex = false) const; +}; + +inline int InternalKeyComparator::Compare( + const InternalKey& a, const InternalKey& b) const { + return Compare(a.Encode(), b.Encode()); +} + +inline bool ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result) { + const size_t n = internal_key.size(); + if (n < 8) return false; + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + unsigned char c = num & 0xff; + result->sequence = num >> 8; + result->type = static_cast<ValueType>(c); + result->user_key = Slice(internal_key.data(), n - 8); + return (c <= static_cast<unsigned char>(kValueTypeForSeek)); +} + +// Update the sequence number and value type in the internal key +inline void UpdateInternalKey(char* internal_key, + const size_t internal_key_size, + uint64_t seq, ValueType t) { + assert(internal_key_size >= 8); + char* seqtype = internal_key + internal_key_size - 8; + uint64_t newval = (seq << 8) | t; + EncodeFixed64(seqtype, newval); +} + +// Get the sequence number from the internal key +inline uint64_t GetInternalKeySeqno(const Slice& internal_key) { + const size_t n = internal_key.size(); + assert(n >= 8); + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + return num >> 8; +} + + +// A helper class useful 
for DBImpl::Get() +class LookupKey { + public: + // Initialize *this for looking up user_key at a snapshot with + // the specified sequence number. + LookupKey(const Slice& user_key, SequenceNumber sequence); + + ~LookupKey(); + + // Return a key suitable for lookup in a MemTable. + Slice memtable_key() const { return Slice(start_, end_ - start_); } + + // Return an internal key (suitable for passing to an internal iterator) + Slice internal_key() const { return Slice(kstart_, end_ - kstart_); } + + // Return the user key + Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); } + + private: + // We construct a char array of the form: + // klength varint32 <-- start_ + // userkey char[klength] <-- kstart_ + // tag uint64 + // <-- end_ + // The array is a suitable MemTable key. + // The suffix starting with "userkey" can be used as an InternalKey. + const char* start_; + const char* kstart_; + const char* end_; + char space_[200]; // Avoid allocation for short keys + + // No copying allowed + LookupKey(const LookupKey&); + void operator=(const LookupKey&); +}; + +inline LookupKey::~LookupKey() { + if (start_ != space_) delete[] start_; +} + +} // namespace rocksdb diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc new file mode 100644 index 00000000..b520f3c4 --- /dev/null +++ b/db/dbformat_test.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/dbformat.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace rocksdb { + +static std::string IKey(const std::string& user_key, + uint64_t seq, + ValueType vt) { + std::string encoded; + AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); + return encoded; +} + +static std::string Shorten(const std::string& s, const std::string& l) { + std::string result = s; + InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l); + return result; +} + +static std::string ShortSuccessor(const std::string& s) { + std::string result = s; + InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result); + return result; +} + +static void TestKey(const std::string& key, + uint64_t seq, + ValueType vt) { + std::string encoded = IKey(key, seq, vt); + + Slice in(encoded); + ParsedInternalKey decoded("", 0, kTypeValue); + + ASSERT_TRUE(ParseInternalKey(in, &decoded)); + ASSERT_EQ(key, decoded.user_key.ToString()); + ASSERT_EQ(seq, decoded.sequence); + ASSERT_EQ(vt, decoded.type); + + ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded)); +} + +class FormatTest { }; + +TEST(FormatTest, InternalKey_EncodeDecode) { + const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" }; + const uint64_t seq[] = { + 1, 2, 3, + (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1, + (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1, + (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1 + }; + for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) { + for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) { + TestKey(keys[k], seq[s], kTypeValue); + TestKey("hello", 1, kTypeDeletion); + } + } +} + +TEST(FormatTest, InternalKeyShortSeparator) { + // When user keys are same + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 99, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 101, kTypeValue))); + 
ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 100, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 100, kTypeDeletion))); + + // When user keys are misordered + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("bar", 99, kTypeValue))); + + // When user keys are different, but correctly ordered + ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("foo", 100, kTypeValue), + IKey("hello", 200, kTypeValue))); + + // When start user key is prefix of limit user key + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foobar", 200, kTypeValue))); + + // When limit user key is prefix of start user key + ASSERT_EQ(IKey("foobar", 100, kTypeValue), + Shorten(IKey("foobar", 100, kTypeValue), + IKey("foo", 200, kTypeValue))); +} + +TEST(FormatTest, InternalKeyShortestSuccessor) { + ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + ShortSuccessor(IKey("foo", 100, kTypeValue))); + ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue), + ShortSuccessor(IKey("\xff\xff", 100, kTypeValue))); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc new file mode 100644 index 00000000..14f0324c --- /dev/null +++ b/db/deletefile_test.cc @@ -0,0 +1,295 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/db.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "rocksdb/env.h" +#include "rocksdb/transaction_log.h" +#include +#include +#include +#include + +namespace rocksdb { + +class DeleteFileTest { + public: + std::string dbname_; + Options options_; + DB* db_; + Env* env_; + int numlevels_; + + DeleteFileTest() { + db_ = nullptr; + env_ = Env::Default(); + options_.write_buffer_size = 1024*1024*1000; + options_.target_file_size_base = 1024*1024*1000; + options_.max_bytes_for_level_base = 1024*1024*1000; + options_.WAL_ttl_seconds = 300; // Used to test log files + options_.WAL_size_limit_MB = 1024; // Used to test log files + dbname_ = test::TmpDir() + "/deletefile_test"; + options_.wal_dir = dbname_ + "/wal_files"; + + // clean up all the files that might have been there before + std::vector old_files; + env_->GetChildren(dbname_, &old_files); + for (auto file : old_files) { + env_->DeleteFile(dbname_ + "/" + file); + } + env_->GetChildren(options_.wal_dir, &old_files); + for (auto file : old_files) { + env_->DeleteFile(options_.wal_dir + "/" + file); + } + + DestroyDB(dbname_, options_); + numlevels_ = 7; + ASSERT_OK(ReopenDB(true)); + } + + Status ReopenDB(bool create) { + delete db_; + if (create) { + DestroyDB(dbname_, options_); + } + db_ = nullptr; + options_.create_if_missing = create; + return DB::Open(options_, dbname_, &db_); + } + + void CloseDB() { + delete db_; + } + + void AddKeys(int numkeys, int startkey = 0) { + WriteOptions options; + options.sync = false; + ReadOptions roptions; + for (int i = startkey; i < (numkeys + startkey) ; i++) { + std::string temp = std::to_string(i); + Slice key(temp); + Slice value(temp); + ASSERT_OK(db_->Put(options, key, value)); + } + } + + int numKeysInLevels( + std::vector &metadata, + std::vector *keysperlevel = nullptr) { + + if (keysperlevel != nullptr) 
{ + keysperlevel->resize(numlevels_); + } + + int numKeys = 0; + for (size_t i = 0; i < metadata.size(); i++) { + int startkey = atoi(metadata[i].smallestkey.c_str()); + int endkey = atoi(metadata[i].largestkey.c_str()); + int numkeysinfile = (endkey - startkey + 1); + numKeys += numkeysinfile; + if (keysperlevel != nullptr) { + (*keysperlevel)[(int)metadata[i].level] += numkeysinfile; + } + fprintf(stderr, "level %d name %s smallest %s largest %s\n", + metadata[i].level, metadata[i].name.c_str(), + metadata[i].smallestkey.c_str(), + metadata[i].largestkey.c_str()); + } + return numKeys; + } + + void CreateTwoLevels() { + AddKeys(50000, 10000); + DBImpl* dbi = reinterpret_cast(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_WaitForFlushMemTable()); + + AddKeys(50000, 10000); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_WaitForFlushMemTable()); + } + + void CheckFileTypeCounts(std::string& dir, + int required_log, + int required_sst, + int required_manifest) { + std::vector filenames; + env_->GetChildren(dir, &filenames); + + int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; + for (auto file : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(file, &number, &type)) { + log_cnt += (type == kLogFile); + sst_cnt += (type == kTableFile); + manifest_cnt += (type == kDescriptorFile); + } + } + ASSERT_EQ(required_log, log_cnt); + ASSERT_EQ(required_sst, sst_cnt); + ASSERT_EQ(required_manifest, manifest_cnt); + } + +}; + +TEST(DeleteFileTest, AddKeysAndQueryLevels) { + CreateTwoLevels(); + std::vector metadata; + std::vector keysinlevel; + db_->GetLiveFilesMetaData(&metadata); + + std::string level1file = ""; + int level1keycount = 0; + std::string level2file = ""; + int level2keycount = 0; + int level1index = 0; + int level2index = 1; + + ASSERT_EQ((int)metadata.size(), 2); + if (metadata[0].level == 2) { + level1index = 1; + level2index = 0; + } + + level1file = metadata[level1index].name; + int startkey = 
// Verifies that obsolete table files are purged after a manual compaction,
// and that they are kept alive while an iterator still references them.
TEST(DeleteFileTest, PurgeObsoleteFilesTest) {
  CreateTwoLevels();
  // there should be only one (empty) log file because CreateTwoLevels()
  // flushes the memtables to disk
  CheckFileTypeCounts(options_.wal_dir, 1, 0, 0);
  // 2 ssts, 1 manifest
  CheckFileTypeCounts(dbname_, 0, 2, 1);
  std::string first("0"), last("999999");
  Slice first_slice(first), last_slice(last);
  db_->CompactRange(&first_slice, &last_slice, true, 2);
  // 1 sst after compaction
  CheckFileTypeCounts(dbname_, 0, 1, 1);

  // this time, we keep an iterator alive
  // A failed reopen leaves db_ == nullptr, so stop here instead of
  // dereferencing it below.
  ASSERT_OK(ReopenDB(true));
  CreateTwoLevels();
  Iterator* itr = db_->NewIterator(ReadOptions());
  db_->CompactRange(&first_slice, &last_slice, true, 2);
  // 3 sst after compaction with live iterator
  CheckFileTypeCounts(dbname_, 0, 3, 1);
  delete itr;
  // 1 sst after iterator deletion
  CheckFileTypeCounts(dbname_, 0, 1, 1);

  CloseDB();
}
db_->GetLiveFilesMetaData(&metadata); + + std::string level2file = ""; + + ASSERT_EQ((int)metadata.size(), 2); + if (metadata[0].level == 1) { + level2file = metadata[1].name; + } else { + level2file = metadata[0].name; + } + + Status status = db_->DeleteFile(level2file); + fprintf(stdout, "Deletion status %s: %s\n", + level2file.c_str(), status.ToString().c_str()); + ASSERT_TRUE(status.ok()); + it->SeekToFirst(); + int numKeysIterated = 0; + while(it->Valid()) { + numKeysIterated++; + it->Next(); + } + ASSERT_EQ(numKeysIterated, 50000); + delete it; + CloseDB(); +} + +TEST(DeleteFileTest, DeleteLogFiles) { + AddKeys(10, 0); + VectorLogPtr logfiles; + db_->GetSortedWalFiles(logfiles); + ASSERT_GT(logfiles.size(), 0UL); + // Take the last log file which is expected to be alive and try to delete it + // Should not succeed because live logs are not allowed to be deleted + std::unique_ptr alive_log = std::move(logfiles.back()); + ASSERT_EQ(alive_log->Type(), kAliveLogFile); + ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName())); + fprintf(stdout, "Deleting alive log file %s\n", + alive_log->PathName().c_str()); + ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok()); + ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName())); + logfiles.clear(); + + // Call Flush to bring about a new working log file and add more keys + // Call Flush again to flush out memtable and move alive log to archived log + // and try to delete the archived log file + FlushOptions fopts; + db_->Flush(fopts); + AddKeys(10, 0); + db_->Flush(fopts); + db_->GetSortedWalFiles(logfiles); + ASSERT_GT(logfiles.size(), 0UL); + std::unique_ptr archived_log = std::move(logfiles.front()); + ASSERT_EQ(archived_log->Type(), kArchivedLogFile); + ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + + archived_log->PathName())); + fprintf(stdout, "Deleting archived log file %s\n", + archived_log->PathName().c_str()); + 
// Given a path, flatten the path name into "dest" (capacity "len" bytes,
// including the terminator):
//   * chars in {[0-9,a-z,A-Z,-,_,.]} are copied unchanged;
//   * any other char is replaced with '_', except at position 0 of "path",
//     where it is dropped (so a leading '/' does not produce a leading '_').
// At most len - 1 chars are stored, followed by a trailing '\0'.
// Return the number of chars stored in dest not including the trailing '\0'.
// If "dest" is null or "len" is not positive, nothing is written and 0 is
// returned.
static int FlattenPath(const std::string& path, char* dest, int len) {
  // Without room for the terminator, writing dest[0] would be out of bounds.
  if (dest == nullptr || len <= 0) {
    return 0;
  }

  int write_idx = 0;
  for (size_t i = 0; i < path.size() && write_idx < len - 1; ++i) {
    const char c = path[i];
    const bool keep = (c >= 'a' && c <= 'z') ||
                      (c >= '0' && c <= '9') ||
                      (c >= 'A' && c <= 'Z') ||
                      c == '-' ||
                      c == '.' ||
                      c == '_';
    if (keep) {
      dest[write_idx++] = c;
    } else if (i > 0) {
      dest[write_idx++] = '_';
    }
  }

  dest[write_idx] = '\0';
  return write_idx;
}
// Builds "<name>/<number>.<suffix>", with the number zero-padded to at
// least six decimal digits (e.g. MakeFileName("db", 5, "log") yields
// "db/000005.log").
static std::string MakeFileName(const std::string& name, uint64_t number,
                                const char* suffix) {
  char buf[100];
  // %llu expects unsigned long long; uint64_t may be a different type on
  // some platforms, hence the explicit cast.
  snprintf(buf, sizeof(buf), "/%06llu.%s",
           static_cast<unsigned long long>(number),
           suffix);
  return name + buf;
}
+std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts, + const std::string& db_path, const std::string& log_dir) { + char buf[50]; + snprintf(buf, sizeof(buf), "%llu", static_cast(ts)); + + if (log_dir.empty()) + return dbname + "/LOG.old." + buf; + + char flatten_db_path[256]; + FlattenPath(db_path, flatten_db_path, 256); + return log_dir + "/" + flatten_db_path + "_LOG.old." + buf; +} + +std::string MetaDatabaseName(const std::string& dbname, uint64_t number) { + char buf[100]; + snprintf(buf, sizeof(buf), "/METADB-%llu", + static_cast(number)); + return dbname + buf; +} + +std::string IdentityFileName(const std::string& dbname) { + return dbname + "/IDENTITY"; +} + +// Owned filenames have the form: +// dbname/IDENTITY +// dbname/CURRENT +// dbname/LOCK +// dbname/LOG +// dbname/LOG.old.[0-9]+ +// dbname/MANIFEST-[0-9]+ +// dbname/[0-9]+.(log|sst) +// dbname/METADB-[0-9]+ +// Disregards / at the beginning +bool ParseFileName(const std::string& fname, + uint64_t* number, + FileType* type, + WalFileType* log_type) { + Slice rest(fname); + if (fname.length() > 1 && fname[0] == '/') { + rest.remove_prefix(1); + } + if (rest == "IDENTITY") { + *number = 0; + *type = kIdentityFile; + } else if (rest == "CURRENT") { + *number = 0; + *type = kCurrentFile; + } else if (rest == "LOCK") { + *number = 0; + *type = kDBLockFile; + } else if (rest == "LOG" || rest == "LOG.old") { + *number = 0; + *type = kInfoLogFile; + } else if (rest.starts_with("LOG.old.")) { + uint64_t ts_suffix; + // sizeof also counts the trailing '\0'. 
+ rest.remove_prefix(sizeof("LOG.old.") - 1); + if (!ConsumeDecimalNumber(&rest, &ts_suffix)) { + return false; + } + *number = ts_suffix; + *type = kInfoLogFile; + } else if (rest.starts_with("MANIFEST-")) { + rest.remove_prefix(strlen("MANIFEST-")); + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (!rest.empty()) { + return false; + } + *type = kDescriptorFile; + *number = num; + } else if (rest.starts_with("METADB-")) { + rest.remove_prefix(strlen("METADB-")); + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (!rest.empty()) { + return false; + } + *type = kMetaDatabase; + *number = num; + } else { + // Avoid strtoull() to keep filename format independent of the + // current locale + bool archive_dir_found = false; + if (rest.starts_with(ARCHIVAL_DIR)) { + if (rest.size() <= ARCHIVAL_DIR.size()) { + return false; + } + rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also + if (log_type) { + *log_type = kArchivedLogFile; + } + archive_dir_found = true; + } + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + Slice suffix = rest; + if (suffix == Slice(".log")) { + *type = kLogFile; + if (log_type && !archive_dir_found) { + *log_type = kAliveLogFile; + } + } else if (archive_dir_found) { + return false; // Archive dir can contain only log files + } else if (suffix == Slice(".sst")) { + *type = kTableFile; + } else if (suffix == Slice(".dbtmp")) { + *type = kTempFile; + } else { + return false; + } + *number = num; + } + return true; +} + +Status SetCurrentFile(Env* env, const std::string& dbname, + uint64_t descriptor_number) { + // Remove leading "dbname/" and add newline to manifest file name + std::string manifest = DescriptorFileName(dbname, descriptor_number); + Slice contents = manifest; + assert(contents.starts_with(dbname + "/")); + contents.remove_prefix(dbname.size() + 1); + std::string tmp = TempFileName(dbname, descriptor_number); + 
Status s = WriteStringToFileSync(env, contents.ToString() + "\n", tmp); + if (s.ok()) { + s = env->RenameFile(tmp, CurrentFileName(dbname)); + } + if (!s.ok()) { + env->DeleteFile(tmp); + } + return s; +} + +Status SetIdentityFile(Env* env, const std::string& dbname) { + std::string id = env->GenerateUniqueId(); + assert(!id.empty()); + // Reserve the filename dbname/000000.dbtmp for the temporary identity file + std::string tmp = TempFileName(dbname, 0); + Status s = WriteStringToFileSync(env, id, tmp); + if (s.ok()) { + s = env->RenameFile(tmp, IdentityFileName(dbname)); + } + if (!s.ok()) { + env->DeleteFile(tmp); + } + return s; +} + +} // namespace rocksdb diff --git a/db/filename.h b/db/filename.h new file mode 100644 index 00000000..8e55f113 --- /dev/null +++ b/db/filename.h @@ -0,0 +1,108 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// File names used by DB code + +#pragma once +#include +#include +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/transaction_log.h" +#include "port/port.h" + +namespace rocksdb { + +class Env; + +enum FileType { + kLogFile, + kDBLockFile, + kTableFile, + kDescriptorFile, + kCurrentFile, + kTempFile, + kInfoLogFile, // Either the current one, or an old one + kMetaDatabase, + kIdentityFile +}; + +// Return the name of the log file with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". 
+extern std::string LogFileName(const std::string& dbname, uint64_t number); + +static const std::string ARCHIVAL_DIR = "archive"; + +extern std::string ArchivalDirectory(const std::string& dbname); + +// Return the name of the archived log file with the specified number +// in the db named by "dbname". The result will be prefixed with "dbname". +extern std::string ArchivedLogFileName(const std::string& dbname, + uint64_t num); + +// Return the name of the sstable with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string TableFileName(const std::string& dbname, uint64_t number); + +// Return the name of the descriptor file for the db named by +// "dbname" and the specified incarnation number. The result will be +// prefixed with "dbname". +extern std::string DescriptorFileName(const std::string& dbname, + uint64_t number); + +// Return the name of the current file. This file contains the name +// of the current manifest file. The result will be prefixed with +// "dbname". +extern std::string CurrentFileName(const std::string& dbname); + +// Return the name of the lock file for the db named by +// "dbname". The result will be prefixed with "dbname". +extern std::string LockFileName(const std::string& dbname); + +// Return the name of a temporary file owned by the db named "dbname". +// The result will be prefixed with "dbname". +extern std::string TempFileName(const std::string& dbname, uint64_t number); + +// Return the name of the info log file for "dbname". +extern std::string InfoLogFileName(const std::string& dbname, + const std::string& db_path="", const std::string& log_dir=""); + +// Return the name of the old info log file for "dbname". +extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts, + const std::string& db_path="", const std::string& log_dir=""); + +// Return the name to use for a metadatabase. The result will be prefixed with +// "dbname". 
+extern std::string MetaDatabaseName(const std::string& dbname, + uint64_t number); + +// Return the name of the Identity file which stores a unique number for the db +// that will get regenerated if the db loses all its data and is recreated fresh +// either from a backup-image or empty +extern std::string IdentityFileName(const std::string& dbname); + +// If filename is a rocksdb file, store the type of the file in *type. +// The number encoded in the filename is stored in *number. If the +// filename was successfully parsed, returns true. Else return false. +extern bool ParseFileName(const std::string& filename, + uint64_t* number, + FileType* type, + WalFileType* log_type = nullptr); + +// Make the CURRENT file point to the descriptor file with the +// specified number. +extern Status SetCurrentFile(Env* env, const std::string& dbname, + uint64_t descriptor_number); + +// Make the IDENTITY file for the db +extern Status SetIdentityFile(Env* env, const std::string& dbname); + +} // namespace rocksdb diff --git a/db/filename_test.cc b/db/filename_test.cc new file mode 100644 index 00000000..0baa7fda --- /dev/null +++ b/db/filename_test.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/filename.h" + +#include "db/dbformat.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace rocksdb { + +class FileNameTest { }; + +TEST(FileNameTest, Parse) { + Slice db; + FileType type; + uint64_t number; + + // Successful parses + static struct { + const char* fname; + uint64_t number; + FileType type; + } cases[] = { + { "100.log", 100, kLogFile }, + { "0.log", 0, kLogFile }, + { "0.sst", 0, kTableFile }, + { "CURRENT", 0, kCurrentFile }, + { "LOCK", 0, kDBLockFile }, + { "MANIFEST-2", 2, kDescriptorFile }, + { "MANIFEST-7", 7, kDescriptorFile }, + { "METADB-2", 2, kMetaDatabase }, + { "METADB-7", 7, kMetaDatabase }, + { "LOG", 0, kInfoLogFile }, + { "LOG.old", 0, kInfoLogFile }, + { "18446744073709551615.log", 18446744073709551615ull, kLogFile }, + }; + for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { + std::string f = cases[i].fname; + ASSERT_TRUE(ParseFileName(f, &number, &type)) << f; + ASSERT_EQ(cases[i].type, type) << f; + ASSERT_EQ(cases[i].number, number) << f; + } + + // Errors + static const char* errors[] = { + "", + "foo", + "foo-dx-100.log", + ".log", + "", + "manifest", + "CURREN", + "CURRENTX", + "MANIFES", + "MANIFEST", + "MANIFEST-", + "XMANIFEST-3", + "MANIFEST-3x", + "META", + "METADB", + "METADB-", + "XMETADB-3", + "METADB-3x", + "LOC", + "LOCKx", + "LO", + "LOGx", + "18446744073709551616.log", + "184467440737095516150.log", + "100", + "100.", + "100.lop" + }; + for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { + std::string f = errors[i]; + ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f; + }; +} + +TEST(FileNameTest, Construction) { + uint64_t number; + FileType type; + std::string fname; + + fname = CurrentFileName("foo"); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(0U, number); + ASSERT_EQ(kCurrentFile, type); + + fname = LockFileName("foo"); + 
ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(0U, number); + ASSERT_EQ(kDBLockFile, type); + + fname = LogFileName("foo", 192); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(192U, number); + ASSERT_EQ(kLogFile, type); + + fname = TableFileName("bar", 200); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(200U, number); + ASSERT_EQ(kTableFile, type); + + fname = DescriptorFileName("bar", 100); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(100U, number); + ASSERT_EQ(kDescriptorFile, type); + + fname = TempFileName("tmp", 999); + ASSERT_EQ("tmp/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(999U, number); + ASSERT_EQ(kTempFile, type); + + fname = MetaDatabaseName("met", 100); + ASSERT_EQ("met/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(100U, number); + ASSERT_EQ(kMetaDatabase, type); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/log_format.h b/db/log_format.h new file mode 100644 index 00000000..10a31ba2 --- /dev/null +++ b/db/log_format.h @@ -0,0 +1,36 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. +// +// Log format information shared by reader and writer. +// See ../doc/log_format.txt for more detail. + +#pragma once +namespace rocksdb { +namespace log { + +enum RecordType { + // Zero is reserved for preallocated files + kZeroType = 0, + + kFullType = 1, + + // For fragments + kFirstType = 2, + kMiddleType = 3, + kLastType = 4 +}; +static const int kMaxRecordType = kLastType; + +static const unsigned int kBlockSize = 32768; + +// Header is checksum (4 bytes), type (1 byte), length (2 bytes). +static const int kHeaderSize = 4 + 1 + 2; + +} // namespace log +} // namespace rocksdb diff --git a/db/log_reader.cc b/db/log_reader.cc new file mode 100644 index 00000000..6596cd84 --- /dev/null +++ b/db/log_reader.cc @@ -0,0 +1,264 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/log_reader.h" + +#include +#include "rocksdb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace rocksdb { +namespace log { + +Reader::Reporter::~Reporter() { +} + +Reader::Reader(unique_ptr&& file, Reporter* reporter, + bool checksum, uint64_t initial_offset) + : file_(std::move(file)), + reporter_(reporter), + checksum_(checksum), + backing_store_(new char[kBlockSize]), + buffer_(), + eof_(false), + last_record_offset_(0), + end_of_buffer_offset_(0), + initial_offset_(initial_offset) { +} + +Reader::~Reader() { + delete[] backing_store_; +} + +bool Reader::SkipToInitialBlock() { + size_t offset_in_block = initial_offset_ % kBlockSize; + uint64_t block_start_location = initial_offset_ - offset_in_block; + + // Don't search a block if we'd be in the trailer + if (offset_in_block > kBlockSize - 6) { + offset_in_block = 0; + block_start_location += kBlockSize; + } + + end_of_buffer_offset_ = block_start_location; + + // Skip to start of first block that can contain the initial record + if (block_start_location > 0) { + Status skip_status = file_->Skip(block_start_location); + if (!skip_status.ok()) { + ReportDrop(block_start_location, skip_status); + return false; + } + } + + return true; +} + +bool Reader::ReadRecord(Slice* record, std::string* scratch) { + if (last_record_offset_ < initial_offset_) { + if (!SkipToInitialBlock()) { + return false; + } + } + + scratch->clear(); + record->clear(); + bool in_fragmented_record = false; + // Record offset of the logical record that we're reading + // 0 is a dummy value to make compilers happy + uint64_t prospective_record_offset = 0; + + Slice fragment; + while (true) { + uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); + const unsigned int record_type = ReadPhysicalRecord(&fragment); + switch (record_type) { + case kFullType: + if (in_fragmented_record) { + // Handle bug in earlier versions of log::Writer where + // it could emit an empty kFirstType record at 
the tail end + // of a block followed by a kFullType or kFirstType record + // at the beginning of the next block. + if (scratch->empty()) { + in_fragmented_record = false; + } else { + ReportCorruption(scratch->size(), "partial record without end(1)"); + } + } + prospective_record_offset = physical_record_offset; + scratch->clear(); + *record = fragment; + last_record_offset_ = prospective_record_offset; + return true; + + case kFirstType: + if (in_fragmented_record) { + // Handle bug in earlier versions of log::Writer where + // it could emit an empty kFirstType record at the tail end + // of a block followed by a kFullType or kFirstType record + // at the beginning of the next block. + if (scratch->empty()) { + in_fragmented_record = false; + } else { + ReportCorruption(scratch->size(), "partial record without end(2)"); + } + } + prospective_record_offset = physical_record_offset; + scratch->assign(fragment.data(), fragment.size()); + in_fragmented_record = true; + break; + + case kMiddleType: + if (!in_fragmented_record) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(1)"); + } else { + scratch->append(fragment.data(), fragment.size()); + } + break; + + case kLastType: + if (!in_fragmented_record) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(2)"); + } else { + scratch->append(fragment.data(), fragment.size()); + *record = Slice(*scratch); + last_record_offset_ = prospective_record_offset; + return true; + } + break; + + case kEof: + if (in_fragmented_record) { + ReportCorruption(scratch->size(), "partial record without end(3)"); + scratch->clear(); + } + return false; + + case kBadRecord: + if (in_fragmented_record) { + ReportCorruption(scratch->size(), "error in middle of record"); + in_fragmented_record = false; + scratch->clear(); + } + break; + + default: { + char buf[40]; + snprintf(buf, sizeof(buf), "unknown record type %u", record_type); + ReportCorruption( + (fragment.size() + 
(in_fragmented_record ? scratch->size() : 0)), + buf); + in_fragmented_record = false; + scratch->clear(); + break; + } + } + } + return false; +} + +uint64_t Reader::LastRecordOffset() { + return last_record_offset_; +} + +void Reader::ReportCorruption(size_t bytes, const char* reason) { + ReportDrop(bytes, Status::Corruption(reason)); +} + +void Reader::ReportDrop(size_t bytes, const Status& reason) { + if (reporter_ != nullptr && + end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) { + reporter_->Corruption(bytes, reason); + } +} + +unsigned int Reader::ReadPhysicalRecord(Slice* result) { + while (true) { + if (buffer_.size() < (size_t)kHeaderSize) { + if (!eof_) { + // Last read was a full read, so this is a trailer to skip + buffer_.clear(); + Status status = file_->Read(kBlockSize, &buffer_, backing_store_); + end_of_buffer_offset_ += buffer_.size(); + if (!status.ok()) { + buffer_.clear(); + ReportDrop(kBlockSize, status); + eof_ = true; + return kEof; + } else if (buffer_.size() < (size_t)kBlockSize) { + eof_ = true; + } + continue; + } else if (buffer_.size() == 0) { + // End of file + return kEof; + } else { + size_t drop_size = buffer_.size(); + buffer_.clear(); + ReportCorruption(drop_size, "truncated record at end of file"); + return kEof; + } + } + + // Parse the header + const char* header = buffer_.data(); + const uint32_t a = static_cast(header[4]) & 0xff; + const uint32_t b = static_cast(header[5]) & 0xff; + const unsigned int type = header[6]; + const uint32_t length = a | (b << 8); + if (kHeaderSize + length > buffer_.size()) { + size_t drop_size = buffer_.size(); + buffer_.clear(); + ReportCorruption(drop_size, "bad record length"); + return kBadRecord; + } + + if (type == kZeroType && length == 0) { + // Skip zero length record without reporting any drops since + // such records are produced by the mmap based writing code in + // env_posix.cc that preallocates file regions. 
+ buffer_.clear(); + return kBadRecord; + } + + // Check crc + if (checksum_) { + uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); + uint32_t actual_crc = crc32c::Value(header + 6, 1 + length); + if (actual_crc != expected_crc) { + // Drop the rest of the buffer since "length" itself may have + // been corrupted and if we trust it, we could find some + // fragment of a real log record that just happens to look + // like a valid log record. + size_t drop_size = buffer_.size(); + buffer_.clear(); + ReportCorruption(drop_size, "checksum mismatch"); + return kBadRecord; + } + } + + buffer_.remove_prefix(kHeaderSize + length); + + // Skip physical record that started before initial_offset_ + if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length < + initial_offset_) { + result->clear(); + return kBadRecord; + } + + *result = Slice(header + kHeaderSize, length); + return type; + } +} + +} // namespace log +} // namespace rocksdb diff --git a/db/log_reader.h b/db/log_reader.h new file mode 100644 index 00000000..8e277c82 --- /dev/null +++ b/db/log_reader.h @@ -0,0 +1,124 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +#include "db/log_format.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +class SequentialFile; +using std::unique_ptr; + +namespace log { + +class Reader { + public: + // Interface for reporting errors. + class Reporter { + public: + virtual ~Reporter(); + + // Some corruption was detected. 
"size" is the approximate number + // of bytes dropped due to the corruption. + virtual void Corruption(size_t bytes, const Status& status) = 0; + }; + + // Create a reader that will return log records from "*file". + // "*file" must remain live while this Reader is in use. + // + // If "reporter" is non-nullptr, it is notified whenever some data is + // dropped due to a detected corruption. "*reporter" must remain + // live while this Reader is in use. + // + // If "checksum" is true, verify checksums if available. + // + // The Reader will start reading at the first record located at physical + // position >= initial_offset within the file. + Reader(unique_ptr&& file, Reporter* reporter, + bool checksum, uint64_t initial_offset); + + ~Reader(); + + // Read the next record into *record. Returns true if read + // successfully, false if we hit end of the input. May use + // "*scratch" as temporary storage. The contents filled in *record + // will only be valid until the next mutating operation on this + // reader or the next mutation to *scratch. + bool ReadRecord(Slice* record, std::string* scratch); + + // Returns the physical offset of the last record returned by ReadRecord. + // + // Undefined before the first call to ReadRecord. + uint64_t LastRecordOffset(); + + // returns true if the reader has encountered an eof condition. + bool IsEOF() { + return eof_; + } + + // when we know more data has been written to the file. we can use this + // function to force the reader to look again in the file. + void UnmarkEOF() { + eof_ = false; + } + + SequentialFile* file() { return file_.get(); } + + private: + const unique_ptr file_; + Reporter* const reporter_; + bool const checksum_; + char* const backing_store_; + Slice buffer_; + bool eof_; // Last Read() indicated EOF by returning < kBlockSize + + // Offset of the last record returned by ReadRecord. + uint64_t last_record_offset_; + // Offset of the first location past the end of buffer_. 
+ uint64_t end_of_buffer_offset_; + + // Offset at which to start looking for the first record to return + uint64_t const initial_offset_; + + // Extend record types with the following special values + enum { + kEof = kMaxRecordType + 1, + // Returned whenever we find an invalid physical record. + // Currently there are three situations in which this happens: + // * The record has an invalid CRC (ReadPhysicalRecord reports a drop) + // * The record is a 0-length record (No drop is reported) + // * The record is below constructor's initial_offset (No drop is reported) + kBadRecord = kMaxRecordType + 2 + }; + + // Skips all blocks that are completely before "initial_offset_". + // + // Returns true on success. Handles reporting. + bool SkipToInitialBlock(); + + // Return type, or one of the preceding special values + unsigned int ReadPhysicalRecord(Slice* result); + + // Reports dropped bytes to the reporter. + // buffer_ must be updated to remove the dropped bytes prior to invocation. + void ReportCorruption(size_t bytes, const char* reason); + void ReportDrop(size_t bytes, const Status& reason); + + // No copying allowed + Reader(const Reader&); + void operator=(const Reader&); +}; + +} // namespace log +} // namespace rocksdb diff --git a/db/log_test.cc b/db/log_test.cc new file mode 100644 index 00000000..dedbff0a --- /dev/null +++ b/db/log_test.cc @@ -0,0 +1,528 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "rocksdb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace rocksdb { +namespace log { + +// Construct a string of the specified length made out of the supplied +// partial string. +static std::string BigString(const std::string& partial_string, size_t n) { + std::string result; + while (result.size() < n) { + result.append(partial_string); + } + result.resize(n); + return result; +} + +// Construct a string from a number +static std::string NumberString(int n) { + char buf[50]; + snprintf(buf, sizeof(buf), "%d.", n); + return std::string(buf); +} + +// Return a skewed potentially long string +static std::string RandomSkewedString(int i, Random* rnd) { + return BigString(NumberString(i), rnd->Skewed(17)); +} + +class LogTest { + private: + class StringDest : public WritableFile { + public: + std::string contents_; + + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } + virtual Status Append(const Slice& slice) { + contents_.append(slice.data(), slice.size()); + return Status::OK(); + } + }; + + class StringSource : public SequentialFile { + public: + Slice contents_; + bool force_error_; + bool returned_partial_; + StringSource() : force_error_(false), returned_partial_(false) { } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error"; + + if (force_error_) { + force_error_ = false; + returned_partial_ = true; + return Status::Corruption("read error"); + } + + if (contents_.size() < n) { + n = contents_.size(); + returned_partial_ = true; + } + *result = Slice(contents_.data(), n); + contents_.remove_prefix(n); + return Status::OK(); + } + + virtual Status Skip(uint64_t n) { + if (n > contents_.size()) { + contents_.clear(); + return 
Status::NotFound("in-memory file skipped past end"); + } + + contents_.remove_prefix(n); + + return Status::OK(); + } + }; + + class ReportCollector : public Reader::Reporter { + public: + size_t dropped_bytes_; + std::string message_; + + ReportCollector() : dropped_bytes_(0) { } + virtual void Corruption(size_t bytes, const Status& status) { + dropped_bytes_ += bytes; + message_.append(status.ToString()); + } + }; + + std::string& dest_contents() { + auto dest = dynamic_cast(writer_.file()); + assert(dest); + return dest->contents_; + } + + const std::string& dest_contents() const { + auto dest = dynamic_cast(writer_.file()); + assert(dest); + return dest->contents_; + } + + void reset_source_contents() { + auto src = dynamic_cast(reader_.file()); + assert(src); + src->contents_ = dest_contents(); + } + + unique_ptr dest_holder_; + unique_ptr source_holder_; + ReportCollector report_; + bool reading_; + Writer writer_; + Reader reader_; + + // Record metadata for testing initial offset functionality + static size_t initial_offset_record_sizes_[]; + static uint64_t initial_offset_last_record_offsets_[]; + + public: + LogTest() : dest_holder_(new StringDest), + source_holder_(new StringSource), + reading_(false), + writer_(std::move(dest_holder_)), + reader_(std::move(source_holder_), &report_, true/*checksum*/, + 0/*initial_offset*/) { + } + + void Write(const std::string& msg) { + ASSERT_TRUE(!reading_) << "Write() after starting to read"; + writer_.AddRecord(Slice(msg)); + } + + size_t WrittenBytes() const { + return dest_contents().size(); + } + + std::string Read() { + if (!reading_) { + reading_ = true; + reset_source_contents(); + } + std::string scratch; + Slice record; + if (reader_.ReadRecord(&record, &scratch)) { + return record.ToString(); + } else { + return "EOF"; + } + } + + void IncrementByte(int offset, int delta) { + dest_contents()[offset] += delta; + } + + void SetByte(int offset, char new_byte) { + dest_contents()[offset] = new_byte; + } + +
void ShrinkSize(int bytes) { + dest_contents().resize(dest_contents().size() - bytes); + } + + void FixChecksum(int header_offset, int len) { + // Compute crc of type/len/data + uint32_t crc = crc32c::Value(&dest_contents()[header_offset+6], 1 + len); + crc = crc32c::Mask(crc); + EncodeFixed32(&dest_contents()[header_offset], crc); + } + + void ForceError() { + auto src = dynamic_cast(reader_.file()); + src->force_error_ = true; + } + + size_t DroppedBytes() const { + return report_.dropped_bytes_; + } + + std::string ReportMessage() const { + return report_.message_; + } + + // Returns OK iff recorded error message contains "msg" + std::string MatchError(const std::string& msg) const { + if (report_.message_.find(msg) == std::string::npos) { + return report_.message_; + } else { + return "OK"; + } + } + + void WriteInitialOffsetLog() { + for (int i = 0; i < 4; i++) { + std::string record(initial_offset_record_sizes_[i], + static_cast('a' + i)); + Write(record); + } + } + + void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) { + WriteInitialOffsetLog(); + reading_ = true; + unique_ptr source(new StringSource); + source->contents_ = dest_contents(); + unique_ptr offset_reader( + new Reader(std::move(source), &report_, true/*checksum*/, + WrittenBytes() + offset_past_end)); + Slice record; + std::string scratch; + ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch)); + } + + void CheckInitialOffsetRecord(uint64_t initial_offset, + int expected_record_offset) { + WriteInitialOffsetLog(); + reading_ = true; + unique_ptr source(new StringSource); + source->contents_ = dest_contents(); + unique_ptr offset_reader( + new Reader(std::move(source), &report_, true/*checksum*/, + initial_offset)); + Slice record; + std::string scratch; + ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); + ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset], + record.size()); + ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset], + 
offset_reader->LastRecordOffset()); + ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]); + } + +}; + +size_t LogTest::initial_offset_record_sizes_[] = + {10000, // Two sizable records in first block + 10000, + 2 * log::kBlockSize - 1000, // Span three blocks + 1}; + +uint64_t LogTest::initial_offset_last_record_offsets_[] = + {0, + kHeaderSize + 10000, + 2 * (kHeaderSize + 10000), + 2 * (kHeaderSize + 10000) + + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize}; + + +TEST(LogTest, Empty) { + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, ReadWrite) { + Write("foo"); + Write("bar"); + Write(""); + Write("xxxx"); + ASSERT_EQ("foo", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("xxxx", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("EOF", Read()); // Make sure reads at eof work +} + +TEST(LogTest, ManyBlocks) { + for (int i = 0; i < 100000; i++) { + Write(NumberString(i)); + } + for (int i = 0; i < 100000; i++) { + ASSERT_EQ(NumberString(i), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, Fragmentation) { + Write("small"); + Write(BigString("medium", 50000)); + Write(BigString("large", 100000)); + ASSERT_EQ("small", Read()); + ASSERT_EQ(BigString("medium", 50000), Read()); + ASSERT_EQ(BigString("large", 100000), Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, MarginalTrailer) { + // Make a trailer that is exactly the same length as an empty record. + const int n = kBlockSize - 2*kHeaderSize; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes()); + Write(""); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, MarginalTrailer2) { + // Make a trailer that is exactly the same length as an empty record. 
+ const int n = kBlockSize - 2*kHeaderSize; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes()); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(0U, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); +} + +TEST(LogTest, ShortTrailer) { + const int n = kBlockSize - 2*kHeaderSize + 4; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes()); + Write(""); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, AlignedEof) { + const int n = kBlockSize - 2*kHeaderSize + 4; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes()); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, RandomRead) { + const int N = 500; + Random write_rnd(301); + for (int i = 0; i < N; i++) { + Write(RandomSkewedString(i, &write_rnd)); + } + Random read_rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +// Tests of all the error paths in log_reader.cc follow: + +TEST(LogTest, ReadError) { + Write("foo"); + ForceError(); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes()); + ASSERT_EQ("OK", MatchError("read error")); +} + +TEST(LogTest, BadRecordType) { + Write("foo"); + // Type is stored in header[6] + IncrementByte(6, 100); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("unknown record type")); +} + +TEST(LogTest, TruncatedTrailingRecord) { + Write("foo"); + ShrinkSize(4); // Drop all payload as well as a header byte + ASSERT_EQ("EOF", Read()); + ASSERT_EQ((unsigned int)(kHeaderSize - 1), DroppedBytes()); + ASSERT_EQ("OK", MatchError("truncated record at end of 
file")); +} + +TEST(LogTest, BadLength) { + Write("foo"); + ShrinkSize(1); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ((unsigned int)(kHeaderSize + 2), DroppedBytes()); + ASSERT_EQ("OK", MatchError("bad record length")); +} + +TEST(LogTest, ChecksumMismatch) { + Write("foo"); + IncrementByte(0, 10); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(10U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("checksum mismatch")); +} + +TEST(LogTest, UnexpectedMiddleType) { + Write("foo"); + SetByte(6, kMiddleType); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("missing start")); +} + +TEST(LogTest, UnexpectedLastType) { + Write("foo"); + SetByte(6, kLastType); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("missing start")); +} + +TEST(LogTest, UnexpectedFullType) { + Write("foo"); + Write("bar"); + SetByte(6, kFirstType); + FixChecksum(0, 3); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("partial record without end")); +} + +TEST(LogTest, UnexpectedFirstType) { + Write("foo"); + Write(BigString("bar", 100000)); + SetByte(6, kFirstType); + FixChecksum(0, 3); + ASSERT_EQ(BigString("bar", 100000), Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("partial record without end")); +} + +TEST(LogTest, ErrorJoinsRecords) { + // Consider two fragmented records: + // first(R1) last(R1) first(R2) last(R2) + // where the middle two fragments disappear. We do not want + // first(R1),last(R2) to get joined and returned as a valid record. 
+ + // Write records that span two blocks + Write(BigString("foo", kBlockSize)); + Write(BigString("bar", kBlockSize)); + Write("correct"); + + // Wipe the middle block + for (unsigned int offset = kBlockSize; offset < 2*kBlockSize; offset++) { + SetByte(offset, 'x'); + } + + ASSERT_EQ("correct", Read()); + ASSERT_EQ("EOF", Read()); + const unsigned int dropped = DroppedBytes(); + ASSERT_LE(dropped, 2*kBlockSize + 100); + ASSERT_GE(dropped, 2*kBlockSize); +} + +TEST(LogTest, ReadStart) { + CheckInitialOffsetRecord(0, 0); +} + +TEST(LogTest, ReadSecondOneOff) { + CheckInitialOffsetRecord(1, 1); +} + +TEST(LogTest, ReadSecondTenThousand) { + CheckInitialOffsetRecord(10000, 1); +} + +TEST(LogTest, ReadSecondStart) { + CheckInitialOffsetRecord(10007, 1); +} + +TEST(LogTest, ReadThirdOneOff) { + CheckInitialOffsetRecord(10008, 2); +} + +TEST(LogTest, ReadThirdStart) { + CheckInitialOffsetRecord(20014, 2); +} + +TEST(LogTest, ReadFourthOneOff) { + CheckInitialOffsetRecord(20015, 3); +} + +TEST(LogTest, ReadFourthFirstBlockTrailer) { + CheckInitialOffsetRecord(log::kBlockSize - 4, 3); +} + +TEST(LogTest, ReadFourthMiddleBlock) { + CheckInitialOffsetRecord(log::kBlockSize + 1, 3); +} + +TEST(LogTest, ReadFourthLastBlock) { + CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3); +} + +TEST(LogTest, ReadFourthStart) { + // Exact start offset of the fourth record; must agree with + // initial_offset_last_record_offsets_[3] (the first two records are + // 10000 bytes each, not 1000). + CheckInitialOffsetRecord( + 2 * (kHeaderSize + 10000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize, + 3); +} + +TEST(LogTest, ReadEnd) { + CheckOffsetPastEndReturnsNoRecords(0); +} + +TEST(LogTest, ReadPastEnd) { + CheckOffsetPastEndReturnsNoRecords(5); +} + +} // namespace log +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/log_writer.cc b/db/log_writer.cc new file mode 100644 index 00000000..df601a47 --- /dev/null +++ b/db/log_writer.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/log_writer.h" + +#include +#include "rocksdb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace rocksdb { +namespace log { + +Writer::Writer(unique_ptr&& dest) + : dest_(std::move(dest)), + block_offset_(0) { + for (int i = 0; i <= kMaxRecordType; i++) { + char t = static_cast(i); + type_crc_[i] = crc32c::Value(&t, 1); + } +} + +Writer::~Writer() { +} + +Status Writer::AddRecord(const Slice& slice) { + const char* ptr = slice.data(); + size_t left = slice.size(); + + // Fragment the record if necessary and emit it. Note that if slice + // is empty, we still want to iterate once to emit a single + // zero-length record + Status s; + bool begin = true; + do { + const int leftover = kBlockSize - block_offset_; + assert(leftover >= 0); + if (leftover < kHeaderSize) { + // Switch to a new block + if (leftover > 0) { + // Fill the trailer (literal below relies on kHeaderSize being 7) + assert(kHeaderSize == 7); + dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover)); + } + block_offset_ = 0; + } + + // Invariant: we never leave < kHeaderSize bytes in a block. + assert(kBlockSize - block_offset_ - kHeaderSize >= 0); + + const size_t avail = kBlockSize - block_offset_ - kHeaderSize; + const size_t fragment_length = (left < avail) ? 
left : avail; + + RecordType type; + const bool end = (left == fragment_length); + if (begin && end) { + type = kFullType; + } else if (begin) { + type = kFirstType; + } else if (end) { + type = kLastType; + } else { + type = kMiddleType; + } + + s = EmitPhysicalRecord(type, ptr, fragment_length); + ptr += fragment_length; + left -= fragment_length; + begin = false; + } while (s.ok() && left > 0); + return s; +} + +Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { + assert(n <= 0xffff); // Must fit in two bytes + assert(block_offset_ + kHeaderSize + n <= kBlockSize); + + // Format the header + char buf[kHeaderSize]; + buf[4] = static_cast(n & 0xff); + buf[5] = static_cast(n >> 8); + buf[6] = static_cast(t); + + // Compute the crc of the record type and the payload. + uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n); + crc = crc32c::Mask(crc); // Adjust for storage + EncodeFixed32(buf, crc); + + // Write the header and the payload + Status s = dest_->Append(Slice(buf, kHeaderSize)); + if (s.ok()) { + s = dest_->Append(Slice(ptr, n)); + if (s.ok()) { + s = dest_->Flush(); + } + } + block_offset_ += kHeaderSize + n; + return s; +} + +} // namespace log +} // namespace rocksdb diff --git a/db/log_writer.h b/db/log_writer.h new file mode 100644 index 00000000..d7b7afff --- /dev/null +++ b/db/log_writer.h @@ -0,0 +1,55 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include +#include +#include "db/log_format.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +class WritableFile; + +using std::unique_ptr; + +namespace log { + +class Writer { + public: + // Create a writer that will append data to "*dest". + // "*dest" must be initially empty. + // The Writer takes ownership of "dest" (it is moved into dest_). + explicit Writer(unique_ptr&& dest); + ~Writer(); + + // Appends "slice" as one logical record, fragmenting it across blocks + // when it does not fit in the current block. Returns the status of the + // underlying file writes. + Status AddRecord(const Slice& slice); + + // Non-owning access to the destination file. + WritableFile* file() { return dest_.get(); } + const WritableFile* file() const { return dest_.get(); } + + private: + unique_ptr dest_; + int block_offset_; // Current offset in block + + // crc32c values for all supported record types. These are + // pre-computed to reduce the overhead of computing the crc of the + // record type stored in the header. + uint32_t type_crc_[kMaxRecordType + 1]; + + // Writes one physical record (header followed by "length" payload bytes + // from "ptr") of the given type, then flushes the file. + Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length); + + // No copying allowed + Writer(const Writer&); + void operator=(const Writer&); +}; + +} // namespace log +} // namespace rocksdb diff --git a/db/memtable.cc b/db/memtable.cc new file mode 100644 index 00000000..baff4fb3 --- /dev/null +++ b/db/memtable.cc @@ -0,0 +1,358 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "db/memtable.h" + +#include + +#include "db/dbformat.h" +#include "db/merge_context.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/merge_operator.h" +#include "util/coding.h" +#include "util/mutexlock.h" +#include "util/murmurhash.h" +#include "util/statistics_imp.h" + +namespace std { +template <> +struct hash { + size_t operator()(const rocksdb::Slice& slice) const { + return MurmurHash(slice.data(), slice.size(), 0); + } +}; +} + +namespace rocksdb { + +MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) + : comparator_(cmp), + refs_(0), + arena_impl_(options.arena_block_size), + table_(options.memtable_factory->CreateMemTableRep(comparator_, + &arena_impl_)), + flush_in_progress_(false), + flush_completed_(false), + file_number_(0), + first_seqno_(0), + mem_next_logfile_number_(0), + mem_logfile_number_(0), + locks_(options.inplace_update_support ? options.inplace_update_num_locks + : 0) {} + +MemTable::~MemTable() { + assert(refs_ == 0); +} + +size_t MemTable::ApproximateMemoryUsage() { + return arena_impl_.ApproximateMemoryUsage() + + table_->ApproximateMemoryUsage(); +} + +int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) + const { + // Internal keys are encoded as length-prefixed strings. + Slice a = GetLengthPrefixedSlice(aptr); + Slice b = GetLengthPrefixedSlice(bptr); + return comparator.Compare(a, b); +} + +Slice MemTableRep::UserKey(const char* key) const { + Slice slice = GetLengthPrefixedSlice(key); + return Slice(slice.data(), slice.size() - 8); +} + +// Encode a suitable internal key target for "target" and return it. +// Uses *scratch as scratch space, and the returned pointer will point +// into this scratch space. 
+static const char* EncodeKey(std::string* scratch, const Slice& target) { + scratch->clear(); + PutVarint32(scratch, target.size()); + scratch->append(target.data(), target.size()); + return scratch->data(); +} + +class MemTableIterator: public Iterator { + public: + MemTableIterator(MemTableRep* table, const ReadOptions& options) + : iter_() { + if (options.prefix) { + iter_ = table->GetPrefixIterator(*options.prefix); + } else if (options.prefix_seek) { + iter_ = table->GetDynamicPrefixIterator(); + } else { + iter_ = table->GetIterator(); + } + } + + virtual bool Valid() const { return iter_->Valid(); } + virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } + virtual void SeekToFirst() { iter_->SeekToFirst(); } + virtual void SeekToLast() { iter_->SeekToLast(); } + virtual void Next() { iter_->Next(); } + virtual void Prev() { iter_->Prev(); } + virtual Slice key() const { + return GetLengthPrefixedSlice(iter_->key()); + } + virtual Slice value() const { + Slice key_slice = GetLengthPrefixedSlice(iter_->key()); + return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); + } + + virtual Status status() const { return Status::OK(); } + + private: + std::shared_ptr iter_; + std::string tmp_; // For passing to EncodeKey + + // No copying allowed + MemTableIterator(const MemTableIterator&); + void operator=(const MemTableIterator&); +}; + +Iterator* MemTable::NewIterator(const ReadOptions& options) { + return new MemTableIterator(table_.get(), options); +} + +port::RWMutex* MemTable::GetLock(const Slice& key) { + return &locks_[std::hash()(key) % locks_.size()]; +} + +void MemTable::Add(SequenceNumber s, ValueType type, + const Slice& key, + const Slice& value) { + // Format of an entry is concatenation of: + // key_size : varint32 of internal_key.size() + // key bytes : char[internal_key.size()] + // value_size : varint32 of value.size() + // value bytes : char[value.size()] + size_t key_size = key.size(); + size_t val_size = 
value.size(); + size_t internal_key_size = key_size + 8; + const size_t encoded_len = + VarintLength(internal_key_size) + internal_key_size + + VarintLength(val_size) + val_size; + char* buf = arena_impl_.Allocate(encoded_len); + char* p = EncodeVarint32(buf, internal_key_size); + memcpy(p, key.data(), key_size); + p += key_size; + EncodeFixed64(p, (s << 8) | type); + p += 8; + p = EncodeVarint32(p, val_size); + memcpy(p, value.data(), val_size); + assert((p + val_size) - buf == (unsigned)encoded_len); + table_->Insert(buf); + + // The first sequence number inserted into the memtable + assert(first_seqno_ == 0 || s > first_seqno_); + if (first_seqno_ == 0) { + first_seqno_ = s; + } +} + +bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, + MergeContext& merge_context, const Options& options) { + Slice memkey = key.memtable_key(); + std::shared_ptr iter( + table_->GetIterator(key.user_key())); + iter->Seek(memkey.data()); + + bool merge_in_progress = s->IsMergeInProgress(); + auto merge_operator = options.merge_operator.get(); + auto logger = options.info_log; + std::string merge_result; + + for (; iter->Valid(); iter->Next()) { + // entry format is: + // klength varint32 + // userkey char[klength-8] + // tag uint64 + // vlength varint32 + // value char[vlength] + // Check that it belongs to same user key. We do not check the + // sequence number since the Seek() call above should have skipped + // all entries with overly large sequence numbers. 
+ const char* entry = iter->key(); + uint32_t key_length; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (comparator_.comparator.user_comparator()->Compare( + Slice(key_ptr, key_length - 8), key.user_key()) == 0) { + // Correct user key + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + switch (static_cast(tag & 0xff)) { + case kTypeValue: { + if (options.inplace_update_support) { + GetLock(key.user_key())->ReadLock(); + } + Slice v = GetLengthPrefixedSlice(key_ptr + key_length); + *s = Status::OK(); + if (merge_in_progress) { + assert(merge_operator); + if (!merge_operator->FullMerge(key.user_key(), &v, + merge_context.GetOperands(), value, + logger.get())) { + RecordTick(options.statistics.get(), NUMBER_MERGE_FAILURES); + *s = Status::Corruption("Error: Could not perform merge."); + } + } else { + value->assign(v.data(), v.size()); + } + if (options.inplace_update_support) { + GetLock(key.user_key())->Unlock(); + } + return true; + } + case kTypeDeletion: { + if (merge_in_progress) { + assert(merge_operator); + *s = Status::OK(); + if (!merge_operator->FullMerge(key.user_key(), nullptr, + merge_context.GetOperands(), value, + logger.get())) { + RecordTick(options.statistics.get(), NUMBER_MERGE_FAILURES); + *s = Status::Corruption("Error: Could not perform merge."); + } + } else { + *s = Status::NotFound(); + } + return true; + } + case kTypeMerge: { + Slice v = GetLengthPrefixedSlice(key_ptr + key_length); + merge_in_progress = true; + merge_context.PushOperand(v); + while(merge_context.GetNumOperands() >= 2) { + // Attempt to associative merge. 
(Returns true if successful) + if (merge_operator->PartialMerge(key.user_key(), + merge_context.GetOperand(0), + merge_context.GetOperand(1), + &merge_result, logger.get())) { + merge_context.PushPartialMergeResult(merge_result); + } else { + // Stack them because user can't associative merge + break; + } + } + break; + } + case kTypeLogData: + assert(false); + break; + } + } else { + // exit loop if user key does not match + break; + } + } + + // No change to value, since we have not yet found a Put/Delete + + if (merge_in_progress) { + *s = Status::MergeInProgress(""); + } + return false; +} + +bool MemTable::Update(SequenceNumber seq, ValueType type, + const Slice& key, + const Slice& value) { + LookupKey lkey(key, seq); + Slice memkey = lkey.memtable_key(); + + std::shared_ptr iter( + table_->GetIterator(lkey.user_key())); + iter->Seek(memkey.data()); + + if (iter->Valid()) { + // entry format is: + // klength varint32 + // userkey char[klength-8] + // tag uint64 + // vlength varint32 + // value char[vlength] + // Check that it belongs to same user key. We do not check the + // sequence number since the Seek() call above should have skipped + // all entries with overly large sequence numbers. 
+ const char* entry = iter->key(); + uint32_t key_length; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (comparator_.comparator.user_comparator()->Compare( + Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) { + // Correct user key + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + switch (static_cast(tag & 0xff)) { + case kTypeValue: { + uint32_t vlength; + GetVarint32Ptr(key_ptr + key_length, + key_ptr + key_length+5, &vlength); + // Update value, if newValue size <= curValue size + if (value.size() <= vlength) { + char* p = EncodeVarint32(const_cast(key_ptr) + key_length, + value.size()); + WriteLock wl(GetLock(lkey.user_key())); + memcpy(p, value.data(), value.size()); + assert( + (p + value.size()) - entry == + (unsigned) (VarintLength(key_length) + + key_length + + VarintLength(value.size()) + + value.size()) + ); + return true; + } + } + default: + // If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData + // then we probably don't have enough space to update in-place + // Maybe do something later + // Return false, and do normal Add() + return false; + } + } + } + + // Key doesn't exist + return false; +} + +size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { + Slice memkey = key.memtable_key(); + + // A total ordered iterator is costly for some memtablerep (prefix aware + // reps). By passing in the user key, we allow efficient iterator creation. + // The iterator only needs to be ordered within the same user key. 
+ std::shared_ptr iter( + table_->GetIterator(key.user_key())); + iter->Seek(memkey.data()); + + size_t num_successive_merges = 0; + + for (; iter->Valid(); iter->Next()) { + const char* entry = iter->key(); + uint32_t key_length; + const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (!comparator_.comparator.user_comparator()->Compare( + Slice(iter_key_ptr, key_length - 8), key.user_key()) == 0) { + break; + } + + const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8); + if (static_cast(tag & 0xff) != kTypeMerge) { + break; + } + + ++num_successive_merges; + } + + return num_successive_merges; +} + +} // namespace rocksdb diff --git a/db/memtable.h b/db/memtable.h new file mode 100644 index 00000000..24a2c852 --- /dev/null +++ b/db/memtable.h @@ -0,0 +1,178 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include +#include "db/dbformat.h" +#include "db/skiplist.h" +#include "db/version_set.h" +#include "rocksdb/db.h" +#include "rocksdb/memtablerep.h" +#include "util/arena_impl.h" + +namespace rocksdb { + +class Mutex; +class MemTableIterator; +class MergeContext; + +class MemTable { + public: + struct KeyComparator : public MemTableRep::KeyComparator { + const InternalKeyComparator comparator; + explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { } + virtual int operator()(const char* a, const char* b) const; + }; + + // MemTables are reference counted. 
The initial reference count + // is zero and the caller must call Ref() at least once. + explicit MemTable(const InternalKeyComparator& comparator, + const Options& options = Options()); + + ~MemTable(); + + // Increase reference count. + void Ref() { ++refs_; } + + // Drop reference count. + // If the refcount goes to zero return this memtable, otherwise return null + MemTable* Unref() { + --refs_; + assert(refs_ >= 0); + if (refs_ <= 0) { + return this; + } + return nullptr; + } + + // Returns an estimate of the number of bytes of data in use by this + // data structure. + // + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable. + size_t ApproximateMemoryUsage(); + + // Return an iterator that yields the contents of the memtable. + // + // The caller must ensure that the underlying MemTable remains live + // while the returned iterator is live. The keys returned by this + // iterator are internal keys encoded by AppendInternalKey in the + // db/dbformat.{h,cc} module. + // + // If options.prefix is supplied, it is passed to the underlying MemTableRep + // as a hint that the iterator only need to support access to keys with that + // specific prefix. + // If options.prefix is not supplied and options.prefix_seek is set, the + // iterator is not bound to a specific prefix. However, the semantics of + // Seek is changed - the result might only include keys with the same prefix + // as the seek-key. + Iterator* NewIterator(const ReadOptions& options = ReadOptions()); + + // Add an entry into memtable that maps key to value at the + // specified sequence number and with the specified type. + // Typically value will be empty if type==kTypeDeletion. + void Add(SequenceNumber seq, ValueType type, + const Slice& key, + const Slice& value); + + // If memtable contains a value for key, store it in *value and return true. + // If memtable contains a deletion for key, store a NotFound() error + // in *status and return true. 
+ // If memtable contains Merge operation as the most recent entry for a key, + // and the merge process does not stop (not reaching a value or delete), + // prepend the current merge operand to *operands. + // store MergeInProgress in s, and return false. + // Else, return false. + bool Get(const LookupKey& key, std::string* value, Status* s, + MergeContext& merge_context, const Options& options); + + // Update the value and return status ok, + // if key exists in current memtable + // if new sizeof(new_value) <= sizeof(old_value) && + // old_value for that key is a put i.e. kTypeValue + // else return false, and status - NotUpdatable() + // else return false, and status - NotFound() + bool Update(SequenceNumber seq, ValueType type, + const Slice& key, + const Slice& value); + + // Returns the number of successive merge entries starting from the newest + // entry for the key up to the last non-merge entry or last entry for the + // key in the memtable. + size_t CountSuccessiveMergeEntries(const LookupKey& key); + + // Returns the edits area that is needed for flushing the memtable + VersionEdit* GetEdits() { return &edit_; } + + // Returns the sequence number of the first element that was inserted + // into the memtable + SequenceNumber GetFirstSequenceNumber() { return first_seqno_; } + + // Returns the next active logfile number when this memtable is about to + // be flushed to storage + uint64_t GetNextLogNumber() { return mem_next_logfile_number_; } + + // Sets the next active logfile number when this memtable is about to + // be flushed to storage + void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; } + + // Returns the logfile number that can be safely deleted when this + // memstore is flushed to storage + uint64_t GetLogNumber() { return mem_logfile_number_; } + + // Sets the logfile number that can be safely deleted when this + // memstore is flushed to storage + void SetLogNumber(uint64_t num) { mem_logfile_number_ = num; } + + // 
Notify the underlying storage that no more items will be added + void MarkImmutable() { table_->MarkReadOnly(); } + + private: + friend class MemTableIterator; + friend class MemTableBackwardIterator; + friend class MemTableList; + + KeyComparator comparator_; + int refs_; + ArenaImpl arena_impl_; + shared_ptr table_; + + // These are used to manage memtable flushes to storage + bool flush_in_progress_; // started the flush + bool flush_completed_; // finished the flush + uint64_t file_number_; // filled up after flush is complete + + // The updates to be applied to the transaction log when this + // memtable is flushed to storage. + VersionEdit edit_; + + // The sequence number of the kv that was inserted first + SequenceNumber first_seqno_; + + // The log files earlier than this number can be deleted. + uint64_t mem_next_logfile_number_; + + // The log file that backs this memtable (to be deleted when + // memtable flush is done) + uint64_t mem_logfile_number_; + + // rw locks for inplace updates + std::vector locks_; + + // No copying allowed + MemTable(const MemTable&); + void operator=(const MemTable&); + + // Get the lock associated with the key + port::RWMutex* GetLock(const Slice& key); +}; + +} // namespace rocksdb diff --git a/db/memtablelist.cc b/db/memtablelist.cc new file mode 100644 index 00000000..71e4e5a9 --- /dev/null +++ b/db/memtablelist.cc @@ -0,0 +1,222 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include "db/memtablelist.h" + +#include +#include "rocksdb/db.h" +#include "db/memtable.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "util/coding.h" + +namespace rocksdb { + +class InternalKeyComparator; +class Mutex; +class MemTableListIterator; +class VersionSet; + +using std::list; + +// Increase reference count on all underlying memtables +void MemTableList::RefAll() { + for (auto &memtable : memlist_) { + memtable->Ref(); + } +} + +// Drop reference count on all underlying memtables. If the +// refcount of an underlying memtable drops to zero, then +// return it in to_delete vector. +void MemTableList::UnrefAll(std::vector* to_delete) { + for (auto &memtable : memlist_) { + MemTable* m = memtable->Unref(); + if (m != nullptr) { + to_delete->push_back(m); + } + } +} + +// Returns the total number of memtables in the list +int MemTableList::size() { + assert(num_flush_not_started_ <= size_); + return size_; +} + +// Returns true if there is at least one memtable on which flush has +// not yet started. +bool MemTableList::IsFlushPending(int min_write_buffer_number_to_merge) { + if ((flush_requested_ && num_flush_not_started_ >= 1) || + (num_flush_not_started_ >= min_write_buffer_number_to_merge)) { + assert(imm_flush_needed.NoBarrier_Load() != nullptr); + return true; + } + return false; +} + +// Returns the memtables that need to be flushed. 
+void MemTableList::PickMemtablesToFlush(std::vector* ret) { + for (auto it = memlist_.rbegin(); it != memlist_.rend(); it++) { + MemTable* m = *it; + if (!m->flush_in_progress_) { + assert(!m->flush_completed_); + num_flush_not_started_--; + if (num_flush_not_started_ == 0) { + imm_flush_needed.Release_Store(nullptr); + } + m->flush_in_progress_ = true; // flushing will start very soon + ret->push_back(m); + } + } + flush_requested_ = false; // start-flush request is complete +} + +// Record a successful flush in the manifest file +Status MemTableList::InstallMemtableFlushResults( + const std::vector &mems, + VersionSet* vset, Status flushStatus, + port::Mutex* mu, Logger* info_log, + uint64_t file_number, + std::set& pending_outputs, + std::vector* to_delete) { + mu->AssertHeld(); + + // If the flush was not successful, then just reset state. + // Maybe a succeeding attempt to flush will be successful. + if (!flushStatus.ok()) { + for (MemTable* m : mems) { + assert(m->flush_in_progress_); + assert(m->file_number_ == 0); + + m->flush_in_progress_ = false; + m->flush_completed_ = false; + m->edit_.Clear(); + num_flush_not_started_++; + imm_flush_needed.Release_Store((void *)1); + pending_outputs.erase(file_number); + } + return flushStatus; + } + + // flush was successful + for (size_t i = 0; i < mems.size(); ++i) { + // All the edits are associated with the first memtable of this batch. + assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0); + + mems[i]->flush_completed_ = true; + mems[i]->file_number_ = file_number; + } + + // if some other thread is already committing, then return + Status s; + if (commit_in_progress_) { + return s; + } + + // Only a single thread can be executing this piece of code + commit_in_progress_ = true; + + // scan all memtables from the earliest, and commit those + // (in that order) that have finished flushing. Memtables + // are always committed in the order that they were created. 
+ while (!memlist_.empty() && s.ok()) { + MemTable* m = memlist_.back(); // get the last element + if (!m->flush_completed_) { + break; + } + + Log(info_log, + "Level-0 commit table #%lu started", + (unsigned long)m->file_number_); + + // this can release and reacquire the mutex. + s = vset->LogAndApply(&m->edit_, mu); + + // All the later memtables that have the same filenum + // are part of the same batch. They can be committed now. + uint64_t mem_id = 1; // how many memtables has been flushed. + do { + if (s.ok()) { // commit new state + Log(info_log, + "Level-0 commit table #%lu: memtable #%lu done", + (unsigned long)m->file_number_, + (unsigned long)mem_id); + memlist_.remove(m); + assert(m->file_number_ > 0); + + // pending_outputs can be cleared only after the newly created file + // has been written to a committed version so that other concurrently + // executing compaction threads do not mistakenly assume that this + // file is not live. + pending_outputs.erase(m->file_number_); + if (m->Unref() != nullptr) { + to_delete->push_back(m); + } + size_--; + } else { + //commit failed. setup state so that we can flush again. + Log(info_log, + "Level-0 commit table #%lu: memtable #%lu failed", + (unsigned long)m->file_number_, + (unsigned long)mem_id); + m->flush_completed_ = false; + m->flush_in_progress_ = false; + m->edit_.Clear(); + num_flush_not_started_++; + pending_outputs.erase(m->file_number_); + m->file_number_ = 0; + imm_flush_needed.Release_Store((void *)1); + s = Status::IOError("Unable to commit flushed memtable"); + } + ++mem_id; + } while (!memlist_.empty() && (m = memlist_.back()) && + m->file_number_ == file_number); + } + commit_in_progress_ = false; + return s; +} + +// New memtables are inserted at the front of the list. 
+void MemTableList::Add(MemTable* m) { + assert(size_ >= num_flush_not_started_); + size_++; + memlist_.push_front(m); + m->MarkImmutable(); + num_flush_not_started_++; + if (num_flush_not_started_ == 1) { + imm_flush_needed.Release_Store((void *)1); + } +} + +// Returns an estimate of the number of bytes of data in use. +size_t MemTableList::ApproximateMemoryUsage() { + size_t size = 0; + for (auto &memtable : memlist_) { + size += memtable->ApproximateMemoryUsage(); + } + return size; +} + +// Search all the memtables starting from the most recent one. +// Return the most recent value found, if any. +// Operands stores the list of merge operations to apply, so far. +bool MemTableList::Get(const LookupKey& key, std::string* value, Status* s, + MergeContext& merge_context, const Options& options) { + for (auto &memtable : memlist_) { + if (memtable->Get(key, value, s, merge_context, options)) { + return true; + } + } + return false; +} + +void MemTableList::GetMemTables(std::vector* output) { + for (auto &memtable : memlist_) { + output->push_back(memtable); + } +} + +} // namespace rocksdb diff --git a/db/memtablelist.h b/db/memtablelist.h new file mode 100644 index 00000000..ed353c8b --- /dev/null +++ b/db/memtablelist.h @@ -0,0 +1,108 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#pragma once +#include +#include +#include +#include "rocksdb/db.h" +#include "db/dbformat.h" +#include "db/skiplist.h" +#include "memtable.h" + +namespace rocksdb { + +class InternalKeyComparator; +class Mutex; +class MemTableListIterator; + +// +// This class stores references to all the immutable memtables. +// The memtables are flushed to L0 as soon as possible and in +// any order. 
If there is more than one immutable memtable, their +// flushes can occur concurrently. However, they are 'committed' +// to the manifest in FIFO order to maintain correctness and +// recoverability from a crash. +// +class MemTableList { + public: + // A list of memtables. + MemTableList() : size_(0), num_flush_not_started_(0), + commit_in_progress_(false), + flush_requested_(false) { + imm_flush_needed.Release_Store(nullptr); + } + ~MemTableList() {}; + + // so that background threads can detect non-nullptr pointer to + // determine whether there is anything more to start flushing. + port::AtomicPointer imm_flush_needed; + + // Increase reference count on all underlying memtables + void RefAll(); + + // Drop reference count on all underlying memtables. If the refcount + // on an underlying memtable drops to zero, then return it in + // to_delete vector. + void UnrefAll(std::vector* to_delete); + + // Returns the total number of memtables in the list + int size(); + + // Returns true if there is at least one memtable on which flush has + // not yet started. + bool IsFlushPending(int min_write_buffer_number_to_merge); + + // Returns the earliest memtables that need to be flushed. The returned + // memtables are guaranteed to be in the ascending order of created time. + void PickMemtablesToFlush(std::vector* mems); + + // Commit a successful flush in the manifest file + Status InstallMemtableFlushResults(const std::vector &m, + VersionSet* vset, Status flushStatus, + port::Mutex* mu, Logger* info_log, + uint64_t file_number, + std::set& pending_outputs, + std::vector* to_delete); + + // New memtables are inserted at the front of the list. + // Takes ownership of the reference held on *m by the caller of Add(). + void Add(MemTable* m); + + // Returns an estimate of the number of bytes of data in use. + size_t ApproximateMemoryUsage(); + + // Search all the memtables starting from the most recent one. + // Return the most recent value found, if any. 
+ bool Get(const LookupKey& key, std::string* value, Status* s, + MergeContext& merge_context, const Options& options); + + // Returns the list of underlying memtables. + void GetMemTables(std::vector* list); + + // Request a flush of all existing memtables to storage + void FlushRequested() { flush_requested_ = true; } + + // Copying allowed + // MemTableList(const MemTableList&); + // void operator=(const MemTableList&); + + private: + std::list memlist_; + int size_; + + // the number of elements that still need flushing + int num_flush_not_started_; + + // committing in progress + bool commit_in_progress_; + + // Requested a flush of all memtables to storage + bool flush_requested_; + +}; + +} // namespace rocksdb diff --git a/db/merge_context.h b/db/merge_context.h new file mode 100644 index 00000000..91d9f8a0 --- /dev/null +++ b/db/merge_context.h @@ -0,0 +1,69 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +#include "db/dbformat.h" +#include "rocksdb/slice.h" +#include +#include + +namespace rocksdb { + +const std::deque empty_operand_list; + +// The merge context for merging a user key. +// When doing a Get(), DB will create such a class and pass it when +// issuing Get() operation to memtables and version_set. The operands +// will be fetched from the context when issuing partial or full merge. +class MergeContext { +public: + // Clear all the operands + void Clear() { + if (operand_list) { + operand_list->clear(); + } + } + // Replace the first two operands with merge_result, which is expected to be the + // merge result of them. 
+ void PushPartialMergeResult(std::string& merge_result) { + assert (operand_list); + operand_list->pop_front(); + swap(operand_list->front(), merge_result); + } + // Push a merge operand + void PushOperand(const Slice& operand_slice) { + Initialize(); + operand_list->push_front(operand_slice.ToString()); + } + // return total number of operands in the list + size_t GetNumOperands() const { + if (!operand_list) { + return 0; + } + return operand_list->size(); + } + // Get the operand at the index. + Slice GetOperand(int index) const { + assert (operand_list); + return (*operand_list)[index]; + } + // Return all the operands. + const std::deque& GetOperands() const { + if (!operand_list) { + return empty_operand_list; + } + return *operand_list; + } +private: + void Initialize() { + if (!operand_list) { + operand_list.reset(new std::deque()); + } + } + std::unique_ptr> operand_list; +}; + +} // namespace rocksdb + diff --git a/db/merge_helper.cc b/db/merge_helper.cc new file mode 100644 index 00000000..a7e2df0a --- /dev/null +++ b/db/merge_helper.cc @@ -0,0 +1,198 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "merge_helper.h" +#include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/merge_operator.h" +#include "util/statistics_imp.h" +#include +#include + +namespace rocksdb { + +// PRE: iter points to the first merge type entry +// POST: iter points to the first entry beyond the merge process (or the end) +// keys_, operands_ are updated to reflect the merge result. +// keys_ stores the list of keys encountered while merging. +// operands_ stores the list of merge operands encountered while merging. +// keys_[i] corresponds to operands_[i] for each i. 
+void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before, + bool at_bottom, Statistics* stats) { + // Get a copy of the internal key, before it's invalidated by iter->Next() + // Also maintain the list of merge operands seen. + keys_.clear(); + operands_.clear(); + keys_.push_front(iter->key().ToString()); + operands_.push_front(iter->value().ToString()); + + success_ = false; // Will become true if we hit Put/Delete or bottom + + // We need to parse the internal key again as the parsed key is + // backed by the internal key! + // Assume no internal key corruption as it has been successfully parsed + // by the caller. + // Invariant: keys_.back() will not change. Hence, orig_ikey is always valid. + ParsedInternalKey orig_ikey; + ParseInternalKey(keys_.back(), &orig_ikey); + + bool hit_the_next_user_key = false; + ParsedInternalKey ikey; + std::string merge_result; // Temporary value for merge results + for (iter->Next(); iter->Valid(); iter->Next()) { + assert(operands_.size() >= 1); // Should be invariants! + assert(keys_.size() == operands_.size()); + + if (!ParseInternalKey(iter->key(), &ikey)) { + // stop at corrupted key + if (assert_valid_internal_key_) { + assert(!"corrupted internal key is not expected"); + } + break; + } + + if (user_comparator_->Compare(ikey.user_key, orig_ikey.user_key) != 0) { + // hit a different user key, stop right here + hit_the_next_user_key = true; + break; + } + + if (stop_before && ikey.sequence <= stop_before) { + // hit an entry that's visible by the previous snapshot, can't touch that + break; + } + + // At this point we are guaranteed that we need to process this key. + + if (kTypeDeletion == ikey.type) { + // hit a delete + // => merge nullptr with operands_ + // => store result in operands_.back() (and update keys_.back()) + // => change the entry type to kTypeValue for keys_.back() + // We are done! Return a success if the merge passes. 
+ success_ = user_merge_operator_->FullMerge(ikey.user_key, nullptr, + operands_, &merge_result, + logger_); + + // We store the result in keys_.back() and operands_.back() + // if nothing went wrong (i.e.: no operand corruption on disk) + if (success_) { + std::string& key = keys_.back(); // The original key encountered + orig_ikey.type = kTypeValue; + UpdateInternalKey(&key[0], key.size(), + orig_ikey.sequence, orig_ikey.type); + swap(operands_.back(), merge_result); + } else { + RecordTick(stats, NUMBER_MERGE_FAILURES); + } + + // move iter to the next entry (before doing anything else) + iter->Next(); + return; + } + + if (kTypeValue == ikey.type) { + // hit a put + // => merge the put value with operands_ + // => store result in operands_.back() (and update keys_.back()) + // => change the entry type to kTypeValue for keys_.back() + // We are done! Success! + const Slice value = iter->value(); + success_ = user_merge_operator_->FullMerge(ikey.user_key, &value, + operands_, &merge_result, + logger_); + + // We store the result in keys_.back() and operands_.back() + // if nothing went wrong (i.e.: no operand corruption on disk) + if (success_) { + std::string& key = keys_.back(); // The original key encountered + orig_ikey.type = kTypeValue; + UpdateInternalKey(&key[0], key.size(), + orig_ikey.sequence, orig_ikey.type); + swap(operands_.back(), merge_result); + } else { + RecordTick(stats, NUMBER_MERGE_FAILURES); + } + + // move iter to the next entry + iter->Next(); + return; + } + + if (kTypeMerge == ikey.type) { + // hit a merge + // => merge the operand into the front of the operands_ list + // => use the user's associative merge function to determine how. + // => then continue because we haven't yet seen a Put/Delete. 
+ assert(!operands_.empty()); // Should have at least one element in it + + keys_.push_front(iter->key().ToString()); + operands_.push_front(iter->value().ToString()); + while (operands_.size() >= 2) { + // Returns false when the merge_operator can no longer process it + if (user_merge_operator_->PartialMerge(ikey.user_key, + Slice(operands_[0]), + Slice(operands_[1]), + &merge_result, + logger_)) { + // Merging of operands (associative merge) was successful. + // Replace these frontmost two operands with the merge result + keys_.pop_front(); + operands_.pop_front(); + swap(operands_.front(), merge_result); + } else { + // Merging of operands (associative merge) returned false. + // The user merge_operator does not know how to merge these operands. + // So we just stack them up until we find a Put/Delete or end of key. + break; + } + } + continue; + } + } + + // We are sure we have seen this key's entire history if we are at the + // last level and exhausted all internal keys of this user key. + // NOTE: !iter->Valid() does not necessarily mean we hit the + // beginning of a user key, as versions of a user key might be + // split into multiple files (even files on the same level) + // and some files might not be included in the compaction/merge. + // + // There are also cases where we have seen the root of history of this + // key without being sure of it. Then, we simply miss the opportunity + // to combine the keys. Since VersionSet::SetupOtherInputs() always makes + // sure that all merge-operands on the same level get compacted together, + // this will simply lead to these merge operands moving to the next level. + // + // So, we only perform the following logic (to merge all operands together + // without a Put/Delete) if we are certain that we have seen the end of key. 
+ bool surely_seen_the_beginning = hit_the_next_user_key && at_bottom; + if (surely_seen_the_beginning) { + // do a final merge with nullptr as the existing value and say + // bye to the merge type (it's now converted to a Put) + assert(kTypeMerge == orig_ikey.type); + assert(operands_.size() >= 1); + assert(operands_.size() == keys_.size()); + success_ = user_merge_operator_->FullMerge(orig_ikey.user_key, nullptr, + operands_, &merge_result, + logger_); + + if (success_) { + std::string& key = keys_.back(); // The original key encountered + orig_ikey.type = kTypeValue; + UpdateInternalKey(&key[0], key.size(), + orig_ikey.sequence, orig_ikey.type); + + // The final value() is always stored in operands_.back() + swap(operands_.back(),merge_result); + } else { + RecordTick(stats, NUMBER_MERGE_FAILURES); + // Do nothing if not success_. Leave keys() and operands() as they are. + } + } +} + +} // namespace rocksdb diff --git a/db/merge_helper.h b/db/merge_helper.h new file mode 100644 index 00000000..6fe9bfb2 --- /dev/null +++ b/db/merge_helper.h @@ -0,0 +1,102 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#ifndef MERGE_HELPER_H +#define MERGE_HELPER_H + +#include "db/dbformat.h" +#include "rocksdb/slice.h" +#include +#include + +namespace rocksdb { + +class Comparator; +class Iterator; +class Logger; +class MergeOperator; +class Statistics; + +class MergeHelper { + public: + MergeHelper(const Comparator* user_comparator, + const MergeOperator* user_merge_operator, + Logger* logger, + bool assert_valid_internal_key) + : user_comparator_(user_comparator), + user_merge_operator_(user_merge_operator), + logger_(logger), + assert_valid_internal_key_(assert_valid_internal_key), + keys_(), + operands_(), + success_(false) {} + + // Merge entries until we hit + // - a corrupted key + // - a Put/Delete, + // - a different user key, + // - a specific sequence number (snapshot boundary), + // or - the end of iteration + // iter: (IN) points to the first merge type entry + // (OUT) points to the first entry not included in the merge process + // stop_before: (IN) a sequence number that merge should not cross. + // 0 means no restriction + // at_bottom: (IN) true if the iterator covers the bottem level, which means + // we could reach the start of the history of this user key. + void MergeUntil(Iterator* iter, SequenceNumber stop_before = 0, + bool at_bottom = false, Statistics* stats = nullptr); + + // Query the merge result + // These are valid until the next MergeUntil call + // If the merging was successful: + // - IsSuccess() will be true + // - key() will have the latest sequence number of the merges. + // The type will be Put or Merge. See IMPORTANT 1 note, below. + // - value() will be the result of merging all the operands together + // - The user should ignore keys() and values(). + // + // IMPORTANT 1: the key type could change after the MergeUntil call. + // Put/Delete + Merge + ... + Merge => Put + // Merge + ... + Merge => Merge + // + // If the merge operator is not associative, and if a Put/Delete is not found + // then the merging will be unsuccessful. 
In this case: + // - IsSuccess() will be false + // - keys() contains the list of internal keys seen in order of iteration. + // - values() contains the list of values (merges) seen in the same order. + // values() is parallel to keys() so that the first entry in + // keys() is the key associated with the first entry in values() + // and so on. These lists will be the same length. + // All of these pairs will be merges over the same user key. + // See IMPORTANT 2 note below. + // - The user should ignore key() and value(). + // + // IMPORTANT 2: The entries were traversed in order from BACK to FRONT. + // So keys().back() was the first key seen by iterator. + // TODO: Re-style this comment to be like the first one + bool IsSuccess() { return success_; } + Slice key() { assert(success_); return Slice(keys_.back()); } + Slice value() { assert(success_); return Slice(operands_.back()); } + const std::deque& keys() { assert(!success_); return keys_; } + const std::deque& values() { + assert(!success_); return operands_; + } + + private: + const Comparator* user_comparator_; + const MergeOperator* user_merge_operator_; + Logger* logger_; + bool assert_valid_internal_key_; // enforce no internal key corruption? + + // the scratch area that holds the result of MergeUntil + // valid up to the next MergeUntil call + std::deque keys_; // Keeps track of the sequence of keys seen + std::deque operands_; // Parallel with keys_; stores the values + bool success_; +}; + +} // namespace rocksdb + +#endif diff --git a/db/merge_operator.cc b/db/merge_operator.cc new file mode 100644 index 00000000..7d1ee4e5 --- /dev/null +++ b/db/merge_operator.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +/** + * Back-end implementation details specific to the Merge Operator. + */ + +#include "rocksdb/merge_operator.h" + +namespace rocksdb { + +// Given a "real" merge from the library, call the user's +// associative merge function one-by-one on each of the operands. +// NOTE: It is assumed that the client's merge-operator will handle any errors. +bool AssociativeMergeOperator::FullMerge( + const Slice& key, + const Slice* existing_value, + const std::deque& operand_list, + std::string* new_value, + Logger* logger) const { + + // Simply loop through the operands + Slice temp_existing; + std::string temp_value; + for (const auto& operand : operand_list) { + Slice value(operand); + if (!Merge(key, existing_value, value, &temp_value, logger)) { + return false; + } + swap(temp_value, *new_value); + temp_existing = Slice(*new_value); + existing_value = &temp_existing; + } + + // The result will be in *new_value. All merges succeeded. + return true; +} + +// Call the user defined simple merge on the operands; +// NOTE: It is assumed that the client's merge-operator will handle any errors. +bool AssociativeMergeOperator::PartialMerge( + const Slice& key, + const Slice& left_operand, + const Slice& right_operand, + std::string* new_value, + Logger* logger) const { + + return Merge(key, &left_operand, right_operand, new_value, logger); +} + +} // namespace rocksdb diff --git a/db/merge_test.cc b/db/merge_test.cc new file mode 100644 index 00000000..887d8ad4 --- /dev/null +++ b/db/merge_test.cc @@ -0,0 +1,389 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "db/dbformat.h" +#include "db/db_impl.h" +#include "db/write_batch_internal.h" +#include "utilities/merge_operators.h" +#include "util/testharness.h" +#include "utilities/utility_db.h" + +using namespace std; +using namespace rocksdb; + +namespace { + int numMergeOperatorCalls; + + void resetNumMergeOperatorCalls() { + numMergeOperatorCalls = 0; + } +} + +class CountMergeOperator : public AssociativeMergeOperator { + public: + CountMergeOperator() { + mergeOperator_ = MergeOperators::CreateUInt64AddOperator(); + } + + virtual bool Merge(const Slice& key, + const Slice* existing_value, + const Slice& value, + std::string* new_value, + Logger* logger) const override { + ++numMergeOperatorCalls; + return mergeOperator_->PartialMerge( + key, + *existing_value, + value, + new_value, + logger); + } + + virtual const char* Name() const override { + return "UInt64AddOperator"; + } + + private: + std::shared_ptr mergeOperator_; +}; + +std::shared_ptr OpenDb( + const string& dbname, + const bool ttl = false, + const unsigned max_successive_merges = 0) { + DB* db; + StackableDB* sdb; + Options options; + options.create_if_missing = true; + options.merge_operator = std::make_shared(); + options.max_successive_merges = max_successive_merges; + Status s; + DestroyDB(dbname, Options()); + if (ttl) { + cout << "Opening database with TTL\n"; + s = UtilityDB::OpenTtlDB(options, dbname, &sdb); + db = sdb; + } else { + s = DB::Open(options, dbname, &db); + } + if (!s.ok()) { + cerr << s.ToString() << endl; + assert(false); + } + return std::shared_ptr(db); +} + +// Imagine we are maintaining a set of uint64 counters. +// Each counter has a distinct name. 
And we would like +// to support four high level operations: +// set, add, get and remove +// This is a quick implementation without a Merge operation. +class Counters { + + protected: + std::shared_ptr db_; + + WriteOptions put_option_; + ReadOptions get_option_; + WriteOptions delete_option_; + + uint64_t default_; + + public: + explicit Counters(std::shared_ptr db, uint64_t defaultCount = 0) + : db_(db), + put_option_(), + get_option_(), + delete_option_(), + default_(defaultCount) { + assert(db_); + } + + virtual ~Counters() {} + + // public interface of Counters. + // All four functions return false + // if the underlying level db operation failed. + + // mapped to a levedb Put + bool set(const string& key, uint64_t value) { + // just treat the internal rep of int64 as the string + Slice slice((char *)&value, sizeof(value)); + auto s = db_->Put(put_option_, key, slice); + + if (s.ok()) { + return true; + } else { + cerr << s.ToString() << endl; + return false; + } + } + + // mapped to a rocksdb Delete + bool remove(const string& key) { + auto s = db_->Delete(delete_option_, key); + + if (s.ok()) { + return true; + } else { + cerr << s.ToString() << std::endl; + return false; + } + } + + // mapped to a rocksdb Get + bool get(const string& key, uint64_t *value) { + string str; + auto s = db_->Get(get_option_, key, &str); + + if (s.IsNotFound()) { + // return default value if not found; + *value = default_; + return true; + } else if (s.ok()) { + // deserialization + if (str.size() != sizeof(uint64_t)) { + cerr << "value corruption\n"; + return false; + } + *value = DecodeFixed64(&str[0]); + return true; + } else { + cerr << s.ToString() << std::endl; + return false; + } + } + + // 'add' is implemented as get -> modify -> set + // An alternative is a single merge operation, see MergeBasedCounters + virtual bool add(const string& key, uint64_t value) { + uint64_t base = default_; + return get(key, &base) && set(key, base + value); + } + + + // convenience 
functions for testing + void assert_set(const string& key, uint64_t value) { + assert(set(key, value)); + } + + void assert_remove(const string& key) { + assert(remove(key)); + } + + uint64_t assert_get(const string& key) { + uint64_t value = default_; + assert(get(key, &value)); + return value; + } + + void assert_add(const string& key, uint64_t value) { + assert(add(key, value)); + } +}; + +// Implement 'add' directly with the new Merge operation +class MergeBasedCounters : public Counters { + private: + WriteOptions merge_option_; // for merge + + public: + explicit MergeBasedCounters(std::shared_ptr db, uint64_t defaultCount = 0) + : Counters(db, defaultCount), + merge_option_() { + } + + // mapped to a rocksdb Merge operation + virtual bool add(const string& key, uint64_t value) override { + char encoded[sizeof(uint64_t)]; + EncodeFixed64(encoded, value); + Slice slice(encoded, sizeof(uint64_t)); + auto s = db_->Merge(merge_option_, key, slice); + + if (s.ok()) { + return true; + } else { + cerr << s.ToString() << endl; + return false; + } + } +}; + +void dumpDb(DB* db) { + auto it = unique_ptr(db->NewIterator(ReadOptions())); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + uint64_t value = DecodeFixed64(it->value().data()); + cout << it->key().ToString() << ": " << value << endl; + } + assert(it->status().ok()); // Check for any errors found during the scan +} + +void testCounters(Counters& counters, DB* db, bool test_compaction) { + + FlushOptions o; + o.wait = true; + + counters.assert_set("a", 1); + + if (test_compaction) db->Flush(o); + + assert(counters.assert_get("a") == 1); + + counters.assert_remove("b"); + + // defaut value is 0 if non-existent + assert(counters.assert_get("b") == 0); + + counters.assert_add("a", 2); + + if (test_compaction) db->Flush(o); + + // 1+2 = 3 + assert(counters.assert_get("a")== 3); + + dumpDb(db); + + std::cout << "1\n"; + + // 1+...+49 = ? 
+ uint64_t sum = 0; + for (int i = 1; i < 50; i++) { + counters.assert_add("b", i); + sum += i; + } + assert(counters.assert_get("b") == sum); + + std::cout << "2\n"; + dumpDb(db); + + std::cout << "3\n"; + + if (test_compaction) { + db->Flush(o); + + cout << "Compaction started ...\n"; + db->CompactRange(nullptr, nullptr); + cout << "Compaction ended\n"; + + dumpDb(db); + + assert(counters.assert_get("a")== 3); + assert(counters.assert_get("b") == sum); + } +} + +void testSuccessiveMerge( + Counters& counters, int max_num_merges, int num_merges) { + + counters.assert_remove("z"); + uint64_t sum = 0; + + for (int i = 1; i <= num_merges; ++i) { + resetNumMergeOperatorCalls(); + counters.assert_add("z", i); + sum += i; + + if (i % (max_num_merges + 1) == 0) { + assert(numMergeOperatorCalls == max_num_merges + 1); + } else { + assert(numMergeOperatorCalls == 0); + } + + resetNumMergeOperatorCalls(); + assert(counters.assert_get("z") == sum); + assert(numMergeOperatorCalls == i % (max_num_merges + 1)); + } +} + +void testSingleBatchSuccessiveMerge( + DB* db, + int max_num_merges, + int num_merges) { + assert(num_merges > max_num_merges); + + Slice key("BatchSuccessiveMerge"); + uint64_t merge_value = 1; + Slice merge_value_slice((char *)&merge_value, sizeof(merge_value)); + + // Create the batch + WriteBatch batch; + for (int i = 0; i < num_merges; ++i) { + batch.Merge(key, merge_value_slice); + } + + // Apply to memtable and count the number of merges + resetNumMergeOperatorCalls(); + { + Status s = db->Write(WriteOptions(), &batch); + assert(s.ok()); + } + assert(numMergeOperatorCalls == + num_merges - (num_merges % (max_num_merges + 1))); + + // Get the value + resetNumMergeOperatorCalls(); + string get_value_str; + { + Status s = db->Get(ReadOptions(), key, &get_value_str); + assert(s.ok()); + } + assert(get_value_str.size() == sizeof(uint64_t)); + uint64_t get_value = DecodeFixed64(&get_value_str[0]); + ASSERT_EQ(get_value, num_merges * merge_value); + 
ASSERT_EQ(numMergeOperatorCalls, (num_merges % (max_num_merges + 1))); +} + +void runTest(int argc, const string& dbname, const bool use_ttl = false) { + auto db = OpenDb(dbname, use_ttl); + + { + cout << "Test read-modify-write counters... \n"; + Counters counters(db, 0); + testCounters(counters, db.get(), true); + } + + bool compact = false; + if (argc > 1) { + compact = true; + cout << "Turn on Compaction\n"; + } + + { + cout << "Test merge-based counters... \n"; + MergeBasedCounters counters(db, 0); + testCounters(counters, db.get(), compact); + } + + DestroyDB(dbname, Options()); + db.reset(); + + { + cout << "Test merge in memtable... \n"; + unsigned maxMerge = 5; + auto db = OpenDb(dbname, use_ttl, maxMerge); + MergeBasedCounters counters(db, 0); + testCounters(counters, db.get(), compact); + testSuccessiveMerge(counters, maxMerge, maxMerge * 2); + testSingleBatchSuccessiveMerge(db.get(), 5, 7); + DestroyDB(dbname, Options()); + } + +} + +int main(int argc, char *argv[]) { + //TODO: Make this test like a general rocksdb unit-test + runTest(argc, test::TmpDir() + "/merge_testdb"); + runTest(argc, test::TmpDir() + "/merge_testdbttl", true); // Run test on TTL database + return 0; +} diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc new file mode 100644 index 00000000..0934de0c --- /dev/null +++ b/db/perf_context_test.cc @@ -0,0 +1,328 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include +#include +#include +#include "/usr/include/valgrind/callgrind.h" + +#include "rocksdb/db.h" +#include "rocksdb/perf_context.h" +#include "util/histogram.h" +#include "util/stop_watch.h" +#include "util/testharness.h" + + +bool FLAGS_random_key = false; +bool FLAGS_use_set_based_memetable = false; +int FLAGS_total_keys = 100; +int FLAGS_write_buffer_size = 1000000000; +int FLAGS_max_write_buffer_number = 8; +int FLAGS_min_write_buffer_number_to_merge = 7; + +// Path to the database on file system +const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test"; + +namespace rocksdb { + +std::shared_ptr OpenDb() { + DB* db; + Options options; + options.create_if_missing = true; + options.write_buffer_size = FLAGS_write_buffer_size; + options.max_write_buffer_number = FLAGS_max_write_buffer_number; + options.min_write_buffer_number_to_merge = + FLAGS_min_write_buffer_number_to_merge; + + if (FLAGS_use_set_based_memetable) { + auto prefix_extractor = rocksdb::NewFixedPrefixTransform(0); + options.memtable_factory.reset( + NewHashSkipListRepFactory(prefix_extractor)); + } + + Status s = DB::Open(options, kDbName, &db); + ASSERT_OK(s); + return std::shared_ptr(db); +} + +class PerfContextTest { }; + +TEST(PerfContextTest, SeekIntoDeletion) { + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + for (int i = 0; i < FLAGS_total_keys; ++i) { + std::string key = "k" + std::to_string(i); + std::string value = "v" + std::to_string(i); + + db->Put(write_options, key, value); + } + + for (int i = 0; i < FLAGS_total_keys -1 ; ++i) { + std::string key = "k" + std::to_string(i); + db->Delete(write_options, key); + } + + HistogramImpl hist_get; + HistogramImpl hist_get_time; + for (int i = 0; i < FLAGS_total_keys - 1; ++i) { + std::string key = "k" + std::to_string(i); + std::string value; + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + auto status = 
db->Get(read_options, key, &value); + auto elapsed_nanos = timer.ElapsedNanos(); + ASSERT_TRUE(status.IsNotFound()); + hist_get.Add(perf_context.user_key_comparison_count); + hist_get_time.Add(elapsed_nanos); + } + + std::cout << "Get uesr key comparison: \n" << hist_get.ToString() + << "Get time: \n" << hist_get_time.ToString(); + + HistogramImpl hist_seek_to_first; + std::unique_ptr iter(db->NewIterator(read_options)); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->SeekToFirst(); + hist_seek_to_first.Add(perf_context.user_key_comparison_count); + auto elapsed_nanos = timer.ElapsedNanos(); + + std::cout << "SeekToFirst uesr key comparison: \n" << hist_seek_to_first.ToString() + << "ikey skipped: " << perf_context.internal_key_skipped_count << "\n" + << "idelete skipped: " << perf_context.internal_delete_skipped_count << "\n" + << "elapsed: " << elapsed_nanos << "\n"; + + HistogramImpl hist_seek; + for (int i = 0; i < FLAGS_total_keys; ++i) { + std::unique_ptr iter(db->NewIterator(read_options)); + std::string key = "k" + std::to_string(i); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->Seek(key); + auto elapsed_nanos = timer.ElapsedNanos(); + hist_seek.Add(perf_context.user_key_comparison_count); + std::cout << "seek cmp: " << perf_context.user_key_comparison_count + << " ikey skipped " << perf_context.internal_key_skipped_count + << " idelete skipped " << perf_context.internal_delete_skipped_count + << " elapsed: " << elapsed_nanos << "ns\n"; + + perf_context.Reset(); + ASSERT_TRUE(iter->Valid()); + StopWatchNano timer2(Env::Default(), true); + iter->Next(); + auto elapsed_nanos2 = timer2.ElapsedNanos(); + std::cout << "next cmp: " << perf_context.user_key_comparison_count + << "elapsed: " << elapsed_nanos2 << "ns\n"; + } + + std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString(); +} + +TEST(PerfContextTest, StopWatchNanoOverhead) { + // profile the timer cost by itself! 
+ const int kTotalIterations = 1000000; + std::vector timings(kTotalIterations); + + StopWatchNano timer(Env::Default(), true); + for (auto& timing : timings) { + timing = timer.ElapsedNanos(true /* reset */); + } + + HistogramImpl histogram; + for (const auto timing : timings) { + histogram.Add(timing); + } + + std::cout << histogram.ToString(); +} + +TEST(PerfContextTest, StopWatchOverhead) { + // profile the timer cost by itself! + const int kTotalIterations = 1000000; + std::vector timings(kTotalIterations); + + StopWatch timer(Env::Default()); + for (auto& timing : timings) { + timing = timer.ElapsedMicros(); + } + + HistogramImpl histogram; + uint64_t prev_timing = 0; + for (const auto timing : timings) { + histogram.Add(timing - prev_timing); + prev_timing = timing; + } + + std::cout << histogram.ToString(); +} + +void ProfileKeyComparison() { + DestroyDB(kDbName, Options()); // Start this test with a fresh DB + + auto db = OpenDb(); + + WriteOptions write_options; + ReadOptions read_options; + + HistogramImpl hist_put; + HistogramImpl hist_get; + + std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n"; + + std::vector keys; + for (int i = 0; i < FLAGS_total_keys; ++i) { + keys.push_back(i); + } + + if (FLAGS_random_key) { + std::random_shuffle(keys.begin(), keys.end()); + } + + for (const int i : keys) { + std::string key = "k" + std::to_string(i); + std::string value = "v" + std::to_string(i); + + perf_context.Reset(); + db->Put(write_options, key, value); + hist_put.Add(perf_context.user_key_comparison_count); + + perf_context.Reset(); + db->Get(read_options, key, &value); + hist_get.Add(perf_context.user_key_comparison_count); + } + + std::cout << "Put uesr key comparison: \n" << hist_put.ToString() + << "Get uesr key comparison: \n" << hist_get.ToString(); + +} + +TEST(PerfContextTest, KeyComparisonCount) { + SetPerfLevel(kEnableCount); + ProfileKeyComparison(); + + SetPerfLevel(kDisable); + ProfileKeyComparison(); + + 
SetPerfLevel(kEnableTime); + ProfileKeyComparison(); +} + +// make perf_context_test +// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison +// For one memtable: +// ./perf_context_test --write_buffer_size=500000 --total_keys=10000 +// For two memtables: +// ./perf_context_test --write_buffer_size=250000 --total_keys=10000 +// Specify --random_key=1 to shuffle the key before insertion +// Results show that, for sequential insertion, worst-case Seek Key comparison +// is close to the total number of keys (linear), when there is only one +// memtable. When there are two memtables, even the avg Seek Key comparison +// starts to become linear to the input size. + +TEST(PerfContextTest, SeekKeyComparison) { + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n"; + + std::vector keys; + for (int i = 0; i < FLAGS_total_keys; ++i) { + keys.push_back(i); + } + + if (FLAGS_random_key) { + std::random_shuffle(keys.begin(), keys.end()); + } + + HistogramImpl hist_put_time; + HistogramImpl hist_wal_time; + HistogramImpl hist_time_diff; + + SetPerfLevel(kEnableTime); + StopWatchNano timer(Env::Default()); + for (const int i : keys) { + std::string key = "k" + std::to_string(i); + std::string value = "v" + std::to_string(i); + + perf_context.Reset(); + timer.Start(); + db->Put(write_options, key, value); + auto put_time = timer.ElapsedNanos(); + hist_put_time.Add(put_time); + hist_wal_time.Add(perf_context.wal_write_time); + hist_time_diff.Add(put_time - perf_context.wal_write_time); + } + + std::cout << "Put time:\n" << hist_put_time.ToString() + << "WAL time:\n" << hist_wal_time.ToString() + << "time diff:\n" << hist_time_diff.ToString(); + + HistogramImpl hist_seek; + HistogramImpl hist_next; + + for (int i = 0; i < FLAGS_total_keys; ++i) { + std::string key = "k" + std::to_string(i); + std::string value = "v" + std::to_string(i); + + 
std::unique_ptr iter(db->NewIterator(read_options)); + perf_context.Reset(); + iter->Seek(key); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value().ToString(), value); + hist_seek.Add(perf_context.user_key_comparison_count); + } + + std::unique_ptr iter(db->NewIterator(read_options)); + for (iter->SeekToFirst(); iter->Valid();) { + perf_context.Reset(); + iter->Next(); + hist_next.Add(perf_context.user_key_comparison_count); + } + + std::cout << "Seek:\n" << hist_seek.ToString() + << "Next:\n" << hist_next.ToString(); +} + +} + +int main(int argc, char** argv) { + + for (int i = 1; i < argc; i++) { + int n; + char junk; + + if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { + FLAGS_write_buffer_size = n; + } + + if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) { + FLAGS_total_keys = n; + } + + if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_random_key = n; + } + + if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_use_set_based_memetable = n; + } + + } + + std::cout << kDbName << "\n"; + + rocksdb::test::RunAllTests(); + return 0; +} diff --git a/db/prefix_filter_iterator.h b/db/prefix_filter_iterator.h new file mode 100644 index 00000000..f4488379 --- /dev/null +++ b/db/prefix_filter_iterator.h @@ -0,0 +1,73 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Wrap an underlying iterator, but exclude any results not starting +// with a given prefix. 
Seeking to keys not beginning with the prefix +// is invalid, and SeekToLast is not implemented (that would be +// non-trivial), but otherwise this iterator will behave just like the +// underlying iterator would if there happened to be no non-matching +// keys in the dataset. + +#pragma once +#include "rocksdb/iterator.h" + +namespace rocksdb { + +class PrefixFilterIterator : public Iterator { + private: + Iterator* iter_; + const Slice &prefix_; + const SliceTransform *prefix_extractor_; + Status status_; + + public: + PrefixFilterIterator(Iterator* iter, + const Slice &prefix, + const SliceTransform* prefix_extractor) + : iter_(iter), prefix_(prefix), + prefix_extractor_(prefix_extractor), + status_(Status::OK()) { + if (prefix_extractor == nullptr) { + status_ = Status::InvalidArgument("A prefix filter may not be used " + "unless a function is also defined " + "for extracting prefixes"); + } else if (!prefix_extractor_->InRange(prefix)) { + status_ = Status::InvalidArgument("Must provide a slice for prefix which" + "is a prefix for some key"); + } + } + ~PrefixFilterIterator() { + delete iter_; + } + Slice key() const { return iter_->key(); } + Slice value() const { return iter_->value(); } + Status status() const { + if (!status_.ok()) { + return status_; + } + return iter_->status(); + } + void Next() { iter_->Next(); } + void Prev() { iter_->Prev(); } + void Seek(const Slice& k) { + if (prefix_extractor_->Transform(k) == prefix_) { + iter_->Seek(k); + } else { + status_ = Status::InvalidArgument("Seek must begin with target prefix"); + } + } + void SeekToFirst() { + Seek(prefix_); + } + void SeekToLast() { + status_ = Status::NotSupported("SeekToLast is incompatible with prefixes"); + } + bool Valid() const { + return (status_.ok() && iter_->Valid() && + prefix_extractor_->Transform(iter_->key()) == prefix_); + } +}; + +} // namespace rocksdb diff --git a/db/prefix_test.cc b/db/prefix_test.cc new file mode 100644 index 00000000..7e5e9cc0 --- /dev/null +++ 
b/db/prefix_test.cc @@ -0,0 +1,329 @@ +#include +#include +#include + +#include +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/perf_context.h" +#include "util/histogram.h" +#include "util/stop_watch.h" +#include "util/testharness.h" + +DEFINE_bool(use_prefix_hash_memtable, true, ""); +DEFINE_bool(trigger_deadlock, false, + "issue delete in range scan to trigger PrefixHashMap deadlock"); +DEFINE_uint64(bucket_count, 100000, "number of buckets"); +DEFINE_uint64(num_locks, 10001, "number of locks"); +DEFINE_bool(random_prefix, false, "randomize prefix"); +DEFINE_uint64(total_prefixes, 1000, "total number of prefixes"); +DEFINE_uint64(items_per_prefix, 10, "total number of values per prefix"); +DEFINE_int64(write_buffer_size, 1000000000, ""); +DEFINE_int64(max_write_buffer_number, 8, ""); +DEFINE_int64(min_write_buffer_number_to_merge, 7, ""); + +// Path to the database on file system +const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test"; + +namespace rocksdb { + +struct TestKey { + uint64_t prefix; + uint64_t sorted; + + TestKey(uint64_t prefix, uint64_t sorted) : prefix(prefix), sorted(sorted) {} +}; + +// return a slice backed by test_key +inline Slice TestKeyToSlice(const TestKey& test_key) { + return Slice((const char*)&test_key, sizeof(test_key)); +} + +inline const TestKey* SliceToTestKey(const Slice& slice) { + return (const TestKey*)slice.data(); +} + +class TestKeyComparator : public Comparator { + public: + + // Compare needs to be aware of the possibility of a and/or b is + // prefix only + virtual int Compare(const Slice& a, const Slice& b) const { + const TestKey* key_a = SliceToTestKey(a); + const TestKey* key_b = SliceToTestKey(b); + if (key_a->prefix != key_b->prefix) { + if (key_a->prefix < key_b->prefix) return -1; + if (key_a->prefix > key_b->prefix) return 1; + } else { + ASSERT_TRUE(key_a->prefix == key_b->prefix); + // note, both a and b could be prefix only + if (a.size() != b.size()) { + // one of 
them is prefix + ASSERT_TRUE( + (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) || + (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey))); + if (a.size() < b.size()) return -1; + if (a.size() > b.size()) return 1; + } else { + // both a and b are prefix + if (a.size() == sizeof(uint64_t)) { + return 0; + } + + // both a and b are whole key + ASSERT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey)); + if (key_a->sorted < key_b->sorted) return -1; + if (key_a->sorted > key_b->sorted) return 1; + if (key_a->sorted == key_b->sorted) return 0; + } + } + return 0; + } + + virtual const char* Name() const override { + return "TestKeyComparator"; + } + + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const { + } + + virtual void FindShortSuccessor(std::string* key) const {} + +}; + +class PrefixTest { + public: + std::shared_ptr OpenDb() { + DB* db; + + options.create_if_missing = true; + options.write_buffer_size = FLAGS_write_buffer_size; + options.max_write_buffer_number = FLAGS_max_write_buffer_number; + options.min_write_buffer_number_to_merge = + FLAGS_min_write_buffer_number_to_merge; + + options.comparator = new TestKeyComparator(); + if (FLAGS_use_prefix_hash_memtable) { + auto prefix_extractor = NewFixedPrefixTransform(8); + options.prefix_extractor = prefix_extractor; + options.memtable_factory.reset(NewHashSkipListRepFactory( + prefix_extractor, FLAGS_bucket_count)); + } + + Status s = DB::Open(options, kDbName, &db); + ASSERT_OK(s); + return std::shared_ptr(db); + } + ~PrefixTest() { + delete options.comparator; + } + protected: + Options options; +}; + +TEST(PrefixTest, DynamicPrefixIterator) { + + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + std::vector prefixes; + for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { + prefixes.push_back(i); + } + + if (FLAGS_random_prefix) { + std::random_shuffle(prefixes.begin(), 
prefixes.end()); + } + + // insert x random prefix, each with y continuous element. + for (auto prefix : prefixes) { + for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { + TestKey test_key(prefix, sorted); + + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(sorted); + + ASSERT_OK(db->Put(write_options, key, value)); + } + } + + // test seek existing keys + HistogramImpl hist_seek_time; + HistogramImpl hist_seek_comparison; + + if (FLAGS_use_prefix_hash_memtable) { + read_options.prefix_seek = true; + } + std::unique_ptr iter(db->NewIterator(read_options)); + + for (auto prefix : prefixes) { + TestKey test_key(prefix, FLAGS_items_per_prefix / 2); + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(0); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + uint64_t total_keys = 0; + for (iter->Seek(key); iter->Valid(); iter->Next()) { + if (FLAGS_trigger_deadlock) { + std::cout << "Behold the deadlock!\n"; + db->Delete(write_options, iter->key()); + } + auto test_key = SliceToTestKey(iter->key()); + if (test_key->prefix != prefix) break; + total_keys++; + } + hist_seek_time.Add(timer.ElapsedNanos()); + hist_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2); + } + + std::cout << "Seek key comparison: \n" + << hist_seek_comparison.ToString() + << "Seek time: \n" + << hist_seek_time.ToString(); + + // test non-existing keys + HistogramImpl hist_no_seek_time; + HistogramImpl hist_no_seek_comparison; + + for (auto prefix = FLAGS_total_prefixes; + prefix < FLAGS_total_prefixes + 100; + prefix++) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->Seek(key); + hist_no_seek_time.Add(timer.ElapsedNanos()); + hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); + 
ASSERT_TRUE(!iter->Valid()); + } + + std::cout << "non-existing Seek key comparison: \n" + << hist_no_seek_comparison.ToString() + << "non-existing Seek time: \n" + << hist_no_seek_time.ToString(); +} + +TEST(PrefixTest, PrefixHash) { + + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + std::vector prefixes; + for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { + prefixes.push_back(i); + } + + if (FLAGS_random_prefix) { + std::random_shuffle(prefixes.begin(), prefixes.end()); + } + + // insert x random prefix, each with y continuous element. + HistogramImpl hist_put_time; + HistogramImpl hist_put_comparison; + + for (auto prefix : prefixes) { + for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { + TestKey test_key(prefix, sorted); + + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(sorted); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + ASSERT_OK(db->Put(write_options, key, value)); + hist_put_time.Add(timer.ElapsedNanos()); + hist_put_comparison.Add(perf_context.user_key_comparison_count); + } + } + + std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() + << "Put time: \n" << hist_put_time.ToString(); + + + // test seek existing keys + HistogramImpl hist_seek_time; + HistogramImpl hist_seek_comparison; + + for (auto prefix : prefixes) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(0); + + Slice key_prefix; + if (FLAGS_use_prefix_hash_memtable) { + key_prefix = options.prefix_extractor->Transform(key); + read_options.prefix = &key_prefix; + } + std::unique_ptr iter(db->NewIterator(read_options)); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + uint64_t total_keys = 0; + for (iter->Seek(key); iter->Valid(); iter->Next()) { + if (FLAGS_trigger_deadlock) { + std::cout << "Behold the deadlock!\n"; + 
db->Delete(write_options, iter->key()); + } + auto test_key = SliceToTestKey(iter->key()); + if (test_key->prefix != prefix) break; + total_keys++; + } + hist_seek_time.Add(timer.ElapsedNanos()); + hist_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_EQ(total_keys, FLAGS_items_per_prefix); + } + + std::cout << "Seek key comparison: \n" + << hist_seek_comparison.ToString() + << "Seek time: \n" + << hist_seek_time.ToString(); + + // test non-existing keys + HistogramImpl hist_no_seek_time; + HistogramImpl hist_no_seek_comparison; + + for (auto prefix = FLAGS_total_prefixes; + prefix < FLAGS_total_prefixes + 100; + prefix++) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); + + // Declared at loop scope: read_options.prefix stores a pointer to this + // Slice, so it has to outlive the NewIterator() and Seek() calls below. + Slice key_prefix; + if (FLAGS_use_prefix_hash_memtable) { + key_prefix = options.prefix_extractor->Transform(key); + read_options.prefix = &key_prefix; + } + std::unique_ptr iter(db->NewIterator(read_options)); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->Seek(key); + hist_no_seek_time.Add(timer.ElapsedNanos()); + hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_TRUE(!iter->Valid()); + } + + std::cout << "non-existing Seek key comparison: \n" + << hist_no_seek_comparison.ToString() + << "non-existing Seek time: \n" + << hist_no_seek_time.ToString(); +} + +} + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + std::cout << kDbName << "\n"; + + rocksdb::test::RunAllTests(); + return 0; +} diff --git a/db/repair.cc b/db/repair.cc new file mode 100644 index 00000000..29524233 --- /dev/null +++ b/db/repair.cc @@ -0,0 +1,390 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// We recover the contents of the descriptor from the other files we find. +// (1) Any log files are first converted to tables +// (2) We scan every table to compute +// (a) smallest/largest for the table +// (b) largest sequence number in the table +// (3) We generate descriptor contents: +// - log number is set to zero +// - next-file-number is set to 1 + largest file number we found +// - last-sequence-number is set to largest sequence# found across +// all tables (see 2b) +// - compaction pointers are cleared +// - every table file is added at level 0 +// +// Possible optimization 1: +// (a) Compute total size and use to pick appropriate max-level M +// (b) Sort tables by largest sequence# in the table +// (c) For each table: if it overlaps earlier table, place in level-0, +// else place in level-M. +// Possible optimization 2: +// Store per-table metadata (smallest, largest, largest-seq#, ...) +// in the table's meta section to speed up ScanTable. + +#include "db/builder.h" +#include "db/db_impl.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "db/write_batch_internal.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" + +namespace rocksdb { + +namespace { + +class Repairer { + public: + Repairer(const std::string& dbname, const Options& options) + : dbname_(dbname), + env_(options.env), + icmp_(options.comparator), + ipolicy_(options.filter_policy), + options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)), + next_file_number_(1) { + // TableCache can be small since we expect each table to be opened once. 
+ table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10); + edit_ = new VersionEdit(); + } + + ~Repairer() { + delete table_cache_; + delete edit_; + } + + Status Run() { + Status status = FindFiles(); + if (status.ok()) { + ConvertLogFilesToTables(); + ExtractMetaData(); + status = WriteDescriptor(); + } + if (status.ok()) { + unsigned long long bytes = 0; + for (size_t i = 0; i < tables_.size(); i++) { + bytes += tables_[i].meta.file_size; + } + Log(options_.info_log, + "**** Repaired rocksdb %s; " + "recovered %d files; %llu bytes. " + "Some data may have been lost. " + "****", + dbname_.c_str(), + static_cast(tables_.size()), + bytes); + } + return status; + } + + private: + struct TableInfo { + FileMetaData meta; + SequenceNumber min_sequence; + SequenceNumber max_sequence; + }; + + std::string const dbname_; + Env* const env_; + InternalKeyComparator const icmp_; + InternalFilterPolicy const ipolicy_; + Options const options_; + TableCache* table_cache_; + VersionEdit* edit_; + + std::vector manifests_; + std::vector table_numbers_; + std::vector logs_; + std::vector tables_; + uint64_t next_file_number_; + const EnvOptions storage_options_; + + Status FindFiles() { + std::vector filenames; + Status status = env_->GetChildren(dbname_, &filenames); + if (!status.ok()) { + return status; + } + if (filenames.empty()) { + return Status::IOError(dbname_, "repair found no files"); + } + + uint64_t number; + FileType type; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type)) { + if (type == kDescriptorFile) { + manifests_.push_back(filenames[i]); + } else { + if (number + 1 > next_file_number_) { + next_file_number_ = number + 1; + } + if (type == kLogFile) { + logs_.push_back(number); + } else if (type == kTableFile) { + table_numbers_.push_back(number); + } else { + // Ignore other files + } + } + } + } + return status; + } + + void ConvertLogFilesToTables() { + for (size_t i = 0; i < logs_.size(); 
i++) { + std::string logname = LogFileName(dbname_, logs_[i]); + Status status = ConvertLogToTable(logs_[i]); + if (!status.ok()) { + Log(options_.info_log, "Log #%llu: ignoring conversion error: %s", + (unsigned long long) logs_[i], + status.ToString().c_str()); + } + ArchiveFile(logname); + } + } + + Status ConvertLogToTable(uint64_t log) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + std::shared_ptr info_log; + uint64_t lognum; + virtual void Corruption(size_t bytes, const Status& s) { + // We print error messages for corruption, but continue repairing. + Log(info_log, "Log #%llu: dropping %d bytes; %s", + (unsigned long long) lognum, + static_cast(bytes), + s.ToString().c_str()); + } + }; + + // Open the log file + std::string logname = LogFileName(dbname_, log); + unique_ptr lfile; + Status status = env_->NewSequentialFile(logname, &lfile, storage_options_); + if (!status.ok()) { + return status; + } + + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = options_.info_log; + reporter.lognum = log; + // We intentionally make log::Reader do checksumming so that + // corruptions cause entire commits to be skipped instead of + // propagating bad information (like overly large sequence + // numbers). 
+ log::Reader reader(std::move(lfile), &reporter, true/*checksum*/, + 0/*initial_offset*/); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + MemTable* mem = new MemTable(icmp_, options_); + mem->Ref(); + int counter = 0; + while (reader.ReadRecord(&record, &scratch)) { + if (record.size() < 12) { + reporter.Corruption( + record.size(), Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); + status = WriteBatchInternal::InsertInto(&batch, mem, &options_); + if (status.ok()) { + counter += WriteBatchInternal::Count(&batch); + } else { + Log(options_.info_log, "Log #%llu: ignoring %s", + (unsigned long long) log, + status.ToString().c_str()); + status = Status::OK(); // Keep going with rest of file + } + } + + // Do not record a version edit for this conversion to a Table + // since ExtractMetaData() will also generate edits. + FileMetaData meta; + meta.number = next_file_number_++; + Iterator* iter = mem->NewIterator(); + status = BuildTable(dbname_, env_, options_, storage_options_, + table_cache_, iter, &meta, + icmp_.user_comparator(), 0, 0, + kNoCompression); + delete iter; + delete mem->Unref(); + mem = nullptr; + if (status.ok()) { + if (meta.file_size > 0) { + table_numbers_.push_back(meta.number); + } + } + Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s", + (unsigned long long) log, + counter, + (unsigned long long) meta.number, + status.ToString().c_str()); + return status; + } + + void ExtractMetaData() { + std::vector kept; + for (size_t i = 0; i < table_numbers_.size(); i++) { + TableInfo t; + t.meta.number = table_numbers_[i]; + Status status = ScanTable(&t); + if (!status.ok()) { + std::string fname = TableFileName(dbname_, table_numbers_[i]); + Log(options_.info_log, "Table #%llu: ignoring %s", + (unsigned long long) table_numbers_[i], + status.ToString().c_str()); + ArchiveFile(fname); + } else { + 
tables_.push_back(t); + } + } + } + + Status ScanTable(TableInfo* t) { + std::string fname = TableFileName(dbname_, t->meta.number); + int counter = 0; + Status status = env_->GetFileSize(fname, &t->meta.file_size); + if (status.ok()) { + Iterator* iter = table_cache_->NewIterator( + ReadOptions(), storage_options_, t->meta.number, t->meta.file_size); + bool empty = true; + ParsedInternalKey parsed; + t->min_sequence = 0; + t->max_sequence = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + Slice key = iter->key(); + if (!ParseInternalKey(key, &parsed)) { + Log(options_.info_log, "Table #%llu: unparsable key %s", + (unsigned long long) t->meta.number, + EscapeString(key).c_str()); + continue; + } + + counter++; + if (empty) { + empty = false; + t->meta.smallest.DecodeFrom(key); + // Seed the minimum from the first parsable key. Left at its initial + // 0, the "<" test below could never lower it and every table would + // report 0 as its smallest sequence number. + t->min_sequence = parsed.sequence; + } + t->meta.largest.DecodeFrom(key); + if (parsed.sequence < t->min_sequence) { + t->min_sequence = parsed.sequence; + } + if (parsed.sequence > t->max_sequence) { + t->max_sequence = parsed.sequence; + } + } + if (!iter->status().ok()) { + status = iter->status(); + } + delete iter; + } + Log(options_.info_log, "Table #%llu: %d entries %s", + (unsigned long long) t->meta.number, + counter, + status.ToString().c_str()); + return status; + } + + Status WriteDescriptor() { + std::string tmp = TempFileName(dbname_, 1); + unique_ptr file; + Status status = env_->NewWritableFile(tmp, &file, storage_options_); + if (!status.ok()) { + return status; + } + + SequenceNumber max_sequence = 0; + for (size_t i = 0; i < tables_.size(); i++) { + if (max_sequence < tables_[i].max_sequence) { + max_sequence = tables_[i].max_sequence; + } + } + + edit_->SetComparatorName(icmp_.user_comparator()->Name()); + edit_->SetLogNumber(0); + edit_->SetNextFile(next_file_number_); + edit_->SetLastSequence(max_sequence); + + for (size_t i = 0; i < tables_.size(); i++) { + // TODO(opt): separate out into multiple levels + const TableInfo& t = tables_[i]; + edit_->AddFile(0, t.meta.number, 
t.meta.file_size, + t.meta.smallest, t.meta.largest, + t.min_sequence, t.max_sequence); + } + + //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); + { + log::Writer log(std::move(file)); + std::string record; + edit_->EncodeTo(&record); + status = log.AddRecord(record); + } + + if (!status.ok()) { + env_->DeleteFile(tmp); + } else { + // Discard older manifests + for (size_t i = 0; i < manifests_.size(); i++) { + ArchiveFile(dbname_ + "/" + manifests_[i]); + } + + // Install new manifest + status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1)); + if (status.ok()) { + status = SetCurrentFile(env_, dbname_, 1); + } else { + env_->DeleteFile(tmp); + } + } + return status; + } + + void ArchiveFile(const std::string& fname) { + // Move into another directory. E.g., for + // dir/foo + // rename to + // dir/lost/foo + const char* slash = strrchr(fname.c_str(), '/'); + std::string new_dir; + if (slash != nullptr) { + new_dir.assign(fname.data(), slash - fname.data()); + } + new_dir.append("/lost"); + env_->CreateDir(new_dir); // Ignore error + std::string new_file = new_dir; + new_file.append("/"); + new_file.append((slash == nullptr) ? fname.c_str() : slash + 1); + Status s = env_->RenameFile(fname, new_file); + Log(options_.info_log, "Archiving %s: %s\n", + fname.c_str(), s.ToString().c_str()); + } +}; +} // namespace + +Status RepairDB(const std::string& dbname, const Options& options) { + Repairer repairer(dbname, options); + return repairer.Run(); +} + +} // namespace rocksdb diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc new file mode 100644 index 00000000..555d3189 --- /dev/null +++ b/db/simple_table_db_test.cc @@ -0,0 +1,793 @@ +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/filter_policy.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "db/db_statistics.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "utilities/merge_operators.h" + +using std::unique_ptr; + +namespace rocksdb { + +// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built +// as production quality. +// SimpleTable requires the input key size to be fixed 16 bytes, value cannot +// be longer than 150000 bytes and stored data on disk in this format: +// +--------------------------------------------+ <= key1 offset +// | key1 | value_size (4 bytes) | | +// +----------------------------------------+ | +// | value1 | +// | | +// +----------------------------------------+---+ <= key2 offset +// | key2 | value_size (4 bytes) | | +// +----------------------------------------+ | +// | value2 | +// | | +// | ...... 
| +// +-----------------+--------------------------+ <= index_block_offset +// | key1 | key1 offset (8 bytes) | +// +-----------------+--------------------------+ +// | key2 | key2 offset (8 bytes) | +// +-----------------+--------------------------+ +// | key3 | key3 offset (8 bytes) | +// +-----------------+--------------------------+ +// | ...... | +// +-----------------+------------+-------------+ +// | index_block_offset (8 bytes) | +// +------------------------------+ + +// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built +// as production quality. +class SimpleTableReader: public TableReader { +public: + // Attempt to open the table that is stored in bytes [0..file_size) + // of "file", and read the metadata entries necessary to allow + // retrieving data from the table. + // + // If successful, returns ok and sets "*table" to the newly opened + // table. The client should delete "*table" when no longer needed. + // If there was an error while initializing the table, sets "*table" + // to nullptr and returns a non-ok status. Does not take ownership of + // "*source", but the client must ensure that "source" remains live + // for the duration of the returned table's lifetime. + // + // *file must remain live while this Table is in use. 
+ static Status Open(const Options& options, const EnvOptions& soptions, + unique_ptr && file, uint64_t file_size, + unique_ptr* table_reader); + + bool PrefixMayMatch(const Slice& internal_prefix) override; + + Iterator* NewIterator(const ReadOptions&) override; + + Status Get( + const ReadOptions&, const Slice& key, void* arg, + bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool), + void (*mark_key_may_exist)(void*) = nullptr) override; + + uint64_t ApproximateOffsetOf(const Slice& key) override; + + bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override; + + void SetupForCompaction() override; + + TableProperties& GetTableProperties() override; + + ~SimpleTableReader(); + +private: + struct Rep; + Rep* rep_; + + explicit SimpleTableReader(Rep* rep) { + rep_ = rep; + } + friend class TableCache; + friend class SimpleTableIterator; + + Status GetOffset(const Slice& target, uint64_t* offset); + + // No copying allowed + explicit SimpleTableReader(const TableReader&) = delete; + void operator=(const TableReader&) = delete; +}; + +// Iterator to iterate SimpleTable +class SimpleTableIterator: public Iterator { +public: + explicit SimpleTableIterator(SimpleTableReader* table); + ~SimpleTableIterator(); + + bool Valid() const; + + void SeekToFirst(); + + void SeekToLast(); + + void Seek(const Slice& target); + + void Next(); + + void Prev(); + + Slice key() const; + + Slice value() const; + + Status status() const; + +private: + SimpleTableReader* table_; + uint64_t offset_; + uint64_t next_offset_; + Slice key_; + Slice value_; + char tmp_str_[4]; + char* key_str_; + char* value_str_; + int value_str_len_; + Status status_; + // No copying allowed + SimpleTableIterator(const SimpleTableIterator&) = delete; + void operator=(const Iterator&) = delete; +}; + +struct SimpleTableReader::Rep { + ~Rep() { + } + Rep(const EnvOptions& storage_options, uint64_t index_start_offset, + int num_entries) : + soptions(storage_options), 
index_start_offset(index_start_offset), + num_entries(num_entries) { + } + + Options options; + const EnvOptions& soptions; + Status status; + unique_ptr file; + uint64_t index_start_offset; + int num_entries; + TableProperties table_properties; + + const static int user_key_size = 16; + const static int offset_length = 8; + const static int key_footer_len = 8; + + static int GetInternalKeyLength() { + return user_key_size + key_footer_len; + } +}; + +SimpleTableReader::~SimpleTableReader() { + delete rep_; +} + +Status SimpleTableReader::Open(const Options& options, + const EnvOptions& soptions, + unique_ptr && file, + uint64_t size, + unique_ptr* table_reader) { + char footer_space[Rep::offset_length]; + Slice footer_input; + Status s = file->Read(size - Rep::offset_length, Rep::offset_length, + &footer_input, footer_space); + // The data read is reported through footer_input; Read() is not required + // to have filled footer_space, so decode from the Slice and reject a + // footer that came back short. + if (s.ok() && footer_input.size() < sizeof(footer_space)) { + s = Status::Corruption("SimpleTable: truncated footer"); + } + if (s.ok()) { + uint64_t index_start_offset = DecodeFixed64(footer_input.data()); + + int num_entries = (size - Rep::offset_length - index_start_offset) + / (Rep::GetInternalKeyLength() + Rep::offset_length); + SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions, + index_start_offset, + num_entries); + + rep->file = std::move(file); + rep->options = options; + table_reader->reset(new SimpleTableReader(rep)); + } + return s; +} + +void SimpleTableReader::SetupForCompaction() { +} + +TableProperties& SimpleTableReader::GetTableProperties() { + return rep_->table_properties; +} + +bool SimpleTableReader::PrefixMayMatch(const Slice& internal_prefix) { + return true; +} + +Iterator* SimpleTableReader::NewIterator(const ReadOptions& options) { + return new SimpleTableIterator(this); +} + +Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { + // With an empty index "num_entries - 1" would wrap to UINT32_MAX and the + // search below would never terminate; report end-of-data instead. + if (rep_->num_entries <= 0) { + *offset = rep_->index_start_offset; + return Status::OK(); + } + uint32_t left = 0; + uint32_t right = rep_->num_entries - 1; + char key_chars[Rep::GetInternalKeyLength()]; + Slice tmp_slice; + + uint32_t target_offset = 0; + while (left <= right) { + uint32_t mid = (left + right + 1) / 2; + + uint64_t offset_to_read = 
rep_->index_start_offset + + (Rep::GetInternalKeyLength() + Rep::offset_length) * mid; + Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(), + &tmp_slice, key_chars); + if (!s.ok()) { + return s; + } + + int compare_result = rep_->options.comparator->Compare(tmp_slice, target); + + if (compare_result < 0) { + if (left == right) { + target_offset = right + 1; + break; + } + left = mid; + } else { + if (left == right) { + target_offset = left; + break; + } + right = mid - 1; + } + } + + if (target_offset >= (uint32_t) rep_->num_entries) { + *offset = rep_->index_start_offset; + return Status::OK(); + } + + char value_offset_chars[Rep::offset_length]; + + int64_t offset_for_value_offset = rep_->index_start_offset + + (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset + + Rep::GetInternalKeyLength(); + Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length, + &tmp_slice, value_offset_chars); + // Decode from the Slice that Read() returned, not from the scratch + // buffer, and treat a short read as a corrupt index entry. + if (s.ok() && tmp_slice.size() < sizeof(value_offset_chars)) { + s = Status::Corruption("SimpleTable: truncated index entry"); + } + if (s.ok()) { + *offset = DecodeFixed64(tmp_slice.data()); + } + return s; +} + +Status SimpleTableReader::Get( + const ReadOptions& options, const Slice& k, void* arg, + bool (*saver)(void*, const Slice&, const Slice&, bool), + void (*mark_key_may_exist)(void*)) { + Status s; + SimpleTableIterator* iter = new SimpleTableIterator(this); + for (iter->Seek(k); iter->Valid(); iter->Next()) { + if (!(*saver)(arg, iter->key(), iter->value(), true)) { + break; + } + } + s = iter->status(); + delete iter; + return s; +} + +bool SimpleTableReader::TEST_KeyInCache(const ReadOptions& options, + const Slice& key) { + return false; +} + +uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) { + return 0; +} + +SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) : + table_(table) { + key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()]; + value_str_len_ = -1; + SeekToFirst(); +} + +SimpleTableIterator::~SimpleTableIterator() { + delete[] key_str_; + if (value_str_len_ >= 0) { + delete[] 
value_str_; + } +} + +bool SimpleTableIterator::Valid() const { + return offset_ < table_->rep_->index_start_offset; +} + +void SimpleTableIterator::SeekToFirst() { + next_offset_ = 0; + Next(); +} + +void SimpleTableIterator::SeekToLast() { + assert(false); +} + +void SimpleTableIterator::Seek(const Slice& target) { + Status s = table_->GetOffset(target, &next_offset_); + if (!s.ok()) { + status_ = s; + // GetOffset() does not set next_offset_ on failure; park at the end of + // the data so the Next() below leaves the iterator invalid. + next_offset_ = table_->rep_->index_start_offset; + } + Next(); +} + +void SimpleTableIterator::Next() { + offset_ = next_offset_; + if (offset_ >= table_->rep_->index_start_offset) { + return; + } + Slice result; + int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength(); + + Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result, + key_str_); + if (!s.ok()) { + status_ = s; // keep the failure visible through status() + } + next_offset_ += internal_key_size; + key_ = result; + + Slice value_size_slice; + s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_); + if (!s.ok() || value_size_slice.size() < 4) { + // Without a complete length prefix the entry cannot be decoded: record + // the error and park at the end of the data so Valid() turns false. + status_ = s.ok() ? Status::Corruption("SimpleTable: truncated entry") : s; + offset_ = next_offset_ = table_->rep_->index_start_offset; + return; + } + next_offset_ += 4; + // Decode from the Slice that Read() returned, not from the scratch buffer. + uint32_t value_size = DecodeFixed32(value_size_slice.data()); + + Slice value_slice; + if ((int) value_size > value_str_len_) { + if (value_str_len_ >= 0) { + delete[] value_str_; + } + value_str_ = new char[value_size]; + value_str_len_ = value_size; + } + s = table_->rep_->file->Read(next_offset_, value_size, &value_slice, + value_str_); + if (!s.ok()) { + status_ = s; // keep the failure visible through status() + } + next_offset_ += value_size; + value_ = value_slice; +} + +void SimpleTableIterator::Prev() { + assert(false); +} + +Slice SimpleTableIterator::key() const { + Log(table_->rep_->options.info_log, "key!!!!"); + return key_; +} + +Slice SimpleTableIterator::value() const { + return value_; +} + +Status SimpleTableIterator::status() const { + return status_; +} + +class SimpleTableBuilder: public TableBuilder { +public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). The output file + // will be part of level specified by 'level'. 
A value of -1 means + // that the caller does not know which level the output file will reside. + SimpleTableBuilder(const Options& options, WritableFile* file, + CompressionType compression_type); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~SimpleTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + +private: + struct Rep; + Rep* rep_; + + // No copying allowed + SimpleTableBuilder(const SimpleTableBuilder&) = delete; + void operator=(const SimpleTableBuilder&) = delete; +}; + +struct SimpleTableBuilder::Rep { + Options options; + WritableFile* file; + uint64_t offset = 0; + Status status; + + uint64_t num_entries = 0; + + bool closed = false; // Either Finish() or Abandon() has been called. 
+ + const static int user_key_size = 16; + const static int offset_length = 8; + const static int key_footer_len = 8; + + static int GetInternalKeyLength() { + return user_key_size + key_footer_len; + } + + std::string index; + + Rep(const Options& opt, WritableFile* f) : + options(opt), file(f) { + } + ~Rep() { + } +}; + +SimpleTableBuilder::SimpleTableBuilder(const Options& options, + WritableFile* file, + CompressionType compression_type) : + rep_(new SimpleTableBuilder::Rep(options, file)) { +} + +SimpleTableBuilder::~SimpleTableBuilder() { + delete (rep_); +} + +void SimpleTableBuilder::Add(const Slice& key, const Slice& value) { + assert((int ) key.size() == Rep::GetInternalKeyLength()); + + // Update index + rep_->index.append(key.data(), key.size()); + PutFixed64(&(rep_->index), rep_->offset); + + // Write key-value pair + rep_->file->Append(key); + rep_->offset += Rep::GetInternalKeyLength(); + + std::string size; + int value_size = value.size(); + PutFixed32(&size, value_size); + Slice sizeSlice(size); + rep_->file->Append(sizeSlice); + rep_->file->Append(value); + rep_->offset += value_size + 4; + + rep_->num_entries++; +} + +Status SimpleTableBuilder::status() const { + return Status::OK(); +} + +Status SimpleTableBuilder::Finish() { + Rep* r = rep_; + assert(!r->closed); + r->closed = true; + + uint64_t index_offset = rep_->offset; + Slice index_slice(rep_->index); + rep_->file->Append(index_slice); + rep_->offset += index_slice.size(); + + std::string index_offset_str; + PutFixed64(&index_offset_str, index_offset); + Slice foot_slice(index_offset_str); + rep_->file->Append(foot_slice); + rep_->offset += foot_slice.size(); + + return Status::OK(); +} + +void SimpleTableBuilder::Abandon() { + rep_->closed = true; +} + +uint64_t SimpleTableBuilder::NumEntries() const { + return rep_->num_entries; +} + +uint64_t SimpleTableBuilder::FileSize() const { + return rep_->offset; +} + +class SimpleTableFactory: public TableFactory { +public: + 
~SimpleTableFactory() { + } + SimpleTableFactory() { + } + const char* Name() const override { + return "SimpleTable"; + } + Status GetTableReader(const Options& options, const EnvOptions& soptions, + unique_ptr && file, + uint64_t file_size, + unique_ptr* table_reader) const; + + TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, + CompressionType compression_type) const; +}; + +Status SimpleTableFactory::GetTableReader( + const Options& options, const EnvOptions& soptions, + unique_ptr && file, uint64_t file_size, + unique_ptr* table_reader) const { + + return SimpleTableReader::Open(options, soptions, std::move(file), file_size, + table_reader); +} + +TableBuilder* SimpleTableFactory::GetTableBuilder( + const Options& options, WritableFile* file, + CompressionType compression_type) const { + return new SimpleTableBuilder(options, file, compression_type); +} + +class SimpleTableDBTest { +protected: +public: + std::string dbname_; + Env* env_; + DB* db_; + + Options last_options_; + + SimpleTableDBTest() : + env_(Env::Default()) { + dbname_ = test::TmpDir() + "/simple_table_db_test"; + ASSERT_OK(DestroyDB(dbname_, Options())); + db_ = nullptr; + Reopen(); + } + + ~SimpleTableDBTest() { + delete db_; + ASSERT_OK(DestroyDB(dbname_, Options())); + } + + // Return the current option configuration. 
+ Options CurrentOptions() { + Options options; + options.table_factory.reset(new SimpleTableFactory()); + return options; + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void Close() { + delete db_; + db_ = nullptr; + } + + void DestroyAndReopen(Options* options = nullptr) { + //Destroy using last options + Destroy(&last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(Options* options) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + } + + Status PureReopen(Options* options, DB** db) { + return DB::Open(*options, dbname_, db); + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opts; + if (options != nullptr) { + opts = *options; + } else { + opts = CurrentOptions(); + opts.create_if_missing = true; + } + last_options_ = opts; + + return DB::Open(opts, dbname_, &db_); + } + + Status Put(const Slice& k, const Slice& v) { + return db_->Put(WriteOptions(), k, v); + } + + Status Delete(const std::string& k) { + return db_->Delete(WriteOptions(), k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + + int NumTableFilesAtLevel(int level) { + std::string property; + ASSERT_TRUE( + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), + &property)); + return atoi(property.c_str()); + } + + // Return spread of files per level + std::string FilesPerLevel() { + std::string result; + int last_non_zero_offset = 0; + for (int level = 0; level < db_->NumberLevels(); level++) { + int f = NumTableFilesAtLevel(level); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? 
"," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } +}; + +TEST(SimpleTableDBTest, Empty) { + ASSERT_TRUE(db_ != nullptr); + ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); +} + +TEST(SimpleTableDBTest, ReadWrite) { + ASSERT_OK(Put("0000000000000foo", "v1")); + ASSERT_EQ("v1", Get("0000000000000foo")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("0000000000000foo", "v3")); + ASSERT_EQ("v3", Get("0000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); +} + +TEST(SimpleTableDBTest, Flush) { + ASSERT_OK(Put("0000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("0000000000000foo", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); +} + +TEST(SimpleTableDBTest, Flush2) { + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("0000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + ASSERT_OK(Put("0000000000000foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("0000000000000foo")); + + ASSERT_OK(Put("0000000000000eee", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000eee")); + + ASSERT_OK(Delete("0000000000000bar")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); + + ASSERT_OK(Put("0000000000000eee", "v5")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v5", Get("0000000000000eee")); +} + +static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key_______%06d", i); + return std::string(buf); +} + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, 
&r); + return r; +} + +TEST(SimpleTableDBTest, CompactionTrigger) { + Options options = CurrentOptions(); + options.write_buffer_size = 100 << 10; //100KB + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.level0_file_num_compaction_trigger = 3; + Reopen(&options); + + Random rnd(301); + + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + + //generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/skiplist.h b/db/skiplist.h new file mode 100644 index 00000000..06a35d91 --- /dev/null +++ b/db/skiplist.h @@ -0,0 +1,416 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Thread safety +// ------------- +// +// Writes require external synchronization, most likely a mutex. +// Reads require a guarantee that the SkipList will not be destroyed +// while the read is in progress. 
Apart from that, reads progress +// without any internal locking or synchronization. +// +// Invariants: +// +// (1) Allocated nodes are never deleted until the SkipList is +// destroyed. This is trivially guaranteed by the code since we +// never delete any skip list nodes. +// +// (2) The contents of a Node except for the next/prev pointers are +// immutable after the Node has been linked into the SkipList. +// Only Insert() modifies the list, and it is careful to initialize +// a node and use release-stores to publish the nodes in one or +// more lists. +// +// ... prev vs. next pointer ordering ... +// + +#pragma once +#include +#include +#include "port/port.h" +#include "util/random.h" + +namespace rocksdb { + +template +class SkipList { + private: + struct Node; + + public: + // Create a new SkipList object that will use "cmp" for comparing keys, + // and will allocate memory using "*arena". Objects allocated in the arena + // must remain allocated for the lifetime of the skiplist object. + explicit SkipList(Comparator cmp, Arena* arena); + + // Insert key into the list. + // REQUIRES: nothing that compares equal to key is currently in the list. + void Insert(const Key& key); + + // Returns true iff an entry that compares equal to key is in the list. + bool Contains(const Key& key) const; + + // Iteration over the contents of a skip list + class Iterator { + public: + // Initialize an iterator over the specified list. + // The returned iterator is not valid. + explicit Iterator(const SkipList* list); + + // Change the underlying skiplist used for this iterator + // This enables us not changing the iterator without deallocating + // an old one and then allocating a new one + void SetList(const SkipList* list); + + // Returns true iff the iterator is positioned at a valid node. + bool Valid() const; + + // Returns the key at the current position. + // REQUIRES: Valid() + const Key& key() const; + + // Advances to the next position. 
+ // REQUIRES: Valid() + void Next(); + + // Advances to the previous position. + // REQUIRES: Valid() + void Prev(); + + // Advance to the first entry with a key >= target + void Seek(const Key& target); + + // Position at the first entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToFirst(); + + // Position at the last entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToLast(); + + private: + const SkipList* list_; + Node* node_; + // Intentionally copyable + }; + + private: + enum { kMaxHeight = 12 }; + + // Immutable after construction + Comparator const compare_; + Arena* const arena_; // Arena used for allocations of nodes + + Node* const head_; + + // Modified only by Insert(). Read racily by readers, but stale + // values are ok. + port::AtomicPointer max_height_; // Height of the entire list + + // Used for optimizing sequential insert patterns + Node* prev_[kMaxHeight]; + int prev_height_; + + inline int GetMaxHeight() const { + return static_cast( + reinterpret_cast(max_height_.NoBarrier_Load())); + } + + // Read/written only by Insert(). + Random rnd_; + + Node* NewNode(const Key& key, int height); + int RandomHeight(); + bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } + + // Return true if key is greater than the data stored in "n" + bool KeyIsAfterNode(const Key& key, Node* n) const; + + // Return the earliest node that comes at or after key. + // Return nullptr if there is no such node. + // + // If prev is non-nullptr, fills prev[level] with pointer to previous + // node at "level" for every level in [0..max_height_-1]. + Node* FindGreaterOrEqual(const Key& key, Node** prev) const; + + // Return the latest node with a key < key. + // Return head_ if there is no such node. + Node* FindLessThan(const Key& key) const; + + // Return the last node in the list. + // Return head_ if list is empty. 
+ Node* FindLast() const; + + // No copying allowed + SkipList(const SkipList&); + void operator=(const SkipList&); +}; + +// Implementation details follow +template +struct SkipList::Node { + explicit Node(const Key& k) : key(k) { } + + Key const key; + + // Accessors/mutators for links. Wrapped in methods so we can + // add the appropriate barriers as necessary. + Node* Next(int n) { + assert(n >= 0); + // Use an 'acquire load' so that we observe a fully initialized + // version of the returned Node. + return reinterpret_cast(next_[n].Acquire_Load()); + } + void SetNext(int n, Node* x) { + assert(n >= 0); + // Use a 'release store' so that anybody who reads through this + // pointer observes a fully initialized version of the inserted node. + next_[n].Release_Store(x); + } + + // No-barrier variants that can be safely used in a few locations. + Node* NoBarrier_Next(int n) { + assert(n >= 0); + return reinterpret_cast(next_[n].NoBarrier_Load()); + } + void NoBarrier_SetNext(int n, Node* x) { + assert(n >= 0); + next_[n].NoBarrier_Store(x); + } + + private: + // Array of length equal to the node height. next_[0] is lowest level link. 
+ port::AtomicPointer next_[1]; +}; + +template +typename SkipList::Node* +SkipList::NewNode(const Key& key, int height) { + char* mem = arena_->AllocateAligned( + sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1)); + return new (mem) Node(key); +} + +template +inline SkipList::Iterator::Iterator(const SkipList* list) { + SetList(list); +} + +template +inline void SkipList::Iterator::SetList(const SkipList* list) { + list_ = list; + node_ = nullptr; +} + +template +inline bool SkipList::Iterator::Valid() const { + return node_ != nullptr; +} + +template +inline const Key& SkipList::Iterator::key() const { + assert(Valid()); + return node_->key; +} + +template +inline void SkipList::Iterator::Next() { + assert(Valid()); + node_ = node_->Next(0); +} + +template +inline void SkipList::Iterator::Prev() { + // Instead of using explicit "prev" links, we just search for the + // last node that falls before key. + assert(Valid()); + node_ = list_->FindLessThan(node_->key); + if (node_ == list_->head_) { + node_ = nullptr; + } +} + +template +inline void SkipList::Iterator::Seek(const Key& target) { + node_ = list_->FindGreaterOrEqual(target, nullptr); +} + +template +inline void SkipList::Iterator::SeekToFirst() { + node_ = list_->head_->Next(0); +} + +template +inline void SkipList::Iterator::SeekToLast() { + node_ = list_->FindLast(); + if (node_ == list_->head_) { + node_ = nullptr; + } +} + +template +int SkipList::RandomHeight() { + // Increase height with probability 1 in kBranching + static const unsigned int kBranching = 4; + int height = 1; + while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) { + height++; + } + assert(height > 0); + assert(height <= kMaxHeight); + return height; +} + +template +bool SkipList::KeyIsAfterNode(const Key& key, Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, key) < 0); +} + +template +typename SkipList::Node* SkipList::FindGreaterOrEqual(const Key& key, Node** 
prev) + const { + // Use prev as an optimization hint and fallback to slow path + if (prev && !KeyIsAfterNode(key, prev[0]->Next(0))) { + Node* x = prev[0]; + Node* next = x->Next(0); + if ((x == head_) || KeyIsAfterNode(key, x)) { + // Adjust all relevant insertion points to the previous entry + for (int i = 1; i < prev_height_; i++) { + prev[i] = x; + } + return next; + } + } + // Normal lookup + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + Node* next = x->Next(level); + // Make sure the lists are sorted. + // If x points to head_ or next points nullptr, it is trivially satisfied. + assert((x == head_) || (next == nullptr) || KeyIsAfterNode(next->key, x)); + if (KeyIsAfterNode(key, next)) { + // Keep searching in this list + x = next; + } else { + if (prev != nullptr) prev[level] = x; + if (level == 0) { + return next; + } else { + // Switch to next list + level--; + } + } + } +} + +template +typename SkipList::Node* +SkipList::FindLessThan(const Key& key) const { + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + assert(x == head_ || compare_(x->key, key) < 0); + Node* next = x->Next(level); + if (next == nullptr || compare_(next->key, key) >= 0) { + if (level == 0) { + return x; + } else { + // Switch to next list + level--; + } + } else { + x = next; + } + } +} + +template +typename SkipList::Node* SkipList::FindLast() + const { + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + Node* next = x->Next(level); + if (next == nullptr) { + if (level == 0) { + return x; + } else { + // Switch to next list + level--; + } + } else { + x = next; + } + } +} + +template +SkipList::SkipList(Comparator cmp, Arena* arena) + : compare_(cmp), + arena_(arena), + head_(NewNode(0 /* any key will do */, kMaxHeight)), + max_height_(reinterpret_cast(1)), + prev_height_(1), + rnd_(0xdeadbeef) { + for (int i = 0; i < kMaxHeight; i++) { + head_->SetNext(i, nullptr); + prev_[i] = head_; + } +} + +template +void 
SkipList::Insert(const Key& key) { + // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual() + // here since Insert() is externally synchronized. + Node* x = FindGreaterOrEqual(key, prev_); + + // Our data structure does not allow duplicate insertion + assert(x == nullptr || !Equal(key, x->key)); + + int height = RandomHeight(); + if (height > GetMaxHeight()) { + for (int i = GetMaxHeight(); i < height; i++) { + prev_[i] = head_; + } + //fprintf(stderr, "Change height from %d to %d\n", max_height_, height); + + // It is ok to mutate max_height_ without any synchronization + // with concurrent readers. A concurrent reader that observes + // the new value of max_height_ will see either the old value of + // new level pointers from head_ (nullptr), or a new value set in + // the loop below. In the former case the reader will + // immediately drop to the next level since nullptr sorts after all + // keys. In the latter case the reader will use the new node. + max_height_.NoBarrier_Store(reinterpret_cast(height)); + } + + x = NewNode(key, height); + for (int i = 0; i < height; i++) { + // NoBarrier_SetNext() suffices since we will add a barrier when + // we publish a pointer to "x" in prev[i]. + x->NoBarrier_SetNext(i, prev_[i]->NoBarrier_Next(i)); + prev_[i]->SetNext(i, x); + } + prev_[0] = x; + prev_height_ = height; +} + +template +bool SkipList::Contains(const Key& key) const { + Node* x = FindGreaterOrEqual(key, nullptr); + if (x != nullptr && Equal(key, x->key)) { + return true; + } else { + return false; + } +} + +} // namespace rocksdb diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc new file mode 100644 index 00000000..dcbaf0ab --- /dev/null +++ b/db/skiplist_test.cc @@ -0,0 +1,383 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/skiplist.h" +#include +#include "rocksdb/env.h" +#include "util/arena_impl.h" +#include "util/hash.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace rocksdb { + +typedef uint64_t Key; + +struct TestComparator { + int operator()(const Key& a, const Key& b) const { + if (a < b) { + return -1; + } else if (a > b) { + return +1; + } else { + return 0; + } + } +}; + +class SkipTest { }; + +TEST(SkipTest, Empty) { + ArenaImpl arena_impl; + TestComparator cmp; + SkipList list(cmp, &arena_impl); + ASSERT_TRUE(!list.Contains(10)); + + SkipList::Iterator iter(&list); + ASSERT_TRUE(!iter.Valid()); + iter.SeekToFirst(); + ASSERT_TRUE(!iter.Valid()); + iter.Seek(100); + ASSERT_TRUE(!iter.Valid()); + iter.SeekToLast(); + ASSERT_TRUE(!iter.Valid()); +} + +TEST(SkipTest, InsertAndLookup) { + const int N = 2000; + const int R = 5000; + Random rnd(1000); + std::set keys; + ArenaImpl arena_impl; + TestComparator cmp; + SkipList list(cmp, &arena_impl); + for (int i = 0; i < N; i++) { + Key key = rnd.Next() % R; + if (keys.insert(key).second) { + list.Insert(key); + } + } + + for (int i = 0; i < R; i++) { + if (list.Contains(i)) { + ASSERT_EQ(keys.count(i), 1U); + } else { + ASSERT_EQ(keys.count(i), 0U); + } + } + + // Simple iterator tests + { + SkipList::Iterator iter(&list); + ASSERT_TRUE(!iter.Valid()); + + iter.Seek(0); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.begin()), iter.key()); + + iter.SeekToFirst(); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.begin()), iter.key()); + + iter.SeekToLast(); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.rbegin()), iter.key()); + } + + // Forward iteration test + for (int i = 0; i 
< R; i++) { + SkipList::Iterator iter(&list); + iter.Seek(i); + + // Compare against model iterator + std::set::iterator model_iter = keys.lower_bound(i); + for (int j = 0; j < 3; j++) { + if (model_iter == keys.end()) { + ASSERT_TRUE(!iter.Valid()); + break; + } else { + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*model_iter, iter.key()); + ++model_iter; + iter.Next(); + } + } + } + + // Backward iteration test + { + SkipList::Iterator iter(&list); + iter.SeekToLast(); + + // Compare against model iterator + for (std::set::reverse_iterator model_iter = keys.rbegin(); + model_iter != keys.rend(); + ++model_iter) { + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*model_iter, iter.key()); + iter.Prev(); + } + ASSERT_TRUE(!iter.Valid()); + } +} + +// We want to make sure that with a single writer and multiple +// concurrent readers (with no synchronization other than when a +// reader's iterator is created), the reader always observes all the +// data that was present in the skip list when the iterator was +// constructor. Because insertions are happening concurrently, we may +// also observe new values that were inserted since the iterator was +// constructed, but we should never miss any values that were present +// at iterator construction time. +// +// We generate multi-part keys: +// +// where: +// key is in range [0..K-1] +// gen is a generation number for key +// hash is hash(key,gen) +// +// The insertion code picks a random key, sets gen to be 1 + the last +// generation number inserted for that key, and sets hash to Hash(key,gen). +// +// At the beginning of a read, we snapshot the last inserted +// generation number for each key. We then iterate, including random +// calls to Next() and Seek(). For every key we encounter, we +// check that it is either expected given the initial snapshot or has +// been concurrently added since the iterator started. 
+class ConcurrentTest { + private: + static const uint32_t K = 4; + + static uint64_t key(Key key) { return (key >> 40); } + static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; } + static uint64_t hash(Key key) { return key & 0xff; } + + static uint64_t HashNumbers(uint64_t k, uint64_t g) { + uint64_t data[2] = { k, g }; + return Hash(reinterpret_cast(data), sizeof(data), 0); + } + + static Key MakeKey(uint64_t k, uint64_t g) { + assert(sizeof(Key) == sizeof(uint64_t)); + assert(k <= K); // We sometimes pass K to seek to the end of the skiplist + assert(g <= 0xffffffffu); + return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff)); + } + + static bool IsValidKey(Key k) { + return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff); + } + + static Key RandomTarget(Random* rnd) { + switch (rnd->Next() % 10) { + case 0: + // Seek to beginning + return MakeKey(0, 0); + case 1: + // Seek to end + return MakeKey(K, 0); + default: + // Seek to middle + return MakeKey(rnd->Next() % K, 0); + } + } + + // Per-key generation + struct State { + port::AtomicPointer generation[K]; + void Set(int k, intptr_t v) { + generation[k].Release_Store(reinterpret_cast(v)); + } + intptr_t Get(int k) { + return reinterpret_cast(generation[k].Acquire_Load()); + } + + State() { + for (unsigned int k = 0; k < K; k++) { + Set(k, 0); + } + } + }; + + // Current state of the test + State current_; + + ArenaImpl arena_impl_; + + // SkipList is not protected by mu_. We just use a single writer + // thread to modify it. + SkipList list_; + + public: + ConcurrentTest() : list_(TestComparator(), &arena_impl_) { } + + // REQUIRES: External synchronization + void WriteStep(Random* rnd) { + const uint32_t k = rnd->Next() % K; + const intptr_t g = current_.Get(k) + 1; + const Key key = MakeKey(k, g); + list_.Insert(key); + current_.Set(k, g); + } + + void ReadStep(Random* rnd) { + // Remember the initial committed state of the skiplist. 
+ State initial_state; + for (unsigned int k = 0; k < K; k++) { + initial_state.Set(k, current_.Get(k)); + } + + Key pos = RandomTarget(rnd); + SkipList::Iterator iter(&list_); + iter.Seek(pos); + while (true) { + Key current; + if (!iter.Valid()) { + current = MakeKey(K, 0); + } else { + current = iter.key(); + ASSERT_TRUE(IsValidKey(current)) << current; + } + ASSERT_LE(pos, current) << "should not go backwards"; + + // Verify that everything in [pos,current) was not present in + // initial_state. + while (pos < current) { + ASSERT_LT(key(pos), K) << pos; + + // Note that generation 0 is never inserted, so it is ok if + // <*,0,*> is missing. + ASSERT_TRUE((gen(pos) == 0U) || + (gen(pos) > (uint64_t)initial_state.Get(key(pos))) + ) << "key: " << key(pos) + << "; gen: " << gen(pos) + << "; initgen: " + << initial_state.Get(key(pos)); + + // Advance to next key in the valid key space + if (key(pos) < key(current)) { + pos = MakeKey(key(pos) + 1, 0); + } else { + pos = MakeKey(key(pos), gen(pos) + 1); + } + } + + if (!iter.Valid()) { + break; + } + + if (rnd->Next() % 2) { + iter.Next(); + pos = MakeKey(key(pos), gen(pos) + 1); + } else { + Key new_target = RandomTarget(rnd); + if (new_target > pos) { + pos = new_target; + iter.Seek(new_target); + } + } + } + } +}; +const uint32_t ConcurrentTest::K; + +// Simple test that does single-threaded testing of the ConcurrentTest +// scaffolding. 
+TEST(SkipTest, ConcurrentWithoutThreads) { + ConcurrentTest test; + Random rnd(test::RandomSeed()); + for (int i = 0; i < 10000; i++) { + test.ReadStep(&rnd); + test.WriteStep(&rnd); + } +} + +class TestState { + public: + ConcurrentTest t_; + int seed_; + port::AtomicPointer quit_flag_; + + enum ReaderState { + STARTING, + RUNNING, + DONE + }; + + explicit TestState(int s) + : seed_(s), + quit_flag_(nullptr), + state_(STARTING), + state_cv_(&mu_) {} + + void Wait(ReaderState s) { + mu_.Lock(); + while (state_ != s) { + state_cv_.Wait(); + } + mu_.Unlock(); + } + + void Change(ReaderState s) { + mu_.Lock(); + state_ = s; + state_cv_.Signal(); + mu_.Unlock(); + } + + private: + port::Mutex mu_; + ReaderState state_; + port::CondVar state_cv_; +}; + +static void ConcurrentReader(void* arg) { + TestState* state = reinterpret_cast(arg); + Random rnd(state->seed_); + int64_t reads = 0; + state->Change(TestState::RUNNING); + while (!state->quit_flag_.Acquire_Load()) { + state->t_.ReadStep(&rnd); + ++reads; + } + state->Change(TestState::DONE); +} + +static void RunConcurrent(int run) { + const int seed = test::RandomSeed() + (run * 100); + Random rnd(seed); + const int N = 1000; + const int kSize = 1000; + for (int i = 0; i < N; i++) { + if ((i % 100) == 0) { + fprintf(stderr, "Run %d of %d\n", i, N); + } + TestState state(seed + 1); + Env::Default()->Schedule(ConcurrentReader, &state); + state.Wait(TestState::RUNNING); + for (int i = 0; i < kSize; i++) { + state.t_.WriteStep(&rnd); + } + state.quit_flag_.Release_Store(&state); // Any non-nullptr arg will do + state.Wait(TestState::DONE); + } +} + +TEST(SkipTest, Concurrent1) { RunConcurrent(1); } +TEST(SkipTest, Concurrent2) { RunConcurrent(2); } +TEST(SkipTest, Concurrent3) { RunConcurrent(3); } +TEST(SkipTest, Concurrent4) { RunConcurrent(4); } +TEST(SkipTest, Concurrent5) { RunConcurrent(5); } + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git 
a/db/snapshot.h b/db/snapshot.h new file mode 100644 index 00000000..2c2e3eac --- /dev/null +++ b/db/snapshot.h @@ -0,0 +1,86 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/db.h" + +namespace rocksdb { + +class SnapshotList; + +// Snapshots are kept in a doubly-linked list in the DB. +// Each SnapshotImpl corresponds to a particular sequence number. +class SnapshotImpl : public Snapshot { + public: + SequenceNumber number_; // const after creation + + private: + friend class SnapshotList; + + // SnapshotImpl is kept in a doubly-linked circular list + SnapshotImpl* prev_; + SnapshotImpl* next_; + + SnapshotList* list_; // just for sanity checks +}; + +class SnapshotList { + public: + SnapshotList() { + list_.prev_ = &list_; + list_.next_ = &list_; + list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging + } + + bool empty() const { return list_.next_ == &list_; } + SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; } + SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; } + + const SnapshotImpl* New(SequenceNumber seq) { + SnapshotImpl* s = new SnapshotImpl; + s->number_ = seq; + s->list_ = this; + s->next_ = &list_; + s->prev_ = list_.prev_; + s->prev_->next_ = s; + s->next_->prev_ = s; + return s; + } + + void Delete(const SnapshotImpl* s) { + assert(s->list_ == this); + s->prev_->next_ = s->next_; + s->next_->prev_ = s->prev_; + delete s; + } + + // retrieve all snapshot numbers. They are sorted in ascending order. 
+ void getAll(std::vector& ret) { + if (empty()) return; + SnapshotImpl* s = &list_; + while (s->next_ != &list_) { + ret.push_back(s->next_->number_); + s = s ->next_; + } + } + + // get the sequence number of the most recent snapshot + const SequenceNumber GetNewest() { + if (empty()) { + return 0; + } + return newest()->number_; + } + + private: + // Dummy head of doubly-linked list of snapshots + SnapshotImpl list_; +}; + +} // namespace rocksdb diff --git a/db/table_cache.cc b/db/table_cache.cc new file mode 100644 index 00000000..593352dd --- /dev/null +++ b/db/table_cache.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/table_cache.h" + +#include "db/filename.h" + +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "util/coding.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +static void DeleteEntry(const Slice& key, void* value) { + TableReader* table_reader = reinterpret_cast(value); + delete table_reader; +} + +static void UnrefEntry(void* arg1, void* arg2) { + Cache* cache = reinterpret_cast(arg1); + Cache::Handle* h = reinterpret_cast(arg2); + cache->Release(h); +} + +static Slice GetSliceForFileNumber(uint64_t* file_number) { + return Slice(reinterpret_cast(file_number), + sizeof(*file_number)); +} + +TableCache::TableCache(const std::string& dbname, + const Options* options, + const EnvOptions& storage_options, + int entries) + : env_(options->env), + dbname_(dbname), + options_(options), + storage_options_(storage_options), + cache_( + NewLRUCache(entries, options->table_cache_numshardbits, + options->table_cache_remove_scan_count_limit)) { +} + +TableCache::~TableCache() { +} + +Status TableCache::FindTable(const EnvOptions& toptions, + uint64_t file_number, uint64_t file_size, + Cache::Handle** handle, bool* table_io, + const bool no_io) { + Status s; + Slice key = GetSliceForFileNumber(&file_number); + *handle = cache_->Lookup(key); + if (*handle == nullptr) { + if (no_io) { // Dont do IO and return a not-found status + return Status::Incomplete("Table not found in table_cache, no_io is set"); + } + if (table_io != nullptr) { + *table_io = true; // we had to do IO from storage + } + std::string fname = TableFileName(dbname_, file_number); + unique_ptr file; + unique_ptr table_reader; + s = env_->NewRandomAccessFile(fname, &file, toptions); + RecordTick(options_->statistics.get(), NO_FILE_OPENS); + if (s.ok()) { + if (options_->advise_random_on_open) { + file->Hint(RandomAccessFile::RANDOM); + } + StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS); + s = options_->table_factory->GetTableReader(*options_, 
toptions, + std::move(file), file_size, + &table_reader); + } + + if (!s.ok()) { + assert(table_reader == nullptr); + RecordTick(options_->statistics.get(), NO_FILE_ERRORS); + // We do not cache error results so that if the error is transient, + // or somebody repairs the file, we recover automatically. + } else { + assert(file.get() == nullptr); + *handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry); + } + } + return s; +} + +Iterator* TableCache::NewIterator(const ReadOptions& options, + const EnvOptions& toptions, + uint64_t file_number, + uint64_t file_size, + TableReader** table_reader_ptr, + bool for_compaction) { + if (table_reader_ptr != nullptr) { + *table_reader_ptr = nullptr; + } + + Cache::Handle* handle = nullptr; + Status s = FindTable(toptions, file_number, file_size, &handle, + nullptr, options.read_tier == kBlockCacheTier); + if (!s.ok()) { + return NewErrorIterator(s); + } + + TableReader* table_reader = + reinterpret_cast(cache_->Value(handle)); + Iterator* result = table_reader->NewIterator(options); + result->RegisterCleanup(&UnrefEntry, cache_.get(), handle); + if (table_reader_ptr != nullptr) { + *table_reader_ptr = table_reader; + } + + if (for_compaction) { + table_reader->SetupForCompaction(); + } + + return result; +} + +Status TableCache::Get(const ReadOptions& options, + uint64_t file_number, + uint64_t file_size, + const Slice& k, + void* arg, + bool (*saver)(void*, const Slice&, const Slice&, bool), + bool* table_io, + void (*mark_key_may_exist)(void*)) { + Cache::Handle* handle = nullptr; + Status s = FindTable(storage_options_, file_number, file_size, + &handle, table_io, + options.read_tier == kBlockCacheTier); + if (s.ok()) { + TableReader* t = + reinterpret_cast(cache_->Value(handle)); + s = t->Get(options, k, arg, saver, mark_key_may_exist); + cache_->Release(handle); + } else if (options.read_tier && s.IsIncomplete()) { + // Couldnt find Table in cache but treat as kFound if no_io set + 
(*mark_key_may_exist)(arg); + return Status::OK(); + } + return s; +} + +bool TableCache::PrefixMayMatch(const ReadOptions& options, + uint64_t file_number, + uint64_t file_size, + const Slice& internal_prefix, + bool* table_io) { + Cache::Handle* handle = nullptr; + Status s = FindTable(storage_options_, file_number, + file_size, &handle, table_io); + bool may_match = true; + if (s.ok()) { + TableReader* t = + reinterpret_cast(cache_->Value(handle)); + may_match = t->PrefixMayMatch(internal_prefix); + cache_->Release(handle); + } + return may_match; +} + +void TableCache::Evict(uint64_t file_number) { + cache_->Erase(GetSliceForFileNumber(&file_number)); +} + +} // namespace rocksdb diff --git a/db/table_cache.h b/db/table_cache.h new file mode 100644 index 00000000..4b225af9 --- /dev/null +++ b/db/table_cache.h @@ -0,0 +1,78 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Thread-safe (provides internal synchronization) + +#pragma once +#include +#include +#include "db/dbformat.h" +#include "rocksdb/env.h" +#include "rocksdb/cache.h" +#include "port/port.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +class Env; + +class TableCache { + public: + TableCache(const std::string& dbname, const Options* options, + const EnvOptions& storage_options, int entries); + ~TableCache(); + + // Return an iterator for the specified file number (the corresponding + // file length must be exactly "file_size" bytes). 
If "tableptr" is + // non-nullptr, also sets "*tableptr" to point to the Table object + // underlying the returned iterator, or nullptr if no Table object underlies + // the returned iterator. The returned "*tableptr" object is owned by + // the cache and should not be deleted, and is valid for as long as the + // returned iterator is live. + Iterator* NewIterator(const ReadOptions& options, + const EnvOptions& toptions, + uint64_t file_number, + uint64_t file_size, + TableReader** table_reader_ptr = nullptr, + bool for_compaction = false); + + // If a seek to internal key "k" in specified file finds an entry, + // call (*handle_result)(arg, found_key, found_value) repeatedly until + // it returns false. + Status Get(const ReadOptions& options, + uint64_t file_number, + uint64_t file_size, + const Slice& k, + void* arg, + bool (*handle_result)(void*, const Slice&, const Slice&, bool), + bool* table_io, + void (*mark_key_may_exist)(void*) = nullptr); + + // Determine whether the table may contain the specified prefix. If + // the table index of blooms are not in memory, this may cause an I/O + bool PrefixMayMatch(const ReadOptions& options, uint64_t file_number, + uint64_t file_size, const Slice& internal_prefix, + bool* table_io); + + // Evict any entry for the specified file number + void Evict(uint64_t file_number); + + private: + Env* const env_; + const std::string dbname_; + const Options* options_; + const EnvOptions& storage_options_; + std::shared_ptr cache_; + + Status FindTable(const EnvOptions& toptions, uint64_t file_number, + uint64_t file_size, Cache::Handle**, bool* table_io=nullptr, + const bool no_io = false); +}; + +} // namespace rocksdb diff --git a/db/table_properties_collector.cc b/db/table_properties_collector.cc new file mode 100644 index 00000000..3654663c --- /dev/null +++ b/db/table_properties_collector.cc @@ -0,0 +1,164 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "db/table_properties_collector.h" + +#include "db/dbformat.h" +#include "util/coding.h" + +namespace rocksdb { + +namespace { + void AppendProperty( + std::string& props, + const std::string& key, + const std::string& value, + const std::string& prop_delim, + const std::string& kv_delim) { + props.append(key); + props.append(kv_delim); + props.append(value); + props.append(prop_delim); + } + + template + void AppendProperty( + std::string& props, + const std::string& key, + const TValue& value, + const std::string& prop_delim, + const std::string& kv_delim) { + AppendProperty( + props, key, std::to_string(value), prop_delim, kv_delim + ); + } +} + +std::string TableProperties::ToString( + const std::string& prop_delim, + const std::string& kv_delim) const { + std::string result; + result.reserve(1024); + + // Basic Info + AppendProperty( + result, "# data blocks", num_data_blocks, prop_delim, kv_delim + ); + AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); + + AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim); + AppendProperty( + result, + "raw average key size", + num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0, + prop_delim, + kv_delim + ); + AppendProperty( + result, "raw value size", raw_value_size, prop_delim, kv_delim + ); + AppendProperty( + result, + "raw average value size", + num_entries != 0 ? 
1.0 * raw_value_size / num_entries : 0.0, + prop_delim, + kv_delim + ); + + AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); + AppendProperty(result, "index block size", index_size, prop_delim, kv_delim); + AppendProperty( + result, "filter block size", filter_size, prop_delim, kv_delim + ); + AppendProperty( + result, + "(estimated) table size", + data_size + index_size + filter_size, + prop_delim, + kv_delim + ); + + AppendProperty( + result, + "filter policy name", + filter_policy_name.empty() ? std::string("N/A") : filter_policy_name, + prop_delim, + kv_delim + ); + + return result; +} + +Status InternalKeyPropertiesCollector::Add( + const Slice& key, const Slice& value) { + ParsedInternalKey ikey; + if (!ParseInternalKey(key, &ikey)) { + return Status::InvalidArgument("Invalid internal key"); + } + + if (ikey.type == ValueType::kTypeDeletion) { + ++deleted_keys_; + } + + return Status::OK(); +} + +Status InternalKeyPropertiesCollector::Finish( + TableProperties::UserCollectedProperties* properties) { + assert(properties); + assert(properties->find( + InternalKeyTablePropertiesNames::kDeletedKeys) == properties->end()); + std::string val; + + PutVarint64(&val, deleted_keys_); + properties->insert({ InternalKeyTablePropertiesNames::kDeletedKeys, val }); + + return Status::OK(); +} + +TableProperties::UserCollectedProperties +InternalKeyPropertiesCollector::GetReadableProperties() const { + return { + { "kDeletedKeys", std::to_string(deleted_keys_) } + }; +} + + +Status UserKeyTablePropertiesCollector::Add( + const Slice& key, const Slice& value) { + ParsedInternalKey ikey; + if (!ParseInternalKey(key, &ikey)) { + return Status::InvalidArgument("Invalid internal key"); + } + + return collector_->Add(ikey.user_key, value); +} + +Status UserKeyTablePropertiesCollector::Finish( + TableProperties::UserCollectedProperties* properties) { + return collector_->Finish(properties); +} + +TableProperties::UserCollectedProperties 
+UserKeyTablePropertiesCollector::GetReadableProperties() const { + return collector_->GetReadableProperties(); +} + + +const std::string InternalKeyTablePropertiesNames::kDeletedKeys + = "rocksdb.deleted.keys"; + +uint64_t GetDeletedKeys( + const TableProperties::UserCollectedProperties& props) { + auto pos = props.find(InternalKeyTablePropertiesNames::kDeletedKeys); + if (pos == props.end()) { + return 0; + } + Slice raw = pos->second; + uint64_t val = 0; + return GetVarint64(&raw, &val) ? val : 0; +} + +} // namespace rocksdb diff --git a/db/table_properties_collector.h b/db/table_properties_collector.h new file mode 100644 index 00000000..533130db --- /dev/null +++ b/db/table_properties_collector.h @@ -0,0 +1,76 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file defines a collection of statistics collectors. +#pragma once + +#include "rocksdb/table_properties.h" + +#include +#include +#include + +namespace rocksdb { + +struct InternalKeyTablePropertiesNames { + static const std::string kDeletedKeys; +}; + +// Collecting the statistics for internal keys. Visible only by internal +// rocksdb modules. 
+class InternalKeyPropertiesCollector : public TablePropertiesCollector { + public: + virtual Status Add(const Slice& key, const Slice& value) override; + + virtual Status Finish( + TableProperties::UserCollectedProperties* properties) override; + + virtual const char* Name() const override { + return "InternalKeyPropertiesCollector"; + } + + TableProperties::UserCollectedProperties + GetReadableProperties() const override; + + private: + uint64_t deleted_keys_ = 0; +}; + +// When rocksdb creates a new table, it will encode all "user keys" into +// "internal keys", which contains meta information of a given entry. +// +// This class extracts user key from the encoded internal key when Add() is +// invoked. +class UserKeyTablePropertiesCollector : public TablePropertiesCollector { + public: + explicit UserKeyTablePropertiesCollector( + TablePropertiesCollector* collector) : + UserKeyTablePropertiesCollector( + std::shared_ptr(collector) + ) { + } + + explicit UserKeyTablePropertiesCollector( + std::shared_ptr collector) : + collector_(collector) { + } + + virtual ~UserKeyTablePropertiesCollector() { } + + virtual Status Add(const Slice& key, const Slice& value) override; + + virtual Status Finish( + TableProperties::UserCollectedProperties* properties) override; + + virtual const char* Name() const override { return collector_->Name(); } + + TableProperties::UserCollectedProperties + GetReadableProperties() const override; + + protected: + std::shared_ptr collector_; +}; + +} // namespace rocksdb diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc new file mode 100644 index 00000000..6f405b28 --- /dev/null +++ b/db/table_properties_collector_test.cc @@ -0,0 +1,266 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include +#include + +#include "db/dbformat.h" +#include "db/db_impl.h" +#include "db/table_properties_collector.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/table.h" +#include "table/block_based_table_factory.h" +#include "util/coding.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class TablePropertiesTest { + private: + unique_ptr table_reader_; +}; + +// TODO(kailiu) the following classes should be moved to some more general +// places, so that other tests can also make use of them. +// `FakeWritableFile` and `FakeRandomeAccessFile` bypass the real file system +// and therefore enable us to quickly setup the tests. +class FakeWritableFile : public WritableFile { + public: + ~FakeWritableFile() { } + + const std::string& contents() const { return contents_; } + + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } + + virtual Status Append(const Slice& data) { + contents_.append(data.data(), data.size()); + return Status::OK(); + } + + private: + std::string contents_; +}; + + +class FakeRandomeAccessFile : public RandomAccessFile { + public: + explicit FakeRandomeAccessFile(const Slice& contents) + : contents_(contents.data(), contents.size()) { + } + + virtual ~FakeRandomeAccessFile() { } + + uint64_t Size() const { return contents_.size(); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + if (offset > contents_.size()) { + return Status::InvalidArgument("invalid Read offset"); + } + if (offset + n > contents_.size()) { + n = contents_.size() - offset; + } + memcpy(scratch, &contents_[offset], n); + *result = Slice(scratch, n); + return Status::OK(); + } + + private: + std::string contents_; +}; + + +class DumbLogger : public Logger { + public: + virtual void 
Logv(const char* format, va_list ap) { } + virtual size_t GetLogFileSize() const { return 0; } +}; + +// Utilities test functions +void MakeBuilder( + const Options& options, + std::unique_ptr* writable, + std::unique_ptr* builder) { + writable->reset(new FakeWritableFile); + builder->reset( + options.table_factory->GetTableBuilder(options, writable->get(), + options.compression)); +} + +void OpenTable( + const Options& options, + const std::string& contents, + std::unique_ptr* table_reader) { + + std::unique_ptr file(new FakeRandomeAccessFile(contents)); + auto s = options.table_factory->GetTableReader( + options, + EnvOptions(), + std::move(file), + contents.size(), + table_reader + ); + ASSERT_OK(s); +} + +// Collects keys that starts with "A" in a table. +class RegularKeysStartWithA: public TablePropertiesCollector { + public: + const char* Name() const { return "RegularKeysStartWithA"; } + + Status Finish(TableProperties::UserCollectedProperties* properties) { + std::string encoded; + PutVarint32(&encoded, count_); + *properties = TableProperties::UserCollectedProperties { + { "TablePropertiesTest", "Rocksdb" }, + { "Count", encoded } + }; + return Status::OK(); + } + + Status Add(const Slice& user_key, const Slice& value) { + // simply asssume all user keys are not empty. + if (user_key.data()[0] == 'A') { + ++count_; + } + return Status::OK(); + } + + virtual TableProperties::UserCollectedProperties + GetReadableProperties() const { + return {}; + } + + + private: + uint32_t count_ = 0; +}; + +TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) { + Options options; + + // make sure the entries will be inserted with order. 
+ std::map kvs = { + {"About", "val5"}, // starts with 'A' + {"Abstract", "val2"}, // starts with 'A' + {"Around", "val7"}, // starts with 'A' + {"Beyond", "val3"}, + {"Builder", "val1"}, + {"Cancel", "val4"}, + {"Find", "val6"}, + }; + + // Test properties collectors with internal keys or regular keys + for (bool encode_as_internal : { true, false }) { + // -- Step 1: build table + auto collector = new RegularKeysStartWithA(); + if (encode_as_internal) { + options.table_properties_collectors = { + std::make_shared(collector) + }; + } else { + options.table_properties_collectors.resize(1); + options.table_properties_collectors[0].reset(collector); + } + std::unique_ptr builder; + std::unique_ptr writable; + MakeBuilder(options, &writable, &builder); + + for (const auto& kv : kvs) { + if (encode_as_internal) { + InternalKey ikey(kv.first, 0, ValueType::kTypeValue); + builder->Add(ikey.Encode(), kv.second); + } else { + builder->Add(kv.first, kv.second); + } + } + ASSERT_OK(builder->Finish()); + + // -- Step 2: Open table + std::unique_ptr table_reader; + OpenTable(options, writable->contents(), &table_reader); + const auto& properties = + table_reader->GetTableProperties().user_collected_properties; + + ASSERT_EQ("Rocksdb", properties.at("TablePropertiesTest")); + + uint32_t starts_with_A = 0; + Slice key(properties.at("Count")); + ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); + ASSERT_EQ(3u, starts_with_A); + } +} + +TEST(TablePropertiesTest, InternalKeyPropertiesCollector) { + InternalKey keys[] = { + InternalKey("A", 0, ValueType::kTypeValue), + InternalKey("B", 0, ValueType::kTypeValue), + InternalKey("C", 0, ValueType::kTypeValue), + InternalKey("W", 0, ValueType::kTypeDeletion), + InternalKey("X", 0, ValueType::kTypeDeletion), + InternalKey("Y", 0, ValueType::kTypeDeletion), + InternalKey("Z", 0, ValueType::kTypeDeletion), + }; + + for (bool sanitized : { false, true }) { + std::unique_ptr builder; + std::unique_ptr writable; + Options options; + if 
(sanitized) { + options.table_properties_collectors = { + std::make_shared() + }; + // with sanitization, even regular properties collector will be able to + // handle internal keys. + auto comparator = options.comparator; + // HACK: Set options.info_log to avoid writing log in + // SanitizeOptions(). + options.info_log = std::make_shared(); + options = SanitizeOptions( + "db", // just a place holder + nullptr, // with skip internal key comparator + nullptr, // don't care filter policy + options + ); + options.comparator = comparator; + } else { + options.table_properties_collectors = { + std::make_shared() + }; + } + + MakeBuilder(options, &writable, &builder); + for (const auto& k : keys) { + builder->Add(k.Encode(), "val"); + } + + ASSERT_OK(builder->Finish()); + + std::unique_ptr table_reader; + OpenTable(options, writable->contents(), &table_reader); + const auto& properties = + table_reader->GetTableProperties().user_collected_properties; + + uint64_t deleted = GetDeletedKeys(properties); + ASSERT_EQ(4u, deleted); + + if (sanitized) { + uint32_t starts_with_A = 0; + Slice key(properties.at("Count")); + ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); + ASSERT_EQ(1u, starts_with_A); + } + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc new file mode 100644 index 00000000..092d88ca --- /dev/null +++ b/db/transaction_log_impl.cc @@ -0,0 +1,264 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include "db/transaction_log_impl.h" +#include "db/write_batch_internal.h" + +namespace rocksdb { + +TransactionLogIteratorImpl::TransactionLogIteratorImpl( + const std::string& dir, + const Options* options, + const EnvOptions& soptions, + const SequenceNumber seq, + std::unique_ptr files, + DBImpl const * const dbimpl) : + dir_(dir), + options_(options), + soptions_(soptions), + startingSequenceNumber_(seq), + files_(std::move(files)), + started_(false), + isValid_(false), + currentFileIndex_(0), + currentBatchSeq_(0), + currentLastSeq_(0), + dbimpl_(dbimpl) { + assert(files_ != nullptr); + assert(dbimpl_ != nullptr); + + reporter_.env = options_->env; + reporter_.info_log = options_->info_log.get(); + SeekToStartSequence(); // Seek till starting sequence +} + +Status TransactionLogIteratorImpl::OpenLogFile( + const LogFile* logFile, + unique_ptr* file) { + Env* env = options_->env; + if (logFile->Type() == kArchivedLogFile) { + std::string fname = ArchivedLogFileName(dir_, logFile->LogNumber()); + return env->NewSequentialFile(fname, file, soptions_); + } else { + std::string fname = LogFileName(dir_, logFile->LogNumber()); + Status status = env->NewSequentialFile(fname, file, soptions_); + if (!status.ok()) { + // If cannot open file in DB directory. + // Try the archive dir, as it could have moved in the meanwhile. + fname = ArchivedLogFileName(dir_, logFile->LogNumber()); + status = env->NewSequentialFile(fname, file, soptions_); + if (!status.ok()) { + return Status::IOError("Requested file not present in the dir"); + } + } + return status; + } +} + +BatchResult TransactionLogIteratorImpl::GetBatch() { + assert(isValid_); // cannot call in a non valid state. 
+ BatchResult result; + result.sequence = currentBatchSeq_; + result.writeBatchPtr = std::move(currentBatch_); + return result; +} + +Status TransactionLogIteratorImpl::status() { + return currentStatus_; +} + +bool TransactionLogIteratorImpl::Valid() { + return started_ && isValid_; +} + +bool TransactionLogIteratorImpl::RestrictedRead( + Slice* record, + std::string* scratch) { + // Don't read if no more complete entries to read from logs + if (currentLastSeq_ >= dbimpl_->GetLatestSequenceNumber()) { + return false; + } + return currentLogReader_->ReadRecord(record, scratch); +} + +void TransactionLogIteratorImpl::SeekToStartSequence( + uint64_t startFileIndex, + bool strict) { + std::string scratch; + Slice record; + started_ = false; + isValid_ = false; + if (files_->size() <= startFileIndex) { + return; + } + Status s = OpenLogReader(files_->at(startFileIndex).get()); + if (!s.ok()) { + currentStatus_ = s; + return; + } + while (RestrictedRead(&record, &scratch)) { + if (record.size() < 12) { + reporter_.Corruption( + record.size(), Status::Corruption("very small log record")); + continue; + } + UpdateCurrentWriteBatch(record); + if (currentLastSeq_ >= startingSequenceNumber_) { + if (strict && currentBatchSeq_ != startingSequenceNumber_) { + currentStatus_ = Status::Corruption("Gap in sequence number. Could not " + "seek to required sequence number"); + reporter_.Info(currentStatus_.ToString().c_str()); + return; + } else if (strict) { + reporter_.Info("Could seek required sequence number. Iterator will " + "continue."); + } + isValid_ = true; + started_ = true; // set started_ as we could seek till starting sequence + return; + } else { + isValid_ = false; + } + } + + // Could not find start sequence in first file. Normally this must be the + // only file. 
Otherwise log the error and let the iterator return next entry + // If strict is set, we want to seek exactly till the start sequence and it + // should have been present in the file we scanned above + if (strict) { + currentStatus_ = Status::Corruption("Gap in sequence number. Could not " + "seek to required sequence number"); + reporter_.Info(currentStatus_.ToString().c_str()); + } else if (files_->size() != 1) { + currentStatus_ = Status::Corruption("Start sequence was not found, " + "skipping to the next available"); + reporter_.Info(currentStatus_.ToString().c_str()); + // Let NextImpl find the next available entry. started_ remains false + // because we don't want to check for gaps while moving to start sequence + NextImpl(true); + } +} + +void TransactionLogIteratorImpl::Next() { + return NextImpl(false); +} + +void TransactionLogIteratorImpl::NextImpl(bool internal) { + std::string scratch; + Slice record; + isValid_ = false; + if (!internal && !started_) { + // Runs every time until we can seek to the start sequence + return SeekToStartSequence(); + } + while(true) { + assert(currentLogReader_); + if (currentLogReader_->IsEOF()) { + currentLogReader_->UnmarkEOF(); + } + while (RestrictedRead(&record, &scratch)) { + if (record.size() < 12) { + reporter_.Corruption( + record.size(), Status::Corruption("very small log record")); + continue; + } else { + // started_ should be true if called by application + assert(internal || started_); + // started_ should be false if called internally + assert(!internal || !started_); + UpdateCurrentWriteBatch(record); + if (internal && !started_) { + started_ = true; + } + return; + } + } + + // Open the next file + if (currentFileIndex_ < files_->size() - 1) { + ++currentFileIndex_; + Status status =OpenLogReader(files_->at(currentFileIndex_).get()); + if (!status.ok()) { + isValid_ = false; + currentStatus_ = status; + return; + } + } else { + isValid_ = false; + if (currentLastSeq_ == dbimpl_->GetLatestSequenceNumber()) 
{ + currentStatus_ = Status::OK(); + } else { + currentStatus_ = Status::IOError("NO MORE DATA LEFT"); + } + return; + } + } +} + +bool TransactionLogIteratorImpl::IsBatchExpected( + const WriteBatch* batch, + const SequenceNumber expectedSeq) { + assert(batch); + SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch); + if (batchSeq != expectedSeq) { + char buf[200]; + snprintf(buf, sizeof(buf), + "Discontinuity in log records. Got seq=%lu, Expected seq=%lu, " + "Last flushed seq=%lu.Log iterator will reseek the correct " + "batch.", + (unsigned long)batchSeq, + (unsigned long)expectedSeq, + (unsigned long)dbimpl_->GetLatestSequenceNumber()); + reporter_.Info(buf); + return false; + } + return true; +} + +void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { + std::unique_ptr batch(new WriteBatch()); + WriteBatchInternal::SetContents(batch.get(), record); + + SequenceNumber expectedSeq = currentLastSeq_ + 1; + // If the iterator has started, then confirm that we get continuous batches + if (started_ && !IsBatchExpected(batch.get(), expectedSeq)) { + // Seek to the batch having expected sequence number + if (expectedSeq < files_->at(currentFileIndex_)->StartSequence()) { + // Expected batch must lie in the previous log file + // Avoid underflow. 
+ if (currentFileIndex_ != 0) { + currentFileIndex_--; + } + } + startingSequenceNumber_ = expectedSeq; + // currentStatus_ will be set to Ok if reseek succeeds + currentStatus_ = Status::NotFound("Gap in sequence numbers"); + return SeekToStartSequence(currentFileIndex_, true); + } + + currentBatchSeq_ = WriteBatchInternal::Sequence(batch.get()); + currentLastSeq_ = currentBatchSeq_ + + WriteBatchInternal::Count(batch.get()) - 1; + // currentBatchSeq_ can only change here + assert(currentLastSeq_ <= dbimpl_->GetLatestSequenceNumber()); + + currentBatch_ = move(batch); + isValid_ = true; + currentStatus_ = Status::OK(); +} + +Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) { + unique_ptr file; + Status status = OpenLogFile(logFile, &file); + if (!status.ok()) { + return status; + } + assert(file); + currentLogReader_.reset( + new log::Reader(std::move(file), &reporter_, true, 0) + ); + return Status::OK(); +} +} // namespace rocksdb diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h new file mode 100644 index 00000000..f3f4ce22 --- /dev/null +++ b/db/transaction_log_impl.h @@ -0,0 +1,118 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#pragma once +#include + +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/types.h" +#include "rocksdb/transaction_log.h" +#include "db/db_impl.h" +#include "db/log_reader.h" +#include "db/filename.h" + +namespace rocksdb { + +struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + virtual void Corruption(size_t bytes, const Status& s) { + Log(info_log, "dropping %zu bytes; %s", bytes, s.ToString().c_str()); + } + virtual void Info(const char* s) { + Log(info_log, "%s", s); + } +}; + +class LogFileImpl : public LogFile { + public: + LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq, + uint64_t sizeBytes) : + logNumber_(logNum), + type_(logType), + startSequence_(startSeq), + sizeFileBytes_(sizeBytes) { + } + + std::string PathName() const { + if (type_ == kArchivedLogFile) { + return ArchivedLogFileName("", logNumber_); + } + return LogFileName("", logNumber_); + } + + uint64_t LogNumber() const { return logNumber_; } + + WalFileType Type() const { return type_; } + + SequenceNumber StartSequence() const { return startSequence_; } + + uint64_t SizeFileBytes() const { return sizeFileBytes_; } + + bool operator < (const LogFile& that) const { + return LogNumber() < that.LogNumber(); + } + + private: + uint64_t logNumber_; + WalFileType type_; + SequenceNumber startSequence_; + uint64_t sizeFileBytes_; + +}; + +class TransactionLogIteratorImpl : public TransactionLogIterator { + public: + TransactionLogIteratorImpl(const std::string& dir, + const Options* options, + const EnvOptions& soptions, + const SequenceNumber seqNum, + std::unique_ptr files, + DBImpl const * const dbimpl); + + virtual bool Valid(); + + virtual void Next(); + + virtual Status status(); + + virtual BatchResult GetBatch(); + + private: + const std::string& dir_; + const Options* options_; + const EnvOptions& soptions_; + SequenceNumber startingSequenceNumber_; + std::unique_ptr files_; + bool started_; + bool 
isValid_; // not valid when it starts of. + Status currentStatus_; + size_t currentFileIndex_; + std::unique_ptr currentBatch_; + unique_ptr currentLogReader_; + Status OpenLogFile(const LogFile* logFile, unique_ptr* file); + LogReporter reporter_; + SequenceNumber currentBatchSeq_; // sequence number at start of current batch + SequenceNumber currentLastSeq_; // last sequence in the current batch + DBImpl const * const dbimpl_; // The db on whose log files this iterates + + // Reads from transaction log only if the writebatch record has been written + bool RestrictedRead(Slice* record, std::string* scratch); + // Seeks to startingSequenceNumber reading from startFileIndex in files_. + // If strict is set,then must get a batch starting with startingSequenceNumber + void SeekToStartSequence(uint64_t startFileIndex = 0, bool strict = false); + // Implementation of Next. SeekToStartSequence calls it internally with + // internal=true to let it find next entry even if it has to jump gaps because + // the iterator may start off from the first available entry but promises to + // be continuous after that + void NextImpl(bool internal = false); + // Check if batch is expected, else return false + bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expectedSeq); + // Update current batch if a continuous batch is found, else return false + void UpdateCurrentWriteBatch(const Slice& record); + Status OpenLogReader(const LogFile* file); +}; +} // namespace rocksdb diff --git a/db/version_edit.cc b/db/version_edit.cc new file mode 100644 index 00000000..42c07e7b --- /dev/null +++ b/db/version_edit.cc @@ -0,0 +1,301 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_edit.h" + +#include "db/version_set.h" +#include "util/coding.h" + +namespace rocksdb { + +// Tag numbers for serialized VersionEdit. These numbers are written to +// disk and should not be changed. +enum Tag { + kComparator = 1, + kLogNumber = 2, + kNextFileNumber = 3, + kLastSequence = 4, + kCompactPointer = 5, + kDeletedFile = 6, + kNewFile = 7, + // 8 was used for large value refs + kPrevLogNumber = 9, + + // these are new formats divergent from open source leveldb + kNewFile2 = 100 // store smallest & largest seqno +}; + +void VersionEdit::Clear() { + comparator_.clear(); + max_level_ = 0; + log_number_ = 0; + prev_log_number_ = 0; + last_sequence_ = 0; + next_file_number_ = 0; + has_comparator_ = false; + has_log_number_ = false; + has_prev_log_number_ = false; + has_next_file_number_ = false; + has_last_sequence_ = false; + deleted_files_.clear(); + new_files_.clear(); +} + +void VersionEdit::EncodeTo(std::string* dst) const { + if (has_comparator_) { + PutVarint32(dst, kComparator); + PutLengthPrefixedSlice(dst, comparator_); + } + if (has_log_number_) { + PutVarint32(dst, kLogNumber); + PutVarint64(dst, log_number_); + } + if (has_prev_log_number_) { + PutVarint32(dst, kPrevLogNumber); + PutVarint64(dst, prev_log_number_); + } + if (has_next_file_number_) { + PutVarint32(dst, kNextFileNumber); + PutVarint64(dst, next_file_number_); + } + if (has_last_sequence_) { + PutVarint32(dst, kLastSequence); + PutVarint64(dst, last_sequence_); + } + + for (size_t i = 0; i < compact_pointers_.size(); i++) { + PutVarint32(dst, kCompactPointer); + PutVarint32(dst, compact_pointers_[i].first); // level + PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode()); + } + + for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); + iter != deleted_files_.end(); 
+ ++iter) { + PutVarint32(dst, kDeletedFile); + PutVarint32(dst, iter->first); // level + PutVarint64(dst, iter->second); // file number + } + + for (size_t i = 0; i < new_files_.size(); i++) { + const FileMetaData& f = new_files_[i].second; + PutVarint32(dst, kNewFile2); + PutVarint32(dst, new_files_[i].first); // level + PutVarint64(dst, f.number); + PutVarint64(dst, f.file_size); + PutLengthPrefixedSlice(dst, f.smallest.Encode()); + PutLengthPrefixedSlice(dst, f.largest.Encode()); + PutVarint64(dst, f.smallest_seqno); + PutVarint64(dst, f.largest_seqno); + } +} + +static bool GetInternalKey(Slice* input, InternalKey* dst) { + Slice str; + if (GetLengthPrefixedSlice(input, &str)) { + dst->DecodeFrom(str); + return true; + } else { + return false; + } +} + +bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) { + uint32_t v; + if (GetVarint32(input, &v)) { + *level = v; + if (max_level_ < *level) { + max_level_ = *level; + } + return true; + } else { + return false; + } +} + +Status VersionEdit::DecodeFrom(const Slice& src) { + Clear(); + Slice input = src; + const char* msg = nullptr; + uint32_t tag; + + // Temporary storage for parsing + int level; + uint64_t number; + FileMetaData f; + Slice str; + InternalKey key; + + while (msg == nullptr && GetVarint32(&input, &tag)) { + switch (tag) { + case kComparator: + if (GetLengthPrefixedSlice(&input, &str)) { + comparator_ = str.ToString(); + has_comparator_ = true; + } else { + msg = "comparator name"; + } + break; + + case kLogNumber: + if (GetVarint64(&input, &log_number_)) { + has_log_number_ = true; + } else { + msg = "log number"; + } + break; + + case kPrevLogNumber: + if (GetVarint64(&input, &prev_log_number_)) { + has_prev_log_number_ = true; + } else { + msg = "previous log number"; + } + break; + + case kNextFileNumber: + if (GetVarint64(&input, &next_file_number_)) { + has_next_file_number_ = true; + } else { + msg = "next file number"; + } + break; + + case kLastSequence: + if 
(GetVarint64(&input, &last_sequence_)) { + has_last_sequence_ = true; + } else { + msg = "last sequence number"; + } + break; + + case kCompactPointer: + if (GetLevel(&input, &level, &msg) && + GetInternalKey(&input, &key)) { + compact_pointers_.push_back(std::make_pair(level, key)); + } else { + if (!msg) { + msg = "compaction pointer"; + } + } + break; + + case kDeletedFile: + if (GetLevel(&input, &level, &msg) && + GetVarint64(&input, &number)) { + deleted_files_.insert(std::make_pair(level, number)); + } else { + if (!msg) { + msg = "deleted file"; + } + } + break; + + case kNewFile: + if (GetLevel(&input, &level, &msg) && + GetVarint64(&input, &f.number) && + GetVarint64(&input, &f.file_size) && + GetInternalKey(&input, &f.smallest) && + GetInternalKey(&input, &f.largest)) { + new_files_.push_back(std::make_pair(level, f)); + } else { + if (!msg) { + msg = "new-file entry"; + } + } + break; + + case kNewFile2: + if (GetLevel(&input, &level, &msg) && + GetVarint64(&input, &f.number) && + GetVarint64(&input, &f.file_size) && + GetInternalKey(&input, &f.smallest) && + GetInternalKey(&input, &f.largest) && + GetVarint64(&input, &f.smallest_seqno) && + GetVarint64(&input, &f.largest_seqno) ) { + new_files_.push_back(std::make_pair(level, f)); + } else { + if (!msg) { + msg = "new-file2 entry"; + } + } + break; + + default: + msg = "unknown tag"; + break; + } + } + + if (msg == nullptr && !input.empty()) { + msg = "invalid tag"; + } + + Status result; + if (msg != nullptr) { + result = Status::Corruption("VersionEdit", msg); + } + return result; +} + +std::string VersionEdit::DebugString(bool hex_key) const { + std::string r; + r.append("VersionEdit {"); + if (has_comparator_) { + r.append("\n Comparator: "); + r.append(comparator_); + } + if (has_log_number_) { + r.append("\n LogNumber: "); + AppendNumberTo(&r, log_number_); + } + if (has_prev_log_number_) { + r.append("\n PrevLogNumber: "); + AppendNumberTo(&r, prev_log_number_); + } + if (has_next_file_number_) { 
+ r.append("\n NextFile: "); + AppendNumberTo(&r, next_file_number_); + } + if (has_last_sequence_) { + r.append("\n LastSeq: "); + AppendNumberTo(&r, last_sequence_); + } + for (size_t i = 0; i < compact_pointers_.size(); i++) { + r.append("\n CompactPointer: "); + AppendNumberTo(&r, compact_pointers_[i].first); + r.append(" "); + r.append(compact_pointers_[i].second.DebugString(hex_key)); + } + for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); + iter != deleted_files_.end(); + ++iter) { + r.append("\n DeleteFile: "); + AppendNumberTo(&r, iter->first); + r.append(" "); + AppendNumberTo(&r, iter->second); + } + for (size_t i = 0; i < new_files_.size(); i++) { + const FileMetaData& f = new_files_[i].second; + r.append("\n AddFile: "); + AppendNumberTo(&r, new_files_[i].first); + r.append(" "); + AppendNumberTo(&r, f.number); + r.append(" "); + AppendNumberTo(&r, f.file_size); + r.append(" "); + r.append(f.smallest.DebugString(hex_key)); + r.append(" .. "); + r.append(f.largest.DebugString(hex_key)); + } + r.append("\n}\n"); + return r; +} + +} // namespace rocksdb diff --git a/db/version_edit.h b/db/version_edit.h new file mode 100644 index 00000000..a0546c98 --- /dev/null +++ b/db/version_edit.h @@ -0,0 +1,125 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include +#include +#include +#include "db/dbformat.h" + +namespace rocksdb { + +class VersionSet; + +struct FileMetaData { + int refs; + int allowed_seeks; // Seeks allowed until compaction + uint64_t number; + uint64_t file_size; // File size in bytes + InternalKey smallest; // Smallest internal key served by table + InternalKey largest; // Largest internal key served by table + bool being_compacted; // Is this file undergoing compaction? + SequenceNumber smallest_seqno;// The smallest seqno in this file + SequenceNumber largest_seqno; // The largest seqno in this file + + FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0), + being_compacted(false) { } +}; + +class VersionEdit { + public: + VersionEdit() { Clear(); } + ~VersionEdit() { } + + void Clear(); + + void SetComparatorName(const Slice& name) { + has_comparator_ = true; + comparator_ = name.ToString(); + } + void SetLogNumber(uint64_t num) { + has_log_number_ = true; + log_number_ = num; + } + void SetPrevLogNumber(uint64_t num) { + has_prev_log_number_ = true; + prev_log_number_ = num; + } + void SetNextFile(uint64_t num) { + has_next_file_number_ = true; + next_file_number_ = num; + } + void SetLastSequence(SequenceNumber seq) { + has_last_sequence_ = true; + last_sequence_ = seq; + } + void SetCompactPointer(int level, const InternalKey& key) { + compact_pointers_.push_back(std::make_pair(level, key)); + } + + // Add the specified file at the specified number. 
+ // REQUIRES: This version has not been saved (see VersionSet::SaveTo) + // REQUIRES: "smallest" and "largest" are smallest and largest keys in file + void AddFile(int level, uint64_t file, + uint64_t file_size, + const InternalKey& smallest, + const InternalKey& largest, + const SequenceNumber& smallest_seqno, + const SequenceNumber& largest_seqno) { + FileMetaData f; + f.number = file; + f.file_size = file_size; + f.smallest = smallest; + f.largest = largest; + f.smallest_seqno = smallest_seqno; + f.largest_seqno = largest_seqno; + assert(smallest_seqno <= largest_seqno); + new_files_.push_back(std::make_pair(level, f)); + } + + // Delete the specified "file" from the specified "level". + void DeleteFile(int level, uint64_t file) { + deleted_files_.insert(std::make_pair(level, file)); + } + + // Number of edits + int NumEntries() { + return new_files_.size() + deleted_files_.size(); + } + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(const Slice& src); + + std::string DebugString(bool hex_key = false) const; + + private: + friend class VersionSet; + + typedef std::set< std::pair > DeletedFileSet; + + bool GetLevel(Slice* input, int* level, const char** msg); + + int max_level_; + std::string comparator_; + uint64_t log_number_; + uint64_t prev_log_number_; + uint64_t next_file_number_; + SequenceNumber last_sequence_; + bool has_comparator_; + bool has_log_number_; + bool has_prev_log_number_; + bool has_next_file_number_; + bool has_last_sequence_; + + std::vector > compact_pointers_; + DeletedFileSet deleted_files_; + std::vector > new_files_; +}; + +} // namespace rocksdb diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc new file mode 100644 index 00000000..63aa32e8 --- /dev/null +++ b/db/version_edit_test.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_edit.h" +#include "util/testharness.h" + +namespace rocksdb { + +static void TestEncodeDecode(const VersionEdit& edit) { + std::string encoded, encoded2; + edit.EncodeTo(&encoded); + VersionEdit parsed; + Status s = parsed.DecodeFrom(encoded); + ASSERT_TRUE(s.ok()) << s.ToString(); + parsed.EncodeTo(&encoded2); + ASSERT_EQ(encoded, encoded2); +} + +class VersionEditTest { }; + +TEST(VersionEditTest, EncodeDecode) { + static const uint64_t kBig = 1ull << 50; + + VersionEdit edit; + for (int i = 0; i < 4; i++) { + TestEncodeDecode(edit); + edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, + InternalKey("foo", kBig + 500 + i, kTypeValue), + InternalKey("zoo", kBig + 600 + i, kTypeDeletion), + kBig + 500 + i, + kBig + 600 + i); + edit.DeleteFile(4, kBig + 700 + i); + edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); + } + + edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 1000); + TestEncodeDecode(edit); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/version_set.cc b/db/version_set.cc new file mode 100644 index 00000000..eb20650b --- /dev/null +++ b/db/version_set.cc @@ -0,0 +1,3148 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_set.h" + +#include +#include +#include +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/merge_context.h" +#include "db/table_cache.h" +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/table.h" +#include "table/merger.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/logging.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +static uint64_t TotalFileSize(const std::vector& files) { + uint64_t sum = 0; + for (size_t i = 0; i < files.size() && files[i]; i++) { + sum += files[i]->file_size; + } + return sum; +} + +Version::~Version() { + assert(refs_ == 0); + + // Remove from linked list + prev_->next_ = next_; + next_->prev_ = prev_; + + // Drop references to files + for (int level = 0; level < num_levels_; level++) { + for (size_t i = 0; i < files_[level].size(); i++) { + FileMetaData* f = files_[level][i]; + assert(f->refs > 0); + f->refs--; + if (f->refs <= 0) { + vset_->obsolete_files_.push_back(f); + } + } + } + delete[] files_; +} + +int FindFile(const InternalKeyComparator& icmp, + const std::vector& files, + const Slice& key) { + uint32_t left = 0; + uint32_t right = files.size(); + while (left < right) { + uint32_t mid = (left + right) / 2; + const FileMetaData* f = files[mid]; + if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) { + // Key at "mid.largest" is < "target". Therefore all + // files at or before "mid" are uninteresting. + left = mid + 1; + } else { + // Key at "mid.largest" is >= "target". Therefore all files + // after "mid" are uninteresting. 
+ right = mid; + } + } + return right; +} + +static bool AfterFile(const Comparator* ucmp, + const Slice* user_key, const FileMetaData* f) { + // nullptr user_key occurs before all keys and is therefore never after *f + return (user_key != nullptr && + ucmp->Compare(*user_key, f->largest.user_key()) > 0); +} + +static bool BeforeFile(const Comparator* ucmp, + const Slice* user_key, const FileMetaData* f) { + // nullptr user_key occurs after all keys and is therefore never before *f + return (user_key != nullptr && + ucmp->Compare(*user_key, f->smallest.user_key()) < 0); +} + +bool SomeFileOverlapsRange( + const InternalKeyComparator& icmp, + bool disjoint_sorted_files, + const std::vector& files, + const Slice* smallest_user_key, + const Slice* largest_user_key) { + const Comparator* ucmp = icmp.user_comparator(); + if (!disjoint_sorted_files) { + // Need to check against all files + for (size_t i = 0; i < files.size(); i++) { + const FileMetaData* f = files[i]; + if (AfterFile(ucmp, smallest_user_key, f) || + BeforeFile(ucmp, largest_user_key, f)) { + // No overlap + } else { + return true; // Overlap + } + } + return false; + } + + // Binary search over file list + uint32_t index = 0; + if (smallest_user_key != nullptr) { + // Find the earliest possible internal key for smallest_user_key + InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek); + index = FindFile(icmp, files, small.Encode()); + } + + if (index >= files.size()) { + // beginning of range is after all files, so no overlap. + return false; + } + + return !BeforeFile(ucmp, largest_user_key, files[index]); +} + +// An internal iterator. For a given version/level pair, yields +// information about the files in the level. For a given entry, key() +// is the largest key that occurs in the file, and value() is an +// 16-byte value containing the file number and file size, both +// encoded using EncodeFixed64. 
+class Version::LevelFileNumIterator : public Iterator { + public: + LevelFileNumIterator(const InternalKeyComparator& icmp, + const std::vector* flist) + : icmp_(icmp), + flist_(flist), + index_(flist->size()) { // Marks as invalid + } + virtual bool Valid() const { + return index_ < flist_->size(); + } + virtual void Seek(const Slice& target) { + index_ = FindFile(icmp_, *flist_, target); + } + virtual void SeekToFirst() { index_ = 0; } + virtual void SeekToLast() { + index_ = flist_->empty() ? 0 : flist_->size() - 1; + } + virtual void Next() { + assert(Valid()); + index_++; + } + virtual void Prev() { + assert(Valid()); + if (index_ == 0) { + index_ = flist_->size(); // Marks as invalid + } else { + index_--; + } + } + Slice key() const { + assert(Valid()); + return (*flist_)[index_]->largest.Encode(); + } + Slice value() const { + assert(Valid()); + EncodeFixed64(value_buf_, (*flist_)[index_]->number); + EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size); + return Slice(value_buf_, sizeof(value_buf_)); + } + virtual Status status() const { return Status::OK(); } + private: + const InternalKeyComparator icmp_; + const std::vector* const flist_; + uint32_t index_; + + // Backing store for value(). Holds the file number and size. + mutable char value_buf_[16]; +}; + +static Iterator* GetFileIterator(void* arg, + const ReadOptions& options, + const EnvOptions& soptions, + const Slice& file_value, + bool for_compaction) { + TableCache* cache = reinterpret_cast(arg); + if (file_value.size() != 16) { + return NewErrorIterator( + Status::Corruption("FileReader invoked with unexpected value")); + } else { + ReadOptions options_copy; + if (options.prefix) { + // suppress prefix filtering since we have already checked the + // filters once at this point + options_copy = options; + options_copy.prefix = nullptr; + } + return cache->NewIterator(options.prefix ? 
options_copy : options, + soptions, + DecodeFixed64(file_value.data()), + DecodeFixed64(file_value.data() + 8), + nullptr /* don't need reference to table*/, + for_compaction); + } +} + +bool Version::PrefixMayMatch(const ReadOptions& options, + const EnvOptions& soptions, + const Slice& internal_prefix, + Iterator* level_iter) const { + bool may_match = true; + level_iter->Seek(internal_prefix); + if (!level_iter->Valid()) { + // we're past end of level + may_match = false; + } else if (ExtractUserKey(level_iter->key()).starts_with( + ExtractUserKey(internal_prefix))) { + // TODO(tylerharter): do we need this case? Or are we guaranteed + // key() will always be the biggest value for this SST? + may_match = true; + } else { + may_match = vset_->table_cache_->PrefixMayMatch( + options, + DecodeFixed64(level_iter->value().data()), + DecodeFixed64(level_iter->value().data() + 8), + internal_prefix, nullptr); + } + return may_match; +} + +Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, + const EnvOptions& soptions, + int level) const { + Iterator* level_iter = new LevelFileNumIterator(vset_->icmp_, &files_[level]); + if (options.prefix) { + InternalKey internal_prefix(*options.prefix, 0, kTypeValue); + if (!PrefixMayMatch(options, soptions, + internal_prefix.Encode(), level_iter)) { + delete level_iter; + // nothing in this level can match the prefix + return NewEmptyIterator(); + } + } + return NewTwoLevelIterator(level_iter, &GetFileIterator, + vset_->table_cache_, options, soptions); +} + +void Version::AddIterators(const ReadOptions& options, + const EnvOptions& soptions, + std::vector* iters) { + // Merge all level zero files together since they may overlap + for (const FileMetaData* file : files_[0]) { + iters->push_back( + vset_->table_cache_->NewIterator( + options, soptions, file->number, file->file_size)); + } + + // For levels > 0, we can use a concatenating iterator that sequentially + // walks through the non-overlapping files in 
the level, opening them + // lazily. + for (int level = 1; level < num_levels_; level++) { + if (!files_[level].empty()) { + iters->push_back(NewConcatenatingIterator(options, soptions, level)); + } + } +} + +// Callback from TableCache::Get() +namespace { +enum SaverState { + kNotFound, + kFound, + kDeleted, + kCorrupt, + kMerge // saver contains the current merge result (the operands) +}; +struct Saver { + SaverState state; + const Comparator* ucmp; + Slice user_key; + bool* value_found; // Is value set correctly? Used by KeyMayExist + std::string* value; + const MergeOperator* merge_operator; + // the merge operations encountered; + MergeContext* merge_context; + Logger* logger; + bool didIO; // did we do any disk io? + Statistics* statistics; +}; +} + +// Called from TableCache::Get and Table::Get when file/block in which +// key may exist are not there in TableCache/BlockCache respectively. In this +// case we can't guarantee that key does not exist and are not permitted to do +// IO to be certain.Set the status=kFound and value_found=false to let the +// caller know that key may exist but is not there in memory +static void MarkKeyMayExist(void* arg) { + Saver* s = reinterpret_cast(arg); + s->state = kFound; + if (s->value_found != nullptr) { + *(s->value_found) = false; + } +} + +static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){ + Saver* s = reinterpret_cast(arg); + MergeContext* merge_contex = s->merge_context; + std::string merge_result; // temporary area for merge results later + + assert(s != nullptr && merge_contex != nullptr); + + ParsedInternalKey parsed_key; + // TODO: didIO and Merge? + s->didIO = didIO; + if (!ParseInternalKey(ikey, &parsed_key)) { + // TODO: what about corrupt during Merge? + s->state = kCorrupt; + } else { + if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) { + // Key matches. 
Process it + switch (parsed_key.type) { + case kTypeValue: + if (kNotFound == s->state) { + s->state = kFound; + s->value->assign(v.data(), v.size()); + } else if (kMerge == s->state) { + assert(s->merge_operator != nullptr); + s->state = kFound; + if (!s->merge_operator->FullMerge(s->user_key, &v, + merge_contex->GetOperands(), + s->value, s->logger)) { + RecordTick(s->statistics, NUMBER_MERGE_FAILURES); + s->state = kCorrupt; + } + } else { + assert(false); + } + return false; + + case kTypeDeletion: + if (kNotFound == s->state) { + s->state = kDeleted; + } else if (kMerge == s->state) { + s->state = kFound; + if (!s->merge_operator->FullMerge(s->user_key, nullptr, + merge_contex->GetOperands(), + s->value, s->logger)) { + RecordTick(s->statistics, NUMBER_MERGE_FAILURES); + s->state = kCorrupt; + } + } else { + assert(false); + } + return false; + + case kTypeMerge: + assert(s->state == kNotFound || s->state == kMerge); + s->state = kMerge; + merge_contex->PushOperand(v); + while (merge_contex->GetNumOperands() >= 2) { + // Attempt to merge operands together via user associateive merge + if (s->merge_operator->PartialMerge(s->user_key, + merge_contex->GetOperand(0), + merge_contex->GetOperand(1), + &merge_result, + s->logger)) { + merge_contex->PushPartialMergeResult(merge_result); + } else { + // Associative merge returns false ==> stack the operands + break; + } + } + return true; + + case kTypeLogData: + assert(false); + break; + } + } + } + + // s->state could be Corrupt, merge or notfound + + return false; +} + +static bool NewestFirst(FileMetaData* a, FileMetaData* b) { + return a->number > b->number; +} +static bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { + if (a->smallest_seqno > b->smallest_seqno) { + assert(a->largest_seqno > b->largest_seqno); + return true; + } + assert(a->largest_seqno <= b->largest_seqno); + return false; +} + +Version::Version(VersionSet* vset, uint64_t version_number) + : vset_(vset), + next_(this), + prev_(this), + 
refs_(0), + num_levels_(vset->num_levels_), + files_(new std::vector[num_levels_]), + files_by_size_(num_levels_), + next_file_to_compact_by_size_(num_levels_), + file_to_compact_(nullptr), + file_to_compact_level_(-1), + compaction_score_(num_levels_), + compaction_level_(num_levels_), + version_number_(version_number) {} + +void Version::Get(const ReadOptions& options, + const LookupKey& k, + std::string* value, + Status* status, + MergeContext* merge_context, + GetStats* stats, + const Options& db_options, + bool* value_found) { + Slice ikey = k.internal_key(); + Slice user_key = k.user_key(); + const Comparator* ucmp = vset_->icmp_.user_comparator(); + + auto merge_operator = db_options.merge_operator.get(); + auto logger = db_options.info_log; + + assert(status->ok() || status->IsMergeInProgress()); + Saver saver; + saver.state = status->ok()? kNotFound : kMerge; + saver.ucmp = ucmp; + saver.user_key = user_key; + saver.value_found = value_found; + saver.value = value; + saver.merge_operator = merge_operator; + saver.merge_context = merge_context; + saver.logger = logger.get(); + saver.didIO = false; + saver.statistics = db_options.statistics.get(); + + stats->seek_file = nullptr; + stats->seek_file_level = -1; + FileMetaData* last_file_read = nullptr; + int last_file_read_level = -1; + + // We can search level-by-level since entries never hop across + // levels. Therefore we are guaranteed that if we find data + // in an smaller level, later levels are irrelevant (unless we + // are MergeInProgress). + for (int level = 0; level < num_levels_; level++) { + size_t num_files = files_[level].size(); + if (num_files == 0) continue; + + // Get the list of files to search in this level + FileMetaData* const* files = &files_[level][0]; + + // Some files may overlap each other. We find + // all files that overlap user_key and process them in order from + // newest to oldest. In the context of merge-operator, + // this can occur at any level. 
Otherwise, it only occurs + // at Level-0 (since Put/Deletes are always compacted into a single entry). + uint32_t start_index; + if (level == 0) { + // On Level-0, we read through all files to check for overlap. + start_index = 0; + } else { + // On Level-n (n>=1), files are sorted. + // Binary search to find earliest index whose largest key >= ikey. + // We will also stop when the file no longer overlaps ikey + start_index = FindFile(vset_->icmp_, files_[level], ikey); + } + + // Traverse each relevant file to find the desired key +#ifndef NDEBUG + FileMetaData* prev_file = nullptr; +#endif + for (uint32_t i = start_index; i < num_files; ++i) { + FileMetaData* f = files[i]; + if (ucmp->Compare(user_key, f->smallest.user_key()) < 0 || + ucmp->Compare(user_key, f->largest.user_key()) > 0) { + // Only process overlapping files. + if (level > 0) { + // If on Level-n (n>=1) then the files are sorted. + // So we can stop looking when we are past the ikey. + break; + } + // TODO: do we want to check file ranges for level0 files at all? + // For new SST format where Get() is fast, we might want to consider + // to avoid those two comparisons, if it can filter out too few files. + continue; + } +#ifndef NDEBUG + // Sanity check to make sure that the files are correctly sorted + if (prev_file) { + if (level != 0) { + int comp_sign = vset_->icmp_.Compare(prev_file->largest, f->smallest); + assert(comp_sign < 0); + } else { + // level == 0, the current file cannot be newer than the previous one. 
+ if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + assert(!NewestFirstBySeqNo(f, prev_file)); + } else { + assert(!NewestFirst(f, prev_file)); + } + } + } + prev_file = f; +#endif + bool tableIO = false; + *status = vset_->table_cache_->Get(options, f->number, f->file_size, + ikey, &saver, SaveValue, &tableIO, + MarkKeyMayExist); + // TODO: examine the behavior for corrupted key + if (!status->ok()) { + return; + } + + if (last_file_read != nullptr && stats->seek_file == nullptr) { + // We have had more than one seek for this read. Charge the 1st file. + stats->seek_file = last_file_read; + stats->seek_file_level = last_file_read_level; + } + + // If we did any IO as part of the read, then we remember it because + // it is a possible candidate for seek-based compaction. saver.didIO + // is true if the block had to be read in from storage and was not + // pre-exisiting in the block cache. Also, if this file was not pre- + // existing in the table cache and had to be freshly opened that needed + // the index blocks to be read-in, then tableIO is true. One thing + // to note is that the index blocks are not part of the block cache. 
+ if (saver.didIO || tableIO) { + last_file_read = f; + last_file_read_level = level; + } + + switch (saver.state) { + case kNotFound: + break; // Keep searching in other files + case kFound: + return; + case kDeleted: + *status = Status::NotFound(); // Use empty error message for speed + return; + case kCorrupt: + *status = Status::Corruption("corrupted key for ", user_key); + return; + case kMerge: + break; + } + } + } + + + if (kMerge == saver.state) { + // merge_operands are in saver and we hit the beginning of the key history + // do a final merge of nullptr and operands; + if (merge_operator->FullMerge(user_key, nullptr, + saver.merge_context->GetOperands(), + value, logger.get())) { + *status = Status::OK(); + } else { + RecordTick(db_options.statistics.get(), NUMBER_MERGE_FAILURES); + *status = Status::Corruption("could not perform end-of-key merge for ", + user_key); + } + } else { + *status = Status::NotFound(); // Use an empty error message for speed + } +} + +bool Version::UpdateStats(const GetStats& stats) { + FileMetaData* f = stats.seek_file; + if (f != nullptr) { + f->allowed_seeks--; + if (f->allowed_seeks <= 0 && file_to_compact_ == nullptr) { + file_to_compact_ = f; + file_to_compact_level_ = stats.seek_file_level; + return true; + } + } + return false; +} + +void Version::Ref() { + ++refs_; +} + +void Version::Unref() { + assert(this != &vset_->dummy_versions_); + assert(refs_ >= 1); + --refs_; + if (refs_ == 0) { + delete this; + } +} + +bool Version::OverlapInLevel(int level, + const Slice* smallest_user_key, + const Slice* largest_user_key) { + return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level], + smallest_user_key, largest_user_key); +} + +int Version::PickLevelForMemTableOutput( + const Slice& smallest_user_key, + const Slice& largest_user_key) { + int level = 0; + if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) { + // Push to next level if there is no overlap in next level, + // and the #bytes overlapping 
in the level after that are limited. + InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey limit(largest_user_key, 0, static_cast(0)); + std::vector overlaps; + int max_mem_compact_level = vset_->options_->max_mem_compaction_level; + while (max_mem_compact_level > 0 && level < max_mem_compact_level) { + if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { + break; + } + if (level + 2 >= num_levels_) { + level++; + break; + } + GetOverlappingInputs(level + 2, &start, &limit, &overlaps); + const uint64_t sum = TotalFileSize(overlaps); + if (sum > vset_->MaxGrandParentOverlapBytes(level)) { + break; + } + level++; + } + } + + return level; +} + +// Store in "*inputs" all files in "level" that overlap [begin,end] +// If hint_index is specified, then it points to a file in the +// overlapping range. +// The file_index returns a pointer to any file in an overlapping range. +void Version::GetOverlappingInputs( + int level, + const InternalKey* begin, + const InternalKey* end, + std::vector* inputs, + int hint_index, + int* file_index) { + inputs->clear(); + Slice user_begin, user_end; + if (begin != nullptr) { + user_begin = begin->user_key(); + } + if (end != nullptr) { + user_end = end->user_key(); + } + if (file_index) { + *file_index = -1; + } + const Comparator* user_cmp = vset_->icmp_.user_comparator(); + if (begin != nullptr && end != nullptr && level > 0) { + GetOverlappingInputsBinarySearch(level, user_begin, user_end, inputs, + hint_index, file_index); + return; + } + for (size_t i = 0; i < files_[level].size(); ) { + FileMetaData* f = files_[level][i++]; + const Slice file_start = f->smallest.user_key(); + const Slice file_limit = f->largest.user_key(); + if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) { + // "f" is completely before specified range; skip it + } else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) { + // "f" is completely after specified range; 
skip it + } else { + inputs->push_back(f); + if (level == 0) { + // Level-0 files may overlap each other. So check if the newly + // added file has expanded the range. If so, restart search. + if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) { + user_begin = file_start; + inputs->clear(); + i = 0; + } else if (end != nullptr + && user_cmp->Compare(file_limit, user_end) > 0) { + user_end = file_limit; + inputs->clear(); + i = 0; + } + } else if (file_index) { + *file_index = i-1; + } + } + } +} + +// Store in "*inputs" all files in "level" that overlap [begin,end] +// Employ binary search to find at least one file that overlaps the +// specified range. From that file, iterate backwards and +// forwards to find all overlapping files. +void Version::GetOverlappingInputsBinarySearch( + int level, + const Slice& user_begin, + const Slice& user_end, + std::vector* inputs, + int hint_index, + int* file_index) { + assert(level > 0); + int min = 0; + int mid = 0; + int max = files_[level].size() -1; + bool foundOverlap = false; + const Comparator* user_cmp = vset_->icmp_.user_comparator(); + + // if the caller already knows the index of a file that has overlap, + // then we can skip the binary search. + if (hint_index != -1) { + mid = hint_index; + foundOverlap = true; + } + + while (!foundOverlap && min <= max) { + mid = (min + max)/2; + FileMetaData* f = files_[level][mid]; + const Slice file_start = f->smallest.user_key(); + const Slice file_limit = f->largest.user_key(); + if (user_cmp->Compare(file_limit, user_begin) < 0) { + min = mid + 1; + } else if (user_cmp->Compare(user_end, file_start) < 0) { + max = mid - 1; + } else { + foundOverlap = true; + break; + } + } + + // If there were no overlapping files, return immediately. 
+ if (!foundOverlap) { + return; + } + // returns the index where an overlap is found + if (file_index) { + *file_index = mid; + } + ExtendOverlappingInputs(level, user_begin, user_end, inputs, mid); +} + +// Store in "*inputs" all files in "level" that overlap [begin,end] +// The midIndex specifies the index of at least one file that +// overlaps the specified range. From that file, iterate backward +// and forward to find all overlapping files. +void Version::ExtendOverlappingInputs( + int level, + const Slice& user_begin, + const Slice& user_end, + std::vector* inputs, + unsigned int midIndex) { + + const Comparator* user_cmp = vset_->icmp_.user_comparator(); +#ifndef NDEBUG + { + // assert that the file at midIndex overlaps with the range + assert(midIndex < files_[level].size()); + FileMetaData* f = files_[level][midIndex]; + const Slice fstart = f->smallest.user_key(); + const Slice flimit = f->largest.user_key(); + if (user_cmp->Compare(fstart, user_begin) >= 0) { + assert(user_cmp->Compare(fstart, user_end) <= 0); + } else { + assert(user_cmp->Compare(flimit, user_begin) >= 0); + } + } +#endif + int startIndex = midIndex + 1; + int endIndex = midIndex; + int count __attribute__((unused)) = 0; + + // check backwards from 'mid' to lower indices + for (int i = midIndex; i >= 0 ; i--) { + FileMetaData* f = files_[level][i]; + const Slice file_limit = f->largest.user_key(); + if (user_cmp->Compare(file_limit, user_begin) >= 0) { + startIndex = i; + assert((count++, true)); + } else { + break; + } + } + // check forward from 'mid+1' to higher indices + for (unsigned int i = midIndex+1; i < files_[level].size(); i++) { + FileMetaData* f = files_[level][i]; + const Slice file_start = f->smallest.user_key(); + if (user_cmp->Compare(file_start, user_end) <= 0) { + assert((count++, true)); + endIndex = i; + } else { + break; + } + } + assert(count == endIndex - startIndex + 1); + + // insert overlapping files into vector + for (int i = startIndex; i <= endIndex; i++) 
{ + FileMetaData* f = files_[level][i]; + inputs->push_back(f); + } +} + +// Returns true iff the first or last file in inputs contains +// an overlapping user key to the file "just outside" of it (i.e. +// just after the last file, or just before the first file) +// REQUIRES: "*inputs" is a sorted list of non-overlapping files +bool Version::HasOverlappingUserKey( + const std::vector* inputs, + int level) { + + // If inputs empty, there is no overlap. + // If level == 0, it is assumed that all needed files were already included. + if (inputs->empty() || level == 0){ + return false; + } + + const Comparator* user_cmp = vset_->icmp_.user_comparator(); + const std::vector& files = files_[level]; + const size_t kNumFiles = files.size(); + + // Check the last file in inputs against the file after it + size_t last_file = FindFile(vset_->icmp_, files, + inputs->back()->largest.Encode()); + assert(0 <= last_file && last_file < kNumFiles); // File should exist! + if (last_file < kNumFiles-1) { // If not the last file + const Slice last_key_in_input = files[last_file]->largest.user_key(); + const Slice first_key_after = files[last_file+1]->smallest.user_key(); + if (user_cmp->Compare(last_key_in_input, first_key_after) == 0) { + // The last user key in input overlaps with the next file's first key + return true; + } + } + + // Check the first file in inputs against the file just before it + size_t first_file = FindFile(vset_->icmp_, files, + inputs->front()->smallest.Encode()); + assert(0 <= first_file && first_file <= last_file); // File should exist! 
+ if (first_file > 0) { // If not first file + const Slice& first_key_in_input = files[first_file]->smallest.user_key(); + const Slice& last_key_before = files[first_file-1]->largest.user_key(); + if (user_cmp->Compare(first_key_in_input, last_key_before) == 0) { + // The first user key in input overlaps with the previous file's last key + return true; + } + } + + return false; +} + +int64_t Version::NumLevelBytes(int level) const { + assert(level >= 0); + assert(level < NumberLevels()); + return TotalFileSize(files_[level]); +} + +const char* Version::LevelSummary(LevelSummaryStorage* scratch) const { + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); + for (int i = 0; i < NumberLevels(); i++) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size())); + if (ret < 0 || ret >= sz) break; + len += ret; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +const char* Version::LevelFileSummary(FileSummaryStorage* scratch, + int level) const { + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); + for (const auto& f : files_[level]) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, + "#%lu(seq=%lu,sz=%lu,%lu) ", + (unsigned long)f->number, + (unsigned long)f->smallest_seqno, + (unsigned long)f->file_size, + (unsigned long)f->being_compacted); + if (ret < 0 || ret >= sz) + break; + len += ret; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +int64_t Version::MaxNextLevelOverlappingBytes() { + uint64_t result = 0; + std::vector overlaps; + for (int level = 1; level < NumberLevels() - 1; level++) { + for (const auto& f : files_[level]) { + GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps); + const uint64_t sum = TotalFileSize(overlaps); + if (sum > result) { + result = sum; + } + } + } + 
return result; +} + +void Version::AddLiveFiles(std::set* live) { + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files = files_[level]; + for (const auto& file : files) { + live->insert(file->number); + } + } +} + +std::string Version::DebugString(bool hex) const { + std::string r; + for (int level = 0; level < num_levels_; level++) { + // E.g., + // --- level 1 --- + // 17:123['a' .. 'd'] + // 20:43['e' .. 'g'] + r.append("--- level "); + AppendNumberTo(&r, level); + r.append(" --- version# "); + AppendNumberTo(&r, version_number_); + r.append(" ---\n"); + const std::vector& files = files_[level]; + for (size_t i = 0; i < files.size(); i++) { + r.push_back(' '); + AppendNumberTo(&r, files[i]->number); + r.push_back(':'); + AppendNumberTo(&r, files[i]->file_size); + r.append("["); + r.append(files[i]->smallest.DebugString(hex)); + r.append(" .. "); + r.append(files[i]->largest.DebugString(hex)); + r.append("]\n"); + } + } + return r; +} + +// this is used to batch writes to the manifest file +struct VersionSet::ManifestWriter { + Status status; + bool done; + port::CondVar cv; + VersionEdit* edit; + + explicit ManifestWriter(port::Mutex* mu, VersionEdit* e) : + done(false), cv(mu), edit(e) {} +}; + +// A helper class so we can efficiently apply a whole sequence +// of edits to a particular state without creating intermediate +// Versions that contain full copies of the intermediate state. 
+class VersionSet::Builder { + private: + // Helper to sort by v->files_[file_number].smallest + struct BySmallestKey { + const InternalKeyComparator* internal_comparator; + + bool operator()(FileMetaData* f1, FileMetaData* f2) const { + int r = internal_comparator->Compare(f1->smallest, f2->smallest); + if (r != 0) { + return (r < 0); + } else { + // Break ties by file number + return (f1->number < f2->number); + } + } + }; + + typedef std::set FileSet; + struct LevelState { + std::set deleted_files; + FileSet* added_files; + }; + + VersionSet* vset_; + Version* base_; + LevelState* levels_; + + public: + // Initialize a builder with the files from *base and other info from *vset + Builder(VersionSet* vset, Version* base) : vset_(vset), base_(base) { + base_->Ref(); + levels_ = new LevelState[base->NumberLevels()]; + BySmallestKey cmp; + cmp.internal_comparator = &vset_->icmp_; + for (int level = 0; level < base->NumberLevels(); level++) { + levels_[level].added_files = new FileSet(cmp); + } + } + + ~Builder() { + for (int level = 0; level < base_->NumberLevels(); level++) { + const FileSet* added = levels_[level].added_files; + std::vector to_unref; + to_unref.reserve(added->size()); + for (FileSet::const_iterator it = added->begin(); + it != added->end(); ++it) { + to_unref.push_back(*it); + } + delete added; + for (uint32_t i = 0; i < to_unref.size(); i++) { + FileMetaData* f = to_unref[i]; + f->refs--; + if (f->refs <= 0) { + delete f; + } + } + } + delete[] levels_; + base_->Unref(); + } + + void CheckConsistency(Version* v) { +#ifndef NDEBUG + for (int level = 0; level < v->NumberLevels(); level++) { + // Make sure there is no overlap in levels > 0 + if (level > 0) { + for (uint32_t i = 1; i < v->files_[level].size(); i++) { + const InternalKey& prev_end = v->files_[level][i-1]->largest; + const InternalKey& this_begin = v->files_[level][i]->smallest; + if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) { + fprintf(stderr, "overlapping ranges in same level 
%s vs. %s\n", + prev_end.DebugString().c_str(), + this_begin.DebugString().c_str()); + abort(); + } + } + } + } +#endif + } + + void CheckConsistencyForDeletes(VersionEdit* edit, unsigned int number, + int level) { +#ifndef NDEBUG + // a file to be deleted better exist in the previous version + bool found = false; + for (int l = 0; !found && l < base_->NumberLevels(); l++) { + const std::vector& base_files = base_->files_[l]; + for (unsigned int i = 0; i < base_files.size(); i++) { + FileMetaData* f = base_files[i]; + if (f->number == number) { + found = true; + break; + } + } + } + // if the file did not exist in the previous version, then it + // is possibly moved from lower level to higher level in current + // version + for (int l = level+1; !found && l < base_->NumberLevels(); l++) { + const FileSet* added = levels_[l].added_files; + for (FileSet::const_iterator added_iter = added->begin(); + added_iter != added->end(); ++added_iter) { + FileMetaData* f = *added_iter; + if (f->number == number) { + found = true; + break; + } + } + } + + // maybe this file was added in a previous edit that was Applied + if (!found) { + const FileSet* added = levels_[level].added_files; + for (FileSet::const_iterator added_iter = added->begin(); + added_iter != added->end(); ++added_iter) { + FileMetaData* f = *added_iter; + if (f->number == number) { + found = true; + break; + } + } + } + assert(found); +#endif + } + + // Apply all of the edits in *edit to the current state. 
+ void Apply(VersionEdit* edit) { + CheckConsistency(base_); + + // Update compaction pointers + for (size_t i = 0; i < edit->compact_pointers_.size(); i++) { + const int level = edit->compact_pointers_[i].first; + vset_->compact_pointer_[level] = + edit->compact_pointers_[i].second.Encode().ToString(); + } + + // Delete files + const VersionEdit::DeletedFileSet& del = edit->deleted_files_; + for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin(); + iter != del.end(); + ++iter) { + const int level = iter->first; + const uint64_t number = iter->second; + levels_[level].deleted_files.insert(number); + CheckConsistencyForDeletes(edit, number, level); + } + + // Add new files + for (size_t i = 0; i < edit->new_files_.size(); i++) { + const int level = edit->new_files_[i].first; + FileMetaData* f = new FileMetaData(edit->new_files_[i].second); + f->refs = 1; + + // We arrange to automatically compact this file after + // a certain number of seeks. Let's assume: + // (1) One seek costs 10ms + // (2) Writing or reading 1MB costs 10ms (100MB/s) + // (3) A compaction of 1MB does 25MB of IO: + // 1MB read from this level + // 10-12MB read from next level (boundaries may be misaligned) + // 10-12MB written to next level + // This implies that 25 seeks cost the same as the compaction + // of 1MB of data. I.e., one seek costs approximately the + // same as the compaction of 40KB of data. We are a little + // conservative and allow approximately one seek for every 16KB + // of data before triggering a compaction. + f->allowed_seeks = (f->file_size / 16384); + if (f->allowed_seeks < 100) f->allowed_seeks = 100; + + levels_[level].deleted_files.erase(f->number); + levels_[level].added_files->insert(f); + } + } + + // Save the current state in *v. 
+ void SaveTo(Version* v) { + CheckConsistency(base_); + CheckConsistency(v); + BySmallestKey cmp; + cmp.internal_comparator = &vset_->icmp_; + for (int level = 0; level < base_->NumberLevels(); level++) { + // Merge the set of added files with the set of pre-existing files. + // Drop any deleted files. Store the result in *v. + const std::vector& base_files = base_->files_[level]; + std::vector::const_iterator base_iter = base_files.begin(); + std::vector::const_iterator base_end = base_files.end(); + const FileSet* added = levels_[level].added_files; + v->files_[level].reserve(base_files.size() + added->size()); + for (FileSet::const_iterator added_iter = added->begin(); + added_iter != added->end(); + ++added_iter) { + // Add all smaller files listed in base_ + for (std::vector::const_iterator bpos + = std::upper_bound(base_iter, base_end, *added_iter, cmp); + base_iter != bpos; + ++base_iter) { + MaybeAddFile(v, level, *base_iter); + } + + MaybeAddFile(v, level, *added_iter); + } + + // Add remaining base files + for (; base_iter != base_end; ++base_iter) { + MaybeAddFile(v, level, *base_iter); + } + } + + CheckConsistency(v); + } + + void MaybeAddFile(Version* v, int level, FileMetaData* f) { + if (levels_[level].deleted_files.count(f->number) > 0) { + // File is deleted: do nothing + } else { + std::vector* files = &v->files_[level]; + if (level > 0 && !files->empty()) { + // Must not overlap + assert(vset_->icmp_.Compare((*files)[files->size()-1]->largest, + f->smallest) < 0); + } + f->refs++; + files->push_back(f); + } + } +}; + +VersionSet::VersionSet(const std::string& dbname, const Options* options, + const EnvOptions& storage_options, + TableCache* table_cache, + const InternalKeyComparator* cmp) + : env_(options->env), + dbname_(dbname), + options_(options), + table_cache_(table_cache), + icmp_(*cmp), + next_file_number_(2), + manifest_file_number_(0), // Filled by Recover() + last_sequence_(0), + log_number_(0), + prev_log_number_(0), + 
num_levels_(options_->num_levels), + dummy_versions_(this), + current_(nullptr), + need_slowdown_for_num_level0_files_(false), + compactions_in_progress_(options_->num_levels), + current_version_number_(0), + manifest_file_size_(0), + storage_options_(storage_options), + storage_options_compactions_(storage_options_) { + compact_pointer_ = new std::string[options_->num_levels]; + Init(options_->num_levels); + AppendVersion(new Version(this, current_version_number_++)); +} + +VersionSet::~VersionSet() { + current_->Unref(); + assert(dummy_versions_.next_ == &dummy_versions_); // List must be empty + for (auto file : obsolete_files_) { + delete file; + } + obsolete_files_.clear(); + delete[] compact_pointer_; + delete[] max_file_size_; + delete[] level_max_bytes_; +} + +void VersionSet::Init(int num_levels) { + max_file_size_ = new uint64_t[num_levels]; + level_max_bytes_ = new uint64_t[num_levels]; + int target_file_size_multiplier = options_->target_file_size_multiplier; + int max_bytes_multiplier = options_->max_bytes_for_level_multiplier; + for (int i = 0; i < num_levels; i++) { + if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) { + max_file_size_[i] = ULLONG_MAX; + level_max_bytes_[i] = options_->max_bytes_for_level_base; + } else if (i > 1) { + max_file_size_[i] = max_file_size_[i-1] * target_file_size_multiplier; + level_max_bytes_[i] = level_max_bytes_[i-1] * max_bytes_multiplier * + options_->max_bytes_for_level_multiplier_additional[i-1]; + } else { + max_file_size_[i] = options_->target_file_size_base; + level_max_bytes_[i] = options_->max_bytes_for_level_base; + } + } +} + +void VersionSet::AppendVersion(Version* v) { + // Make "v" current + assert(v->refs_ == 0); + assert(v != current_); + if (current_ != nullptr) { + assert(current_->refs_ > 0); + current_->Unref(); + } + current_ = v; + need_slowdown_for_num_level0_files_ = + (options_->level0_slowdown_writes_trigger >= 0 && current_ != nullptr && + v->NumLevelFiles(0) >= 
options_->level0_slowdown_writes_trigger); + v->Ref(); + + // Append to linked list + v->prev_ = dummy_versions_.prev_; + v->next_ = &dummy_versions_; + v->prev_->next_ = v; + v->next_->prev_ = v; +} + +Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, + bool new_descriptor_log) { + mu->AssertHeld(); + + // queue our request + ManifestWriter w(mu, edit); + manifest_writers_.push_back(&w); + while (!w.done && &w != manifest_writers_.front()) { + w.cv.Wait(); + } + if (w.done) { + return w.status; + } + + std::vector batch_edits; + Version* v = new Version(this, current_version_number_++); + Builder builder(this, current_); + + // process all requests in the queue + ManifestWriter* last_writer = &w; + assert(!manifest_writers_.empty()); + assert(manifest_writers_.front() == &w); + std::deque::iterator iter = manifest_writers_.begin(); + for (; iter != manifest_writers_.end(); ++iter) { + last_writer = *iter; + LogAndApplyHelper(&builder, v, last_writer->edit, mu); + batch_edits.push_back(last_writer->edit); + } + builder.SaveTo(v); + + // Initialize new descriptor log file if necessary by creating + // a temporary file that contains a snapshot of the current version. + std::string new_manifest_file; + uint64_t new_manifest_file_size = 0; + Status s; + // we will need this if we are creating new manifest + uint64_t old_manifest_file_number = manifest_file_number_; + + // No need to perform this check if a new Manifest is being created anyways. + if (!descriptor_log_ || + manifest_file_size_ > options_->max_manifest_file_size) { + new_descriptor_log = true; + manifest_file_number_ = NewFileNumber(); // Change manifest file no. + } + + if (new_descriptor_log) { + new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); + edit->SetNextFile(next_file_number_); + } + + // Unlock during expensive MANIFEST log write. New writes cannot get here + // because &w is ensuring that all new writes get queued. 
+ { + // calculate the amount of data being compacted at every level + std::vector size_being_compacted(v->NumberLevels() - 1); + SizeBeingCompacted(size_being_compacted); + + mu->Unlock(); + + // This is fine because everything inside of this block is serialized -- + // only one thread can be here at the same time + if (!new_manifest_file.empty()) { + unique_ptr descriptor_file; + s = env_->NewWritableFile(new_manifest_file, &descriptor_file, + storage_options_); + if (s.ok()) { + descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); + s = WriteSnapshot(descriptor_log_.get()); + } + } + + // The calls to Finalize and UpdateFilesBySize are cpu-heavy + // and are best called outside the mutex. + Finalize(v, size_being_compacted); + UpdateFilesBySize(v); + + // Write new record to MANIFEST log + if (s.ok()) { + std::string record; + for (unsigned int i = 0; i < batch_edits.size(); i++) { + batch_edits[i]->EncodeTo(&record); + s = descriptor_log_->AddRecord(record); + if (!s.ok()) { + break; + } + } + if (s.ok()) { + if (options_->use_fsync) { + StopWatch sw(env_, options_->statistics.get(), + MANIFEST_FILE_SYNC_MICROS); + s = descriptor_log_->file()->Fsync(); + } else { + StopWatch sw(env_, options_->statistics.get(), + MANIFEST_FILE_SYNC_MICROS); + s = descriptor_log_->file()->Sync(); + } + } + if (!s.ok()) { + Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str()); + if (ManifestContains(record)) { + Log(options_->info_log, + "MANIFEST contains log record despite error; advancing to new " + "version to prevent mismatch between in-memory and logged state" + " If paranoid is set, then the db is now in readonly mode."); + s = Status::OK(); + } + } + } + + // If we just created a new descriptor file, install it by writing a + // new CURRENT file that points to it. 
+ if (s.ok() && !new_manifest_file.empty()) { + s = SetCurrentFile(env_, dbname_, manifest_file_number_); + if (s.ok() && old_manifest_file_number < manifest_file_number_) { + // delete old manifest file + Log(options_->info_log, + "Deleting manifest %lu current manifest %lu\n", + (unsigned long)old_manifest_file_number, + (unsigned long)manifest_file_number_); + // we don't care about an error here, PurgeObsoleteFiles will take care + // of it later + env_->DeleteFile(DescriptorFileName(dbname_, old_manifest_file_number)); + } + } + + // find offset in manifest file where this version is stored. + new_manifest_file_size = descriptor_log_->file()->GetFileSize(); + + LogFlush(options_->info_log); + mu->Lock(); + } + + // Install the new version + if (s.ok()) { + manifest_file_size_ = new_manifest_file_size; + AppendVersion(v); + log_number_ = edit->log_number_; + prev_log_number_ = edit->prev_log_number_; + + } else { + Log(options_->info_log, "Error in committing version %lu", + (unsigned long)v->GetVersionNumber()); + delete v; + if (!new_manifest_file.empty()) { + descriptor_log_.reset(); + env_->DeleteFile(new_manifest_file); + } + } + + // wake up all the waiting writers + while (true) { + ManifestWriter* ready = manifest_writers_.front(); + manifest_writers_.pop_front(); + if (ready != &w) { + ready->status = s; + ready->done = true; + ready->cv.Signal(); + } + if (ready == last_writer) break; + } + // Notify new head of write queue + if (!manifest_writers_.empty()) { + manifest_writers_.front()->cv.Signal(); + } + return s; +} + +void VersionSet::LogAndApplyHelper(Builder* builder, Version* v, + VersionEdit* edit, port::Mutex* mu) { + mu->AssertHeld(); + + if (edit->has_log_number_) { + assert(edit->log_number_ >= log_number_); + assert(edit->log_number_ < next_file_number_); + } else { + edit->SetLogNumber(log_number_); + } + + if (!edit->has_prev_log_number_) { + edit->SetPrevLogNumber(prev_log_number_); + } + + edit->SetNextFile(next_file_number_); + 
edit->SetLastSequence(last_sequence_); + + builder->Apply(edit); +} + +Status VersionSet::Recover() { + struct LogReporter : public log::Reader::Reporter { + Status* status; + virtual void Corruption(size_t bytes, const Status& s) { + if (this->status->ok()) *this->status = s; + } + }; + + // Read "CURRENT" file, which contains a pointer to the current manifest file + std::string current; + Status s = ReadFileToString(env_, CurrentFileName(dbname_), &current); + if (!s.ok()) { + return s; + } + if (current.empty() || current[current.size()-1] != '\n') { + return Status::Corruption("CURRENT file does not end with newline"); + } + current.resize(current.size() - 1); + + Log(options_->info_log, "Recovering from manifest file:%s\n", + current.c_str()); + + std::string dscname = dbname_ + "/" + current; + unique_ptr file; + s = env_->NewSequentialFile(dscname, &file, storage_options_); + if (!s.ok()) { + return s; + } + uint64_t manifest_file_size; + s = env_->GetFileSize(dscname, &manifest_file_size); + if (!s.ok()) { + return s; + } + + bool have_log_number = false; + bool have_prev_log_number = false; + bool have_next_file = false; + bool have_last_sequence = false; + uint64_t next_file = 0; + uint64_t last_sequence = 0; + uint64_t log_number = 0; + uint64_t prev_log_number = 0; + Builder builder(this, current_); + + { + LogReporter reporter; + reporter.status = &s; + log::Reader reader(std::move(file), &reporter, true/*checksum*/, + 0/*initial_offset*/); + Slice record; + std::string scratch; + while (reader.ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + + if (edit.max_level_ >= current_->NumberLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + + if (edit.has_comparator_ && + edit.comparator_ != icmp_.user_comparator()->Name()) { + s = Status::InvalidArgument(icmp_.user_comparator()->Name(), + "does not match existing comparator " + + 
edit.comparator_); + break; + } + + builder.Apply(&edit); + + if (edit.has_log_number_) { + log_number = edit.log_number_; + have_log_number = true; + } + + if (edit.has_prev_log_number_) { + prev_log_number = edit.prev_log_number_; + have_prev_log_number = true; + } + + if (edit.has_next_file_number_) { + next_file = edit.next_file_number_; + have_next_file = true; + } + + if (edit.has_last_sequence_) { + last_sequence = edit.last_sequence_; + have_last_sequence = true; + } + } + } + file.reset(); + + if (s.ok()) { + if (!have_next_file) { + s = Status::Corruption("no meta-nextfile entry in descriptor"); + } else if (!have_log_number) { + s = Status::Corruption("no meta-lognumber entry in descriptor"); + } else if (!have_last_sequence) { + s = Status::Corruption("no last-sequence-number entry in descriptor"); + } + + if (!have_prev_log_number) { + prev_log_number = 0; + } + + MarkFileNumberUsed(prev_log_number); + MarkFileNumberUsed(log_number); + } + + if (s.ok()) { + Version* v = new Version(this, current_version_number_++); + builder.SaveTo(v); + + // Install recovered version + std::vector size_being_compacted(v->NumberLevels() - 1); + SizeBeingCompacted(size_being_compacted); + Finalize(v, size_being_compacted); + + manifest_file_size_ = manifest_file_size; + AppendVersion(v); + manifest_file_number_ = next_file; + next_file_number_ = next_file + 1; + last_sequence_ = last_sequence; + log_number_ = log_number; + prev_log_number_ = prev_log_number; + + Log(options_->info_log, "Recovered from manifest file:%s succeeded," + "manifest_file_number is %lu, next_file_number is %lu, " + "last_sequence is %lu, log_number is %lu," + "prev_log_number is %lu\n", + current.c_str(), + (unsigned long)manifest_file_number_, + (unsigned long)next_file_number_, + (unsigned long)last_sequence_, + (unsigned long)log_number_, + (unsigned long)prev_log_number_); + } + + return s; +} + +Status VersionSet::DumpManifest(Options& options, std::string& dscname, + bool verbose, bool 
hex) { + struct LogReporter : public log::Reader::Reporter { + Status* status; + virtual void Corruption(size_t bytes, const Status& s) { + if (this->status->ok()) *this->status = s; + } + }; + + // Open the specified manifest file. + unique_ptr file; + Status s = options.env->NewSequentialFile(dscname, &file, storage_options_); + if (!s.ok()) { + return s; + } + + bool have_log_number = false; + bool have_prev_log_number = false; + bool have_next_file = false; + bool have_last_sequence = false; + uint64_t next_file = 0; + uint64_t last_sequence = 0; + uint64_t log_number = 0; + uint64_t prev_log_number = 0; + int count = 0; + VersionSet::Builder builder(this, current_); + + { + LogReporter reporter; + reporter.status = &s; + log::Reader reader(std::move(file), &reporter, true/*checksum*/, + 0/*initial_offset*/); + Slice record; + std::string scratch; + while (reader.ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (s.ok()) { + if (edit.has_comparator_ && + edit.comparator_ != icmp_.user_comparator()->Name()) { + s = Status::InvalidArgument(icmp_.user_comparator()->Name(), + "does not match existing comparator " + + edit.comparator_); + } + } + + // Write out each individual edit + if (verbose) { + printf("*************************Edit[%d] = %s\n", + count, edit.DebugString(hex).c_str()); + } + count++; + + if (s.ok()) { + builder.Apply(&edit); + } + + if (edit.has_log_number_) { + log_number = edit.log_number_; + have_log_number = true; + } + + if (edit.has_prev_log_number_) { + prev_log_number = edit.prev_log_number_; + have_prev_log_number = true; + } + + if (edit.has_next_file_number_) { + next_file = edit.next_file_number_; + have_next_file = true; + } + + if (edit.has_last_sequence_) { + last_sequence = edit.last_sequence_; + have_last_sequence = true; + } + } + } + file.reset(); + + if (s.ok()) { + if (!have_next_file) { + s = Status::Corruption("no meta-nextfile entry in descriptor"); + printf("no 
meta-nextfile entry in descriptor"); + } else if (!have_log_number) { + s = Status::Corruption("no meta-lognumber entry in descriptor"); + printf("no meta-lognumber entry in descriptor"); + } else if (!have_last_sequence) { + printf("no last-sequence-number entry in descriptor"); + s = Status::Corruption("no last-sequence-number entry in descriptor"); + } + + if (!have_prev_log_number) { + prev_log_number = 0; + } + + MarkFileNumberUsed(prev_log_number); + MarkFileNumberUsed(log_number); + } + + if (s.ok()) { + Version* v = new Version(this, 0); + builder.SaveTo(v); + + // Install recovered version + std::vector size_being_compacted(v->NumberLevels() - 1); + SizeBeingCompacted(size_being_compacted); + Finalize(v, size_being_compacted); + + AppendVersion(v); + manifest_file_number_ = next_file; + next_file_number_ = next_file + 1; + last_sequence_ = last_sequence; + log_number_ = log_number; + prev_log_number_ = prev_log_number; + + printf("manifest_file_number %lu next_file_number %lu last_sequence " + "%lu log_number %lu prev_log_number %lu\n", + (unsigned long)manifest_file_number_, + (unsigned long)next_file_number_, + (unsigned long)last_sequence, + (unsigned long)log_number, + (unsigned long)prev_log_number); + printf("%s \n", v->DebugString(hex).c_str()); + } + + return s; +} + +void VersionSet::MarkFileNumberUsed(uint64_t number) { + if (next_file_number_ <= number) { + next_file_number_ = number + 1; + } +} + +void VersionSet::Finalize(Version* v, + std::vector& size_being_compacted) { + // Pre-sort level0 for Get() + if (options_->compaction_style == kCompactionStyleUniversal) { + std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo); + } else { + std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst); + } + + double max_score = 0; + int max_score_level = 0; + + int num_levels_to_check = + (options_->compaction_style != kCompactionStyleUniversal) ? 
+ v->NumberLevels() - 1 : 1; + + for (int level = 0; level < num_levels_to_check; level++) { + + double score; + if (level == 0) { + // We treat level-0 specially by bounding the number of files + // instead of number of bytes for two reasons: + // + // (1) With larger write-buffer sizes, it is nice not to do too + // many level-0 compactions. + // + // (2) The files in level-0 are merged on every read and + // therefore we wish to avoid too many files when the individual + // file size is small (perhaps because of a small write-buffer + // setting, or very high compression ratios, or lots of + // overwrites/deletions). + int numfiles = 0; + for (unsigned int i = 0; i < v->files_[level].size(); i++) { + if (!v->files_[level][i]->being_compacted) { + numfiles++; + } + } + + // If we are slowing down writes, then we better compact that first + if (numfiles >= options_->level0_stop_writes_trigger) { + score = 1000000; + // Log(options_->info_log, "XXX score l0 = 1000000000 max"); + } else if (numfiles >= options_->level0_slowdown_writes_trigger) { + score = 10000; + // Log(options_->info_log, "XXX score l0 = 1000000 medium"); + } else { + score = numfiles / + static_cast(options_->level0_file_num_compaction_trigger); + if (score >= 1) { + // Log(options_->info_log, "XXX score l0 = %d least", (int)score); + } + } + } else { + // Compute the ratio of current size to size limit. 
+ const uint64_t level_bytes = TotalFileSize(v->files_[level]) - + size_being_compacted[level]; + score = static_cast(level_bytes) / MaxBytesForLevel(level); + if (score > 1) { + // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score); + } + if (max_score < score) { + max_score = score; + max_score_level = level; + } + } + v->compaction_level_[level] = level; + v->compaction_score_[level] = score; + } + + // update the max compaction score in levels 1 to n-1 + v->max_compaction_score_ = max_score; + v->max_compaction_score_level_ = max_score_level; + + // sort all the levels based on their score. Higher scores get listed + // first. Use bubble sort because the number of entries is small. + for (int i = 0; i < v->NumberLevels() - 2; i++) { + for (int j = i + 1; j < v->NumberLevels() - 1; j++) { + if (v->compaction_score_[i] < v->compaction_score_[j]) { + double score = v->compaction_score_[i]; + int level = v->compaction_level_[i]; + v->compaction_score_[i] = v->compaction_score_[j]; + v->compaction_level_[i] = v->compaction_level_[j]; + v->compaction_score_[j] = score; + v->compaction_level_[j] = level; + } + } + } +} + +// A static comparator used to sort files based on their size +// In normal mode: descending size +static bool compareSizeDescending(const VersionSet::Fsize& first, + const VersionSet::Fsize& second) { + return (first.file->file_size > second.file->file_size); +} +// A static comparator used to sort files based on their seqno +// In universal style : descending seqno +static bool compareSeqnoDescending(const VersionSet::Fsize& first, + const VersionSet::Fsize& second) { + if (first.file->smallest_seqno > second.file->smallest_seqno) { + assert(first.file->largest_seqno > second.file->largest_seqno); + return true; + } + assert(first.file->largest_seqno <= second.file->largest_seqno); + return false; +} + +// sort all files in level1 to level(n-1) based on file size +void VersionSet::UpdateFilesBySize(Version* v) { + + // No need to sort 
the highest level because it is never compacted. + int max_level = (options_->compaction_style == kCompactionStyleUniversal) + ? v->NumberLevels() + : v->NumberLevels() - 1; + + for (int level = 0; level < max_level; level++) { + + const std::vector& files = v->files_[level]; + std::vector& files_by_size = v->files_by_size_[level]; + assert(files_by_size.size() == 0); + + // populate a temp vector for sorting based on size + std::vector temp(files.size()); + for (unsigned int i = 0; i < files.size(); i++) { + temp[i].index = i; + temp[i].file = files[i]; + } + + // sort the top number_of_files_to_sort_ based on file size + if (options_->compaction_style == kCompactionStyleUniversal) { + int num = temp.size(); + std::partial_sort(temp.begin(), temp.begin() + num, + temp.end(), compareSeqnoDescending); + } else { + int num = Version::number_of_files_to_sort_; + if (num > (int)temp.size()) { + num = temp.size(); + } + std::partial_sort(temp.begin(), temp.begin() + num, + temp.end(), compareSizeDescending); + } + assert(temp.size() == files.size()); + + // initialize files_by_size_ + for (unsigned int i = 0; i < temp.size(); i++) { + files_by_size.push_back(temp[i].index); + } + v->next_file_to_compact_by_size_[level] = 0; + assert(v->files_[level].size() == v->files_by_size_[level].size()); + } +} + +Status VersionSet::WriteSnapshot(log::Writer* log) { + // TODO: Break up into multiple records to reduce memory usage on recovery? 
+ + // Save metadata + VersionEdit edit; + edit.SetComparatorName(icmp_.user_comparator()->Name()); + + // Save compaction pointers + for (int level = 0; level < NumberLevels(); level++) { + if (!compact_pointer_[level].empty()) { + InternalKey key; + key.DecodeFrom(compact_pointer_[level]); + edit.SetCompactPointer(level, key); + } + } + + // Save files + for (int level = 0; level < current_->NumberLevels(); level++) { + const std::vector& files = current_->files_[level]; + for (size_t i = 0; i < files.size(); i++) { + const FileMetaData* f = files[i]; + edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest, + f->smallest_seqno, f->largest_seqno); + } + } + + std::string record; + edit.EncodeTo(&record); + return log->AddRecord(record); +} + +// Opens the manifest file and reads all records +// till it finds the record we are looking for. +bool VersionSet::ManifestContains(const std::string& record) const { + std::string fname = DescriptorFileName(dbname_, manifest_file_number_); + Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); + unique_ptr file; + Status s = env_->NewSequentialFile(fname, &file, storage_options_); + if (!s.ok()) { + Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str()); + Log(options_->info_log, + "ManifestContains: is unable to reopen the manifest file %s", + fname.c_str()); + return false; + } + log::Reader reader(std::move(file), nullptr, true/*checksum*/, 0); + Slice r; + std::string scratch; + bool result = false; + while (reader.ReadRecord(&r, &scratch)) { + if (r == Slice(record)) { + result = true; + break; + } + } + Log(options_->info_log, "ManifestContains: result = %d\n", result ?
1 : 0); + return result; +} + + +uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { + uint64_t result = 0; + for (int level = 0; level < v->NumberLevels(); level++) { + const std::vector& files = v->files_[level]; + for (size_t i = 0; i < files.size(); i++) { + if (icmp_.Compare(files[i]->largest, ikey) <= 0) { + // Entire file is before "ikey", so just add the file size + result += files[i]->file_size; + } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) { + // Entire file is after "ikey", so ignore + if (level > 0) { + // Files other than level 0 are sorted by meta->smallest, so + // no further files in this level will contain data for + // "ikey". + break; + } + } else { + // "ikey" falls in the range for this table. Add the + // approximate offset of "ikey" within the table. + TableReader* table_reader_ptr; + Iterator* iter = table_cache_->NewIterator( + ReadOptions(), storage_options_, files[i]->number, + files[i]->file_size, &table_reader_ptr); + if (table_reader_ptr != nullptr) { + result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode()); + } + delete iter; + } + } + } + return result; +} + +void VersionSet::AddLiveFiles(std::vector* live_list) { + // pre-calculate space requirement + int64_t total_files = 0; + for (Version* v = dummy_versions_.next_; + v != &dummy_versions_; + v = v->next_) { + for (int level = 0; level < v->NumberLevels(); level++) { + total_files += v->files_[level].size(); + } + } + + // just one time extension to the right size + live_list->reserve(live_list->size() + total_files); + + for (Version* v = dummy_versions_.next_; + v != &dummy_versions_; + v = v->next_) { + for (int level = 0; level < v->NumberLevels(); level++) { + for (const auto& f : v->files_[level]) { + live_list->push_back(f->number); + } + } + } +} + +// Stores the minimal range that covers all entries in inputs in +// *smallest, *largest. 
+// REQUIRES: inputs is not empty +void VersionSet::GetRange(const std::vector& inputs, + InternalKey* smallest, + InternalKey* largest) { + assert(!inputs.empty()); + smallest->Clear(); + largest->Clear(); + for (size_t i = 0; i < inputs.size(); i++) { + FileMetaData* f = inputs[i]; + if (i == 0) { + *smallest = f->smallest; + *largest = f->largest; + } else { + if (icmp_.Compare(f->smallest, *smallest) < 0) { + *smallest = f->smallest; + } + if (icmp_.Compare(f->largest, *largest) > 0) { + *largest = f->largest; + } + } + } +} + +// Stores the minimal range that covers all entries in inputs1 and inputs2 +// in *smallest, *largest. +// REQUIRES: inputs is not empty +void VersionSet::GetRange2(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, + InternalKey* largest) { + std::vector all = inputs1; + all.insert(all.end(), inputs2.begin(), inputs2.end()); + GetRange(all, smallest, largest); +} + +Iterator* VersionSet::MakeInputIterator(Compaction* c) { + ReadOptions options; + options.verify_checksums = options_->paranoid_checks; + options.fill_cache = false; + + // Level-0 files have to be merged together. For other levels, + // we will make a concatenating iterator per level. + // TODO(opt): use concatenating iterator for level-0 if there is no overlap + const int space = (c->level() == 0 ? 
c->inputs_[0].size() + 1 : 2); + Iterator** list = new Iterator*[space]; + int num = 0; + for (int which = 0; which < 2; which++) { + if (!c->inputs_[which].empty()) { + if (c->level() + which == 0) { + const std::vector& files = c->inputs_[which]; + for (size_t i = 0; i < files.size(); i++) { + list[num++] = table_cache_->NewIterator( + options, storage_options_compactions_, + files[i]->number, files[i]->file_size, nullptr, + true /* for compaction */); + } + } else { + // Create concatenating iterator for the files from this level + list[num++] = NewTwoLevelIterator( + new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]), + &GetFileIterator, table_cache_, options, storage_options_, + true /* for compaction */); + } + } + } + assert(num <= space); + Iterator* result = NewMergingIterator(&icmp_, list, num); + delete[] list; + return result; +} + +double VersionSet::MaxBytesForLevel(int level) { + // Note: the result for level zero is not really used since we set + // the level-0 compaction threshold based on number of files. 
+ assert(level >= 0); + assert(level < NumberLevels()); + return level_max_bytes_[level]; +} + +uint64_t VersionSet::MaxFileSizeForLevel(int level) { + assert(level >= 0); + assert(level < NumberLevels()); + return max_file_size_[level]; +} + +uint64_t VersionSet::ExpandedCompactionByteSizeLimit(int level) { + uint64_t result = MaxFileSizeForLevel(level); + result *= options_->expanded_compaction_factor; + return result; +} + +uint64_t VersionSet::MaxGrandParentOverlapBytes(int level) { + uint64_t result = MaxFileSizeForLevel(level); + result *= options_->max_grandparent_overlap_factor; + return result; +} + +// verify that the files listed in this compaction are present +// in the current version +bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { +#ifndef NDEBUG + if (c->input_version_ != current_) { + Log(options_->info_log, "VerifyCompactionFileConsistency version mismatch"); + } + + // verify files in level + int level = c->level(); + for (int i = 0; i < c->num_input_files(0); i++) { + uint64_t number = c->input(0,i)->number; + + // look for this file in the current version + bool found = false; + for (unsigned int j = 0; j < current_->files_[level].size(); j++) { + FileMetaData* f = current_->files_[level][j]; + if (f->number == number) { + found = true; + break; + } + } + if (!found) { + return false; // input files non existant in current version + } + } + // verify level+1 files + level++; + for (int i = 0; i < c->num_input_files(1); i++) { + uint64_t number = c->input(1,i)->number; + + // look for this file in the current version + bool found = false; + for (unsigned int j = 0; j < current_->files_[level].size(); j++) { + FileMetaData* f = current_->files_[level][j]; + if (f->number == number) { + found = true; + break; + } + } + if (!found) { + return false; // input files non existant in current version + } + } +#endif + return true; // everything good +} + +// Clear all files to indicate that they are not being compacted +// Delete this 
compaction from the list of running compactions. +void VersionSet::ReleaseCompactionFiles(Compaction* c, Status status) { + c->MarkFilesBeingCompacted(false); + compactions_in_progress_[c->level()].erase(c); + if (!status.ok()) { + c->ResetNextCompactionIndex(); + } +} + +// The total size of files that are currently being compacted +// at every level up to the penultimate level. +void VersionSet::SizeBeingCompacted(std::vector& sizes) { + for (int level = 0; level < NumberLevels() - 1; level++) { + uint64_t total = 0; + for (std::set::iterator it = + compactions_in_progress_[level].begin(); + it != compactions_in_progress_[level].end(); + ++it) { + Compaction* c = (*it); + assert(c->level() == level); + for (int i = 0; i < c->num_input_files(0); i++) { + total += c->input(0,i)->file_size; + } + } + sizes[level] = total; + } +} + +// +// Look at overall size amplification. If size amplification +// exceeds the configured value, then do a compaction +// of the candidate files all the way up to the earliest +// base file (overrides configured values of file-size ratios, +// min_merge_width and max_merge_width). +// +Compaction* VersionSet::PickCompactionUniversalSizeAmp(int level, + double score) { + assert (level == 0); + + // percentage flexibility while reducing size amplification + uint64_t ratio = options_->compaction_options_universal. + max_size_amplification_percent; + + // The files are sorted from newest first to oldest last.
+ std::vector& file_by_time = current_->files_by_size_[level]; + assert(file_by_time.size() == current_->files_[level].size()); + + unsigned int candidate_count = 0; + uint64_t candidate_size = 0; + unsigned int start_index = 0; + FileMetaData* f = nullptr; + + // Skip files that are already being compacted + for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) { + int index = file_by_time[loop]; + f = current_->files_[level][index]; + if (!f->being_compacted) { + start_index = loop; // Consider this as the first candidate. + break; + } + Log(options_->info_log, "Universal: skipping file %lu[%d] compacted %s", + (unsigned long)f->number, + loop, + " cannot be a candidate to reduce size amp.\n"); + f = nullptr; + } + if (f == nullptr) { + return nullptr; // no candidate files + } + + Log(options_->info_log, "Universal: First candidate file %lu[%d] %s", + (unsigned long)f->number, + start_index, + " to reduce size amp.\n"); + + // keep adding up all the remaining files + for (unsigned int loop = start_index; loop < file_by_time.size() - 1; + loop++) { + int index = file_by_time[loop]; + f = current_->files_[level][index]; + if (f->being_compacted) { + Log(options_->info_log, + "Universal: Possible candidate file %lu[%d] %s.", + (unsigned long)f->number, + loop, + " is already being compacted. No size amp reduction possible.\n"); + return nullptr; + } + candidate_size += f->file_size; + candidate_count++; + } + if (candidate_count == 0) { + return nullptr; + } + + // size of earliest file + int index = file_by_time[file_by_time.size() - 1]; + uint64_t earliest_file_size = current_->files_[level][index]->file_size; + + // size amplification = percentage of additional size + if (candidate_size * 100 < ratio * earliest_file_size) { + Log(options_->info_log, + "Universal: size amp not needed. 
newer-files-total-size %lu " + "earliest-file-size %lu", + (unsigned long)candidate_size, + (unsigned long)earliest_file_size); + return nullptr; + } else { + Log(options_->info_log, + "Universal: size amp needed. newer-files-total-size %lu " + "earliest-file-size %lu", + (unsigned long)candidate_size, + (unsigned long)earliest_file_size); + } + assert(start_index >= 0 && start_index < file_by_time.size() - 1); + + // create a compaction request + // We always compact all the files, so always compress. + Compaction* c = + new Compaction(current_, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, true); + c->score_ = score; + for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) { + int index = file_by_time[loop]; + f = c->input_version_->files_[level][index]; + c->inputs_[0].push_back(f); + Log(options_->info_log, + "Universal: size amp picking file %lu[%d] with size %lu", + (unsigned long)f->number, + index, + (unsigned long)f->file_size); + } + return c; +} + +// +// Consider compaction files based on their size differences with +// the next file in time order. +// +Compaction* VersionSet::PickCompactionUniversalReadAmp( + int level, double score, unsigned int ratio, + unsigned int max_number_of_files_to_compact) { + + unsigned int min_merge_width = + options_->compaction_options_universal.min_merge_width; + unsigned int max_merge_width = + options_->compaction_options_universal.max_merge_width; + + // The files are sorted from newest first to oldest last. 
+ std::vector& file_by_time = current_->files_by_size_[level]; + FileMetaData* f = nullptr; + bool done = false; + int start_index = 0; + unsigned int candidate_count; + assert(file_by_time.size() == current_->files_[level].size()); + + unsigned int max_files_to_compact = std::min(max_merge_width, + max_number_of_files_to_compact); + min_merge_width = std::max(min_merge_width, 2U); + + // Considers a candidate file only if it is smaller than the + // total size accumulated so far. + for (unsigned int loop = 0; loop < file_by_time.size(); loop++) { + + candidate_count = 0; + + // Skip files that are already being compacted + for (f = nullptr; loop < file_by_time.size(); loop++) { + int index = file_by_time[loop]; + f = current_->files_[level][index]; + + if (!f->being_compacted) { + candidate_count = 1; + break; + } + Log(options_->info_log, + "Universal: file %lu[%d] being compacted, skipping", + (unsigned long)f->number, loop); + f = nullptr; + } + + // This file is not being compacted. Consider it as the + // first candidate to be compacted. + uint64_t candidate_size = f != nullptr? f->file_size : 0; + if (f != nullptr) { + Log(options_->info_log, "Universal: Possible candidate file %lu[%d].", + (unsigned long)f->number, loop); + } + + // Check if the suceeding files need compaction. + for (unsigned int i = loop+1; + candidate_count < max_files_to_compact && i < file_by_time.size(); + i++) { + int index = file_by_time[i]; + FileMetaData* f = current_->files_[level][index]; + if (f->being_compacted) { + break; + } + // pick files if the total candidate file size (increased by the + // specified ratio) is still larger than the next candidate file. + uint64_t sz = (candidate_size * (100L + ratio)) /100; + if (sz < f->file_size) { + break; + } + candidate_count++; + candidate_size += f->file_size; + } + + // Found a series of consecutive files that need compaction. 
+ if (candidate_count >= (unsigned int)min_merge_width) { + start_index = loop; + done = true; + break; + } else { + for (unsigned int i = loop; + i < loop + candidate_count && i < file_by_time.size(); i++) { + int index = file_by_time[i]; + FileMetaData* f = current_->files_[level][index]; + Log(options_->info_log, + "Universal: Skipping file %lu[%d] with size %lu %d\n", + (unsigned long)f->number, + i, + (unsigned long)f->file_size, + f->being_compacted); + } + } + } + if (!done || candidate_count <= 1) { + return nullptr; + } + unsigned int first_index_after = start_index + candidate_count; + // Compression is enabled if files compacted earlier already reached + // size ratio of compression. + bool enable_compression = true; + int ratio_to_compress = + options_->compaction_options_universal.compression_size_percent; + if (ratio_to_compress >= 0) { + uint64_t total_size = TotalFileSize(current_->files_[level]); + uint64_t older_file_size = 0; + for (unsigned int i = file_by_time.size() - 1; i >= first_index_after; + i--) { + older_file_size += current_->files_[level][file_by_time[i]]->file_size; + if (older_file_size * 100L >= total_size * (long) ratio_to_compress) { + enable_compression = false; + break; + } + } + } + Compaction* c = + new Compaction(current_, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, enable_compression); + c->score_ = score; + + for (unsigned int i = start_index; i < first_index_after; i++) { + int index = file_by_time[i]; + FileMetaData* f = c->input_version_->files_[level][index]; + c->inputs_[0].push_back(f); + Log(options_->info_log, "Universal: Picking file %lu[%d] with size %lu\n", + (unsigned long)f->number, + i, + (unsigned long)f->file_size); + } + return c; +} + +// +// Universal style of compaction. Pick files that are contiguous in +// time-range to compact. 
+// +Compaction* VersionSet::PickCompactionUniversal(int level, double score) { + assert (level == 0); + + if ((current_->files_[level].size() < + (unsigned int)options_->level0_file_num_compaction_trigger)) { + Log(options_->info_log, "Universal: nothing to do\n"); + return nullptr; + } + Version::FileSummaryStorage tmp; + Log(options_->info_log, "Universal: candidate files(%lu): %s\n", + current_->files_[level].size(), + current_->LevelFileSummary(&tmp, 0)); + + // Check for size amplification first. + Compaction* c = PickCompactionUniversalSizeAmp(level, score); + if (c == nullptr) { + + // Size amplification is within limits. Try reducing read + // amplification while maintaining file size ratios. + unsigned int ratio = options_->compaction_options_universal.size_ratio; + c = PickCompactionUniversalReadAmp(level, score, ratio, UINT_MAX); + + // Size amplification and file size ratios are within configured limits. + // If max read amplification is exceeding configured limits, then force + // compaction without looking at filesize ratios and try to reduce + // the number of files to fewer than level0_file_num_compaction_trigger. + if (c == nullptr) { + unsigned int num_files = current_->files_[level].size() - + options_->level0_file_num_compaction_trigger; + c = PickCompactionUniversalReadAmp(level, score, UINT_MAX, num_files); + } + } + if (c == nullptr) { + return nullptr; + } + assert(c->inputs_[0].size() > 1); + + // validate that all the chosen files are non overlapping in time + FileMetaData* newerfile __attribute__((unused)) = nullptr; + for (unsigned int i = 0; i < c->inputs_[0].size(); i++) { + FileMetaData* f = c->inputs_[0][i]; + assert (f->smallest_seqno <= f->largest_seqno); + assert(newerfile == nullptr || + newerfile->smallest_seqno > f->largest_seqno); + newerfile = f; + } + + // The files are sorted from newest first to oldest last. 
+ std::vector& file_by_time = c->input_version_->files_by_size_[level]; + + // Is the earliest file part of this compaction? + int last_index = file_by_time[file_by_time.size()-1]; + FileMetaData* last_file = c->input_version_->files_[level][last_index]; + if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) { + c->bottommost_level_ = true; + } + + // update statistics + if (options_->statistics != nullptr) { + options_->statistics->measureTime(NUM_FILES_IN_SINGLE_COMPACTION, + c->inputs_[0].size()); + } + + // mark all the files that are being compacted + c->MarkFilesBeingCompacted(true); + + // remember this currently undergoing compaction + compactions_in_progress_[level].insert(c); + + // Record whether this compaction includes all sst files. + // For now, it is only relevant in universal compaction mode. + c->is_full_compaction_ = + (c->inputs_[0].size() == c->input_version_->files_[0].size()); + + return c; +} + +Compaction* VersionSet::PickCompactionBySize(int level, double score) { + Compaction* c = nullptr; + + // level 0 files are overlapping. So we cannot pick more + // than one concurrent compactions at this level. This + // could be made better by looking at key-ranges that are + // being compacted at level 0. 
+ if (level == 0 && compactions_in_progress_[level].size() == 1) { + return nullptr; + } + + assert(level >= 0); + assert(level + 1 < current_->NumberLevels()); + c = new Compaction(current_, level, level + 1, MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level)); + c->score_ = score; + + // Pick the largest file in this level that is not already + // being compacted + std::vector& file_size = c->input_version_->files_by_size_[level]; + + // record the first file that is not yet compacted + int nextIndex = -1; + + for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level]; + i < file_size.size(); i++) { + int index = file_size[i]; + FileMetaData* f = c->input_version_->files_[level][index]; + + // check to verify files are arranged in descending size + assert((i == file_size.size() - 1) || + (i >= Version::number_of_files_to_sort_ - 1) || + (f->file_size >= + c->input_version_->files_[level][file_size[i + 1]]->file_size)); + + // do not pick a file to compact if it is being compacted + // from n-1 level. + if (f->being_compacted) { + continue; + } + + // remember the startIndex for the next call to PickCompaction + if (nextIndex == -1) { + nextIndex = i; + } + + //if (i > Version::number_of_files_to_sort_) { + // Log(options_->info_log, "XXX Looking at index %d", i); + //} + + // Do not pick this file if its parents at level+1 are being compacted. 
+ // Maybe we can avoid redoing this work in SetupOtherInputs + int parent_index = -1; + if (ParentRangeInCompaction(&f->smallest, &f->largest, level, + &parent_index)) { + continue; + } + c->inputs_[0].push_back(f); + c->base_index_ = index; + c->parent_index_ = parent_index; + break; + } + + // store where to start the iteration in the next call to PickCompaction (must be done before c may be deleted below) + c->input_version_->next_file_to_compact_by_size_[level] = nextIndex; + + if (c->inputs_[0].empty()) { + delete c; + c = nullptr; + } + + return c; +} + +Compaction* VersionSet::PickCompaction() { + Compaction* c = nullptr; + int level = -1; + + // Compute the compactions needed. It is better to do it here + // and also in LogAndApply(), otherwise the values could be stale. + std::vector size_being_compacted(NumberLevels()-1); + current_->vset_->SizeBeingCompacted(size_being_compacted); + Finalize(current_, size_being_compacted); + + // In universal style of compaction, compact L0 files back into L0. + if (options_->compaction_style == kCompactionStyleUniversal) { + int level = 0; + c = PickCompactionUniversal(level, current_->compaction_score_[level]); + return c; + } + + // We prefer compactions triggered by too much data in a level over + // the compactions triggered by seeks. + // + // Find the compactions by size on all levels. + for (int i = 0; i < NumberLevels()-1; i++) { + assert(i == 0 || current_->compaction_score_[i] <= + current_->compaction_score_[i-1]); + level = current_->compaction_level_[i]; + if ((current_->compaction_score_[i] >= 1)) { + c = PickCompactionBySize(level, current_->compaction_score_[i]); + ExpandWhileOverlapping(c); + if (c != nullptr) { + break; + } + } + } + + // Find compactions needed by seeks + FileMetaData* f = current_->file_to_compact_; + if (c == nullptr && f != nullptr && !f->being_compacted) { + + level = current_->file_to_compact_level_; + int parent_index = -1; + + // Only allow one level 0 compaction at a time.
+ // Do not pick this file if its parents at level+1 are being compacted. + if (level != 0 || compactions_in_progress_[0].empty()) { + if(!ParentRangeInCompaction(&f->smallest, &f->largest, level, + &parent_index)) { + c = new Compaction(current_, level, level + 1, + MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level), true); + c->inputs_[0].push_back(f); + c->parent_index_ = parent_index; + c->input_version_->file_to_compact_ = nullptr; + ExpandWhileOverlapping(c); + } + } + } + + if (c == nullptr) { + return nullptr; + } + + // Two level 0 compaction won't run at the same time, so don't need to worry + // about files on level 0 being compacted. + if (level == 0) { + assert(compactions_in_progress_[0].empty()); + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + // Note that the next call will discard the file we placed in + // c->inputs_[0] earlier and replace it with an overlapping set + // which will include the picked file. + c->inputs_[0].clear(); + c->input_version_->GetOverlappingInputs(0, &smallest, &largest, + &c->inputs_[0]); + + // If we include more L0 files in the same compaction run it can + // cause the 'smallest' and 'largest' key to get extended to a + // larger range. 
So, re-invoke GetRange to get the new key range + GetRange(c->inputs_[0], &smallest, &largest); + if (ParentRangeInCompaction(&smallest, &largest, + level, &c->parent_index_)) { + delete c; + return nullptr; + } + assert(!c->inputs_[0].empty()); + } + + // Setup "level+1" files (inputs_[1]) + SetupOtherInputs(c); + + // mark all the files that are being compacted + c->MarkFilesBeingCompacted(true); + + // Is this compaction creating a file at the bottommost level + c->SetupBottomMostLevel(false); + + // remember this currently undergoing compaction + compactions_in_progress_[level].insert(c); + + return c; +} + +// Returns true if any one of the parent files is being compacted +bool VersionSet::ParentRangeInCompaction(const InternalKey* smallest, + const InternalKey* largest, int level, + int* parent_index) { + std::vector inputs; + assert(level + 1 < current_->NumberLevels()); + + current_->GetOverlappingInputs(level + 1, smallest, largest, &inputs, + *parent_index, parent_index); + return FilesInCompaction(inputs); +} + +// Returns true if any one of the specified files is being compacted +bool VersionSet::FilesInCompaction(std::vector& files) { + for (unsigned int i = 0; i < files.size(); i++) { + if (files[i]->being_compacted) { + return true; + } + } + return false; +} + +// Add more files to the inputs on "level" to make sure that +// no newer version of a key is compacted to "level+1" while leaving an older +// version in a "level". Otherwise, any Get() will search "level" first, +// and will likely return an old/stale value for the key, since it always +// searches in increasing order of level to find the value. This could +// also scramble the order of merge operands. This function should be +// called any time a new Compaction is created, and its inputs_[0] are +// populated. +// +// Deletes c if it is impossible to apply this compaction. NOTE: c is passed by value, so the caller's pointer is NOT reset to nullptr.
+void VersionSet::ExpandWhileOverlapping(Compaction* c) { + // If inputs are empty then there is nothing to expand. + if (!c || c->inputs_[0].empty()) { + return; + } + + // GetOverlappingInputs will always do the right thing for level-0. + // So we don't need to do any expansion if level == 0. + if (c->level() == 0) { + return; + } + + const int level = c->level(); + InternalKey smallest, largest; + + // Keep expanding c->inputs_[0] until we are sure that there is a + // "clean cut" boundary between the files in input and the surrounding files. + // This will ensure that no parts of a key are lost during compaction. + int hint_index = -1; + size_t old_size; + do { + old_size = c->inputs_[0].size(); + GetRange(c->inputs_[0], &smallest, &largest); + c->inputs_[0].clear(); + c->input_version_->GetOverlappingInputs( + level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index); + } while(c->inputs_[0].size() > old_size); + + // Get the new range + GetRange(c->inputs_[0], &smallest, &largest); + + // If, after the expansion, there are files that are already under + // compaction, then we must drop/cancel this compaction. + int parent_index = -1; + if (FilesInCompaction(c->inputs_[0]) || + (c->level() != c->output_level() && + ParentRangeInCompaction(&smallest, &largest, level, &parent_index))) { + c->inputs_[0].clear(); + c->inputs_[1].clear(); + delete c; + c = nullptr; + } +} + +// Populates the set of inputs from "level+1" that overlap with "level". +// Will also attempt to expand "level" if that doesn't expand "level+1" +// or cause "level" to include a file for compaction that has an overlapping +// user-key with another file. +void VersionSet::SetupOtherInputs(Compaction* c) { + // If inputs are empty, then there is nothing to expand. 
+ // If both input and output levels are the same, no need to consider + // files at level "level+1" + if (c->inputs_[0].empty() || c->level() == c->output_level()) { + return; + } + + const int level = c->level(); + InternalKey smallest, largest; + + // Get the range one last time. + GetRange(c->inputs_[0], &smallest, &largest); + + // Populate the set of next-level files (inputs_[1]) to include in compaction + c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest, + &c->inputs_[1], c->parent_index_, + &c->parent_index_); + + // Get entire range covered by compaction + InternalKey all_start, all_limit; + GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + + // See if we can further grow the number of inputs in "level" without + // changing the number of "level+1" files we pick up. We also choose NOT + // to expand if this would cause "level" to include some entries for some + // user key, while excluding other entries for the same user key. This + // can happen when one user key spans multiple files. 
+ if (!c->inputs_[1].empty()) { + std::vector expanded0; + c->input_version_->GetOverlappingInputs( + level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr); + const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]); + const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]); + const uint64_t expanded0_size = TotalFileSize(expanded0); + uint64_t limit = ExpandedCompactionByteSizeLimit(level); + if (expanded0.size() > c->inputs_[0].size() && + inputs1_size + expanded0_size < limit && + !FilesInCompaction(expanded0) && + !c->input_version_->HasOverlappingUserKey(&expanded0, level)) { + InternalKey new_start, new_limit; + GetRange(expanded0, &new_start, &new_limit); + std::vector expanded1; + c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit, + &expanded1, c->parent_index_, + &c->parent_index_); + if (expanded1.size() == c->inputs_[1].size() && + !FilesInCompaction(expanded1)) { + Log(options_->info_log, + "Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)" + "\n", + (unsigned long)level, + (unsigned long)(c->inputs_[0].size()), + (unsigned long)(c->inputs_[1].size()), + (unsigned long)inputs0_size, + (unsigned long)inputs1_size, + (unsigned long)(expanded0.size()), + (unsigned long)(expanded1.size()), + (unsigned long)expanded0_size, + (unsigned long)inputs1_size); + smallest = new_start; + largest = new_limit; + c->inputs_[0] = expanded0; + c->inputs_[1] = expanded1; + GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + } + } + } + + // Compute the set of grandparent files that overlap this compaction + // (parent == level+1; grandparent == level+2) + if (level + 2 < NumberLevels()) { + c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit, + &c->grandparents_); + } + + if (false) { + Log(options_->info_log, "Compacting %d '%s' .. 
'%s'", + level, + smallest.DebugString().c_str(), + largest.DebugString().c_str()); + } + + // Update the place where we will do the next compaction for this level. + // We update this immediately instead of waiting for the VersionEdit + // to be applied so that if the compaction fails, we will try a different + // key range next time. + compact_pointer_[level] = largest.Encode().ToString(); + c->edit_->SetCompactPointer(level, largest); +} + +Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, + FileMetaData* meta) { + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files = current_->files_[level]; + for (size_t i = 0; i < files.size(); i++) { + if (files[i]->number == number) { + *meta = *files[i]; + *filelevel = level; + return Status::OK(); + } + } + } + return Status::NotFound("File not present in any level"); +} + +void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files = current_->files_[level]; + for (size_t i = 0; i < files.size(); i++) { + LiveFileMetaData filemetadata; + filemetadata.name = TableFileName("", files[i]->number); + filemetadata.level = level; + filemetadata.size = files[i]->file_size; + filemetadata.smallestkey = files[i]->smallest.user_key().ToString(); + filemetadata.largestkey = files[i]->largest.user_key().ToString(); + filemetadata.smallest_seqno = files[i]->smallest_seqno; + filemetadata.largest_seqno = files[i]->largest_seqno; + metadata->push_back(filemetadata); + } + } +} + +void VersionSet::GetObsoleteFiles(std::vector* files) { + files->insert(files->end(), + obsolete_files_.begin(), + obsolete_files_.end()); + obsolete_files_.clear(); +} + +Compaction* VersionSet::CompactRange(int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end) { + std::vector inputs; + bool covering_the_whole_range = true; + + // All files are 
'overlapping' in universal style compaction. + // We have to compact the entire range in one shot. + if (options_->compaction_style == kCompactionStyleUniversal) { + begin = nullptr; + end = nullptr; + } + current_->GetOverlappingInputs(input_level, begin, end, &inputs); + if (inputs.empty()) { + return nullptr; + } + + // Avoid compacting too much in one shot in case the range is large. + // But we cannot do this for level-0 since level-0 files can overlap + // and we must not pick one file and drop another older file if the + // two files overlap. + if (input_level > 0) { + const uint64_t limit = + MaxFileSizeForLevel(input_level) * options_->source_compaction_factor; + uint64_t total = 0; + for (size_t i = 0; i + 1 < inputs.size(); ++i) { + uint64_t s = inputs[i]->file_size; + total += s; + if (total >= limit) { + **compaction_end = inputs[i + 1]->smallest; + covering_the_whole_range = false; + inputs.resize(i + 1); + break; + } + } + } + Compaction* c = new Compaction(current_, input_level, output_level, + MaxFileSizeForLevel(output_level), + MaxGrandParentOverlapBytes(input_level)); + + c->inputs_[0] = inputs; + ExpandWhileOverlapping(c); + if (c == nullptr) { + Log(options_->info_log, "Could not compact due to expansion failure.\n"); + return nullptr; + } + + SetupOtherInputs(c); + + if (covering_the_whole_range) { + *compaction_end = nullptr; + } + + // These files that are to be manaully compacted do not trample + // upon other files because manual compactions are processed when + // the system has a max of 1 background compaction thread. 
+ c->MarkFilesBeingCompacted(true); + + // Is this compaction creating a file at the bottommost level + c->SetupBottomMostLevel(true); + return c; +} + +Compaction::Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, + uint64_t max_grandparent_overlap_bytes, + bool seek_compaction, bool enable_compression) + : level_(level), + out_level_(out_level), + max_output_file_size_(target_file_size), + maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), + input_version_(input_version), + number_levels_(input_version_->NumberLevels()), + seek_compaction_(seek_compaction), + enable_compression_(enable_compression), + grandparent_index_(0), + seen_key_(false), + overlapped_bytes_(0), + base_index_(-1), + parent_index_(-1), + score_(0), + bottommost_level_(false), + is_full_compaction_(false), + level_ptrs_(std::vector(number_levels_)) { + + input_version_->Ref(); + edit_ = new VersionEdit(); + for (int i = 0; i < number_levels_; i++) { + level_ptrs_[i] = 0; + } +} + +Compaction::~Compaction() { + delete edit_; + if (input_version_ != nullptr) { + input_version_->Unref(); + } +} + +bool Compaction::IsTrivialMove() const { + // Avoid a move if there is lots of overlapping grandparent data. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + // If level_== out_level_, the purpose is to force compaction filter to be + // applied to that level, and thus cannot be a trivia move. 
+ return (level_ != out_level_ && + num_input_files(0) == 1 && + num_input_files(1) == 0 && + TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); +} + +void Compaction::AddInputDeletions(VersionEdit* edit) { + for (int which = 0; which < 2; which++) { + for (size_t i = 0; i < inputs_[which].size(); i++) { + edit->DeleteFile(level_ + which, inputs_[which][i]->number); + } + } +} + +bool Compaction::IsBaseLevelForKey(const Slice& user_key) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + return bottommost_level_; + } + // Maybe use binary search to find right entry instead of linear search? + const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); + for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { + const std::vector& files = input_version_->files_[lvl]; + for (; level_ptrs_[lvl] < files.size(); ) { + FileMetaData* f = files[level_ptrs_[lvl]]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so definitely not base level + return false; + } + break; + } + level_ptrs_[lvl]++; + } + } + return true; +} + +bool Compaction::ShouldStopBefore(const Slice& internal_key) { + // Scan to find earliest grandparent file that contains key. 
+ const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(internal_key, + grandparents_[grandparent_index_]->largest.Encode()) > 0) { + if (seen_key_) { + overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; + } + assert(grandparent_index_ + 1 >= grandparents_.size() || + icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), + grandparents_[grandparent_index_+1]->smallest.Encode()) + < 0); + grandparent_index_++; + } + seen_key_ = true; + + if (overlapped_bytes_ > maxGrandParentOverlapBytes_) { + // Too much overlap for current output; start new output + overlapped_bytes_ = 0; + return true; + } else { + return false; + } +} + +// Mark (or clear) each file that is being compacted +void Compaction::MarkFilesBeingCompacted(bool value) { + for (int i = 0; i < 2; i++) { + std::vector v = inputs_[i]; + for (unsigned int j = 0; j < inputs_[i].size(); j++) { + assert(value ? !inputs_[i][j]->being_compacted : + inputs_[i][j]->being_compacted); + inputs_[i][j]->being_compacted = value; + } + } +} + +// Is this compaction producing files at the bottommost level? +void Compaction::SetupBottomMostLevel(bool isManual) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + // If universal compaction style is used and manual + // compaction is occuring, then we are guaranteed that + // all files will be picked in a single compaction + // run. We can safely set bottommost_level_ = true. + // If it is not manual compaction, then bottommost_level_ + // is already set when the Compaction was created. 
+ if (isManual) { + bottommost_level_ = true; + } + return; + } + bottommost_level_ = true; + int num_levels = input_version_->vset_->NumberLevels(); + for (int i = output_level() + 1; i < num_levels; i++) { + if (input_version_->NumLevelFiles(i) > 0) { + bottommost_level_ = false; + break; + } + } +} + +void Compaction::ReleaseInputs() { + if (input_version_ != nullptr) { + input_version_->Unref(); + input_version_ = nullptr; + } +} + +void Compaction::ResetNextCompactionIndex() { + input_version_->ResetNextCompactionIndex(level_); +} + +static void InputSummary(std::vector& files, char* output, + int len) { + int write = 0; + for (unsigned int i = 0; i < files.size(); i++) { + int sz = len - write; + int ret = snprintf(output + write, sz, "%lu(%lu) ", + (unsigned long)files.at(i)->number, + (unsigned long)files.at(i)->file_size); + if (ret < 0 || ret >= sz) + break; + write += ret; + } +} + +void Compaction::Summary(char* output, int len) { + int write = snprintf(output, len, + "Base version %lu Base level %d, seek compaction:%d, inputs:", + (unsigned long)input_version_->GetVersionNumber(), + level_, + seek_compaction_); + if (write < 0 || write > len) { + return; + } + + char level_low_summary[100]; + InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary)); + char level_up_summary[100]; + if (inputs_[1].size()) { + InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary)); + } else { + level_up_summary[0] = '\0'; + } + + snprintf(output + write, len - write, "[%s],[%s]", + level_low_summary, level_up_summary); +} + +} // namespace rocksdb diff --git a/db/version_set.h b/db/version_set.h new file mode 100644 index 00000000..51f6d9b6 --- /dev/null +++ b/db/version_set.h @@ -0,0 +1,663 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// The representation of a DBImpl consists of a set of Versions. The +// newest version is called "current". Older versions may be kept +// around to provide a consistent view to live iterators. +// +// Each Version keeps track of a set of Table files per level. The +// entire set of versions is maintained in a VersionSet. +// +// Version,VersionSet are thread-compatible, but require external +// synchronization on all accesses. + +#pragma once +#include +#include +#include +#include +#include +#include "db/dbformat.h" +#include "db/version_edit.h" +#include "port/port.h" +#include "db/table_cache.h" + +namespace rocksdb { + +namespace log { class Writer; } + +class Compaction; +class Iterator; +class MemTable; +class TableCache; +class Version; +class VersionSet; +class MergeContext; + +// Return the smallest index i such that files[i]->largest >= key. +// Return files.size() if there is no such file. +// REQUIRES: "files" contains a sorted list of non-overlapping files. +extern int FindFile(const InternalKeyComparator& icmp, + const std::vector& files, + const Slice& key); + +// Returns true iff some file in "files" overlaps the user key range +// [*smallest,*largest]. +// smallest==nullptr represents a key smaller than all keys in the DB. +// largest==nullptr represents a key largest than all keys in the DB. +// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges +// in sorted order. 
+extern bool SomeFileOverlapsRange( + const InternalKeyComparator& icmp, + bool disjoint_sorted_files, + const std::vector& files, + const Slice* smallest_user_key, + const Slice* largest_user_key); + +class Version { + public: + // Append to *iters a sequence of iterators that will + // yield the contents of this Version when merged together. + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + void AddIterators(const ReadOptions&, const EnvOptions& soptions, + std::vector* iters); + + // Lookup the value for key. If found, store it in *val and + // return OK. Else return a non-OK status. Fills *stats. + // Uses *operands to store merge_operator operations to apply later + // REQUIRES: lock is not held + struct GetStats { + FileMetaData* seek_file; + int seek_file_level; + }; + void Get(const ReadOptions&, const LookupKey& key, std::string* val, + Status* status, MergeContext* merge_context, + GetStats* stats, const Options& db_option, bool* value_found = + nullptr); + + // Adds "stats" into the current state. Returns true if a new + // compaction may need to be triggered, false otherwise. 
+ // REQUIRES: lock is held + bool UpdateStats(const GetStats& stats); + + // Reference count management (so Versions do not disappear out from + // under live iterators) + void Ref(); + void Unref(); + + void GetOverlappingInputs( + int level, + const InternalKey* begin, // nullptr means before all keys + const InternalKey* end, // nullptr means after all keys + std::vector* inputs, + int hint_index = -1, // index of overlap file + int* file_index = nullptr); // return index of overlap file + + void GetOverlappingInputsBinarySearch( + int level, + const Slice& begin, // nullptr means before all keys + const Slice& end, // nullptr means after all keys + std::vector* inputs, + int hint_index, // index of overlap file + int* file_index); // return index of overlap file + + void ExtendOverlappingInputs( + int level, + const Slice& begin, // nullptr means before all keys + const Slice& end, // nullptr means after all keys + std::vector* inputs, + unsigned int index); // start extending from this index + + // Returns true iff some file in the specified level overlaps + // some part of [*smallest_user_key,*largest_user_key]. + // smallest_user_key==NULL represents a key smaller than all keys in the DB. + // largest_user_key==NULL represents a key largest than all keys in the DB. + bool OverlapInLevel(int level, + const Slice* smallest_user_key, + const Slice* largest_user_key); + + // Returns true iff the first or last file in inputs contains + // an overlapping user key to the file "just outside" of it (i.e. + // just after the last file, or just before the first file) + // REQUIRES: "*inputs" is a sorted list of non-overlapping files + bool HasOverlappingUserKey(const std::vector* inputs, + int level); + + + // Return the level at which we should place a new memtable compaction + // result that covers the range [smallest_user_key,largest_user_key]. 
+ int PickLevelForMemTableOutput(const Slice& smallest_user_key, + const Slice& largest_user_key); + + int NumberLevels() const { return num_levels_; } + + // REQUIRES: lock is held + int NumLevelFiles(int level) const { return files_[level].size(); } + + // Return the combined file size of all files at the specified level. + int64_t NumLevelBytes(int level) const; + + // Return a human-readable short (single-line) summary of the number + // of files per level. Uses *scratch as backing store. + struct LevelSummaryStorage { + char buffer[100]; + }; + struct FileSummaryStorage { + char buffer[1000]; + }; + const char* LevelSummary(LevelSummaryStorage* scratch) const; + // Return a human-readable short (single-line) summary of files + // in a specified level. Uses *scratch as backing store. + const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t MaxNextLevelOverlappingBytes(); + + // Add all files listed in the current version to *live. + void AddLiveFiles(std::set* live); + + // Return a human readable string that describes this version's contents. 
+ std::string DebugString(bool hex = false) const; + + // Returns the version nuber of this version + uint64_t GetVersionNumber() const { return version_number_; } + + private: + friend class Compaction; + friend class VersionSet; + friend class DBImpl; + + class LevelFileNumIterator; + Iterator* NewConcatenatingIterator(const ReadOptions&, + const EnvOptions& soptions, + int level) const; + bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions, + const Slice& internal_prefix, Iterator* level_iter) const; + + VersionSet* vset_; // VersionSet to which this Version belongs + Version* next_; // Next version in linked list + Version* prev_; // Previous version in linked list + int refs_; // Number of live refs to this version + int num_levels_; // Number of levels + + // List of files per level, files in each level are arranged + // in increasing order of keys + std::vector* files_; + + // A list for the same set of files that are stored in files_, + // but files in each level are now sorted based on file + // size. The file with the largest size is at the front. + // This vector stores the index of the file from files_. + std::vector< std::vector > files_by_size_; + + // An index into files_by_size_ that specifies the first + // file that is not yet compacted + std::vector next_file_to_compact_by_size_; + + // Only the first few entries of files_by_size_ are sorted. + // There is no need to sort all the files because it is likely + // that on a running system, we need to look at only the first + // few largest files because a new version is created every few + // seconds/minutes (because of concurrent compactions). + static const int number_of_files_to_sort_ = 50; + + // Next file to compact based on seek stats. + FileMetaData* file_to_compact_; + int file_to_compact_level_; + + // Level that should be compacted next and its compaction score. + // Score < 1 means compaction is not strictly needed. These fields + // are initialized by Finalize(). 
+ // The most critical level to be compacted is listed first + // These are used to pick the best compaction level + std::vector compaction_score_; + std::vector compaction_level_; + double max_compaction_score_; // max score in l1 to ln-1 + int max_compaction_score_level_; // level on which max score occurs + + // A version number that uniquely represents this version. This is + // used for debugging and logging purposes only. + uint64_t version_number_; + + explicit Version(VersionSet* vset, uint64_t version_number = 0); + + ~Version(); + + // re-initializes the index that is used to offset into files_by_size_ + // to find the next compaction candidate file. + void ResetNextCompactionIndex(int level) { + next_file_to_compact_by_size_[level] = 0; + } + + // No copying allowed + Version(const Version&); + void operator=(const Version&); +}; + +class VersionSet { + public: + VersionSet(const std::string& dbname, const Options* options, + const EnvOptions& storage_options, TableCache* table_cache, + const InternalKeyComparator*); + ~VersionSet(); + + // Apply *edit to the current version to form a new descriptor that + // is both saved to persistent state and installed as the new + // current version. Will release *mu while actually writing to the file. + // REQUIRES: *mu is held on entry. + // REQUIRES: no other thread concurrently calls LogAndApply() + Status LogAndApply(VersionEdit* edit, port::Mutex* mu, + bool new_descriptor_log = false); + + // Recover the last saved descriptor from persistent storage. + Status Recover(); + + // Try to reduce the number of levels. This call is valid when + // only one level from the new max level to the old + // max level containing files. + // For example, a db currently has 7 levels [0-6], and a call to + // to reduce to 5 [0-4] can only be executed when only one level + // among [4-6] contains files. + Status ReduceNumberOfLevels(int new_levels, port::Mutex* mu); + + // Return the current version. 
+ Version* current() const { return current_; } + + // A Flag indicating whether write needs to slowdown because of there are + // too many number of level0 files. + bool NeedSlowdownForNumLevel0Files() const { + return need_slowdown_for_num_level0_files_; + } + + // Return the current manifest file number + uint64_t ManifestFileNumber() const { return manifest_file_number_; } + + // Allocate and return a new file number + uint64_t NewFileNumber() { return next_file_number_++; } + + // Arrange to reuse "file_number" unless a newer file number has + // already been allocated. + // REQUIRES: "file_number" was returned by a call to NewFileNumber(). + void ReuseFileNumber(uint64_t file_number) { + if (next_file_number_ == file_number + 1) { + next_file_number_ = file_number; + } + } + + // Return the last sequence number. + uint64_t LastSequence() const { + return last_sequence_.load(std::memory_order_acquire); + } + + // Set the last sequence number to s. + void SetLastSequence(uint64_t s) { + assert(s >= last_sequence_); + last_sequence_.store(s, std::memory_order_release); + } + + // Mark the specified file number as used. + void MarkFileNumberUsed(uint64_t number); + + // Return the current log file number. + uint64_t LogNumber() const { return log_number_; } + + // Return the log file number for the log file that is currently + // being compacted, or zero if there is no such log file. + uint64_t PrevLogNumber() const { return prev_log_number_; } + + int NumberLevels() const { return num_levels_; } + + // Pick level and inputs for a new compaction. + // Returns nullptr if there is no compaction to be done. + // Otherwise returns a pointer to a heap-allocated object that + // describes the compaction. Caller should delete the result. + Compaction* PickCompaction(); + + // Return a compaction object for compacting the range [begin,end] in + // the specified level. Returns nullptr if there is nothing in that + // level that overlaps the specified range. 
Caller should delete + // the result. + // + // The returned Compaction might not include the whole requested range. + // In that case, compaction_end will be set to the next key that needs + // compacting. In case the compaction will compact the whole range, + // compaction_end will be set to nullptr. + // Client is responsible for compaction_end storage -- when called, + // *compaction_end should point to valid InternalKey! + Compaction* CompactRange(int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end); + + // Create an iterator that reads over the compaction inputs for "*c". + // The caller should delete the iterator when no longer needed. + Iterator* MakeInputIterator(Compaction* c); + + // Returns true iff some level needs a compaction because it has + // exceeded its target size. + bool NeedsSizeCompaction() const { + // In universal compaction case, this check doesn't really + // check the compaction condition, but checks num of files threshold + // only. We are not going to miss any compaction opportunity + // but it's likely that more compactions are scheduled but + // ending up with nothing to do. We can improve it later. + // TODO: improve this function to be accurate for universal + // compactions. + int num_levels_to_check = + (options_->compaction_style != kCompactionStyleUniversal) ? + NumberLevels() - 1 : 1; + for (int i = 0; i < num_levels_to_check; i++) { + if (current_->compaction_score_[i] >= 1) { + return true; + } + } + return false; + } + // Returns true iff some level needs a compaction. 
+ bool NeedsCompaction() const { + return ((current_->file_to_compact_ != nullptr) || + NeedsSizeCompaction()); + } + + // Returns the maxmimum compaction score for levels 1 to max + double MaxCompactionScore() const { + return current_->max_compaction_score_; + } + + // See field declaration + int MaxCompactionScoreLevel() const { + return current_->max_compaction_score_level_; + } + + // Add all files listed in any live version to *live. + void AddLiveFiles(std::vector* live_list); + + // Return the approximate offset in the database of the data for + // "key" as of version "v". + uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); + + // printf contents (for debugging) + Status DumpManifest(Options& options, std::string& manifestFileName, + bool verbose, bool hex = false); + + // Return the size of the current manifest file + uint64_t ManifestFileSize() const { return manifest_file_size_; } + + // For the specfied level, pick a compaction. + // Returns nullptr if there is no compaction to be done. + // If level is 0 and there is already a compaction on that level, this + // function will return nullptr. + Compaction* PickCompactionBySize(int level, double score); + + // Pick files to compact in Universal mode + Compaction* PickCompactionUniversal(int level, double score); + + // Pick Universal compaction to limit read amplification + Compaction* PickCompactionUniversalReadAmp(int level, double score, + unsigned int ratio, unsigned int num_files); + + // Pick Universal compaction to limit space amplification. + Compaction* PickCompactionUniversalSizeAmp(int level, double score); + + // Free up the files that were participated in a compaction + void ReleaseCompactionFiles(Compaction* c, Status status); + + // verify that the files that we started with for a compaction + // still exist in the current version and in the same original level. + // This ensures that a concurrent compaction did not erroneously + // pick the same files to compact. 
+ bool VerifyCompactionFileConsistency(Compaction* c); + + // used to sort files by size + typedef struct fsize { + int index; + FileMetaData* file; + } Fsize; + + // Sort all files for this version based on their file size and + // record results in files_by_size_. The largest files are listed first. + void UpdateFilesBySize(Version *v); + + // Get the max file size in a given level. + uint64_t MaxFileSizeForLevel(int level); + + double MaxBytesForLevel(int level); + + Status GetMetadataForFile( + uint64_t number, int *filelevel, FileMetaData *metadata); + + void GetLiveFilesMetaData( + std::vector *metadata); + + void GetObsoleteFiles(std::vector* files); + + private: + class Builder; + struct ManifestWriter; + + friend class Compaction; + friend class Version; + + void Init(int num_levels); + + void Finalize(Version* v, std::vector&); + + void GetRange(const std::vector& inputs, + InternalKey* smallest, + InternalKey* largest); + + void GetRange2(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, + InternalKey* largest); + + void ExpandWhileOverlapping(Compaction* c); + + void SetupOtherInputs(Compaction* c); + + // Save current contents to *log + Status WriteSnapshot(log::Writer* log); + + void AppendVersion(Version* v); + + bool ManifestContains(const std::string& record) const; + + uint64_t ExpandedCompactionByteSizeLimit(int level); + + uint64_t MaxGrandParentOverlapBytes(int level); + + Env* const env_; + const std::string dbname_; + const Options* const options_; + TableCache* const table_cache_; + const InternalKeyComparator icmp_; + uint64_t next_file_number_; + uint64_t manifest_file_number_; + std::atomic last_sequence_; + uint64_t log_number_; + uint64_t prev_log_number_; // 0 or backing store for memtable being compacted + + int num_levels_; + + // Opened lazily + unique_ptr descriptor_log_; + Version dummy_versions_; // Head of circular doubly-linked list of versions. 
+ Version* current_; // == dummy_versions_.prev_ + + // A flag indicating whether we should delay writes because + // we have too many level 0 files + bool need_slowdown_for_num_level0_files_; + + // Per-level key at which the next compaction at that level should start. + // Either an empty string, or a valid InternalKey. + std::string* compact_pointer_; + + // Per-level target file size. + uint64_t* max_file_size_; + + // Per-level max bytes + uint64_t* level_max_bytes_; + + // record all the ongoing compactions for all levels + std::vector > compactions_in_progress_; + + // generates a increasing version number for every new version + uint64_t current_version_number_; + + // Queue of writers to the manifest file + std::deque manifest_writers_; + + // Current size of manifest file + uint64_t manifest_file_size_; + + std::vector obsolete_files_; + + // storage options for all reads and writes except compactions + const EnvOptions& storage_options_; + + // storage options used for compactions. This is a copy of + // storage_options_ but with readaheads set to readahead_compactions_. + const EnvOptions storage_options_compactions_; + + // No copying allowed + VersionSet(const VersionSet&); + void operator=(const VersionSet&); + + // Return the total amount of data that is undergoing + // compactions per level + void SizeBeingCompacted(std::vector&); + + // Returns true if any one of the parent files are being compacted + bool ParentRangeInCompaction(const InternalKey* smallest, + const InternalKey* largest, int level, int* index); + + // Returns true if any one of the specified files are being compacted + bool FilesInCompaction(std::vector& files); + + void LogAndApplyHelper(Builder*b, Version* v, + VersionEdit* edit, port::Mutex* mu); +}; + +// A Compaction encapsulates information about a compaction. +class Compaction { + public: + ~Compaction(); + + // Return the level that is being compacted. Inputs from "level" + // will be merged. 
+ int level() const { return level_; } + + // Outputs will go to this level + int output_level() const { return out_level_; } + + // Return the object that holds the edits to the descriptor done + // by this compaction. + VersionEdit* edit() { return edit_; } + + // "which" must be either 0 or 1 + int num_input_files(int which) const { return inputs_[which].size(); } + + // Return the ith input file at "level()+which" ("which" must be 0 or 1). + FileMetaData* input(int which, int i) const { return inputs_[which][i]; } + + // Maximum size of files to build during this compaction. + uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + + // Whether compression will be enabled for compaction outputs + bool enable_compression() const { return enable_compression_; } + + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + + // Add all inputs to this compaction as delete operations to *edit. + void AddInputDeletions(VersionEdit* edit); + + // Returns true if the information we have available guarantees that + // the compaction is producing data in "level+1" for which no data exists + // in levels greater than "level+1". + bool IsBaseLevelForKey(const Slice& user_key); + + // Returns true iff we should stop building the current output + // before processing "internal_key". + bool ShouldStopBefore(const Slice& internal_key); + + // Release the input version for the compaction, once the compaction + // is successful. + void ReleaseInputs(); + + void Summary(char* output, int len); + + // Return the score that was used to pick this compaction run. + double score() const { return score_; } + + // Is this compaction creating a file in the bottom most level? + bool BottomMostLevel() { return bottommost_level_; } + + // Does this compaction include all sst files? 
+ bool IsFullCompaction() { return is_full_compaction_; } + + private: + friend class Version; + friend class VersionSet; + + Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, + bool seek_compaction = false, bool enable_compression = true); + + int level_; + int out_level_; // levels to which output files are stored + uint64_t max_output_file_size_; + uint64_t maxGrandParentOverlapBytes_; + Version* input_version_; + VersionEdit* edit_; + int number_levels_; + + bool seek_compaction_; + bool enable_compression_; + + // Each compaction reads inputs from "level_" and "level_+1" + std::vector inputs_[2]; // The two sets of inputs + + // State used to check for number of of overlapping grandparent files + // (parent == level_ + 1, grandparent == level_ + 2) + std::vector grandparents_; + size_t grandparent_index_; // Index in grandparent_starts_ + bool seen_key_; // Some output key has been seen + uint64_t overlapped_bytes_; // Bytes of overlap between current output + // and grandparent files + int base_index_; // index of the file in files_[level_] + int parent_index_; // index of some file with same range in files_[level_+1] + double score_; // score that was used to pick this compaction. + + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + // Does this compaction include all sst files? + bool is_full_compaction_; + + // level_ptrs_ holds indices into input_version_->levels_: our state + // is that we are positioned at one of the file ranges for each + // higher level than the ones involved in this compaction (i.e. for + // all L >= level_ + 2). 
+ std::vector level_ptrs_; + + // mark (or clear) all files that are being compacted + void MarkFilesBeingCompacted(bool); + + // Initialize whether compaction producing files at the bottommost level + void SetupBottomMostLevel(bool isManual); + + // In case of compaction error, reset the nextIndex that is used + // to pick up the next file to be compacted from files_by_size_ + void ResetNextCompactionIndex(); +}; + +} // namespace rocksdb diff --git a/db/version_set_reduce_num_levels.cc b/db/version_set_reduce_num_levels.cc new file mode 100644 index 00000000..2ca68980 --- /dev/null +++ b/db/version_set_reduce_num_levels.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 Facebook. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "db/version_set.h" + +#include +#include +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "util/logging.h" + +namespace rocksdb { + +Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { + + if(new_levels <= 1) { + return Status::InvalidArgument( + "Number of levels needs to be bigger than 1"); + } + + Version* current_version = current_; + int current_levels = current_version->NumberLevels(); + + if (current_levels <= new_levels) { + return Status::OK(); + } + + // Make sure there are file only on one level from + // (new_levels-1) to (current_levels-1) + int first_nonempty_level = -1; + int first_nonempty_level_filenum = 0; + for (int i = new_levels - 1; i < current_levels; i++) { + int file_num = current_version->NumLevelFiles(i); + if (file_num != 0) { + if (first_nonempty_level < 0) { + first_nonempty_level = i; + first_nonempty_level_filenum = file_num; + } else { + char msg[255]; + sprintf(msg, "Found at least two levels containing files: " + "[%d:%d],[%d:%d].\n", + first_nonempty_level, first_nonempty_level_filenum, i, file_num); + return Status::InvalidArgument(msg); + } + } + } + + Status st; + std::vector* old_files_list = current_version->files_; + std::vector* new_files_list = + new std::vector[new_levels]; + for (int i = 0; i < new_levels - 1; i++) { + new_files_list[i] = old_files_list[i]; + } + + if (first_nonempty_level > 0) { + new_files_list[new_levels - 1] = old_files_list[first_nonempty_level]; + } + + delete[] current_version->files_; + current_version->files_ = new_files_list; + current_version->num_levels_ = new_levels; + + delete[] compact_pointer_; + delete[] max_file_size_; + delete[] level_max_bytes_; + num_levels_ = new_levels; + compact_pointer_ = new std::string[new_levels]; + Init(new_levels); + VersionEdit ve; + st = LogAndApply(&ve, mu, true); + return st; +} + +} diff --git a/db/version_set_test.cc b/db/version_set_test.cc new file mode 100644 index 00000000..1af95dd3 --- /dev/null 
+++ b/db/version_set_test.cc @@ -0,0 +1,184 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_set.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class FindFileTest { + public: + std::vector files_; + bool disjoint_sorted_files_; + + FindFileTest() : disjoint_sorted_files_(true) { } + + ~FindFileTest() { + for (unsigned int i = 0; i < files_.size(); i++) { + delete files_[i]; + } + } + + void Add(const char* smallest, const char* largest, + SequenceNumber smallest_seq = 100, + SequenceNumber largest_seq = 100) { + FileMetaData* f = new FileMetaData; + f->number = files_.size() + 1; + f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); + f->largest = InternalKey(largest, largest_seq, kTypeValue); + files_.push_back(f); + } + + int Find(const char* key) { + InternalKey target(key, 100, kTypeValue); + InternalKeyComparator cmp(BytewiseComparator()); + return FindFile(cmp, files_, target.Encode()); + } + + bool Overlaps(const char* smallest, const char* largest) { + InternalKeyComparator cmp(BytewiseComparator()); + Slice s(smallest != nullptr ? smallest : ""); + Slice l(largest != nullptr ? largest : ""); + return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_, + (smallest != nullptr ? &s : nullptr), + (largest != nullptr ? &l : nullptr)); + } +}; + +TEST(FindFileTest, Empty) { + ASSERT_EQ(0, Find("foo")); + ASSERT_TRUE(! Overlaps("a", "z")); + ASSERT_TRUE(! Overlaps(nullptr, "z")); + ASSERT_TRUE(! 
Overlaps("a", nullptr)); + ASSERT_TRUE(! Overlaps(nullptr, nullptr)); +} + +TEST(FindFileTest, Single) { + Add("p", "q"); + ASSERT_EQ(0, Find("a")); + ASSERT_EQ(0, Find("p")); + ASSERT_EQ(0, Find("p1")); + ASSERT_EQ(0, Find("q")); + ASSERT_EQ(1, Find("q1")); + ASSERT_EQ(1, Find("z")); + + ASSERT_TRUE(! Overlaps("a", "b")); + ASSERT_TRUE(! Overlaps("z1", "z2")); + ASSERT_TRUE(Overlaps("a", "p")); + ASSERT_TRUE(Overlaps("a", "q")); + ASSERT_TRUE(Overlaps("a", "z")); + ASSERT_TRUE(Overlaps("p", "p1")); + ASSERT_TRUE(Overlaps("p", "q")); + ASSERT_TRUE(Overlaps("p", "z")); + ASSERT_TRUE(Overlaps("p1", "p2")); + ASSERT_TRUE(Overlaps("p1", "z")); + ASSERT_TRUE(Overlaps("q", "q")); + ASSERT_TRUE(Overlaps("q", "q1")); + + ASSERT_TRUE(! Overlaps(nullptr, "j")); + ASSERT_TRUE(! Overlaps("r", nullptr)); + ASSERT_TRUE(Overlaps(nullptr, "p")); + ASSERT_TRUE(Overlaps(nullptr, "p1")); + ASSERT_TRUE(Overlaps("q", nullptr)); + ASSERT_TRUE(Overlaps(nullptr, nullptr)); +} + + +TEST(FindFileTest, Multiple) { + Add("150", "200"); + Add("200", "250"); + Add("300", "350"); + Add("400", "450"); + ASSERT_EQ(0, Find("100")); + ASSERT_EQ(0, Find("150")); + ASSERT_EQ(0, Find("151")); + ASSERT_EQ(0, Find("199")); + ASSERT_EQ(0, Find("200")); + ASSERT_EQ(1, Find("201")); + ASSERT_EQ(1, Find("249")); + ASSERT_EQ(1, Find("250")); + ASSERT_EQ(2, Find("251")); + ASSERT_EQ(2, Find("299")); + ASSERT_EQ(2, Find("300")); + ASSERT_EQ(2, Find("349")); + ASSERT_EQ(2, Find("350")); + ASSERT_EQ(3, Find("351")); + ASSERT_EQ(3, Find("400")); + ASSERT_EQ(3, Find("450")); + ASSERT_EQ(4, Find("451")); + + ASSERT_TRUE(! Overlaps("100", "149")); + ASSERT_TRUE(! Overlaps("251", "299")); + ASSERT_TRUE(! Overlaps("451", "500")); + ASSERT_TRUE(! 
Overlaps("351", "399")); + + ASSERT_TRUE(Overlaps("100", "150")); + ASSERT_TRUE(Overlaps("100", "200")); + ASSERT_TRUE(Overlaps("100", "300")); + ASSERT_TRUE(Overlaps("100", "400")); + ASSERT_TRUE(Overlaps("100", "500")); + ASSERT_TRUE(Overlaps("375", "400")); + ASSERT_TRUE(Overlaps("450", "450")); + ASSERT_TRUE(Overlaps("450", "500")); +} + +TEST(FindFileTest, MultipleNullBoundaries) { + Add("150", "200"); + Add("200", "250"); + Add("300", "350"); + Add("400", "450"); + ASSERT_TRUE(! Overlaps(nullptr, "149")); + ASSERT_TRUE(! Overlaps("451", nullptr)); + ASSERT_TRUE(Overlaps(nullptr, nullptr)); + ASSERT_TRUE(Overlaps(nullptr, "150")); + ASSERT_TRUE(Overlaps(nullptr, "199")); + ASSERT_TRUE(Overlaps(nullptr, "200")); + ASSERT_TRUE(Overlaps(nullptr, "201")); + ASSERT_TRUE(Overlaps(nullptr, "400")); + ASSERT_TRUE(Overlaps(nullptr, "800")); + ASSERT_TRUE(Overlaps("100", nullptr)); + ASSERT_TRUE(Overlaps("200", nullptr)); + ASSERT_TRUE(Overlaps("449", nullptr)); + ASSERT_TRUE(Overlaps("450", nullptr)); +} + +TEST(FindFileTest, OverlapSequenceChecks) { + Add("200", "200", 5000, 3000); + ASSERT_TRUE(! Overlaps("199", "199")); + ASSERT_TRUE(! Overlaps("201", "300")); + ASSERT_TRUE(Overlaps("200", "200")); + ASSERT_TRUE(Overlaps("190", "200")); + ASSERT_TRUE(Overlaps("200", "210")); +} + +TEST(FindFileTest, OverlappingFiles) { + Add("150", "600"); + Add("400", "500"); + disjoint_sorted_files_ = false; + ASSERT_TRUE(! Overlaps("100", "149")); + ASSERT_TRUE(! 
Overlaps("601", "700")); + ASSERT_TRUE(Overlaps("100", "150")); + ASSERT_TRUE(Overlaps("100", "200")); + ASSERT_TRUE(Overlaps("100", "300")); + ASSERT_TRUE(Overlaps("100", "400")); + ASSERT_TRUE(Overlaps("100", "500")); + ASSERT_TRUE(Overlaps("375", "400")); + ASSERT_TRUE(Overlaps("450", "450")); + ASSERT_TRUE(Overlaps("450", "500")); + ASSERT_TRUE(Overlaps("450", "700")); + ASSERT_TRUE(Overlaps("600", "700")); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/write_batch.cc b/db/write_batch.cc new file mode 100644 index 00000000..7a6106af --- /dev/null +++ b/db/write_batch.cc @@ -0,0 +1,303 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch::rep_ := +// sequence: fixed64 +// count: fixed32 +// data: record[count] +// record := +// kTypeValue varstring varstring +// kTypeMerge varstring varstring +// kTypeDeletion varstring +// varstring := +// len: varint32 +// data: uint8[len] + +#include "rocksdb/write_batch.h" +#include "rocksdb/options.h" +#include "rocksdb/merge_operator.h" +#include "db/dbformat.h" +#include "db/db_impl.h" +#include "db/memtable.h" +#include "db/snapshot.h" +#include "db/write_batch_internal.h" +#include "util/coding.h" +#include "util/statistics_imp.h" +#include + +namespace rocksdb { + +// WriteBatch header has an 8-byte sequence number followed by a 4-byte count. +static const size_t kHeader = 12; + +WriteBatch::WriteBatch(size_t reserved_bytes) { + rep_.reserve((reserved_bytes > kHeader) ? 
reserved_bytes : kHeader); + Clear(); +} + +WriteBatch::~WriteBatch() { } + +WriteBatch::Handler::~Handler() { } + +void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) { + throw std::runtime_error("Handler::Merge not implemented!"); +} + +void WriteBatch::Handler::LogData(const Slice& blob) { + // If the user has not specified something to do with blobs, then we ignore + // them. +} + +bool WriteBatch::Handler::Continue() { + return true; +} + +void WriteBatch::Clear() { + rep_.clear(); + rep_.resize(kHeader); +} + +int WriteBatch::Count() const { + return WriteBatchInternal::Count(this); +} + +Status WriteBatch::Iterate(Handler* handler) const { + Slice input(rep_); + if (input.size() < kHeader) { + return Status::Corruption("malformed WriteBatch (too small)"); + } + + input.remove_prefix(kHeader); + Slice key, value, blob; + int found = 0; + while (!input.empty() && handler->Continue()) { + char tag = input[0]; + input.remove_prefix(1); + switch (tag) { + case kTypeValue: + if (GetLengthPrefixedSlice(&input, &key) && + GetLengthPrefixedSlice(&input, &value)) { + handler->Put(key, value); + found++; + } else { + return Status::Corruption("bad WriteBatch Put"); + } + break; + case kTypeDeletion: + if (GetLengthPrefixedSlice(&input, &key)) { + handler->Delete(key); + found++; + } else { + return Status::Corruption("bad WriteBatch Delete"); + } + break; + case kTypeMerge: + if (GetLengthPrefixedSlice(&input, &key) && + GetLengthPrefixedSlice(&input, &value)) { + handler->Merge(key, value); + found++; + } else { + return Status::Corruption("bad WriteBatch Merge"); + } + break; + case kTypeLogData: + if (GetLengthPrefixedSlice(&input, &blob)) { + handler->LogData(blob); + } else { + return Status::Corruption("bad WriteBatch Blob"); + } + break; + default: + return Status::Corruption("unknown WriteBatch tag"); + } + } + if (found != WriteBatchInternal::Count(this)) { + return Status::Corruption("WriteBatch has wrong count"); + } else { + return 
Status::OK(); + } +} + +int WriteBatchInternal::Count(const WriteBatch* b) { + return DecodeFixed32(b->rep_.data() + 8); +} + +void WriteBatchInternal::SetCount(WriteBatch* b, int n) { + EncodeFixed32(&b->rep_[8], n); +} + +SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) { + return SequenceNumber(DecodeFixed64(b->rep_.data())); +} + +void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { + EncodeFixed64(&b->rep_[0], seq); +} + +void WriteBatch::Put(const Slice& key, const Slice& value) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast(kTypeValue)); + PutLengthPrefixedSlice(&rep_, key); + PutLengthPrefixedSlice(&rep_, value); +} + +void WriteBatch::Put(const SliceParts& key, const SliceParts& value) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast(kTypeValue)); + PutLengthPrefixedSliceParts(&rep_, key); + PutLengthPrefixedSliceParts(&rep_, value); +} + +void WriteBatch::Delete(const Slice& key) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast(kTypeDeletion)); + PutLengthPrefixedSlice(&rep_, key); +} + +void WriteBatch::Merge(const Slice& key, const Slice& value) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast(kTypeMerge)); + PutLengthPrefixedSlice(&rep_, key); + PutLengthPrefixedSlice(&rep_, value); +} + +void WriteBatch::PutLogData(const Slice& blob) { + rep_.push_back(static_cast(kTypeLogData)); + PutLengthPrefixedSlice(&rep_, blob); +} + +namespace { +class MemTableInserter : public WriteBatch::Handler { + public: + SequenceNumber sequence_; + MemTable* mem_; + const Options* options_; + DBImpl* db_; + const bool filter_deletes_; + + MemTableInserter(SequenceNumber sequence, MemTable* mem, const Options* opts, + DB* db, const bool filter_deletes) + : sequence_(sequence), + mem_(mem), + options_(opts), + 
db_(reinterpret_cast(db)), + filter_deletes_(filter_deletes) { + assert(mem_); + if (filter_deletes_) { + assert(options_); + assert(db_); + } + } + + virtual void Put(const Slice& key, const Slice& value) { + if (options_->inplace_update_support + && mem_->Update(sequence_, kTypeValue, key, value)) { + RecordTick(options_->statistics.get(), NUMBER_KEYS_UPDATED); + } else { + mem_->Add(sequence_, kTypeValue, key, value); + } + sequence_++; + } + virtual void Merge(const Slice& key, const Slice& value) { + bool perform_merge = false; + + if (options_->max_successive_merges > 0 && db_ != nullptr) { + LookupKey lkey(key, sequence_); + + // Count the number of successive merges at the head + // of the key in the memtable + size_t num_merges = mem_->CountSuccessiveMergeEntries(lkey); + + if (num_merges >= options_->max_successive_merges) { + perform_merge = true; + } + } + + if (perform_merge) { + // 1) Get the existing value + std::string get_value; + + // Pass in the sequence number so that we also include previous merge + // operations in the same batch. + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + ReadOptions read_options; + read_options.snapshot = &read_from_snapshot; + + db_->Get(read_options, key, &get_value); + Slice get_value_slice = Slice(get_value); + + // 2) Apply this merge + auto merge_operator = options_->merge_operator.get(); + assert(merge_operator); + + std::deque operands; + operands.push_front(value.ToString()); + std::string new_value; + if (!merge_operator->FullMerge(key, + &get_value_slice, + operands, + &new_value, + options_->info_log.get())) { + // Failed to merge! 
+ RecordTick(options_->statistics.get(), NUMBER_MERGE_FAILURES); + + // Store the delta in memtable + perform_merge = false; + } else { + // 3) Add value to memtable + mem_->Add(sequence_, kTypeValue, key, new_value); + } + } + + if (!perform_merge) { + // Add merge operator to memtable + mem_->Add(sequence_, kTypeMerge, key, value); + } + + sequence_++; + } + virtual void Delete(const Slice& key) { + if (filter_deletes_) { + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + ReadOptions ropts; + ropts.snapshot = &read_from_snapshot; + std::string value; + if (!db_->KeyMayExist(ropts, key, &value)) { + RecordTick(options_->statistics.get(), NUMBER_FILTERED_DELETES); + return; + } + } + mem_->Add(sequence_, kTypeDeletion, key, Slice()); + sequence_++; + } +}; +} // namespace + +Status WriteBatchInternal::InsertInto(const WriteBatch* b, MemTable* mem, + const Options* opts, DB* db, + const bool filter_deletes) { + MemTableInserter inserter(WriteBatchInternal::Sequence(b), mem, opts, db, + filter_deletes); + return b->Iterate(&inserter); +} + +void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { + assert(contents.size() >= kHeader); + b->rep_.assign(contents.data(), contents.size()); +} + +void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) { + SetCount(dst, Count(dst) + Count(src)); + assert(src->rep_.size() >= kHeader); + dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader); +} + +} // namespace rocksdb diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h new file mode 100644 index 00000000..b8991732 --- /dev/null +++ b/db/write_batch_internal.h @@ -0,0 +1,57 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/types.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" + +namespace rocksdb { + +class MemTable; + +// WriteBatchInternal provides static methods for manipulating a +// WriteBatch that we don't want in the public WriteBatch interface. +class WriteBatchInternal { + public: + // Return the number of entries in the batch. + static int Count(const WriteBatch* batch); + + // Set the count for the number of entries in the batch. + static void SetCount(WriteBatch* batch, int n); + + // Return the seqeunce number for the start of this batch. + static SequenceNumber Sequence(const WriteBatch* batch); + + // Store the specified number as the seqeunce number for the start of + // this batch. + static void SetSequence(WriteBatch* batch, SequenceNumber seq); + + static Slice Contents(const WriteBatch* batch) { + return Slice(batch->rep_); + } + + static size_t ByteSize(const WriteBatch* batch) { + return batch->rep_.size(); + } + + static void SetContents(WriteBatch* batch, const Slice& contents); + + // Inserts batch entries into memtable + // Drops deletes in batch if filter_del is set to true and + // db->KeyMayExist returns false + static Status InsertInto(const WriteBatch* batch, MemTable* memtable, + const Options* opts, DB* db = nullptr, + const bool filter_del = false); + + static void Append(WriteBatch* dst, const WriteBatch* src); +}; + +} // namespace rocksdb diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc new file mode 100644 index 00000000..931d8f3f --- /dev/null +++ b/db/write_batch_test.cc @@ -0,0 +1,263 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/db.h" + +#include +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace rocksdb { + +static std::string PrintContents(WriteBatch* b) { + InternalKeyComparator cmp(BytewiseComparator()); + auto factory = std::make_shared(); + Options options; + options.memtable_factory = factory; + MemTable* mem = new MemTable(cmp, options); + mem->Ref(); + std::string state; + Status s = WriteBatchInternal::InsertInto(b, mem, &options); + int count = 0; + Iterator* iter = mem->NewIterator(); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey ikey; + memset((void *)&ikey, 0, sizeof(ikey)); + ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); + switch (ikey.type) { + case kTypeValue: + state.append("Put("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + count++; + break; + case kTypeMerge: + state.append("Merge("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + count++; + break; + case kTypeDeletion: + state.append("Delete("); + state.append(ikey.user_key.ToString()); + state.append(")"); + count++; + break; + case kTypeLogData: + assert(false); + break; + } + state.append("@"); + state.append(NumberToString(ikey.sequence)); + } + delete iter; + if (!s.ok()) { + state.append(s.ToString()); + 
} else if (count != WriteBatchInternal::Count(b)) { + state.append("CountMismatch()"); + } + delete mem->Unref(); + return state; +} + +class WriteBatchTest { }; + +TEST(WriteBatchTest, Empty) { + WriteBatch batch; + ASSERT_EQ("", PrintContents(&batch)); + ASSERT_EQ(0, WriteBatchInternal::Count(&batch)); + ASSERT_EQ(0, batch.Count()); +} + +TEST(WriteBatchTest, Multiple) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + batch.Delete(Slice("box")); + batch.Put(Slice("baz"), Slice("boo")); + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch)); + ASSERT_EQ(3, WriteBatchInternal::Count(&batch)); + ASSERT_EQ("Put(baz, boo)@102" + "Delete(box)@101" + "Put(foo, bar)@100", + PrintContents(&batch)); + ASSERT_EQ(3, batch.Count()); +} + +TEST(WriteBatchTest, Corruption) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + batch.Delete(Slice("box")); + WriteBatchInternal::SetSequence(&batch, 200); + Slice contents = WriteBatchInternal::Contents(&batch); + WriteBatchInternal::SetContents(&batch, + Slice(contents.data(),contents.size()-1)); + ASSERT_EQ("Put(foo, bar)@200" + "Corruption: bad WriteBatch Delete", + PrintContents(&batch)); +} + +TEST(WriteBatchTest, Append) { + WriteBatch b1, b2; + WriteBatchInternal::SetSequence(&b1, 200); + WriteBatchInternal::SetSequence(&b2, 300); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("", + PrintContents(&b1)); + ASSERT_EQ(0, b1.Count()); + b2.Put("a", "va"); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("Put(a, va)@200", + PrintContents(&b1)); + ASSERT_EQ(1, b1.Count()); + b2.Clear(); + b2.Put("b", "vb"); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("Put(a, va)@200" + "Put(b, vb)@201", + PrintContents(&b1)); + ASSERT_EQ(2, b1.Count()); + b2.Delete("foo"); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("Put(a, va)@200" + "Put(b, vb)@202" + "Put(b, vb)@201" + "Delete(foo)@203", + PrintContents(&b1)); + ASSERT_EQ(4, b1.Count()); +} + 
+namespace { + struct TestHandler : public WriteBatch::Handler { + std::string seen; + virtual void Put(const Slice& key, const Slice& value) { + seen += "Put(" + key.ToString() + ", " + value.ToString() + ")"; + } + virtual void Merge(const Slice& key, const Slice& value) { + seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")"; + } + virtual void LogData(const Slice& blob) { + seen += "LogData(" + blob.ToString() + ")"; + } + virtual void Delete(const Slice& key) { + seen += "Delete(" + key.ToString() + ")"; + } + }; +} + +TEST(WriteBatchTest, Blob) { + WriteBatch batch; + batch.Put(Slice("k1"), Slice("v1")); + batch.Put(Slice("k2"), Slice("v2")); + batch.Put(Slice("k3"), Slice("v3")); + batch.PutLogData(Slice("blob1")); + batch.Delete(Slice("k2")); + batch.PutLogData(Slice("blob2")); + batch.Merge(Slice("foo"), Slice("bar")); + ASSERT_EQ(5, batch.Count()); + ASSERT_EQ("Merge(foo, bar)@4" + "Put(k1, v1)@0" + "Delete(k2)@3" + "Put(k2, v2)@1" + "Put(k3, v3)@2", + PrintContents(&batch)); + + TestHandler handler; + batch.Iterate(&handler); + ASSERT_EQ( + "Put(k1, v1)" + "Put(k2, v2)" + "Put(k3, v3)" + "LogData(blob1)" + "Delete(k2)" + "LogData(blob2)" + "Merge(foo, bar)", + handler.seen); +} + +TEST(WriteBatchTest, Continue) { + WriteBatch batch; + + struct Handler : public TestHandler { + int num_seen = 0; + virtual void Put(const Slice& key, const Slice& value) { + ++num_seen; + TestHandler::Put(key, value); + } + virtual void Merge(const Slice& key, const Slice& value) { + ++num_seen; + TestHandler::Merge(key, value); + } + virtual void LogData(const Slice& blob) { + ++num_seen; + TestHandler::LogData(blob); + } + virtual void Delete(const Slice& key) { + ++num_seen; + TestHandler::Delete(key); + } + virtual bool Continue() override { + return num_seen < 3; + } + } handler; + + batch.Put(Slice("k1"), Slice("v1")); + batch.PutLogData(Slice("blob1")); + batch.Delete(Slice("k1")); + batch.PutLogData(Slice("blob2")); + batch.Merge(Slice("foo"), 
Slice("bar")); + batch.Iterate(&handler); + ASSERT_EQ( + "Put(k1, v1)" + "LogData(blob1)" + "Delete(k1)", + handler.seen); +} + +TEST(WriteBatchTest, PutGatherSlices) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + + { + // Try a write where the key is one slice but the value is two + Slice key_slice("baz"); + Slice value_slices[2] = { Slice("header"), Slice("payload") }; + batch.Put(SliceParts(&key_slice, 1), + SliceParts(value_slices, 2)); + } + + { + // One where the key is composite but the value is a single slice + Slice key_slices[3] = { Slice("key"), Slice("part2"), Slice("part3") }; + Slice value_slice("value"); + batch.Put(SliceParts(key_slices, 3), + SliceParts(&value_slice, 1)); + } + + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ("Put(baz, headerpayload)@101" + "Put(foo, bar)@100" + "Put(keypart2part3, value)@102", + PrintContents(&batch)); + ASSERT_EQ(3, batch.Count()); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/doc/doc.css b/doc/doc.css new file mode 100644 index 00000000..700c564e --- /dev/null +++ b/doc/doc.css @@ -0,0 +1,89 @@ +body { + margin-left: 0.5in; + margin-right: 0.5in; + background: white; + color: black; +} + +h1 { + margin-left: -0.2in; + font-size: 14pt; +} +h2 { + margin-left: -0in; + font-size: 12pt; +} +h3 { + margin-left: -0in; +} +h4 { + margin-left: -0in; +} +hr { + margin-left: -0in; +} + +/* Definition lists: definition term bold */ +dt { + font-weight: bold; +} + +address { + text-align: center; +} +code,samp,var { + color: blue; +} +kbd { + color: #600000; +} +div.note p { + float: right; + width: 3in; + margin-right: 0%; + padding: 1px; + border: 2px solid #6060a0; + background-color: #fffff0; +} + +ul { + margin-top: -0em; + margin-bottom: -0em; +} + +ol { + margin-top: -0em; + margin-bottom: -0em; +} + +UL.nobullets { + list-style-type: none; + list-style-image: none; + margin-left: -1em; +} + +p { + margin: 1em 0 
1em 0; + padding: 0 0 0 0; +} + +pre { + line-height: 1.3em; + padding: 0.4em 0 0.8em 0; + margin: 0 0 0 0; + border: 0 0 0 0; + color: blue; +} + +.datatable { + margin-left: auto; + margin-right: auto; + margin-top: 2em; + margin-bottom: 2em; + border: 1px solid; +} + +.datatable td,th { + padding: 0 0.5em 0 0.5em; + text-align: right; +} diff --git a/doc/index.html b/doc/index.html new file mode 100644 index 00000000..84c4d132 --- /dev/null +++ b/doc/index.html @@ -0,0 +1,831 @@ + + + + +RocksDB + + + +

RocksDB

+
The Facebook Database Engineering Team
+
Built on earlier work on leveldb by Sanjay Ghemawat + (sanjay@google.com) and Jeff Dean (jeff@google.com)
+

+The rocksdb library provides a persistent key value store. Keys and +values are arbitrary byte arrays. The keys are ordered within the key +value store according to a user-specified comparator function. + +

+

Opening A Database

+

+A rocksdb database has a name which corresponds to a file system +directory. All of the contents of the database are stored in this +directory. The following example shows how to open a database, +creating it if necessary: +

+

+  #include <cassert>
+  #include "rocksdb/db.h"
+
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.create_if_missing = true;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &db);
+  assert(status.ok());
+  ...
+
+If you want to raise an error if the database already exists, add +the following line before the rocksdb::DB::Open call: +
+  options.error_if_exists = true;
+
+

Status

+

+You may have noticed the rocksdb::Status type above. Values of this +type are returned by most functions in rocksdb that may encounter an +error. You can check if such a result is ok, and also print an +associated error message: +

+

+   rocksdb::Status s = ...;
+   if (!s.ok()) cerr << s.ToString() << endl;
+
+

Closing A Database

+

+When you are done with a database, just delete the database object. +Example: +

+

+  ... open the db as described above ...
+  ... do something with db ...
+  delete db;
+
+

Reads And Writes

+

+The database provides Put, Delete, and Get methods to +modify/query the database. For example, the following code +moves the value stored under key1 to key2. +

+  std::string value;
+  rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key1, &value);
+  if (s.ok()) s = db->Put(rocksdb::WriteOptions(), key2, value);
+  if (s.ok()) s = db->Delete(rocksdb::WriteOptions(), key1);
+
+ +

Atomic Updates

+

+Note that if the process dies after the Put of key2 but before the +delete of key1, the same value may be left stored under multiple keys. +Such problems can be avoided by using the WriteBatch class to +atomically apply a set of updates: +

+

+  #include "rocksdb/write_batch.h"
+  ...
+  std::string value;
+  rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key1, &value);
+  if (s.ok()) {
+    rocksdb::WriteBatch batch;
+    batch.Delete(key1);
+    batch.Put(key2, value);
+    s = db->Write(rocksdb::WriteOptions(), &batch);
+  }
+
+The WriteBatch holds a sequence of edits to be made to the database, +and these edits within the batch are applied in order. Note that we +called Delete before Put so that if key1 is identical to key2, +we do not end up erroneously dropping the value entirely. +

+Apart from its atomicity benefits, WriteBatch may also be used to +speed up bulk updates by placing lots of individual mutations into the +same batch. + +

Synchronous Writes

+By default, each write to rocksdb is asynchronous: it +returns after pushing the write from the process into the operating +system. The transfer from operating system memory to the underlying +persistent storage happens asynchronously. The sync flag +can be turned on for a particular write to make the write operation +not return until the data being written has been pushed all the way to +persistent storage. (On POSIX systems, this is implemented by calling +either fsync(...) or fdatasync(...) or +msync(..., MS_SYNC) before the write operation returns.) +
+  rocksdb::WriteOptions write_options;
+  write_options.sync = true;
+  db->Put(write_options, ...);
+
+Asynchronous writes are often more than a thousand times as fast as +synchronous writes. The downside of asynchronous writes is that a +crash of the machine may cause the last few updates to be lost. Note +that a crash of just the writing process (i.e., not a reboot) will not +cause any loss since even when sync is false, an update +is pushed from the process memory into the operating system before it +is considered done. + +

+Asynchronous writes can often be used safely. For example, when +loading a large amount of data into the database you can handle lost +updates by restarting the bulk load after a crash. A hybrid scheme is +also possible where every Nth write is synchronous, and in the event +of a crash, the bulk load is restarted just after the last synchronous +write finished by the previous run. (The synchronous write can update +a marker that describes where to restart on a crash.) + +

+WriteBatch provides an alternative to asynchronous writes. +Multiple updates may be placed in the same WriteBatch and +applied together using a synchronous write (i.e., +write_options.sync is set to true). The extra cost of +the synchronous write will be amortized across all of the writes in +the batch. + +

+We also provide a way to completely disable the Write Ahead Log for a +particular write. If you set write_options.disableWAL to true, the +write will not go to the log at all and may be lost in the event of a +process crash. + +

+When opening a DB, you can disable syncing of data files by setting +Options::disableDataSync to true. This can be useful when doing +bulk-loading or big idempotent operations. Once the operation is +finished, you can manually call sync() to flush all dirty buffers +to stable storage. + +

+RocksDB by default uses faster fdatasync() to sync files. If you want +to use fsync(), you can set Options::use_fsync to true. You should set +this to true on filesystems like ext3 that can lose files after a +reboot. + +

+

Concurrency

+

+A database may only be opened by one process at a time. +The rocksdb implementation acquires a lock from the +operating system to prevent misuse. Within a single process, the +same rocksdb::DB object may be safely shared by multiple +concurrent threads. I.e., different threads may write into or fetch +iterators or call Get on the same database without any +external synchronization (the rocksdb implementation will +automatically do the required synchronization). However other objects +(like Iterator and WriteBatch) may require external synchronization. +If two threads share such an object, they must protect access to it +using their own locking protocol. More details are available in +the public header files. + +

+

Merge operators

+

+Merge operators provide efficient support for read-modify-write operation. +More on the interface and implementation can be found on: +

+ + Merge Operator +

+ + Merge Operator Implementation + +

+

Iteration

+

+The following example demonstrates how to print all key,value pairs +in a database. +

+

+  rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions());
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    cout << it->key().ToString() << ": "  << it->value().ToString() << endl;
+  }
+  assert(it->status().ok());  // Check for any errors found during the scan
+  delete it;
+
+The following variation shows how to process just the keys in the +range [start,limit): +

+

+  for (it->Seek(start);
+       it->Valid() && it->key().ToString() < limit;
+       it->Next()) {
+    ...
+  }
+
+You can also process entries in reverse order. (Caveat: reverse +iteration may be somewhat slower than forward iteration.) +

+

+  for (it->SeekToLast(); it->Valid(); it->Prev()) {
+    ...
+  }
+
+

Snapshots

+

+Snapshots provide consistent read-only views over the entire state of +the key-value store. ReadOptions::snapshot may be non-NULL to indicate +that a read should operate on a particular version of the DB state. +If ReadOptions::snapshot is NULL, the read will operate on an +implicit snapshot of the current state. +

+Snapshots are created by the DB::GetSnapshot() method: +

+

+  rocksdb::ReadOptions options;
+  options.snapshot = db->GetSnapshot();
+  ... apply some updates to db ...
+  rocksdb::Iterator* iter = db->NewIterator(options);
+  ... read using iter to view the state when the snapshot was created ...
+  delete iter;
+  db->ReleaseSnapshot(options.snapshot);
+
+Note that when a snapshot is no longer needed, it should be released +using the DB::ReleaseSnapshot interface. This allows the +implementation to get rid of state that was being maintained just to +support reading as of that snapshot. +

Slice

+

+The return value of the it->key() and it->value() calls above +are instances of the rocksdb::Slice type. Slice is a simple +structure that contains a length and a pointer to an external byte +array. Returning a Slice is a cheaper alternative to returning a +std::string since we do not need to copy potentially large keys and +values. In addition, rocksdb methods do not return null-terminated +C-style strings since rocksdb keys and values are allowed to +contain '\0' bytes. +

+C++ strings and null-terminated C-style strings can be easily converted +to a Slice: +

+

+   rocksdb::Slice s1 = "hello";
+
+   std::string str("world");
+   rocksdb::Slice s2 = str;
+
+A Slice can be easily converted back to a C++ string: +
+   std::string str = s1.ToString();
+   assert(str == std::string("hello"));
+
+Be careful when using Slices since it is up to the caller to ensure that +the external byte array into which the Slice points remains live while +the Slice is in use. For example, the following is buggy: +

+

+   rocksdb::Slice slice;
+   if (...) {
+     std::string str = ...;
+     slice = str;
+   }
+   Use(slice);
+
+When the if statement goes out of scope, str will be destroyed and the +backing storage for slice will disappear. +

+

Comparators

+

+The preceding examples used the default ordering function for key, +which orders bytes lexicographically. You can however supply a custom +comparator when opening a database. For example, suppose each +database key consists of two numbers and we should sort by the first +number, breaking ties by the second number. First, define a proper +subclass of rocksdb::Comparator that expresses these rules: +

+

+  class TwoPartComparator : public rocksdb::Comparator {
+   public:
+    // Three-way comparison function:
+    //   if a < b: negative result
+    //   if a > b: positive result
+    //   else: zero result
+    int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const {
+      int a1, a2, b1, b2;
+      ParseKey(a, &a1, &a2);
+      ParseKey(b, &b1, &b2);
+      if (a1 < b1) return -1;
+      if (a1 > b1) return +1;
+      if (a2 < b2) return -1;
+      if (a2 > b2) return +1;
+      return 0;
+    }
+
+    // Ignore the following methods for now:
+    const char* Name() const { return "TwoPartComparator"; }
+    void FindShortestSeparator(std::string*, const rocksdb::Slice&) const { }
+    void FindShortSuccessor(std::string*) const { }
+  };
+
+Now create a database using this custom comparator: +

+

+  TwoPartComparator cmp;
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.create_if_missing = true;
+  options.comparator = &cmp;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &db);
+  ...
+
+

Backwards compatibility

+

+The result of the comparator's Name method is attached to the +database when it is created, and is checked on every subsequent +database open. If the name changes, the rocksdb::DB::Open call will +fail. Therefore, change the name if and only if the new key format +and comparison function are incompatible with existing databases, and +it is ok to discard the contents of all existing databases. +

+You can however still gradually evolve your key format over time with +a little bit of pre-planning. For example, you could store a version +number at the end of each key (one byte should suffice for most uses). +When you wish to switch to a new key format (e.g., adding an optional +third part to the keys processed by TwoPartComparator), +(a) keep the same comparator name (b) increment the version number +for new keys (c) change the comparator function so it uses the +version numbers found in the keys to decide how to interpret them. + + +

+

MemTable and Table factories

+

+By default, we keep the data in memory in skiplist memtable and the data +on disk in a table format described here: + + RocksDB Table Format. +

+Since one of the goals of RocksDB is to have +different parts of the system easily pluggable, we support different +implementations of both memtable and table format. You can supply +your own memtable factory by setting Options::memtable_factory +and your own table factory by setting Options::table_factory. +For available memtable factories, please refer to +rocksdb/memtablerep.h and for table factories to +rocksdb/table.h. These features are both in active development +and please be wary of any API changes that might break your application +going forward. +

+You can also read more about memtables here: + +Memtables wiki + + +

+

Performance

+

+Performance can be tuned by changing the default values of the +types defined in include/rocksdb/options.h. + +

+

Block size

+

+rocksdb groups adjacent keys together into the same block and such a +block is the unit of transfer to and from persistent storage. The +default block size is approximately 4096 uncompressed bytes. +Applications that mostly do bulk scans over the contents of the +database may wish to increase this size. Applications that do a lot +of point reads of small values may wish to switch to a smaller block +size if performance measurements indicate an improvement. There isn't +much benefit in using blocks smaller than one kilobyte, or larger than +a few megabytes. Also note that compression will be more effective +with larger block sizes. To change block size parameter, use +Options::block_size. +

+

Write buffer

+

+Options::write_buffer_size specifies the amount of data +to build up in memory before converting to a sorted on-disk file. +Larger values increase performance, especially during bulk loads. +Up to max_write_buffer_number write buffers may be held in memory +at the same time, +so you may wish to adjust this parameter to control memory usage. +Also, a larger write buffer will result in a longer recovery time +the next time the database is opened. +A related option is +Options::max_write_buffer_number, which is the maximum number +of write buffers that are built up in memory. The default is 2, so that +when 1 write buffer is being flushed to storage, new writes can continue +to the other write buffer. +Options::min_write_buffer_number_to_merge is the minimum number +of write buffers that will be merged together before writing to storage. +If set to 1, then all write buffers are flushed to L0 as individual files and +this increases read amplification because a get request has to check in all +of these files. Also, an in-memory merge may result in writing less +data to storage if there are duplicate records in each of these +individual write buffers. Default: 1 +

+

Compression

+

+Each block is individually compressed before being written to +persistent storage. Compression is on by default since the default +compression method is very fast, and is automatically disabled for +uncompressible data. In rare cases, applications may want to disable +compression entirely, but should only do so if benchmarks show a +performance improvement: +

+

+  rocksdb::Options options;
+  options.compression = rocksdb::kNoCompression;
+  ... rocksdb::DB::Open(options, name, ...) ....
+
+

Cache

+

+The contents of the database are stored in a set of files in the +filesystem and each file stores a sequence of compressed blocks. If +options.block_cache is non-NULL, it is used to cache frequently +used uncompressed block contents. If options.block_cache_compressed +is non-NULL, it is used to cache frequently used compressed blocks. Compressed +cache is an alternative to OS cache, which also caches compressed blocks. If +compressed cache is used, the OS cache will be disabled automatically by setting +options.allow_os_buffer to false. +

+

+  #include "rocksdb/cache.h"
+
+  rocksdb::Options options;
+  options.block_cache = rocksdb::NewLRUCache(100 * 1048576);  // 100MB uncompressed cache
+  options.block_cache_compressed = rocksdb::NewLRUCache(100 * 1048576);  // 100MB compressed cache
+  rocksdb::DB* db;
+  rocksdb::DB::Open(options, name, &db);
+  ... use the db ...
+  delete db;
+  delete options.block_cache;
+  delete options.block_cache_compressed;
+
+

+When performing a bulk read, the application may wish to disable +caching so that the data processed by the bulk read does not end up +displacing most of the cached contents. A per-iterator option can be +used to achieve this: +

+

+  rocksdb::ReadOptions options;
+  options.fill_cache = false;
+  rocksdb::Iterator* it = db->NewIterator(options);
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    ...
+  }
+
+

+You can also disable block cache by setting options.no_block_cache +to true. +

Key Layout

+

+Note that the unit of disk transfer and caching is a block. Adjacent +keys (according to the database sort order) will usually be placed in +the same block. Therefore the application can improve its performance +by placing keys that are accessed together near each other and placing +infrequently used keys in a separate region of the key space. +

+For example, suppose we are implementing a simple file system on top +of rocksdb. The types of entries we might wish to store are: +

+

+   filename -> permission-bits, length, list of file_block_ids
+   file_block_id -> data
+
+We might want to prefix filename keys with one letter (say '/') and the +file_block_id keys with a different letter (say '0') so that scans +over just the metadata do not force us to fetch and cache bulky file +contents. +

+

Filters

+

+Because of the way rocksdb data is organized on disk, +a single Get() call may involve multiple reads from disk. +The optional FilterPolicy mechanism can be used to reduce +the number of disk reads substantially. +

+   rocksdb::Options options;
+   options.filter_policy = NewBloomFilter(10);
+   rocksdb::DB* db;
+   rocksdb::DB::Open(options, "/tmp/testdb", &db);
+   ... use the database ...
+   delete db;
+   delete options.filter_policy;
+
+The preceding code associates a +Bloom filter +based filtering policy with the database. Bloom filter based +filtering relies on keeping some number of bits of data in memory per +key (in this case 10 bits per key since that is the argument we passed +to NewBloomFilter). This filter will reduce the number of unnecessary +disk reads needed for Get() calls by a factor of +approximately 100. Increasing the bits per key will lead to a +larger reduction at the cost of more memory usage. We recommend that +applications whose working set does not fit in memory and that do a +lot of random reads set a filter policy. +

+If you are using a custom comparator, you should ensure that the filter +policy you are using is compatible with your comparator. For example, +consider a comparator that ignores trailing spaces when comparing keys. +NewBloomFilter must not be used with such a comparator. +Instead, the application should provide a custom filter policy that +also ignores trailing spaces. For example: +

+  class CustomFilterPolicy : public rocksdb::FilterPolicy {
+   private:
+    FilterPolicy* builtin_policy_;
+   public:
+    CustomFilterPolicy() : builtin_policy_(NewBloomFilter(10)) { }
+    ~CustomFilterPolicy() { delete builtin_policy_; }
+
+    const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
+
+    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      std::vector<Slice> trimmed(n);
+      for (int i = 0; i < n; i++) {
+        trimmed[i] = RemoveTrailingSpaces(keys[i]);
+      }
+      return builtin_policy_->CreateFilter(trimmed.data(), n, dst);
+    }
+
+    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      return builtin_policy_->KeyMayMatch(RemoveTrailingSpaces(key), filter);
+    }
+  };
+
+

+Advanced applications may provide a filter policy that does not use +a bloom filter but uses some other mechanism for summarizing a set +of keys. See rocksdb/filter_policy.h for detail. +

+

Checksums

+

+rocksdb associates checksums with all data it stores in the file system. +There are two separate controls provided over how aggressively these +checksums are verified: +

+

    +
  • ReadOptions::verify_checksums may be set to true to force + checksum verification of all data that is read from the file system on + behalf of a particular read. By default, no such verification is + done. +

    +

  • Options::paranoid_checks may be set to true before opening a + database to make the database implementation raise an error as soon as + it detects an internal corruption. Depending on which portion of the + database has been corrupted, the error may be raised when the database + is opened, or later by another database operation. By default, + paranoid checking is off so that the database can be used even if + parts of its persistent storage have been corrupted. +

    + If a database is corrupted (perhaps it cannot be opened when + paranoid checking is turned on), the rocksdb::RepairDB function + may be used to recover as much of the data as possible. +

    +

+ +

+

Compaction

+

+You can read more on Compactions here: + + Multi-threaded compactions + +

+Here we give an overview of the options that impact behavior of Compactions: +

    +

    +

  • Options::compaction_style - RocksDB currently supports two +compaction algorithms - Universal style and Level style. This option switches +between the two. Can be kCompactionStyleUniversal or kCompactionStyleLevel. +If this is kCompactionStyleUniversal, then you can configure universal style +parameters with Options::compaction_options_universal. +

    +

  • Options::disable_auto_compactions - Disable automatic compactions. +Manual compactions can still be issued on this database. +

    +

  • Options::compaction_filter - Allows an application to modify/delete +a key-value during background compaction. The client must provide +compaction_filter_factory if it requires a new compaction filter to be used +for different compaction processes. Client should specify only one of filter +or factory. +

    +

  • Options::compaction_filter_factory - a factory that provides +compaction filter objects which allow an application to modify/delete a +key-value during background compaction. +
+

+Other options impacting performance of compactions and when they get triggered +are: +

    +

    +

  • Options::access_hint_on_compaction_start - Specify the file access +pattern once a compaction is started. It will be applied to all input files of a compaction. Default: NORMAL +

    +

  • Options::level0_file_num_compaction_trigger - Number of files to trigger level-0 compaction. +A negative value means that level-0 compaction will not be triggered by number of files at all. +

    +

  • Options::max_mem_compaction_level - Maximum level to which a new compacted memtable is pushed if it +does not create overlap. We try to push to level 2 to avoid the relatively expensive level 0=>1 compactions and to avoid some +expensive manifest file operations. We do not push all the way to the largest level since that can generate a lot of wasted disk +space if the same key space is being repeatedly overwritten. +

    +

  • Options::target_file_size_base and Options::target_file_size_multiplier - +Target file size for compaction. target_file_size_base is per-file size for level-1. +Target file size for level L can be calculated by target_file_size_base * (target_file_size_multiplier ^ (L-1)) +For example, if target_file_size_base is 2MB and target_file_size_multiplier is 10, then each file on level-1 will +be 2MB, and each file on level 2 will be 20MB, and each file on level-3 will be 200MB. Default target_file_size_base is 2MB +and default target_file_size_multiplier is 1. +

    +

  • Options::expanded_compaction_factor - Maximum number of bytes in all compacted files. We avoid expanding +the lower level file set of a compaction if it would make the total compaction cover more than +(expanded_compaction_factor * targetFileSizeLevel()) many bytes. +

    +

  • Options::source_compaction_factor - Maximum number of bytes in all source files to be compacted in a +single compaction run. We avoid picking too many files in the source level so that the total source bytes +for compaction do not exceed (source_compaction_factor * targetFileSizeLevel()) many bytes. +Default:1, i.e. pick maxfilesize amount of data as the source of a compaction. +

    +

  • Options::max_grandparent_overlap_factor - Control maximum bytes of overlaps in grandparent (i.e., level+2) before we +stop building a single file in a level->level+1 compaction. +

    +

  • Options::disable_seek_compaction - Disable compaction triggered by seek. +With bloomfilter and fast storage, a miss on one level is very cheap if the file handle is cached in table cache +(which is true if max_open_files is large). +

    +

  • Options::max_background_compactions - Maximum number of concurrent background jobs, submitted to +the default LOW priority thread pool +
+ +

+You can learn more about all of those options in rocksdb/options.h + +

Universal style compaction specific settings

+

+If you're using Universal style compaction, there is an object CompactionOptionsUniversal +that holds all the different options for that compaction. The exact definition is in +rocksdb/universal_compaction.h and you can set it in Options::compaction_options_universal. +Here we give a short overview of options in CompactionOptionsUniversal: +

    +

    +

  • CompactionOptionsUniversal::size_ratio - Percentage flexibility while comparing file size. If the candidate file(s) + size is 1% smaller than the next file's size, then include next file into + this candidate set. Default: 1 +

    +

  • CompactionOptionsUniversal::min_merge_width - The minimum number of files in a single compaction run. Default: 2 +

    +

  • CompactionOptionsUniversal::max_merge_width - The maximum number of files in a single compaction run. Default: UINT_MAX +

    +

  • CompactionOptionsUniversal::max_size_amplification_percent - The size amplification is defined as the amount (in percentage) of +additional storage needed to store a single byte of data in the database. For example, a size amplification of 2% means that a database that +contains 100 bytes of user-data may occupy up to 102 bytes of physical storage. By this definition, a fully compacted database has +a size amplification of 0%. Rocksdb uses the following heuristic to calculate size amplification: it assumes that all files excluding +the earliest file contribute to the size amplification. Default: 200, which means that a 100 byte database could require up to +300 bytes of storage. +

    +

  • CompactionOptionsUniversal::compression_size_percent - If this option is set to be -1 (the default value), all the output files +will follow compression type specified. If this option is not negative, we will try to make sure compressed +size is just above this value. In normal cases, at least this percentage +of data will be compressed. +When we are compacting to a new file, here are the criteria for whether +it needs to be compressed: assuming the list of files is sorted +by generation time: [ A1...An B1...Bm C1...Ct ], +where A1 is the newest and Ct is the oldest, and we are going to compact +B1...Bm, we calculate the total size of all the files as total_size, as +well as the total size of C1...Ct as total_C, the compaction output file +will be compressed iff total_C / total_size < this percentage +

    +

  • CompactionOptionsUniversal::stop_style - The algorithm used to stop picking files into a single compaction run. +Can be kCompactionStopStyleSimilarSize (pick files of similar size) or kCompactionStopStyleTotalSize (total size of picked files > next file). +Default: kCompactionStopStyleTotalSize +
+ +

Thread pools

+

+A thread pool is associated with Env environment object. The client has to create a thread pool by setting the number of background +threads using method Env::SetBackgroundThreads() defined in rocksdb/env.h. +We use the thread pool for compactions and memtable flushes. +Since memtable flushes are in critical code path (stalling memtable flush can stall writes, increasing p99), we suggest +having two thread pools - with priorities HIGH and LOW. Memtable flushes can be set up to be scheduled on HIGH thread pool. +There are two options available for configuration of background compactions and flushes: +

    +

    +

  • Options::max_background_compactions - Maximum number of concurrent background jobs, +submitted to the default LOW priority thread pool +

    +

  • Options::max_background_flushes - Maximum number of concurrent background memtable flush jobs, submitted to +the HIGH priority thread pool. By default, all background jobs (major compaction and memtable flush) go +to the LOW priority pool. If this option is set to a positive number, memtable flush jobs will be submitted to the HIGH priority pool. +It is important when the same Env is shared by multiple db instances. Without a separate pool, long running major compaction jobs could +potentially block memtable flush jobs of other db instances, leading to unnecessary Put stalls. +
+

+

+  #include "rocksdb/env.h"
+  #include "rocksdb/db.h"
+
+  auto env = rocksdb::Env::Default();
+  env->SetBackgroundThreads(2, rocksdb::Env::LOW);
+  env->SetBackgroundThreads(1, rocksdb::Env::HIGH);
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.env = env;
+  options.max_background_compactions = 2;
+  options.max_background_flushes = 1;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &db);
+  assert(status.ok());
+  ...
+
+

Approximate Sizes

+

+The GetApproximateSizes method can be used to get the approximate +number of bytes of file system space used by one or more key ranges. +

+

+   rocksdb::Range ranges[2];
+   ranges[0] = rocksdb::Range("a", "c");
+   ranges[1] = rocksdb::Range("x", "z");
+   uint64_t sizes[2];
+   rocksdb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
+
+The preceding call will set sizes[0] to the approximate number of +bytes of file system space used by the key range [a..c) and +sizes[1] to the approximate number of bytes used by the key range +[x..z). +

+

Environment

+

+All file operations (and other operating system calls) issued by the +rocksdb implementation are routed through a rocksdb::Env object. +Sophisticated clients may wish to provide their own Env +implementation to get better control. For example, an application may +introduce artificial delays in the file IO paths to limit the impact +of rocksdb on other activities in the system. +

+

+  class SlowEnv : public rocksdb::Env {
+    .. implementation of the Env interface ...
+  };
+
+  SlowEnv env;
+  rocksdb::Options options;
+  options.env = &env;
+  Status s = rocksdb::DB::Open(options, ...);
+
+

Porting

+

+rocksdb may be ported to a new platform by providing platform +specific implementations of the types/methods/functions exported by +rocksdb/port/port.h. See rocksdb/port/port_example.h for more +details. +

+In addition, the new platform may need a new default rocksdb::Env +implementation. See rocksdb/util/env_posix.h for an example. + +

Statistics

+

+To be able to efficiently tune your application, it is always helpful if you +have access to usage statistics. You can collect those statistics by setting +Options::table_stats_collectors or +Options::statistics. For more information, refer to +rocksdb/table_stats.h and rocksdb/statistics.h. +These should not add significant overhead to your application and we +recommend exporting them to other monitoring tools. + +

Purging WAL files

+

+By default, old write-ahead logs are deleted automatically when they fall out +of scope and application doesn't need them anymore. There are options that +enable the user to archive the logs and then delete them lazily, either in +TTL fashion or based on size limit. + +The options are Options::WAL_ttl_seconds and +Options::WAL_size_limit_MB. Here is how they can be used: +

    +
  • +

    +If both are set to 0, logs will be deleted asap and will never get into the archive. +

  • +

    +If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, WAL +files will be checked every 10 min and if total size is greater than +WAL_size_limit_MB, they will be deleted starting with the +earliest until size_limit is met. All empty files will be deleted. +

  • +

    +If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then +WAL files will be checked every WAL_ttl_seconds / 2 and those +that are older than WAL_ttl_seconds will be deleted. +

  • +

    +If both are not 0, WAL files will be checked every 10 min and both +checks will be performed with ttl being first. +

+ +

Other Information

+

+Details about the rocksdb implementation may be found in +the following documents: +

+ + + diff --git a/doc/log_format.txt b/doc/log_format.txt new file mode 100644 index 00000000..3a0414b6 --- /dev/null +++ b/doc/log_format.txt @@ -0,0 +1,75 @@ +The log file contents are a sequence of 32KB blocks. The only +exception is that the tail of the file may contain a partial block. + +Each block consists of a sequence of records: + block := record* trailer? + record := + checksum: uint32 // crc32c of type and data[] + length: uint16 + type: uint8 // One of FULL, FIRST, MIDDLE, LAST + data: uint8[length] + +A record never starts within the last six bytes of a block (since it +won't fit). Any leftover bytes here form the trailer, which must +consist entirely of zero bytes and must be skipped by readers. + +Aside: if exactly seven bytes are left in the current block, and a new +non-zero length record is added, the writer must emit a FIRST record +(which contains zero bytes of user data) to fill up the trailing seven +bytes of the block and then emit all of the user data in subsequent +blocks. + +More types may be added in the future. Some Readers may skip record +types they do not understand, others may report that some data was +skipped. + +FULL == 1 +FIRST == 2 +MIDDLE == 3 +LAST == 4 + +The FULL record contains the contents of an entire user record. + +FIRST, MIDDLE, LAST are types used for user records that have been +split into multiple fragments (typically because of block boundaries). +FIRST is the type of the first fragment of a user record, LAST is the +type of the last fragment of a user record, and MIDDLE is the type of all +interior fragments of a user record. + +Example: consider a sequence of user records: + A: length 1000 + B: length 97270 + C: length 8000 +A will be stored as a FULL record in the first block. + +B will be split into three fragments: first fragment occupies the rest +of the first block, second fragment occupies the entirety of the +second block, and the third fragment occupies a prefix of the third +block. 
This will leave six bytes free in the third block, which will +be left empty as the trailer. + +C will be stored as a FULL record in the fourth block. + +=================== + +Some benefits over the recordio format: + +(1) We do not need any heuristics for resyncing - just go to next +block boundary and scan. If there is a corruption, skip to the next +block. As a side-benefit, we do not get confused when part of the +contents of one log file are embedded as a record inside another log +file. + +(2) Splitting at approximate boundaries (e.g., for mapreduce) is +simple: find the next block boundary and skip records until we +hit a FULL or FIRST record. + +(3) We do not need extra buffering for large records. + +Some downsides compared to recordio format: + +(1) No packing of tiny records. This could be fixed by adding a new +record type, so it is a shortcoming of the current implementation, +not necessarily the format. + +(2) No compression. Again, this could be fixed by adding new record types. diff --git a/doc/rockslogo.jpg b/doc/rockslogo.jpg new file mode 100644 index 00000000..363905af Binary files /dev/null and b/doc/rockslogo.jpg differ diff --git a/doc/rockslogo.png b/doc/rockslogo.png new file mode 100644 index 00000000..19613607 Binary files /dev/null and b/doc/rockslogo.png differ diff --git a/hdfs/README b/hdfs/README new file mode 100644 index 00000000..9b7d0a64 --- /dev/null +++ b/hdfs/README @@ -0,0 +1,26 @@ +This directory contains the hdfs extensions needed to make rocksdb store +files in HDFS. + +The hdfs.h file is copied from the Apache Hadoop 1.0 source code. +It defines the libhdfs library +(http://hadoop.apache.org/common/docs/r0.20.2/libhdfs.html) to access +data in HDFS. The libhdfs.a is copied from the Apache Hadoop 1.0 build. +It implements the API defined in hdfs.h. If your hadoop cluster is running +a different hadoop release, then install these two files manually from your +hadoop distribution and then recompile rocksdb. 
+ +The env_hdfs.h file defines the rocksdb objects that are needed to talk to an +underlying filesystem. + +If you want to compile rocksdb with hdfs support, please set the following +environment variables appropriately: + USE_HDFS=1 + JAVA_HOME=/usr/local/jdk-6u22-64 + LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/jdk-6u22-64/jre/lib/amd64/server:/usr/local/jdk-6u22-64/jre/lib/amd64/:./snappy/libs + make clean all db_bench + +To run db_bench, + set CLASSPATH to include your hadoop distribution + db_bench --hdfs="hdfs://hbaseudbperf001.snc1.facebook.com:9000" + + diff --git a/hdfs/env_hdfs.h b/hdfs/env_hdfs.h new file mode 100644 index 00000000..cb8ca623 --- /dev/null +++ b/hdfs/env_hdfs.h @@ -0,0 +1,302 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#pragma once +#include +#include +#include +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/status.h" + +#ifdef USE_HDFS +#include "hdfs/hdfs.h" + +namespace rocksdb { + +static const std::string kProto = "hdfs://"; +static const std::string pathsep = "/"; + +// Thrown during execution when there is an issue with the supplied +// arguments. +class HdfsUsageException : public std::exception { }; + +// A simple exception that indicates something went wrong that is not +// recoverable. The intention is for the message to be printed (with +// nothing else) and the process to terminate. +class HdfsFatalException : public std::exception { +public: + explicit HdfsFatalException(const std::string& s) : what_(s) { } + virtual ~HdfsFatalException() throw() { } + virtual const char* what() const throw() { + return what_.c_str(); + } +private: + const std::string what_; +}; + +// +// The HDFS environment for rocksdb.
This class overrides all the +// file/dir access methods and delegates the thread-mgmt methods to the +// default posix environment. +// +class HdfsEnv : public Env { + + public: + HdfsEnv(const std::string& fsname) : fsname_(fsname) { + posixEnv = Env::Default(); + fileSys_ = connectToPath(fsname_); + } + + virtual ~HdfsEnv() { + fprintf(stderr, "Destroying HdfsEnv::Default()\n"); + hdfsDisconnect(fileSys_); + } + + virtual Status NewSequentialFile(const std::string& fname, + SequentialFile** result); + + virtual Status NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result); + + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result); + + virtual Status NewRandomRWFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options); + + virtual bool FileExists(const std::string& fname); + + virtual Status GetChildren(const std::string& path, + std::vector* result); + + virtual Status DeleteFile(const std::string& fname); + + virtual Status CreateDir(const std::string& name); + + virtual Status CreateDirIfMissing(const std::string& name); + + virtual Status DeleteDir(const std::string& name); + + virtual Status GetFileSize(const std::string& fname, uint64_t* size); + + virtual Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime); + + virtual Status RenameFile(const std::string& src, const std::string& target); + + virtual Status LockFile(const std::string& fname, FileLock** lock); + + virtual Status UnlockFile(FileLock* lock); + + virtual Status NewLogger(const std::string& fname, Logger** result); + + virtual void Schedule(void (*function)(void* arg), void* arg, + Priority pri = LOW) { + posixEnv->Schedule(function, arg, pri); + } + + virtual void StartThread(void (*function)(void* arg), void* arg) { + posixEnv->StartThread(function, arg); + } + + virtual Status GetTestDirectory(std::string* path) { + return posixEnv->GetTestDirectory(path); + } + + virtual uint64_t NowMicros() 
{ + return posixEnv->NowMicros(); + } + + virtual void SleepForMicroseconds(int micros) { + posixEnv->SleepForMicroseconds(micros); + } + + virtual Status GetHostName(char* name, uint64_t len) { + return posixEnv->GetHostName(name, len); + } + + virtual Status GetCurrentTime(int64_t* unix_time) { + return posixEnv->GetCurrentTime(unix_time); + } + + virtual Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) { + return posixEnv->GetAbsolutePath(db_path, output_path); + } + + virtual void SetBackgroundThreads(int number, Priority pri = LOW) { + posixEnv->SetBackgroundThreads(number, pri); + } + + virtual std::string TimeToString(uint64_t number) { + return posixEnv->TimeToString(number); + } + + static uint64_t gettid() { + assert(sizeof(pthread_t) <= sizeof(uint64_t)); + return (uint64_t)pthread_self(); + } + + private: + std::string fsname_; // string of the form "hdfs://hostname:port/" + hdfsFS fileSys_; // a single FileSystem object for all files + Env* posixEnv; // This object is derived from Env, but not from + // posixEnv. We have posixnv as an encapsulated + // object here so that we can use posix timers, + // posix threads, etc. + + /** + * If the URI is specified of the form hdfs://server:port/path, + * then connect to the specified cluster + * else connect to default. + */ + hdfsFS connectToPath(const std::string& uri) { + if (uri.empty()) { + return NULL; + } + if (uri.find(kProto) != 0) { + // uri doesn't start with hdfs:// -> use default:0, which is special + // to libhdfs. + return hdfsConnectNewInstance("default", 0); + } + const std::string hostport = uri.substr(kProto.length()); + + std::vector parts; + split(hostport, ':', parts); + if (parts.size() != 2) { + throw HdfsFatalException("Bad uri for hdfs " + uri); + } + // parts[0] = hosts, parts[1] = port/xxx/yyy + std::string host(parts[0]); + std::string remaining(parts[1]); + + int rem = remaining.find(pathsep); + std::string portStr = (rem == 0 ? 
remaining : + remaining.substr(0, rem)); + + tPort port; + port = atoi(portStr.c_str()); + if (port == 0) { + throw HdfsFatalException("Bad host-port for hdfs " + uri); + } + hdfsFS fs = hdfsConnectNewInstance(host.c_str(), port); + return fs; + } + + void split(const std::string &s, char delim, + std::vector &elems) { + elems.clear(); + size_t prev = 0; + size_t pos = s.find(delim); + while (pos != std::string::npos) { + elems.push_back(s.substr(prev, pos)); + prev = pos + 1; + pos = s.find(delim, prev); + } + elems.push_back(s.substr(prev, s.size())); + } +}; + +} // namespace rocksdb + +#else // USE_HDFS + + +namespace rocksdb { + +static const Status notsup; + +class HdfsEnv : public Env { + + public: + HdfsEnv(const std::string& fsname) { + fprintf(stderr, "You have not build rocksdb with HDFS support\n"); + fprintf(stderr, "Please see hdfs/README for details\n"); + throw new std::exception(); + } + + virtual ~HdfsEnv() { + } + + virtual Status NewSequentialFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options); + + virtual Status NewRandomAccessFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + return notsup; + } + + virtual Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + return notsup; + } + + virtual Status NewRandomRWFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + return notsup; + } + + virtual bool FileExists(const std::string& fname){return false;} + + virtual Status GetChildren(const std::string& path, + std::vector* result){return notsup;} + + virtual Status DeleteFile(const std::string& fname){return notsup;} + + virtual Status CreateDir(const std::string& name){return notsup;} + + virtual Status CreateDirIfMissing(const std::string& name){return notsup;} + + virtual Status DeleteDir(const std::string& name){return notsup;} + + virtual Status GetFileSize(const std::string& fname, uint64_t* 
size){return notsup;} + + virtual Status GetFileModificationTime(const std::string& fname, + uint64_t* time) { + return notsup; + } + + virtual Status RenameFile(const std::string& src, const std::string& target){return notsup;} + + virtual Status LockFile(const std::string& fname, FileLock** lock){return notsup;} + + virtual Status UnlockFile(FileLock* lock){return notsup;} + + virtual Status NewLogger(const std::string& fname, + shared_ptr* result){return notsup;} + + virtual void Schedule(void (*function)(void* arg), void* arg, + Priority pri = LOW) {} + + virtual void StartThread(void (*function)(void* arg), void* arg) {} + + virtual Status GetTestDirectory(std::string* path) {return notsup;} + + virtual uint64_t NowMicros() {return 0;} + + virtual void SleepForMicroseconds(int micros) {} + + virtual Status GetHostName(char* name, uint64_t len) {return notsup;} + + virtual Status GetCurrentTime(int64_t* unix_time) {return notsup;} + + virtual Status GetAbsolutePath(const std::string& db_path, + std::string* outputpath) {return notsup;} + + virtual void SetBackgroundThreads(int number, Priority pri = LOW) {} + + virtual std::string TimeToString(uint64_t number) { return "";} +}; +} + +#endif // USE_HDFS diff --git a/hdfs/hdfs.h b/hdfs/hdfs.h new file mode 100644 index 00000000..8e8dfecb --- /dev/null +++ b/hdfs/hdfs.h @@ -0,0 +1,477 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#ifndef LIBHDFS_HDFS_H +#define LIBHDFS_HDFS_H + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef O_RDONLY +#define O_RDONLY 1 +#endif + +#ifndef O_WRONLY +#define O_WRONLY 2 +#endif + +#ifndef EINTERNAL +#define EINTERNAL 255 +#endif + + +/** All APIs set errno to meaningful values */ +#ifdef __cplusplus +extern "C" { +#endif + + /** + * Some utility decls used in libhdfs. + */ + + typedef int32_t tSize; /// size of data for read/write io ops + typedef time_t tTime; /// time type in seconds + typedef int64_t tOffset;/// offset within the file + typedef uint16_t tPort; /// port + typedef enum tObjectKind { + kObjectKindFile = 'F', + kObjectKindDirectory = 'D', + } tObjectKind; + + + /** + * The C reflection of org.apache.org.hadoop.FileSystem . + */ + typedef void* hdfsFS; + + + /** + * The C equivalent of org.apache.org.hadoop.FSData(Input|Output)Stream . + */ + enum hdfsStreamType + { + UNINITIALIZED = 0, + INPUT = 1, + OUTPUT = 2, + }; + + + /** + * The 'file-handle' to a file in hdfs. + */ + struct hdfsFile_internal { + void* file; + enum hdfsStreamType type; + }; + typedef struct hdfsFile_internal* hdfsFile; + + + /** + * hdfsConnectAsUser - Connect to a hdfs file system as a specific user + * Connect to the hdfs. + * @param host A string containing either a host name, or an ip address + * of the namenode of a hdfs cluster. 'host' should be passed as NULL if + * you want to connect to local filesystem. 'host' should be passed as + * 'default' (and port as 0) to used the 'configured' filesystem + * (core-site/core-default.xml). + * @param port The port on which the server is listening. + * @param user the user name (this is hadoop domain user). Or NULL is equivelant to hhdfsConnect(host, port) + * @param groups the groups (these are hadoop domain groups) + * @return Returns a handle to the filesystem or NULL on error. 
+ */ + hdfsFS hdfsConnectAsUser(const char* host, tPort port, const char *user , const char *groups[], int groups_size ); + + + /** + * hdfsConnect - Connect to a hdfs file system. + * Connect to the hdfs. + * @param host A string containing either a host name, or an ip address + * of the namenode of a hdfs cluster. 'host' should be passed as NULL if + * you want to connect to local filesystem. 'host' should be passed as + * 'default' (and port as 0) to used the 'configured' filesystem + * (core-site/core-default.xml). + * @param port The port on which the server is listening. + * @return Returns a handle to the filesystem or NULL on error. + */ + hdfsFS hdfsConnect(const char* host, tPort port); + + + /** + * This are the same as hdfsConnectAsUser except that every invocation returns a new FileSystem handle. + * Applications should call a hdfsDisconnect for every call to hdfsConnectAsUserNewInstance. + */ + hdfsFS hdfsConnectAsUserNewInstance(const char* host, tPort port, const char *user , const char *groups[], int groups_size ); + hdfsFS hdfsConnectNewInstance(const char* host, tPort port); + hdfsFS hdfsConnectPath(const char* uri); + + /** + * hdfsDisconnect - Disconnect from the hdfs file system. + * Disconnect from hdfs. + * @param fs The configured filesystem handle. + * @return Returns 0 on success, -1 on error. + */ + int hdfsDisconnect(hdfsFS fs); + + + /** + * hdfsOpenFile - Open a hdfs file in given mode. + * @param fs The configured filesystem handle. + * @param path The full path to the file. + * @param flags - an | of bits/fcntl.h file flags - supported flags are O_RDONLY, O_WRONLY (meaning create or overwrite i.e., implies O_TRUNCAT), + * O_WRONLY|O_APPEND. Other flags are generally ignored other than (O_RDWR || (O_EXCL & O_CREAT)) which return NULL and set errno equal ENOTSUP. + * @param bufferSize Size of buffer for read/write - pass 0 if you want + * to use the default configured values. 
+ * @param replication Block replication - pass 0 if you want to use + * the default configured values. + * @param blocksize Size of block - pass 0 if you want to use the + * default configured values. + * @return Returns the handle to the open file or NULL on error. + */ + hdfsFile hdfsOpenFile(hdfsFS fs, const char* path, int flags, + int bufferSize, short replication, tSize blocksize); + + + /** + * hdfsCloseFile - Close an open file. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @return Returns 0 on success, -1 on error. + */ + int hdfsCloseFile(hdfsFS fs, hdfsFile file); + + + /** + * hdfsExists - Checks if a given path exsits on the filesystem + * @param fs The configured filesystem handle. + * @param path The path to look for + * @return Returns 0 on exists, 1 on non-exists, -1/-2 on error. + */ + int hdfsExists(hdfsFS fs, const char *path); + + + /** + * hdfsSeek - Seek to given offset in file. + * This works only for files opened in read-only mode. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @param desiredPos Offset into the file to seek into. + * @return Returns 0 on success, -1 on error. + */ + int hdfsSeek(hdfsFS fs, hdfsFile file, tOffset desiredPos); + + + /** + * hdfsTell - Get the current offset in the file, in bytes. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @return Current offset, -1 on error. + */ + tOffset hdfsTell(hdfsFS fs, hdfsFile file); + + + /** + * hdfsRead - Read data from an open file. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @param buffer The buffer to copy read bytes into. + * @param length The length of the buffer. + * @return Returns the number of bytes actually read, possibly less + * than than length;-1 on error. + */ + tSize hdfsRead(hdfsFS fs, hdfsFile file, void* buffer, tSize length); + + + /** + * hdfsPread - Positional read of data from an open file. 
+ * @param fs The configured filesystem handle. + * @param file The file handle. + * @param position Position from which to read + * @param buffer The buffer to copy read bytes into. + * @param length The length of the buffer. + * @return Returns the number of bytes actually read, possibly less than + * length; -1 on error. + */ + tSize hdfsPread(hdfsFS fs, hdfsFile file, tOffset position, + void* buffer, tSize length); + + + /** + * hdfsWrite - Write data into an open file. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @param buffer The data. + * @param length The no. of bytes to write. + * @return Returns the number of bytes written, -1 on error. + */ + tSize hdfsWrite(hdfsFS fs, hdfsFile file, const void* buffer, + tSize length); + + + /** + * hdfsFlush - Flush the data. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @return Returns 0 on success, -1 on error. + */ + int hdfsFlush(hdfsFS fs, hdfsFile file); + + /** + * hdfsSync - Sync the data to persistent store. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @return Returns 0 on success, -1 on error. + */ + int hdfsSync(hdfsFS fs, hdfsFile file); + + /** + * hdfsGetNumCurrentReplicas - get number of remaining replicas in + * pipeline + * @param fs The configured filesystem handle + * @param file the file handle + * @return returns the # of datanodes in the write pipeline; -1 on error + */ + int hdfsGetNumCurrentReplicas(hdfsFS, hdfsFile file); + + /** + * hdfsAvailable - Number of bytes that can be read from this + * input stream without blocking. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @return Returns available bytes; -1 on error. + */ + int hdfsAvailable(hdfsFS fs, hdfsFile file); + + + /** + * hdfsCopy - Copy file from one filesystem to another. + * @param srcFS The handle to source filesystem. + * @param src The path of source file.
+ * @param dstFS The handle to destination filesystem. + * @param dst The path of destination file. + * @return Returns 0 on success, -1 on error. + */ + int hdfsCopy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst); + + + /** + * hdfsMove - Move file from one filesystem to another. + * @param srcFS The handle to source filesystem. + * @param src The path of source file. + * @param dstFS The handle to destination filesystem. + * @param dst The path of destination file. + * @return Returns 0 on success, -1 on error. + */ + int hdfsMove(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst); + + + /** + * hdfsDelete - Delete file. + * @param fs The configured filesystem handle. + * @param path The path of the file. + * @return Returns 0 on success, -1 on error. + */ + int hdfsDelete(hdfsFS fs, const char* path); + + + /** + * hdfsRename - Rename file. + * @param fs The configured filesystem handle. + * @param oldPath The path of the source file. + * @param newPath The path of the destination file. + * @return Returns 0 on success, -1 on error. + */ + int hdfsRename(hdfsFS fs, const char* oldPath, const char* newPath); + + + /** + * hdfsGetWorkingDirectory - Get the current working directory for + * the given filesystem. + * @param fs The configured filesystem handle. + * @param buffer The user-buffer to copy path of cwd into. + * @param bufferSize The length of user-buffer. + * @return Returns buffer, NULL on error. + */ + char* hdfsGetWorkingDirectory(hdfsFS fs, char *buffer, size_t bufferSize); + + + /** + * hdfsSetWorkingDirectory - Set the working directory. All relative + * paths will be resolved relative to it. + * @param fs The configured filesystem handle. + * @param path The path of the new 'cwd'. + * @return Returns 0 on success, -1 on error. + */ + int hdfsSetWorkingDirectory(hdfsFS fs, const char* path); + + + /** + * hdfsCreateDirectory - Make the given file and all non-existent + * parents into directories. 
+ * @param fs The configured filesystem handle. + * @param path The path of the directory. + * @return Returns 0 on success, -1 on error. + */ + int hdfsCreateDirectory(hdfsFS fs, const char* path); + + + /** + * hdfsSetReplication - Set the replication of the specified + * file to the supplied value + * @param fs The configured filesystem handle. + * @param path The path of the file. + * @return Returns 0 on success, -1 on error. + */ + int hdfsSetReplication(hdfsFS fs, const char* path, int16_t replication); + + + /** + * hdfsFileInfo - Information about a file/directory. + */ + typedef struct { + tObjectKind mKind; /* file or directory */ + char *mName; /* the name of the file */ + tTime mLastMod; /* the last modification time for the file in seconds */ + tOffset mSize; /* the size of the file in bytes */ + short mReplication; /* the count of replicas */ + tOffset mBlockSize; /* the block size for the file */ + char *mOwner; /* the owner of the file */ + char *mGroup; /* the group associated with the file */ + short mPermissions; /* the permissions associated with the file */ + tTime mLastAccess; /* the last access time for the file in seconds */ + } hdfsFileInfo; + + + /** + * hdfsListDirectory - Get list of files/directories for a given + * directory-path. hdfsFreeFileInfo should be called to deallocate memory if + * the function returns non-NULL value. + * @param fs The configured filesystem handle. + * @param path The path of the directory. + * @param numEntries Set to the number of files/directories in path. + * @return Returns a dynamically-allocated array of hdfsFileInfo + * objects; NULL if empty or on error. + * on error, numEntries will be -1. + */ + hdfsFileInfo *hdfsListDirectory(hdfsFS fs, const char* path, + int *numEntries); + + + /** + * hdfsGetPathInfo - Get information about a path as a (dynamically + * allocated) single hdfsFileInfo struct. hdfsFreeFileInfo should be + * called when the pointer is no longer needed. 
+ * @param fs The configured filesystem handle. + * @param path The path of the file. + * @return Returns a dynamically-allocated hdfsFileInfo object; + * NULL on error. + */ + hdfsFileInfo *hdfsGetPathInfo(hdfsFS fs, const char* path); + + + /** + * hdfsFreeFileInfo - Free up the hdfsFileInfo array (including fields) + * @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo + * objects. + * @param numEntries The size of the array. + */ + void hdfsFreeFileInfo(hdfsFileInfo *hdfsFileInfo, int numEntries); + + + /** + * hdfsGetHosts - Get hostnames where a particular block (determined by + * pos & blocksize) of a file is stored. The last element in the array + * is NULL. Due to replication, a single block could be present on + * multiple hosts. + * @param fs The configured filesystem handle. + * @param path The path of the file. + * @param start The start of the block. + * @param length The length of the block. + * @return Returns a dynamically-allocated 2-d array of blocks-hosts; + * NULL on error. + */ + char*** hdfsGetHosts(hdfsFS fs, const char* path, + tOffset start, tOffset length); + + + /** + * hdfsFreeHosts - Free up the structure returned by hdfsGetHosts + * @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo + * objects. + * @param numEntries The size of the array. + */ + void hdfsFreeHosts(char ***blockHosts); + + + /** + * hdfsGetDefaultBlockSize - Get the optimum blocksize. + * @param fs The configured filesystem handle. + * @return Returns the blocksize; -1 on error. + */ + tOffset hdfsGetDefaultBlockSize(hdfsFS fs); + + + /** + * hdfsGetCapacity - Return the raw capacity of the filesystem. + * @param fs The configured filesystem handle. + * @return Returns the raw-capacity; -1 on error. + */ + tOffset hdfsGetCapacity(hdfsFS fs); + + + /** + * hdfsGetUsed - Return the total raw size of all files in the filesystem. + * @param fs The configured filesystem handle. + * @return Returns the total-size; -1 on error. 
+ */ + tOffset hdfsGetUsed(hdfsFS fs); + + /** + * hdfsChown + * @param fs The configured filesystem handle. + * @param path the path to the file or directory + * @param owner this is a string in Hadoop land. Set to null or "" if only setting group + * @param group this is a string in Hadoop land. Set to null or "" if only setting user + * @return 0 on success else -1 + */ + int hdfsChown(hdfsFS fs, const char* path, const char *owner, const char *group); + + /** + * hdfsChmod + * @param fs The configured filesystem handle. + * @param path the path to the file or directory + * @param mode the bitmask to set it to + * @return 0 on success else -1 + */ + int hdfsChmod(hdfsFS fs, const char* path, short mode); + + /** + * hdfsUtime + * @param fs The configured filesystem handle. + * @param path the path to the file or directory + * @param mtime new modification time or 0 for only set access time in seconds + * @param atime new access time or 0 for only set modification time in seconds + * @return 0 on success else -1 + */ + int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime); + +#ifdef __cplusplus +} +#endif + +#endif /*LIBHDFS_HDFS_H*/ + +/** + * vim: ts=4: sw=4: et + */ diff --git a/hdfs/libhdfs.a b/hdfs/libhdfs.a new file mode 100644 index 00000000..4d1f19f0 Binary files /dev/null and b/hdfs/libhdfs.a differ diff --git a/helpers/memenv/memenv.cc b/helpers/memenv/memenv.cc new file mode 100644 index 00000000..15f1383a --- /dev/null +++ b/helpers/memenv/memenv.cc @@ -0,0 +1,386 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "helpers/memenv/memenv.h" + +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "port/port.h" +#include "util/mutexlock.h" +#include +#include +#include +#include + +namespace rocksdb { + +namespace { + +class FileState { + public: + // FileStates are reference counted. The initial reference count is zero + // and the caller must call Ref() at least once. + FileState() : refs_(0), size_(0) {} + + // Increase the reference count. + void Ref() { + MutexLock lock(&refs_mutex_); + ++refs_; + } + + // Decrease the reference count. Delete if this is the last reference. + void Unref() { + bool do_delete = false; + + { + MutexLock lock(&refs_mutex_); + --refs_; + assert(refs_ >= 0); + if (refs_ <= 0) { + do_delete = true; + } + } + + if (do_delete) { + delete this; + } + } + + uint64_t Size() const { return size_; } + + Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { + if (offset > size_) { + return Status::IOError("Offset greater than file size."); + } + const uint64_t available = size_ - offset; + if (n > available) { + n = available; + } + if (n == 0) { + *result = Slice(); + return Status::OK(); + } + + size_t block = offset / kBlockSize; + size_t block_offset = offset % kBlockSize; + + if (n <= kBlockSize - block_offset) { + // The requested bytes are all in the first block. 
+ *result = Slice(blocks_[block] + block_offset, n); + return Status::OK(); + } + + size_t bytes_to_copy = n; + char* dst = scratch; + + while (bytes_to_copy > 0) { + size_t avail = kBlockSize - block_offset; + if (avail > bytes_to_copy) { + avail = bytes_to_copy; + } + memcpy(dst, blocks_[block] + block_offset, avail); + + bytes_to_copy -= avail; + dst += avail; + block++; + block_offset = 0; + } + + *result = Slice(scratch, n); + return Status::OK(); + } + + Status Append(const Slice& data) { + const char* src = data.data(); + size_t src_len = data.size(); + + while (src_len > 0) { + size_t avail; + size_t offset = size_ % kBlockSize; + + if (offset != 0) { + // There is some room in the last block. + avail = kBlockSize - offset; + } else { + // No room in the last block; push new one. + blocks_.push_back(new char[kBlockSize]); + avail = kBlockSize; + } + + if (avail > src_len) { + avail = src_len; + } + memcpy(blocks_.back() + offset, src, avail); + src_len -= avail; + src += avail; + size_ += avail; + } + + return Status::OK(); + } + + private: + // Private since only Unref() should be used to delete it. + ~FileState() { + for (std::vector::iterator i = blocks_.begin(); i != blocks_.end(); + ++i) { + delete [] *i; + } + } + + // No copying allowed. + FileState(const FileState&); + void operator=(const FileState&); + + port::Mutex refs_mutex_; + int refs_; // Protected by refs_mutex_; + + // The following fields are not protected by any mutex. They are only mutable + // while the file is being written, and concurrent access is not allowed + // to writable files. 
+ std::vector blocks_; + uint64_t size_; + + enum { kBlockSize = 8 * 1024 }; +}; + +class SequentialFileImpl : public SequentialFile { + public: + explicit SequentialFileImpl(FileState* file) : file_(file), pos_(0) { + file_->Ref(); + } + + ~SequentialFileImpl() { + file_->Unref(); + } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + Status s = file_->Read(pos_, n, result, scratch); + if (s.ok()) { + pos_ += result->size(); + } + return s; + } + + virtual Status Skip(uint64_t n) { + if (pos_ > file_->Size()) { + return Status::IOError("pos_ > file_->Size()"); + } + const size_t available = file_->Size() - pos_; + if (n > available) { + n = available; + } + pos_ += n; + return Status::OK(); + } + + private: + FileState* file_; + size_t pos_; +}; + +class RandomAccessFileImpl : public RandomAccessFile { + public: + explicit RandomAccessFileImpl(FileState* file) : file_(file) { + file_->Ref(); + } + + ~RandomAccessFileImpl() { + file_->Unref(); + } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + return file_->Read(offset, n, result, scratch); + } + + private: + FileState* file_; +}; + +class WritableFileImpl : public WritableFile { + public: + WritableFileImpl(FileState* file) : file_(file) { + file_->Ref(); + } + + ~WritableFileImpl() { + file_->Unref(); + } + + virtual Status Append(const Slice& data) { + return file_->Append(data); + } + + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } + + private: + FileState* file_; +}; + +class InMemoryEnv : public EnvWrapper { + public: + explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { } + + virtual ~InMemoryEnv() { + for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){ + i->second->Unref(); + } + } + + // Partial implementation of the Env interface. 
+ virtual Status NewSequentialFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions) { + MutexLock lock(&mutex_); + if (file_map_.find(fname) == file_map_.end()) { + *result = NULL; + return Status::IOError(fname, "File not found"); + } + + result->reset(new SequentialFileImpl(file_map_[fname])); + return Status::OK(); + } + + virtual Status NewRandomAccessFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions) { + MutexLock lock(&mutex_); + if (file_map_.find(fname) == file_map_.end()) { + *result = NULL; + return Status::IOError(fname, "File not found"); + } + + result->reset(new RandomAccessFileImpl(file_map_[fname])); + return Status::OK(); + } + + virtual Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions) { + MutexLock lock(&mutex_); + if (file_map_.find(fname) != file_map_.end()) { + DeleteFileInternal(fname); + } + + FileState* file = new FileState(); + file->Ref(); + file_map_[fname] = file; + + result->reset(new WritableFileImpl(file)); + return Status::OK(); + } + + virtual bool FileExists(const std::string& fname) { + MutexLock lock(&mutex_); + return file_map_.find(fname) != file_map_.end(); + } + + virtual Status GetChildren(const std::string& dir, + std::vector* result) { + MutexLock lock(&mutex_); + result->clear(); + + for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){ + const std::string& filename = i->first; + + if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' && + Slice(filename).starts_with(Slice(dir))) { + result->push_back(filename.substr(dir.size() + 1)); + } + } + + return Status::OK(); + } + + void DeleteFileInternal(const std::string& fname) { + if (file_map_.find(fname) == file_map_.end()) { + return; + } + + file_map_[fname]->Unref(); + file_map_.erase(fname); + } + + virtual Status DeleteFile(const std::string& fname) { + MutexLock lock(&mutex_); + if (file_map_.find(fname) == 
file_map_.end()) { + return Status::IOError(fname, "File not found"); + } + + DeleteFileInternal(fname); + return Status::OK(); + } + + virtual Status CreateDir(const std::string& dirname) { + return Status::OK(); + } + + virtual Status CreateDirIfMissing(const std::string& dirname) { + return Status::OK(); + } + + virtual Status DeleteDir(const std::string& dirname) { + return Status::OK(); + } + + virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) { + MutexLock lock(&mutex_); + if (file_map_.find(fname) == file_map_.end()) { + return Status::IOError(fname, "File not found"); + } + + *file_size = file_map_[fname]->Size(); + return Status::OK(); + } + + virtual Status GetFileModificationTime(const std::string& fname, + uint64_t* time) { + return Status::NotSupported("getFileMTime", "Not supported in MemEnv"); + } + + virtual Status RenameFile(const std::string& src, + const std::string& target) { + MutexLock lock(&mutex_); + if (file_map_.find(src) == file_map_.end()) { + return Status::IOError(src, "File not found"); + } + + DeleteFileInternal(target); + file_map_[target] = file_map_[src]; + file_map_.erase(src); + return Status::OK(); + } + + virtual Status LockFile(const std::string& fname, FileLock** lock) { + *lock = new FileLock; + return Status::OK(); + } + + virtual Status UnlockFile(FileLock* lock) { + delete lock; + return Status::OK(); + } + + virtual Status GetTestDirectory(std::string* path) { + *path = "/test"; + return Status::OK(); + } + + private: + // Map from filenames to FileState objects, representing a simple file system. + typedef std::map FileSystem; + port::Mutex mutex_; + FileSystem file_map_; // Protected by mutex_. 
+}; + +} // namespace + +Env* NewMemEnv(Env* base_env) { + return new InMemoryEnv(base_env); +} + +} // namespace rocksdb diff --git a/helpers/memenv/memenv.h b/helpers/memenv/memenv.h new file mode 100644 index 00000000..21264411 --- /dev/null +++ b/helpers/memenv/memenv.h @@ -0,0 +1,19 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_ +#define STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_ +namespace rocksdb { + +class Env; + +// Returns a new environment that stores its data in memory and delegates +// all non-file-storage tasks to base_env. The caller must delete the result +// when it is no longer needed. +// *base_env must remain live while the result is in use. +Env* NewMemEnv(Env* base_env); + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_ diff --git a/helpers/memenv/memenv_test.cc b/helpers/memenv/memenv_test.cc new file mode 100644 index 00000000..19fc8ff2 --- /dev/null +++ b/helpers/memenv/memenv_test.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "helpers/memenv/memenv.h" + +#include "db/db_impl.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "util/testharness.h" +#include +#include +#include + +namespace rocksdb { + +class MemEnvTest { + public: + Env* env_; + const EnvOptions soptions_; + + MemEnvTest() + : env_(NewMemEnv(Env::Default())) { + } + ~MemEnvTest() { + delete env_; + } +}; + +TEST(MemEnvTest, Basics) { + uint64_t file_size; + unique_ptr writable_file; + std::vector children; + + ASSERT_OK(env_->CreateDir("/dir")); + + // Check that the directory is empty. 
+ ASSERT_TRUE(!env_->FileExists("/dir/non_existent")); + ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok()); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(0U, children.size()); + + // Create a file. + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + writable_file.reset(); + + // Check that the file exists. + ASSERT_TRUE(env_->FileExists("/dir/f")); + ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); + ASSERT_EQ(0U, file_size); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(1U, children.size()); + ASSERT_EQ("f", children[0]); + + // Write to the file. + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("abc")); + writable_file.reset(); + + // Check for expected size. + ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); + ASSERT_EQ(3U, file_size); + + // Check that renaming works. + ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok()); + ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g")); + ASSERT_TRUE(!env_->FileExists("/dir/f")); + ASSERT_TRUE(env_->FileExists("/dir/g")); + ASSERT_OK(env_->GetFileSize("/dir/g", &file_size)); + ASSERT_EQ(3U, file_size); + + // Check that opening non-existent file fails. + unique_ptr seq_file; + unique_ptr rand_file; + ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file, + soptions_).ok()); + ASSERT_TRUE(!seq_file); + ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file, + soptions_).ok()); + ASSERT_TRUE(!rand_file); + + // Check that deleting works. 
+ ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok()); + ASSERT_OK(env_->DeleteFile("/dir/g")); + ASSERT_TRUE(!env_->FileExists("/dir/g")); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(0U, children.size()); + ASSERT_OK(env_->DeleteDir("/dir")); +} + +TEST(MemEnvTest, ReadWrite) { + unique_ptr writable_file; + unique_ptr seq_file; + unique_ptr rand_file; + Slice result; + char scratch[100]; + + ASSERT_OK(env_->CreateDir("/dir")); + + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("hello ")); + ASSERT_OK(writable_file->Append("world")); + writable_file.reset(); + + // Read sequentially. + ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_)); + ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello". + ASSERT_EQ(0, result.compare("hello")); + ASSERT_OK(seq_file->Skip(1)); + ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world". + ASSERT_EQ(0, result.compare("world")); + ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF. + ASSERT_EQ(0U, result.size()); + ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file. + ASSERT_OK(seq_file->Read(1000, &result, scratch)); + ASSERT_EQ(0U, result.size()); + + // Random reads. + ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_)); + ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world". + ASSERT_EQ(0, result.compare("world")); + ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello". + ASSERT_EQ(0, result.compare("hello")); + ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d". + ASSERT_EQ(0, result.compare("d")); + + // Too high offset. + ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok()); +} + +TEST(MemEnvTest, Locks) { + FileLock* lock; + + // These are no-ops, but we test they return success. 
+ ASSERT_OK(env_->LockFile("some file", &lock)); + ASSERT_OK(env_->UnlockFile(lock)); +} + +TEST(MemEnvTest, Misc) { + std::string test_dir; + ASSERT_OK(env_->GetTestDirectory(&test_dir)); + ASSERT_TRUE(!test_dir.empty()); + + unique_ptr writable_file; + ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_)); + + // These are no-ops, but we test they return success. + ASSERT_OK(writable_file->Sync()); + ASSERT_OK(writable_file->Flush()); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); +} + +TEST(MemEnvTest, LargeWrite) { + const size_t kWriteSize = 300 * 1024; + char* scratch = new char[kWriteSize * 2]; + + std::string write_data; + for (size_t i = 0; i < kWriteSize; ++i) { + write_data.append(1, static_cast(i)); + } + + unique_ptr writable_file; + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("foo")); + ASSERT_OK(writable_file->Append(write_data)); + writable_file.reset(); + + unique_ptr seq_file; + Slice result; + ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_)); + ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo". 
+ ASSERT_EQ(0, result.compare("foo")); + + size_t read = 0; + std::string read_data; + while (read < kWriteSize) { + ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch)); + read_data.append(result.data(), result.size()); + read += result.size(); + } + ASSERT_TRUE(write_data == read_data); + delete [] scratch; +} + +TEST(MemEnvTest, DBTest) { + Options options; + options.create_if_missing = true; + options.env = env_; + DB* db; + + const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; + const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; + + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i])); + } + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + + Iterator* iterator = db->NewIterator(ReadOptions()); + iterator->SeekToFirst(); + for (size_t i = 0; i < 3; ++i) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_TRUE(keys[i] == iterator->key()); + ASSERT_TRUE(vals[i] == iterator->value()); + iterator->Next(); + } + ASSERT_TRUE(!iterator->Valid()); + delete iterator; + + DBImpl* dbi = reinterpret_cast(db); + ASSERT_OK(dbi->TEST_FlushMemTable()); + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + + delete db; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/include/rocksdb/arena.h b/include/rocksdb/arena.h new file mode 100644 index 00000000..642b6140 --- /dev/null +++ b/include/rocksdb/arena.h @@ -0,0 +1,45 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Arena class defines memory allocation methods. It's used by memtable and +// skiplist. + +#ifndef STORAGE_ROCKSDB_INCLUDE_ARENA_H_ +#define STORAGE_ROCKSDB_INCLUDE_ARENA_H_ + +#include +#include + +namespace rocksdb { + +class Arena { + public: + Arena() {}; + virtual ~Arena() {}; + + // Return a pointer to a newly allocated memory block of "bytes" bytes. + virtual char* Allocate(size_t bytes) = 0; + + // Allocate memory with the normal alignment guarantees provided by malloc. + virtual char* AllocateAligned(size_t bytes) = 0; + + // Returns an estimate of the total memory used by arena. + virtual const size_t ApproximateMemoryUsage() = 0; + + // Returns the total number of bytes in all blocks allocated so far. + virtual const size_t MemoryAllocatedBytes() = 0; + + private: + // No copying allowed + Arena(const Arena&); + void operator=(const Arena&); +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_ARENA_H_ diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h new file mode 100644 index 00000000..bd22e191 --- /dev/null +++ b/include/rocksdb/c.h @@ -0,0 +1,344 @@ +/* Copyright (c) 2013, Facebook, Inc. All rights reserved. + This source code is licensed under the BSD-style license found in the + LICENSE file in the root directory of this source tree. An additional grant + of patent rights can be found in the PATENTS file in the same directory. + Copyright (c) 2011 The LevelDB Authors. All rights reserved. + Use of this source code is governed by a BSD-style license that can be + found in the LICENSE file. See the AUTHORS file for names of contributors. + + C bindings for leveldb. May be useful as a stable ABI that can be + used by programs that keep leveldb in a shared library, or for + a JNI api. + + Does not support: + . 
getters for the option types + . custom comparators that implement key shortening + . capturing post-write-snapshot + . custom iter, db, env, cache implementations using just the C bindings + + Some conventions: + + (1) We expose just opaque struct pointers and functions to clients. + This allows us to change internal representations without having to + recompile clients. + + (2) For simplicity, there is no equivalent to the Slice type. Instead, + the caller has to pass the pointer and length as separate + arguments. + + (3) Errors are represented by a null-terminated c string. NULL + means no error. All operations that can raise an error are passed + a "char** errptr" as the last argument. One of the following must + be true on entry: + *errptr == NULL + *errptr points to a malloc()ed null-terminated error message + On success, a leveldb routine leaves *errptr unchanged. + On failure, leveldb frees the old value of *errptr and + sets *errptr to a malloc()ed error message. + + (4) Bools have the type unsigned char (0 == false; rest == true) + + (5) All of the pointer arguments must be non-NULL. 
+*/ + +#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_ +#define STORAGE_ROCKSDB_INCLUDE_C_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +/* Exported types */ + +typedef struct rocksdb_t rocksdb_t; +typedef struct rocksdb_cache_t rocksdb_cache_t; +typedef struct rocksdb_comparator_t rocksdb_comparator_t; +typedef struct rocksdb_env_t rocksdb_env_t; +typedef struct rocksdb_filelock_t rocksdb_filelock_t; +typedef struct rocksdb_filterpolicy_t rocksdb_filterpolicy_t; +typedef struct rocksdb_iterator_t rocksdb_iterator_t; +typedef struct rocksdb_logger_t rocksdb_logger_t; +typedef struct rocksdb_options_t rocksdb_options_t; +typedef struct rocksdb_randomfile_t rocksdb_randomfile_t; +typedef struct rocksdb_readoptions_t rocksdb_readoptions_t; +typedef struct rocksdb_seqfile_t rocksdb_seqfile_t; +typedef struct rocksdb_snapshot_t rocksdb_snapshot_t; +typedef struct rocksdb_writablefile_t rocksdb_writablefile_t; +typedef struct rocksdb_writebatch_t rocksdb_writebatch_t; +typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t; +typedef struct rocksdb_universal_compaction_options_t rocksdb_universal_compaction_options_t; + +/* DB operations */ + +extern rocksdb_t* rocksdb_open( + const rocksdb_options_t* options, + const char* name, + char** errptr); + +extern void rocksdb_close(rocksdb_t* db); + +extern void rocksdb_put( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + const char* key, size_t keylen, + const char* val, size_t vallen, + char** errptr); + +extern void rocksdb_delete( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + const char* key, size_t keylen, + char** errptr); + +extern void rocksdb_write( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, + char** errptr); + +/* Returns NULL if not found. A malloc()ed array otherwise. + Stores the length of the array in *vallen. 
*/ +extern char* rocksdb_get( + rocksdb_t* db, + const rocksdb_readoptions_t* options, + const char* key, size_t keylen, + size_t* vallen, + char** errptr); + +extern rocksdb_iterator_t* rocksdb_create_iterator( + rocksdb_t* db, + const rocksdb_readoptions_t* options); + +extern const rocksdb_snapshot_t* rocksdb_create_snapshot( + rocksdb_t* db); + +extern void rocksdb_release_snapshot( + rocksdb_t* db, + const rocksdb_snapshot_t* snapshot); + +/* Returns NULL if property name is unknown. + Else returns a pointer to a malloc()-ed null-terminated value. */ +extern char* rocksdb_property_value( + rocksdb_t* db, + const char* propname); + +extern void rocksdb_approximate_sizes( + rocksdb_t* db, + int num_ranges, + const char* const* range_start_key, const size_t* range_start_key_len, + const char* const* range_limit_key, const size_t* range_limit_key_len, + uint64_t* sizes); + +extern void rocksdb_compact_range( + rocksdb_t* db, + const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len); + +/* Management operations */ + +extern void rocksdb_destroy_db( + const rocksdb_options_t* options, + const char* name, + char** errptr); + +extern void rocksdb_repair_db( + const rocksdb_options_t* options, + const char* name, + char** errptr); + +/* Iterator */ + +extern void rocksdb_iter_destroy(rocksdb_iterator_t*); +extern unsigned char rocksdb_iter_valid(const rocksdb_iterator_t*); +extern void rocksdb_iter_seek_to_first(rocksdb_iterator_t*); +extern void rocksdb_iter_seek_to_last(rocksdb_iterator_t*); +extern void rocksdb_iter_seek(rocksdb_iterator_t*, const char* k, size_t klen); +extern void rocksdb_iter_next(rocksdb_iterator_t*); +extern void rocksdb_iter_prev(rocksdb_iterator_t*); +extern const char* rocksdb_iter_key(const rocksdb_iterator_t*, size_t* klen); +extern const char* rocksdb_iter_value(const rocksdb_iterator_t*, size_t* vlen); +extern void rocksdb_iter_get_error(const rocksdb_iterator_t*, char** errptr); + +/* Write batch */ + 
+extern rocksdb_writebatch_t* rocksdb_writebatch_create(); +extern void rocksdb_writebatch_destroy(rocksdb_writebatch_t*); +extern void rocksdb_writebatch_clear(rocksdb_writebatch_t*); +extern void rocksdb_writebatch_put( + rocksdb_writebatch_t*, + const char* key, size_t klen, + const char* val, size_t vlen); +extern void rocksdb_writebatch_delete( + rocksdb_writebatch_t*, + const char* key, size_t klen); +extern void rocksdb_writebatch_iterate( + rocksdb_writebatch_t*, + void* state, + void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), + void (*deleted)(void*, const char* k, size_t klen)); + +/* Options */ + +extern rocksdb_options_t* rocksdb_options_create(); +extern void rocksdb_options_destroy(rocksdb_options_t*); +extern void rocksdb_options_set_comparator( + rocksdb_options_t*, + rocksdb_comparator_t*); +extern void rocksdb_options_set_compression_per_level( + rocksdb_options_t* opt, + int* level_values, + size_t num_levels); +extern void rocksdb_options_set_filter_policy( + rocksdb_options_t*, + rocksdb_filterpolicy_t*); +extern void rocksdb_options_set_create_if_missing( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_error_if_exists( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_paranoid_checks( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*); +extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*); +extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t); +extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int); +extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*); +extern void rocksdb_options_set_block_size(rocksdb_options_t*, size_t); +extern void rocksdb_options_set_block_restart_interval(rocksdb_options_t*, int); +extern void rocksdb_options_set_compression_options( + rocksdb_options_t*, int, int, int); +extern void 
rocksdb_options_set_num_levels(rocksdb_options_t*, int); +extern void rocksdb_options_set_level0_file_num_compaction_trigger( + rocksdb_options_t*, int); +extern void rocksdb_options_set_level0_slowdown_writes_trigger( + rocksdb_options_t*, int); +extern void rocksdb_options_set_level0_stop_writes_trigger( + rocksdb_options_t*, int); +extern void rocksdb_options_set_target_file_size_base( + rocksdb_options_t*, uint64_t); +extern void rocksdb_options_set_target_file_size_multiplier( + rocksdb_options_t*, int); +extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int); +extern void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int); +extern void rocksdb_options_set_max_background_compactions(rocksdb_options_t*, int); +extern void rocksdb_options_set_max_background_flushes(rocksdb_options_t*, int); +extern void rocksdb_options_set_use_fsync( + rocksdb_options_t*, int); +extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int); +extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int); +extern void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t*, int); +extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int); +extern void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t*); +extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*); + + +enum { /* Each codec needs a distinct value. NOTE(review): values are expected to mirror rocksdb::CompressionType in rocksdb/options.h - confirm. */ + rocksdb_no_compression = 0, + rocksdb_snappy_compression = 1, + rocksdb_zlib_compression = 2, + rocksdb_bz2_compression = 3 +}; +extern void rocksdb_options_set_compression(rocksdb_options_t*, int); + +enum { + rocksdb_level_compaction = 0, + rocksdb_universal_compaction = 1 +}; +extern void rocksdb_options_set_compaction_style(rocksdb_options_t*, int); +extern void rocksdb_options_set_universal_compaction_options(rocksdb_options_t*, rocksdb_universal_compaction_options_t*); +/* Comparator */ + +extern rocksdb_comparator_t* rocksdb_comparator_create( + void* state, + void 
(*destructor)(void*), + int (*compare)( + void*, + const char* a, size_t alen, + const char* b, size_t blen), + const char* (*name)(void*)); +extern void rocksdb_comparator_destroy(rocksdb_comparator_t*); + +/* Filter policy */ + +extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create( + void* state, + void (*destructor)(void*), + char* (*create_filter)( + void*, + const char* const* key_array, const size_t* key_length_array, + int num_keys, + size_t* filter_length), + unsigned char (*key_may_match)( + void*, + const char* key, size_t length, + const char* filter, size_t filter_length), + const char* (*name)(void*)); +extern void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t*); + +extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom( + int bits_per_key); + +/* Read options */ + +extern rocksdb_readoptions_t* rocksdb_readoptions_create(); +extern void rocksdb_readoptions_destroy(rocksdb_readoptions_t*); +extern void rocksdb_readoptions_set_verify_checksums( + rocksdb_readoptions_t*, + unsigned char); +extern void rocksdb_readoptions_set_fill_cache( + rocksdb_readoptions_t*, unsigned char); +extern void rocksdb_readoptions_set_snapshot( + rocksdb_readoptions_t*, + const rocksdb_snapshot_t*); + +/* Write options */ + +extern rocksdb_writeoptions_t* rocksdb_writeoptions_create(); +extern void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t*); +extern void rocksdb_writeoptions_set_sync( + rocksdb_writeoptions_t*, unsigned char); +extern void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable); + +/* Cache */ + +extern rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity); +extern void rocksdb_cache_destroy(rocksdb_cache_t* cache); + +/* Env */ + +extern rocksdb_env_t* rocksdb_create_default_env(); +extern void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n); +extern void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); +extern void rocksdb_env_destroy(rocksdb_env_t*); + +/* 
Universal Compaction options */ + +enum { + rocksdb_similar_size_compaction_stop_style = 0, + rocksdb_total_size_compaction_stop_style = 1 +}; + +extern rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() ; +extern void rocksdb_universal_compaction_options_set_size_ratio( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_min_merge_width( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_max_merge_width( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_max_size_amplification_percent( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_compression_size_percent( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_stop_style( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_destroy( + rocksdb_universal_compaction_options_t*); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif /* STORAGE_ROCKSDB_INCLUDE_C_H_ */ diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h new file mode 100644 index 00000000..3e0e5c1c --- /dev/null +++ b/include/rocksdb/cache.h @@ -0,0 +1,122 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Cache is an interface that maps keys to values. 
It has internal +// synchronization and may be safely accessed concurrently from +// multiple threads. It may automatically evict entries to make room +// for new entries. Values have a specified charge against the cache +// capacity. For example, a cache where the values are variable +// length strings, may use the length of the string as the charge for +// the string. +// +// A builtin cache implementation with a least-recently-used eviction +// policy is provided. Clients may use their own implementations if +// they want something more sophisticated (like scan-resistance, a +// custom eviction policy, variable cache sizing, etc.) + +#ifndef STORAGE_ROCKSDB_INCLUDE_CACHE_H_ +#define STORAGE_ROCKSDB_INCLUDE_CACHE_H_ + +#include +#include +#include "rocksdb/slice.h" + +namespace rocksdb { + +using std::shared_ptr; + +class Cache; + +// Create a new cache with a fixed size capacity. The cache is sharded +// to 2^numShardBits shards, by hash of the key. The total capacity +// is divided and evenly assigned to each shard. Inside each shard, +// the eviction is done in two passes: first try to free spaces by +// evicting entries that are among the most least used removeScanCountLimit +// entries and do not have reference other than by the cache itself, in +// the least-used order. If not enough space is freed, further free the +// entries in least used order. +// +// The functions without parameter numShardBits and/or removeScanCountLimit +// use default values. removeScanCountLimit's default value is 0, which +// means a strict LRU order inside each shard. +extern shared_ptr NewLRUCache(size_t capacity); +extern shared_ptr NewLRUCache(size_t capacity, int numShardBits); +extern shared_ptr NewLRUCache(size_t capacity, int numShardBits, + int removeScanCountLimit); + +class Cache { + public: + Cache() { } + + // Destroys all existing entries by calling the "deleter" + // function that was passed to the constructor. 
+ virtual ~Cache(); + + // Opaque handle to an entry stored in the cache. + struct Handle { }; + + // Insert a mapping from key->value into the cache and assign it + // the specified charge against the total cache capacity. + // + // Returns a handle that corresponds to the mapping. The caller + // must call this->Release(handle) when the returned mapping is no + // longer needed. + // + // When the inserted entry is no longer needed, the key and + // value will be passed to "deleter". + virtual Handle* Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) = 0; + + // If the cache has no mapping for "key", returns nullptr. + // + // Else return a handle that corresponds to the mapping. The caller + // must call this->Release(handle) when the returned mapping is no + // longer needed. + virtual Handle* Lookup(const Slice& key) = 0; + + // Release a mapping returned by a previous Lookup(). + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void Release(Handle* handle) = 0; + + // Return the value encapsulated in a handle returned by a + // successful Lookup(). + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void* Value(Handle* handle) = 0; + + // If the cache contains entry for key, erase it. Note that the + // underlying entry will be kept around until all existing handles + // to it have been released. + virtual void Erase(const Slice& key) = 0; + + // Return a new numeric id. May be used by multiple clients who are + // sharing the same cache to partition the key space. Typically the + // client will allocate a new id at startup and prepend the id to + // its cache keys. 
+ virtual uint64_t NewId() = 0; + + // returns the maximum configured capacity of the cache + virtual size_t GetCapacity() = 0; + + private: + void LRU_Remove(Handle* e); + void LRU_Append(Handle* e); + void Unref(Handle* e); + + struct Rep; + Rep* rep_; + + // No copying allowed + Cache(const Cache&); + void operator=(const Cache&); +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_CACHE_H_ diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h new file mode 100644 index 00000000..f24132a6 --- /dev/null +++ b/include/rocksdb/compaction_filter.h @@ -0,0 +1,93 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2013 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ +#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ + +#include + +namespace rocksdb { + +class Slice; + +// CompactionFilter allows an application to modify/delete a key-value at +// the time of compaction. + +class CompactionFilter { + public: + + // Context information of a compaction run + struct Context { + // Does this compaction run include all data files + bool is_full_compaction; + }; + + virtual ~CompactionFilter() {} + + // The compaction process invokes this + // method for kv that is being compacted. A return value + // of false indicates that the kv should be preserved in the + // output of this compaction run and a return value of true + // indicates that this key-value should be removed from the + // output of the compaction. 
The application can inspect + // the existing value of the key and make a decision based on it. + // + // When the value is to be preserved, the application has the option + // to modify the existing_value and pass it back through new_value. + // value_changed needs to be set to true in this case. + // + // If multithreaded compaction is being used *and* a single CompactionFilter + // instance was supplied via Options::compaction_filter, this method may be + // called from different threads concurrently. The application must ensure + // that the call is thread-safe. + // + // If the CompactionFilter was created by a factory, then it will only ever + // be used by a single thread that is doing the compaction run, and this + // call does not need to be thread-safe. However, multiple filters may be + // in existence and operating concurrently. + virtual bool Filter(int level, + const Slice& key, + const Slice& existing_value, + std::string* new_value, + bool* value_changed) const = 0; + + // Returns a name that identifies this compaction filter. + // The name will be printed to LOG file on start up for diagnosis. + virtual const char* Name() const = 0; +}; + +// Each compaction will create a new CompactionFilter allowing the +// application to know about different compactions +class CompactionFilterFactory { + public: + virtual ~CompactionFilterFactory() { }; + + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) = 0; + + // Returns a name that identifies this compaction filter factory. 
+ virtual const char* Name() const = 0; +}; + +// Default implementation of CompactionFilterFactory which does not +// return any filter +class DefaultCompactionFilterFactory : public CompactionFilterFactory { + public: + virtual std::unique_ptr + CreateCompactionFilter(const CompactionFilter::Context& context) override { + return std::unique_ptr(nullptr); + } + + virtual const char* Name() const override { + return "DefaultCompactionFilterFactory"; + } +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h new file mode 100644 index 00000000..f3a8499a --- /dev/null +++ b/include/rocksdb/comparator.h @@ -0,0 +1,67 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_ +#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_ + +#include + +namespace rocksdb { + +class Slice; + +// A Comparator object provides a total order across slices that are +// used as keys in an sstable or a database. A Comparator implementation +// must be thread-safe since rocksdb may invoke its methods concurrently +// from multiple threads. +class Comparator { + public: + virtual ~Comparator(); + + // Three-way comparison. Returns value: + // < 0 iff "a" < "b", + // == 0 iff "a" == "b", + // > 0 iff "a" > "b" + virtual int Compare(const Slice& a, const Slice& b) const = 0; + + // The name of the comparator. 
Used to check for comparator + // mismatches (i.e., a DB created with one comparator is + // accessed using a different comparator. + // + // The client of this package should switch to a new name whenever + // the comparator implementation changes in a way that will cause + // the relative ordering of any two keys to change. + // + // Names starting with "rocksdb." are reserved and should not be used + // by any clients of this package. + virtual const char* Name() const = 0; + + // Advanced functions: these are used to reduce the space requirements + // for internal data structures like index blocks. + + // If *start < limit, changes *start to a short string in [start,limit). + // Simple comparator implementations may return with *start unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const = 0; + + // Changes *key to a short string >= *key. + // Simple comparator implementations may return with *key unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortSuccessor(std::string* key) const = 0; +}; + +// Return a builtin comparator that uses lexicographic byte-wise +// ordering. The result remains the property of this module and +// must not be deleted. +extern const Comparator* BytewiseComparator(); + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_ diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h new file mode 100644 index 00000000..4bf09575 --- /dev/null +++ b/include/rocksdb/db.h @@ -0,0 +1,331 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_ +#define STORAGE_ROCKSDB_INCLUDE_DB_H_ + +#include +#include +#include +#include +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/types.h" +#include "rocksdb/transaction_log.h" + +namespace rocksdb { + +using std::unique_ptr; + +// Update Makefile if you change these +static const int kMajorVersion = 2; +static const int kMinorVersion = 0; + +struct Options; +struct ReadOptions; +struct WriteOptions; +struct FlushOptions; +class WriteBatch; + +// Metadata associated with each SST file. +struct LiveFileMetaData { + std::string name; // Name of the file + int level; // Level at which this file resides. + size_t size; // File size in bytes. + std::string smallestkey; // Smallest user defined key in the file. + std::string largestkey; // Largest user defined key in the file. + SequenceNumber smallest_seqno; // smallest seqno in file + SequenceNumber largest_seqno; // largest seqno in file +}; + +// Abstract handle to particular state of a DB. +// A Snapshot is an immutable object and can therefore be safely +// accessed from multiple threads without any external synchronization. +class Snapshot { + protected: + virtual ~Snapshot(); +}; + +// A range of keys +struct Range { + Slice start; // Included in the range + Slice limit; // Not included in the range + + Range() { } + Range(const Slice& s, const Slice& l) : start(s), limit(l) { } +}; + +// A DB is a persistent ordered map from keys to values. +// A DB is safe for concurrent access from multiple threads without +// any external synchronization. +class DB { + public: + // Open the database with the specified "name". + // Stores a pointer to a heap-allocated database in *dbptr and returns + // OK on success. + // Stores nullptr in *dbptr and returns a non-OK status on error. 
+ // Caller should delete *dbptr when it is no longer needed. + static Status Open(const Options& options, + const std::string& name, + DB** dbptr); + + // Open the database for read only. All DB interfaces + // that modify data, like put/delete, will return error. + // If the db is opened in read only mode, then no compactions + // will happen. + static Status OpenForReadOnly(const Options& options, + const std::string& name, DB** dbptr, + bool error_if_log_file_exist = false); + + DB() { } + virtual ~DB(); + + // Set the database entry for "key" to "value". + // Returns OK on success, and a non-OK status on error. + // Note: consider setting options.sync = true. + virtual Status Put(const WriteOptions& options, + const Slice& key, + const Slice& value) = 0; + + // Remove the database entry (if any) for "key". Returns OK on + // success, and a non-OK status on error. It is not an error if "key" + // did not exist in the database. + // Note: consider setting options.sync = true. + virtual Status Delete(const WriteOptions& options, const Slice& key) = 0; + + // Merge the database entry for "key" with "value". Returns OK on success, + // and a non-OK status on error. The semantics of this operation is + // determined by the user provided merge_operator when opening DB. + // Note: consider setting options.sync = true. + virtual Status Merge(const WriteOptions& options, + const Slice& key, + const Slice& value) = 0; + + // Apply the specified updates to the database. + // Returns OK on success, non-OK on failure. + // Note: consider setting options.sync = true. + virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0; + + // If the database contains an entry for "key" store the + // corresponding value in *value and return OK. + // + // If there is no entry for "key" leave *value unchanged and return + // a status for which Status::IsNotFound() returns true. + // + // May return some other Status on an error. 
+ virtual Status Get(const ReadOptions& options, + const Slice& key, + std::string* value) = 0; + + // If keys[i] does not exist in the database, then the i'th returned + // status will be one for which Status::IsNotFound() is true, and + // (*values)[i] will be set to some arbitrary value (often ""). Otherwise, + // the i'th returned status will have Status::ok() true, and (*values)[i] + // will store the value associated with keys[i]. + // + // (*values) will always be resized to be the same size as (keys). + // Similarly, the number of returned statuses will be the number of keys. + // Note: keys will not be "de-duplicated". Duplicate keys will return + // duplicate values in order. + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) = 0; + + // If the key definitely does not exist in the database, then this method + // returns false, else true. If the caller wants to obtain value when the key + // is found in memory, a bool for 'value_found' must be passed. 'value_found' + // will be true on return if value has been set properly. + // This check is potentially lighter-weight than invoking DB::Get(). One way + // to make this lighter weight is to avoid doing any IOs. + // Default implementation here returns true and sets 'value_found' to false + virtual bool KeyMayExist(const ReadOptions& options, + const Slice& key, + std::string* value, + bool* value_found = nullptr) { + if (value_found != nullptr) { + *value_found = false; + } + return true; + } + + // Return a heap-allocated iterator over the contents of the database. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + // + // Caller should delete the iterator when it is no longer needed. + // The returned iterator should be deleted before this db is deleted. 
+ virtual Iterator* NewIterator(const ReadOptions& options) = 0; + + // Return a handle to the current DB state. Iterators created with + // this handle will all observe a stable snapshot of the current DB + // state. The caller must call ReleaseSnapshot(result) when the + // snapshot is no longer needed. + virtual const Snapshot* GetSnapshot() = 0; + + // Release a previously acquired snapshot. The caller must not + // use "snapshot" after this call. + virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; + + // DB implementations can export properties about their state + // via this method. If "property" is a valid property understood by this + // DB implementation, fills "*value" with its current value and returns + // true. Otherwise returns false. + // + // + // Valid property names include: + // + // "rocksdb.num-files-at-level" - return the number of files at level , + // where is an ASCII representation of a level number (e.g. "0"). + // "rocksdb.stats" - returns a multi-line string that describes statistics + // about the internal operation of the DB. + // "rocksdb.sstables" - returns a multi-line string that describes all + // of the sstables that make up the db contents. + virtual bool GetProperty(const Slice& property, std::string* value) = 0; + + // For each i in [0,n-1], store in "sizes[i]", the approximate + // file system space used by keys in "[range[i].start .. range[i].limit)". + // + // Note that the returned sizes measure file system space usage, so + // if the user data compresses by a factor of ten, the returned + // sizes will be one-tenth the size of the corresponding user data size. + // + // The results may not include the sizes of recently written data. + virtual void GetApproximateSizes(const Range* range, int n, + uint64_t* sizes) = 0; + + // Compact the underlying storage for the key range [*begin,*end]. + // The actual compaction interval might be superset of [*begin, *end]. 
+ // In particular, deleted and overwritten versions are discarded, + // and the data is rearranged to reduce the cost of operations + // needed to access the data. This operation should typically only + // be invoked by users who understand the underlying implementation. + // + // begin==nullptr is treated as a key before all keys in the database. + // end==nullptr is treated as a key after all keys in the database. + // Therefore the following call will compact the entire database: + // db->CompactRange(nullptr, nullptr); + // Note that after the entire database is compacted, all data are pushed + // down to the last level containing any data. If the total data size + // after compaction is reduced, that level might not be appropriate for + // hosting all the files. In this case, client could set reduce_level + // to true, to move the files back to the minimum level capable of holding + // the data set or a given level (specified by non-negative target_level). + virtual void CompactRange(const Slice* begin, const Slice* end, + bool reduce_level = false, + int target_level = -1) = 0; + + // Number of levels used for this DB. + virtual int NumberLevels() = 0; + + // Maximum level to which a new compacted memtable is pushed if it + // does not create overlap. + virtual int MaxMemCompactionLevel() = 0; + + // Number of files in level-0 that would stop writes. + virtual int Level0StopWriteTrigger() = 0; + + // Get DB name -- the exact same name that was provided as an argument to + // DB::Open() + virtual const std::string& GetName() const = 0; + + // Get Env object from the DB + virtual Env* GetEnv() const = 0; + + // Get DB Options that we use + virtual const Options& GetOptions() const = 0; + + // Flush all mem-table data. + virtual Status Flush(const FlushOptions& options) = 0; + + // Prevent file deletions. Compactions will continue to occur, + // but no obsolete files will be deleted. Calling this multiple + // times have the same effect as calling it once. 
+ virtual Status DisableFileDeletions() = 0; + + // Allow compactions to delete obselete files. + // If force == true, the call to EnableFileDeletions() will guarantee that + // file deletions are enabled after the call, even if DisableFileDeletions() + // was called multiple times before. + // If force == false, EnableFileDeletions will only enable file deletion + // after it's been called at least as many times as DisableFileDeletions(), + // enabling the two methods to be called by two threads concurrently without + // synchronization -- i.e., file deletions will be enabled only after both + // threads call EnableFileDeletions() + virtual Status EnableFileDeletions(bool force = true) = 0; + + // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup + + // THIS METHOD IS DEPRECATED. Use the GetTableMetaData to get more + // detailed information on the live files. + // Retrieve the list of all files in the database. The files are + // relative to the dbname and are not absolute paths. The valid size of the + // manifest file is returned in manifest_file_size. The manifest file is an + // ever growing file, but only the portion specified by manifest_file_size is + // valid for this snapshot. + // Setting flush_memtable to true does Flush before recording the live files. + // Setting flush_memtable to false is useful when we don't want to wait for + // flush which may have to wait for compaction to complete taking an + // indeterminate time. But this will have to use GetSortedWalFiles after + // GetLiveFiles to compensate for memtables missed in this snapshot due to the + // absence of Flush, by WAL files to recover the database consistently later + virtual Status GetLiveFiles(std::vector&, + uint64_t* manifest_file_size, + bool flush_memtable = true) = 0; + + // Retrieve the sorted list of all wal files with earliest file first + virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0; + + // The sequence number of the most recent transaction. 
+ virtual SequenceNumber GetLatestSequenceNumber() const = 0; + + // Sets iter to an iterator that is positioned at a write-batch containing + // seq_number. If the sequence number is non existent, it returns an iterator + // at the first available seq_no after the requested seq_no + // Returns Status::OK if iterator is valid + // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to + // use this api, else the WAL files will get + // cleared aggressively and the iterator might keep getting invalid before + // an update is read. + virtual Status GetUpdatesSince(SequenceNumber seq_number, + unique_ptr* iter) = 0; + + // Delete the file name from the db directory and update the internal state to + // reflect that. Supports deletion of sst and log files only. 'name' must be + // path relative to the db directory. eg. 000001.sst, /archive/000003.log + virtual Status DeleteFile(std::string name) = 0; + + // Returns a list of all table files with their level, start key + // and end key + virtual void GetLiveFilesMetaData( + std::vector *metadata) { + } + + // Sets the globally unique ID created at database creation time by invoking + // Env::GenerateUniqueId(), in identity. Returns Status::OK if identity could + // be set properly + virtual Status GetDbIdentity(std::string& identity) = 0; + + private: + // No copying allowed + DB(const DB&); + void operator=(const DB&); +}; + +// Destroy the contents of the specified database. +// Be very careful using this method. +Status DestroyDB(const std::string& name, const Options& options); + +// If a DB cannot be opened, you may attempt to call this method to +// resurrect as much of the contents of the database as possible. +// Some data may be lost, so be careful when calling this function +// on a database that contains important information. 
+Status RepairDB(const std::string& dbname, const Options& options); + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_DB_H_ diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h new file mode 100644 index 00000000..73acbfab --- /dev/null +++ b/include/rocksdb/env.h @@ -0,0 +1,649 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An Env is an interface used by the rocksdb implementation to access +// operating system functionality like the filesystem etc. Callers +// may wish to provide a custom Env object when opening a database to +// get fine gain control; e.g., to rate limit file system operations. +// +// All Env implementations are safe for concurrent access from +// multiple threads without any external synchronization. 
+ +#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_ +#define STORAGE_ROCKSDB_INCLUDE_ENV_H_ + +#include +#include +#include +#include +#include +#include "rocksdb/status.h" + +namespace rocksdb { + +class FileLock; +class Logger; +class RandomAccessFile; +class SequentialFile; +class Slice; +class WritableFile; +class RandomRWFile; +struct Options; + +using std::unique_ptr; +using std::shared_ptr; + + +// Options while opening a file to read/write +struct EnvOptions { + + // construct with default Options + EnvOptions(); + + // construct from Options + explicit EnvOptions(const Options& options); + + // If true, then allow caching of data in environment buffers + bool use_os_buffer = true; + + // If true, then use mmap to read data + bool use_mmap_reads = false; + + // If true, then use mmap to write data + bool use_mmap_writes = true; + + // If true, set the FD_CLOEXEC on open fd. + bool set_fd_cloexec= true; + + // Allows OS to incrementally sync files to disk while they are being + // written, in the background. Issue one request for every bytes_per_sync + // written. 0 turns it off. + // Default: 0 + uint64_t bytes_per_sync = 0; +}; + +class Env { + public: + Env() { } + virtual ~Env(); + + // Return a default environment suitable for the current operating + // system. Sophisticated users may wish to provide their own Env + // implementation instead of relying on this default environment. + // + // The result of Default() belongs to rocksdb and must never be deleted. + static Env* Default(); + + // Create a brand new sequentially-readable file with the specified name. + // On success, stores a pointer to the new file in *result and returns OK. + // On failure stores nullptr in *result and returns non-OK. If the file does + // not exist, returns a non-OK status. + // + // The returned file will only be accessed by one thread at a time. 
+ virtual Status NewSequentialFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) + = 0; + + // Create a brand new random access read-only file with the + // specified name. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. If the file does not exist, returns a non-OK + // status. + // + // The returned file may be concurrently accessed by multiple threads. + virtual Status NewRandomAccessFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) + = 0; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) = 0; + + // Create an object that both reads and writes to a file on + // specified offsets (random access). If file already exists, + // does not overwrite it. On success, stores a pointer to the + // new file in *result and returns OK. On failure stores nullptr + // in *result and returns non-OK. + virtual Status NewRandomRWFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) = 0; + + // Returns true iff the named file exists. + virtual bool FileExists(const std::string& fname) = 0; + + // Store in *result the names of the children of the specified directory. + // The names are relative to "dir". + // Original contents of *results are dropped. + virtual Status GetChildren(const std::string& dir, + std::vector* result) = 0; + + // Delete the named file. + virtual Status DeleteFile(const std::string& fname) = 0; + + // Create the specified directory. 
Returns error if directory exists. + virtual Status CreateDir(const std::string& dirname) = 0; + + // Creates directory if missing. Return Ok if it exists, or successful in + // Creating. + virtual Status CreateDirIfMissing(const std::string& dirname) = 0; + + // Delete the specified directory. + virtual Status DeleteDir(const std::string& dirname) = 0; + + // Store the size of fname in *file_size. + virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0; + + // Store the last modification time of fname in *file_mtime. + virtual Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) = 0; + // Rename file src to target. + virtual Status RenameFile(const std::string& src, + const std::string& target) = 0; + + // Lock the specified file. Used to prevent concurrent access to + // the same db by multiple processes. On failure, stores nullptr in + // *lock and returns non-OK. + // + // On success, stores a pointer to the object that represents the + // acquired lock in *lock and returns OK. The caller should call + // UnlockFile(*lock) to release the lock. If the process exits, + // the lock will be automatically released. + // + // If somebody else already holds the lock, finishes immediately + // with a failure. I.e., this call does not wait for existing locks + // to go away. + // + // May create the named file if it does not already exist. + virtual Status LockFile(const std::string& fname, FileLock** lock) = 0; + + // Release the lock acquired by a previous successful call to LockFile. + // REQUIRES: lock was returned by a successful LockFile() call + // REQUIRES: lock has not already been unlocked. + virtual Status UnlockFile(FileLock* lock) = 0; + + enum Priority { LOW, HIGH, TOTAL }; + + // Arrange to run "(*function)(arg)" once in a background thread, in + // the thread pool specified by pri. By default, jobs go to the 'LOW' + // priority thread pool. + + // "function" may run in an unspecified thread. 
Multiple functions + // added to the same Env may run concurrently in different threads. + // I.e., the caller may not assume that background work items are + // serialized. + virtual void Schedule( + void (*function)(void* arg), + void* arg, + Priority pri = LOW) = 0; + + // Start a new thread, invoking "function(arg)" within the new thread. + // When "function(arg)" returns, the thread will be destroyed. + virtual void StartThread(void (*function)(void* arg), void* arg) = 0; + + // *path is set to a temporary directory that can be used for testing. It may + // or many not have just been created. The directory may or may not differ + // between runs of the same process, but subsequent calls will return the + // same directory. + virtual Status GetTestDirectory(std::string* path) = 0; + + // Create and return a log file for storing informational messages. + virtual Status NewLogger(const std::string& fname, + shared_ptr* result) = 0; + + // Returns the number of micro-seconds since some fixed point in time. Only + // useful for computing deltas of time. + virtual uint64_t NowMicros() = 0; + + // Returns the number of nano-seconds since some fixed point in time. Only + // useful for computing deltas of time in one run. + // Default implementation simply relies on NowMicros + virtual uint64_t NowNanos() { + return NowMicros() * 1000; + } + + // Sleep/delay the thread for the perscribed number of micro-seconds. + virtual void SleepForMicroseconds(int micros) = 0; + + // Get the current host name. + virtual Status GetHostName(char* name, uint64_t len) = 0; + + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). + virtual Status GetCurrentTime(int64_t* unix_time) = 0; + + // Get full directory name for this db. + virtual Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) = 0; + + // The number of background worker threads of a specific thread pool + // for this environment. 'LOW' is the default pool. 
+ // default number: 1 + virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0; + + // Converts seconds-since-Jan-01-1970 to a printable string + virtual std::string TimeToString(uint64_t time) = 0; + + // Generates a unique id that can be used to identify a db + virtual std::string GenerateUniqueId(); + + private: + // No copying allowed + Env(const Env&); + void operator=(const Env&); +}; + +// A file abstraction for reading sequentially through a file +class SequentialFile { + public: + SequentialFile() { } + virtual ~SequentialFile(); + + // Read up to "n" bytes from the file. "scratch[0..n-1]" may be + // written by this routine. Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // May set "*result" to point at data in "scratch[0..n-1]", so + // "scratch[0..n-1]" must be live when "*result" is used. + // If an error was encountered, returns a non-OK status. + // + // REQUIRES: External synchronization + virtual Status Read(size_t n, Slice* result, char* scratch) = 0; + + // Skip "n" bytes from the file. This is guaranteed to be no + // slower that reading the same data, but may be faster. + // + // If end of file is reached, skipping will stop at the end of the + // file, and Skip will return OK. + // + // REQUIRES: External synchronization + virtual Status Skip(uint64_t n) = 0; + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + virtual Status InvalidateCache(size_t offset, size_t length) { + return Status::NotSupported("InvalidateCache not supported."); + } +}; + +// A file abstraction for randomly reading the contents of a file. +class RandomAccessFile { + public: + RandomAccessFile() { } + virtual ~RandomAccessFile(); + + // Read up to "n" bytes from the file starting at "offset". 
+ // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. + // + // Safe for concurrent use by multiple threads. + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const = 0; + + // Tries to get an unique ID for this file that will be the same each time + // the file is opened (and will stay the same while the file is open). + // Furthermore, it tries to make this ID at most "max_size" bytes. If such an + // ID can be created this function returns the length of the ID and places it + // in "id"; otherwise, this function returns 0, in which case "id" + // may not have been modified. + // + // This function guarantees, for IDs from a given environment, two unique ids + // cannot be made equal to eachother by adding arbitrary bytes to one of + // them. That is, no unique ID is the prefix of another. + // + // This function guarantees that the returned ID will not be interpretable as + // a single varint. + // + // Note: these IDs are only valid for the duration of the process. + virtual size_t GetUniqueId(char* id, size_t max_size) const { + return 0; // Default implementation to prevent issues with backwards + // compatibility. + }; + + + enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED }; + + virtual void Hint(AccessPattern pattern) {} + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. 
+ virtual Status InvalidateCache(size_t offset, size_t length) { + return Status::NotSupported("InvalidateCache not supported."); + } +}; + +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. +class WritableFile { + public: + WritableFile() : last_preallocated_block_(0), preallocation_block_size_ (0) { + } + virtual ~WritableFile(); + + virtual Status Append(const Slice& data) = 0; + virtual Status Close() = 0; + virtual Status Flush() = 0; + virtual Status Sync() = 0; // sync data + + /* + * Sync data and/or metadata as well. + * By default, sync only data. + * Override this method for environments where we need to sync + * metadata as well. + */ + virtual Status Fsync() { + return Sync(); + } + + /* + * Get the size of valid data in the file. + */ + virtual uint64_t GetFileSize() { + return 0; + } + + /* + * Get and set the default pre-allocation block size for writes to + * this file. If non-zero, then Allocate will be used to extend the + * underlying storage of a file (generally via fallocate) if the Env + * instance supports it. + */ + void SetPreallocationBlockSize(size_t size) { + preallocation_block_size_ = size; + } + + virtual void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) { + *last_allocated_block = last_preallocated_block_; + *block_size = preallocation_block_size_; + } + + // For documentation, refer to RandomAccessFile::GetUniqueId() + virtual size_t GetUniqueId(char* id, size_t max_size) const { + return 0; // Default implementation to prevent issues with backwards + } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + // This call has no effect on dirty pages in the cache. 
+ virtual Status InvalidateCache(size_t offset, size_t length) { + return Status::NotSupported("InvalidateCache not supported."); + } + + protected: + // PrepareWrite performs any necessary preparation for a write + // before the write actually occurs. This allows for pre-allocation + // of space on devices where it can result in less file + // fragmentation and/or less waste from over-zealous filesystem + // pre-allocation. + void PrepareWrite(size_t offset, size_t len) { + if (preallocation_block_size_ == 0) { + return; + } + // If this write would cross one or more preallocation blocks, + // determine what the last preallocation block necesessary to + // cover this write would be and Allocate to that point. + const auto block_size = preallocation_block_size_; + size_t new_last_preallocated_block = + (offset + len + block_size - 1) / block_size; + if (new_last_preallocated_block > last_preallocated_block_) { + size_t num_spanned_blocks = + new_last_preallocated_block - last_preallocated_block_; + Allocate(block_size * last_preallocated_block_, + block_size * num_spanned_blocks); + last_preallocated_block_ = new_last_preallocated_block; + } + } + + /* + * Pre-allocate space for a file. + */ + virtual Status Allocate(off_t offset, off_t len) { + return Status::OK(); + } + + // Sync a file range with disk. + // offset is the starting byte of the file range to be synchronized. + // nbytes specifies the length of the range to be synchronized. + // This asks the OS to initiate flushing the cached data to disk, + // without waiting for completion. + // Default implementation does nothing. + virtual Status RangeSync(off_t offset, off_t nbytes) { + return Status::OK(); + } + + private: + size_t last_preallocated_block_; + size_t preallocation_block_size_; + // No copying allowed + WritableFile(const WritableFile&); + void operator=(const WritableFile&); +}; + +// A file abstraction for random reading and writing. 
+class RandomRWFile { + public: + RandomRWFile() {} + virtual ~RandomRWFile() {} + + // Write data from Slice data to file starting from offset + // Returns IOError on failure, but does not guarantee + // atomicity of a write. Returns OK status on success. + // + // Safe for concurrent use. + virtual Status Write(uint64_t offset, const Slice& data) = 0; + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. + // + // Safe for concurrent use by multiple threads. + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const = 0; + virtual Status Close() = 0; // closes the file + virtual Status Sync() = 0; // sync data + + /* + * Sync data and/or metadata as well. + * By default, sync only data. + * Override this method for environments where we need to sync + * metadata as well. + */ + virtual Status Fsync() { + return Sync(); + } + + /* + * Pre-allocate space for a file. + */ + virtual Status Allocate(off_t offset, off_t len) { + return Status::OK(); + } + + private: + // No copying allowed + RandomRWFile(const RandomRWFile&); + void operator=(const RandomRWFile&); +}; + +// An interface for writing log messages. +class Logger { + public: + enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 }; + Logger() { } + virtual ~Logger(); + + // Write an entry to the log file with the specified format. 
+ virtual void Logv(const char* format, va_list ap) = 0; + virtual size_t GetLogFileSize() const { + return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE; + } + // Flush to the OS buffers + virtual void Flush() {} + + private: + // No copying allowed + Logger(const Logger&); + void operator=(const Logger&); +}; + + +// Identifies a locked file. +class FileLock { + public: + FileLock() { } + virtual ~FileLock(); + private: + // No copying allowed + FileLock(const FileLock&); + void operator=(const FileLock&); +}; + + +extern void LogFlush(const shared_ptr& info_log); + +// Log the specified data to *info_log if info_log is non-nullptr. +extern void Log(const shared_ptr& info_log, const char* format, ...) +# if defined(__GNUC__) || defined(__clang__) + __attribute__((__format__ (__printf__, 2, 3))) +# endif + ; + +extern void LogFlush(Logger *info_log); + +extern void Log(Logger* info_log, const char* format, ...) +# if defined(__GNUC__) || defined(__clang__) + __attribute__((__format__ (__printf__, 2, 3))) +# endif + ; + +// A utility routine: write "data" to the named file. +extern Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname); + +// A utility routine: read contents of named file into *data +extern Status ReadFileToString(Env* env, const std::string& fname, + std::string* data); + +// An implementation of Env that forwards all calls to another Env. +// May be useful to clients who wish to override just part of the +// functionality of another Env. 
+class EnvWrapper : public Env { + public: + // Initialize an EnvWrapper that delegates all calls to *t + explicit EnvWrapper(Env* t) : target_(t) { } + virtual ~EnvWrapper(); + + // Return the target to which this Env forwards all calls + Env* target() const { return target_; } + + // The following text is boilerplate that forwards all methods to target() + Status NewSequentialFile(const std::string& f, + unique_ptr* r, + const EnvOptions& options) { + return target_->NewSequentialFile(f, r, options); + } + Status NewRandomAccessFile(const std::string& f, + unique_ptr* r, + const EnvOptions& options) { + return target_->NewRandomAccessFile(f, r, options); + } + Status NewWritableFile(const std::string& f, unique_ptr* r, + const EnvOptions& options) { + return target_->NewWritableFile(f, r, options); + } + Status NewRandomRWFile(const std::string& f, unique_ptr* r, + const EnvOptions& options) { + return target_->NewRandomRWFile(f, r, options); + } + bool FileExists(const std::string& f) { return target_->FileExists(f); } + Status GetChildren(const std::string& dir, std::vector* r) { + return target_->GetChildren(dir, r); + } + Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); } + Status CreateDir(const std::string& d) { return target_->CreateDir(d); } + Status CreateDirIfMissing(const std::string& d) { + return target_->CreateDirIfMissing(d); + } + Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); } + Status GetFileSize(const std::string& f, uint64_t* s) { + return target_->GetFileSize(f, s); + } + + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) { + return target_->GetFileModificationTime(fname, file_mtime); + } + + Status RenameFile(const std::string& s, const std::string& t) { + return target_->RenameFile(s, t); + } + Status LockFile(const std::string& f, FileLock** l) { + return target_->LockFile(f, l); + } + Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); } + void 
Schedule(void (*f)(void*), void* a, Priority pri) { + return target_->Schedule(f, a, pri); + } + void StartThread(void (*f)(void*), void* a) { + return target_->StartThread(f, a); + } + virtual Status GetTestDirectory(std::string* path) { + return target_->GetTestDirectory(path); + } + virtual Status NewLogger(const std::string& fname, + shared_ptr* result) { + return target_->NewLogger(fname, result); + } + uint64_t NowMicros() { + return target_->NowMicros(); + } + void SleepForMicroseconds(int micros) { + target_->SleepForMicroseconds(micros); + } + Status GetHostName(char* name, uint64_t len) { + return target_->GetHostName(name, len); + } + Status GetCurrentTime(int64_t* unix_time) { + return target_->GetCurrentTime(unix_time); + } + Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) { + return target_->GetAbsolutePath(db_path, output_path); + } + void SetBackgroundThreads(int num, Priority pri) { + return target_->SetBackgroundThreads(num, pri); + } + std::string TimeToString(uint64_t time) { + return target_->TimeToString(time); + } + + private: + Env* target_; +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_ENV_H_ diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h new file mode 100644 index 00000000..fa44db45 --- /dev/null +++ b/include/rocksdb/filter_policy.h @@ -0,0 +1,74 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A database can be configured with a custom FilterPolicy object. 
+// This object is responsible for creating a small filter from a set +// of keys. These filters are stored in rocksdb and are consulted +// automatically by rocksdb to decide whether or not to read some +// information from disk. In many cases, a filter can cut down the +// number of disk seeks from a handful to a single disk seek per +// DB::Get() call. +// +// Most people will want to use the builtin bloom filter support (see +// NewBloomFilterPolicy() below). + +#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ +#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ + +#include + +namespace rocksdb { + +class Slice; + +class FilterPolicy { + public: + virtual ~FilterPolicy(); + + // Return the name of this policy. Note that if the filter encoding + // changes in an incompatible way, the name returned by this method + // must be changed. Otherwise, old incompatible filters may be + // passed to methods of this type. + virtual const char* Name() const = 0; + + // keys[0,n-1] contains a list of keys (potentially with duplicates) + // that are ordered according to the user supplied comparator. + // Append a filter that summarizes keys[0,n-1] to *dst. + // + // Warning: do not change the initial contents of *dst. Instead, + // append the newly constructed filter to *dst. + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) + const = 0; + + // "filter" contains the data appended by a preceding call to + // CreateFilter() on this class. This method must return true if + // the key was in the list of keys passed to CreateFilter(). + // This method may return true or false if the key was not on the + // list, but it should aim to return false with a high probability. + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0; +}; + +// Return a new filter policy that uses a bloom filter with approximately +// the specified number of bits per key. A good value for bits_per_key +// is 10, which yields a filter with ~ 1% false positive rate.
+// +// Callers must delete the result after any database that is using the +// result has been closed. +// +// Note: if you are using a custom comparator that ignores some parts +// of the keys being compared, you must not use NewBloomFilterPolicy() +// and must provide your own FilterPolicy that also ignores the +// corresponding parts of the keys. For example, if the comparator +// ignores trailing spaces, it would be incorrect to use a +// FilterPolicy (like NewBloomFilterPolicy) that does not ignore +// trailing spaces in keys. +extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key); + +} + +#endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ diff --git a/include/rocksdb/flush_block_policy.h b/include/rocksdb/flush_block_policy.h new file mode 100644 index 00000000..1740d879 --- /dev/null +++ b/include/rocksdb/flush_block_policy.h @@ -0,0 +1,64 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include + +namespace rocksdb { + +class Slice; +class BlockBuilder; + +// FlushBlockPolicy provides a configurable way to determine when to flush a +// block in the block based tables, +class FlushBlockPolicy { + public: + // Keep track of the key/value sequences and return the boolean value to + // determine if table builder should flush current data block. + virtual bool Update(const Slice& key, + const Slice& value) = 0; + + virtual ~FlushBlockPolicy() { } +}; + +class FlushBlockPolicyFactory { + public: + // Return the name of the flush block policy. + virtual const char* Name() const = 0; + + // Return a new block flush policy that flushes data blocks by data size. + // FlushBlockPolicy may need to access the metadata of the data block + // builder to determine when to flush the blocks. 
+ // + // Callers must delete the result after any database that is using the + // result has been closed. + virtual FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBuilder& data_block_builder) const = 0; + + virtual ~FlushBlockPolicyFactory() { } +}; + +class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory { + public: + FlushBlockBySizePolicyFactory(const uint64_t block_size, + const uint64_t block_size_deviation) : + block_size_(block_size), + block_size_deviation_(block_size_deviation) { + } + + virtual const char* Name() const override { + return "FlushBlockBySizePolicyFactory"; + } + + virtual FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBuilder& data_block_builder) const override; + + private: + const uint64_t block_size_; + const uint64_t block_size_deviation_; +}; + +} // rocksdb diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h new file mode 100644 index 00000000..7538e9cf --- /dev/null +++ b/include/rocksdb/iterator.h @@ -0,0 +1,106 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An iterator yields a sequence of key/value pairs from a source. +// The following class defines the interface. Multiple implementations +// are provided by this library. In particular, iterators are provided +// to access the contents of a Table or a DB. 
+// +// Multiple threads can invoke const methods on an Iterator without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Iterator must use +// external synchronization. + +#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ +#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ + +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +class Iterator { + public: + Iterator(); + virtual ~Iterator(); + + // An iterator is either positioned at a key/value pair, or + // not valid. This method returns true iff the iterator is valid. + virtual bool Valid() const = 0; + + // Position at the first key in the source. The iterator is Valid() + // after this call iff the source is not empty. + virtual void SeekToFirst() = 0; + + // Position at the last key in the source. The iterator is + // Valid() after this call iff the source is not empty. + virtual void SeekToLast() = 0; + + // Position at the first key in the source that is at or past target. + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or past target. + virtual void Seek(const Slice& target) = 0; + + // Moves to the next entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the last entry in the source. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Moves to the previous entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the first entry in source. + // REQUIRES: Valid() + virtual void Prev() = 0; + + // Return the key for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: Valid() + virtual Slice key() const = 0; + + // Return the value for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator.
+ // REQUIRES: Valid() + virtual Slice value() const = 0; + + // If an error has occurred, return it. Else return an ok status. + // If non-blocking IO is requested and this operation cannot be + // satisfied without doing some IO, then this returns Status::Incomplete(). + virtual Status status() const = 0; + + // Clients are allowed to register function/arg1/arg2 triples that + // will be invoked when this iterator is destroyed. + // + // Note that unlike all of the preceding methods, this method is + // not abstract and therefore clients should not override it. + typedef void (*CleanupFunction)(void* arg1, void* arg2); + void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); + + private: + struct Cleanup { + CleanupFunction function; + void* arg1; + void* arg2; + Cleanup* next; + }; + Cleanup cleanup_; + + // No copying allowed + Iterator(const Iterator&); + void operator=(const Iterator&); +}; + +// Return an empty iterator (yields nothing). +extern Iterator* NewEmptyIterator(); + +// Return an empty iterator with the specified status. +extern Iterator* NewErrorIterator(const Status& status); + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ diff --git a/include/rocksdb/ldb_tool.h b/include/rocksdb/ldb_tool.h new file mode 100644 index 00000000..a46b6a75 --- /dev/null +++ b/include/rocksdb/ldb_tool.h @@ -0,0 +1,18 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory.
+#ifndef STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H +#define STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H +#include "rocksdb/options.h" + +namespace rocksdb { + +class LDBTool { + public: + void Run(int argc, char** argv, Options = Options()); +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h new file mode 100644 index 00000000..fcb782d4 --- /dev/null +++ b/include/rocksdb/memtablerep.h @@ -0,0 +1,203 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file contains the interface that must be implemented by any collection +// to be used as the backing store for a MemTable. Such a collection must +// satisfy the following properties: +// (1) It does not store duplicate items. +// (2) It uses MemTableRep::KeyComparator to compare items for iteration and +// equality. +// (3) It can be accessed concurrently by multiple readers and can support +// writes during reads. However, it needn't support multiple concurrent writes. +// (4) Items are never deleted. +// The liberal use of assertions is encouraged to enforce (1). +// +// The factory will be passed an Arena object when a new MemTableRep is +// requested. The API for this object is in rocksdb/arena.h. +// +// Users can implement their own memtable representations. We include three +// types built in: +// - SkipListRep: This is the default; it is backed by a skip list. +// - HashSkipListRep: The memtable rep that is best used for keys that are +// structured like "prefix:suffix" where iteration within a prefix is +// common and iteration across different prefixes is rare. It is backed by +// a hash map where each bucket is a skip list.
+// - VectorRep: This is backed by an unordered std::vector. On iteration, the +// vector is sorted. It is intelligent about sorting; once the MarkReadOnly() +// has been called, the vector will only be sorted once. It is optimized for +// random-write-heavy workloads. +// +// The last two implementations are designed for situations in which +// iteration over the entire collection is rare since doing so requires all the +// keys to be copied into a sorted data structure. + +#ifndef STORAGE_ROCKSDB_DB_MEMTABLEREP_H_ +#define STORAGE_ROCKSDB_DB_MEMTABLEREP_H_ + +#include +#include "rocksdb/arena.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" + +namespace rocksdb { + +class MemTableRep { + public: + // KeyComparator provides a means to compare keys, which are internal keys + // concatenated with values. + class KeyComparator { + public: + // Compare a and b. Return a negative value if a is less than b, 0 if they + // are equal, and a positive value if a is greater than b + virtual int operator()(const char* a, const char* b) const = 0; + + virtual ~KeyComparator() { } + }; + + // Insert key into the collection. (The caller will pack key and value into a + // single buffer and pass that in as the parameter to Insert) + // REQUIRES: nothing that compares equal to key is currently in the + // collection. + virtual void Insert(const char* key) = 0; + + // Returns true iff an entry that compares equal to key is in the collection. + virtual bool Contains(const char* key) const = 0; + + // Notify this table rep that it will no longer be added to. By default, does + // nothing. + virtual void MarkReadOnly() { } + + // Report an approximation of how much memory has been used other than memory + // that was allocated through the arena. + virtual size_t ApproximateMemoryUsage() = 0; + + virtual ~MemTableRep() { } + + // Iteration over the contents of a skip collection + class Iterator { + public: + // Initialize an iterator over the specified collection.
+ // The returned iterator is not valid. + // explicit Iterator(const MemTableRep* collection); + virtual ~Iterator() { }; + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const = 0; + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const = 0; + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() = 0; + + // Advance to the first entry with a key >= target + virtual void Seek(const char* target) = 0; + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() = 0; + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() = 0; + }; + + // Return an iterator over the keys in this representation. + virtual std::shared_ptr GetIterator() = 0; + + // Return an iterator over at least the keys with the specified user key. The + // iterator may also allow access to other keys, but doesn't have to. Default: + // GetIterator(). + virtual std::shared_ptr GetIterator(const Slice& user_key) { + return GetIterator(); + } + + // Return an iterator over at least the keys with the specified prefix. The + // iterator may also allow access to other keys, but doesn't have to. Default: + // GetIterator(). + virtual std::shared_ptr GetPrefixIterator(const Slice& prefix) { + return GetIterator(); + } + + // Return an iterator that has a special Seek semantics. The result of + // a Seek might only include keys with the same prefix as the target key. + virtual std::shared_ptr GetDynamicPrefixIterator() { + return GetIterator(); + } + + protected: + // When *key is an internal key concatenated with the value, returns the + // user key. 
+ virtual Slice UserKey(const char* key) const; +}; + +// This is the base class for all factories that are used by RocksDB to create +// new MemTableRep objects +class MemTableRepFactory { + public: + virtual ~MemTableRepFactory() { }; + virtual std::shared_ptr CreateMemTableRep( + MemTableRep::KeyComparator&, Arena*) = 0; + virtual const char* Name() const = 0; +}; + +// This creates MemTableReps that are backed by an std::vector. On iteration, +// the vector is sorted. This is useful for workloads where iteration is very +// rare and writes are generally not issued after reads begin. +// +// Parameters: +// count: Passed to the constructor of the underlying std::vector of each +// VectorRep. On initialization, the underlying array will be at least count +// bytes reserved for usage. +class VectorRepFactory : public MemTableRepFactory { + const size_t count_; +public: + explicit VectorRepFactory(size_t count = 0) : count_(count) { } + virtual std::shared_ptr CreateMemTableRep( + MemTableRep::KeyComparator&, Arena*) override; + virtual const char* Name() const override { + return "VectorRepFactory"; + } +}; + +// This uses a skip list to store keys. It is the default. +class SkipListFactory : public MemTableRepFactory { +public: + virtual std::shared_ptr CreateMemTableRep( + MemTableRep::KeyComparator&, Arena*) override; + virtual const char* Name() const override { + return "SkipListFactory"; + } +}; + +// HashSkipListRep is backed by hash map of buckets. Each bucket is a skip +// list. All the keys with the same prefix will be in the same bucket. +// The prefix is determined using user supplied SliceTransform. It has +// to match prefix_extractor in options.prefix_extractor. +// +// Iteration over the entire collection is implemented by dumping all the keys +// into a separate skip list. Thus, these data structures are best used when +// iteration over the entire collection is rare. 
+// +// Parameters: +// transform: The prefix extractor that returns prefix when supplied a user +// key. Has to match options.prefix_extractor +// bucket_count: Number of buckets in a hash_map. Each bucket needs +// 8 bytes. By default, we set buckets to one million, which +// will take 8MB of memory. If you know the number of keys you'll +// keep in hash map, set bucket count to be approximately twice +// the number of keys +extern MemTableRepFactory* NewHashSkipListRepFactory( + const SliceTransform* transform, size_t bucket_count = 1000000); + +} + +#endif // STORAGE_ROCKSDB_DB_MEMTABLEREP_H_ diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h new file mode 100644 index 00000000..bd4c36c0 --- /dev/null +++ b/include/rocksdb/merge_operator.h @@ -0,0 +1,149 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_ +#define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_ + +#include +#include +#include +#include "rocksdb/slice.h" + +namespace rocksdb { + +class Slice; +class Logger; + +// The Merge Operator +// +// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only +// client knows. It could be numeric addition, list append, string +// concatenation, edit data structure, ... , anything. +// The library, on the other hand, is concerned with the exercise of this +// interface, at the right time (during get, iteration, compaction...) 
+// +// To use merge, the client needs to provide an object implementing one of +// the following interfaces: +// a) AssociativeMergeOperator - for most simple semantics (always take +// two values, and merge them into one value, which is then put back +// into rocksdb); numeric addition and string concatenation are examples; +// +// b) MergeOperator - the generic class for all the more abstract / complex +// operations; one method (FullMerge) to merge a Put/Delete value with a +// merge operand; and another method (PartialMerge) that merges two +// operands together. this is especially useful if your key values have a +// complex structure but you would still like to support client-specific +// incremental updates. +// +// AssociativeMergeOperator is simpler to implement. MergeOperator is simply +// more powerful. +// +// Refer to rocksdb-merge wiki for more details and example implementations. +// +class MergeOperator { + public: + virtual ~MergeOperator() {} + + // Gives the client a way to express the read -> modify -> write semantics + // key: (IN) The key that's associated with this merge operation. + // Client could multiplex the merge operator based on it + // if the key space is partitioned and different subspaces + // refer to different types of data which have different + // merge operation semantics + // existing: (IN) null indicates that the key does not exist before this op + // operand_list:(IN) the sequence of merge operations to apply, front() first. + // new_value:(OUT) Client is responsible for filling the merge result here + // logger: (IN) Client could use this to log errors during merge. + // + // Return true on success. + // All values passed in will be client-specific values. So if this method + // returns false, it is because client specified bad data or there was + // internal corruption. This will be treated as an error by the library. + // + // Also make use of the *logger for error messages. 
+ virtual bool FullMerge(const Slice& key, + const Slice* existing_value, + const std::deque& operand_list, + std::string* new_value, + Logger* logger) const = 0; + + // This function performs merge(left_op, right_op) + // when both the operands are themselves merge operation types + // that you would have passed to a DB::Merge() call in the same order + // (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)). + // + // PartialMerge should combine them into a single merge operation that is + // saved into *new_value, and then it should return true. + // *new_value should be constructed such that a call to + // DB::Merge(key, *new_value) would yield the same result as a call + // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op). + // + // If it is impossible or infeasible to combine the two operations, + // leave new_value unchanged and return false. The library will + // internally keep track of the operations, and apply them in the + // correct order once a base-value (a Put/Delete/End-of-Database) is seen. + // + // TODO: Presently there is no way to differentiate between error/corruption + // and simply "return false". For now, the client should simply return + // false in any case it cannot perform partial-merge, regardless of reason. + // If there is corruption in the data, handle it in the FullMerge() function, + // and return false there. + virtual bool PartialMerge(const Slice& key, + const Slice& left_operand, + const Slice& right_operand, + std::string* new_value, + Logger* logger) const = 0; + + // The name of the MergeOperator. Used to check for MergeOperator + // mismatches (i.e., a DB created with one MergeOperator is + // accessed using a different MergeOperator) + // TODO: the name is currently not stored persistently and thus + // no checking is enforced. Client is responsible for providing + // consistent MergeOperator between DB opens. 
+ virtual const char* Name() const = 0; +}; + +// The simpler, associative merge operator. +class AssociativeMergeOperator : public MergeOperator { + public: + virtual ~AssociativeMergeOperator() {} + + // Gives the client a way to express the read -> modify -> write semantics + // key: (IN) The key that's associated with this merge operation. + // existing_value:(IN) null indicates the key does not exist before this op + // value: (IN) the value to update/merge the existing_value with + // new_value: (OUT) Client is responsible for filling the merge result here + // logger: (IN) Client could use this to log errors during merge. + // + // Return true on success. + // All values passed in will be client-specific values. So if this method + // returns false, it is because client specified bad data or there was + // internal corruption. The client should assume that this will be treated + // as an error by the library. + virtual bool Merge(const Slice& key, + const Slice* existing_value, + const Slice& value, + std::string* new_value, + Logger* logger) const = 0; + + + private: + // Default implementations of the MergeOperator functions + virtual bool FullMerge(const Slice& key, + const Slice* existing_value, + const std::deque& operand_list, + std::string* new_value, + Logger* logger) const override; + + virtual bool PartialMerge(const Slice& key, + const Slice& left_operand, + const Slice& right_operand, + std::string* new_value, + Logger* logger) const override; +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_ diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h new file mode 100644 index 00000000..b84bdcf3 --- /dev/null +++ b/include/rocksdb/options.h @@ -0,0 +1,768 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ +#define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ + +#include +#include +#include +#include +#include + +#include "rocksdb/memtablerep.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/universal_compaction.h" + +namespace rocksdb { + +class Cache; +class CompactionFilter; +class CompactionFilterFactory; +class Comparator; +class Env; +class FilterPolicy; +class Logger; +class MergeOperator; +class Snapshot; +class TableFactory; + +using std::shared_ptr; + +// DB contents are stored in a set of blocks, each of which holds a +// sequence of key,value pairs. Each block may be compressed before +// being stored in a file. The following enum describes which +// compression method (if any) is used to compress a block. +enum CompressionType : char { + // NOTE: do not change the values of existing entries, as these are + // part of the persistent format on disk. 
+ kNoCompression = 0x0, + kSnappyCompression = 0x1, + kZlibCompression = 0x2, + kBZip2Compression = 0x3 +}; + +enum CompactionStyle : char { + kCompactionStyleLevel = 0x0, // level based compaction style + kCompactionStyleUniversal = 0x1 // Universal compaction style +}; + +// Compression options for different compression algorithms like Zlib +struct CompressionOptions { + int window_bits; + int level; + int strategy; + CompressionOptions() : window_bits(-14), level(-1), strategy(0) {} + CompressionOptions(int wbits, int lev, int strategy) + : window_bits(wbits), level(lev), strategy(strategy) {} +}; + +// Options to control the behavior of a database (passed to DB::Open) +struct Options { + // ------------------- + // Parameters that affect behavior + + // Comparator used to define the order of keys in the table. + // Default: a comparator that uses lexicographic byte-wise ordering + // + // REQUIRES: The client must ensure that the comparator supplied + // here has the same name and orders keys *exactly* the same as the + // comparator provided to previous open calls on the same DB. + const Comparator* comparator; + + // REQUIRES: The client must provide a merge operator if Merge operation + // needs to be accessed. Calling Merge on a DB without a merge operator + // would result in Status::NotSupported. The client must ensure that the + // merge operator supplied here has the same name and *exactly* the same + // semantics as the merge operator provided to previous open calls on + // the same DB. The only exception is reserved for upgrade, where a DB + // previously without a merge operator is introduced to Merge operation + // for the first time. It's necessary to specify a merge operator when + // opening the DB in this case. + // Default: nullptr + shared_ptr merge_operator; + + // A single CompactionFilter instance to call into during compaction. + // Allows an application to modify/delete a key-value during background + // compaction.
+ // + // If the client requires a new compaction filter to be used for different + // compaction runs, it can specify compaction_filter_factory instead of this + // option. The client should specify only one of the two. + // compaction_filter takes precedence over compaction_filter_factory if + // client specifies both. + // + // If multithreaded compaction is being used, the supplied CompactionFilter + // instance may be used from different threads concurrently and so should be + // thread-safe. + // + // Default: nullptr + const CompactionFilter* compaction_filter; + + // This is a factory that provides compaction filter objects which allow + // an application to modify/delete a key-value during background compaction. + // + // A new filter will be created on each compaction run. If multithreaded + // compaction is being used, each created CompactionFilter will only be used + // from a single thread and so does not need to be thread-safe. + // + // Default: a factory that doesn't provide any object + std::shared_ptr compaction_filter_factory; + + // If true, the database will be created if it is missing. + // Default: false + bool create_if_missing; + + // If true, an error is raised if the database already exists. + // Default: false + bool error_if_exists; + + // If true, the implementation will do aggressive checking of the + // data it is processing and will stop early if it detects any + // errors. This may have unforeseen ramifications: for example, a + // corruption of one DB entry may cause a large number of entries to + // become unreadable or for the entire DB to become unopenable. + // If any of the writes to the database fails (Put, Delete, Merge, Write), + // the database will switch to read-only mode and fail all other + // Write operations. + // Default: false + bool paranoid_checks; + + // Use the specified object to interact with the environment, + // e.g. to read/write files, schedule background work, etc. 
+ // Default: Env::Default() + Env* env; + + // Any internal progress/error information generated by the db will + // be written to info_log if it is non-nullptr, or to a file stored + // in the same directory as the DB contents if info_log is nullptr. + // Default: nullptr + shared_ptr info_log; + + // ------------------- + // Parameters that affect performance + + // Amount of data to build up in memory (backed by an unsorted log + // on disk) before converting to a sorted on-disk file. + // + // Larger values increase performance, especially during bulk loads. + // Up to max_write_buffer_number write buffers may be held in memory + // at the same time, + // so you may wish to adjust this parameter to control memory usage. + // Also, a larger write buffer will result in a longer recovery time + // the next time the database is opened. + // + // Default: 4MB + size_t write_buffer_size; + + // The maximum number of write buffers that are built up in memory. + // The default is 2, so that when 1 write buffer is being flushed to + // storage, new writes can continue to the other write buffer. + // Default: 2 + int max_write_buffer_number; + + // The minimum number of write buffers that will be merged together + // before writing to storage. If set to 1, then + // all write buffers are flushed to L0 as individual files and this increases + // read amplification because a get request has to check in all of these + // files. Also, an in-memory merge may result in writing less + // data to storage if there are duplicate records in each of these + // individual write buffers. Default: 1 + int min_write_buffer_number_to_merge; + + // Number of open files that can be used by the DB. You may need to + // increase this if your database has a large working set (budget + // one open file per 2MB of working set). + // + // Default: 1000 + int max_open_files; + + // Control over blocks (user data is stored in a set of blocks, and + // a block is the unit of reading from disk).
+ + // If non-NULL use the specified cache for blocks. + // If NULL, rocksdb will automatically create and use an 8MB internal cache. + // Default: nullptr + shared_ptr block_cache; + + // If non-NULL use the specified cache for compressed blocks. + // If NULL, rocksdb will not use a compressed block cache. + // Default: nullptr + shared_ptr block_cache_compressed; + + // Approximate size of user data packed per block. Note that the + // block size specified here corresponds to uncompressed data. The + // actual size of the unit read from disk may be smaller if + // compression is enabled. This parameter can be changed dynamically. + // + // Default: 4K + size_t block_size; + + // Number of keys between restart points for delta encoding of keys. + // This parameter can be changed dynamically. Most clients should + // leave this parameter alone. + // + // Default: 16 + int block_restart_interval; + + // Compress blocks using the specified compression algorithm. This + // parameter can be changed dynamically. + // + // Default: kSnappyCompression, which gives lightweight but fast + // compression. + // + // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: + // ~200-500MB/s compression + // ~400-800MB/s decompression + // Note that these speeds are significantly faster than most + // persistent storage speeds, and therefore it is typically never + // worth switching to kNoCompression. Even if the input data is + // incompressible, the kSnappyCompression implementation will + // efficiently detect that and will switch to uncompressed mode. + CompressionType compression; + + // Different levels can have different compression policies. There + // are cases where most lower levels would like to quick compression + // algorithm while the higher levels (which have more data) use + // compression algorithms that have better compression but could + // be slower. This array, if non nullptr, should have an entry for + // each level of the database. 
This array, if non nullptr, overrides the + // value specified in the previous field 'compression'. The caller is + // responsible for allocating memory and initializing the values in it + // before invoking Open(). The caller is responsible for freeing this + // array and it could be freed anytime after the return from Open(). + // This could have been a std::vector but that makes the equivalent + // java/C api hard to construct. + std::vector compression_per_level; + + // different options for compression algorithms + CompressionOptions compression_opts; + + // If non-nullptr, use the specified filter policy to reduce disk reads. + // Many applications will benefit from passing the result of + // NewBloomFilterPolicy() here. + // + // Default: nullptr + const FilterPolicy* filter_policy; + + // If non-nullptr, use the specified function to determine the + // prefixes for keys. These prefixes will be placed in the filter. + // Depending on the workload, this can reduce the number of read-IOP + // cost for scans when a prefix is passed via ReadOptions to + // db.NewIterator(). For prefix filtering to work properly, + // "prefix_extractor" and "comparator" must be such that the following + // properties hold: + // + // 1) key.starts_with(prefix(key)) + // 2) Compare(prefix(key), key) <= 0. + // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0 + // 4) prefix(prefix(key)) == prefix(key) + // + // Default: nullptr + const SliceTransform* prefix_extractor; + + // If true, place whole keys in the filter (not just prefixes). + // This must generally be true for gets to be efficient. + // + // Default: true + bool whole_key_filtering; + + // Number of levels for this database + int num_levels; + + // Number of files to trigger level-0 compaction. A value <0 means that + // level-0 compaction will not be triggered by number of files at all. + int level0_file_num_compaction_trigger; + + // Soft limit on number of level-0 files.
We start slowing down writes at this + // point. A value <0 means that no writing slow down will be triggered by + // number of files in level-0. + int level0_slowdown_writes_trigger; + + // Maximum number of level-0 files. We stop writes at this point. + int level0_stop_writes_trigger; + + // Maximum level to which a new compacted memtable is pushed if it + // does not create overlap. We try to push to level 2 to avoid the + // relatively expensive level 0=>1 compactions and to avoid some + // expensive manifest file operations. We do not push all the way to + // the largest level since that can generate a lot of wasted disk + // space if the same key space is being repeatedly overwritten. + int max_mem_compaction_level; + + // Target file size for compaction. + // target_file_size_base is per-file size for level-1. + // Target file size for level L can be calculated by + // target_file_size_base * (target_file_size_multiplier ^ (L-1)) + // For example, if target_file_size_base is 2MB and + // target_file_size_multiplier is 10, then each file on level-1 will + // be 2MB, and each file on level 2 will be 20MB, + // and each file on level-3 will be 200MB. + + // by default target_file_size_base is 2MB. + int target_file_size_base; + // by default target_file_size_multiplier is 1, which means + // by default files in different levels will have similar size. + int target_file_size_multiplier; + + // Control maximum total data size for a level. + // max_bytes_for_level_base is the max total for level-1. + // Maximum number of bytes for level L can be calculated as + // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1)) + // For example, if max_bytes_for_level_base is 20MB, and if + // max_bytes_for_level_multiplier is 10, total data size for level-1 + // will be 20MB, total file size for level-2 will be 200MB, + // and total file size for level-3 will be 2GB. + + // by default 'max_bytes_for_level_base' is 10MB. 
+ uint64_t max_bytes_for_level_base; + // by default 'max_bytes_for_level_multiplier' is 10. + int max_bytes_for_level_multiplier; + + // Different max-size multipliers for different levels. + // These are multiplied by max_bytes_for_level_multiplier to arrive + // at the max-size of each level. + // Default: 1 + std::vector max_bytes_for_level_multiplier_additional; + + // Maximum number of bytes in all compacted files. We avoid expanding + // the lower level file set of a compaction if it would make the + // total compaction cover more than + // (expanded_compaction_factor * targetFileSizeLevel()) many bytes. + int expanded_compaction_factor; + + // Maximum number of bytes in all source files to be compacted in a + // single compaction run. We avoid picking too many files in the + // source level so that we do not exceed the total source bytes + // for compaction to exceed + // (source_compaction_factor * targetFileSizeLevel()) many bytes. + // Default:1, i.e. pick maxfilesize amount of data as the source of + // a compaction. + int source_compaction_factor; + + // Control maximum bytes of overlaps in grandparent (i.e., level+2) before we + // stop building a single file in a level->level+1 compaction. + int max_grandparent_overlap_factor; + + // If non-null, then we should collect metrics about database operations + // Statistics objects should not be shared between DB instances as + // it does not use any locks to prevent concurrent updates. + shared_ptr statistics; + + // If true, then the contents of data files are not synced + // to stable storage. Their contents remain in the OS buffers till the + // OS decides to flush them. This option is good for bulk-loading + // of data. Once the bulk-loading is complete, please issue a + // sync to the OS to flush all dirty buffers to stable storage. + // Default: false + bool disableDataSync; + + // If true, then every store to stable storage will issue a fsync.
+ // If false, then every store to stable storage will issue a fdatasync. + // This parameter should be set to true while storing data to + // filesystem like ext3 that can lose files after a reboot. + // Default: false + bool use_fsync; + + // This number controls how often a new scribe log about + // db deploy stats is written out. + // -1 indicates no logging at all. + // Default value is 1800 (half an hour). + int db_stats_log_interval; + + // This specifies the info LOG dir. + // If it is empty, the log files will be in the same dir as data. + // If it is non empty, the log files will be in the specified dir, + // and the db data dir's absolute path will be used as the log file + // name's prefix. + std::string db_log_dir; + + // This specifies the absolute dir path for write-ahead logs (WAL). + // If it is empty, the log files will be in the same dir as data, + // dbname is used as the data dir by default + // If it is non empty, the log files will be kept in the specified dir. + // When destroying the db, + // all log files in wal_dir and the dir itself are deleted + std::string wal_dir; + + // Disable compaction triggered by seek. + // With bloomfilter and fast storage, a miss on one level + // is very cheap if the file handle is cached in table cache + // (which is true if max_open_files is large). + bool disable_seek_compaction; + + // The periodicity when obsolete files get deleted. The default + // value is 6 hours. The files that get out of scope by compaction + // process will still get automatically deleted on every compaction, + // regardless of this setting + uint64_t delete_obsolete_files_period_micros; + + // Maximum number of concurrent background jobs, submitted to + // the default LOW priority thread pool + // Default: 1 + int max_background_compactions; + + // Maximum number of concurrent background memtable flush jobs, submitted to + // the HIGH priority thread pool.
+ // By default, all background jobs (major compaction and memtable flush) go + // to the LOW priority pool. If this option is set to a positive number, + // memtable flush jobs will be submitted to the HIGH priority pool. + // It is important when the same Env is shared by multiple db instances. + // Without a separate pool, long running major compaction jobs could + // potentially block memtable flush jobs of other db instances, leading to + // unnecessary Put stalls. + // Default: 0 + int max_background_flushes; + + // Specify the maximal size of the info log file. If the log file + // is larger than `max_log_file_size`, a new info log file will + // be created. + // If max_log_file_size == 0, all logs will be written to one + // log file. + size_t max_log_file_size; + + // Time for the info log file to roll (in seconds). + // If specified with non-zero value, log file will be rolled + // if it has been active longer than `log_file_time_to_roll`. + // Default: 0 (disabled) + size_t log_file_time_to_roll; + + // Maximal info log files to be kept. + // Default: 1000 + size_t keep_log_file_num; + + // Puts are delayed 0-1 ms when any level has a compaction score that exceeds + // soft_rate_limit. This is ignored when == 0.0. + // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not + // hold, RocksDB will set soft_rate_limit = hard_rate_limit + // Default: 0 (disabled) + double soft_rate_limit; + + // Puts are delayed 1ms at a time when any level has a compaction score that + // exceeds hard_rate_limit. This is ignored when <= 1.0. + // Default: 0 (disabled) + double hard_rate_limit; + + // Max time a put will be stalled when hard_rate_limit is enforced. If 0, then + // there is no limit. + // Default: 1000 + unsigned int rate_limit_delay_max_milliseconds; + + // manifest file is rolled over on reaching this limit. + // The older manifest file be deleted. + // The default value is MAX_INT so that roll-over does not take place. 
+ uint64_t max_manifest_file_size; + + // Disable block cache. If this is set to true, + // then no block cache should be used, and the block_cache should + // point to a nullptr object. + // Default: false + bool no_block_cache; + + // Number of shards used for table cache. + int table_cache_numshardbits; + + // During data eviction of table's LRU cache, it would be inefficient + // to strictly follow LRU because this piece of memory will not really + // be released unless its refcount falls to zero. Instead, make two + // passes: the first pass will release items with refcount = 1, + // and if not enough space is released after scanning the number of + // elements specified by this parameter, we will remove items in LRU + // order. + int table_cache_remove_scan_count_limit; + + // Size of one block in arena memory allocation. + // + // If <= 0, a proper value is automatically calculated (usually about 1/10 of + // write_buffer_size). + // + // There are two additional restrictions on the specified size: + // (1) size should be in the range of [4096, 2 << 30] and + // (2) be a multiple of the CPU word (which helps with the memory + // alignment). + // + // We'll automatically check and adjust the size number to make sure it + // conforms to the restrictions. + // + // Default: 0 + size_t arena_block_size; + + // Create an Options object with default values for all fields. + Options(); + + void Dump(Logger* log) const; + + // Set appropriate parameters for bulk loading. + // The reason that this is a function that returns "this" instead of a + // constructor is to enable chaining of multiple similar calls in the future. + // + // All data will be in level 0 without any automatic compaction. + // It's recommended to manually call CompactRange(NULL, NULL) before reading + // from the database, because otherwise the read can be very slow. + Options* PrepareForBulkLoad(); + + // Disable automatic compactions.
Manual compactions can still + // be issued on this database. + bool disable_auto_compactions; + + // The following two fields affect how archived logs will be deleted. + // 1. If both set to 0, logs will be deleted asap and will not get into + // the archive. + // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + // WAL files will be checked every 10 min and if total size is greater + // than WAL_size_limit_MB, they will be deleted starting with the + // earliest until size_limit is met. All empty files will be deleted. + // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + // WAL files will be checked every WAL_ttl_seconds / 2 and those that + // are older than WAL_ttl_seconds will be deleted. + // 4. If both are not 0, WAL files will be checked every 10 min and both + // checks will be performed with ttl being first. + uint64_t WAL_ttl_seconds; + uint64_t WAL_size_limit_MB; + + // Number of bytes to preallocate (via fallocate) the manifest + // files. Default is 4mb, which is reasonable to reduce random IO + // as well as prevent overallocation for mounts that preallocate + // large amounts of data (such as xfs's allocsize option). + size_t manifest_preallocation_size; + + // Purge duplicate/deleted keys when a memtable is flushed to storage. + // Default: true + bool purge_redundant_kvs_while_flush; + + // Data being read from file storage may be buffered in the OS + // Default: true + bool allow_os_buffer; + + // Allow the OS to mmap file for reading sst tables. Default: false + bool allow_mmap_reads; + + // Allow the OS to mmap file for writing. Default: true + bool allow_mmap_writes; + + // Disable child process inherit open files.
Default: true + bool is_fd_close_on_exec; + + // Skip log corruption error on recovery (If client is ok with + // losing most recent changes) + // Default: false + bool skip_log_error_on_recovery; + + // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec + // Default: 3600 (1 hour) + unsigned int stats_dump_period_sec; + + // This is used to close a block before it reaches the configured + // 'block_size'. If the percentage of free space in the current block is less + // than this specified number and adding a new record to the block will + // exceed the configured block size, then this block will be closed and the + // new record will be written to the next block. + // Default is 10. + int block_size_deviation; + + // If set true, will hint the underlying file system that the file + // access pattern is random, when a sst file is opened. + // Default: true + bool advise_random_on_open; + + // Specify the file access pattern once a compaction is started. + // It will be applied to all input files of a compaction. + // Default: NORMAL + enum { + NONE, + NORMAL, + SEQUENTIAL, + WILLNEED + } access_hint_on_compaction_start; + + // Use adaptive mutex, which spins in the user space before resorting + // to kernel. This could reduce context switch when the mutex is not + // heavily contended. However, if the mutex is hot, we could end up + // wasting spin time. + // Default: false + bool use_adaptive_mutex; + + // Allows OS to incrementally sync files to disk while they are being + // written, asynchronously, in the background. + // Issue one request for every bytes_per_sync written. 0 turns it off. + // Default: 0 + uint64_t bytes_per_sync; + + // The compaction style. Default: kCompactionStyleLevel + CompactionStyle compaction_style; + + // The options needed to support Universal Style compactions + CompactionOptionsUniversal compaction_options_universal; + + // Use KeyMayExist API to filter deletes when this is true. 
+ // If KeyMayExist returns false, i.e. the key definitely does not exist, then + // the delete is a noop. KeyMayExist only incurs in-memory look up. + // This optimization avoids writing the delete to storage when appropriate. + // Default: false + bool filter_deletes; + + // An iteration->Next() sequentially skips over keys with the same + // user-key unless this option is set. This number specifies the number + // of keys (with the same userkey) that will be sequentially + // skipped before a reseek is issued. + // Default: 8 + uint64_t max_sequential_skip_in_iterations; + + // This is a factory that provides MemTableRep objects. + // Default: a factory that provides a skip-list-based implementation of + // MemTableRep. + std::shared_ptr memtable_factory; + + // This is a factory that provides TableFactory objects. + // Default: a factory that provides a default implementation of + // Table and TableBuilder. + std::shared_ptr table_factory; + + // This option allows user to collect their own interested statistics of + // the tables. + // Default: empty vector -- no user-defined statistics collection will be + // performed. + std::vector> + table_properties_collectors; + + // Allows thread-safe inplace updates. Requires Updates iff + // * key exists in current memtable + // * new sizeof(new_value) <= sizeof(old_value) + // * old_value for that key is a put i.e. kTypeValue + // Default: false. + bool inplace_update_support; + + // Number of locks used for inplace update + // Default: 10000, if inplace_update_support = true, else 0. + size_t inplace_update_num_locks; + + // Maximum number of successive merge operations on a key in the memtable. + // + // When a merge operation is added to the memtable and the maximum number of + // successive merges is reached, the value of the key will be calculated and + // inserted into the memtable instead of the merge operation.
This will + // ensure that there are never more than max_successive_merges merge + // operations in the memtable. + // + // Default: 0 (disabled) + size_t max_successive_merges; +}; + +// +// An application can issue a read request (via Get/Iterators) and specify +// if that read should process data that ALREADY resides on a specified cache +// level. For example, if an application specifies kBlockCacheTier then the +// Get call will process data that is already processed in the memtable or +// the block cache. It will not page in data from the OS cache or data that +// resides in storage. +enum ReadTier { + kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage + kBlockCacheTier = 0x1 // data in memtable or block cache +}; + +// Options that control read operations +struct ReadOptions { + // If true, all data read from underlying storage will be + // verified against corresponding checksums. + // Default: false + bool verify_checksums; + + // Should the "data block"/"index block"/"filter block" read for this + // iteration be cached in memory? + // Callers may wish to set this field to false for bulk scans. + // Default: true + bool fill_cache; + + // If this option is set and memtable implementation allows, Seek + // might only return keys with the same prefix as the seek-key + bool prefix_seek; + + // If "snapshot" is non-nullptr, read as of the supplied snapshot + // (which must belong to the DB that is being read and which must + // not have been released). If "snapshot" is nullptr, use an implicit + // snapshot of the state at the beginning of this read operation. + // Default: nullptr + const Snapshot* snapshot; + + // If "prefix" is non-nullptr, and ReadOptions is being passed to + // db.NewIterator, only return results when the key begins with this + // prefix. This field is ignored by other calls (e.g., Get). + // Options.prefix_extractor must also be set, and + // prefix_extractor.InRange(prefix) must be true.
The iterator + // returned by NewIterator when this option is set will behave just + // as if the underlying store did not contain any non-matching keys, + // with two exceptions. Seek() only accepts keys starting with the + // prefix, and SeekToLast() is not supported. prefix filter with this + // option will sometimes reduce the number of read IOPs. + // Default: nullptr + const Slice* prefix; + + // Specify if this read request should process data that ALREADY + // resides on a particular cache. If the required data is not + // found at the specified cache, then Status::Incomplete is returned. + // Default: kReadAllTier + ReadTier read_tier; + + ReadOptions() + : verify_checksums(false), + fill_cache(true), + prefix_seek(false), + snapshot(nullptr), + prefix(nullptr), + read_tier(kReadAllTier) {} + ReadOptions(bool cksum, bool cache) + : verify_checksums(cksum), + fill_cache(cache), + prefix_seek(false), + snapshot(nullptr), + prefix(nullptr), + read_tier(kReadAllTier) {} +}; + +// Options that control write operations +struct WriteOptions { + // If true, the write will be flushed from the operating system + // buffer cache (by calling WritableFile::Sync()) before the write + // is considered complete. If this flag is true, writes will be + // slower. + // + // If this flag is false, and the machine crashes, some recent + // writes may be lost. Note that if it is just the process that + // crashes (i.e., the machine does not reboot), no writes will be + // lost even if sync==false. + // + // In other words, a DB write with sync==false has similar + // crash semantics as the "write()" system call. A DB write + // with sync==true has similar crash semantics to a "write()" + // system call followed by "fdatasync()". + // + // Default: false + bool sync; + + // If true, writes will not first go to the write ahead log, + // and the write may get lost after a crash.
+ bool disableWAL; + + WriteOptions() : sync(false), disableWAL(false) {} +}; + +// Options that control flush operations +struct FlushOptions { + // If true, the flush will wait until the flush is done. + // Default: true + bool wait; + + FlushOptions() : wait(true) {} +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h new file mode 100644 index 00000000..9e900e05 --- /dev/null +++ b/include/rocksdb/perf_context.h @@ -0,0 +1,48 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H +#define STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H + +#include + +namespace rocksdb { + +enum PerfLevel { + kDisable = 0, // disable perf stats + kEnableCount = 1, // enable only count stats + kEnableTime = 2 // enable time stats too +}; + +// set the perf stats level +void SetPerfLevel(PerfLevel level); + +// A thread local context for gathering performance counter efficiently +// and transparently. 
+ +struct PerfContext { + + void Reset(); // reset all performance counters to zero + + uint64_t user_key_comparison_count; // total number of user key comparisons + uint64_t block_cache_hit_count; // total number of block cache hits + uint64_t block_read_count; // total number of block reads (with IO) + uint64_t block_read_byte; // total number of bytes from block reads + uint64_t block_read_time; // total time spent on block reads + uint64_t block_checksum_time; // total time spent on block checksum + uint64_t block_decompress_time; // total time spent on block decompression + // total number of internal keys skipped over during iteration (overwritten or + // deleted, to be more specific, hidden by a put or delete of the same key) + uint64_t internal_key_skipped_count; + // total number of deletes skipped over during iteration + uint64_t internal_delete_skipped_count; + uint64_t wal_write_time; // total time spent on writing to WAL +}; + +extern __thread PerfContext perf_context; + +} + +#endif diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h new file mode 100644 index 00000000..e6cca213 --- /dev/null +++ b/include/rocksdb/slice.h @@ -0,0 +1,136 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Slice is a simple structure containing a pointer into some external +// storage and a size. The user of a Slice must ensure that the slice +// is not used after the corresponding external storage has been +// deallocated. 
+// +// Multiple threads can invoke const methods on a Slice without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Slice must use +// external synchronization. + +#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_H_ +#define STORAGE_ROCKSDB_INCLUDE_SLICE_H_ + +#include +#include +#include +#include + +namespace rocksdb { + +class Slice { + public: + // Create an empty slice. + Slice() : data_(""), size_(0) { } + + // Create a slice that refers to d[0,n-1]. + Slice(const char* d, size_t n) : data_(d), size_(n) { } + + // Create a slice that refers to the contents of "s" + /* implicit */ + Slice(const std::string& s) : data_(s.data()), size_(s.size()) { } + + // Create a slice that refers to s[0,strlen(s)-1] + /* implicit */ + Slice(const char* s) : data_(s), size_(strlen(s)) { } + + // Return a pointer to the beginning of the referenced data + const char* data() const { return data_; } + + // Return the length (in bytes) of the referenced data + size_t size() const { return size_; } + + // Return true iff the length of the referenced data is zero + bool empty() const { return size_ == 0; } + + // Return the ith byte in the referenced data. + // REQUIRES: n < size() + char operator[](size_t n) const { + assert(n < size()); + return data_[n]; + } + + // Change this slice to refer to an empty array + void clear() { data_ = ""; size_ = 0; } + + // Drop the first "n" bytes from this slice. + void remove_prefix(size_t n) { + assert(n <= size()); + data_ += n; + size_ -= n; + } + + // Return a string that contains the copy of the referenced data. + std::string ToString(bool hex = false) const { + if (hex) { + std::string result; + char buf[10]; + for (size_t i = 0; i < size_; i++) { + snprintf(buf, 10, "%02X", (unsigned char)data_[i]); + result += buf; + } + return result; + } else { + return std::string(data_, size_); + } + } + + // Three-way comparison. 
Returns value: + // < 0 iff "*this" < "b", + // == 0 iff "*this" == "b", + // > 0 iff "*this" > "b" + int compare(const Slice& b) const; + + // Return true iff "x" is a prefix of "*this" + bool starts_with(const Slice& x) const { + return ((size_ >= x.size_) && + (memcmp(data_, x.data_, x.size_) == 0)); + } + + // private: make these public for rocksdbjni access + const char* data_; + size_t size_; + + // Intentionally copyable +}; + +// A set of Slices that are virtually concatenated together. 'parts' points +// to an array of Slices. The number of elements in the array is 'num_parts'. +struct SliceParts { + SliceParts(const Slice* parts, int num_parts) : + parts(parts), num_parts(num_parts) { } + + const Slice* parts; + int num_parts; +}; + +inline bool operator==(const Slice& x, const Slice& y) { + return ((x.size() == y.size()) && + (memcmp(x.data(), y.data(), x.size()) == 0)); +} + +inline bool operator!=(const Slice& x, const Slice& y) { + return !(x == y); +} + +inline int Slice::compare(const Slice& b) const { + const int min_len = (size_ < b.size_) ? size_ : b.size_; + int r = memcmp(data_, b.data_, min_len); + if (r == 0) { + if (size_ < b.size_) r = -1; + else if (size_ > b.size_) r = +1; + } + return r; +} + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_SLICE_H_ diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h new file mode 100644 index 00000000..a7845500 --- /dev/null +++ b/include/rocksdb/slice_transform.h @@ -0,0 +1,47 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. +// +// Class for specifying user-defined functions which perform a +// transformation on a slice. It is not required that every slice +// belong to the domain and/or range of a function. Subclasses should +// define InDomain and InRange to determine which slices are in either +// of these sets respectively. + +#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_ +#define STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_ + +#include + +namespace rocksdb { + +class Slice; + +class SliceTransform { + public: + virtual ~SliceTransform() {}; + + // Return the name of this transformation. + virtual const char* Name() const = 0; + + // transform a src in domain to a dst in the range + virtual Slice Transform(const Slice& src) const = 0; + + // determine whether this is a valid src upon the function applies + virtual bool InDomain(const Slice& src) const = 0; + + // determine whether dst=Transform(src) for some src + virtual bool InRange(const Slice& dst) const = 0; +}; + +extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len); + +extern const SliceTransform* NewNoopTransform(); + +} + +#endif // STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_ diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h new file mode 100644 index 00000000..011e510f --- /dev/null +++ b/include/rocksdb/statistics.h @@ -0,0 +1,308 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ +#define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ + +#include +#include +#include +#include +#include +#include +#include + +namespace rocksdb { + +/** + * Keep adding ticker's here. + * Any ticker should have a value less than TICKER_ENUM_MAX. 
+ * Add a new ticker by assigning it the current value of TICKER_ENUM_MAX + * Add a string representation in TickersNameMap below. + * And incrementing TICKER_ENUM_MAX. + */ +enum Tickers { + // total block cache misses + // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS + + // BLOCK_CACHE_FILTER_MISS + + // BLOCK_CACHE_DATA_MISS; + BLOCK_CACHE_MISS, + // total block cache hit + // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT + + // BLOCK_CACHE_FILTER_HIT + + // BLOCK_CACHE_DATA_HIT; + BLOCK_CACHE_HIT, + // # of blocks added to block cache. + BLOCK_CACHE_ADD, + // # of times cache miss when accessing index block from block cache. + BLOCK_CACHE_INDEX_MISS, + // # of times cache hit when accessing index block from block cache. + BLOCK_CACHE_INDEX_HIT, + // # of times cache miss when accessing filter block from block cache. + BLOCK_CACHE_FILTER_MISS, + // # of times cache hit when accessing filter block from block cache. + BLOCK_CACHE_FILTER_HIT, + // # of times cache miss when accessing data block from block cache. + BLOCK_CACHE_DATA_MISS, + // # of times cache hit when accessing data block from block cache. + BLOCK_CACHE_DATA_HIT, + // # of times bloom filter has avoided file reads. + BLOOM_FILTER_USEFUL, + + // # of memtable hits. + MEMTABLE_HIT, + // # of memtable misses. + MEMTABLE_MISS, + + /** + * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction + * There are 3 reasons currently. + */ + COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value. + COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete. + COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key. 
+ + // Number of keys written to the database via the Put and Write calls + NUMBER_KEYS_WRITTEN, + // Number of Keys read, + NUMBER_KEYS_READ, + // Number keys updated, if inplace update is enabled + NUMBER_KEYS_UPDATED, + // Bytes written / read + BYTES_WRITTEN, + BYTES_READ, + NO_FILE_CLOSES, + NO_FILE_OPENS, + NO_FILE_ERRORS, + // Time system had to wait to do L0-L1 compactions + STALL_L0_SLOWDOWN_MICROS, + // Time system had to wait to move memtable to L1. + STALL_MEMTABLE_COMPACTION_MICROS, + // write throttle because of too many files in L0 + STALL_L0_NUM_FILES_MICROS, + RATE_LIMIT_DELAY_MILLIS, + + NO_ITERATORS, // number of iterators currently open + + // Number of MultiGet calls, keys read, and bytes read + NUMBER_MULTIGET_CALLS, + NUMBER_MULTIGET_KEYS_READ, + NUMBER_MULTIGET_BYTES_READ, + + // Number of delete records that were not required to be + // written to storage because key does not exist + NUMBER_FILTERED_DELETES, + NUMBER_MERGE_FAILURES, + SEQUENCE_NUMBER, + + // number of times bloom was checked before creating iterator on a + // file, and the number of times the check was useful in avoiding + // iterator creation (and thus likely IOPs). + BLOOM_FILTER_PREFIX_CHECKED, + BLOOM_FILTER_PREFIX_USEFUL, + + // Number of times we had to reseek inside an iteration to skip + // over large number of keys with same userkey. + NUMBER_OF_RESEEKS_IN_ITERATION, + + // Record the number of calls to GetUpdatesSince. Useful to keep track of + // transaction log iterator refreshes + GET_UPDATES_SINCE_CALLS, + + BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache + BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache + + WAL_FILE_SYNCED, // Number of times WAL sync is done + WAL_FILE_BYTES, // Number of bytes written to WAL + + // Writes can be processed by requesting thread or by the thread at the + // head of the writers queue.
+ WRITE_DONE_BY_SELF, + WRITE_DONE_BY_OTHER, + + WRITE_WITH_WAL, // Number of Write calls that request WAL + + COMPACT_READ_BYTES, // Bytes read during compaction + COMPACT_WRITE_BYTES, // Bytes written during compaction + + TICKER_ENUM_MAX +}; + +// The order of items listed in Tickers should be the same as +// the order listed in TickersNameMap +const std::vector> TickersNameMap = { + { BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" }, + { BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" }, + { BLOCK_CACHE_ADD, "rocksdb.block.cache.add" }, + { BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss" }, + { BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit" }, + { BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss" }, + { BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit" }, + { BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss" }, + { BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit" }, + { BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" }, + { MEMTABLE_HIT, "rocksdb.memtable.hit" }, + { MEMTABLE_MISS, "rocksdb.memtable.miss" }, + { COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" }, + { COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" }, + { COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" }, + { NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written" }, + { NUMBER_KEYS_READ, "rocksdb.number.keys.read" }, + { NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated" }, + { BYTES_WRITTEN, "rocksdb.bytes.written" }, + { BYTES_READ, "rocksdb.bytes.read" }, + { NO_FILE_CLOSES, "rocksdb.no.file.closes" }, + { NO_FILE_OPENS, "rocksdb.no.file.opens" }, + { NO_FILE_ERRORS, "rocksdb.no.file.errors" }, + { STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros" }, + { STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros" }, + { STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros" }, + { RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis" }, + { NO_ITERATORS, "rocksdb.num.iterators" 
}, + { NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get" }, + { NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read" }, + { NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" }, + { NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered" }, + { NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" }, + { SEQUENCE_NUMBER, "rocksdb.sequence.number" }, + { BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" }, + { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" }, + { NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" }, + { GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls" }, + { BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss" }, + { BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit" }, + { WAL_FILE_SYNCED, "rocksdb.wal.synced" }, + { WAL_FILE_BYTES, "rocksdb.wal.bytes" }, + { WRITE_DONE_BY_SELF, "rocksdb.write.self" }, + { WRITE_DONE_BY_OTHER, "rocksdb.write.other" }, + { WRITE_WITH_WAL, "rocksdb.write.wal" }, + { COMPACT_READ_BYTES, "rocksdb.compact.read.bytes" }, + { COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes" }, +}; + +/** + * Keep adding histogram's here. 
+ * Any histogram should have a value less than HISTOGRAM_ENUM_MAX + * Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX + * Add a string representation in HistogramsNameMap below + * And increment HISTOGRAM_ENUM_MAX + */ +enum Histograms { + DB_GET, + DB_WRITE, + COMPACTION_TIME, + TABLE_SYNC_MICROS, + COMPACTION_OUTFILE_SYNC_MICROS, + WAL_FILE_SYNC_MICROS, + MANIFEST_FILE_SYNC_MICROS, + // TIME SPENT IN IO DURING TABLE OPEN + TABLE_OPEN_IO_MICROS, + DB_MULTIGET, + READ_BLOCK_COMPACTION_MICROS, + READ_BLOCK_GET_MICROS, + WRITE_RAW_BLOCK_MICROS, + + STALL_L0_SLOWDOWN_COUNT, + STALL_MEMTABLE_COMPACTION_COUNT, + STALL_L0_NUM_FILES_COUNT, + HARD_RATE_LIMIT_DELAY_COUNT, + SOFT_RATE_LIMIT_DELAY_COUNT, + NUM_FILES_IN_SINGLE_COMPACTION, + HISTOGRAM_ENUM_MAX, +}; + +const std::vector> HistogramsNameMap = { + { DB_GET, "rocksdb.db.get.micros" }, + { DB_WRITE, "rocksdb.db.write.micros" }, + { COMPACTION_TIME, "rocksdb.compaction.times.micros" }, + { TABLE_SYNC_MICROS, "rocksdb.table.sync.micros" }, + { COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros" }, + { WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros" }, + { MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros" }, + { TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros" }, + { DB_MULTIGET, "rocksdb.db.multiget.micros" }, + { READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros" }, + { READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros" }, + { WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros" }, + { STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"}, + { STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"}, + { STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"}, + { HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"}, + { SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"}, + { NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction" }, +}; + +struct HistogramData
{ + double median; + double percentile95; + double percentile99; + double average; + double standard_deviation; +}; + + +class Histogram { + public: + // clear's the histogram + virtual void Clear() = 0; + virtual ~Histogram(); + // Add a value to be recorded in the histogram. + virtual void Add(uint64_t value) = 0; + + virtual std::string ToString() const = 0; + + // Get statistics + virtual double Median() const = 0; + virtual double Percentile(double p) const = 0; + virtual double Average() const = 0; + virtual double StandardDeviation() const = 0; + virtual void Data(HistogramData * const data) const = 0; + +}; + +/** + * A dumb ticker which keeps incrementing through its life time. + * Thread safe. Locking managed by implementation of this interface. + */ +class Ticker { + public: + Ticker() : count_(0) { } + + inline void setTickerCount(uint64_t count) { + count_ = count; + } + + inline void recordTick(int count = 1) { + count_ += count; + } + + inline uint64_t getCount() { + return count_; + } + + private: + std::atomic_uint_fast64_t count_; +}; + +// Analyze the performance of a db +class Statistics { + public: + + virtual long getTickerCount(Tickers tickerType) = 0; + virtual void recordTick(Tickers tickerType, uint64_t count = 0) = 0; + virtual void setTickerCount(Tickers tickerType, uint64_t count) = 0; + virtual void measureTime(Histograms histogramType, uint64_t time) = 0; + + virtual void histogramData(Histograms type, HistogramData * const data) = 0; + // String representation of the statistic object. + std::string ToString(); +}; + +// Create a concrete DBStatistics object +std::shared_ptr CreateDBStatistics(); + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h new file mode 100644 index 00000000..e2304fdb --- /dev/null +++ b/include/rocksdb/status.h @@ -0,0 +1,137 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization. + +#ifndef STORAGE_ROCKSDB_INCLUDE_STATUS_H_ +#define STORAGE_ROCKSDB_INCLUDE_STATUS_H_ + +#include +#include "rocksdb/slice.h" + +namespace rocksdb { + +class Status { + public: + // Create a success status. + Status() : code_(kOk), state_(nullptr) { } + ~Status() { delete[] state_; } + + // Copy the specified status. + Status(const Status& s); + void operator=(const Status& s); + + // Return a success status. + static Status OK() { return Status(); } + + // Return error status of an appropriate type. 
+ static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotFound, msg, msg2); + } + // Fast path for not found without malloc; + static Status NotFound() { + return Status(kNotFound); + } + static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kCorruption, msg, msg2); + } + static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotSupported, msg, msg2); + } + static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kInvalidArgument, msg, msg2); + } + static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, msg, msg2); + } + static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kMergeInProgress, msg, msg2); + } + static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIncomplete, msg, msg2); + } + + // Returns true iff the status indicates success. + bool ok() const { return code() == kOk; } + + // Returns true iff the status indicates a NotFound error. + bool IsNotFound() const { return code() == kNotFound; } + + // Returns true iff the status indicates a Corruption error. + bool IsCorruption() const { return code() == kCorruption; } + + // Returns true iff the status indicates a NotSupported error. + bool IsNotSupported() const { return code() == kNotSupported; } + + // Returns true iff the status indicates an InvalidArgument error. + bool IsInvalidArgument() const { return code() == kInvalidArgument; } + + // Returns true iff the status indicates an IOError. + bool IsIOError() const { return code() == kIOError; } + + // Returns true iff the status indicates an MergeInProgress. 
+ bool IsMergeInProgress() const { return code() == kMergeInProgress; } + + // Returns true iff the status indicates Incomplete + bool IsIncomplete() const { return code() == kIncomplete; } + + // Return a string representation of this status suitable for printing. + // Returns the string "OK" for success. + std::string ToString() const; + + private: + enum Code { + kOk = 0, + kNotFound = 1, + kCorruption = 2, + kNotSupported = 3, + kInvalidArgument = 4, + kIOError = 5, + kMergeInProgress = 6, + kIncomplete = 7 + }; + + // A nullptr state_ (which is always the case for OK) means the message + // is empty. + // Otherwise state_ is an array (released with delete[]) of the following form: + // state_[0..3] == length of message + // state_[4..] == message + Code code_; + const char* state_; + + Code code() const { + return code_; + } + explicit Status(Code code) : code_(code), state_(nullptr) { } + Status(Code code, const Slice& msg, const Slice& msg2); + static const char* CopyState(const char* s); +}; + +inline Status::Status(const Status& s) { + code_ = s.code_; + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); +} +inline void Status::operator=(const Status& s) { + // The state_ comparison below catches both aliasing (when this == &s), + // and the common case where both s and *this are ok. + code_ = s.code_; + if (state_ != s.state_) { + delete[] state_; + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); + } +} + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_STATUS_H_ diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h new file mode 100644 index 00000000..2d2bfacc --- /dev/null +++ b/include/rocksdb/table.h @@ -0,0 +1,180 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors.
All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/options.h" + +namespace rocksdb { + +struct Options; +class RandomAccessFile; +struct ReadOptions; +class TableCache; +class WritableFile; + +using std::unique_ptr; + +// TableBuilder provides the interface used to build a Table +// (an immutable and sorted map from keys to values). +// +// Multiple threads can invoke const methods on a TableBuilder without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same TableBuilder must use +// external synchronization. +class TableBuilder { + public: + // REQUIRES: Either Finish() or Abandon() has been called. + virtual ~TableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Add(const Slice& key, const Slice& value) = 0; + + // Return non-ok iff some error has been detected. + virtual Status status() const = 0; + + // Finish building the table. + // REQUIRES: Finish(), Abandon() have not been called + virtual Status Finish() = 0; + + // Indicate that the contents of this builder should be abandoned. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Abandon() = 0; + + // Number of calls to Add() so far. + virtual uint64_t NumEntries() const = 0; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. 
+ virtual uint64_t FileSize() const = 0; +}; + +// A Table is a sorted map from strings to strings. Tables are +// immutable and persistent. A Table may be safely accessed from +// multiple threads without external synchronization. +class TableReader { + public: + virtual ~TableReader() {} + + // Determine whether there is a chance that the current table file + // contains a key starting with internal_prefix. The specific + // table implementation can use bloom filter and/or other heuristic + // to filter out this table as a whole. + virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0; + + // Returns a new iterator over the table contents. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + virtual Iterator* NewIterator(const ReadOptions&) = 0; + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0; + + // Returns true if the block for the specified key is in cache. + // REQUIRES: key is in this table. + virtual bool TEST_KeyInCache(const ReadOptions& options, + const Slice& key) = 0; + + // Set up the table for Compaction. Might change some parameters with + // posix_fadvise + virtual void SetupForCompaction() = 0; + + virtual TableProperties& GetTableProperties() = 0; + + // Calls (*result_handler)(handle_context, ...) repeatedly, starting with + // the entry found after a call to Seek(key), until result_handler returns + // false, where k is the actual internal key for a row found and v as the + // value of the key. didIO is true if I/O is involved in the operation.
May + // not make such a call if filter policy says that key is not present. + // + // mark_key_may_exist_handler needs to be called when it is configured to be + // memory only and the key is not found in the block cache, with + // the parameter to be handle_context. + // + // readOptions is the options for the read + // key is the key to search for + virtual Status Get( + const ReadOptions& readOptions, + const Slice& key, + void* handle_context, + bool (*result_handler)(void* handle_context, const Slice& k, + const Slice& v, bool didIO), + void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0; +}; + +// A base class for table factories +class TableFactory { + public: + virtual ~TableFactory() {} + + // The type of the table. + // + // The client of this package should switch to a new name whenever + // the table format implementation changes. + // + // Names starting with "rocksdb." are reserved and should not be used + // by any clients of this package. + virtual const char* Name() const = 0; + + // Returns a Table object table that can fetch data from file specified + // in parameter file. It's the caller's responsibility to make sure + // file is in the correct format. + // + // GetTableReader() is called in two places: + // (1) TableCache::FindTable() calls the function when table cache miss + // and cache the table object returned. + // (2) SstFileReader (for SST Dump) opens the table and dump the table + // contents using the iterator of the table. + // options and soptions are options. options is the general options. + // Multiple configured options can be accessed from there, including but not + // limited to block cache and key comparators.
+ // file is a file handler to handle the file for the table + // file_size is the physical file size of the file + // table_reader is the output table reader + virtual Status GetTableReader( + const Options& options, const EnvOptions& soptions, + unique_ptr && file, uint64_t file_size, + unique_ptr* table_reader) const = 0; + + // Return a table builder to write to a file for this table type. + // + // It is called in several places: + // (1) When flushing memtable to a level-0 output file, it creates a table + // builder (In DBImpl::WriteLevel0Table(), by calling BuildTable()) + // (2) During compaction, it gets the builder for writing compaction output + // files in DBImpl::OpenCompactionOutputFile(). + // (3) When recovering from transaction logs, it creates a table builder to + // write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery, + // by calling BuildTable()) + // (4) When running Repairer, it creates a table builder to convert logs to + // SST files (In Repairer::ConvertLogToTable() by calling BuildTable()) + // + // options is the general options. Multiple configured options can be accessed from + // there, including but not limited to compression options. + // file is a handle of a writable file. It is the caller's responsibility to + // keep the file open and close the file after closing the table builder. + // compression_type is the compression type to use in this table. + virtual TableBuilder* GetTableBuilder( + const Options& options, WritableFile* file, + CompressionType compression_type) const = 0; +}; +} // namespace rocksdb diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h new file mode 100644 index 00000000..8824ca13 --- /dev/null +++ b/include/rocksdb/table_properties.h @@ -0,0 +1,90 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree.
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include +#include + +#include "rocksdb/status.h" + +namespace rocksdb { + +// TableProperties contains a bunch of read-only properties of its associated +// table. +struct TableProperties { + public: + // Other than basic table properties, each table may also have the user + // collected properties. + // The values of the user-collected properties are encoded as raw bytes -- + // users have to interpret these values by themselves. + typedef + std::unordered_map + UserCollectedProperties; + + // the total size of all data blocks. + uint64_t data_size = 0; + // the size of index block. + uint64_t index_size = 0; + // the size of filter block. + uint64_t filter_size = 0; + // total raw key size + uint64_t raw_key_size = 0; + // total raw value size + uint64_t raw_value_size = 0; + // the number of blocks in this table + uint64_t num_data_blocks = 0; + // the number of entries in this table + uint64_t num_entries = 0; + + // The name of the filter policy used in this table. + // If no filter policy is used, `filter_policy_name` will be an empty string. + std::string filter_policy_name; + + // user collected properties + UserCollectedProperties user_collected_properties; + + // convert this object to a human readable form + // @prop_delim: delimiter for each property. + std::string ToString( + const std::string& prop_delim = "; ", + const std::string& kv_delim = "=") const; +}; + +// `TablePropertiesCollector` provides the mechanism for users to collect +// their own interested properties. This class is essentially a collection +// of callback functions that will be invoked during table building. +class TablePropertiesCollector { + public: + virtual ~TablePropertiesCollector() { } + + // Add() will be called when a new key/value pair is inserted into the table. + // @params key the original key that is inserted into the table.
+ // @params value the original value that is inserted into the table. + virtual Status Add(const Slice& key, const Slice& value) = 0; + + // Finish() will be called when a table has already been built and is ready + // for writing the properties block. + // @params properties User will add their collected statistics to + // `properties`. + virtual Status Finish( + TableProperties::UserCollectedProperties* properties) = 0; + + // The name of the properties collector can be used for debugging purpose. + virtual const char* Name() const = 0; + + // Return the human-readable properties, where the key is property name and + // the value is the human-readable form of value. + virtual TableProperties::UserCollectedProperties + GetReadableProperties() const = 0; +}; + +// Extra properties +// Below is a list of non-basic properties that are collected by database +// itself. Especially some properties regarding to the internal keys (which +// is unknown to `table`). +extern uint64_t GetDeletedKeys( + const TableProperties::UserCollectedProperties& props); + +} // namespace rocksdb diff --git a/include/rocksdb/transaction_log.h b/include/rocksdb/transaction_log.h new file mode 100644 index 00000000..41a3250d --- /dev/null +++ b/include/rocksdb/transaction_log.h @@ -0,0 +1,91 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ +#define STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ + +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/write_batch.h" +#include +#include + +namespace rocksdb { + +class LogFile; +typedef std::vector> VectorLogPtr; + +enum WalFileType { + /* Indicates that WAL file is in archive directory. 
WAL files are moved from + * the main db directory to archive directory once they are not live and stay + * there until cleaned up. Files are cleaned depending on archive size + * (Options::WAL_size_limit_MB) and time since last cleaning + * (Options::WAL_ttl_seconds). + */ + kArchivedLogFile = 0, + + /* Indicates that WAL file is live and resides in the main db directory */ + kAliveLogFile = 1 +} ; + +class LogFile { + public: + LogFile() {} + virtual ~LogFile() {} + + // Returns log file's pathname relative to the main db dir + // Eg. For a live-log-file = /000003.log + // For an archived-log-file = /archive/000003.log + virtual std::string PathName() const = 0; + + + // Primary identifier for log file. + // This is directly proportional to creation time of the log file + virtual uint64_t LogNumber() const = 0; + + // Log file can be either alive or archived + virtual WalFileType Type() const = 0; + + // Starting sequence number of writebatch written in this log file + virtual SequenceNumber StartSequence() const = 0; + + // Size of log file on disk in Bytes + virtual uint64_t SizeFileBytes() const = 0; +}; + +struct BatchResult { + SequenceNumber sequence = 0; + std::unique_ptr writeBatchPtr; +}; + +// A TransactionLogIterator is used to iterate over the transactions in a db. +// One run of the iterator is continuous, i.e. the iterator will stop at the +// beginning of any gap in sequences +class TransactionLogIterator { + public: + TransactionLogIterator() {} + virtual ~TransactionLogIterator() {} + + // An iterator is either positioned at a WriteBatch or not valid. + // This method returns true if the iterator is valid. + // Can read data from a valid iterator. + virtual bool Valid() = 0; + + // Moves the iterator to the next WriteBatch. + // REQUIRES: Valid() to be true. + virtual void Next() = 0; + + // Returns ok if the iterator is valid. + // Returns the Error when something has gone wrong. 
+ virtual Status status() = 0; + + // If valid, returns the current write_batch and the sequence number of the + // earliest transaction contained in the batch. + // ONLY use if Valid() is true and status() is OK. + virtual BatchResult GetBatch() = 0; +}; +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h new file mode 100644 index 00000000..f20bf827 --- /dev/null +++ b/include/rocksdb/types.h @@ -0,0 +1,20 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef STORAGE_ROCKSDB_INCLUDE_TYPES_H_ +#define STORAGE_ROCKSDB_INCLUDE_TYPES_H_ + +#include + +namespace rocksdb { + +// Define all public custom types here. + +// Represents a sequence number in a WAL file. +typedef uint64_t SequenceNumber; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_TYPES_H_ diff --git a/include/rocksdb/universal_compaction.h b/include/rocksdb/universal_compaction.h new file mode 100644 index 00000000..ec862b95 --- /dev/null +++ b/include/rocksdb/universal_compaction.h @@ -0,0 +1,89 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#ifndef STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H +#define STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H + +#include +#include +#include +#include +#include +#include +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" + +namespace rocksdb { + +// +// Algorithm used to make a compaction request stop picking new files +// into a single compaction run +// +enum CompactionStopStyle { + kCompactionStopStyleSimilarSize, // pick files of similar size + kCompactionStopStyleTotalSize // total size of picked files > next file +}; + +class CompactionOptionsUniversal { + public: + + // Percentage flexibility while comparing file size. If the candidate file(s) + // size is 1% smaller than the next file's size, then include next file into + // this candidate set. // Default: 1 + unsigned int size_ratio; + + // The minimum number of files in a single compaction run. Default: 2 + unsigned int min_merge_width; + + // The maximum number of files in a single compaction run. Default: UINT_MAX + unsigned int max_merge_width; + + // The size amplification is defined as the amount (in percentage) of + // additional storage needed to store a single byte of data in the database. + // For example, a size amplification of 2% means that a database that + // contains 100 bytes of user-data may occupy up to 102 bytes of + // physical storage. By this definition, a fully compacted database has + // a size amplification of 0%. Rocksdb uses the following heuristic + // to calculate size amplification: it assumes that all files excluding + // the earliest file contribute to the size amplification. + // Default: 200, which means that a 100 byte database could require up to + // 300 bytes of storage. + unsigned int max_size_amplification_percent; + + // If this option is set to be -1 (the default value), all the output files + // will follow compression type specified. + // + // If this option is not negative, we will try to make sure compressed + // size is just above this value. 
In normal cases, at least this percentage + // of data will be compressed. + // When we are compacting to a new file, here is the criteria whether + // it needs to be compressed: assuming here are the list of files sorted + // by generation time: + // A1...An B1...Bm C1...Ct + // where A1 is the newest and Ct is the oldest, and we are going to compact + // B1...Bm, we calculate the total size of all the files as total_size, as + // well as the total size of C1...Ct as total_C, the compaction output file + // will be compressed iff + // total_C / total_size < this percentage + int compression_size_percent; + + // The algorithm used to stop picking files into a single compaction run + // Default: kCompactionStopStyleTotalSize + CompactionStopStyle stop_style; + + // Default set of parameters + CompactionOptionsUniversal() : + size_ratio(1), + min_merge_width(2), + max_merge_width(UINT_MAX), + max_size_amplification_percent(200), + compression_size_percent(-1), + stop_style(kCompactionStopStyleTotalSize) { + } +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h new file mode 100644 index 00000000..2cfb731f --- /dev/null +++ b/include/rocksdb/write_batch.h @@ -0,0 +1,112 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch holds a collection of updates to apply atomically to a DB. +// +// The updates are applied in the order in which they are added +// to the WriteBatch. 
For example, the value of "key" will be "v3" +// after the following batch is written: +// +// batch.Put("key", "v1"); +// batch.Delete("key"); +// batch.Put("key", "v2"); +// batch.Put("key", "v3"); +// +// Multiple threads can invoke const methods on a WriteBatch without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same WriteBatch must use +// external synchronization. + +#ifndef STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ +#define STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ + +#include +#include "rocksdb/status.h" + +namespace rocksdb { + +class Slice; +struct SliceParts; + +class WriteBatch { + public: + explicit WriteBatch(size_t reserved_bytes = 0); + ~WriteBatch(); + + // Store the mapping "key->value" in the database. + void Put(const Slice& key, const Slice& value); + + // Variant of Put() that gathers output like writev(2). The key and value + // that will be written to the database are concatenations of arrays of + // slices. + void Put(const SliceParts& key, const SliceParts& value); + + // Merge "value" with the existing value of "key" in the database. + // "key->merge(existing, value)" + void Merge(const Slice& key, const Slice& value); + + // If the database contains a mapping for "key", erase it. Else do nothing. + void Delete(const Slice& key); + + // Append a blob of arbitrary size to the records in this batch. The blob will + // be stored in the transaction log but not in any other file. In particular, + // it will not be persisted to the SST files. When iterating over this + // WriteBatch, WriteBatch::Handler::LogData will be called with the contents + // of the blob as it is encountered. Blobs, puts, deletes, and merges will be + // encountered in the same order in which they were inserted. The blob will + // NOT consume sequence number(s) and will NOT increase the count of the batch + // + // Example application: add timestamps to the transaction log for use in + // replication. 
+ void PutLogData(const Slice& blob); + + // Clear all updates buffered in this batch. + void Clear(); + + // Support for iterating over the contents of a batch. + class Handler { + public: + virtual ~Handler(); + virtual void Put(const Slice& key, const Slice& value) = 0; + // Merge and LogData are not pure virtual. Otherwise, we would break + // existing clients of Handler on a source code level. The default + // implementation of Merge simply throws a runtime exception. + virtual void Merge(const Slice& key, const Slice& value); + // The default implementation of LogData does nothing. + virtual void LogData(const Slice& blob); + virtual void Delete(const Slice& key) = 0; + // Continue is called by WriteBatch::Iterate. If it returns false, + // iteration is halted. Otherwise, it continues iterating. The default + // implementation always returns true. + virtual bool Continue(); + }; + Status Iterate(Handler* handler) const; + + // Retrieve the serialized version of this batch. + const std::string& Data() const { return rep_; } + + // Retrieve data size of the batch. + size_t GetDataSize() const { return rep_.size(); } + + // Returns the number of updates in the batch + int Count() const; + + // Constructor with a serialized string object + explicit WriteBatch(std::string rep): rep_(rep) {} + + private: + friend class WriteBatchInternal; + + std::string rep_; // See comment in write_batch.cc for the format of rep_ + + // Intentionally copyable +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ diff --git a/include/utilities/backupable_db.h b/include/utilities/backupable_db.h new file mode 100644 index 00000000..fbe2ae8a --- /dev/null +++ b/include/utilities/backupable_db.h @@ -0,0 +1,151 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "utilities/stackable_db.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" + +#include +#include +#include + +namespace rocksdb { + +struct BackupableDBOptions { + // Where to keep the backup files. Has to be different than dbname_ + // Best to set this to dbname_ + "/backups" + // Required + std::string backup_dir; + + // Backup Env object. It will be used for backup file I/O. If it's + // nullptr, backups will be written out using DBs Env. If it's + // non-nullptr, backup's I/O will be performed using this object. + // If you want to have backups on HDFS, use HDFS Env here! + // Default: nullptr + Env* backup_env; + + // If share_table_files == true, backup will assume that table files with + // same name have the same contents. This enables incremental backups and + // avoids unnecessary data copies. + // If share_table_files == false, each backup will be on its own and will + // not share any data with other backups. + // default: true + bool share_table_files; + + // Backup info and error messages will be written to info_log + // if non-nullptr. + // Default: nullptr + Logger* info_log; + + // If sync == true, we can guarantee you'll get consistent backup even + // on a machine crash/reboot. Backup process is slower with sync enabled. + // If sync == false, we don't guarantee anything on machine reboot. However, + // chances are some of the backups are consistent. 
+ // Default: true + bool sync; + + // If true, it will delete whatever backups there are already + // Default: false + bool destroy_old_data; + + explicit BackupableDBOptions(const std::string& _backup_dir, + Env* _backup_env = nullptr, + bool _share_table_files = true, + Logger* _info_log = nullptr, + bool _sync = true, + bool _destroy_old_data = false) : + backup_dir(_backup_dir), + backup_env(_backup_env), share_table_files(_share_table_files), + info_log(_info_log), + sync(_sync), + destroy_old_data(_destroy_old_data) { } +}; + +class BackupEngine; + +typedef uint32_t BackupID; + +struct BackupInfo { + BackupID backup_id; + int64_t timestamp; + uint64_t size; + + BackupInfo() {} + BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size) + : backup_id(_backup_id), timestamp(_timestamp), size(_size) {} +}; + +// Stack your DB with BackupableDB to be able to backup the DB +class BackupableDB : public StackableDB { + public: + // BackupableDBOptions have to be the same as the ones used in a previous + // incarnation of the DB + // + // BackupableDB owns the pointer `DB* db` now. You should not delete it or + // use it after the invocation of BackupableDB + BackupableDB(DB* db, const BackupableDBOptions& options); + virtual ~BackupableDB(); + + // Captures the state of the database in the latest backup + // NOT a thread safe call + Status CreateNewBackup(bool flush_before_backup = false); + // Returns info about backups in backup_info + void GetBackupInfo(std::vector* backup_info); + // deletes old backups, keeping latest num_backups_to_keep alive + Status PurgeOldBackups(uint32_t num_backups_to_keep); + // deletes a specific backup + Status DeleteBackup(BackupID backup_id); + // Call this from another thread if you want to stop the backup + // that is currently happening. It will return immediately, will + // not wait for the backup to stop. + // The backup will stop ASAP and the call to CreateNewBackup will + // return Status::Incomplete(). 
It will not clean up after itself, but + // the state will remain consistent. The state will be cleaned up + // next time you create BackupableDB or RestoreBackupableDB. + void StopBackup(); + + private: + BackupEngine* backup_engine_; +}; + +// Use this class to access information about backups and restore from them +class RestoreBackupableDB { + public: + RestoreBackupableDB(Env* db_env, const BackupableDBOptions& options); + ~RestoreBackupableDB(); + + // Returns info about backups in backup_info + void GetBackupInfo(std::vector* backup_info); + + // restore from backup with backup_id + // IMPORTANT -- if options_.share_table_files == true and you restore DB + // from some backup that is not the latest, and you start creating new + // backups from the new DB, all the backups that were newer than the + // backup you restored from will be deleted + // + // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3. + // If you try creating a new backup now, old backups 4 and 5 will be deleted + // and new backup with ID 4 will be created. + Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir, + const std::string& wal_dir); + + // restore from the latest backup + Status RestoreDBFromLatestBackup(const std::string& db_dir, + const std::string& wal_dir); + // deletes old backups, keeping latest num_backups_to_keep alive + Status PurgeOldBackups(uint32_t num_backups_to_keep); + // deletes a specific backup + Status DeleteBackup(BackupID backup_id); + + private: + BackupEngine* backup_engine_; +}; + +} // rocksdb namespace diff --git a/include/utilities/stackable_db.h b/include/utilities/stackable_db.h new file mode 100644 index 00000000..908fe10b --- /dev/null +++ b/include/utilities/stackable_db.h @@ -0,0 +1,161 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include "rocksdb/db.h" + +namespace rocksdb { + +// This class contains APIs to stack rocksdb wrappers.Eg. Stack TTL over base d +class StackableDB : public DB { + public: + // StackableDB is the owner of db now! + explicit StackableDB(DB* db) : db_(db) {} + + ~StackableDB() { + delete db_; + } + + virtual DB* GetBaseDB() { + return db_; + } + + virtual Status Put(const WriteOptions& options, + const Slice& key, + const Slice& val) override { + return db_->Put(options, key, val); + } + + virtual Status Get(const ReadOptions& options, + const Slice& key, + std::string* value) override { + return db_->Get(options, key, value); + } + + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) + override { + return db_->MultiGet(options, keys, values); + } + + virtual bool KeyMayExist(const ReadOptions& options, + const Slice& key, + std::string* value, + bool* value_found = nullptr) override { + return db_->KeyMayExist(options, key, value, value_found); + } + + virtual Status Delete(const WriteOptions& wopts, const Slice& key) override { + return db_->Delete(wopts, key); + } + + virtual Status Merge(const WriteOptions& options, + const Slice& key, + const Slice& value) override { + return db_->Merge(options, key, value); + } + + + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) + override { + return db_->Write(opts, updates); + } + + virtual Iterator* NewIterator(const ReadOptions& opts) override { + return db_->NewIterator(opts); + } + + virtual const Snapshot* GetSnapshot() override { + return db_->GetSnapshot(); + } + + virtual void ReleaseSnapshot(const Snapshot* snapshot) override { + return db_->ReleaseSnapshot(snapshot); + } + + virtual bool GetProperty(const Slice& property, std::string* value) + override { + return db_->GetProperty(property, value); + } + + virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) + override { + return 
db_->GetApproximateSizes(r, n, sizes); + } + + virtual void CompactRange(const Slice* begin, const Slice* end, + bool reduce_level = false, + int target_level = -1) override { + return db_->CompactRange(begin, end, reduce_level, target_level); + } + + virtual int NumberLevels() override { + return db_->NumberLevels(); + } + + virtual int MaxMemCompactionLevel() override { + return db_->MaxMemCompactionLevel(); + } + + virtual int Level0StopWriteTrigger() override { + return db_->Level0StopWriteTrigger(); + } + + virtual const std::string& GetName() const override { + return db_->GetName(); + } + + virtual Env* GetEnv() const override { + return db_->GetEnv(); + } + + virtual const Options& GetOptions() const override { + return db_->GetOptions(); + } + + virtual Status Flush(const FlushOptions& fopts) override { + return db_->Flush(fopts); + } + + virtual Status DisableFileDeletions() override { + return db_->DisableFileDeletions(); + } + + virtual Status EnableFileDeletions(bool force) override { + return db_->EnableFileDeletions(force); + } + + virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, + bool flush_memtable = true) override { + return db_->GetLiveFiles(vec, mfs, flush_memtable); + } + + virtual SequenceNumber GetLatestSequenceNumber() const override { + return db_->GetLatestSequenceNumber(); + } + + virtual Status GetSortedWalFiles(VectorLogPtr& files) override { + return db_->GetSortedWalFiles(files); + } + + virtual Status DeleteFile(std::string name) override { + return db_->DeleteFile(name); + } + + virtual Status GetDbIdentity(std::string& identity) override { + return db_->GetDbIdentity(identity); + } + + virtual Status GetUpdatesSince(SequenceNumber seq_number, + unique_ptr* iter) + override { + return db_->GetUpdatesSince(seq_number, iter); + } + + protected: + DB* db_; +}; + +} // namespace rocksdb diff --git a/include/utilities/utility_db.h b/include/utilities/utility_db.h new file mode 100644 index 00000000..1a7a269d --- /dev/null +++ 
b/include/utilities/utility_db.h @@ -0,0 +1,50 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "stackable_db.h" + +namespace rocksdb { + +// This class contains APIs to open rocksdb with specific support eg. TTL +class UtilityDB { + + public: + // Open the database with TTL support. + // + // USE-CASES: + // This API should be used to open the db when key-values inserted are + // meant to be removed from the db in a non-strict 'ttl' amount of time + // Therefore, this guarantees that key-values inserted will remain in the + // db for >= ttl amount of time and the db will make efforts to remove the + // key-values as soon as possible after ttl seconds of their insertion. + // + // BEHAVIOUR: + // TTL is accepted in seconds + // (int32_t)Timestamp(creation) is suffixed to values in Put internally + // Expired TTL values deleted in compaction only:(Timestamp+ttl<time_now) + // Get/Iterator may return expired entries(compaction not run on them yet) + // Different TTL may be used during different Opens + // Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2 + // Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5 + // read_only=true opens in the usual read-only mode. 
Compactions will not be + // triggered(neither manual nor automatic), so no expired entries removed + // + // CONSTRAINTS: + // Not specifying/passing or non-positive TTL behaves like TTL = infinity + // + // !!!WARNING!!!: + // Calling DB::Open directly to re-open a db created by this API will get + // corrupt values(timestamp suffixed) and no ttl effect will be there + // during the second Open, so use this API consistently to open the db + // Be careful when passing ttl with a small positive value because the + // whole database may be deleted in a small amount of time + static Status OpenTtlDB(const Options& options, + const std::string& name, + StackableDB** dbptr, + int32_t ttl = 0, + bool read_only = false); +}; + +} // namespace rocksdb diff --git a/linters/src/.phutil_module_cache b/linters/src/.phutil_module_cache new file mode 100644 index 00000000..5c93a840 --- /dev/null +++ b/linters/src/.phutil_module_cache @@ -0,0 +1 @@ +{"__symbol_cache_version__":8,"b937ad5f80a8bd1156038b730ff56ec5":{"have":{"class":{"FacebookFbcodeLintEngine":71}},"need":{"class":{"ArcanistLintEngine":104,"ArcanistGeneratedLinter":488,"ArcanistNoLintLinter":577,"ArcanistTextLinter":658,"ArcanistPEP8Linter":1227,"FbcodeCppLinter":1715,"PfffCppLinter":1759,"ArcanistSpellingLinter":1875,"ArcanistFilenameLinter":4207,"Filesystem":357,"ArcanistLintSeverity":778}},"xmap":{"FacebookFbcodeLintEngine":["ArcanistLintEngine"]}},"02e2a613e371424b2108d2d6cb849d39":{"have":{"class":{"PfffCppLinter":71}},"need":{"function":{"Futures":875},"class":{"ArcanistLinter":93,"ExecFuture":756,"ArcanistLintMessage":1270,"ArcanistLintSeverity":1607}},"xmap":{"PfffCppLinter":["ArcanistLinter"]}},"4443484928afb005f585843d07b04190":{"have":{"class":{"FbcodeCppLinter":13}},"need":{"function":{"Futures":1265},"class":{"ArcanistLinter":37,"ExecFuture":934,"ArcanistLintSeverity":1729}},"xmap":{"FbcodeCppLinter":["ArcanistLinter"]}}} \ No newline at end of file diff --git a/linters/src/__phutil_library_init__.php 
b/linters/src/__phutil_library_init__.php new file mode 100644 index 00000000..4b8d3d13 --- /dev/null +++ b/linters/src/__phutil_library_init__.php @@ -0,0 +1,3 @@ + 2, + 'class' => + array( + 'FacebookFbcodeLintEngine' => 'lint_engine/FacebookFbcodeLintEngine.php', + 'FbcodeCppLinter' => 'cpp_linter/FbcodeCppLinter.php', + 'PfffCppLinter' => 'cpp_linter/PfffCppLinter.php', + ), + 'function' => + array( + ), + 'xmap' => + array( + 'FacebookFbcodeLintEngine' => 'ArcanistLintEngine', + 'FbcodeCppLinter' => 'ArcanistLinter', + 'PfffCppLinter' => 'ArcanistLinter', + ), +)); diff --git a/linters/src/cpp_linter/FbcodeCppLinter.php b/linters/src/cpp_linter/FbcodeCppLinter.php new file mode 100644 index 00000000..e62d3bbe --- /dev/null +++ b/linters/src/cpp_linter/FbcodeCppLinter.php @@ -0,0 +1,99 @@ +getEngine()->getFilePathOnDisk($p); + $lpath_file = file($lpath); + if (preg_match('/\.(c)$/', $lpath) || + preg_match('/-\*-.*Mode: C[; ].*-\*-/', $lpath_file[0]) || + preg_match('/vim(:.*)*:\s*(set\s+)?filetype=c\s*:/', $lpath_file[0]) + ) { + $futures[$p] = new ExecFuture("%s %s %s 2>&1", + $CPP_LINT, self::C_FLAG, + $this->getEngine()->getFilePathOnDisk($p)); + } else { + $futures[$p] = new ExecFuture("%s %s 2>&1", + self::CPPLINT, $this->getEngine()->getFilePathOnDisk($p)); + } + } + + foreach (Futures($futures)->limit(8) as $p => $f) { + $this->rawLintOutput[$p] = $f->resolvex(); + } + } + return; + } + + public function getLinterName() { + return "FBCPP"; + } + + public function lintPath($path) { + $msgs = $this->getCppLintOutput($path); + foreach ($msgs as $m) { + $this->raiseLintAtLine($m['line'], 0, $m['severity'], $m['msg']); + } + } + + public function getLintSeverityMap() { + return array( + self::LINT_WARNING => ArcanistLintSeverity::SEVERITY_WARNING, + self::LINT_ERROR => ArcanistLintSeverity::SEVERITY_ERROR + ); + } + + public function getLintNameMap() { + return array( + self::LINT_WARNING => "CppLint Warning", + self::LINT_ERROR => "CppLint Error" + ); + } + 
+ private function getCppLintOutput($path) { + list($output) = $this->rawLintOutput[$path]; + + $msgs = array(); + $current = null; + foreach (explode("\n", $output) as $line) { + if (preg_match('/[^:]*\((\d+)\):(.*)$/', $line, $matches)) { + if ($current) { + $msgs[] = $current; + } + $line = $matches[1]; + $text = $matches[2]; + $sev = preg_match('/.*Warning.*/', $text) + ? self::LINT_WARNING + : self::LINT_ERROR; + $current = array('line' => $line, + 'msg' => $text, + 'severity' => $sev); + } else if ($current) { + $current['msg'] .= ' ' . $line; + } + } + if ($current) { + $msgs[] = $current; + } + + return $msgs; + } +} + diff --git a/linters/src/cpp_linter/PfffCppLinter.php b/linters/src/cpp_linter/PfffCppLinter.php new file mode 100644 index 00000000..67366143 --- /dev/null +++ b/linters/src/cpp_linter/PfffCppLinter.php @@ -0,0 +1,68 @@ +&1", + $program, $this->getEngine()->getFilePathOnDisk($p)); + } + foreach (Futures($futures)->limit(8) as $p => $f) { + + list($stdout, $stderr) = $f->resolvex(); + $raw = json_decode($stdout, true); + if (!is_array($raw)) { + throw new Exception( + "checkCpp returned invalid JSON!". + "Stdout: {$stdout} Stderr: {$stderr}" + ); + } + foreach($raw as $err) { + $this->addLintMessage( + ArcanistLintMessage::newFromDictionary( + array( + 'path' => $err['file'], + 'line' => $err['line'], + 'char' => 0, + 'name' => $err['name'], + 'description' => $err['info'], + 'code' => $this->getLinterName(), + 'severity' => ArcanistLintSeverity::SEVERITY_WARNING, + ) + ) + ); + } + } + } + return; + } + + public function lintPath($path) { + return; + } +} diff --git a/linters/src/lint_engine/FacebookFbcodeLintEngine.php b/linters/src/lint_engine/FacebookFbcodeLintEngine.php new file mode 100644 index 00000000..c34530c3 --- /dev/null +++ b/linters/src/lint_engine/FacebookFbcodeLintEngine.php @@ -0,0 +1,147 @@ +getPaths(); + + // Remove all deleted files, which are not checked by the + // following linters. 
+ foreach ($paths as $key => $path) { + if (!Filesystem::pathExists($this->getFilePathOnDisk($path))) { + unset($paths[$key]); + } + } + + $generated_linter = new ArcanistGeneratedLinter(); + $linters[] = $generated_linter; + + $nolint_linter = new ArcanistNoLintLinter(); + $linters[] = $nolint_linter; + + $text_linter = new ArcanistTextLinter(); + $text_linter->setCustomSeverityMap(array( + ArcanistTextLinter::LINT_LINE_WRAP + => ArcanistLintSeverity::SEVERITY_ADVICE, + )); + $linters[] = $text_linter; + + $java_text_linter = new ArcanistTextLinter(); + $java_text_linter->setMaxLineLength(100); + $java_text_linter->setCustomSeverityMap(array( + ArcanistTextLinter::LINT_LINE_WRAP + => ArcanistLintSeverity::SEVERITY_ADVICE, + )); + $linters[] = $java_text_linter; + + $pep8_options = $this->getPEP8WithTextOptions().',E302'; + + $python_linter = new ArcanistPEP8Linter(); + $python_linter->setConfig(array('options' => $pep8_options)); + $linters[] = $python_linter; + + $python_2space_linter = new ArcanistPEP8Linter(); + $python_2space_linter->setConfig(array('options' => $pep8_options.',E111')); + $linters[] = $python_2space_linter; + + // Currently we can't run cpplint in commit hook mode, because it + // depends on having access to the working directory. + if (!$this->getCommitHookMode()) { + $cpp_linter = new FbcodeCppLinter(); + $cpp_linter2 = new PfffCppLinter(); + $linters[] = $cpp_linter; + $linters[] = $cpp_linter2; + } + + $spelling_linter = new ArcanistSpellingLinter(); + $linters[] = $spelling_linter; + + foreach ($paths as $path) { + $is_text = false; + + $text_extensions = ( + '/\.('. + 'cpp|cxx|c|cc|h|hpp|hxx|tcc|'. + 'py|rb|hs|pl|pm|tw|'. + 'php|phpt|css|js|'. + 'java|'. + 'thrift|'. + 'lua|'. + 'siv|'. + 'txt'. 
+ ')$/' + ); + if (preg_match($text_extensions, $path)) { + $is_text = true; + } + if ($is_text) { + $nolint_linter->addPath($path); + + $generated_linter->addPath($path); + $generated_linter->addData($path, $this->loadData($path)); + + if (preg_match('/\.java$/', $path)) { + $java_text_linter->addPath($path); + $java_text_linter->addData($path, $this->loadData($path)); + } else { + $text_linter->addPath($path); + $text_linter->addData($path, $this->loadData($path)); + } + + $spelling_linter->addPath($path); + $spelling_linter->addData($path, $this->loadData($path)); + } + if (isset($cpp_linter) && isset($cpp_linter2) && + preg_match('/\.(cpp|c|cc|cxx|h|hh|hpp|hxx|tcc)$/', $path)) { + $cpp_linter->addPath($path); + $cpp_linter->addData($path, $this->loadData($path)); + $cpp_linter2->addPath($path); + $cpp_linter2->addData($path, $this->loadData($path)); + + } + + // Match *.py and contbuild config files + if (preg_match('/(\.(py|tw|smcprops)|^contbuild\/configs\/[^\/]*)$/', + $path)) { + $space_count = 4; + $real_path = $this->getFilePathOnDisk($path); + $dir = dirname($real_path); + do { + if (file_exists($dir.'/.python2space')) { + $space_count = 2; + break; + } + $dir = dirname($dir); + } while ($dir != '/' && $dir != '.'); + + if ($space_count == 4) { + $cur_path_linter = $python_linter; + } else { + $cur_path_linter = $python_2space_linter; + } + $cur_path_linter->addPath($path); + $cur_path_linter->addData($path, $this->loadData($path)); + + if (preg_match('/\.tw$/', $path)) { + $cur_path_linter->setCustomSeverityMap(array( + 'E251' => ArcanistLintSeverity::SEVERITY_DISABLED, + )); + } + } + + + + } + + $name_linter = new ArcanistFilenameLinter(); + $linters[] = $name_linter; + foreach ($paths as $path) { + $name_linter->addPath($path); + } + + return $linters; + } + +} diff --git a/port/README b/port/README new file mode 100644 index 00000000..422563e2 --- /dev/null +++ b/port/README @@ -0,0 +1,10 @@ +This directory contains interfaces and implementations 
that isolate the +rest of the package from platform details. + +Code in the rest of the package includes "port.h" from this directory. +"port.h" in turn includes a platform specific "port_.h" file +that provides the platform specific implementation. + +See port_posix.h for an example of what must be provided in a platform +specific header file. + diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h new file mode 100644 index 00000000..db3580bd --- /dev/null +++ b/port/atomic_pointer.h @@ -0,0 +1,157 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// AtomicPointer provides storage for a lock-free pointer. +// Platform-dependent implementation of AtomicPointer: +// - If the platform provides a cheap barrier, we use it with raw pointers +// - If cstdatomic is present (on newer versions of gcc, it is), we use +// a cstdatomic-based AtomicPointer. However we prefer the memory +// barrier based version, because at least on a gcc 4.4 32-bit build +// on linux, we have encountered a buggy +// implementation. Also, some implementations are much +// slower than a memory-barrier based implementation (~16ns for +// based acquire-load vs. ~1ns for a barrier based +// acquire-load). 
+// This code is based on atomicops-internals-* in Google's perftools: +// http://code.google.com/p/google-perftools/source/browse/#svn%2Ftrunk%2Fsrc%2Fbase + +#ifndef PORT_ATOMIC_POINTER_H_ +#define PORT_ATOMIC_POINTER_H_ + +#include +#ifdef ROCKSDB_ATOMIC_PRESENT +#include +#endif +#ifdef OS_WIN +#include +#endif +#ifdef OS_MACOSX +#include +#endif + +#if defined(_M_X64) || defined(__x86_64__) +#define ARCH_CPU_X86_FAMILY 1 +#elif defined(_M_IX86) || defined(__i386__) || defined(__i386) +#define ARCH_CPU_X86_FAMILY 1 +#elif defined(__ARMEL__) +#define ARCH_CPU_ARM_FAMILY 1 +#endif + +namespace rocksdb { +namespace port { + +// Define MemoryBarrier() if available +// Windows on x86 +#if defined(OS_WIN) && defined(COMPILER_MSVC) && defined(ARCH_CPU_X86_FAMILY) +// windows.h already provides a MemoryBarrier(void) macro +// http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx +#define ROCKSDB_HAVE_MEMORY_BARRIER + +// Gcc on x86 +#elif defined(ARCH_CPU_X86_FAMILY) && defined(__GNUC__) +inline void MemoryBarrier() { + // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on + // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering. + __asm__ __volatile__("" : : : "memory"); +} +#define ROCKSDB_HAVE_MEMORY_BARRIER + +// Sun Studio +#elif defined(ARCH_CPU_X86_FAMILY) && defined(__SUNPRO_CC) +inline void MemoryBarrier() { + // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on + // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering. 
+ asm volatile("" : : : "memory"); +} +#define ROCKSDB_HAVE_MEMORY_BARRIER + +// Mac OS +#elif defined(OS_MACOSX) +inline void MemoryBarrier() { + OSMemoryBarrier(); +} +#define ROCKSDB_HAVE_MEMORY_BARRIER + +// ARM Linux +#elif defined(ARCH_CPU_ARM_FAMILY) && defined(__linux__) +typedef void (*LinuxKernelMemoryBarrierFunc)(void); +// The Linux ARM kernel provides a highly optimized device-specific memory +// barrier function at a fixed memory address that is mapped in every +// user-level process. +// +// This beats using CPU-specific instructions which are, on single-core +// devices, un-necessary and very costly (e.g. ARMv7-A "dmb" takes more +// than 180ns on a Cortex-A8 like the one on a Nexus One). Benchmarking +// shows that the extra function call cost is completely negligible on +// multi-core devices. +// +inline void MemoryBarrier() { + (*(LinuxKernelMemoryBarrierFunc)0xffff0fa0)(); +} +#define ROCKSDB_HAVE_MEMORY_BARRIER + +#endif + +// AtomicPointer built using platform-specific MemoryBarrier() +#if defined(ROCKSDB_HAVE_MEMORY_BARRIER) +class AtomicPointer { + private: + void* rep_; + public: + AtomicPointer() { } + explicit AtomicPointer(void* p) : rep_(p) {} + inline void* NoBarrier_Load() const { return rep_; } + inline void NoBarrier_Store(void* v) { rep_ = v; } + inline void* Acquire_Load() const { + void* result = rep_; + MemoryBarrier(); + return result; + } + inline void Release_Store(void* v) { + MemoryBarrier(); + rep_ = v; + } +}; + +// AtomicPointer based on +#elif defined(ROCKSDB_ATOMIC_PRESENT) +class AtomicPointer { + private: + std::atomic rep_; + public: + AtomicPointer() { } + explicit AtomicPointer(void* v) : rep_(v) { } + inline void* Acquire_Load() const { + return rep_.load(std::memory_order_acquire); + } + inline void Release_Store(void* v) { + rep_.store(v, std::memory_order_release); + } + inline void* NoBarrier_Load() const { + return rep_.load(std::memory_order_relaxed); + } + inline void NoBarrier_Store(void* v) { + 
rep_.store(v, std::memory_order_relaxed); + } +}; + +// We have neither MemoryBarrier(), nor +#else +#error Please implement AtomicPointer for this platform. + +#endif + +#undef ROCKSDB_HAVE_MEMORY_BARRIER +#undef ARCH_CPU_X86_FAMILY +#undef ARCH_CPU_ARM_FAMILY + +} // namespace port +} // namespace rocksdb + +#endif // PORT_ATOMIC_POINTER_H_ diff --git a/port/port.h b/port/port.h new file mode 100644 index 00000000..2dc9a0fa --- /dev/null +++ b/port/port.h @@ -0,0 +1,22 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_PORT_PORT_H_ +#define STORAGE_LEVELDB_PORT_PORT_H_ + +#include + +// Include the appropriate platform specific file below. If you are +// porting to a new platform, see "port_example.h" for documentation +// of what the new port_.h file must provide. +#if defined(ROCKSDB_PLATFORM_POSIX) +# include "port/port_posix.h" +#endif + +#endif // STORAGE_LEVELDB_PORT_PORT_H_ diff --git a/port/port_example.h b/port/port_example.h new file mode 100644 index 00000000..64a57918 --- /dev/null +++ b/port/port_example.h @@ -0,0 +1,140 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// This file contains the specification, but not the implementations, +// of the types/operations/etc. that should be defined by a platform +// specific port_.h file. Use this file as a reference for +// how to port this package to a new platform. + +#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ +#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ + +namespace rocksdb { +namespace port { + +// TODO(jorlow): Many of these belong more in the environment class rather than +// here. We should try moving them and see if it affects perf. + +// The following boolean constant must be true on a little-endian machine +// and false otherwise. +static const bool kLittleEndian = true /* or some other expression */; + +// ------------------ Threading ------------------- + +// A Mutex represents an exclusive lock. +class Mutex { + public: + Mutex(); + ~Mutex(); + + // Lock the mutex. Waits until other lockers have exited. + // Will deadlock if the mutex is already locked by this thread. + void Lock(); + + // Unlock the mutex. + // REQUIRES: This mutex was locked by this thread. + void Unlock(); + + // Optionally crash if this thread does not hold this mutex. + // The implementation must be fast, especially if NDEBUG is + // defined. The implementation is allowed to skip all checks. + void AssertHeld(); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + + // Atomically release *mu and block on this condition variable until + // either a call to SignalAll(), or a call to Signal() that picks + // this thread to wakeup. + // REQUIRES: this thread holds *mu + void Wait(); + + // If there are some threads waiting, wake up at least one of them. + void Signal(); + + // Wake up all waiting threads. + void SignallAll(); +}; + +// Thread-safe initialization. 
+// Used as follows: +// static port::OnceType init_control = LEVELDB_ONCE_INIT; +// static void Initializer() { ... do something ...; } +// ... +// port::InitOnce(&init_control, &Initializer); +typedef intptr_t OnceType; +#define LEVELDB_ONCE_INIT 0 +extern void InitOnce(port::OnceType*, void (*initializer)()); + +// A type that holds a pointer that can be read or written atomically +// (i.e., without word-tearing.) +class AtomicPointer { + private: + intptr_t rep_; + public: + // Initialize to arbitrary value + AtomicPointer(); + + // Initialize to hold v + explicit AtomicPointer(void* v) : rep_(v) { } + + // Read and return the stored pointer with the guarantee that no + // later memory access (read or write) by this thread can be + // reordered ahead of this read. + void* Acquire_Load() const; + + // Set v as the stored pointer with the guarantee that no earlier + // memory access (read or write) by this thread can be reordered + // after this store. + void Release_Store(void* v); + + // Read the stored pointer with no ordering guarantees. + void* NoBarrier_Load() const; + + // Set v as the stored pointer with no ordering guarantees. + void NoBarrier_Store(void* v); +}; + +// ------------------ Compression ------------------- + +// Store the snappy compression of "input[0,input_length-1]" in *output. +// Returns false if snappy is not supported by this port. +extern bool Snappy_Compress(const char* input, size_t input_length, + std::string* output); + +// If input[0,input_length-1] looks like a valid snappy compressed +// buffer, store the size of the uncompressed data in *result and +// return true. Else return false. +extern bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result); + +// Attempt to snappy uncompress input[0,input_length-1] into *output. +// Returns true if successful, false if the input is invalid lightweight +// compressed data.
+// +// REQUIRES: at least the first "n" bytes of output[] must be writable +// where "n" is the result of a successful call to +// Snappy_GetUncompressedLength. +extern bool Snappy_Uncompress(const char* input_data, size_t input_length, + char* output); + +// ------------------ Miscellaneous ------------------- + +// If heap profiling is not supported, returns false. +// Else repeatedly calls (*func)(arg, data, n) and then returns true. +// The concatenation of all "data[0,n-1]" fragments is the heap profile. +extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg); + +} // namespace port +} // namespace rocksdb + +#endif // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ diff --git a/port/port_posix.cc b/port/port_posix.cc new file mode 100644 index 00000000..f7025f46 --- /dev/null +++ b/port/port_posix.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "port/port_posix.h" + +#include +#include +#include +#include "util/logging.h" + +namespace rocksdb { +namespace port { + +static void PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + abort(); + } +} + +Mutex::Mutex(bool adaptive) { +#ifdef OS_LINUX + if (!adaptive) { + PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); + } else { + pthread_mutexattr_t mutex_attr; + PthreadCall("init mutex attr", pthread_mutexattr_init(&mutex_attr)); + PthreadCall("set mutex attr", + pthread_mutexattr_settype(&mutex_attr, + PTHREAD_MUTEX_ADAPTIVE_NP)); + PthreadCall("init mutex", pthread_mutex_init(&mu_, &mutex_attr)); + PthreadCall("destroy mutex attr", + pthread_mutexattr_destroy(&mutex_attr)); + } +#else // ignore adaptive for non-linux platform + PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); +#endif // OS_LINUX +} + +Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } + +void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } + +void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } + +CondVar::CondVar(Mutex* mu) + : mu_(mu) { + PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); +} + +CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); } + +void CondVar::Wait() { + PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); +} + +void CondVar::Signal() { + PthreadCall("signal", pthread_cond_signal(&cv_)); +} + +void CondVar::SignalAll() { + PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); +} + +RWMutex::RWMutex() { PthreadCall("init mutex", pthread_rwlock_init(&mu_, NULL)); } + +RWMutex::~RWMutex() { PthreadCall("destroy mutex", pthread_rwlock_destroy(&mu_)); } + +void RWMutex::ReadLock() { PthreadCall("read lock", pthread_rwlock_rdlock(&mu_)); } + +void RWMutex::WriteLock() { PthreadCall("write lock", pthread_rwlock_wrlock(&mu_)); } + +void RWMutex::Unlock() { 
PthreadCall("unlock", pthread_rwlock_unlock(&mu_)); } + +void InitOnce(OnceType* once, void (*initializer)()) { + PthreadCall("once", pthread_once(once, initializer)); +} + +} // namespace port +} // namespace rocksdb diff --git a/port/port_posix.h b/port/port_posix.h new file mode 100644 index 00000000..15ab0dc5 --- /dev/null +++ b/port/port_posix.h @@ -0,0 +1,421 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. + +#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_ +#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_ + +#undef PLATFORM_IS_LITTLE_ENDIAN +#if defined(OS_MACOSX) + #include + #if defined(__DARWIN_LITTLE_ENDIAN) && defined(__DARWIN_BYTE_ORDER) + #define PLATFORM_IS_LITTLE_ENDIAN \ + (__DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN) + #endif +#elif defined(OS_SOLARIS) + #include + #ifdef _LITTLE_ENDIAN + #define PLATFORM_IS_LITTLE_ENDIAN true + #else + #define PLATFORM_IS_LITTLE_ENDIAN false + #endif +#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) ||\ + defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID) + #include + #include +#else + #include +#endif +#include +#ifdef SNAPPY +#include +#endif + +#ifdef ZLIB +#include +#endif + +#ifdef BZIP2 +#include +#endif + +#include +#include +#include +#include "rocksdb/options.h" +#include "port/atomic_pointer.h" + +#ifndef PLATFORM_IS_LITTLE_ENDIAN +#define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN) +#endif + +#if defined(OS_MACOSX) || defined(OS_SOLARIS) 
|| defined(OS_FREEBSD) ||\ + defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) ||\ + defined(OS_ANDROID) +// Use fread/fwrite/fflush on platforms without _unlocked variants +#define fread_unlocked fread +#define fwrite_unlocked fwrite +#define fflush_unlocked fflush +#endif + +#if defined(OS_MACOSX) || defined(OS_FREEBSD) ||\ + defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) +// Use fsync() on platforms without fdatasync() +#define fdatasync fsync +#endif + +#if defined(OS_ANDROID) && __ANDROID_API__ < 9 +// fdatasync() was only introduced in API level 9 on Android. Use fsync() +// when targeting older platforms. +#define fdatasync fsync +#endif + +namespace rocksdb { +namespace port { + +static const bool kLittleEndian = PLATFORM_IS_LITTLE_ENDIAN; +#undef PLATFORM_IS_LITTLE_ENDIAN + +class CondVar; + +class Mutex { + public: + /* implicit */ Mutex(bool adaptive = false); + ~Mutex(); + + void Lock(); + void Unlock(); + void AssertHeld() { } + + private: + friend class CondVar; + pthread_mutex_t mu_; + + // No copying + Mutex(const Mutex&); + void operator=(const Mutex&); +}; + +class RWMutex { + public: + RWMutex(); + ~RWMutex(); + + void ReadLock(); + void WriteLock(); + void Unlock(); + void AssertHeld() { } + + private: + pthread_rwlock_t mu_; // the underlying platform mutex + + // No copying allowed + RWMutex(const RWMutex&); + void operator=(const RWMutex&); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + void Wait(); + void Signal(); + void SignalAll(); + private: + pthread_cond_t cv_; + Mutex* mu_; +}; + +typedef pthread_once_t OnceType; +#define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT +extern void InitOnce(OnceType* once, void (*initializer)()); + +inline bool Snappy_Compress(const CompressionOptions& opts, const char* input, + size_t length, ::std::string* output) { +#ifdef SNAPPY + output->resize(snappy::MaxCompressedLength(length)); + size_t outlen; + snappy::RawCompress(input, length, &(*output)[0],
&outlen); + output->resize(outlen); + return true; +#endif + + return false; +} + +inline bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result) { +#ifdef SNAPPY + return snappy::GetUncompressedLength(input, length, result); +#else + return false; +#endif +} + +inline bool Snappy_Uncompress(const char* input, size_t length, + char* output) { +#ifdef SNAPPY + return snappy::RawUncompress(input, length, output); +#else + return false; +#endif +} + +inline bool Zlib_Compress(const CompressionOptions& opts, const char* input, + size_t length, ::std::string* output) { +#ifdef ZLIB + // The memLevel parameter specifies how much memory should be allocated for + // the internal compression state. + // memLevel=1 uses minimum memory but is slow and reduces compression ratio. + // memLevel=9 uses maximum memory for optimal speed. + // The default value is 8. See zconf.h for more details. + static const int memLevel = 8; + z_stream _stream; + memset(&_stream, 0, sizeof(z_stream)); + int st = deflateInit2(&_stream, opts.level, Z_DEFLATED, opts.window_bits, + memLevel, opts.strategy); + if (st != Z_OK) { + return false; + } + + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(length); + + // Compress the input, and put compressed data in output. + _stream.next_in = (Bytef *)input; + _stream.avail_in = length; + + // Initialize the output size. + _stream.avail_out = length; + _stream.next_out = (Bytef *)&(*output)[0]; + + int old_sz =0, new_sz =0, new_sz_delta =0; + bool done = false; + while (!done) { + int st = deflate(&_stream, Z_FINISH); + switch (st) { + case Z_STREAM_END: + done = true; + break; + case Z_OK: + // No output space. Increase the output space by 20%. + // (Should we fail the compression since it expands the size?) + old_sz = output->size(); + new_sz_delta = (int)(output->size() * 0.2); + new_sz = output->size() + (new_sz_delta < 10 ? 
10 : new_sz_delta); + output->resize(new_sz); + // Set more output. + _stream.next_out = (Bytef *)&(*output)[old_sz]; + _stream.avail_out = new_sz - old_sz; + break; + case Z_BUF_ERROR: + default: + deflateEnd(&_stream); + return false; + } + } + + output->resize(output->size() - _stream.avail_out); + deflateEnd(&_stream); + return true; +#endif + return false; +} + +inline char* Zlib_Uncompress(const char* input_data, size_t input_length, + int* decompress_size, int windowBits = -14) { +#ifdef ZLIB + z_stream _stream; + memset(&_stream, 0, sizeof(z_stream)); + + // For raw inflate, the windowBits should be -8..-15. + // If windowBits is bigger than zero, it will use either zlib + // header or gzip header. Adding 32 to it will do automatic detection. + int st = inflateInit2(&_stream, + windowBits > 0 ? windowBits + 32 : windowBits); + if (st != Z_OK) { + return nullptr; + } + + _stream.next_in = (Bytef *)input_data; + _stream.avail_in = input_length; + + // Assume the decompressed data size will be 5x of compressed size. + int output_len = input_length * 5; + char* output = new char[output_len]; + int old_sz = output_len; + + _stream.next_out = (Bytef *)output; + _stream.avail_out = output_len; + + char* tmp = nullptr; + int output_len_delta; + bool done = false; + + //while(_stream.next_in != nullptr && _stream.avail_in != 0) { + while (!done) { + int st = inflate(&_stream, Z_SYNC_FLUSH); + switch (st) { + case Z_STREAM_END: + done = true; + break; + case Z_OK: + // No output space. Increase the output space by 20%. + old_sz = output_len; + output_len_delta = (int)(output_len * 0.2); + output_len += output_len_delta < 10 ? 10 : output_len_delta; + tmp = new char[output_len]; + memcpy(tmp, output, old_sz); + delete[] output; + output = tmp; + + // Set more output.
+ _stream.next_out = (Bytef *)(output + old_sz); + _stream.avail_out = output_len - old_sz; + break; + case Z_BUF_ERROR: + default: + delete[] output; + inflateEnd(&_stream); + return nullptr; + } + } + + *decompress_size = output_len - _stream.avail_out; + inflateEnd(&_stream); + return output; +#endif + + return nullptr; +} + +inline bool BZip2_Compress(const CompressionOptions& opts, const char* input, + size_t length, ::std::string* output) { +#ifdef BZIP2 + bz_stream _stream; + memset(&_stream, 0, sizeof(bz_stream)); + + // Block size 1 is 100K. + // 0 is for silent. + // 30 is the default workFactor + int st = BZ2_bzCompressInit(&_stream, 1, 0, 30); + if (st != BZ_OK) { + return false; + } + + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(length); + + // Compress the input, and put compressed data in output. + _stream.next_in = (char *)input; + _stream.avail_in = length; + + // Initialize the output size. + _stream.next_out = (char *)&(*output)[0]; + _stream.avail_out = length; + + int old_sz =0, new_sz =0; + while(_stream.next_in != nullptr && _stream.avail_in != 0) { + int st = BZ2_bzCompress(&_stream, BZ_FINISH); + switch (st) { + case BZ_STREAM_END: + break; + case BZ_FINISH_OK: + // No output space. Increase the output space by 20%. + // (Should we fail the compression since it expands the size?) + old_sz = output->size(); + new_sz = (int)(output->size() * 1.2); + output->resize(new_sz); + // Set more output. 
+ _stream.next_out = (char *)&(*output)[old_sz]; + _stream.avail_out = new_sz - old_sz; + break; + case BZ_SEQUENCE_ERROR: + default: + BZ2_bzCompressEnd(&_stream); + return false; + } + } + + output->resize(output->size() - _stream.avail_out); + BZ2_bzCompressEnd(&_stream); + return true; + return output; +#endif + return false; +} + +inline char* BZip2_Uncompress(const char* input_data, size_t input_length, + int* decompress_size) { +#ifdef BZIP2 + bz_stream _stream; + memset(&_stream, 0, sizeof(bz_stream)); + + int st = BZ2_bzDecompressInit(&_stream, 0, 0); + if (st != BZ_OK) { + return nullptr; + } + + _stream.next_in = (char *)input_data; + _stream.avail_in = input_length; + + // Assume the decompressed data size will be 5x of compressed size. + int output_len = input_length * 5; + char* output = new char[output_len]; + int old_sz = output_len; + + _stream.next_out = (char *)output; + _stream.avail_out = output_len; + + char* tmp = nullptr; + + while(_stream.next_in != nullptr && _stream.avail_in != 0) { + int st = BZ2_bzDecompress(&_stream); + switch (st) { + case BZ_STREAM_END: + break; + case BZ_OK: + // No output space. Increase the output space by 20%. + old_sz = output_len; + output_len = (int)(output_len * 1.2); + tmp = new char[output_len]; + memcpy(tmp, output, old_sz); + delete[] output; + output = tmp; + + // Set more output. 
+ _stream.next_out = (char *)(output + old_sz); + _stream.avail_out = output_len - old_sz; + break; + case Z_BUF_ERROR: + default: + delete[] output; + BZ2_bzDecompressEnd(&_stream); + return nullptr; + } + } + + *decompress_size = output_len - _stream.avail_out; + BZ2_bzDecompressEnd(&_stream); + return output; +#endif + return nullptr; +} + +inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { + return false; +} + +} // namespace port +} // namespace rocksdb + +#endif // STORAGE_LEVELDB_PORT_PORT_POSIX_H_ diff --git a/port/stack_trace.cc b/port/stack_trace.cc new file mode 100644 index 00000000..aa01fd0c --- /dev/null +++ b/port/stack_trace.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "util/stack_trace.h" + +#ifdef OS_LINUX + +#include +#include +#include +#include +#include +#include + +namespace rocksdb { + +static const char* GetExecutableName() +{ + static char name[1024]; + + char link[1024]; + snprintf(link, sizeof(link), "/proc/%d/exe", getpid()); + auto read = readlink(link, name, sizeof(name)); + if (-1 == read) { + return nullptr; + } else { + name[read] = 0; + return name; + } +} + +void PrintStack(int first_frames_to_skip) { + const int kMaxFrames = 100; + void *frames[kMaxFrames]; + + auto num_frames = backtrace(frames, kMaxFrames); + auto symbols = backtrace_symbols(frames, num_frames); + + auto executable = GetExecutableName(); + + for (int i = first_frames_to_skip; i < num_frames; ++i) { + fprintf(stderr, "#%-2d ", i - first_frames_to_skip); + if (symbols) { + fprintf(stderr, "%s ", symbols[i]); + } + if (executable) { + // out source to addr2line, for the address translation + const int kLineMax = 256; + char cmd[kLineMax]; + sprintf(cmd, 
"addr2line %p -e %s -f -C 2>&1", frames[i], executable); + auto f = popen(cmd, "r"); + if (f) { + char line[kLineMax]; + while (fgets(line, sizeof(line), f)) { + line[strlen(line) - 1] = 0; // remove newline + fprintf(stderr, "%s\t", line); + } + pclose(f); + } + } else { + fprintf(stderr, " %p", frames[i]); + } + fprintf(stderr, "\n"); + } +} + +static void StackTraceHandler(int sig) { + // reset to default handler + signal(sig, SIG_DFL); + fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig)); + // skip the top three signal handler related frames + PrintStack(3); + // re-signal to default handler (so we still get core dump if needed...) + raise(sig); +} + +void InstallStackTraceHandler() { + // just use the plain old signal as it's simple and sufficient + // for this use case + signal(SIGILL, StackTraceHandler); + signal(SIGSEGV, StackTraceHandler); + signal(SIGBUS, StackTraceHandler); + signal(SIGABRT, StackTraceHandler); + + printf("Installed stack trace handler for SIGILL SIGSEGV SIGBUS SIGABRT\n"); + +} + +} // namespace rocksdb + +#else // no-op for non-linux system for now + +namespace rocksdb { + +void InstallStackTraceHandler() {} +void PrintStack(int first_frames_to_skip) {} + +} + +#endif // OS_LINUX diff --git a/port/win/stdint.h b/port/win/stdint.h new file mode 100644 index 00000000..39edd0db --- /dev/null +++ b/port/win/stdint.h @@ -0,0 +1,24 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// MSVC didn't ship with this file until the 2010 version. + +#ifndef STORAGE_LEVELDB_PORT_WIN_STDINT_H_ +#define STORAGE_LEVELDB_PORT_WIN_STDINT_H_ + +#if !defined(_MSC_VER) +#error This file should only be included when compiling with MSVC. +#endif + +// Define C99 equivalent types. 
+typedef signed char int8_t; +typedef signed short int16_t; +typedef signed int int32_t; +typedef signed long long int64_t; +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +#endif // STORAGE_LEVELDB_PORT_WIN_STDINT_H_ diff --git a/table/block.cc b/table/block.cc new file mode 100644 index 00000000..3f969fe2 --- /dev/null +++ b/table/block.cc @@ -0,0 +1,274 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Decodes the blocks generated by block_builder.cc. + +#include "table/block.h" + +#include +#include +#include "rocksdb/comparator.h" +#include "table/format.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace rocksdb { + +inline uint32_t Block::NumRestarts() const { + assert(size_ >= 2*sizeof(uint32_t)); + return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); +} + +Block::Block(const BlockContents& contents) + : data_(contents.data.data()), + size_(contents.data.size()), + owned_(contents.heap_allocated), + cachable_(contents.cachable), + compression_type_(contents.compression_type) { + if (size_ < sizeof(uint32_t)) { + size_ = 0; // Error marker + } else { + restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t); + if (restart_offset_ > size_ - sizeof(uint32_t)) { + // The size is too small for NumRestarts() and therefore + // restart_offset_ wrapped around. 
+ size_ = 0; + } + } +} + +Block::~Block() { + if (owned_) { + delete[] data_; + } +} + +// Helper routine: decode the next block entry starting at "p", +// storing the number of shared key bytes, non_shared key bytes, +// and the length of the value in "*shared", "*non_shared", and +// "*value_length", respectively. Will not dereference past "limit". +// +// If any errors are detected, returns nullptr. Otherwise, returns a +// pointer to the key delta (just past the three decoded values). +static inline const char* DecodeEntry(const char* p, const char* limit, + uint32_t* shared, + uint32_t* non_shared, + uint32_t* value_length) { + if (limit - p < 3) return nullptr; + *shared = reinterpret_cast(p)[0]; + *non_shared = reinterpret_cast(p)[1]; + *value_length = reinterpret_cast(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr; + } + + if (static_cast(limit - p) < (*non_shared + *value_length)) { + return nullptr; + } + return p; +} + +class Block::Iter : public Iterator { + private: + const Comparator* const comparator_; + const char* const data_; // underlying block contents + uint32_t const restarts_; // Offset of restart array (list of fixed32) + uint32_t const num_restarts_; // Number of uint32_t entries in restart array + + // current_ is offset in data_ of current entry. >= restarts_ if !Valid + uint32_t current_; + uint32_t restart_index_; // Index of restart block in which current_ falls + std::string key_; + Slice value_; + Status status_; + + inline int Compare(const Slice& a, const Slice& b) const { + return comparator_->Compare(a, b); + } + + // Return the offset in data_ just past the end of the current entry.
+ inline uint32_t NextEntryOffset() const { + return (value_.data() + value_.size()) - data_; + } + + uint32_t GetRestartPoint(uint32_t index) { + assert(index < num_restarts_); + return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); + } + + void SeekToRestartPoint(uint32_t index) { + key_.clear(); + restart_index_ = index; + // current_ will be fixed by ParseNextKey(); + + // ParseNextKey() starts at the end of value_, so set value_ accordingly + uint32_t offset = GetRestartPoint(index); + value_ = Slice(data_ + offset, 0); + } + + public: + Iter(const Comparator* comparator, + const char* data, + uint32_t restarts, + uint32_t num_restarts) + : comparator_(comparator), + data_(data), + restarts_(restarts), + num_restarts_(num_restarts), + current_(restarts_), + restart_index_(num_restarts_) { + assert(num_restarts_ > 0); + } + + virtual bool Valid() const { return current_ < restarts_; } + virtual Status status() const { return status_; } + virtual Slice key() const { + assert(Valid()); + return key_; + } + virtual Slice value() const { + assert(Valid()); + return value_; + } + + virtual void Next() { + assert(Valid()); + ParseNextKey(); + } + + virtual void Prev() { + assert(Valid()); + + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + + SeekToRestartPoint(restart_index_); + do { + // Loop until end of current entry hits the start of original entry + } while (ParseNextKey() && NextEntryOffset() < original); + } + + virtual void Seek(const Slice& target) { + // Binary search in restart array to find the first restart point + // with a key >= target + uint32_t left = 0; + uint32_t right = num_restarts_ - 1; + while (left < right) { + uint32_t mid = (left + right + 1) / 2; + uint32_t region_offset = 
GetRestartPoint(mid); + uint32_t shared, non_shared, value_length; + const char* key_ptr = DecodeEntry(data_ + region_offset, + data_ + restarts_, + &shared, &non_shared, &value_length); + if (key_ptr == nullptr || (shared != 0)) { + CorruptionError(); + return; + } + Slice mid_key(key_ptr, non_shared); + if (Compare(mid_key, target) < 0) { + // Key at "mid" is smaller than "target". Therefore all + // blocks before "mid" are uninteresting. + left = mid; + } else { + // Key at "mid" is >= "target". Therefore all blocks at or + // after "mid" are uninteresting. + right = mid - 1; + } + } + + // Linear search (within restart block) for first key >= target + SeekToRestartPoint(left); + while (true) { + if (!ParseNextKey()) { + return; + } + if (Compare(key_, target) >= 0) { + return; + } + } + } + + virtual void SeekToFirst() { + SeekToRestartPoint(0); + ParseNextKey(); + } + + virtual void SeekToLast() { + SeekToRestartPoint(num_restarts_ - 1); + while (ParseNextKey() && NextEntryOffset() < restarts_) { + // Keep skipping + } + } + + private: + void CorruptionError() { + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::Corruption("bad entry in block"); + key_.clear(); + value_.clear(); + } + + bool ParseNextKey() { + current_ = NextEntryOffset(); + const char* p = data_ + current_; + const char* limit = data_ + restarts_; // Restarts come right after data + if (p >= limit) { + // No more entries to return. Mark as invalid. 
+ current_ = restarts_; + restart_index_ = num_restarts_; + return false; + } + + // Decode next entry + uint32_t shared, non_shared, value_length; + p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); + if (p == nullptr || key_.size() < shared) { + CorruptionError(); + return false; + } else { + key_.resize(shared); + key_.append(p, non_shared); + value_ = Slice(p + non_shared, value_length); + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + return true; + } + } +}; + +Iterator* Block::NewIterator(const Comparator* cmp) { + if (size_ < 2*sizeof(uint32_t)) { + return NewErrorIterator(Status::Corruption("bad block contents")); + } + const uint32_t num_restarts = NumRestarts(); + if (num_restarts == 0) { + return NewEmptyIterator(); + } else { + return new Iter(cmp, data_, restart_offset_, num_restarts); + } +} + +} // namespace rocksdb diff --git a/table/block.h b/table/block.h new file mode 100644 index 00000000..7fac0065 --- /dev/null +++ b/table/block.h @@ -0,0 +1,51 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" + +namespace rocksdb { + +struct BlockContents; +class Comparator; + +class Block { + public: + // Initialize the block with the specified contents. 
+ explicit Block(const BlockContents& contents); + + ~Block(); + + size_t size() const { return size_; } + bool isCachable() const { return cachable_; } + CompressionType compressionType() const { return compression_type_; } + Iterator* NewIterator(const Comparator* comparator); + const char* data() { return data_; } + + private: + uint32_t NumRestarts() const; + + const char* data_; + size_t size_; + uint32_t restart_offset_; // Offset in data_ of restart array + bool owned_; // Block owns data_[] + bool cachable_; + CompressionType compression_type_; + + // No copying allowed + Block(const Block&); + void operator=(const Block&); + + class Iter; +}; + +} // namespace rocksdb diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc new file mode 100644 index 00000000..a5e546be --- /dev/null +++ b/table/block_based_table_builder.cc @@ -0,0 +1,559 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/block_based_table_builder.h" + +#include +#include +#include +#include + +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/table.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "table/block_based_table_reader.h" +#include "table/block.h" +#include "table/block_builder.h" +#include "table/filter_block.h" +#include "table/format.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +namespace { + +struct BytewiseLessThan { + bool operator()(const std::string& key1, const std::string& key2) const { + // smaller entries will be placed in front. + return comparator->Compare(key1, key2) <= 0; + } + const Comparator* comparator = BytewiseComparator(); +}; + +// When writing to a block that requires entries to be sorted by +// `BytewiseComparator`, we can buffer the content to `BytewiseSortedMap` +// before writng to store. +typedef std::map BytewiseSortedMap; + +void AddProperties(BytewiseSortedMap& props, std::string name, uint64_t val) { + assert(props.find(name) == props.end()); + + std::string dst; + PutVarint64(&dst, val); + + props.insert( + std::make_pair(name, dst) + ); +} + +static bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) { + // Check to see if compressed less than 12.5% + return compressed_size < raw_size - (raw_size / 8u); +} + +// Were we encounter any error occurs during user-defined statistics collection, +// we'll write the warning message to info log. 
+void LogPropertiesCollectionError( + Logger* info_log, const std::string& method, const std::string& name) { + assert(method == "Add" || method == "Finish"); + + std::string msg = + "[Warning] encountered error when calling TablePropertiesCollector::" + + method + "() with collector name: " + name; + Log(info_log, "%s", msg.c_str()); +} + +} // anonymous namespace + +struct BlockBasedTableBuilder::Rep { + Options options; + WritableFile* file; + uint64_t offset = 0; + Status status; + BlockBuilder data_block; + BlockBuilder index_block; + std::string last_key; + CompressionType compression_type; + TableProperties props; + + bool closed = false; // Either Finish() or Abandon() has been called. + FilterBlockBuilder* filter_block; + char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; + size_t compressed_cache_key_prefix_size; + + BlockHandle pending_handle; // Handle to add to index block + + std::string compressed_output; + std::unique_ptr flush_block_policy; + + Rep(const Options& opt, + WritableFile* f, + FlushBlockPolicyFactory* flush_block_policy_factory, + CompressionType compression_type) + : options(opt), + file(f), + data_block(options), + // To avoid linear scan, we make the block_restart_interval to be `1` + // in index block builder + index_block(1 /* block_restart_interval */, options.comparator), + compression_type(compression_type), + filter_block(opt.filter_policy == nullptr ? 
nullptr + : new FilterBlockBuilder(opt)), + flush_block_policy( + flush_block_policy_factory->NewFlushBlockPolicy(data_block)) { + } +}; + +BlockBasedTableBuilder::BlockBasedTableBuilder( + const Options& options, + WritableFile* file, + FlushBlockPolicyFactory* flush_block_policy_factory, + CompressionType compression_type) + : rep_(new Rep(options, + file, flush_block_policy_factory, compression_type)) { + if (rep_->filter_block != nullptr) { + rep_->filter_block->StartBlock(0); + } + if (options.block_cache_compressed.get() != nullptr) { + BlockBasedTable::GenerateCachePrefix( + options.block_cache_compressed.get(), file, + &rep_->compressed_cache_key_prefix[0], + &rep_->compressed_cache_key_prefix_size); + } +} + +BlockBasedTableBuilder::~BlockBasedTableBuilder() { + assert(rep_->closed); // Catch errors where caller forgot to call Finish() + delete rep_->filter_block; + delete rep_; +} + +void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { + Rep* r = rep_; + assert(!r->closed); + if (!ok()) return; + if (r->props.num_entries > 0) { + assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0); + } + + auto should_flush = r->flush_block_policy->Update(key, value); + if (should_flush) { + assert(!r->data_block.empty()); + Flush(); + + // Add item to index block. + // We do not emit the index entry for a block until we have seen the + // first key for the next data block. This allows us to use shorter + // keys in the index block. For example, consider a block boundary + // between the keys "the quick brown fox" and "the who". We can use + // "the r" as the key for the index block entry since it is >= all + // entries in the first block and < all entries in subsequent + // blocks. 
+ if (ok()) { + r->options.comparator->FindShortestSeparator(&r->last_key, key); + std::string handle_encoding; + r->pending_handle.EncodeTo(&handle_encoding); + r->index_block.Add(r->last_key, Slice(handle_encoding)); + } + } + + if (r->filter_block != nullptr) { + r->filter_block->AddKey(key); + } + + r->last_key.assign(key.data(), key.size()); + r->data_block.Add(key, value); + r->props.num_entries++; + r->props.raw_key_size += key.size(); + r->props.raw_value_size += value.size(); + + for (auto collector : r->options.table_properties_collectors) { + Status s = collector->Add(key, value); + if (!s.ok()) { + LogPropertiesCollectionError( + r->options.info_log.get(), + "Add", /* method */ + collector->Name() + ); + } + } +} + +void BlockBasedTableBuilder::Flush() { + Rep* r = rep_; + assert(!r->closed); + if (!ok()) return; + if (r->data_block.empty()) return; + WriteBlock(&r->data_block, &r->pending_handle); + if (ok()) { + r->status = r->file->Flush(); + } + if (r->filter_block != nullptr) { + r->filter_block->StartBlock(r->offset); + } + r->props.data_size = r->offset; + ++r->props.num_data_blocks; +} + +void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block, + BlockHandle* handle) { + // File format contains a sequence of blocks where each block has: + // block_data: uint8[n] + // type: uint8 + // crc: uint32 + assert(ok()); + Rep* r = rep_; + Slice raw = block->Finish(); + + Slice block_contents; + std::string* compressed = &r->compressed_output; + CompressionType type = r->compression_type; + switch (type) { + case kNoCompression: + block_contents = raw; + break; + + case kSnappyCompression: { + std::string* compressed = &r->compressed_output; + if (port::Snappy_Compress(r->options.compression_opts, raw.data(), + raw.size(), compressed) && + GoodCompressionRatio(compressed->size(), raw.size())) { + block_contents = *compressed; + } else { + // Snappy not supported, or not good compression ratio, so just + // store uncompressed form + block_contents = 
raw; + type = kNoCompression; + } + break; + } + case kZlibCompression: + if (port::Zlib_Compress(r->options.compression_opts, raw.data(), + raw.size(), compressed) && + GoodCompressionRatio(compressed->size(), raw.size())) { + block_contents = *compressed; + } else { + // Zlib not supported, or not good compression ratio, so just + // store uncompressed form + block_contents = raw; + type = kNoCompression; + } + break; + case kBZip2Compression: + if (port::BZip2_Compress(r->options.compression_opts, raw.data(), + raw.size(), compressed) && + GoodCompressionRatio(compressed->size(), raw.size())) { + block_contents = *compressed; + } else { + // BZip not supported, or not good compression ratio, so just + // store uncompressed form + block_contents = raw; + type = kNoCompression; + } + break; + } + WriteRawBlock(block_contents, type, handle); + r->compressed_output.clear(); + block->Reset(); +} + +void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, + CompressionType type, + BlockHandle* handle) { + Rep* r = rep_; + StopWatch sw(r->options.env, r->options.statistics.get(), + WRITE_RAW_BLOCK_MICROS); + handle->set_offset(r->offset); + handle->set_size(block_contents.size()); + r->status = r->file->Append(block_contents); + if (r->status.ok()) { + char trailer[kBlockTrailerSize]; + trailer[0] = type; + uint32_t crc = crc32c::Value(block_contents.data(), block_contents.size()); + crc = crc32c::Extend(crc, trailer, 1); // Extend crc to cover block type + EncodeFixed32(trailer+1, crc32c::Mask(crc)); + r->status = r->file->Append(Slice(trailer, kBlockTrailerSize)); + if (r->status.ok()) { + r->status = InsertBlockInCache(block_contents, type, handle); + } + if (r->status.ok()) { + r->offset += block_contents.size() + kBlockTrailerSize; + } + } +} + +Status BlockBasedTableBuilder::status() const { + return rep_->status; +} + +static void DeleteCachedBlock(const Slice& key, void* value) { + Block* block = reinterpret_cast(value); + delete block; +} + +// 
+// Make a copy of the block contents and insert into compressed block cache +// +Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle) { + Rep* r = rep_; + Cache* block_cache_compressed = r->options.block_cache_compressed.get(); + + if (type != kNoCompression && block_cache_compressed != nullptr) { + + Cache::Handle* cache_handle = nullptr; + size_t size = block_contents.size(); + + char* ubuf = new char[size]; // make a new copy + memcpy(ubuf, block_contents.data(), size); + + BlockContents results; + Slice sl(ubuf, size); + results.data = sl; + results.cachable = true; // XXX + results.heap_allocated = true; + results.compression_type = type; + + Block* block = new Block(results); + + // make cache key by appending the file offset to the cache prefix id + char* end = EncodeVarint64( + r->compressed_cache_key_prefix + + r->compressed_cache_key_prefix_size, + handle->offset()); + Slice key(r->compressed_cache_key_prefix, static_cast + (end - r->compressed_cache_key_prefix)); + + // Insert into compressed block cache. + cache_handle = block_cache_compressed->Insert(key, block, block->size(), + &DeleteCachedBlock); + block_cache_compressed->Release(cache_handle); + + // Invalidate OS cache. 
+ r->file->InvalidateCache(r->offset, size); + } + return Status::OK(); +} + +Status BlockBasedTableBuilder::Finish() { + Rep* r = rep_; + bool empty_data_block = r->data_block.empty(); + Flush(); + assert(!r->closed); + r->closed = true; + + BlockHandle filter_block_handle, + metaindex_block_handle, + index_block_handle; + + // Write filter block + if (ok() && r->filter_block != nullptr) { + auto filter_contents = r->filter_block->Finish(); + r->props.filter_size = filter_contents.size(); + WriteRawBlock(filter_contents, kNoCompression, &filter_block_handle); + } + + // To make sure properties block is able to keep the accurate size of index + // block, we will finish writing all index entries here and flush them + // to storage after metaindex block is written. + if (ok() && !empty_data_block) { + r->options.comparator->FindShortSuccessor(&r->last_key); + + std::string handle_encoding; + r->pending_handle.EncodeTo(&handle_encoding); + r->index_block.Add(r->last_key, handle_encoding); + } + + // Write meta blocks and metaindex block with the following order. + // 1. [meta block: filter] + // 2. [meta block: properties] + // 3. [metaindex block] + if (ok()) { + // We use `BytewiseComparator` as the comparator for meta block. + BlockBuilder meta_index_block( + r->options.block_restart_interval, + BytewiseComparator() + ); + // Key: meta block name + // Value: block handle to that meta block + BytewiseSortedMap meta_block_handles; + + // Write filter block. + if (r->filter_block != nullptr) { + // Add mapping from ".Name" to location + // of filter data. + std::string key = BlockBasedTable::kFilterBlockPrefix; + key.append(r->options.filter_policy->Name()); + std::string handle_encoding; + filter_block_handle.EncodeTo(&handle_encoding); + meta_block_handles.insert( + std::make_pair(key, handle_encoding) + ); + } + + // Write properties block. 
+ { + BlockBuilder properties_block( + r->options.block_restart_interval, + BytewiseComparator() + ); + + BytewiseSortedMap properties; + + // Add basic properties + AddProperties( + properties, + BlockBasedTablePropertiesNames::kRawKeySize, + r->props.raw_key_size + ); + AddProperties( + properties, + BlockBasedTablePropertiesNames::kRawValueSize, + r->props.raw_value_size + ); + AddProperties( + properties, + BlockBasedTablePropertiesNames::kDataSize, + r->props.data_size + ); + r->props.index_size = + r->index_block.CurrentSizeEstimate() + kBlockTrailerSize; + AddProperties( + properties, + BlockBasedTablePropertiesNames::kIndexSize, + r->props.index_size + ); + AddProperties( + properties, + BlockBasedTablePropertiesNames::kNumEntries, + r->props.num_entries + ); + AddProperties( + properties, + BlockBasedTablePropertiesNames::kNumDataBlocks, + r->props.num_data_blocks); + if (r->filter_block != nullptr) { + properties.insert({ + BlockBasedTablePropertiesNames::kFilterPolicy, + r->options.filter_policy->Name() + }); + } + AddProperties( + properties, + BlockBasedTablePropertiesNames::kFilterSize, + r->props.filter_size + ); + + for (auto collector : r->options.table_properties_collectors) { + TableProperties::UserCollectedProperties user_collected_properties; + Status s = + collector->Finish(&user_collected_properties); + + if (!s.ok()) { + LogPropertiesCollectionError( + r->options.info_log.get(), + "Finish", /* method */ + collector->Name() + ); + } else { + properties.insert( + user_collected_properties.begin(), + user_collected_properties.end() + ); + } + } + + for (const auto& stat : properties) { + properties_block.Add(stat.first, stat.second); + } + + BlockHandle properties_block_handle; + WriteBlock(&properties_block, &properties_block_handle); + + std::string handle_encoding; + properties_block_handle.EncodeTo(&handle_encoding); + meta_block_handles.insert( + { BlockBasedTable::kPropertiesBlock, handle_encoding } + ); + } // end of properties block 
writing + + for (const auto& metablock : meta_block_handles) { + meta_index_block.Add(metablock.first, metablock.second); + } + + WriteBlock(&meta_index_block, &metaindex_block_handle); + } // meta blocks and metaindex block. + + // Write index block + if (ok()) { + WriteBlock(&r->index_block, &index_block_handle); + } + + // Write footer + if (ok()) { + Footer footer; + footer.set_metaindex_handle(metaindex_block_handle); + footer.set_index_handle(index_block_handle); + std::string footer_encoding; + footer.EncodeTo(&footer_encoding); + r->status = r->file->Append(footer_encoding); + if (r->status.ok()) { + r->offset += footer_encoding.size(); + } + } + + // Print out the table stats + if (ok()) { + // user collected properties + std::string user_collected; + user_collected.reserve(1024); + for (auto collector : r->options.table_properties_collectors) { + for (const auto& prop : collector->GetReadableProperties()) { + user_collected.append(prop.first); + user_collected.append("="); + user_collected.append(prop.second); + user_collected.append("; "); + } + } + + Log( + r->options.info_log, + "Table was constructed:\n" + " [basic properties]: %s\n" + " [user collected properties]: %s", + r->props.ToString().c_str(), + user_collected.c_str() + ); + } + + return r->status; +} + +void BlockBasedTableBuilder::Abandon() { + Rep* r = rep_; + assert(!r->closed); + r->closed = true; +} + +uint64_t BlockBasedTableBuilder::NumEntries() const { + return rep_->props.num_entries; +} + +uint64_t BlockBasedTableBuilder::FileSize() const { + return rep_->offset; +} + +} // namespace rocksdb diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h new file mode 100644 index 00000000..517f8e78 --- /dev/null +++ b/table/block_based_table_builder.h @@ -0,0 +1,85 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +class BlockBuilder; +class BlockHandle; +class WritableFile; + + +class BlockBasedTableBuilder : public TableBuilder { + public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). + BlockBasedTableBuilder(const Options& options, + WritableFile* file, + FlushBlockPolicyFactory* flush_block_policy_factory, + CompressionType compression_type); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~BlockBasedTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. 
+ uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + + private: + bool ok() const { return status().ok(); } + void WriteBlock(BlockBuilder* block, BlockHandle* handle); + void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle); + Status InsertBlockInCache(const Slice& block_contents, + const CompressionType type, const BlockHandle* handle); + struct Rep; + Rep* rep_; + + // Advanced operation: flush any buffered key/value pairs to file. + // Can be used to ensure that two adjacent entries never live in + // the same data block. Most clients should not need to use this method. + // REQUIRES: Finish(), Abandon() have not been called + void Flush(); + + // No copying allowed + BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete; + void operator=(const BlockBasedTableBuilder&) = delete; +}; + +} // namespace rocksdb + diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc new file mode 100644 index 00000000..836f6edf --- /dev/null +++ b/table/block_based_table_factory.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ + +#include "table/block_based_table_factory.h" + +#include +#include +#include "table/block_based_table_builder.h" +#include "table/block_based_table_reader.h" +#include "port/port.h" + +namespace rocksdb { + +Status BlockBasedTableFactory::GetTableReader( + const Options& options, const EnvOptions& soptions, + unique_ptr && file, uint64_t file_size, + unique_ptr* table_reader) const { + return BlockBasedTable::Open(options, soptions, std::move(file), file_size, + table_reader); +} + +TableBuilder* BlockBasedTableFactory::GetTableBuilder( + const Options& options, WritableFile* file, + CompressionType compression_type) const { + auto flush_block_policy_factory = + table_options_.flush_block_policy_factory.get(); + + // if flush block policy factory is not set, we'll create the default one + // from the options. + // + // NOTE: we cannot pre-cache the "default block policy factory" because + // `FlushBlockBySizePolicyFactory` takes `options.block_size` and + // `options.block_size_deviation` as parameters, which may be different + // every time. + if (flush_block_policy_factory == nullptr) { + flush_block_policy_factory = + new FlushBlockBySizePolicyFactory(options.block_size, + options.block_size_deviation); + } + + auto table_builder = new BlockBasedTableBuilder( + options, + file, + flush_block_policy_factory, + compression_type); + + // Delete flush_block_policy_factory only when it's just created from the + // options. + // We can safely delete flush_block_policy_factory since it will only be used + // during the construction of `BlockBasedTableBuilder`. 
+ if (flush_block_policy_factory != + table_options_.flush_block_policy_factory.get()) { + delete flush_block_policy_factory; + } + + return table_builder; +} + +} // namespace rocksdb diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h new file mode 100644 index 00000000..ee525816 --- /dev/null +++ b/table/block_based_table_factory.h @@ -0,0 +1,69 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +struct Options; +struct EnvOptions; + +using std::unique_ptr; +class Status; +class RandomAccessFile; +class WritableFile; +class Table; +class TableBuilder; +class BlockBasedTable; +class BlockBasedTableBuilder; + +class BlockBasedTableFactory: public TableFactory { +public: + struct TableOptions { + // @flush_block_policy_factory creates the instances of flush block policy. + // which provides a configurable way to determine when to flush a block in + // the block based tables. If not set, table builder will use the default + // block flush policy, which cut blocks by block size (please refer to + // `FlushBlockBySizePolicy`). 
+ std::shared_ptr flush_block_policy_factory; + }; + + BlockBasedTableFactory() : BlockBasedTableFactory(TableOptions()) { } + BlockBasedTableFactory(const TableOptions& table_options): + table_options_(table_options) { + } + + ~BlockBasedTableFactory() { + } + + const char* Name() const override { + return "BlockBasedTable"; + } + + Status GetTableReader(const Options& options, const EnvOptions& soptions, + unique_ptr && file, + uint64_t file_size, + unique_ptr* table_reader) const override; + + TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, + CompressionType compression_type) const + override; + + private: + TableOptions table_options_; +}; + + +} // namespace rocksdb diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc new file mode 100644 index 00000000..dcb55fc3 --- /dev/null +++ b/table/block_based_table_reader.cc @@ -0,0 +1,1099 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/block_based_table_reader.h" + +#include "db/dbformat.h" + +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" + +#include "table/block.h" +#include "table/filter_block.h" +#include "table/format.h" +#include "table/two_level_iterator.h" + +#include "util/coding.h" +#include "util/perf_context_imp.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +// The longest the prefix of the cache key used to identify blocks can be. +// We are using the fact that we know for Posix files the unique ID is three +// varints. +const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1; +using std::unique_ptr; + +struct BlockBasedTable::Rep { + Rep(const EnvOptions& storage_options) : + soptions(storage_options) { + } + + Options options; + const EnvOptions& soptions; + Status status; + unique_ptr file; + char cache_key_prefix[kMaxCacheKeyPrefixSize]; + size_t cache_key_prefix_size; + char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize]; + size_t compressed_cache_key_prefix_size; + + // Handle to metaindex_block: saved from footer + BlockHandle metaindex_handle; + // Handle to index: saved from footer + BlockHandle index_handle; + // index_block will be populated and used only when options.block_cache is + // NULL; otherwise we will get the index block via the block cache. + unique_ptr index_block; + unique_ptr filter; + + TableProperties table_properties; +}; + +BlockBasedTable::~BlockBasedTable() { + delete rep_; +} + +// CachableEntry represents the entries that *may* be fetched from block cache. +// field `value` is the item we want to get. +// field `cache_handle` is the cache handle to the block cache. If the value +// was not read from cache, `cache_handle` will be nullptr. 
+template +struct BlockBasedTable::CachableEntry { + CachableEntry(TValue* value, Cache::Handle* cache_handle) + : value(value) + , cache_handle(cache_handle) { + } + CachableEntry(): CachableEntry(nullptr, nullptr) { } + void Release(Cache* cache) { + if (cache_handle) { + cache->Release(cache_handle); + value = nullptr; + cache_handle = nullptr; + } + } + + TValue* value = nullptr; + // if the entry is from the cache, cache_handle will be populated. + Cache::Handle* cache_handle = nullptr; +}; + +// Helper function to setup the cache key's prefix for the Table. +void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { + assert(kMaxCacheKeyPrefixSize >= 10); + rep->cache_key_prefix_size = 0; + rep->compressed_cache_key_prefix_size = 0; + if (rep->options.block_cache != nullptr) { + GenerateCachePrefix(rep->options.block_cache.get(), rep->file.get(), + &rep->cache_key_prefix[0], + &rep->cache_key_prefix_size); + } + if (rep->options.block_cache_compressed != nullptr) { + GenerateCachePrefix(rep->options.block_cache_compressed.get(), + rep->file.get(), &rep->compressed_cache_key_prefix[0], + &rep->compressed_cache_key_prefix_size); + } +} + +void BlockBasedTable::GenerateCachePrefix(Cache* cc, + RandomAccessFile* file, char* buffer, size_t* size) { + + // generate an id from the file + *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); + + // If the prefix wasn't generated or was too long, + // create one from the cache. + if (*size == 0) { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast(end - buffer); + } +} + +void BlockBasedTable::GenerateCachePrefix(Cache* cc, + WritableFile* file, char* buffer, size_t* size) { + + // generate an id from the file + *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); + + // If the prefix wasn't generated or was too long, + // create one from the cache. 
+ if (*size == 0) { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast(end - buffer); + } +} + +namespace { // anonymous namespace, not visible externally + +// Read the block identified by "handle" from "file". +// The only relevant option is options.verify_checksums for now. +// Set *didIO to true if didIO is not null. +// On failure return non-OK. +// On success fill *result and return OK - caller owns *result +Status ReadBlockFromFile( + RandomAccessFile* file, + const ReadOptions& options, + const BlockHandle& handle, + Block** result, + Env* env, + bool* didIO = nullptr, + bool do_uncompress = true) { + BlockContents contents; + Status s = ReadBlockContents(file, options, handle, &contents, + env, do_uncompress); + if (s.ok()) { + *result = new Block(contents); + } + + if (didIO) { + *didIO = true; + } + return s; +} + +void DeleteBlock(void* arg, void* ignored) { + delete reinterpret_cast(arg); +} + +void DeleteCachedBlock(const Slice& key, void* value) { + Block* block = reinterpret_cast(value); + delete block; +} + +void DeleteCachedFilter(const Slice& key, void* value) { + auto filter = reinterpret_cast(value); + delete filter; +} + +void ReleaseBlock(void* arg, void* h) { + Cache* cache = reinterpret_cast(arg); + Cache::Handle* handle = reinterpret_cast(h); + cache->Release(handle); +} + +Slice GetCacheKey(const char* cache_key_prefix, + size_t cache_key_prefix_size, + const BlockHandle& handle, + char* cache_key) { + assert(cache_key != nullptr); + assert(cache_key_prefix_size != 0); + assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize); + memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); + char* end = EncodeVarint64(cache_key + cache_key_prefix_size, + handle.offset()); + return Slice(cache_key, static_cast(end - cache_key)); +} + +Cache::Handle* GetFromBlockCache( + Cache* block_cache, + const Slice& key, + Tickers block_cache_miss_ticker, + Tickers block_cache_hit_ticker, + Statistics* statistics) { + auto 
cache_handle = block_cache->Lookup(key); + if (cache_handle != nullptr) { + BumpPerfCount(&perf_context.block_cache_hit_count); + // overall cache hit + RecordTick(statistics, BLOCK_CACHE_HIT); + // block-type specific cache hit + RecordTick(statistics, block_cache_hit_ticker); + } else { + // overall cache miss + RecordTick(statistics, BLOCK_CACHE_MISS); + // block-type specific cache miss + RecordTick(statistics, block_cache_miss_ticker); + } + + return cache_handle; +} + +} // end of anonymous namespace + +Status BlockBasedTable::Open(const Options& options, + const EnvOptions& soptions, + unique_ptr && file, + uint64_t size, + unique_ptr* table_reader) { + table_reader->reset(); + if (size < Footer::kEncodedLength) { + return Status::InvalidArgument("file is too short to be an sstable"); + } + + char footer_space[Footer::kEncodedLength]; + Slice footer_input; + Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength, + &footer_input, footer_space); + if (!s.ok()) return s; + + // Check that we actually read the whole footer from the file. It may be + // that size isn't correct. + if (footer_input.size() != Footer::kEncodedLength) { + return Status::InvalidArgument("file is too short to be an sstable"); + } + + Footer footer; + s = footer.DecodeFrom(&footer_input); + if (!s.ok()) return s; + + // We've successfully read the footer and the index block: we're + // ready to serve requests. 
+ Rep* rep = new BlockBasedTable::Rep(soptions); + rep->options = options; + rep->file = std::move(file); + rep->metaindex_handle = footer.metaindex_handle(); + rep->index_handle = footer.index_handle(); + SetupCacheKeyPrefix(rep); + unique_ptr new_table(new BlockBasedTable(rep)); + + // Read meta index + std::unique_ptr meta; + std::unique_ptr meta_iter; + s = ReadMetaBlock(rep, &meta, &meta_iter); + + // Read the properties + meta_iter->Seek(kPropertiesBlock); + if (meta_iter->Valid() && meta_iter->key() == Slice(kPropertiesBlock)) { + s = meta_iter->status(); + if (s.ok()) { + s = ReadProperties(meta_iter->value(), rep, &rep->table_properties); + } + + if (!s.ok()) { + auto err_msg = + "[Warning] Encountered error while reading data from properties " + "block " + s.ToString(); + Log(rep->options.info_log, "%s", err_msg.c_str()); + } + } + + // Initialize index/filter blocks. If block cache is not specified, + // these blocks will be kept in member variables in Rep, which will + // reside in the memory as long as this table object is alive; otherwise + // they will be added to block cache. 
+ if (!options.block_cache) { + Block* index_block = nullptr; + // TODO: we never really verify check sum for index block + s = ReadBlockFromFile( + rep->file.get(), + ReadOptions(), + footer.index_handle(), + &index_block, + options.env + ); + + if (s.ok()) { + assert(index_block->compressionType() == kNoCompression); + rep->index_block.reset(index_block); + + // Set index block + if (rep->options.filter_policy) { + std::string key = kFilterBlockPrefix; + key.append(rep->options.filter_policy->Name()); + meta_iter->Seek(key); + + if (meta_iter->Valid() && meta_iter->key() == Slice(key)) { + rep->filter.reset(ReadFilter(meta_iter->value(), rep)); + } + } + } else { + delete index_block; + } + } else { + // Call IndexBlockReader() to implicitly add index to the block_cache + unique_ptr iter( + new_table->IndexBlockReader(ReadOptions()) + ); + s = iter->status(); + + if (s.ok()) { + // Call GetFilter() to implicitly add filter to the block_cache + auto filter_entry = new_table->GetFilter(); + filter_entry.Release(options.block_cache.get()); + } + } + + if (s.ok()) { + *table_reader = std::move(new_table); + } + + return s; +} + +void BlockBasedTable::SetupForCompaction() { + switch (rep_->options.access_hint_on_compaction_start) { + case Options::NONE: + break; + case Options::NORMAL: + rep_->file->Hint(RandomAccessFile::NORMAL); + break; + case Options::SEQUENTIAL: + rep_->file->Hint(RandomAccessFile::SEQUENTIAL); + break; + case Options::WILLNEED: + rep_->file->Hint(RandomAccessFile::WILLNEED); + break; + default: + assert(false); + } + compaction_optimized_ = true; +} + +TableProperties& BlockBasedTable::GetTableProperties() { + return rep_->table_properties; +} + +// Load the meta-block from the file. On success, return the loaded meta block +// and its iterator. 
+Status BlockBasedTable::ReadMetaBlock( + Rep* rep, + std::unique_ptr* meta_block, + std::unique_ptr* iter) { + // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates + // it is an empty block. + // TODO: we never really verify check sum for meta index block + Block* meta = nullptr; + Status s = ReadBlockFromFile( + rep->file.get(), + ReadOptions(), + rep->metaindex_handle, + &meta, + rep->options.env); + + if (!s.ok()) { + auto err_msg = + "[Warning] Encountered error while reading data from properties" + "block " + s.ToString(); + Log(rep->options.info_log, "%s", err_msg.c_str()); + } + if (!s.ok()) { + delete meta; + return s; + } + + meta_block->reset(meta); + // meta block uses bytewise comparator. + iter->reset(meta->NewIterator(BytewiseComparator())); + return Status::OK(); +} + +FilterBlockReader* BlockBasedTable::ReadFilter ( + const Slice& filter_handle_value, + BlockBasedTable::Rep* rep, + size_t* filter_size) { + Slice v = filter_handle_value; + BlockHandle filter_handle; + if (!filter_handle.DecodeFrom(&v).ok()) { + return nullptr; + } + + // TODO: We might want to unify with ReadBlockFromFile() if we start + // requiring checksum verification in Table::Open. 
+ ReadOptions opt; + BlockContents block; + if (!ReadBlockContents(rep->file.get(), opt, filter_handle, &block, + rep->options.env, false).ok()) { + return nullptr; + } + + if (filter_size) { + *filter_size = block.data.size(); + } + + return new FilterBlockReader( + rep->options, block.data, block.heap_allocated); +} + +Status BlockBasedTable::ReadProperties( + const Slice& handle_value, Rep* rep, TableProperties* table_properties) { + assert(table_properties); + + Slice v = handle_value; + BlockHandle handle; + if (!handle.DecodeFrom(&v).ok()) { + return Status::InvalidArgument("Failed to decode properties block handle"); + } + + BlockContents block_contents; + Status s = ReadBlockContents( + rep->file.get(), + ReadOptions(), + handle, + &block_contents, + rep->options.env, + false + ); + + if (!s.ok()) { + return s; + } + + Block properties_block(block_contents); + std::unique_ptr iter( + properties_block.NewIterator(BytewiseComparator()) + ); + + // All pre-defined properties of type uint64_t + std::unordered_map predefined_uint64_properties = { + { BlockBasedTablePropertiesNames::kDataSize, + &table_properties->data_size }, + { BlockBasedTablePropertiesNames::kIndexSize, + &table_properties->index_size }, + { BlockBasedTablePropertiesNames::kFilterSize, + &table_properties->filter_size }, + { BlockBasedTablePropertiesNames::kRawKeySize, + &table_properties->raw_key_size }, + { BlockBasedTablePropertiesNames::kRawValueSize, + &table_properties->raw_value_size }, + { BlockBasedTablePropertiesNames::kNumDataBlocks, + &table_properties->num_data_blocks }, + { BlockBasedTablePropertiesNames::kNumEntries, + &table_properties->num_entries }, + }; + + std::string last_key; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + s = iter->status(); + if (!s.ok()) { + break; + } + + auto key = iter->key().ToString(); + // properties block is strictly sorted with no duplicate key. 
+ assert( + last_key.empty() || + BytewiseComparator()->Compare(key, last_key) > 0 + ); + last_key = key; + + auto raw_val = iter->value(); + auto pos = predefined_uint64_properties.find(key); + + if (pos != predefined_uint64_properties.end()) { + // handle predefined rocksdb properties + uint64_t val; + if (!GetVarint64(&raw_val, &val)) { + // skip malformed value + auto error_msg = + "[Warning] detect malformed value in properties meta-block:" + "\tkey: " + key + "\tval: " + raw_val.ToString(); + Log(rep->options.info_log, "%s", error_msg.c_str()); + continue; + } + *(pos->second) = val; + } else if (key == BlockBasedTablePropertiesNames::kFilterPolicy) { + table_properties->filter_policy_name = raw_val.ToString(); + } else { + // handle user-collected + table_properties->user_collected_properties.insert( + std::make_pair(key, raw_val.ToString()) + ); + } + } + + return s; +} + +Status BlockBasedTable::GetBlock( + const BlockBasedTable* table, + const BlockHandle& handle, + const ReadOptions& options, + const bool for_compaction, + const Tickers block_cache_miss_ticker, + const Tickers block_cache_hit_ticker, + bool* didIO, + CachableEntry* entry) { + bool no_io = options.read_tier == kBlockCacheTier; + Cache* block_cache = table->rep_->options.block_cache.get(); + Statistics* statistics = table->rep_->options.statistics.get(); + Status s; + + if (block_cache != nullptr) { + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = GetCacheKey( + table->rep_->cache_key_prefix, + table->rep_->cache_key_prefix_size, + handle, + cache_key + ); + + entry->cache_handle = GetFromBlockCache( + block_cache, + key, + block_cache_miss_ticker, + block_cache_hit_ticker, + statistics + ); + + if (entry->cache_handle != nullptr) { + entry->value = + reinterpret_cast(block_cache->Value(entry->cache_handle)); + } else if (no_io) { + // Did not find in block_cache and can't do IO + return Status::Incomplete("no blocking io"); + } else { + Histograms histogram = 
for_compaction ? + READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS; + { + // block for stop watch + StopWatch sw(table->rep_->options.env, statistics, histogram); + s = ReadBlockFromFile( + table->rep_->file.get(), + options, + handle, + &entry->value, + table->rep_->options.env, + didIO + ); + } + if (s.ok()) { + if (options.fill_cache && entry->value->isCachable()) { + entry->cache_handle = block_cache->Insert( + key, entry->value, entry->value->size(), &DeleteCachedBlock); + RecordTick(statistics, BLOCK_CACHE_ADD); + } + } + } + } else if (no_io) { + // Could not read from block_cache and can't do IO + return Status::Incomplete("no blocking io"); + } else { + s = ReadBlockFromFile( + table->rep_->file.get(), + options, + handle, + &entry->value, + table->rep_->options.env, + didIO + ); + } + + return s; +} + +// Convert an index iterator value (i.e., an encoded BlockHandle) +// into an iterator over the contents of the corresponding block. +Iterator* BlockBasedTable::BlockReader(void* arg, + const ReadOptions& options, + const Slice& index_value, + bool* didIO, + bool for_compaction) { + const bool no_io = (options.read_tier == kBlockCacheTier); + BlockBasedTable* table = reinterpret_cast(arg); + Cache* block_cache = table->rep_->options.block_cache.get(); + Cache* block_cache_compressed = table->rep_->options. + block_cache_compressed.get(); + Statistics* statistics = table->rep_->options.statistics.get(); + Block* block = nullptr; + Block* cblock = nullptr; + Cache::Handle* cache_handle = nullptr; + Cache::Handle* compressed_cache_handle = nullptr; + + BlockHandle handle; + Slice input = index_value; + Status s = handle.DecodeFrom(&input); + // We intentionally allow extra stuff in index_value so that we + // can add more features in the future. 
+ + if (!s.ok()) { + return NewErrorIterator(s); + } + + if (block_cache != nullptr || block_cache_compressed != nullptr) { + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice key, /* key to the block cache */ + ckey /* key to the compressed block cache */ ; + + // create key for block cache + if (block_cache != nullptr) { + key = GetCacheKey( + table->rep_->cache_key_prefix, + table->rep_->cache_key_prefix_size, + handle, + cache_key + ); + } + + if (block_cache_compressed != nullptr) { + ckey = GetCacheKey( + table->rep_->compressed_cache_key_prefix, + table->rep_->compressed_cache_key_prefix_size, + handle, + compressed_cache_key + ); + } + + // Lookup uncompressed cache first + if (block_cache != nullptr) { + assert(!key.empty()); + cache_handle = block_cache->Lookup(key); + if (cache_handle != nullptr) { + block = reinterpret_cast(block_cache->Value(cache_handle)); + RecordTick(statistics, BLOCK_CACHE_HIT); + RecordTick(statistics, BLOCK_CACHE_DATA_HIT); + } else { + RecordTick(statistics, BLOCK_CACHE_MISS); + RecordTick(statistics, BLOCK_CACHE_DATA_MISS); + } + } + + // If not found in uncompressed cache, lookup compressed cache + if (block == nullptr && block_cache_compressed != nullptr) { + assert(!ckey.empty()); + compressed_cache_handle = block_cache_compressed->Lookup(ckey); + + // if we found in the compressed cache, then uncompress and + // insert into uncompressed cache + if (compressed_cache_handle != nullptr) { + // found compressed block + cblock = reinterpret_cast(block_cache_compressed-> + Value(compressed_cache_handle)); + assert(cblock->compressionType() != kNoCompression); + + // Retrieve the uncompressed contents into a new buffer + BlockContents contents; + s = UncompressBlockContents(cblock->data(), cblock->size(), + &contents); + + // Insert uncompressed block into block cache + if (s.ok()) { + block = new Block(contents); // uncompressed block + 
assert(block->compressionType() == kNoCompression); + if (block_cache != nullptr && block->isCachable() && + options.fill_cache) { + cache_handle = block_cache->Insert(key, block, block->size(), + &DeleteCachedBlock); + assert(reinterpret_cast(block_cache->Value(cache_handle)) + == block); + } + } + // Release hold on compressed cache entry + block_cache_compressed->Release(compressed_cache_handle); + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT); + } + } + + if (block != nullptr) { + BumpPerfCount(&perf_context.block_cache_hit_count); + } else if (no_io) { + // Did not find in block_cache and can't do IO + return NewErrorIterator(Status::Incomplete("no blocking io")); + } else { + Histograms histogram = for_compaction ? + READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS; + { // block for stop watch + StopWatch sw(table->rep_->options.env, statistics, histogram); + s = ReadBlockFromFile( + table->rep_->file.get(), + options, + handle, + &cblock, + table->rep_->options.env, + didIO, + block_cache_compressed == nullptr + ); + } + if (s.ok()) { + assert(cblock->compressionType() == kNoCompression || + block_cache_compressed != nullptr); + + // Retrieve the uncompressed contents into a new buffer + BlockContents contents; + if (cblock->compressionType() != kNoCompression) { + s = UncompressBlockContents(cblock->data(), cblock->size(), + &contents); + } + if (s.ok()) { + if (cblock->compressionType() != kNoCompression) { + block = new Block(contents); // uncompressed block + } else { + block = cblock; + cblock = nullptr; + } + if (block->isCachable() && options.fill_cache) { + // Insert compressed block into compressed block cache. + // Release the hold on the compressed cache entry immediately. 
+ if (block_cache_compressed != nullptr && cblock != nullptr) { + compressed_cache_handle = block_cache_compressed->Insert( + ckey, cblock, cblock->size(), &DeleteCachedBlock); + block_cache_compressed->Release(compressed_cache_handle); + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS); + cblock = nullptr; + } + // insert into uncompressed block cache + assert((block->compressionType() == kNoCompression)); + if (block_cache != nullptr) { + cache_handle = block_cache->Insert( + key, block, block->size(), &DeleteCachedBlock); + RecordTick(statistics, BLOCK_CACHE_ADD); + assert(reinterpret_cast(block_cache->Value( + cache_handle))== block); + } + } + } + } + if (cblock != nullptr) { + delete cblock; + } + } + } else if (no_io) { + // Could not read from block_cache and can't do IO + return NewErrorIterator(Status::Incomplete("no blocking io")); + } else { + s = ReadBlockFromFile( + table->rep_->file.get(), + options, + handle, + &block, + table->rep_->options.env, + didIO + ); + } + + Iterator* iter; + if (block != nullptr) { + iter = block->NewIterator(table->rep_->options.comparator); + if (cache_handle != nullptr) { + iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle); + } else { + iter->RegisterCleanup(&DeleteBlock, block, nullptr); + } + } else { + iter = NewErrorIterator(s); + } + return iter; +} + +BlockBasedTable::CachableEntry +BlockBasedTable::GetFilter(bool no_io) const { + if (!rep_->options.filter_policy || !rep_->options.block_cache) { + return {rep_->filter.get(), nullptr}; + } + + // Fetching from the cache + Cache* block_cache = rep_->options.block_cache.get(); + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = GetCacheKey( + rep_->cache_key_prefix, + rep_->cache_key_prefix_size, + rep_->metaindex_handle, + cache_key + ); + + Statistics* statistics = rep_->options.statistics.get(); + auto cache_handle = GetFromBlockCache( + block_cache, + key, + BLOCK_CACHE_FILTER_MISS, + BLOCK_CACHE_FILTER_HIT, + statistics + 
); + + FilterBlockReader* filter = nullptr; + if (cache_handle != nullptr) { + filter = reinterpret_cast( + block_cache->Value(cache_handle)); + } else if (no_io) { + // Do not invoke any io. + return CachableEntry(); + } else { + size_t filter_size = 0; + std::unique_ptr meta; + std::unique_ptr iter; + auto s = ReadMetaBlock(rep_, &meta, &iter); + + if (s.ok()) { + std::string filter_block_key = kFilterBlockPrefix; + filter_block_key.append(rep_->options.filter_policy->Name()); + iter->Seek(filter_block_key); + + if (iter->Valid() && iter->key() == Slice(filter_block_key)) { + filter = ReadFilter(iter->value(), rep_, &filter_size); + assert(filter); + assert(filter_size > 0); + + cache_handle = block_cache->Insert( + key, filter, filter_size, &DeleteCachedFilter); + RecordTick(statistics, BLOCK_CACHE_ADD); + } + } + } + + return { filter, cache_handle }; +} + +// Get the iterator from the index block. +Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const { + if (rep_->index_block) { + assert (!rep_->options.block_cache); + return rep_->index_block->NewIterator(rep_->options.comparator); + } + + // get index block from cache + assert (rep_->options.block_cache); + bool didIO = false; + CachableEntry entry; + + auto s = GetBlock( + this, + rep_->index_handle, + options, + false, /* for compaction */ + BLOCK_CACHE_INDEX_MISS, + BLOCK_CACHE_INDEX_HIT, + &didIO, + &entry + ); + + Iterator* iter; + if (entry.value != nullptr) { + iter = entry.value->NewIterator(rep_->options.comparator); + if (entry.cache_handle) { + iter->RegisterCleanup( + &ReleaseBlock, rep_->options.block_cache.get(), entry.cache_handle + ); + } else { + iter->RegisterCleanup(&DeleteBlock, entry.value, nullptr); + } + } else { + iter = NewErrorIterator(s); + } + return iter; +} + +Iterator* BlockBasedTable::BlockReader(void* arg, + const ReadOptions& options, + const EnvOptions& soptions, + const Slice& index_value, + bool for_compaction) { + return BlockReader(arg, options, 
index_value, nullptr, for_compaction); +} + +// This will be broken if the user specifies an unusual implementation +// of Options.comparator, or if the user specifies an unusual +// definition of prefixes in Options.filter_policy. In particular, we +// require the following three properties: +// +// 1) key.starts_with(prefix(key)) +// 2) Compare(prefix(key), key) <= 0. +// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0 +// +// Otherwise, this method guarantees no I/O will be incurred. +// +// REQUIRES: this method shouldn't be called while the DB lock is held. +bool BlockBasedTable::PrefixMayMatch(const Slice& internal_prefix) { + bool may_match = true; + Status s; + + if (!rep_->options.filter_policy) { + return true; + } + + // To prevent any io operation in this method, we set `read_tier` to make + // sure we always read index or filter only when they have already been + // loaded to memory. + ReadOptions no_io_read_options; + no_io_read_options.read_tier = kBlockCacheTier; + unique_ptr iiter( + IndexBlockReader(no_io_read_options) + ); + iiter->Seek(internal_prefix); + + if (!iiter->Valid()) { + // we're past end of file + // if it's incomplete, it means that we avoided I/O + // and we're not really sure that we're past the end + // of the file + may_match = iiter->status().IsIncomplete(); + } else if (ExtractUserKey(iiter->key()).starts_with( + ExtractUserKey(internal_prefix))) { + // we need to check for this subtle case because our only + // guarantee is that "the key is a string >= last key in that data + // block" according to the doc/table_format.txt spec. + // + // Suppose iiter->key() starts with the desired prefix; it is not + // necessarily the case that the corresponding data block will + // contain the prefix, since iiter->key() need not be in the + // block. However, the next data block may contain the prefix, so + // we return true to play it safe. 
+ may_match = true; + } else { + // iiter->key() does NOT start with the desired prefix. Because + // Seek() finds the first key that is >= the seek target, this + // means that iiter->key() > prefix. Thus, any data blocks coming + // after the data block corresponding to iiter->key() cannot + // possibly contain the key. Thus, the corresponding data block + // is the only one which could potentially contain the prefix. + Slice handle_value = iiter->value(); + BlockHandle handle; + s = handle.DecodeFrom(&handle_value); + assert(s.ok()); + auto filter_entry = GetFilter(true /* no io */); + may_match = + filter_entry.value == nullptr || + filter_entry.value->PrefixMayMatch(handle.offset(), internal_prefix); + filter_entry.Release(rep_->options.block_cache.get()); + } + + Statistics* statistics = rep_->options.statistics.get(); + RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); + if (!may_match) { + RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); + } + + return may_match; +} + +Iterator* BlockBasedTable::NewIterator(const ReadOptions& options) { + if (options.prefix) { + InternalKey internal_prefix(*options.prefix, 0, kTypeValue); + if (!PrefixMayMatch(internal_prefix.Encode())) { + // nothing in this file can match the prefix, so we should not + // bother doing I/O to this file when iterating. 
+ return NewEmptyIterator(); + } + } + + return NewTwoLevelIterator( + IndexBlockReader(options), + &BlockBasedTable::BlockReader, + const_cast(this), + options, + rep_->soptions + ); +} + +Status BlockBasedTable::Get( + const ReadOptions& readOptions, + const Slice& key, + void* handle_context, + bool (*result_handler)(void* handle_context, const Slice& k, + const Slice& v, bool didIO), + void (*mark_key_may_exist_handler)(void* handle_context)) { + Status s; + Iterator* iiter = IndexBlockReader(readOptions); + auto filter_entry = GetFilter(readOptions.read_tier == kBlockCacheTier); + FilterBlockReader* filter = filter_entry.value; + bool done = false; + for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { + Slice handle_value = iiter->value(); + + BlockHandle handle; + bool may_not_exist_in_filter = + filter != nullptr && + handle.DecodeFrom(&handle_value).ok() && + !filter->KeyMayMatch(handle.offset(), key); + + if (may_not_exist_in_filter) { + // Not found + // TODO: think about interaction with Merge. If a user key cannot + // cross one data block, we should be fine. 
+ RecordTick(rep_->options.statistics.get(), BLOOM_FILTER_USEFUL); + break; + } else { + bool didIO = false; + unique_ptr block_iter( + BlockReader(this, readOptions, iiter->value(), &didIO)); + + if (readOptions.read_tier && block_iter->status().IsIncomplete()) { + // couldn't get block from block_cache + // Update Saver.state to Found because we are only looking for whether + // we can guarantee the key is not there when "no_io" is set + (*mark_key_may_exist_handler)(handle_context); + break; + } + + // Call the *saver function on each entry/block until it returns false + for (block_iter->Seek(key); block_iter->Valid(); block_iter->Next()) { + if (!(*result_handler)(handle_context, block_iter->key(), + block_iter->value(), didIO)) { + done = true; + break; + } + } + s = block_iter->status(); + } + } + + filter_entry.Release(rep_->options.block_cache.get()); + if (s.ok()) { + s = iiter->status(); + } + delete iiter; + return s; +} + +bool SaveDidIO(void* arg, const Slice& key, const Slice& value, bool didIO) { + *reinterpret_cast(arg) = didIO; + return false; +} +bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, + const Slice& key) { + // We use Get() as it has logic that checks whether we read the + // block from the disk or not. + bool didIO = false; + Status s = Get(options, key, &didIO, SaveDidIO); + assert(s.ok()); + return !didIO; +} + +uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { + Iterator* index_iter = IndexBlockReader(ReadOptions()); + + index_iter->Seek(key); + uint64_t result; + if (index_iter->Valid()) { + BlockHandle handle; + Slice input = index_iter->value(); + Status s = handle.DecodeFrom(&input); + if (s.ok()) { + result = handle.offset(); + } else { + // Strange: we can't decode the block handle in the index block. + // We'll just return the offset of the metaindex block, which is + // close to the whole file size for this case. 
+ result = rep_->metaindex_handle.offset(); + } + } else { + // key is past the last key in the file. Approximate the offset + // by returning the offset of the metaindex block (which is + // right near the end of the file). + result = rep_->metaindex_handle.offset(); + } + delete index_iter; + return result; +} + +const std::string BlockBasedTable::kFilterBlockPrefix = + "filter."; +const std::string BlockBasedTable::kPropertiesBlock = + "rocksdb.properties"; +const std::string BlockBasedTablePropertiesNames::kDataSize = + "rocksdb.data.size"; +const std::string BlockBasedTablePropertiesNames::kIndexSize = + "rocksdb.index.size"; +const std::string BlockBasedTablePropertiesNames::kFilterSize = + "rocksdb.filter.size"; +const std::string BlockBasedTablePropertiesNames::kRawKeySize = + "rocksdb.raw.key.size"; +const std::string BlockBasedTablePropertiesNames::kRawValueSize = + "rocksdb.raw.value.size"; +const std::string BlockBasedTablePropertiesNames::kNumDataBlocks = + "rocksdb.num.data.blocks"; +const std::string BlockBasedTablePropertiesNames::kNumEntries = + "rocksdb.num.entries"; +const std::string BlockBasedTablePropertiesNames::kFilterPolicy = + "rocksdb.filter.policy"; + +} // namespace rocksdb diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h new file mode 100644 index 00000000..66f63fc5 --- /dev/null +++ b/table/block_based_table_reader.h @@ -0,0 +1,195 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include +#include +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/table.h" +#include "util/coding.h" + +namespace rocksdb { + +class Block; +class BlockHandle; +class Footer; +struct Options; +class RandomAccessFile; +struct ReadOptions; +class TableCache; +class TableReader; +class FilterBlockReader; + +using std::unique_ptr; + +// A Table is a sorted map from strings to strings. Tables are +// immutable and persistent. A Table may be safely accessed from +// multiple threads without external synchronization. +class BlockBasedTable : public TableReader { + public: + static const std::string kFilterBlockPrefix; + static const std::string kPropertiesBlock; + + // Attempt to open the table that is stored in bytes [0..file_size) + // of "file", and read the metadata entries necessary to allow + // retrieving data from the table. + // + // If successful, returns ok and sets "*table_reader" to the newly opened + // table. The client should delete "*table_reader" when no longer needed. + // If there was an error while initializing the table, sets "*table_reader" + // to nullptr and returns a non-ok status. + // + // *file must remain live while this Table is in use. + static Status Open(const Options& options, + const EnvOptions& soptions, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table_reader); + + bool PrefixMayMatch(const Slice& internal_prefix) override; + + // Returns a new iterator over the table contents. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). 
+ Iterator* NewIterator(const ReadOptions&) override; + + Status Get( + const ReadOptions& readOptions, + const Slice& key, + void* handle_context, + bool (*result_handler)(void* handle_context, const Slice& k, + const Slice& v, bool didIO), + void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) + override; + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + uint64_t ApproximateOffsetOf(const Slice& key) override; + + // Returns true if the block for the specified key is in cache. + // REQUIRES: key is in this table. + bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override; + + // Set up the table for Compaction. Might change some parameters with + // posix_fadvise + void SetupForCompaction() override; + + TableProperties& GetTableProperties() override; + + ~BlockBasedTable(); + + private: + template + struct CachableEntry; + + struct Rep; + Rep* rep_; + bool compaction_optimized_; + + static Iterator* BlockReader(void*, const ReadOptions&, + const EnvOptions& soptions, const Slice&, + bool for_compaction); + + static Iterator* BlockReader(void*, const ReadOptions&, const Slice&, + bool* didIO, bool for_compaction = false); + + // if `no_io == true`, we will not try to read filter from sst file + // if it is not cached yet. + CachableEntry GetFilter(bool no_io = false) const; + + Iterator* IndexBlockReader(const ReadOptions& options) const; + + // Read the block, either from sst file or from cache. This method will try + // to read from cache only when block_cache is set or ReadOption doesn't + // explicitly prohibit storage IO. 
+ // + // If the block is read from cache, the statistics for cache miss/hit of the + // given type of block will be updated. User can specify + // `block_cache_miss_ticker` and `block_cache_hit_ticker` for the statistics + // update. + // + // On success, the `result` parameter will be populated, which contains a + // pointer to the block and its cache handle, which will be nullptr if it's + // not read from the cache. + static Status GetBlock(const BlockBasedTable* table, + const BlockHandle& handle, + const ReadOptions& options, + bool for_compaction, + Tickers block_cache_miss_ticker, + Tickers block_cache_hit_ticker, + bool* didIO, + CachableEntry* result); + + // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found + // after a call to Seek(key), until handle_result returns false. + // May not make such a call if filter policy says that key is not present. + friend class TableCache; + friend class BlockBasedTableBuilder; + + void ReadMeta(const Footer& footer); + void ReadFilter(const Slice& filter_handle_value); + static Status ReadProperties(const Slice& handle_value, Rep* rep); + + // Read the meta block from sst. + static Status ReadMetaBlock( + Rep* rep, + std::unique_ptr* meta_block, + std::unique_ptr* iter); + + // Create the filter from the filter block. + static FilterBlockReader* ReadFilter( + const Slice& filter_handle_value, + Rep* rep, + size_t* filter_size = nullptr); + + // Read the table properties from properties block. 
+ static Status ReadProperties( + const Slice& handle_value, Rep* rep, TableProperties* properties); + + static void SetupCacheKeyPrefix(Rep* rep); + + explicit BlockBasedTable(Rep* rep) : + compaction_optimized_(false) { + rep_ = rep; + } + // Generate a cache key prefix from the file + static void GenerateCachePrefix(Cache* cc, + RandomAccessFile* file, char* buffer, size_t* size); + static void GenerateCachePrefix(Cache* cc, + WritableFile* file, char* buffer, size_t* size); + + // The longest prefix of the cache key used to identify blocks. + // For Posix files the unique ID is three varints. + static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1; + + // No copying allowed + explicit BlockBasedTable(const TableReader&) = delete; + void operator=(const TableReader&) = delete; +}; + +struct BlockBasedTablePropertiesNames { + static const std::string kDataSize; + static const std::string kIndexSize; + static const std::string kFilterSize; + static const std::string kRawKeySize; + static const std::string kRawValueSize; + static const std::string kNumDataBlocks; + static const std::string kNumEntries; + static const std::string kFilterPolicy; +}; + +} // namespace rocksdb diff --git a/table/block_builder.cc b/table/block_builder.cc new file mode 100644 index 00000000..91760186 --- /dev/null +++ b/table/block_builder.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+// +// BlockBuilder generates blocks where keys are prefix-compressed: +// +// When we store a key, we drop the prefix shared with the previous +// string. This helps reduce the space requirement significantly. +// Furthermore, once every K keys, we do not apply the prefix +// compression and store the entire key. We call this a "restart +// point". The tail end of the block stores the offsets of all of the +// restart points, and can be used to do a binary search when looking +// for a particular key. Values are stored as-is (without compression) +// immediately following the corresponding key. +// +// An entry for a particular key-value pair has the form: +// shared_bytes: varint32 +// unshared_bytes: varint32 +// value_length: varint32 +// key_delta: char[unshared_bytes] +// value: char[value_length] +// shared_bytes == 0 for restart points. +// +// The trailer of the block has the form: +// restarts: uint32[num_restarts] +// num_restarts: uint32 +// restarts[i] contains the offset within the block of the ith restart point. 
+ +#include "table/block_builder.h" + +#include +#include +#include "rocksdb/comparator.h" +#include "util/coding.h" + +namespace rocksdb { + +BlockBuilder::BlockBuilder(int block_restart_interval, + const Comparator* comparator) + : block_restart_interval_(block_restart_interval), + comparator_(comparator), + restarts_(), + counter_(0), + finished_(false) { + assert(block_restart_interval_ >= 1); + restarts_.push_back(0); // First restart point is at offset 0 +} + +BlockBuilder::BlockBuilder(const Options& options) + : BlockBuilder(options.block_restart_interval, options.comparator) { +} + +void BlockBuilder::Reset() { + buffer_.clear(); + restarts_.clear(); + restarts_.push_back(0); // First restart point is at offset 0 + counter_ = 0; + finished_ = false; + last_key_.clear(); +} + +size_t BlockBuilder::CurrentSizeEstimate() const { + return (buffer_.size() + // Raw data buffer + restarts_.size() * sizeof(uint32_t) + // Restart array + sizeof(uint32_t)); // Restart array length +} + +size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value) + const { + size_t estimate = CurrentSizeEstimate(); + estimate += key.size() + value.size(); + if (counter_ >= block_restart_interval_) { + estimate += sizeof(uint32_t); // a new restart entry. + } + + estimate += sizeof(int32_t); // varint for shared prefix length. + estimate += VarintLength(key.size()); // varint for key length. + estimate += VarintLength(value.size()); // varint for value length. + + return estimate; +} + +Slice BlockBuilder::Finish() { + // Append restart array + for (size_t i = 0; i < restarts_.size(); i++) { + PutFixed32(&buffer_, restarts_[i]); + } + PutFixed32(&buffer_, restarts_.size()); + finished_ = true; + return Slice(buffer_); +} + +void BlockBuilder::Add(const Slice& key, const Slice& value) { + Slice last_key_piece(last_key_); + assert(!finished_); + assert(counter_ <= block_restart_interval_); + assert(buffer_.empty() // No values yet? 
+ || comparator_->Compare(key, last_key_piece) > 0); + size_t shared = 0; + if (counter_ < block_restart_interval_) { + // See how much sharing to do with previous string + const size_t min_length = std::min(last_key_piece.size(), key.size()); + while ((shared < min_length) && (last_key_piece[shared] == key[shared])) { + shared++; + } + } else { + // Restart compression + restarts_.push_back(buffer_.size()); + counter_ = 0; + } + const size_t non_shared = key.size() - shared; + + // Add "<shared><non_shared><value_size>" to buffer_ + PutVarint32(&buffer_, shared); + PutVarint32(&buffer_, non_shared); + PutVarint32(&buffer_, value.size()); + + // Add string delta to buffer_ followed by value + buffer_.append(key.data() + shared, non_shared); + buffer_.append(value.data(), value.size()); + + // Update state + last_key_.resize(shared); + last_key_.append(key.data() + shared, non_shared); + assert(Slice(last_key_) == key); + counter_++; +} + +} // namespace rocksdb diff --git a/table/block_builder.h b/table/block_builder.h new file mode 100644 index 00000000..31faf19b --- /dev/null +++ b/table/block_builder.h @@ -0,0 +1,65 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include + +#include +#include "rocksdb/slice.h" + +namespace rocksdb { + +struct Options; +class Comparator; + +class BlockBuilder { + public: + BlockBuilder(int block_builder, const Comparator* comparator); + explicit BlockBuilder(const Options& options); + + // Reset the contents as if the BlockBuilder was just constructed. 
+ void Reset(); + + // REQUIRES: Finish() has not been called since the last call to Reset(). + // REQUIRES: key is larger than any previously added key + void Add(const Slice& key, const Slice& value); + + // Finish building the block and return a slice that refers to the + // block contents. The returned slice will remain valid for the + // lifetime of this builder or until Reset() is called. + Slice Finish(); + + // Returns an estimate of the current (uncompressed) size of the block + // we are building. + size_t CurrentSizeEstimate() const; + + // Returns an estimated block size after appending key and value. + size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const; + + // Return true iff no entries have been added since the last Reset() + bool empty() const { + return buffer_.empty(); + } + + private: + const int block_restart_interval_; + const Comparator* comparator_; + + std::string buffer_; // Destination buffer + std::vector restarts_; // Restart points + int counter_; // Number of entries emitted since restart + bool finished_; // Has Finish() been called? + std::string last_key_; + + // No copying allowed + BlockBuilder(const BlockBuilder&); + void operator=(const BlockBuilder&); +}; + +} // namespace rocksdb diff --git a/table/block_test.cc b/table/block_test.cc new file mode 100644 index 00000000..7f33e3a9 --- /dev/null +++ b/table/block_test.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/table.h" +#include "table/block.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "util/random.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +class BlockTest {}; + +// block test +TEST(BlockTest, SimpleTest) { + Random rnd(301); + Options options = Options(); + std::vector keys; + std::vector values; + BlockBuilder builder(options); + int num_records = 100000; + char buf[10]; + char* p = &buf[0]; + + // add a bunch of records to a block + for (int i = 0; i < num_records; i++) { + // generate random kvs + sprintf(p, "%6d", i); + std::string k(p); + std::string v = RandomString(&rnd, 100); // 100 byte values + + // write kvs to the block + Slice key(k); + Slice value(v); + builder.Add(key, value); + + // remember kvs in a lookaside array + keys.push_back(k); + values.push_back(v); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + contents.heap_allocated = false; + Block reader(contents); + + // read contents of block sequentially + int count = 0; + Iterator* iter = reader.NewIterator(options.comparator); + for (iter->SeekToFirst();iter->Valid(); count++, iter->Next()) { + + // read kv from block + Slice k = iter->key(); + Slice v = iter->value(); + + // compare with lookaside array + ASSERT_EQ(k.ToString().compare(keys[count]), 0); + ASSERT_EQ(v.ToString().compare(values[count]), 0); + } + delete iter; + + // read block contents randomly + iter = reader.NewIterator(options.comparator); + for (int i = 0; i < num_records; i++) { + + 
// find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(keys[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + Slice v = iter->value(); + ASSERT_EQ(v.ToString().compare(values[index]), 0); + } + delete iter; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/table/filter_block.cc b/table/filter_block.cc new file mode 100644 index 00000000..82b6c6ee --- /dev/null +++ b/table/filter_block.cc @@ -0,0 +1,187 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/filter_block.h" + +#include "db/dbformat.h" +#include "rocksdb/filter_policy.h" +#include "util/coding.h" + +namespace rocksdb { + +// See doc/table_format.txt for an explanation of the filter block format. 
+ +// Generate new filter every 2KB of data +static const size_t kFilterBaseLg = 11; +static const size_t kFilterBase = 1 << kFilterBaseLg; + +FilterBlockBuilder::FilterBlockBuilder(const Options& opt) + : policy_(opt.filter_policy), + prefix_extractor_(opt.prefix_extractor), + whole_key_filtering_(opt.whole_key_filtering), + comparator_(opt.comparator){} + +void FilterBlockBuilder::StartBlock(uint64_t block_offset) { + uint64_t filter_index = (block_offset / kFilterBase); + assert(filter_index >= filter_offsets_.size()); + while (filter_index > filter_offsets_.size()) { + GenerateFilter(); + } +} + +bool FilterBlockBuilder::SamePrefix(const Slice &key1, + const Slice &key2) const { + if (!prefix_extractor_->InDomain(key1) && + !prefix_extractor_->InDomain(key2)) { + return true; + } else if (!prefix_extractor_->InDomain(key1) || + !prefix_extractor_->InDomain(key2)) { + return false; + } else { + return (prefix_extractor_->Transform(key1) == + prefix_extractor_->Transform(key2)); + } +} + +void FilterBlockBuilder::AddKey(const Slice& key) { + // get slice for most recently added entry + Slice prev; + size_t added_to_start = 0; + + // add key to filter if needed + if (whole_key_filtering_) { + start_.push_back(entries_.size()); + ++added_to_start; + entries_.append(key.data(), key.size()); + } + + if (start_.size() > added_to_start) { + size_t prev_start = start_[start_.size() - 1 - added_to_start]; + const char* base = entries_.data() + prev_start; + size_t length = entries_.size() - prev_start; + prev = Slice(base, length); + } + + // add prefix to filter if needed + if (prefix_extractor_ && prefix_extractor_->InDomain(ExtractUserKey(key))) { + // If prefix_extractor_, this filter_block layer assumes we only + // operate on internal keys. 
+ Slice user_key = ExtractUserKey(key); + // this assumes prefix(prefix(key)) == prefix(key), as the last + // entry in entries_ may be either a key or prefix, and we use + // prefix(last entry) to get the prefix of the last key. + if (prev.size() == 0 || + !SamePrefix(user_key, ExtractUserKey(prev))) { + Slice prefix = prefix_extractor_->Transform(user_key); + InternalKey internal_prefix_tmp(prefix, 0, kTypeValue); + Slice internal_prefix = internal_prefix_tmp.Encode(); + assert(comparator_->Compare(internal_prefix, key) <= 0); + start_.push_back(entries_.size()); + entries_.append(internal_prefix.data(), internal_prefix.size()); + } + } +} + +Slice FilterBlockBuilder::Finish() { + if (!start_.empty()) { + GenerateFilter(); + } + + // Append array of per-filter offsets + const uint32_t array_offset = result_.size(); + for (size_t i = 0; i < filter_offsets_.size(); i++) { + PutFixed32(&result_, filter_offsets_[i]); + } + + PutFixed32(&result_, array_offset); + result_.push_back(kFilterBaseLg); // Save encoding parameter in result + return Slice(result_); +} + +void FilterBlockBuilder::GenerateFilter() { + const size_t num_entries = start_.size(); + if (num_entries == 0) { + // Fast path if there are no keys for this filter + filter_offsets_.push_back(result_.size()); + return; + } + + // Make list of keys from flattened key structure + start_.push_back(entries_.size()); // Simplify length computation + tmp_entries_.resize(num_entries); + for (size_t i = 0; i < num_entries; i++) { + const char* base = entries_.data() + start_[i]; + size_t length = start_[i+1] - start_[i]; + tmp_entries_[i] = Slice(base, length); + } + + // Generate filter for current set of keys and append to result_. 
+ filter_offsets_.push_back(result_.size()); + policy_->CreateFilter(&tmp_entries_[0], num_entries, &result_); + + tmp_entries_.clear(); + entries_.clear(); + start_.clear(); +} + +FilterBlockReader::FilterBlockReader( + const Options& opt, const Slice& contents, bool delete_contents_after_use) + : policy_(opt.filter_policy), + prefix_extractor_(opt.prefix_extractor), + whole_key_filtering_(opt.whole_key_filtering), + data_(nullptr), + offset_(nullptr), + num_(0), + base_lg_(0) { + size_t n = contents.size(); + if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array + base_lg_ = contents[n-1]; + uint32_t last_word = DecodeFixed32(contents.data() + n - 5); + if (last_word > n - 5) return; + data_ = contents.data(); + offset_ = data_ + last_word; + num_ = (n - 5 - last_word) / 4; + if (delete_contents_after_use) { + filter_data.reset(contents.data()); + } +} + +bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, + const Slice& key) { + if (!whole_key_filtering_) { + return true; + } + return MayMatch(block_offset, key); +} + +bool FilterBlockReader::PrefixMayMatch(uint64_t block_offset, + const Slice& prefix) { + if (!prefix_extractor_) { + return true; + } + return MayMatch(block_offset, prefix); +} + +bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) { + uint64_t index = block_offset >> base_lg_; + if (index < num_) { + uint32_t start = DecodeFixed32(offset_ + index*4); + uint32_t limit = DecodeFixed32(offset_ + index*4 + 4); + if (start <= limit && limit <= (offset_ - data_)) { + Slice filter = Slice(data_ + start, limit - start); + return policy_->KeyMayMatch(entry, filter); + } else if (start == limit) { + // Empty filters do not match any entries + return false; + } + } + return true; // Errors are treated as potential matches +} + +} diff --git a/table/filter_block.h b/table/filter_block.h new file mode 100644 index 00000000..e47f9465 --- /dev/null +++ b/table/filter_block.h @@ -0,0 +1,88 @@ +// Copyright (c) 
2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A filter block is stored near the end of a Table file. It contains +// filters (e.g., bloom filters) for all data blocks in the table combined +// into a single filter block. + +#pragma once + +#include +#include +#include +#include +#include +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "util/hash.h" + +namespace rocksdb { + +class FilterPolicy; + +// A FilterBlockBuilder is used to construct all of the filters for a +// particular Table. It generates a single string which is stored as +// a special block in the Table. 
+// +// The sequence of calls to FilterBlockBuilder must match the regexp: +// (StartBlock AddKey*)* Finish +class FilterBlockBuilder { + public: + explicit FilterBlockBuilder(const Options& opt); + + void StartBlock(uint64_t block_offset); + void AddKey(const Slice& key); + Slice Finish(); + + private: + bool SamePrefix(const Slice &key1, const Slice &key2) const; + void GenerateFilter(); + + const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + const Comparator* comparator_; + + std::string entries_; // Flattened entry contents + std::vector start_; // Starting index in entries_ of each entry + std::string result_; // Filter data computed so far + std::vector tmp_entries_; // policy_->CreateFilter() argument + std::vector filter_offsets_; + + // No copying allowed + FilterBlockBuilder(const FilterBlockBuilder&); + void operator=(const FilterBlockBuilder&); +}; + +class FilterBlockReader { + public: + // REQUIRES: "contents" and *policy must stay live while *this is live. + FilterBlockReader( + const Options& opt, + const Slice& contents, + bool delete_contents_after_use = false); + bool KeyMayMatch(uint64_t block_offset, const Slice& key); + bool PrefixMayMatch(uint64_t block_offset, const Slice& prefix); + + private: + const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + const char* data_; // Pointer to filter data (at block-start) + const char* offset_; // Pointer to beginning of offset array (at block-end) + size_t num_; // Number of entries in offset array + size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) + std::unique_ptr filter_data; + + + bool MayMatch(uint64_t block_offset, const Slice& entry); +}; + +} diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc new file mode 100644 index 00000000..bc1a0d0a --- /dev/null +++ b/table/filter_block_test.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/filter_block.h" + +#include "rocksdb/filter_policy.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +// For testing: emit an array with one hash value per key +class TestHashFilter : public FilterPolicy { + public: + virtual const char* Name() const { + return "TestHashFilter"; + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } +}; + +class FilterBlockTest { + public: + TestHashFilter policy_; + Options options_; + + FilterBlockTest() { + options_ = Options(); + options_.filter_policy = &policy_; + } +}; + +TEST(FilterBlockTest, EmptyBuilder) { + FilterBlockBuilder builder(options_); + Slice block = builder.Finish(); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); + FilterBlockReader reader(options_, block); + ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(100000, "foo")); +} + +TEST(FilterBlockTest, SingleChunk) { + FilterBlockBuilder builder(options_); + builder.StartBlock(100); + builder.AddKey("foo"); 
+ builder.AddKey("bar"); + builder.AddKey("box"); + builder.StartBlock(200); + builder.AddKey("box"); + builder.StartBlock(300); + builder.AddKey("hello"); + Slice block = builder.Finish(); + FilterBlockReader reader(options_, block); + ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(100, "bar")); + ASSERT_TRUE(reader.KeyMayMatch(100, "box")); + ASSERT_TRUE(reader.KeyMayMatch(100, "hello")); + ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); + ASSERT_TRUE(! reader.KeyMayMatch(100, "missing")); + ASSERT_TRUE(! reader.KeyMayMatch(100, "other")); +} + +TEST(FilterBlockTest, MultiChunk) { + FilterBlockBuilder builder(options_); + + // First filter + builder.StartBlock(0); + builder.AddKey("foo"); + builder.StartBlock(2000); + builder.AddKey("bar"); + + // Second filter + builder.StartBlock(3100); + builder.AddKey("box"); + + // Third filter is empty + + // Last filter + builder.StartBlock(9000); + builder.AddKey("box"); + builder.AddKey("hello"); + + Slice block = builder.Finish(); + FilterBlockReader reader(options_, block); + + // Check first filter + ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(2000, "bar")); + ASSERT_TRUE(! reader.KeyMayMatch(0, "box")); + ASSERT_TRUE(! reader.KeyMayMatch(0, "hello")); + + // Check second filter + ASSERT_TRUE(reader.KeyMayMatch(3100, "box")); + ASSERT_TRUE(! reader.KeyMayMatch(3100, "foo")); + ASSERT_TRUE(! reader.KeyMayMatch(3100, "bar")); + ASSERT_TRUE(! reader.KeyMayMatch(3100, "hello")); + + // Check third filter (empty) + ASSERT_TRUE(! reader.KeyMayMatch(4100, "foo")); + ASSERT_TRUE(! reader.KeyMayMatch(4100, "bar")); + ASSERT_TRUE(! reader.KeyMayMatch(4100, "box")); + ASSERT_TRUE(! reader.KeyMayMatch(4100, "hello")); + + // Check last filter + ASSERT_TRUE(reader.KeyMayMatch(9000, "box")); + ASSERT_TRUE(reader.KeyMayMatch(9000, "hello")); + ASSERT_TRUE(! reader.KeyMayMatch(9000, "foo")); + ASSERT_TRUE(! 
reader.KeyMayMatch(9000, "bar")); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/table/flush_block_policy.cc b/table/flush_block_policy.cc new file mode 100644 index 00000000..a953a78a --- /dev/null +++ b/table/flush_block_policy.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/slice.h" +#include "table/block_builder.h" + +#include + +namespace rocksdb { + +// Flush block by size +class FlushBlockBySizePolicy : public FlushBlockPolicy { + public: + // @params block_size: Approximate size of user data packed per + // block. + // @params block_size_deviation: This is used to close a block before it + // reaches the configured block size. + FlushBlockBySizePolicy(const uint64_t block_size, + const uint64_t block_size_deviation, + const BlockBuilder& data_block_builder) : + block_size_(block_size), + block_size_deviation_(block_size_deviation), + data_block_builder_(data_block_builder) { + } + + virtual bool Update(const Slice& key, + const Slice& value) override { + // it makes no sense to flush when the data block is empty + if (data_block_builder_.empty()) { + return false; + } + + auto curr_size = data_block_builder_.CurrentSizeEstimate(); + + // Do flush if one of the below two conditions is true: + // 1) if the current estimated size already exceeds the block size, + // 2) block_size_deviation is set and the estimated size after appending + // the kv will exceed the block size and the current size is under the + // deviation. 
+ return curr_size >= block_size_ || BlockAlmostFull(key, value); + } + + private: + bool BlockAlmostFull(const Slice& key, const Slice& value) const { + const auto curr_size = data_block_builder_.CurrentSizeEstimate(); + const auto estimated_size_after = + data_block_builder_.EstimateSizeAfterKV(key, value); + + return + estimated_size_after > block_size_ && + block_size_deviation_ > 0 && + curr_size * 100 > block_size_ * (100 - block_size_deviation_); + } + + const uint64_t block_size_; + const uint64_t block_size_deviation_; + const BlockBuilder& data_block_builder_; +}; + +FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( + const BlockBuilder& data_block_builder) const { + return new FlushBlockBySizePolicy(block_size_, + block_size_deviation_, + data_block_builder); +} + +} // namespace rocksdb diff --git a/table/format.cc b/table/format.cc new file mode 100644 index 00000000..ff6d8fa2 --- /dev/null +++ b/table/format.cc @@ -0,0 +1,203 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/format.h" + +#include "port/port.h" +#include "rocksdb/env.h" +#include "table/block.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/perf_context_imp.h" + +namespace rocksdb { + +void BlockHandle::EncodeTo(std::string* dst) const { + // Sanity check that all fields have been set + assert(offset_ != ~static_cast(0)); + assert(size_ != ~static_cast(0)); + PutVarint64(dst, offset_); + PutVarint64(dst, size_); +} + +Status BlockHandle::DecodeFrom(Slice* input) { + if (GetVarint64(input, &offset_) && + GetVarint64(input, &size_)) { + return Status::OK(); + } else { + return Status::Corruption("bad block handle"); + } +} + +void Footer::EncodeTo(std::string* dst) const { +#ifndef NDEBUG + const size_t original_size = dst->size(); +#endif + metaindex_handle_.EncodeTo(dst); + index_handle_.EncodeTo(dst); + dst->resize(2 * BlockHandle::kMaxEncodedLength); // Padding + PutFixed32(dst, static_cast(kTableMagicNumber & 0xffffffffu)); + PutFixed32(dst, static_cast(kTableMagicNumber >> 32)); + assert(dst->size() == original_size + kEncodedLength); +} + +Status Footer::DecodeFrom(Slice* input) { + assert(input != nullptr); + assert(input->size() >= kEncodedLength); + + const char* magic_ptr = input->data() + kEncodedLength - 8; + const uint32_t magic_lo = DecodeFixed32(magic_ptr); + const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4); + const uint64_t magic = ((static_cast(magic_hi) << 32) | + (static_cast(magic_lo))); + if (magic != kTableMagicNumber) { + return Status::InvalidArgument("not an sstable (bad magic number)"); + } + + Status result = metaindex_handle_.DecodeFrom(input); + if (result.ok()) { + result = index_handle_.DecodeFrom(input); + } + if (result.ok()) { + // We skip over any leftover data (just padding for now) in "input" + const char* end = magic_ptr + 8; + *input = Slice(end, input->data() + input->size() - end); + } + return result; +} + +Status ReadBlockContents(RandomAccessFile* file, + const ReadOptions& 
options, + const BlockHandle& handle, + BlockContents* result, + Env* env, + bool do_uncompress) { + result->data = Slice(); + result->cachable = false; + result->heap_allocated = false; + + // Read the block contents as well as the type/crc footer. + // See table_builder.cc for the code that built this structure. + size_t n = static_cast(handle.size()); + char* buf = new char[n + kBlockTrailerSize]; + Slice contents; + + StopWatchNano timer(env); + StartPerfTimer(&timer); + Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf); + BumpPerfCount(&perf_context.block_read_count); + BumpPerfCount(&perf_context.block_read_byte, n + kBlockTrailerSize); + BumpPerfTime(&perf_context.block_read_time, &timer); + + if (!s.ok()) { + delete[] buf; + return s; + } + if (contents.size() != n + kBlockTrailerSize) { + delete[] buf; + return Status::Corruption("truncated block read"); + } + + // Check the crc of the type and the block contents + const char* data = contents.data(); // Pointer to where Read put the data + if (options.verify_checksums) { + const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1)); + const uint32_t actual = crc32c::Value(data, n + 1); + if (actual != crc) { + delete[] buf; + s = Status::Corruption("block checksum mismatch"); + return s; + } + BumpPerfTime(&perf_context.block_checksum_time, &timer); + } + + // If the caller has requested that the block not be uncompressed + if (!do_uncompress || data[n] == kNoCompression) { + if (data != buf) { + // File implementation gave us pointer to some other data. + // Use it directly under the assumption that it will be live + // while the file is open. 
+ delete[] buf; + result->data = Slice(data, n); + result->heap_allocated = false; + result->cachable = false; // Do not double-cache + } else { + result->data = Slice(buf, n); + result->heap_allocated = true; + result->cachable = true; + } + result->compression_type = (rocksdb::CompressionType)data[n]; + s = Status::OK(); + } else { + s = UncompressBlockContents(data, n, result); + delete[] buf; + } + BumpPerfTime(&perf_context.block_decompress_time, &timer); + return s; +} + +// +// The 'data' points to the raw block contents that was read in from file. +// This method allocates a new heap buffer and the raw block +// contents are uncompresed into this buffer. This +// buffer is returned via 'result' and it is upto the caller to +// free this buffer. +Status UncompressBlockContents(const char* data, size_t n, + BlockContents* result) { + char* ubuf = nullptr; + int decompress_size = 0; + assert(data[n] != kNoCompression); + switch (data[n]) { + case kSnappyCompression: { + size_t ulength = 0; + static char snappy_corrupt_msg[] = + "Snappy not supported or corrupted Snappy compressed block contents"; + if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) { + return Status::Corruption(snappy_corrupt_msg); + } + ubuf = new char[ulength]; + if (!port::Snappy_Uncompress(data, n, ubuf)) { + delete[] ubuf; + return Status::Corruption(snappy_corrupt_msg); + } + result->data = Slice(ubuf, ulength); + result->heap_allocated = true; + result->cachable = true; + break; + } + case kZlibCompression: + ubuf = port::Zlib_Uncompress(data, n, &decompress_size); + static char zlib_corrupt_msg[] = + "Zlib not supported or corrupted Zlib compressed block contents"; + if (!ubuf) { + return Status::Corruption(zlib_corrupt_msg); + } + result->data = Slice(ubuf, decompress_size); + result->heap_allocated = true; + result->cachable = true; + break; + case kBZip2Compression: + ubuf = port::BZip2_Uncompress(data, n, &decompress_size); + static char bzip2_corrupt_msg[] = + "Bzip2 not 
supported or corrupted Bzip2 compressed block contents"; + if (!ubuf) { + return Status::Corruption(bzip2_corrupt_msg); + } + result->data = Slice(ubuf, decompress_size); + result->heap_allocated = true; + result->cachable = true; + break; + default: + return Status::Corruption("bad block type"); + } + result->compression_type = kNoCompression; // not compressed any more + return Status::OK(); +} + +} // namespace rocksdb diff --git a/table/format.h b/table/format.h new file mode 100644 index 00000000..2f1c1e8d --- /dev/null +++ b/table/format.h @@ -0,0 +1,122 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +class Block; +class RandomAccessFile; +struct ReadOptions; + +// BlockHandle is a pointer to the extent of a file that stores a data +// block or a meta block. +class BlockHandle { + public: + BlockHandle(); + + // The offset of the block in the file. 
+ uint64_t offset() const { return offset_; } + void set_offset(uint64_t offset) { offset_ = offset; } + + // The size of the stored block + uint64_t size() const { return size_; } + void set_size(uint64_t size) { size_ = size; } + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* input); + + // Maximum encoding length of a BlockHandle + enum { kMaxEncodedLength = 10 + 10 }; + + private: + uint64_t offset_; + uint64_t size_; +}; + +// Footer encapsulates the fixed information stored at the tail +// end of every table file. +class Footer { + public: + Footer() { } + + // The block handle for the metaindex block of the table + const BlockHandle& metaindex_handle() const { return metaindex_handle_; } + void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; } + + // The block handle for the index block of the table + const BlockHandle& index_handle() const { + return index_handle_; + } + void set_index_handle(const BlockHandle& h) { + index_handle_ = h; + } + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* input); + + // Encoded length of a Footer. Note that the serialization of a + // Footer will always occupy exactly this many bytes. It consists + // of two block handles and a magic number. + enum { + kEncodedLength = 2*BlockHandle::kMaxEncodedLength + 8 + }; + + private: + BlockHandle metaindex_handle_; + BlockHandle index_handle_; +}; + +// kTableMagicNumber was picked by running +// echo http://code.google.com/p/leveldb/ | sha1sum +// and taking the leading 64 bits. +static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull; + +// 1-byte type + 32-bit crc +static const size_t kBlockTrailerSize = 5; + +struct BlockContents { + Slice data; // Actual contents of data + bool cachable; // True iff data can be cached + bool heap_allocated; // True iff caller should delete[] data.data() + CompressionType compression_type; +}; + +// Read the block identified by "handle" from "file". 
On failure +// return non-OK. On success fill *result and return OK. +extern Status ReadBlockContents(RandomAccessFile* file, + const ReadOptions& options, + const BlockHandle& handle, + BlockContents* result, + Env* env, + bool do_uncompress); + +// The 'data' points to the raw block contents read in from file. +// This method allocates a new heap buffer and the raw block +// contents are uncompresed into this buffer. This buffer is +// returned via 'result' and it is upto the caller to +// free this buffer. +extern Status UncompressBlockContents(const char* data, + size_t n, + BlockContents* result); + +// Implementation details follow. Clients should ignore, + +inline BlockHandle::BlockHandle() + : offset_(~static_cast(0)), + size_(~static_cast(0)) { +} + +} // namespace rocksdb diff --git a/table/iter_heap.h b/table/iter_heap.h new file mode 100644 index 00000000..af8834e3 --- /dev/null +++ b/table/iter_heap.h @@ -0,0 +1,64 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#pragma once +#include + +#include "rocksdb/comparator.h" +#include "table/iterator_wrapper.h" + +namespace rocksdb { + +// Return the max of two keys. +class MaxIteratorComparator { + public: + MaxIteratorComparator(const Comparator* comparator) : + comparator_(comparator) {} + + bool operator()(IteratorWrapper* a, IteratorWrapper* b) { + return comparator_->Compare(a->key(), b->key()) <= 0; + } + private: + const Comparator* comparator_; +}; + +// Return the max of two keys. +class MinIteratorComparator { + public: + // if maxHeap is set comparator returns the max value. + // else returns the min Value. + // Can use to create a minHeap or a maxHeap. 
+ MinIteratorComparator(const Comparator* comparator) : + comparator_(comparator) {} + + bool operator()(IteratorWrapper* a, IteratorWrapper* b) { + return comparator_->Compare(a->key(), b->key()) > 0; + } + private: + const Comparator* comparator_; +}; + +typedef std::priority_queue< + IteratorWrapper*, + std::vector, + MaxIteratorComparator> MaxIterHeap; + +typedef std::priority_queue< + IteratorWrapper*, + std::vector, + MinIteratorComparator> MinIterHeap; + +// Return's a new MaxHeap of IteratorWrapper's using the provided Comparator. +MaxIterHeap NewMaxIterHeap(const Comparator* comparator) { + return MaxIterHeap(MaxIteratorComparator(comparator)); +} + +// Return's a new MinHeap of IteratorWrapper's using the provided Comparator. +MinIterHeap NewMinIterHeap(const Comparator* comparator) { + return MinIterHeap(MinIteratorComparator(comparator)); +} + +} // namespace rocksdb diff --git a/table/iterator.cc b/table/iterator.cc new file mode 100644 index 00000000..a3d4f638 --- /dev/null +++ b/table/iterator.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/iterator.h" + +namespace rocksdb { + +Iterator::Iterator() { + cleanup_.function = nullptr; + cleanup_.next = nullptr; +} + +Iterator::~Iterator() { + if (cleanup_.function != nullptr) { + (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2); + for (Cleanup* c = cleanup_.next; c != nullptr; ) { + (*c->function)(c->arg1, c->arg2); + Cleanup* next = c->next; + delete c; + c = next; + } + } +} + +void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) { + assert(func != nullptr); + Cleanup* c; + if (cleanup_.function == nullptr) { + c = &cleanup_; + } else { + c = new Cleanup; + c->next = cleanup_.next; + cleanup_.next = c; + } + c->function = func; + c->arg1 = arg1; + c->arg2 = arg2; +} + +namespace { +class EmptyIterator : public Iterator { + public: + explicit EmptyIterator(const Status& s) : status_(s) { } + virtual bool Valid() const { return false; } + virtual void Seek(const Slice& target) { } + virtual void SeekToFirst() { } + virtual void SeekToLast() { } + virtual void Next() { assert(false); } + virtual void Prev() { assert(false); } + Slice key() const { assert(false); return Slice(); } + Slice value() const { assert(false); return Slice(); } + virtual Status status() const { return status_; } + private: + Status status_; +}; +} // namespace + +Iterator* NewEmptyIterator() { + return new EmptyIterator(Status::OK()); +} + +Iterator* NewErrorIterator(const Status& status) { + return new EmptyIterator(status); +} + +} // namespace rocksdb diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h new file mode 100644 index 00000000..cb8520be --- /dev/null +++ b/table/iterator_wrapper.h @@ -0,0 +1,64 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +namespace rocksdb { + +// A internal wrapper class with an interface similar to Iterator that +// caches the valid() and key() results for an underlying iterator. +// This can help avoid virtual function calls and also gives better +// cache locality. +class IteratorWrapper { + public: + IteratorWrapper(): iter_(nullptr), valid_(false) { } + explicit IteratorWrapper(Iterator* iter): iter_(nullptr) { + Set(iter); + } + ~IteratorWrapper() { delete iter_; } + Iterator* iter() const { return iter_; } + + // Takes ownership of "iter" and will delete it when destroyed, or + // when Set() is invoked again. + void Set(Iterator* iter) { + delete iter_; + iter_ = iter; + if (iter_ == nullptr) { + valid_ = false; + } else { + Update(); + } + } + + + // Iterator interface methods + bool Valid() const { return valid_; } + Slice key() const { assert(Valid()); return key_; } + Slice value() const { assert(Valid()); return iter_->value(); } + // Methods below require iter() != nullptr + Status status() const { assert(iter_); return iter_->status(); } + void Next() { assert(iter_); iter_->Next(); Update(); } + void Prev() { assert(iter_); iter_->Prev(); Update(); } + void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); } + void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } + void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } + + private: + void Update() { + valid_ = iter_->Valid(); + if (valid_) { + key_ = iter_->key(); + } + } + + Iterator* iter_; + bool valid_; + Slice key_; +}; + +} // namespace rocksdb diff --git a/table/merger.cc b/table/merger.cc new file mode 100644 index 00000000..f5ce7440 --- /dev/null +++ b/table/merger.cc @@ -0,0 +1,228 @@ +// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/merger.h" + +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" +#include "table/iter_heap.h" +#include "table/iterator_wrapper.h" + +#include + +namespace rocksdb { + +namespace { + +class MergingIterator : public Iterator { + public: + MergingIterator(const Comparator* comparator, Iterator** children, int n) + : comparator_(comparator), + children_(n), + current_(nullptr), + direction_(kForward), + maxHeap_(NewMaxIterHeap(comparator_)), + minHeap_ (NewMinIterHeap(comparator_)) { + for (int i = 0; i < n; i++) { + children_[i].Set(children[i]); + } + for (auto& child : children_) { + if (child.Valid()) { + minHeap_.push(&child); + } + } + } + + virtual ~MergingIterator() { } + + virtual bool Valid() const { + return (current_ != nullptr); + } + + virtual void SeekToFirst() { + ClearHeaps(); + for (auto& child : children_) { + child.SeekToFirst(); + if (child.Valid()) { + minHeap_.push(&child); + } + } + FindSmallest(); + direction_ = kForward; + } + + virtual void SeekToLast() { + ClearHeaps(); + for (auto& child : children_) { + child.SeekToLast(); + if (child.Valid()) { + maxHeap_.push(&child); + } + } + FindLargest(); + direction_ = kReverse; + } + + virtual void Seek(const Slice& target) { + ClearHeaps(); + for (auto& child : children_) { + child.Seek(target); + if (child.Valid()) { + minHeap_.push(&child); + } + } + FindSmallest(); + direction_ = kForward; + } + + virtual void Next() { + assert(Valid()); + + // Ensure that all children are positioned after key(). 
+ // If we are moving in the forward direction, it is already + // true for all of the non-current_ children since current_ is + // the smallest child and key() == current_->key(). Otherwise, + // we explicitly position the non-current_ children. + if (direction_ != kForward) { + ClearHeaps(); + for (auto& child : children_) { + if (&child != current_) { + child.Seek(key()); + if (child.Valid() && + comparator_->Compare(key(), child.key()) == 0) { + child.Next(); + } + if (child.Valid()) { + minHeap_.push(&child); + } + } + } + direction_ = kForward; + } + + // as the current points to the current record. move the iterator forward. + // and if it is valid add it to the heap. + current_->Next(); + if (current_->Valid()){ + minHeap_.push(current_); + } + FindSmallest(); + } + + virtual void Prev() { + assert(Valid()); + // Ensure that all children are positioned before key(). + // If we are moving in the reverse direction, it is already + // true for all of the non-current_ children since current_ is + // the largest child and key() == current_->key(). Otherwise, + // we explicitly position the non-current_ children. + if (direction_ != kReverse) { + ClearHeaps(); + for (auto& child : children_) { + if (&child != current_) { + child.Seek(key()); + if (child.Valid()) { + // Child is at first entry >= key(). Step back one to be < key() + child.Prev(); + } else { + // Child has no entries >= key(). Position at last entry. 
+ child.SeekToLast(); + } + if (child.Valid()) { + maxHeap_.push(&child); + } + } + } + direction_ = kReverse; + } + + current_->Prev(); + if (current_->Valid()) { + maxHeap_.push(current_); + } + FindLargest(); + } + + virtual Slice key() const { + assert(Valid()); + return current_->key(); + } + + virtual Slice value() const { + assert(Valid()); + return current_->value(); + } + + virtual Status status() const { + Status status; + for (auto& child : children_) { + status = child.status(); + if (!status.ok()) { + break; + } + } + return status; + } + + private: + void FindSmallest(); + void FindLargest(); + void ClearHeaps(); + + const Comparator* comparator_; + std::vector children_; + IteratorWrapper* current_; + // Which direction is the iterator moving? + enum Direction { + kForward, + kReverse + }; + Direction direction_; + MaxIterHeap maxHeap_; + MinIterHeap minHeap_; +}; + +void MergingIterator::FindSmallest() { + if (minHeap_.empty()) { + current_ = nullptr; + } else { + current_ = minHeap_.top(); + assert(current_->Valid()); + minHeap_.pop(); + } +} + +void MergingIterator::FindLargest() { + if (maxHeap_.empty()) { + current_ = nullptr; + } else { + current_ = maxHeap_.top(); + assert(current_->Valid()); + maxHeap_.pop(); + } +} + +void MergingIterator::ClearHeaps() { + maxHeap_ = NewMaxIterHeap(comparator_); + minHeap_ = NewMinIterHeap(comparator_); +} +} // namespace + +Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) { + assert(n >= 0); + if (n == 0) { + return NewEmptyIterator(); + } else if (n == 1) { + return list[0]; + } else { + return new MergingIterator(cmp, list, n); + } +} + +} // namespace rocksdb diff --git a/table/merger.h b/table/merger.h new file mode 100644 index 00000000..dbc1f69e --- /dev/null +++ b/table/merger.h @@ -0,0 +1,28 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +namespace rocksdb { + +class Comparator; +class Iterator; + +// Return an iterator that provided the union of the data in +// children[0,n-1]. Takes ownership of the child iterators and +// will delete them when the result iterator is deleted. +// +// The result does no duplicate suppression. I.e., if a particular +// key is present in K child iterators, it will be yielded K times. +// +// REQUIRES: n >= 0 +extern Iterator* NewMergingIterator( + const Comparator* comparator, Iterator** children, int n); + +} // namespace rocksdb diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc new file mode 100644 index 00000000..e7b6b0b7 --- /dev/null +++ b/table/table_reader_bench.cc @@ -0,0 +1,244 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include + +#include "rocksdb/db.h" +#include "rocksdb/table.h" +#include "rocksdb/slice_transform.h" +#include "db/db_impl.h" +#include "db/dbformat.h" +#include "port/atomic_pointer.h" +#include "table/block_based_table_factory.h" +#include "util/histogram.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { +// Make a key that i determines the first 4 characters and j determines the +// last 4 characters. 
+static std::string MakeKey(int i, int j, bool through_db) { + char buf[100]; + snprintf(buf, sizeof(buf), "%04d__key___%04d", i, j); + if (through_db) { + return std::string(buf); + } + // If we directly query table, which operates on internal keys + // instead of user keys, we need to add 8 bytes of internal + // information (row type etc) to user key to make an internal + // key. + InternalKey key(std::string(buf), 0, ValueType::kTypeValue); + return key.Encode().ToString(); +} + +static bool DummySaveValue(void* arg, const Slice& ikey, const Slice& v, + bool didIO) { + return false; +} + +// A very simple benchmark that. +// Create a table with roughly numKey1 * numKey2 keys, +// where there are numKey1 prefixes of the key, each has numKey2 number of +// distinguished key, differing in the suffix part. +// If if_query_empty_keys = false, query the existing keys numKey1 * numKey2 +// times randomly. +// If if_query_empty_keys = true, query numKey1 * numKey2 random empty keys. +// Print out the total time. +// If through_db=true, a full DB will be created and queries will be against +// it. Otherwise, operations will be directly through table level. +// +// If for_terator=true, instead of just query one key each time, it queries +// a range sharing the same prefix. 
+void TableReaderBenchmark(Options& opts, EnvOptions& env_options, + ReadOptions& read_options, int num_keys1, + int num_keys2, int num_iter, int prefix_len, + bool if_query_empty_keys, bool for_iterator, + bool through_db) { + Slice prefix = Slice(); + + std::string file_name = test::TmpDir() + + "/rocksdb_table_reader_benchmark"; + std::string dbname = test::TmpDir() + "/rocksdb_table_reader_bench_db"; + ReadOptions ro; + WriteOptions wo; + unique_ptr file; + Env* env = Env::Default(); + TableBuilder* tb = nullptr; + DB* db = nullptr; + Status s; + if (!through_db) { + env->NewWritableFile(file_name, &file, env_options); + tb = opts.table_factory->GetTableBuilder(opts, file.get(), + CompressionType::kNoCompression); + } else { + s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + } + // Populate slightly more than 1M keys + for (int i = 0; i < num_keys1; i++) { + for (int j = 0; j < num_keys2; j++) { + std::string key = MakeKey(i * 2, j, through_db); + if (!through_db) { + tb->Add(key, key); + } else { + db->Put(wo, key, key); + } + } + } + if (!through_db) { + tb->Finish(); + file->Close(); + } else { + db->Flush(FlushOptions()); + } + + unique_ptr table_reader; + unique_ptr raf; + if (!through_db) { + Status s = env->NewRandomAccessFile(file_name, &raf, env_options); + uint64_t file_size; + env->GetFileSize(file_name, &file_size); + s = opts.table_factory->GetTableReader(opts, env_options, std::move(raf), + file_size, &table_reader); + } + + Random rnd(301); + std::string result; + HistogramImpl hist; + + void* arg = nullptr; + for (int it = 0; it < num_iter; it++) { + for (int i = 0; i < num_keys1; i++) { + for (int j = 0; j < num_keys2; j++) { + int r1 = rnd.Uniform(num_keys1) * 2; + int r2 = rnd.Uniform(num_keys2); + if (if_query_empty_keys) { + r1++; + r2 = num_keys2 * 2 - r2; + } + + if (!for_iterator) { + // Query one existing key; + std::string key = MakeKey(r1, r2, through_db); + uint64_t start_micros = env->NowMicros(); + 
port::MemoryBarrier(); + if (!through_db) { + s = table_reader->Get(ro, key, arg, DummySaveValue, nullptr); + } else { + s = db->Get(ro, key, &result); + } + port::MemoryBarrier(); + hist.Add(env->NowMicros() - start_micros); + } else { + int r2_len; + if (if_query_empty_keys) { + r2_len = 0; + } else { + r2_len = rnd.Uniform(num_keys2) + 1; + if (r2_len + r2 > num_keys2) { + r2_len = num_keys2 - r2; + } + } + std::string start_key = MakeKey(r1, r2, through_db); + std::string end_key = MakeKey(r1, r2 + r2_len, through_db); + if (prefix_len < 16) { + prefix = Slice(start_key.data(), prefix_len); + read_options.prefix = &prefix; + } + uint64_t total_time = 0; + uint64_t start_micros = env->NowMicros(); + port::MemoryBarrier(); + Iterator* iter; + if (!through_db) { + iter = table_reader->NewIterator(read_options); + } else { + iter = db->NewIterator(read_options); + } + int count = 0; + for(iter->Seek(start_key); iter->Valid(); iter->Next()) { + if (if_query_empty_keys) { + break; + } + // verify key; + port::MemoryBarrier(); + total_time += env->NowMicros() - start_micros; + assert(Slice(MakeKey(r1, r2 + count, through_db)) == iter->key()); + start_micros = env->NowMicros(); + if (++count >= r2_len) { + break; + } + } + if (count != r2_len) { + fprintf( + stderr, "Iterator cannot iterate expected number of entries. " + "Expected %d but got %d\n", r2_len, count); + assert(false); + } + delete iter; + port::MemoryBarrier(); + total_time += env->NowMicros() - start_micros; + hist.Add(total_time); + } + } + } + } + + fprintf( + stderr, + "===================================================" + "====================================================\n" + "InMemoryTableSimpleBenchmark: %20s num_key1: %5d " + "num_key2: %5d %10s\n" + "===================================================" + "====================================================" + "\nHistogram (unit: microseconds): \n%s", + opts.table_factory->Name(), num_keys1, num_keys2, + for_iterator? 
"iterator" : (if_query_empty_keys ? "empty" : "non_empty"), + hist.ToString().c_str()); + if (!through_db) { + env->DeleteFile(file_name); + } else { + delete db; + db = nullptr; + DestroyDB(dbname, opts); + } +} +} // namespace rocksdb + +DEFINE_bool(query_empty, false, "query non-existing keys instead of existing " + "ones."); +DEFINE_int32(num_keys1, 4096, "number of distinguish prefix of keys"); +DEFINE_int32(num_keys2, 512, "number of distinguish keys for each prefix"); +DEFINE_int32(iter, 3, "query non-existing keys instead of existing ones"); +DEFINE_int32(prefix_len, 16, "Prefix length used for iterators and indexes"); +DEFINE_bool(iterator, false, "For test iterator"); +DEFINE_bool(through_db, false, "If enable, a DB instance will be created and " + "the query will be against DB. Otherwise, will be directly against " + "a table reader."); + +int main(int argc, char** argv) { + google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); + google::ParseCommandLineFlags(&argc, &argv, true); + + rocksdb::TableFactory* tf = new rocksdb::BlockBasedTableFactory(); + rocksdb::Options options; + if (FLAGS_prefix_len < 16) { + options.prefix_extractor = rocksdb::NewFixedPrefixTransform( + FLAGS_prefix_len); + } + options.SetUpDefaultFlushBlockPolicyFactory(); + rocksdb::ReadOptions ro; + rocksdb::EnvOptions env_options; + options.create_if_missing = true; + options.table_factory = + std::shared_ptr(tf); + TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1, + FLAGS_num_keys2, FLAGS_iter, FLAGS_prefix_len, + FLAGS_query_empty, FLAGS_iterator, FLAGS_through_db); + delete tf; + return 0; +} diff --git a/table/table_test.cc b/table/table_test.cc new file mode 100644 index 00000000..d404e0b2 --- /dev/null +++ b/table/table_test.cc @@ -0,0 +1,1305 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/db_statistics.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "rocksdb/cache.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/memtablerep.h" +#include "table/block_based_table_builder.h" +#include "table/block_based_table_factory.h" +#include "table/block_based_table_reader.h" +#include "table/block_builder.h" +#include "table/block.h" +#include "table/format.h" +#include "util/random.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +namespace { +// Return reverse of "key". +// Used to test non-lexicographic comparators. 
+static std::string Reverse(const Slice& key) { + std::string str(key.ToString()); + std::string rev(""); + for (std::string::reverse_iterator rit = str.rbegin(); + rit != str.rend(); ++rit) { + rev.push_back(*rit); + } + return rev; +} + +class ReverseKeyComparator : public Comparator { + public: + virtual const char* Name() const { + return "rocksdb.ReverseBytewiseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return BytewiseComparator()->Compare(Reverse(a), Reverse(b)); + } + + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const { + std::string s = Reverse(*start); + std::string l = Reverse(limit); + BytewiseComparator()->FindShortestSeparator(&s, l); + *start = Reverse(s); + } + + virtual void FindShortSuccessor(std::string* key) const { + std::string s = Reverse(*key); + BytewiseComparator()->FindShortSuccessor(&s); + *key = Reverse(s); + } +}; +} // namespace +static ReverseKeyComparator reverse_key_comparator; + +static void Increment(const Comparator* cmp, std::string* key) { + if (cmp == BytewiseComparator()) { + key->push_back('\0'); + } else { + assert(cmp == &reverse_key_comparator); + std::string rev = Reverse(*key); + rev.push_back('\0'); + *key = Reverse(rev); + } +} + +// An STL comparator that uses a Comparator +namespace anon { +struct STLLessThan { + const Comparator* cmp; + + STLLessThan() : cmp(BytewiseComparator()) { } + explicit STLLessThan(const Comparator* c) : cmp(c) { } + bool operator()(const std::string& a, const std::string& b) const { + return cmp->Compare(Slice(a), Slice(b)) < 0; + } +}; +} // namespace + +class StringSink: public WritableFile { + public: + ~StringSink() { } + + const std::string& contents() const { return contents_; } + + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } + + virtual Status Append(const Slice& data) { + contents_.append(data.data(), 
data.size()); + return Status::OK(); + } + + private: + std::string contents_; +}; + + +class StringSource: public RandomAccessFile { + public: + StringSource(const Slice& contents, uint64_t uniq_id) + : contents_(contents.data(), contents.size()), uniq_id_(uniq_id) { + } + + virtual ~StringSource() { } + + uint64_t Size() const { return contents_.size(); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + if (offset > contents_.size()) { + return Status::InvalidArgument("invalid Read offset"); + } + if (offset + n > contents_.size()) { + n = contents_.size() - offset; + } + memcpy(scratch, &contents_[offset], n); + *result = Slice(scratch, n); + return Status::OK(); + } + + virtual size_t GetUniqueId(char* id, size_t max_size) const { + if (max_size < 20) { + return 0; + } + + char* rid = id; + rid = EncodeVarint64(rid, uniq_id_); + rid = EncodeVarint64(rid, 0); + return static_cast(rid-id); + } + + private: + std::string contents_; + uint64_t uniq_id_; +}; + +typedef std::map KVMap; + +// Helper class for tests to unify the interface between +// BlockBuilder/TableBuilder and Block/Table. +class Constructor { + public: + explicit Constructor(const Comparator* cmp) : data_(anon::STLLessThan(cmp)) { } + virtual ~Constructor() { } + + void Add(const std::string& key, const Slice& value) { + data_[key] = value.ToString(); + } + + // Finish constructing the data structure with all the keys that have + // been added so far. 
Returns the keys in sorted order in "*keys" + // and stores the key/value pairs in "*kvmap" + void Finish(const Options& options, + std::vector* keys, + KVMap* kvmap) { + *kvmap = data_; + keys->clear(); + for (KVMap::const_iterator it = data_.begin(); + it != data_.end(); + ++it) { + keys->push_back(it->first); + } + data_.clear(); + Status s = FinishImpl(options, *kvmap); + ASSERT_TRUE(s.ok()) << s.ToString(); + } + + // Construct the data structure from the data in "data" + virtual Status FinishImpl(const Options& options, const KVMap& data) = 0; + + virtual Iterator* NewIterator() const = 0; + + virtual const KVMap& data() { return data_; } + + virtual DB* db() const { return nullptr; } // Overridden in DBConstructor + + private: + KVMap data_; +}; + +class BlockConstructor: public Constructor { + public: + explicit BlockConstructor(const Comparator* cmp) + : Constructor(cmp), + comparator_(cmp), + block_(nullptr) { } + ~BlockConstructor() { + delete block_; + } + virtual Status FinishImpl(const Options& options, const KVMap& data) { + delete block_; + block_ = nullptr; + BlockBuilder builder(options); + + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + builder.Add(it->first, it->second); + } + // Open the block + data_ = builder.Finish().ToString(); + BlockContents contents; + contents.data = data_; + contents.cachable = false; + contents.heap_allocated = false; + block_ = new Block(contents); + return Status::OK(); + } + virtual Iterator* NewIterator() const { + return block_->NewIterator(comparator_); + } + + private: + const Comparator* comparator_; + std::string data_; + Block* block_; + + BlockConstructor(); +}; + +class BlockBasedTableConstructor: public Constructor { + public: + explicit BlockBasedTableConstructor( + const Comparator* cmp) + : Constructor(cmp) { + } + ~BlockBasedTableConstructor() { + Reset(); + } + virtual Status FinishImpl(const Options& options, const KVMap& data) { + Reset(); + sink_.reset(new 
StringSink()); + std::unique_ptr flush_policy_factory( + new FlushBlockBySizePolicyFactory(options.block_size, + options.block_size_deviation)); + + BlockBasedTableBuilder builder( + options, + sink_.get(), + flush_policy_factory.get(), + options.compression); + + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + builder.Add(it->first, it->second); + ASSERT_TRUE(builder.status().ok()); + } + Status s = builder.Finish(); + ASSERT_TRUE(s.ok()) << s.ToString(); + + ASSERT_EQ(sink_->contents().size(), builder.FileSize()); + + // Open the table + uniq_id_ = cur_uniq_id_++; + source_.reset(new StringSource(sink_->contents(), uniq_id_)); + unique_ptr table_factory; + return options.table_factory->GetTableReader(options, soptions, + std::move(source_), + sink_->contents().size(), + &table_reader_); + } + + virtual Iterator* NewIterator() const { + return table_reader_->NewIterator(ReadOptions()); + } + + uint64_t ApproximateOffsetOf(const Slice& key) const { + return table_reader_->ApproximateOffsetOf(key); + } + + virtual Status Reopen(const Options& options) { + source_.reset(new StringSource(sink_->contents(), uniq_id_)); + return options.table_factory->GetTableReader(options, soptions, + std::move(source_), + sink_->contents().size(), + &table_reader_); + } + + virtual TableReader* table_reader() { + return table_reader_.get(); + } + + private: + void Reset() { + uniq_id_ = 0; + table_reader_.reset(); + sink_.reset(); + source_.reset(); + } + + uint64_t uniq_id_; + unique_ptr sink_; + unique_ptr source_; + unique_ptr table_reader_; + + BlockBasedTableConstructor(); + + static uint64_t cur_uniq_id_; + const EnvOptions soptions; +}; +uint64_t BlockBasedTableConstructor::cur_uniq_id_ = 1; + +// A helper class that converts internal format keys into user keys +class KeyConvertingIterator: public Iterator { + public: + explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } + virtual ~KeyConvertingIterator() { delete iter_; } + virtual 
bool Valid() const { return iter_->Valid(); } + virtual void Seek(const Slice& target) { + ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + iter_->Seek(encoded); + } + virtual void SeekToFirst() { iter_->SeekToFirst(); } + virtual void SeekToLast() { iter_->SeekToLast(); } + virtual void Next() { iter_->Next(); } + virtual void Prev() { iter_->Prev(); } + + virtual Slice key() const { + assert(Valid()); + ParsedInternalKey key; + if (!ParseInternalKey(iter_->key(), &key)) { + status_ = Status::Corruption("malformed internal key"); + return Slice("corrupted key"); + } + return key.user_key; + } + + virtual Slice value() const { return iter_->value(); } + virtual Status status() const { + return status_.ok() ? iter_->status() : status_; + } + + private: + mutable Status status_; + Iterator* iter_; + + // No copying allowed + KeyConvertingIterator(const KeyConvertingIterator&); + void operator=(const KeyConvertingIterator&); +}; + +class MemTableConstructor: public Constructor { + public: + explicit MemTableConstructor(const Comparator* cmp) + : Constructor(cmp), + internal_comparator_(cmp), + table_factory_(new SkipListFactory) { + Options options; + options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, options); + memtable_->Ref(); + } + ~MemTableConstructor() { + delete memtable_->Unref(); + } + virtual Status FinishImpl(const Options& options, const KVMap& data) { + delete memtable_->Unref(); + Options memtable_options; + memtable_options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, memtable_options); + memtable_->Ref(); + int seq = 1; + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + memtable_->Add(seq, kTypeValue, it->first, it->second); + seq++; + } + return Status::OK(); + } + virtual Iterator* NewIterator() const { + return new KeyConvertingIterator(memtable_->NewIterator()); + 
} + + private: + InternalKeyComparator internal_comparator_; + MemTable* memtable_; + std::shared_ptr table_factory_; +}; + +class DBConstructor: public Constructor { + public: + explicit DBConstructor(const Comparator* cmp) + : Constructor(cmp), + comparator_(cmp) { + db_ = nullptr; + NewDB(); + } + ~DBConstructor() { + delete db_; + } + virtual Status FinishImpl(const Options& options, const KVMap& data) { + delete db_; + db_ = nullptr; + NewDB(); + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + WriteBatch batch; + batch.Put(it->first, it->second); + ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok()); + } + return Status::OK(); + } + virtual Iterator* NewIterator() const { + return db_->NewIterator(ReadOptions()); + } + + virtual DB* db() const { return db_; } + + private: + void NewDB() { + std::string name = test::TmpDir() + "/table_testdb"; + + Options options; + options.comparator = comparator_; + Status status = DestroyDB(name, options); + ASSERT_TRUE(status.ok()) << status.ToString(); + + options.create_if_missing = true; + options.error_if_exists = true; + options.write_buffer_size = 10000; // Something small to force merging + status = DB::Open(options, name, &db_); + ASSERT_TRUE(status.ok()) << status.ToString(); + } + + const Comparator* comparator_; + DB* db_; +}; + +static bool SnappyCompressionSupported() { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Snappy_Compress(Options().compression_opts, + in.data(), in.size(), + &out); +} + +static bool ZlibCompressionSupported() { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Zlib_Compress(Options().compression_opts, + in.data(), in.size(), + &out); +} + +#ifdef BZIP2 +static bool BZip2CompressionSupported() { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::BZip2_Compress(Options().compression_opts, + in.data(), in.size(), + &out); +} +#endif + +enum TestType { + 
TABLE_TEST, + BLOCK_TEST, + MEMTABLE_TEST, + DB_TEST +}; + +struct TestArgs { + TestType type; + bool reverse_compare; + int restart_interval; + CompressionType compression; +}; + + +static std::vector GenerateArgList() { + std::vector ret; + TestType test_type[4] = {TABLE_TEST, BLOCK_TEST, MEMTABLE_TEST, DB_TEST}; + int test_type_len = 4; + bool reverse_compare[2] = {false, true}; + int reverse_compare_len = 2; + int restart_interval[3] = {16, 1, 1024}; + int restart_interval_len = 3; + + // Only add compression if it is supported + std::vector compression_types; + compression_types.push_back(kNoCompression); +#ifdef SNAPPY + if (SnappyCompressionSupported()) + compression_types.push_back(kSnappyCompression); +#endif + +#ifdef ZLIB + if (ZlibCompressionSupported()) + compression_types.push_back(kZlibCompression); +#endif + +#ifdef BZIP2 + if (BZip2CompressionSupported()) + compression_types.push_back(kBZip2Compression); +#endif + + for(int i =0; i < test_type_len; i++) + for (int j =0; j < reverse_compare_len; j++) + for (int k =0; k < restart_interval_len; k++) + for (unsigned int n =0; n < compression_types.size(); n++) { + TestArgs one_arg; + one_arg.type = test_type[i]; + one_arg.reverse_compare = reverse_compare[j]; + one_arg.restart_interval = restart_interval[k]; + one_arg.compression = compression_types[n]; + ret.push_back(one_arg); + } + + return ret; +} + +class Harness { + public: + Harness() : constructor_(nullptr) { } + + void Init(const TestArgs& args) { + delete constructor_; + constructor_ = nullptr; + options_ = Options(); + + options_.block_restart_interval = args.restart_interval; + options_.compression = args.compression; + // Use shorter block size for tests to exercise block boundary + // conditions more. 
+ options_.block_size = 256; + if (args.reverse_compare) { + options_.comparator = &reverse_key_comparator; + } + switch (args.type) { + case TABLE_TEST: + constructor_ = new BlockBasedTableConstructor(options_.comparator); + break; + case BLOCK_TEST: + constructor_ = new BlockConstructor(options_.comparator); + break; + case MEMTABLE_TEST: + constructor_ = new MemTableConstructor(options_.comparator); + break; + case DB_TEST: + constructor_ = new DBConstructor(options_.comparator); + break; + } + } + + ~Harness() { + delete constructor_; + } + + void Add(const std::string& key, const std::string& value) { + constructor_->Add(key, value); + } + + void Test(Random* rnd) { + std::vector keys; + KVMap data; + constructor_->Finish(options_, &keys, &data); + + TestForwardScan(keys, data); + TestBackwardScan(keys, data); + TestRandomAccess(rnd, keys, data); + } + + void TestForwardScan(const std::vector& keys, + const KVMap& data) { + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToFirst(); + for (KVMap::const_iterator model_iter = data.begin(); + model_iter != data.end(); + ++model_iter) { + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + iter->Next(); + } + ASSERT_TRUE(!iter->Valid()); + delete iter; + } + + void TestBackwardScan(const std::vector& keys, + const KVMap& data) { + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToLast(); + for (KVMap::const_reverse_iterator model_iter = data.rbegin(); + model_iter != data.rend(); + ++model_iter) { + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + iter->Prev(); + } + ASSERT_TRUE(!iter->Valid()); + delete iter; + } + + void TestRandomAccess(Random* rnd, + const std::vector& keys, + const KVMap& data) { + static const bool kVerbose = false; + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + KVMap::const_iterator model_iter = data.begin(); + if (kVerbose) fprintf(stderr, "---\n"); + for (int i = 
0; i < 200; i++) { + const int toss = rnd->Uniform(5); + switch (toss) { + case 0: { + if (iter->Valid()) { + if (kVerbose) fprintf(stderr, "Next\n"); + iter->Next(); + ++model_iter; + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + } + break; + } + + case 1: { + if (kVerbose) fprintf(stderr, "SeekToFirst\n"); + iter->SeekToFirst(); + model_iter = data.begin(); + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + + case 2: { + std::string key = PickRandomKey(rnd, keys); + model_iter = data.lower_bound(key); + if (kVerbose) fprintf(stderr, "Seek '%s'\n", + EscapeString(key).c_str()); + iter->Seek(Slice(key)); + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + + case 3: { + if (iter->Valid()) { + if (kVerbose) fprintf(stderr, "Prev\n"); + iter->Prev(); + if (model_iter == data.begin()) { + model_iter = data.end(); // Wrap around to invalid value + } else { + --model_iter; + } + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + } + break; + } + + case 4: { + if (kVerbose) fprintf(stderr, "SeekToLast\n"); + iter->SeekToLast(); + if (keys.empty()) { + model_iter = data.end(); + } else { + std::string last = data.rbegin()->first; + model_iter = data.lower_bound(last); + } + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + } + } + delete iter; + } + + std::string ToString(const KVMap& data, const KVMap::const_iterator& it) { + if (it == data.end()) { + return "END"; + } else { + return "'" + it->first + "->" + it->second + "'"; + } + } + + std::string ToString(const KVMap& data, + const KVMap::const_reverse_iterator& it) { + if (it == data.rend()) { + return "END"; + } else { + return "'" + it->first + "->" + it->second + "'"; + } + } + + std::string ToString(const Iterator* it) { + if (!it->Valid()) { + return "END"; + } else { + return "'" + it->key().ToString() + "->" + it->value().ToString() + "'"; + } + } + + std::string PickRandomKey(Random* rnd, const std::vector& keys) { + if 
(keys.empty()) { + return "foo"; + } else { + const int index = rnd->Uniform(keys.size()); + std::string result = keys[index]; + switch (rnd->Uniform(3)) { + case 0: + // Return an existing key + break; + case 1: { + // Attempt to return something smaller than an existing key + if (result.size() > 0 && result[result.size()-1] > '\0') { + result[result.size()-1]--; + } + break; + } + case 2: { + // Return something larger than an existing key + Increment(options_.comparator, &result); + break; + } + } + return result; + } + } + + // Returns nullptr if not running against a DB + DB* db() const { return constructor_->db(); } + + private: + Options options_ = Options(); + Constructor* constructor_; +}; + +// Test the empty key +TEST(Harness, SimpleEmptyKey) { + std::vector args = GenerateArgList(); + for (unsigned int i = 0; i < args.size(); i++) { + Init(args[i]); + Random rnd(test::RandomSeed() + 1); + Add("", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSingle) { + std::vector args = GenerateArgList(); + for (unsigned int i = 0; i < args.size(); i++) { + Init(args[i]); + Random rnd(test::RandomSeed() + 2); + Add("abc", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleMulti) { + std::vector args = GenerateArgList(); + for (unsigned int i = 0; i < args.size(); i++) { + Init(args[i]); + Random rnd(test::RandomSeed() + 3); + Add("abc", "v"); + Add("abcd", "v"); + Add("ac", "v2"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSpecialKey) { + std::vector args = GenerateArgList(); + for (unsigned int i = 0; i < args.size(); i++) { + Init(args[i]); + Random rnd(test::RandomSeed() + 4); + Add("\xff\xff", "v3"); + Test(&rnd); + } +} + +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), + (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} + +class TableTest { }; + +// This 
test include all the basic checks except those for index size and block +// size, which will be conducted in separated unit tests. +TEST(TableTest, BasicTableProperties) { + BlockBasedTableConstructor c(BytewiseComparator()); + + c.Add("a1", "val1"); + c.Add("b2", "val2"); + c.Add("c3", "val3"); + c.Add("d4", "val4"); + c.Add("e5", "val5"); + c.Add("f6", "val6"); + c.Add("g7", "val7"); + c.Add("h8", "val8"); + c.Add("j9", "val9"); + + std::vector keys; + KVMap kvmap; + Options options; + options.compression = kNoCompression; + options.block_restart_interval = 1; + + c.Finish(options, &keys, &kvmap); + + auto& props = c.table_reader()->GetTableProperties(); + ASSERT_EQ(kvmap.size(), props.num_entries); + + auto raw_key_size = kvmap.size() * 2ul; + auto raw_value_size = kvmap.size() * 4ul; + + ASSERT_EQ(raw_key_size, props.raw_key_size); + ASSERT_EQ(raw_value_size, props.raw_value_size); + ASSERT_EQ(1ul, props.num_data_blocks); + ASSERT_EQ("", props.filter_policy_name); // no filter policy is used + + // Verify data size. + BlockBuilder block_builder(options); + for (const auto& item : kvmap) { + block_builder.Add(item.first, item.second); + } + Slice content = block_builder.Finish(); + ASSERT_EQ( + content.size() + kBlockTrailerSize, + props.data_size + ); +} + +TEST(TableTest, FilterPolicyNameProperties) { + BlockBasedTableConstructor c(BytewiseComparator()); + c.Add("a1", "val1"); + std::vector keys; + KVMap kvmap; + Options options; + std::unique_ptr filter_policy( + NewBloomFilterPolicy(10) + ); + options.filter_policy = filter_policy.get(); + + c.Finish(options, &keys, &kvmap); + auto& props = c.table_reader()->GetTableProperties(); + ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name); +} + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +// It's very hard to figure out the index block size of a block accurately. 
+// To make sure we get the index size, we just make sure as key number +// grows, the filter block size also grows. +TEST(TableTest, IndexSizeStat) { + uint64_t last_index_size = 0; + + // we need to use random keys since the pure human readable texts + // may be well compressed, resulting insignifcant change of index + // block size. + Random rnd(test::RandomSeed()); + std::vector keys; + + for (int i = 0; i < 100; ++i) { + keys.push_back(RandomString(&rnd, 10000)); + } + + // Each time we load one more key to the table. the table index block + // size is expected to be larger than last time's. + for (size_t i = 1; i < keys.size(); ++i) { + BlockBasedTableConstructor c(BytewiseComparator()); + for (size_t j = 0; j < i; ++j) { + c.Add(keys[j], "val"); + } + + std::vector ks; + KVMap kvmap; + Options options; + options.compression = kNoCompression; + options.block_restart_interval = 1; + + c.Finish(options, &ks, &kvmap); + auto index_size = + c.table_reader()->GetTableProperties().index_size; + ASSERT_GT(index_size, last_index_size); + last_index_size = index_size; + } +} + +TEST(TableTest, NumBlockStat) { + Random rnd(test::RandomSeed()); + BlockBasedTableConstructor c(BytewiseComparator()); + Options options; + options.compression = kNoCompression; + options.block_restart_interval = 1; + options.block_size = 1000; + + for (int i = 0; i < 10; ++i) { + // the key/val are slightly smaller than block size, so that each block + // holds roughly one key/value pair. 
+ c.Add(RandomString(&rnd, 900), "val"); + } + + std::vector ks; + KVMap kvmap; + c.Finish(options, &ks, &kvmap); + ASSERT_EQ( + kvmap.size(), + c.table_reader()->GetTableProperties().num_data_blocks + ); +} + +class BlockCacheProperties { + public: + explicit BlockCacheProperties(Statistics* statistics) { + block_cache_miss = + statistics->getTickerCount(BLOCK_CACHE_MISS); + block_cache_hit = + statistics->getTickerCount(BLOCK_CACHE_HIT); + index_block_cache_miss = + statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS); + index_block_cache_hit = + statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT); + data_block_cache_miss = + statistics->getTickerCount(BLOCK_CACHE_DATA_MISS); + data_block_cache_hit = + statistics->getTickerCount(BLOCK_CACHE_DATA_HIT); + } + + // Check if the fetched props matches the expected ones. + void AssertEqual( + long index_block_cache_miss, + long index_block_cache_hit, + long data_block_cache_miss, + long data_block_cache_hit) const { + ASSERT_EQ(index_block_cache_miss, this->index_block_cache_miss); + ASSERT_EQ(index_block_cache_hit, this->index_block_cache_hit); + ASSERT_EQ(data_block_cache_miss, this->data_block_cache_miss); + ASSERT_EQ(data_block_cache_hit, this->data_block_cache_hit); + ASSERT_EQ( + index_block_cache_miss + data_block_cache_miss, + this->block_cache_miss + ); + ASSERT_EQ( + index_block_cache_hit + data_block_cache_hit, + this->block_cache_hit + ); + } + + private: + long block_cache_miss = 0; + long block_cache_hit = 0; + long index_block_cache_miss = 0; + long index_block_cache_hit = 0; + long data_block_cache_miss = 0; + long data_block_cache_hit = 0; +}; + +TEST(TableTest, BlockCacheTest) { + // -- Table construction + Options options; + options.create_if_missing = true; + options.statistics = CreateDBStatistics(); + options.block_cache = NewLRUCache(1024); + std::vector keys; + KVMap kvmap; + + BlockBasedTableConstructor c(BytewiseComparator()); + c.Add("key", "value"); + c.Finish(options, &keys, &kvmap); + + // -- 
PART 1: Open with regular block cache. + // Since block_cache is disabled, no cache activities will be involved. + unique_ptr iter; + + // At first, no block will be accessed. + { + BlockCacheProperties props(options.statistics.get()); + // index will be added to block cache. + props.AssertEqual( + 1, // index block miss + 0, + 0, + 0 + ); + } + + // Only index block will be accessed + { + iter.reset(c.NewIterator()); + BlockCacheProperties props(options.statistics.get()); + // NOTE: to help better highlight the "detla" of each ticker, I use + // + to indicate the increment of changed + // value; other numbers remain the same. + props.AssertEqual( + 1, + 0 + 1, // index block hit + 0, + 0 + ); + } + + // Only data block will be accessed + { + iter->SeekToFirst(); + BlockCacheProperties props(options.statistics.get()); + props.AssertEqual( + 1, + 1, + 0 + 1, // data block miss + 0 + ); + } + + // Data block will be in cache + { + iter.reset(c.NewIterator()); + iter->SeekToFirst(); + BlockCacheProperties props(options.statistics.get()); + props.AssertEqual( + 1, + 1 + 1, // index block hit + 1, + 0 + 1 // data block hit + ); + } + // release the iterator so that the block cache can reset correctly. + iter.reset(); + + // -- PART 2: Open without block cache + options.block_cache.reset(); + options.statistics = CreateDBStatistics(); // reset the stats + c.Reopen(options); + + { + iter.reset(c.NewIterator()); + iter->SeekToFirst(); + ASSERT_EQ("key", iter->key().ToString()); + BlockCacheProperties props(options.statistics.get()); + // Nothing is affected at all + props.AssertEqual(0, 0, 0, 0); + } + + // -- PART 3: Open with very small block cache + // In this test, no block will ever get hit since the block cache is + // too small to fit even one entry. 
+ options.block_cache = NewLRUCache(1); + c.Reopen(options); + { + BlockCacheProperties props(options.statistics.get()); + props.AssertEqual( + 1, // index block miss + 0, + 0, + 0 + ); + } + + + { + // Both index and data block get accessed. + // It first cache index block then data block. But since the cache size + // is only 1, index block will be purged after data block is inserted. + iter.reset(c.NewIterator()); + BlockCacheProperties props(options.statistics.get()); + props.AssertEqual( + 1 + 1, // index block miss + 0, + 0, // data block miss + 0 + ); + } + + { + // SeekToFirst() accesses data block. With similar reason, we expect data + // block's cache miss. + iter->SeekToFirst(); + BlockCacheProperties props(options.statistics.get()); + props.AssertEqual( + 2, + 0, + 0 + 1, // data block miss + 0 + ); + } +} + +TEST(TableTest, ApproximateOffsetOfPlain) { + BlockBasedTableConstructor c(BytewiseComparator()); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector keys; + KVMap kvmap; + Options options; + options.block_size = 1024; + options.compression = kNoCompression; + c.Finish(options, &keys, &kvmap); + + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 210000, 211000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); + 
ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000)); + +} + +static void Do_Compression_Test(CompressionType comp) { + Random rnd(301); + BlockBasedTableConstructor c(BytewiseComparator()); + std::string tmp; + c.Add("k01", "hello"); + c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); + c.Add("k03", "hello3"); + c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); + std::vector keys; + KVMap kvmap; + Options options; + options.block_size = 1024; + options.compression = comp; + c.Finish(options, &keys, &kvmap); + + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000)); +} + +TEST(TableTest, ApproximateOffsetOfCompressed) { + CompressionType compression_state[2]; + int valid = 0; + if (!SnappyCompressionSupported()) { + fprintf(stderr, "skipping snappy compression tests\n"); + } else { + compression_state[valid] = kSnappyCompression; + valid++; + } + + if (!ZlibCompressionSupported()) { + fprintf(stderr, "skipping zlib compression tests\n"); + } else { + compression_state[valid] = kZlibCompression; + valid++; + } + + for(int i =0; i < valid; i++) + { + Do_Compression_Test(compression_state[i]); + } + +} + +TEST(TableTest, BlockCacheLeak) { + // Check that when we reopen a table we don't lose access to blocks already + // in the cache. This test checks whether the Table actually makes use of the + // unique ID from the file. + + Options opt; + opt.block_size = 1024; + opt.compression = kNoCompression; + opt.block_cache = NewLRUCache(16*1024*1024); // big enough so we don't ever + // lose cached values. 
+ + BlockBasedTableConstructor c(BytewiseComparator()); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector keys; + KVMap kvmap; + c.Finish(opt, &keys, &kvmap); + + unique_ptr iter(c.NewIterator()); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + iter->Next(); + } + ASSERT_OK(iter->status()); + + ASSERT_OK(c.Reopen(opt)); + for (const std::string& key: keys) { + ASSERT_TRUE(c.table_reader()->TEST_KeyInCache(ReadOptions(), key)); + } +} + +TEST(Harness, Randomized) { + std::vector args = GenerateArgList(); + for (unsigned int i = 0; i < args.size(); i++) { + Init(args[i]); + Random rnd(test::RandomSeed() + 5); + for (int num_entries = 0; num_entries < 2000; + num_entries += (num_entries < 50 ? 1 : 200)) { + if ((num_entries % 10) == 0) { + fprintf(stderr, "case %d of %d: num_entries = %d\n", + (i + 1), int(args.size()), num_entries); + } + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + } + Test(&rnd); + } + } +} + +TEST(Harness, RandomizedLongDB) { + Random rnd(test::RandomSeed()); + TestArgs args = { DB_TEST, false, 16, kNoCompression }; + Init(args); + int num_entries = 100000; + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + } + Test(&rnd); + + // We must have created enough data to force merging + int files = 0; + for (int level = 0; level < db()->NumberLevels(); level++) { + std::string value; + char name[100]; + snprintf(name, sizeof(name), "rocksdb.num-files-at-level%d", level); + ASSERT_TRUE(db()->GetProperty(name, &value)); + files += atoi(value.c_str()); + } + ASSERT_GT(files, 0); +} + +class 
MemTableTest { }; + +TEST(MemTableTest, Simple) { + InternalKeyComparator cmp(BytewiseComparator()); + auto table_factory = std::make_shared(); + Options options; + options.memtable_factory = table_factory; + MemTable* memtable = new MemTable(cmp, options); + memtable->Ref(); + WriteBatch batch; + WriteBatchInternal::SetSequence(&batch, 100); + batch.Put(std::string("k1"), std::string("v1")); + batch.Put(std::string("k2"), std::string("v2")); + batch.Put(std::string("k3"), std::string("v3")); + batch.Put(std::string("largekey"), std::string("vlarge")); + ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable, &options).ok()); + + Iterator* iter = memtable->NewIterator(); + iter->SeekToFirst(); + while (iter->Valid()) { + fprintf(stderr, "key: '%s' -> '%s'\n", + iter->key().ToString().c_str(), + iter->value().ToString().c_str()); + iter->Next(); + } + + delete iter; + delete memtable->Unref(); +} + + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc new file mode 100644 index 00000000..ac2d8d3d --- /dev/null +++ b/table/two_level_iterator.cc @@ -0,0 +1,205 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/two_level_iterator.h" + +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/block.h" +#include "table/format.h" +#include "table/iterator_wrapper.h" + +namespace rocksdb { + +namespace { + +typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, + const EnvOptions& soptions, const Slice&, + bool for_compaction); + +class TwoLevelIterator: public Iterator { + public: + TwoLevelIterator( + Iterator* index_iter, + BlockFunction block_function, + void* arg, + const ReadOptions& options, + const EnvOptions& soptions, + bool for_compaction); + + virtual ~TwoLevelIterator(); + + virtual void Seek(const Slice& target); + virtual void SeekToFirst(); + virtual void SeekToLast(); + virtual void Next(); + virtual void Prev(); + + virtual bool Valid() const { + return data_iter_.Valid(); + } + virtual Slice key() const { + assert(Valid()); + return data_iter_.key(); + } + virtual Slice value() const { + assert(Valid()); + return data_iter_.value(); + } + virtual Status status() const { + // It'd be nice if status() returned a const Status& instead of a Status + if (!index_iter_.status().ok()) { + return index_iter_.status(); + } else if (data_iter_.iter() != nullptr && !data_iter_.status().ok()) { + return data_iter_.status(); + } else { + return status_; + } + } + + private: + void SaveError(const Status& s) { + if (status_.ok() && !s.ok()) status_ = s; + } + void SkipEmptyDataBlocksForward(); + void SkipEmptyDataBlocksBackward(); + void SetDataIterator(Iterator* data_iter); + void InitDataBlock(); + + BlockFunction block_function_; + void* arg_; + const ReadOptions options_; + const EnvOptions& soptions_; + Status status_; + IteratorWrapper index_iter_; + IteratorWrapper data_iter_; // May be nullptr + // If data_iter_ is non-nullptr, then "data_block_handle_" holds the + // "index_value" passed to block_function_ to create the data_iter_. 
+ std::string data_block_handle_; + bool for_compaction_; +}; + +TwoLevelIterator::TwoLevelIterator( + Iterator* index_iter, + BlockFunction block_function, + void* arg, + const ReadOptions& options, + const EnvOptions& soptions, + bool for_compaction) + : block_function_(block_function), + arg_(arg), + options_(options), + soptions_(soptions), + index_iter_(index_iter), + data_iter_(nullptr), + for_compaction_(for_compaction) { +} + +TwoLevelIterator::~TwoLevelIterator() { +} + +void TwoLevelIterator::Seek(const Slice& target) { + index_iter_.Seek(target); + InitDataBlock(); + if (data_iter_.iter() != nullptr) data_iter_.Seek(target); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::SeekToFirst() { + index_iter_.SeekToFirst(); + InitDataBlock(); + if (data_iter_.iter() != nullptr) data_iter_.SeekToFirst(); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::SeekToLast() { + index_iter_.SeekToLast(); + InitDataBlock(); + if (data_iter_.iter() != nullptr) data_iter_.SeekToLast(); + SkipEmptyDataBlocksBackward(); +} + +void TwoLevelIterator::Next() { + assert(Valid()); + data_iter_.Next(); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::Prev() { + assert(Valid()); + data_iter_.Prev(); + SkipEmptyDataBlocksBackward(); +} + + +void TwoLevelIterator::SkipEmptyDataBlocksForward() { + while (data_iter_.iter() == nullptr || (!data_iter_.Valid() && + !data_iter_.status().IsIncomplete())) { + // Move to next block + if (!index_iter_.Valid()) { + SetDataIterator(nullptr); + return; + } + index_iter_.Next(); + InitDataBlock(); + if (data_iter_.iter() != nullptr) data_iter_.SeekToFirst(); + } +} + +void TwoLevelIterator::SkipEmptyDataBlocksBackward() { + while (data_iter_.iter() == nullptr || (!data_iter_.Valid() && + !data_iter_.status().IsIncomplete())) { + // Move to next block + if (!index_iter_.Valid()) { + SetDataIterator(nullptr); + return; + } + index_iter_.Prev(); + InitDataBlock(); + if (data_iter_.iter() != nullptr) 
data_iter_.SeekToLast(); + } +} + +void TwoLevelIterator::SetDataIterator(Iterator* data_iter) { + if (data_iter_.iter() != nullptr) SaveError(data_iter_.status()); + data_iter_.Set(data_iter); +} + +void TwoLevelIterator::InitDataBlock() { + if (!index_iter_.Valid()) { + SetDataIterator(nullptr); + } else { + Slice handle = index_iter_.value(); + if (data_iter_.iter() != nullptr + && handle.compare(data_block_handle_) == 0) { + // data_iter_ is already constructed with this iterator, so + // no need to change anything + } else { + Iterator* iter = (*block_function_)(arg_, options_, soptions_, handle, + for_compaction_); + data_block_handle_.assign(handle.data(), handle.size()); + SetDataIterator(iter); + } + } +} + +} // namespace + +Iterator* NewTwoLevelIterator( + Iterator* index_iter, + BlockFunction block_function, + void* arg, + const ReadOptions& options, + const EnvOptions& soptions, + bool for_compaction) { + return new TwoLevelIterator(index_iter, block_function, arg, + options, soptions, for_compaction); +} + +} // namespace rocksdb diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h new file mode 100644 index 00000000..85aed3f1 --- /dev/null +++ b/table/two_level_iterator.h @@ -0,0 +1,40 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/iterator.h" +#include "rocksdb/env.h" + +namespace rocksdb { + +struct ReadOptions; + +// Return a new two level iterator. 
A two-level iterator contains an +// index iterator whose values point to a sequence of blocks where +// each block is itself a sequence of key,value pairs. The returned +// two-level iterator yields the concatenation of all key/value pairs +// in the sequence of blocks. Takes ownership of "index_iter" and +// will delete it when no longer needed. +// +// Uses a supplied function to convert an index_iter value into +// an iterator over the contents of the corresponding block. +extern Iterator* NewTwoLevelIterator( + Iterator* index_iter, + Iterator* (*block_function)( + void* arg, + const ReadOptions& options, + const EnvOptions& soptions, + const Slice& index_value, + bool for_compaction), + void* arg, + const ReadOptions& options, + const EnvOptions& soptions, + bool for_compaction = false); + +} // namespace rocksdb diff --git a/tools/blob_store_bench.cc b/tools/blob_store_bench.cc new file mode 100644 index 00000000..70ece2c5 --- /dev/null +++ b/tools/blob_store_bench.cc @@ -0,0 +1,269 @@ +#include +#include +#include + +#include "rocksdb/env.h" +#include "util/blob_store.h" +#include "util/testutil.h" + +#define KB 1024LL +#define MB 1024*1024LL +// BlobStore does costly asserts to make sure it's running correctly, which +// significantly impacts benchmark runtime. +// NDEBUG will compile out those asserts. 
+#ifndef NDEBUG +#define NDEBUG +#endif + +using namespace rocksdb; +using namespace std; + +// used by all threads +uint64_t timeout_sec; +Env *env; +BlobStore* bs; + +static std::string RandomString(Random* rnd, uint64_t len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +struct Result { + uint32_t writes; + uint32_t reads; + uint32_t deletes; + uint64_t data_written; + uint64_t data_read; + + void print() { + printf("Total writes = %u\n", writes); + printf("Total reads = %u\n", reads); + printf("Total deletes = %u\n", deletes); + printf("Write throughput = %lf MB/s\n", + (double)data_written / (1024*1024.0) / timeout_sec); + printf("Read throughput = %lf MB/s\n", + (double)data_read / (1024*1024.0) / timeout_sec); + printf("Total throughput = %lf MB/s\n", + (double)(data_read + data_written) / (1024*1024.0) / timeout_sec); + } + + Result() { + writes = reads = deletes = data_read = data_written = 0; + } + + Result (uint32_t writes, uint32_t reads, uint32_t deletes, + uint64_t data_written, uint64_t data_read) : + writes(writes), reads(reads), deletes(deletes), + data_written(data_written), data_read(data_read) {} + +}; + +Result operator + (const Result &a, const Result &b) { + return Result(a.writes + b.writes, a.reads + b.reads, + a.deletes + b.deletes, a.data_written + b.data_written, + a.data_read + b.data_read); +} + +struct WorkerThread { + uint64_t data_size_from, data_size_to; + double read_ratio; + uint64_t working_set_size; // start deleting once you reach this + Result result; + atomic stopped; + + WorkerThread(uint64_t data_size_from, uint64_t data_size_to, + double read_ratio, uint64_t working_set_size) : + data_size_from(data_size_from), data_size_to(data_size_to), + read_ratio(read_ratio), working_set_size(working_set_size), + stopped(false) {} + + WorkerThread(const WorkerThread& wt) : + data_size_from(wt.data_size_from), data_size_to(wt.data_size_to), + read_ratio(wt.read_ratio), working_set_size(wt.working_set_size), + 
stopped(false) {} +}; + +static void WorkerThreadBody(void* arg) { + WorkerThread* t = reinterpret_cast(arg); + Random rnd(5); + string buf; + vector> blobs; + vector random_strings; + + for (int i = 0; i < 10; ++i) { + random_strings.push_back(RandomString(&rnd, t->data_size_to)); + } + + uint64_t total_size = 0; + + uint64_t start_micros = env->NowMicros(); + while (env->NowMicros() - start_micros < timeout_sec * 1000 * 1000) { + if (blobs.size() && rand() < RAND_MAX * t->read_ratio) { + // read + int bi = rand() % blobs.size(); + Status s = bs->Get(blobs[bi].first, &buf); + assert(s.ok()); + t->result.data_read += buf.size(); + t->result.reads++; + } else { + // write + uint64_t size = rand() % (t->data_size_to - t->data_size_from) + + t->data_size_from; + total_size += size; + string put_str = random_strings[rand() % random_strings.size()]; + blobs.push_back(make_pair(Blob(), size)); + Status s = bs->Put(Slice(put_str.data(), size), &blobs.back().first); + assert(s.ok()); + t->result.data_written += size; + t->result.writes++; + } + + while (total_size >= t->working_set_size) { + // delete random + int bi = rand() % blobs.size(); + total_size -= blobs[bi].second; + bs->Delete(blobs[bi].first); + blobs.erase(blobs.begin() + bi); + t->result.deletes++; + } + } + t->stopped.store(true); +} + +Result StartBenchmark(vector& config) { + for (auto w : config) { + env->StartThread(WorkerThreadBody, w); + } + + Result result; + + for (auto w : config) { + while (!w->stopped.load()); + result = result + w->result; + } + + for (auto w : config) { + delete w; + } + + delete bs; + + return result; +} + +vector SetupBenchmarkBalanced() { + string test_path; + env->GetTestDirectory(&test_path); + test_path.append("/blob_store"); + + // config start + uint32_t block_size = 16*KB; + uint32_t file_size = 1*MB; + double read_write_ratio = 0.5; + uint64_t data_read_from = 16*KB; + uint64_t data_read_to = 32*KB; + int number_of_threads = 10; + uint64_t working_set_size = 5*MB; + 
timeout_sec = 5; + // config end + + bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env); + + vector config; + + for (int i = 0; i < number_of_threads; ++i) { + config.push_back(new WorkerThread(data_read_from, + data_read_to, + read_write_ratio, + working_set_size)); + }; + + return config; +} + +vector SetupBenchmarkWriteHeavy() { + string test_path; + env->GetTestDirectory(&test_path); + test_path.append("/blob_store"); + + // config start + uint32_t block_size = 16*KB; + uint32_t file_size = 1*MB; + double read_write_ratio = 0.1; + uint64_t data_read_from = 16*KB; + uint64_t data_read_to = 32*KB; + int number_of_threads = 10; + uint64_t working_set_size = 5*MB; + timeout_sec = 5; + // config end + + bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env); + + vector config; + + for (int i = 0; i < number_of_threads; ++i) { + config.push_back(new WorkerThread(data_read_from, + data_read_to, + read_write_ratio, + working_set_size)); + }; + + return config; +} + +vector SetupBenchmarkReadHeavy() { + string test_path; + env->GetTestDirectory(&test_path); + test_path.append("/blob_store"); + + // config start + uint32_t block_size = 16*KB; + uint32_t file_size = 1*MB; + double read_write_ratio = 0.9; + uint64_t data_read_from = 16*KB; + uint64_t data_read_to = 32*KB; + int number_of_threads = 10; + uint64_t working_set_size = 5*MB; + timeout_sec = 5; + // config end + + bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env); + + vector config; + + for (int i = 0; i < number_of_threads; ++i) { + config.push_back(new WorkerThread(data_read_from, + data_read_to, + read_write_ratio, + working_set_size)); + }; + + return config; +} + +int main(int argc, const char** argv) { + srand(33); + env = Env::Default(); + + { + printf("--- Balanced read/write benchmark ---\n"); + vector config = SetupBenchmarkBalanced(); + Result r = StartBenchmark(config); + r.print(); + } + { + printf("--- Write heavy 
benchmark ---\n"); + vector config = SetupBenchmarkWriteHeavy(); + Result r = StartBenchmark(config); + r.print(); + } + { + printf("--- Read heavy benchmark ---\n"); + vector config = SetupBenchmarkReadHeavy(); + Result r = StartBenchmark(config); + r.print(); + } + + return 0; +} diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py new file mode 100644 index 00000000..6270d69c --- /dev/null +++ b/tools/db_crashtest.py @@ -0,0 +1,136 @@ +#! /usr/bin/env python +import os +import re +import sys +import time +import random +import getopt +import logging +import tempfile +import subprocess + +# This script runs and kills db_stress multiple times. It checks consistency +# in case of unsafe crashes in Rocksdb. + +def main(argv): + try: + opts, args = getopt.getopt(argv, "hd:t:i:o:b:") + except getopt.GetoptError: + print("db_crashtest.py -d -t <#threads> " + "-i -o " + "-b \n") + sys.exit(2) + + # default values, will be overridden by cmdline args + interval = 120 # time for one db_stress instance to run + duration = 6000 # total time for this script to test db_stress + threads = 32 + # since we will be killing anyway, use large value for ops_per_thread + ops_per_thread = 100000000 + write_buf_size = 4 * 1024 * 1024 + + for opt, arg in opts: + if opt == '-h': + print("db_crashtest.py -d " + " -t <#threads> -i " + " -o -b \n") + sys.exit() + elif opt == "-d": + duration = int(arg) + elif opt == "-t": + threads = int(arg) + elif opt == "-i": + interval = int(arg) + elif opt == "-o": + ops_per_thread = int(arg) + elif opt == "-b": + write_buf_size = int(arg) + else: + print("db_crashtest.py -d " + " -t <#threads> -i " + " -o -b \n") + sys.exit(2) + + exit_time = time.time() + duration + + print("Running blackbox-crash-test with \ninterval_between_crash=" + + str(interval) + "\ntotal-duration=" + str(duration) + + "\nthreads=" + str(threads) + "\nops_per_thread=" + + str(ops_per_thread) + "\nwrite_buffer_size=" + + str(write_buf_size) + "\n") + + while time.time() < 
exit_time: + run_had_errors = False + killtime = time.time() + interval + + cmd = re.sub('\s+', ' ', """ + ./db_stress + --test_batches_snapshots=1 + --ops_per_thread=%s + --threads=%s + --write_buffer_size=%s + --destroy_db_initially=0 + --reopen=0 + --readpercent=45 + --prefixpercent=5 + --writepercent=35 + --delpercent=5 + --iterpercent=10 + --db=%s + --max_key=100000000 + --disable_seek_compaction=%s + --mmap_read=%s + --block_size=16384 + --cache_size=1048576 + --open_files=500000 + --verify_checksum=1 + --sync=%s + --disable_wal=0 + --disable_data_sync=%s + --target_file_size_base=2097152 + --target_file_size_multiplier=2 + --max_write_buffer_number=3 + --max_background_compactions=20 + --max_bytes_for_level_base=10485760 + --filter_deletes=%s + """ % (ops_per_thread, + threads, + write_buf_size, + tempfile.mkdtemp(), + random.randint(0, 1), + random.randint(0, 1), + random.randint(0, 1), + random.randint(0, 1), + random.randint(0, 1))) + + child = subprocess.Popen([cmd], + stderr=subprocess.PIPE, shell=True) + print("Running db_stress with pid=%d: %s\n\n" + % (child.pid, cmd)) + + while time.time() < killtime: + time.sleep(10) + + if child.poll() is not None: + print("WARNING: db_stress ended before kill: exitcode=%d\n" + % child.returncode) + else: + child.kill() + print("KILLED %d\n" % child.pid) + time.sleep(1) # time to stabilize after a kill + + while True: + line = child.stderr.readline().strip() + if line != '': + run_had_errors = True + print('***' + line + '^') + else: + break + + if run_had_errors: + sys.exit(2) + + time.sleep(1) # time to stabilize before the next run + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/tools/db_crashtest2.py b/tools/db_crashtest2.py new file mode 100644 index 00000000..dbb7059f --- /dev/null +++ b/tools/db_crashtest2.py @@ -0,0 +1,163 @@ +#! 
/usr/bin/env python +import os +import re +import sys +import time +import random +import getopt +import logging +import tempfile +import subprocess + +# This python script runs db_stress multiple times. Some runs with +# kill_random_test that causes rocksdb to crash at various points in code. + +def main(argv): + try: + opts, args = getopt.getopt(argv, "hd:t:k:o:b:") + except getopt.GetoptError: + print str(getopt.GetoptError) + print "db_crashtest2.py -d -t <#threads> " \ + "-k -o "\ + "-b \n" + sys.exit(2) + + # default values, will be overridden by cmdline args + kill_random_test = 97 # kill with probability 1/97 by default + duration = 10000 # total time for this script to test db_stress + threads = 32 + ops_per_thread = 200000 + write_buf_size = 4 * 1024 * 1024 + + for opt, arg in opts: + if opt == '-h': + print "db_crashtest2.py -d -t <#threads> " \ + "-k -o " \ + "-b \n" + sys.exit() + elif opt == "-d": + duration = int(arg) + elif opt == "-t": + threads = int(arg) + elif opt == "-k": + kill_random_test = int(arg) + elif opt == "-o": + ops_per_thread = int(arg) + elif opt == "-b": + write_buf_size = int(arg) + else: + print "unrecognized option " + str(opt) + "\n" + print "db_crashtest2.py -d -t <#threads> " \ + "-k -o " \ + "-b \n" + sys.exit(2) + + exit_time = time.time() + duration + + print "Running whitebox-crash-test with \ntotal-duration=" + str(duration) \ + + "\nthreads=" + str(threads) + "\nops_per_thread=" \ + + str(ops_per_thread) + "\nwrite_buffer_size=" \ + + str(write_buf_size) + "\n" + + total_check_mode = 3 + check_mode = 0 + + while time.time() < exit_time: + killoption = "" + if check_mode == 0: + # run with kill_random_test + killoption = " --kill_random_test=" + str(kill_random_test) + # use large ops per thread since we will kill it anyway + additional_opts = "--ops_per_thread=" + \ + str(100 * ops_per_thread) + killoption + elif check_mode == 1: + # normal run with universal compaction mode + additional_opts = "--ops_per_thread=" + 
str(ops_per_thread) + \ + " --compaction_style=1" + else: + # nomral run + additional_opts = "--ops_per_thread=" + str(ops_per_thread) + + cmd = re.sub('\s+', ' ', """ + ./db_stress + --test_batches_snapshots=%s + --threads=%s + --write_buffer_size=%s + --destroy_db_initially=0 + --reopen=0 + --readpercent=45 + --prefixpercent=5 + --writepercent=35 + --delpercent=5 + --iterpercent=10 + --db=%s + --max_key=100000000 + --disable_seek_compaction=%s + --mmap_read=%s + --block_size=16384 + --cache_size=1048576 + --open_files=500000 + --verify_checksum=1 + --sync=%s + --disable_wal=0 + --disable_data_sync=%s + --target_file_size_base=2097152 + --target_file_size_multiplier=2 + --max_write_buffer_number=3 + --max_background_compactions=20 + --max_bytes_for_level_base=10485760 + --filter_deletes=%s + %s + """ % (random.randint(0, 1), + threads, + write_buf_size, + tempfile.mkdtemp(), + random.randint(0, 1), + random.randint(0, 1), + random.randint(0, 1), + random.randint(0, 1), + random.randint(0, 1), + additional_opts)) + + print "Running:" + cmd + "\n" + + popen = subprocess.Popen([cmd], stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True) + stdoutdata, stderrdata = popen.communicate() + retncode = popen.returncode + msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format( + check_mode, killoption, retncode)) + print msg + print stdoutdata + + expected = False + if (killoption == '') and (retncode == 0): + # we expect zero retncode if no kill option + expected = True + elif killoption != '' and retncode < 0: + # we expect negative retncode if kill option was given + expected = True + + if not expected: + print "TEST FAILED. See kill option and exit code above!!!\n" + sys.exit(1) + + stdoutdata = stdoutdata.lower() + errorcount = (stdoutdata.count('error') - + stdoutdata.count('got errors 0 times')) + print "#times error occurred in output is " + str(errorcount) + "\n" + + if (errorcount > 0): + print "TEST FAILED. 
Output has 'error'!!!\n" + sys.exit(2) + if (stdoutdata.find('fail') >= 0): + print "TEST FAILED. Output has 'fail'!!!\n" + sys.exit(2) + + check_mode = (check_mode + 1) % total_check_mode + + time.sleep(1) # time to stabilize after a kill + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc new file mode 100644 index 00000000..9dfe4b64 --- /dev/null +++ b/tools/db_repl_stress.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include + +#include + +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/types.h" +#include "port/atomic_pointer.h" +#include "util/testutil.h" + + +// Run a thread to perform Put's. +// Another thread uses GetUpdatesSince API to keep getting the updates. +// options : +// --num_inserts = the num of inserts the first thread should perform. +// --wal_ttl = the wal ttl for the run. + +using namespace rocksdb; + +struct DataPumpThread { + size_t no_records; + DB* db; // Assumption DB is Open'ed already. 
+}; + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +static void DataPumpThreadBody(void* arg) { + DataPumpThread* t = reinterpret_cast(arg); + DB* db = t->db; + Random rnd(301); + size_t i = 0; + while(i++ < t->no_records) { + if(!db->Put(WriteOptions(), Slice(RandomString(&rnd, 500)), + Slice(RandomString(&rnd, 500))).ok()) { + fprintf(stderr, "Error in put\n"); + exit(1); + } + } +} + +struct ReplicationThread { + port::AtomicPointer stop; + DB* db; + volatile size_t no_read; +}; + +static void ReplicationThreadBody(void* arg) { + ReplicationThread* t = reinterpret_cast(arg); + DB* db = t->db; + unique_ptr iter; + SequenceNumber currentSeqNum = 1; + while (t->stop.Acquire_Load() != nullptr) { + iter.reset(); + Status s; + while(!db->GetUpdatesSince(currentSeqNum, &iter).ok()) { + if (t->stop.Acquire_Load() == nullptr) { + return; + } + } + fprintf(stderr, "Refreshing iterator\n"); + for(;iter->Valid(); iter->Next(), t->no_read++, currentSeqNum++) { + BatchResult res = iter->GetBatch(); + if (res.sequence != currentSeqNum) { + fprintf(stderr, + "Missed a seq no. 
b/w %ld and %ld\n", + (long)currentSeqNum, + (long)res.sequence); + exit(1); + } + } + } +} + +DEFINE_uint64(num_inserts, 1000, "the num of inserts the first thread should" + " perform."); +DEFINE_uint64(wal_ttl_seconds, 1000, "the wal ttl for the run(in seconds)"); +DEFINE_uint64(wal_size_limit_MB, 10, "the wal size limit for the run" + "(in MB)"); + +int main(int argc, const char** argv) { + google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " --num_inserts= --wal_ttl_seconds=" + + " --wal_size_limit_MB="); + google::ParseCommandLineFlags(&argc, const_cast(&argv), true); + + Env* env = Env::Default(); + std::string default_db_path; + env->GetTestDirectory(&default_db_path); + default_db_path += "db_repl_stress"; + Options options; + options.create_if_missing = true; + options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds; + options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB; + DB* db; + DestroyDB(default_db_path, options); + + Status s = DB::Open(options, default_db_path, &db); + + if (!s.ok()) { + fprintf(stderr, "Could not open DB due to %s\n", s.ToString().c_str()); + exit(1); + } + + DataPumpThread dataPump; + dataPump.no_records = FLAGS_num_inserts; + dataPump.db = db; + env->StartThread(DataPumpThreadBody, &dataPump); + + ReplicationThread replThread; + replThread.db = db; + replThread.no_read = 0; + replThread.stop.Release_Store(env); // store something to make it non-null. + + env->StartThread(ReplicationThreadBody, &replThread); + while(replThread.no_read < FLAGS_num_inserts); + replThread.stop.Release_Store(nullptr); + if (replThread.no_read < dataPump.no_records) { + // no. read should be => than inserted. + fprintf(stderr, "No. 
of Record's written and read not same\nRead : %ld" + " Written : %ld\n", replThread.no_read, dataPump.no_records); + exit(1); + } + fprintf(stderr, "Successful!\n"); + exit(0); +} diff --git a/tools/db_stress.cc b/tools/db_stress.cc new file mode 100644 index 00000000..966f007e --- /dev/null +++ b/tools/db_stress.cc @@ -0,0 +1,1539 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// The test uses an array to compare against values written to the database. +// Keys written to the array are in 1:1 correspondence to the actual values in +// the database according to the formula in the function GenerateValue. + +// Space is reserved in the array from 0 to FLAGS_max_key and values are +// randomly written/deleted/read from those positions. During verification we +// compare all the positions in the array. To shorten/elongate the running +// time, you could change the settings: FLAGS_max_key, FLAGS_ops_per_thread, +// (sometimes also FLAGS_threads). +// +// NOTE that if FLAGS_test_batches_snapshots is set, the test will have +// different behavior. See comment of the flag for details. 
+ +#include +#include +#include +#include +#include "db/db_impl.h" +#include "db/version_set.h" +#include "db/db_statistics.h" +#include "rocksdb/cache.h" +#include "utilities/utility_db.h" +#include "rocksdb/env.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/statistics.h" +#include "port/port.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/histogram.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/testutil.h" +#include "util/logging.h" +#include "utilities/ttl/db_ttl.h" +#include "hdfs/env_hdfs.h" +#include "utilities/merge_operators.h" + +static const long KB = 1024; + + +static bool ValidateUint32Range(const char* flagname, uint64_t value) { + if (value > std::numeric_limits::max()) { + fprintf(stderr, + "Invalid value for --%s: %lu, overflow\n", + flagname, + (unsigned long)value); + return false; + } + return true; +} +DEFINE_uint64(seed, 2341234, "Seed for PRNG"); +static const bool FLAGS_seed_dummy = + google::RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range); + +DEFINE_int64(max_key, 1 * KB * KB * KB, + "Max number of key/values to place in database"); + +DEFINE_bool(test_batches_snapshots, false, + "If set, the test uses MultiGet(), MultiPut() and MultiDelete()" + " which read/write/delete multiple keys in a batch. In this mode," + " we do not verify db content by comparing the content with the " + "pre-allocated array. Instead, we do partial verification inside" + " MultiGet() by checking various values in a batch. Benefit of" + " this mode:\n" + "\t(a) No need to acquire mutexes during writes (less cache " + "flushes in multi-core leading to speed up)\n" + "\t(b) No long validation at the end (more speed up)\n" + "\t(c) Test snapshot and atomicity of batch writes"); + +DEFINE_int32(threads, 32, "Number of concurrent threads to run."); + +DEFINE_int32(ttl, -1, + "Opens the db with this ttl value if this is not -1. 
" + "Carefully specify a large value such that verifications on " + "deleted values don't fail"); + +DEFINE_int32(value_size_mult, 8, + "Size of value will be this number times rand_int(1,3) bytes"); + +DEFINE_bool(verify_before_write, false, "Verify before write"); + +DEFINE_bool(histogram, false, "Print histogram of operation timings"); + +DEFINE_bool(destroy_db_initially, true, + "Destroys the database dir before start if this is true"); + +DEFINE_bool (verbose, false, "Verbose"); + +DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size, + "Number of bytes to buffer in memtable before compacting"); + +DEFINE_int32(max_write_buffer_number, + rocksdb::Options().max_write_buffer_number, + "The number of in-memory memtables. " + "Each memtable is of size FLAGS_write_buffer_size."); + +DEFINE_int32(min_write_buffer_number_to_merge, + rocksdb::Options().min_write_buffer_number_to_merge, + "The minimum number of write buffers that will be merged together " + "before writing to storage. This is cheap because it is an " + "in-memory merge. If this feature is not enabled, then all these " + "write buffers are flushed to L0 as separate files and this " + "increases read amplification because a get request has to check " + "in all of these files. Also, an in-memory merge may result in " + "writing less data to storage if there are duplicate records in" + " each of these individual write buffers."); + +DEFINE_int32(open_files, rocksdb::Options().max_open_files, + "Maximum number of files to keep open at the same time " + "(use default if == 0)"); + +DEFINE_int64(compressed_cache_size, -1, + "Number of bytes to use as a cache of compressed data." 
+ " Negative means use default settings."); + +DEFINE_int32(compaction_style, rocksdb::Options().compaction_style, ""); + +DEFINE_int32(level0_file_num_compaction_trigger, + rocksdb::Options().level0_file_num_compaction_trigger, + "Level0 compaction start trigger"); + +DEFINE_int32(level0_slowdown_writes_trigger, + rocksdb::Options().level0_slowdown_writes_trigger, + "Number of files in level-0 that will slow down writes"); + +DEFINE_int32(level0_stop_writes_trigger, + rocksdb::Options().level0_stop_writes_trigger, + "Number of files in level-0 that will trigger put stop."); + +DEFINE_int32(block_size, rocksdb::Options().block_size, + "Number of bytes in a block."); + +DEFINE_int32(max_background_compactions, + rocksdb::Options().max_background_compactions, + "The maximum number of concurrent background compactions " + "that can occur in parallel."); + +DEFINE_int32(universal_size_ratio, 0, "The ratio of file sizes that trigger" + " compaction in universal style"); + +DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files to " + "compact in universal style compaction"); + +DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact" + " in universal style compaction"); + +DEFINE_int32(universal_max_size_amplification_percent, 0, + "The max size amplification for universal style compaction"); + +DEFINE_int64(cache_size, 2 * KB * KB * KB, + "Number of bytes to use as a cache of uncompressed data."); + +static bool ValidateInt32Positive(const char* flagname, int32_t value) { + if (value < 0) { + fprintf(stderr, "Invalid value for --%s: %d, must be >=0\n", + flagname, value); + return false; + } + return true; +} +DEFINE_int32(reopen, 10, "Number of times database reopens"); +static const bool FLAGS_reopen_dummy = + google::RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive); + +DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. 
" + "Negative means use default settings."); + +DEFINE_string(db, "", "Use the db with the following name."); + +DEFINE_bool(verify_checksum, false, + "Verify checksum for every block read from storage"); + +DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads, + "Allow reads to occur via mmap-ing files"); + +// Database statistics +static std::shared_ptr dbstats; +DEFINE_bool(statistics, false, "Create database statistics"); + +DEFINE_bool(sync, false, "Sync all writes to disk"); + +DEFINE_bool(disable_data_sync, false, + "If true, do not wait until data is synced to disk."); + +DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync"); + +DEFINE_int32(kill_random_test, 0, + "If non-zero, kill at various points in source code with " + "probability 1/this"); +static const bool FLAGS_kill_random_test_dummy = + google::RegisterFlagValidator(&FLAGS_kill_random_test, + &ValidateInt32Positive); +extern int rocksdb_kill_odds; + +DEFINE_bool(disable_wal, false, "If true, do not write WAL for write."); + +DEFINE_int32(target_file_size_base, 64 * KB, + "Target level-1 file size for compaction"); + +DEFINE_int32(target_file_size_multiplier, 1, + "A multiplier to compute targe level-N file size (N >= 2)"); + +DEFINE_uint64(max_bytes_for_level_base, 256 * KB, "Max bytes for level-1"); + +DEFINE_int32(max_bytes_for_level_multiplier, 2, + "A multiplier to compute max bytes for level-N (N >= 2)"); + +static bool ValidateInt32Percent(const char* flagname, int32_t value) { + if (value < 0 || value>100) { + fprintf(stderr, "Invalid value for --%s: %d, 0<= pct <=100 \n", + flagname, value); + return false; + } + return true; +} +DEFINE_int32(readpercent, 10, + "Ratio of reads to total workload (expressed as a percentage)"); +static const bool FLAGS_readpercent_dummy = + google::RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent); + +DEFINE_int32(prefixpercent, 20, + "Ratio of prefix iterators to total workload (expressed as a" + " percentage)"); 
+static const bool FLAGS_prefixpercent_dummy = + google::RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent); + +DEFINE_int32(writepercent, 45, + " Ratio of deletes to total workload (expressed as a percentage)"); +static const bool FLAGS_writepercent_dummy = + google::RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent); + +DEFINE_int32(delpercent, 15, + "Ratio of deletes to total workload (expressed as a percentage)"); +static const bool FLAGS_delpercent_dummy = + google::RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent); + +DEFINE_int32(iterpercent, 10, "Ratio of iterations to total workload" + " (expressed as a percentage)"); +static const bool FLAGS_iterpercent_dummy = + google::RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent); + +DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run"); +static const bool FLAGS_num_iterations_dummy = + google::RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range); + +DEFINE_bool(disable_seek_compaction, false, + "Option to disable compation triggered by read."); + +DEFINE_uint64(delete_obsolete_files_period_micros, 0, + "Option to delete obsolete files periodically" + "0 means that obsolete files are " + " deleted after every compaction run."); + +enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { + assert(ctype); + + if (!strcasecmp(ctype, "none")) + return rocksdb::kNoCompression; + else if (!strcasecmp(ctype, "snappy")) + return rocksdb::kSnappyCompression; + else if (!strcasecmp(ctype, "zlib")) + return rocksdb::kZlibCompression; + else if (!strcasecmp(ctype, "bzip2")) + return rocksdb::kBZip2Compression; + + fprintf(stdout, "Cannot parse compression type '%s'\n", ctype); + return rocksdb::kSnappyCompression; //default value +} +DEFINE_string(compression_type, "snappy", + "Algorithm to use to compress the database"); +static enum rocksdb::CompressionType FLAGS_compression_type_e = + 
rocksdb::kSnappyCompression; + +DEFINE_string(hdfs, "", "Name of hdfs environment"); +// posix or hdfs environment +static rocksdb::Env* FLAGS_env = rocksdb::Env::Default(); + +DEFINE_uint64(ops_per_thread, 600000, "Number of operations per thread."); +static const bool FLAGS_ops_per_thread_dummy = + google::RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range); + +DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock"); +static const bool FLAGS_log2_keys_per_lock_dummy = + google::RegisterFlagValidator(&FLAGS_log2_keys_per_lock, + &ValidateUint32Range); + +DEFINE_int32(purge_redundant_percent, 50, + "Percentage of times we want to purge redundant keys in memory " + "before flushing"); +static const bool FLAGS_purge_redundant_percent_dummy = + google::RegisterFlagValidator(&FLAGS_purge_redundant_percent, + &ValidateInt32Percent); + +DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop" + " the delete if key not present"); + +enum RepFactory { + kSkipList, + kHashSkipList, + kVectorRep +}; +enum RepFactory StringToRepFactory(const char* ctype) { + assert(ctype); + + if (!strcasecmp(ctype, "skip_list")) + return kSkipList; + else if (!strcasecmp(ctype, "prefix_hash")) + return kHashSkipList; + else if (!strcasecmp(ctype, "vector")) + return kVectorRep; + + fprintf(stdout, "Cannot parse memreptable %s\n", ctype); + return kSkipList; +} +static enum RepFactory FLAGS_rep_factory; +DEFINE_string(memtablerep, "skip_list", ""); + +static bool ValidatePrefixSize(const char* flagname, int32_t value) { + if (value < 0 || value>=2000000000) { + fprintf(stderr, "Invalid value for --%s: %d. 
0 <= PrefixSize < 2000000000\n", + flagname, value); + return false; + } + return true; +} +DEFINE_int32(prefix_size, 0, "Control the prefix size for HashSkipListRep"); +static const bool FLAGS_prefix_size_dummy = + google::RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize); + +DEFINE_bool(use_merge, false, "On true, replaces all writes with a Merge " + "that behaves like a Put"); + + +namespace rocksdb { + +// convert long to a big-endian slice key +static std::string Key(long val) { + std::string little_endian_key; + std::string big_endian_key; + PutFixed64(&little_endian_key, val); + assert(little_endian_key.size() == sizeof(val)); + big_endian_key.resize(sizeof(val)); + for (int i=0; i<(int)sizeof(val); i++) { + big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i]; + } + return big_endian_key; +} + +class StressTest; +namespace { + +class Stats { + private: + double start_; + double finish_; + double seconds_; + long done_; + long gets_; + long prefixes_; + long writes_; + long deletes_; + long iterator_size_sums_; + long founds_; + long iterations_; + long errors_; + int next_report_; + size_t bytes_; + double last_op_finish_; + HistogramImpl hist_; + + public: + Stats() { } + + void Start() { + next_report_ = 100; + hist_.Clear(); + done_ = 0; + gets_ = 0; + prefixes_ = 0; + writes_ = 0; + deletes_ = 0; + iterator_size_sums_ = 0; + founds_ = 0; + iterations_ = 0; + errors_ = 0; + bytes_ = 0; + seconds_ = 0; + start_ = FLAGS_env->NowMicros(); + last_op_finish_ = start_; + finish_ = start_; + } + + void Merge(const Stats& other) { + hist_.Merge(other.hist_); + done_ += other.done_; + gets_ += other.gets_; + prefixes_ += other.prefixes_; + writes_ += other.writes_; + deletes_ += other.deletes_; + iterator_size_sums_ += other.iterator_size_sums_; + founds_ += other.founds_; + iterations_ += other.iterations_; + errors_ += other.errors_; + bytes_ += other.bytes_; + seconds_ += other.seconds_; + if (other.start_ < start_) start_ = other.start_; +
if (other.finish_ > finish_) finish_ = other.finish_; + } + + void Stop() { + finish_ = FLAGS_env->NowMicros(); + seconds_ = (finish_ - start_) * 1e-6; + } + + void FinishedSingleOp() { + if (FLAGS_histogram) { + double now = FLAGS_env->NowMicros(); + double micros = now - last_op_finish_; + hist_.Add(micros); + if (micros > 20000) { + fprintf(stdout, "long op: %.1f micros%30s\r", micros, ""); + } + last_op_finish_ = now; + } + + done_++; + if (done_ >= next_report_) { + if (next_report_ < 1000) next_report_ += 100; + else if (next_report_ < 5000) next_report_ += 500; + else if (next_report_ < 10000) next_report_ += 1000; + else if (next_report_ < 50000) next_report_ += 5000; + else if (next_report_ < 100000) next_report_ += 10000; + else if (next_report_ < 500000) next_report_ += 50000; + else next_report_ += 100000; + fprintf(stdout, "... finished %ld ops%30s\r", done_, ""); + } + } + + void AddBytesForWrites(int nwrites, size_t nbytes) { + writes_ += nwrites; + bytes_ += nbytes; + } + + void AddGets(int ngets, int nfounds) { + founds_ += nfounds; + gets_ += ngets; + } + + void AddPrefixes(int nprefixes, int count) { + prefixes_ += nprefixes; + iterator_size_sums_ += count; + } + + void AddIterations(int n) { + iterations_ += n; + } + + void AddDeletes(int n) { + deletes_ += n; + } + + void AddErrors(int n) { + errors_ += n; + } + + void Report(const char* name) { + std::string extra; + if (bytes_ < 1 || done_ < 1) { + fprintf(stderr, "No writes or ops?\n"); + return; + } + + double elapsed = (finish_ - start_) * 1e-6; + double bytes_mb = bytes_ / 1048576.0; + double rate = bytes_mb / elapsed; + double throughput = (double)done_/elapsed; + + fprintf(stdout, "%-12s: ", name); + fprintf(stdout, "%.3f micros/op %ld ops/sec\n", + seconds_ * 1e6 / done_, (long)throughput); + fprintf(stdout, "%-12s: Wrote %.2f MB (%.2f MB/sec) (%ld%% of %ld ops)\n", + "", bytes_mb, rate, (100*writes_)/done_, done_); + fprintf(stdout, "%-12s: Wrote %ld times\n", "", writes_); + 
fprintf(stdout, "%-12s: Deleted %ld times\n", "", deletes_); + fprintf(stdout, "%-12s: %ld read and %ld found the key\n", "", + gets_, founds_); + fprintf(stdout, "%-12s: Prefix scanned %ld times\n", "", prefixes_); + fprintf(stdout, "%-12s: Iterator size sum is %ld\n", "", + iterator_size_sums_); + fprintf(stdout, "%-12s: Iterated %ld times\n", "", iterations_); + fprintf(stdout, "%-12s: Got errors %ld times\n", "", errors_); + + if (FLAGS_histogram) { + fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); + } + fflush(stdout); + } +}; + +// State shared by all concurrent executions of the same benchmark. +class SharedState { + public: + static const uint32_t SENTINEL = 0xffffffff; + + explicit SharedState(StressTest* stress_test) : + cv_(&mu_), + seed_(FLAGS_seed), + max_key_(FLAGS_max_key), + log2_keys_per_lock_(FLAGS_log2_keys_per_lock), + num_threads_(FLAGS_threads), + num_initialized_(0), + num_populated_(0), + vote_reopen_(0), + num_done_(0), + start_(false), + start_verify_(false), + stress_test_(stress_test) { + if (FLAGS_test_batches_snapshots) { + key_locks_ = nullptr; + values_ = nullptr; + fprintf(stdout, "No lock creation because test_batches_snapshots set\n"); + return; + } + values_ = new uint32_t[max_key_]; + for (long i = 0; i < max_key_; i++) { + values_[i] = SENTINEL; + } + + long num_locks = (max_key_ >> log2_keys_per_lock_); + if (max_key_ & ((1 << log2_keys_per_lock_) - 1)) { + num_locks ++; + } + fprintf(stdout, "Creating %ld locks\n", num_locks); + key_locks_ = new port::Mutex[num_locks]; + } + + ~SharedState() { + delete[] values_; + delete[] key_locks_; + } + + port::Mutex* GetMutex() { + return &mu_; + } + + port::CondVar* GetCondVar() { + return &cv_; + } + + StressTest* GetStressTest() const { + return stress_test_; + } + + long GetMaxKey() const { + return max_key_; + } + + uint32_t GetNumThreads() const { + return num_threads_; + } + + void IncInitialized() { + num_initialized_++; + } + + void IncOperated() { + 
num_populated_++; + } + + void IncDone() { + num_done_++; + } + + void IncVotedReopen() { + vote_reopen_ = (vote_reopen_ + 1) % num_threads_; + } + + bool AllInitialized() const { + return num_initialized_ >= num_threads_; + } + + bool AllOperated() const { + return num_populated_ >= num_threads_; + } + + bool AllDone() const { + return num_done_ >= num_threads_; + } + + bool AllVotedReopen() { + return (vote_reopen_ == 0); + } + + void SetStart() { + start_ = true; + } + + void SetStartVerify() { + start_verify_ = true; + } + + bool Started() const { + return start_; + } + + bool VerifyStarted() const { + return start_verify_; + } + + port::Mutex* GetMutexForKey(long key) { + return &key_locks_[key >> log2_keys_per_lock_]; + } + + void Put(long key, uint32_t value_base) { + values_[key] = value_base; + } + + uint32_t Get(long key) const { + return values_[key]; + } + + void Delete(long key) const { + values_[key] = SENTINEL; + } + + uint32_t GetSeed() const { + return seed_; + } + + private: + port::Mutex mu_; + port::CondVar cv_; + const uint32_t seed_; + const long max_key_; + const uint32_t log2_keys_per_lock_; + const int num_threads_; + long num_initialized_; + long num_populated_; + long vote_reopen_; + long num_done_; + bool start_; + bool start_verify_; + StressTest* stress_test_; + + uint32_t *values_; + port::Mutex *key_locks_; + +}; + +// Per-thread state for concurrent executions of the same benchmark. +struct ThreadState { + uint32_t tid; // 0..n-1 + Random rand; // Has different seeds for different threads + SharedState* shared; + Stats stats; + + ThreadState(uint32_t index, SharedState *shared) + : tid(index), + rand(1000 + index + shared->GetSeed()), + shared(shared) { + } +}; + +} // namespace + +class StressTest { + public: + StressTest() + : cache_(NewLRUCache(FLAGS_cache_size)), + compressed_cache_(FLAGS_compressed_cache_size >= 0 ? + NewLRUCache(FLAGS_compressed_cache_size) : + nullptr), + filter_policy_(FLAGS_bloom_bits >= 0 + ? 
NewBloomFilterPolicy(FLAGS_bloom_bits) + : nullptr), + prefix_extractor_(NewFixedPrefixTransform( + FLAGS_test_batches_snapshots ? + sizeof(long) : sizeof(long)-1)), + db_(nullptr), + num_times_reopened_(0) { + if (FLAGS_destroy_db_initially) { + std::vector files; + FLAGS_env->GetChildren(FLAGS_db, &files); + for (unsigned int i = 0; i < files.size(); i++) { + if (Slice(files[i]).starts_with("heap-")) { + FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]); + } + } + DestroyDB(FLAGS_db, Options()); + } + } + + ~StressTest() { + delete db_; + delete filter_policy_; + delete prefix_extractor_; + } + + void Run() { + PrintEnv(); + Open(); + SharedState shared(this); + uint32_t n = shared.GetNumThreads(); + + std::vector threads(n); + for (uint32_t i = 0; i < n; i++) { + threads[i] = new ThreadState(i, &shared); + FLAGS_env->StartThread(ThreadBody, threads[i]); + } + // Each thread goes through the following states: + // initializing -> wait for others to init -> read/populate/depopulate + // wait for others to operate -> verify -> done + + { + MutexLock l(shared.GetMutex()); + while (!shared.AllInitialized()) { + shared.GetCondVar()->Wait(); + } + + double now = FLAGS_env->NowMicros(); + fprintf(stdout, "%s Starting database operations\n", + FLAGS_env->TimeToString((uint64_t) now/1000000).c_str()); + + shared.SetStart(); + shared.GetCondVar()->SignalAll(); + while (!shared.AllOperated()) { + shared.GetCondVar()->Wait(); + } + + now = FLAGS_env->NowMicros(); + if (FLAGS_test_batches_snapshots) { + fprintf(stdout, "%s Limited verification already done during gets\n", + FLAGS_env->TimeToString((uint64_t) now/1000000).c_str()); + } else { + fprintf(stdout, "%s Starting verification\n", + FLAGS_env->TimeToString((uint64_t) now/1000000).c_str()); + } + + shared.SetStartVerify(); + shared.GetCondVar()->SignalAll(); + while (!shared.AllDone()) { + shared.GetCondVar()->Wait(); + } + } + + for (unsigned int i = 1; i < n; i++) { + threads[0]->stats.Merge(threads[i]->stats); + } + 
threads[0]->stats.Report("Stress Test"); + + for (unsigned int i = 0; i < n; i++) { + delete threads[i]; + threads[i] = nullptr; + } + double now = FLAGS_env->NowMicros(); + if (!FLAGS_test_batches_snapshots) { + fprintf(stdout, "%s Verification successful\n", + FLAGS_env->TimeToString((uint64_t) now/1000000).c_str()); + } + PrintStatistics(); + } + + private: + + static void ThreadBody(void* v) { + ThreadState* thread = reinterpret_cast(v); + SharedState* shared = thread->shared; + + { + MutexLock l(shared->GetMutex()); + shared->IncInitialized(); + if (shared->AllInitialized()) { + shared->GetCondVar()->SignalAll(); + } + while (!shared->Started()) { + shared->GetCondVar()->Wait(); + } + } + thread->shared->GetStressTest()->OperateDb(thread); + + { + MutexLock l(shared->GetMutex()); + shared->IncOperated(); + if (shared->AllOperated()) { + shared->GetCondVar()->SignalAll(); + } + while (!shared->VerifyStarted()) { + shared->GetCondVar()->Wait(); + } + } + + if (!FLAGS_test_batches_snapshots) { + thread->shared->GetStressTest()->VerifyDb(thread); + } + + { + MutexLock l(shared->GetMutex()); + shared->IncDone(); + if (shared->AllDone()) { + shared->GetCondVar()->SignalAll(); + } + } + + } + + // Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ... + // ("9"+K, "9"+V) in DB atomically i.e in a single batch. + // Also refer MultiGet. 
+ Status MultiPut(ThreadState* thread, + const WriteOptions& writeoptions, + const Slice& key, const Slice& value, size_t sz) { + std::string keys[10] = {"9", "8", "7", "6", "5", + "4", "3", "2", "1", "0"}; + std::string values[10] = {"9", "8", "7", "6", "5", + "4", "3", "2", "1", "0"}; + Slice value_slices[10]; + WriteBatch batch; + Status s; + for (int i = 0; i < 10; i++) { + keys[i] += key.ToString(); + values[i] += value.ToString(); + value_slices[i] = values[i]; + if (FLAGS_use_merge) { + batch.Merge(keys[i], value_slices[i]); + } else { + batch.Put(keys[i], value_slices[i]); + } + } + + s = db_->Write(writeoptions, &batch); + if (!s.ok()) { + fprintf(stderr, "multiput error: %s\n", s.ToString().c_str()); + thread->stats.AddErrors(1); + } else { + // we did 10 writes each of size sz + 1 + thread->stats.AddBytesForWrites(10, (sz + 1) * 10); + } + + return s; + } + + // Given a key K, this deletes ("0"+K), ("1"+K),... ("9"+K) + // in DB atomically i.e in a single batch. Also refer MultiGet. + Status MultiDelete(ThreadState* thread, + const WriteOptions& writeoptions, + const Slice& key) { + std::string keys[10] = {"9", "7", "5", "3", "1", + "8", "6", "4", "2", "0"}; + + WriteBatch batch; + Status s; + for (int i = 0; i < 10; i++) { + keys[i] += key.ToString(); + batch.Delete(keys[i]); + } + + s = db_->Write(writeoptions, &batch); + if (!s.ok()) { + fprintf(stderr, "multidelete error: %s\n", s.ToString().c_str()); + thread->stats.AddErrors(1); + } else { + thread->stats.AddDeletes(10); + } + + return s; + } + + // Given a key K, this gets values for "0"+K, "1"+K,..."9"+K + // in the same snapshot, and verifies that all the values are of the form + // "0"+V, "1"+V,..."9"+V. + // ASSUMES that MultiPut was used to put (K, V) into the DB. 
+ Status MultiGet(ThreadState* thread, + const ReadOptions& readoptions, + const Slice& key, std::string* value) { + std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; + Slice key_slices[10]; + std::string values[10]; + ReadOptions readoptionscopy = readoptions; + readoptionscopy.snapshot = db_->GetSnapshot(); + Status s; + for (int i = 0; i < 10; i++) { + keys[i] += key.ToString(); + key_slices[i] = keys[i]; + s = db_->Get(readoptionscopy, key_slices[i], value); + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "get error: %s\n", s.ToString().c_str()); + values[i] = ""; + thread->stats.AddErrors(1); + // we continue after error rather than exiting so that we can + // find more errors if any + } else if (s.IsNotFound()) { + values[i] = ""; + thread->stats.AddGets(1, 0); + } else { + values[i] = *value; + + char expected_prefix = (keys[i])[0]; + char actual_prefix = (values[i])[0]; + if (actual_prefix != expected_prefix) { + fprintf(stderr, "error expected prefix = %c actual = %c\n", + expected_prefix, actual_prefix); + } + (values[i])[0] = ' '; // blank out the differing character + thread->stats.AddGets(1, 1); + } + } + db_->ReleaseSnapshot(readoptionscopy.snapshot); + + // Now that we retrieved all values, check that they all match + for (int i = 1; i < 10; i++) { + if (values[i] != values[0]) { + fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n", + key.ToString().c_str(), values[0].c_str(), + values[i].c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } + } + + return s; + } + + // Given a prefix P, this does prefix scans for "0"+P, "1"+P,..."9"+P + // in the same snapshot. Each of these 10 scans returns a series of + // values; each series should be the same length, and it is verified + // for each index i that all the i'th values are of the form "0"+V, + // "1"+V,..."9"+V. 
+ // ASSUMES that MultiPut was used to put (K, V) + Status MultiPrefixScan(ThreadState* thread, + const ReadOptions& readoptions, + const Slice& prefix) { + std::string prefixes[10] = {"0", "1", "2", "3", "4", + "5", "6", "7", "8", "9"}; + Slice prefix_slices[10]; + ReadOptions readoptionscopy[10]; + const Snapshot* snapshot = db_->GetSnapshot(); + Iterator* iters[10]; + Status s = Status::OK(); + for (int i = 0; i < 10; i++) { + prefixes[i] += prefix.ToString(); + prefix_slices[i] = prefixes[i]; + readoptionscopy[i] = readoptions; + readoptionscopy[i].prefix = &prefix_slices[i]; + readoptionscopy[i].snapshot = snapshot; + iters[i] = db_->NewIterator(readoptionscopy[i]); + iters[i]->SeekToFirst(); + } + + int count = 0; + while (iters[0]->Valid()) { + count++; + std::string values[10]; + // get list of all values for this iteration + for (int i = 0; i < 10; i++) { + // no iterator should finish before the first one + assert(iters[i]->Valid()); + values[i] = iters[i]->value().ToString(); + + char expected_first = (prefixes[i])[0]; + char actual_first = (values[i])[0]; + + if (actual_first != expected_first) { + fprintf(stderr, "error expected first = %c actual = %c\n", + expected_first, actual_first); + } + (values[i])[0] = ' '; // blank out the differing character + } + // make sure all values are equivalent + for (int i = 0; i < 10; i++) { + if (values[i] != values[0]) { + fprintf(stderr, "error : inconsistent values for prefix %s: %s, %s\n", + prefix.ToString().c_str(), values[0].c_str(), + values[i].c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } + iters[i]->Next(); + } + } + + // cleanup iterators and snapshot + for (int i = 0; i < 10; i++) { + // if the first iterator finished, they should have all finished + assert(!iters[i]->Valid()); + assert(iters[i]->status().ok()); + delete iters[i]; + } + db_->ReleaseSnapshot(snapshot); + + if (s.ok()) { + thread->stats.AddPrefixes(1, count); + } else { + 
thread->stats.AddErrors(1); + } + + return s; + } + + // Given a key K, this creates an iterator which scans to K and then + // does a random sequence of Next/Prev operations. + Status MultiIterate(ThreadState* thread, + const ReadOptions& readoptions, + const Slice& key) { + Status s; + const Snapshot* snapshot = db_->GetSnapshot(); + ReadOptions readoptionscopy = readoptions; + readoptionscopy.snapshot = snapshot; + unique_ptr iter(db_->NewIterator(readoptionscopy)); + + iter->Seek(key); + for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); i++) { + if (thread->rand.OneIn(2)) { + iter->Next(); + } else { + iter->Prev(); + } + } + + if (s.ok()) { + thread->stats.AddIterations(1); + } else { + thread->stats.AddErrors(1); + } + + db_->ReleaseSnapshot(snapshot); + + return s; + } + + void OperateDb(ThreadState* thread) { + ReadOptions read_opts(FLAGS_verify_checksum, true); + WriteOptions write_opts; + char value[100]; + long max_key = thread->shared->GetMaxKey(); + std::string from_db; + if (FLAGS_sync) { + write_opts.sync = true; + } + write_opts.disableWAL = FLAGS_disable_wal; + const int prefixBound = (int)FLAGS_readpercent + (int)FLAGS_prefixpercent; + const int writeBound = prefixBound + (int)FLAGS_writepercent; + const int delBound = writeBound + (int)FLAGS_delpercent; + + thread->stats.Start(); + for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { + if(i != 0 && (i % (FLAGS_ops_per_thread / (FLAGS_reopen + 1))) == 0) { + { + thread->stats.FinishedSingleOp(); + MutexLock l(thread->shared->GetMutex()); + thread->shared->IncVotedReopen(); + if (thread->shared->AllVotedReopen()) { + thread->shared->GetStressTest()->Reopen(); + thread->shared->GetCondVar()->SignalAll(); + } + else { + thread->shared->GetCondVar()->Wait(); + } + // Commenting this out as we don't want to reset stats on each open. 
+ // thread->stats.Start(); + } + } + + long rand_key = thread->rand.Next() % max_key; + std::string keystr = Key(rand_key); + Slice key = keystr; + int prob_op = thread->rand.Uniform(100); + + if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) { + // OPERATION read + if (!FLAGS_test_batches_snapshots) { + Status s = db_->Get(read_opts, key, &from_db); + if (s.ok()) { + // found case + thread->stats.AddGets(1, 1); + } else if (s.IsNotFound()) { + // not found case + thread->stats.AddGets(1, 0); + } else { + // errors case + thread->stats.AddErrors(1); + } + } else { + MultiGet(thread, read_opts, key, &from_db); + } + } else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) { + // OPERATION prefix scan + // keys are longs (e.g., 8 bytes), so we let prefixes be + // everything except the last byte. So there will be 2^8=256 + // keys per prefix. + Slice prefix = Slice(key.data(), key.size() - 1); + if (!FLAGS_test_batches_snapshots) { + read_opts.prefix = &prefix; + Iterator* iter = db_->NewIterator(read_opts); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + assert(iter->key().starts_with(prefix)); + count++; + } + assert(count <= 256); + if (iter->status().ok()) { + thread->stats.AddPrefixes(1, count); + } else { + thread->stats.AddErrors(1); + } + delete iter; + } else { + MultiPrefixScan(thread, read_opts, prefix); + } + read_opts.prefix = nullptr; + } else if (prefixBound <= prob_op && prob_op < writeBound) { + // OPERATION write + uint32_t value_base = thread->rand.Next(); + size_t sz = GenerateValue(value_base, value, sizeof(value)); + Slice v(value, sz); + if (!FLAGS_test_batches_snapshots) { + MutexLock l(thread->shared->GetMutexForKey(rand_key)); + if (FLAGS_verify_before_write) { + std::string keystr2 = Key(rand_key); + Slice k = keystr2; + Status s = db_->Get(read_opts, k, &from_db); + VerifyValue(rand_key, + read_opts, + *(thread->shared), + from_db, + s, + true); + } + thread->shared->Put(rand_key, 
value_base); + if (FLAGS_use_merge) { + db_->Merge(write_opts, key, v); + } else { + db_->Put(write_opts, key, v); + } + thread->stats.AddBytesForWrites(1, sz); + } else { + MultiPut(thread, write_opts, key, v, sz); + } + PrintKeyValue(rand_key, value, sz); + } else if (writeBound <= prob_op && prob_op < delBound) { + // OPERATION delete + if (!FLAGS_test_batches_snapshots) { + MutexLock l(thread->shared->GetMutexForKey(rand_key)); + thread->shared->Delete(rand_key); + db_->Delete(write_opts, key); + thread->stats.AddDeletes(1); + } else { + MultiDelete(thread, write_opts, key); + } + } else { + // OPERATION iterate + MultiIterate(thread, read_opts, key); + } + thread->stats.FinishedSingleOp(); + } + + thread->stats.Stop(); + } + + void VerifyDb(ThreadState* thread) const { + ReadOptions options(FLAGS_verify_checksum, true); + const SharedState& shared = *(thread->shared); + static const long max_key = shared.GetMaxKey(); + static const long keys_per_thread = max_key / shared.GetNumThreads(); + long start = keys_per_thread * thread->tid; + long end = start + keys_per_thread; + if (thread->tid == shared.GetNumThreads() - 1) { + end = max_key; + } + if (!thread->rand.OneIn(2)) { + // Use iterator to verify this range + unique_ptr iter(db_->NewIterator(options)); + iter->Seek(Key(start)); + for (long i = start; i < end; i++) { + std::string from_db; + std::string keystr = Key(i); + Slice k = keystr; + Status s = iter->status(); + if (iter->Valid()) { + if (iter->key().compare(k) > 0) { + s = Status::NotFound(Slice()); + } else if (iter->key().compare(k) == 0) { + from_db = iter->value().ToString(); + iter->Next(); + } else if (iter->key().compare(k) < 0) { + VerificationAbort("An out of range key was found", i); + } + } else { + // The iterator found no value for the key in question, so do not + // move to the next item in the iterator + s = Status::NotFound(Slice()); + } + VerifyValue(i, options, shared, from_db, s, true); + if (from_db.length()) { + PrintKeyValue(i, 
from_db.data(), from_db.length()); + } + } + } + else { + // Use Get to verify this range + for (long i = start; i < end; i++) { + std::string from_db; + std::string keystr = Key(i); + Slice k = keystr; + Status s = db_->Get(options, k, &from_db); + VerifyValue(i, options, shared, from_db, s, true); + if (from_db.length()) { + PrintKeyValue(i, from_db.data(), from_db.length()); + } + } + } + } + + void VerificationAbort(std::string msg, long key) const { + fprintf(stderr, "Verification failed for key %ld: %s\n", + key, msg.c_str()); + exit(1); + } + + void VerifyValue(long key, + const ReadOptions &opts, + const SharedState &shared, + const std::string &value_from_db, + Status s, + bool strict=false) const { + // compare value_from_db with the value in the shared state + char value[100]; + uint32_t value_base = shared.Get(key); + if (value_base == SharedState::SENTINEL && !strict) { + return; + } + + if (s.ok()) { + if (value_base == SharedState::SENTINEL) { + VerificationAbort("Unexpected value found", key); + } + size_t sz = GenerateValue(value_base, value, sizeof(value)); + if (value_from_db.length() != sz) { + VerificationAbort("Length of value read is not equal", key); + } + if (memcmp(value_from_db.data(), value, sz) != 0) { + VerificationAbort("Contents of value read don't match", key); + } + } else { + if (value_base != SharedState::SENTINEL) { + VerificationAbort("Value not found", key); + } + } + } + + static void PrintKeyValue(uint32_t key, const char *value, size_t sz) { + if (!FLAGS_verbose) return; + fprintf(stdout, "%u ==> (%u) ", key, (unsigned int)sz); + for (size_t i=0; i= sizeof(uint32_t)); + *((uint32_t*)v) = rand; + for (size_t i=sizeof(uint32_t); i < value_sz; i++) { + v[i] = (char)(rand ^ i); + } + v[value_sz] = '\0'; + return value_sz; // the size of the value set. 
+ } + + void PrintEnv() const { + fprintf(stdout, "LevelDB version : %d.%d\n", + kMajorVersion, kMinorVersion); + fprintf(stdout, "Number of threads : %d\n", FLAGS_threads); + fprintf(stdout, + "Ops per thread : %lu\n", + (unsigned long)FLAGS_ops_per_thread); + std::string ttl_state("unused"); + if (FLAGS_ttl > 0) { + ttl_state = NumberToString(FLAGS_ttl); + } + fprintf(stdout, "Time to live(sec) : %s\n", ttl_state.c_str()); + fprintf(stdout, "Read percentage : %d%%\n", FLAGS_readpercent); + fprintf(stdout, "Prefix percentage : %d%%\n", FLAGS_prefixpercent); + fprintf(stdout, "Write percentage : %d%%\n", FLAGS_writepercent); + fprintf(stdout, "Delete percentage : %d%%\n", FLAGS_delpercent); + fprintf(stdout, "Iterate percentage : %d%%\n", FLAGS_iterpercent); + fprintf(stdout, "Write-buffer-size : %d\n", FLAGS_write_buffer_size); + fprintf(stdout, + "Iterations : %lu\n", + (unsigned long)FLAGS_num_iterations); + fprintf(stdout, + "Max key : %lu\n", + (unsigned long)FLAGS_max_key); + fprintf(stdout, "Ratio #ops/#keys : %f\n", + (1.0 * FLAGS_ops_per_thread * FLAGS_threads)/FLAGS_max_key); + fprintf(stdout, "Num times DB reopens: %d\n", FLAGS_reopen); + fprintf(stdout, "Batches/snapshots : %d\n", + FLAGS_test_batches_snapshots); + fprintf(stdout, "Purge redundant %% : %d\n", + FLAGS_purge_redundant_percent); + fprintf(stdout, "Deletes use filter : %d\n", + FLAGS_filter_deletes); + fprintf(stdout, "Num keys per lock : %d\n", + 1 << FLAGS_log2_keys_per_lock); + + const char* compression = ""; + switch (FLAGS_compression_type_e) { + case rocksdb::kNoCompression: + compression = "none"; + break; + case rocksdb::kSnappyCompression: + compression = "snappy"; + break; + case rocksdb::kZlibCompression: + compression = "zlib"; + break; + case rocksdb::kBZip2Compression: + compression = "bzip2"; + break; + } + + fprintf(stdout, "Compression : %s\n", compression); + + const char* memtablerep = ""; + switch (FLAGS_rep_factory) { + case kSkipList: + memtablerep = "skip_list"; + 
break; + case kHashSkipList: + memtablerep = "prefix_hash"; + break; + case kVectorRep: + memtablerep = "vector"; + break; + } + + fprintf(stdout, "Memtablerep : %s\n", memtablerep); + + fprintf(stdout, "------------------------------------------------\n"); + } + + void Open() { + assert(db_ == nullptr); + Options options; + options.block_cache = cache_; + options.block_cache_compressed = compressed_cache_; + options.write_buffer_size = FLAGS_write_buffer_size; + options.max_write_buffer_number = FLAGS_max_write_buffer_number; + options.min_write_buffer_number_to_merge = + FLAGS_min_write_buffer_number_to_merge; + options.max_background_compactions = FLAGS_max_background_compactions; + options.compaction_style = + static_cast(FLAGS_compaction_style); + options.block_size = FLAGS_block_size; + options.filter_policy = filter_policy_; + options.prefix_extractor = prefix_extractor_; + options.max_open_files = FLAGS_open_files; + options.statistics = dbstats; + options.env = FLAGS_env; + options.disableDataSync = FLAGS_disable_data_sync; + options.use_fsync = FLAGS_use_fsync; + options.allow_mmap_reads = FLAGS_mmap_read; + rocksdb_kill_odds = FLAGS_kill_random_test; + options.target_file_size_base = FLAGS_target_file_size_base; + options.target_file_size_multiplier = FLAGS_target_file_size_multiplier; + options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; + options.max_bytes_for_level_multiplier = + FLAGS_max_bytes_for_level_multiplier; + options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger; + options.level0_slowdown_writes_trigger = + FLAGS_level0_slowdown_writes_trigger; + options.level0_file_num_compaction_trigger = + FLAGS_level0_file_num_compaction_trigger; + options.compression = FLAGS_compression_type_e; + options.create_if_missing = true; + options.disable_seek_compaction = FLAGS_disable_seek_compaction; + options.delete_obsolete_files_period_micros = + FLAGS_delete_obsolete_files_period_micros; + options.max_manifest_file_size = 
1024; + options.filter_deletes = FLAGS_filter_deletes; + if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) { + fprintf(stderr, + "prefix_size should be non-zero iff memtablerep == prefix_hash\n"); + exit(1); + } + switch (FLAGS_rep_factory) { + case kHashSkipList: + options.memtable_factory.reset(NewHashSkipListRepFactory( + NewFixedPrefixTransform(FLAGS_prefix_size))); + break; + case kSkipList: + // no need to do anything + break; + case kVectorRep: + options.memtable_factory.reset( + new VectorRepFactory() + ); + break; + } + static Random purge_percent(1000); // no benefit from non-determinism here + if (static_cast(purge_percent.Uniform(100)) < + FLAGS_purge_redundant_percent - 1) { + options.purge_redundant_kvs_while_flush = false; + } + + if (FLAGS_use_merge) { + options.merge_operator = MergeOperators::CreatePutOperator(); + } + + // set universal style compaction configurations, if applicable + if (FLAGS_universal_size_ratio != 0) { + options.compaction_options_universal.size_ratio = + FLAGS_universal_size_ratio; + } + if (FLAGS_universal_min_merge_width != 0) { + options.compaction_options_universal.min_merge_width = + FLAGS_universal_min_merge_width; + } + if (FLAGS_universal_max_merge_width != 0) { + options.compaction_options_universal.max_merge_width = + FLAGS_universal_max_merge_width; + } + if (FLAGS_universal_max_size_amplification_percent != 0) { + options.compaction_options_universal.max_size_amplification_percent = + FLAGS_universal_max_size_amplification_percent; + } + + fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); + + Status s; + if (FLAGS_ttl == -1) { + s = DB::Open(options, FLAGS_db, &db_); + } else { + s = UtilityDB::OpenTtlDB(options, FLAGS_db, &sdb_, FLAGS_ttl); + db_ = sdb_; + } + if (!s.ok()) { + fprintf(stderr, "open error: %s\n", s.ToString().c_str()); + exit(1); + } + } + + void Reopen() { + // do not close the db. Just delete the lock file. This + // simulates a crash-recovery kind of situation. 
+ if (FLAGS_ttl != -1) { + ((DBWithTTL*) db_)->TEST_Destroy_DBWithTtl(); + } else { + ((DBImpl*) db_)->TEST_Destroy_DBImpl(); + } + db_ = nullptr; + + num_times_reopened_++; + double now = FLAGS_env->NowMicros(); + fprintf(stdout, "%s Reopening database for the %dth time\n", + FLAGS_env->TimeToString((uint64_t) now/1000000).c_str(), + num_times_reopened_); + Open(); + } + + void PrintStatistics() { + if (dbstats) { + fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str()); + } + } + + private: + shared_ptr cache_; + shared_ptr compressed_cache_; + const FilterPolicy* filter_policy_; + const SliceTransform* prefix_extractor_; + DB* db_; + StackableDB* sdb_; + int num_times_reopened_; +}; + +} // namespace rocksdb + + + +int main(int argc, char** argv) { + google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); + google::ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_statistics) { + dbstats = rocksdb::CreateDBStatistics(); + } + FLAGS_compression_type_e = + StringToCompressionType(FLAGS_compression_type.c_str()); + if (!FLAGS_hdfs.empty()) { + FLAGS_env = new rocksdb::HdfsEnv(FLAGS_hdfs); + } + FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); + + // The number of background threads should be at least as much the + // max number of concurrent compactions. 
+ FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions); + + if ((FLAGS_readpercent + FLAGS_prefixpercent + + FLAGS_writepercent + FLAGS_delpercent + FLAGS_iterpercent) != 100) { + fprintf(stderr, + "Error: Read+Prefix+Write+Delete+Iterate percents != 100!\n"); + exit(1); + } + if (FLAGS_disable_wal == 1 && FLAGS_reopen > 0) { + fprintf(stderr, "Error: Db cannot reopen safely with disable_wal set!\n"); + exit(1); + } + if ((unsigned)FLAGS_reopen >= FLAGS_ops_per_thread) { + fprintf(stderr, + "Error: #DB-reopens should be < ops_per_thread\n" + "Provided reopens = %d and ops_per_thread = %lu\n", + FLAGS_reopen, + (unsigned long)FLAGS_ops_per_thread); + exit(1); + } + + // Choose a location for the test database if none given with --db= + if (FLAGS_db.empty()) { + std::string default_db_path; + rocksdb::Env::Default()->GetTestDirectory(&default_db_path); + default_db_path += "/dbstress"; + FLAGS_db = default_db_path; + } + + rocksdb::StressTest stress; + stress.Run(); + return 0; +} diff --git a/tools/ldb.cc b/tools/ldb.cc new file mode 100644 index 00000000..4581b801 --- /dev/null +++ b/tools/ldb.cc @@ -0,0 +1,13 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#include "rocksdb/ldb_tool.h" + +int main(int argc, char** argv) { + rocksdb::LDBTool tool; + tool.Run(argc, argv); + return 0; +} diff --git a/tools/ldb_test.py b/tools/ldb_test.py new file mode 100644 index 00000000..fe9a6c60 --- /dev/null +++ b/tools/ldb_test.py @@ -0,0 +1,356 @@ +import os +import os.path +import shutil +import subprocess +import time +import unittest +import tempfile + +def my_check_output(*popenargs, **kwargs): + """ + If we had python 2.7, we should simply use subprocess.check_output. 
+ This is a stop-gap solution for python 2.6 + """ + if 'stdout' in kwargs: + raise ValueError('stdout argument not allowed, it will be overridden.') + process = subprocess.Popen(stderr=subprocess.PIPE, stdout=subprocess.PIPE, + *popenargs, **kwargs) + output, unused_err = process.communicate() + retcode = process.poll() + if retcode: + cmd = kwargs.get("args") + if cmd is None: + cmd = popenargs[0] + raise Exception("Exit code is not 0. It is %d. Command: %s" % + (retcode, cmd)) + return output + +def run_err_null(cmd): + return os.system(cmd + " 2>/dev/null ") + +class LDBTestCase(unittest.TestCase): + def setUp(self): + self.TMP_DIR = tempfile.mkdtemp(prefix="ldb_test_") + self.DB_NAME = "testdb" + + def tearDown(self): + assert(self.TMP_DIR.strip() != "/" + and self.TMP_DIR.strip() != "/tmp" + and self.TMP_DIR.strip() != "/tmp/") #Just some paranoia + + shutil.rmtree(self.TMP_DIR) + + def dbParam(self, dbName): + return "--db=%s" % os.path.join(self.TMP_DIR, dbName) + + def assertRunOKFull(self, params, expectedOutput, unexpected=False): + """ + All command-line params must be specified. + Allows full flexibility in testing; for example: missing db param. + + """ + + output = my_check_output("./ldb %s |grep -v \"Created bg thread\"" % + params, shell=True) + if not unexpected: + self.assertEqual(output.strip(), expectedOutput.strip()) + else: + self.assertNotEqual(output.strip(), expectedOutput.strip()) + + def assertRunFAILFull(self, params): + """ + All command-line params must be specified. + Allows full flexibility in testing; for example: missing db param. + + """ + try: + + my_check_output("./ldb %s >/dev/null 2>&1 |grep -v \"Created bg \ + thread\"" % params, shell=True) + except Exception, e: + return + self.fail( + "Exception should have been raised for command with params: %s" % + params) + + def assertRunOK(self, params, expectedOutput, unexpected=False): + """ + Uses the default test db. 
+ + """ + self.assertRunOKFull("%s %s" % (self.dbParam(self.DB_NAME), params), + expectedOutput, unexpected) + + def assertRunFAIL(self, params): + """ + Uses the default test db. + """ + self.assertRunFAILFull("%s %s" % (self.dbParam(self.DB_NAME), params)) + + def testSimpleStringPutGet(self): + print "Running testSimpleStringPutGet..." + self.assertRunFAIL("put x1 y1") + self.assertRunOK("put --create_if_missing x1 y1", "OK") + self.assertRunOK("get x1", "y1") + self.assertRunFAIL("get x2") + + self.assertRunOK("put x2 y2", "OK") + self.assertRunOK("get x1", "y1") + self.assertRunOK("get x2", "y2") + self.assertRunFAIL("get x3") + + self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2") + self.assertRunOK("put x3 y3", "OK") + + self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOK("scan --from=x", "x1 : y1\nx2 : y2\nx3 : y3") + + self.assertRunOK("scan --to=x2", "x1 : y1") + self.assertRunOK("scan --from=x1 --to=z --max_keys=1", "x1 : y1") + self.assertRunOK("scan --from=x1 --to=z --max_keys=2", + "x1 : y1\nx2 : y2") + + self.assertRunOK("scan --from=x1 --to=z --max_keys=3", + "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOK("scan --from=x1 --to=z --max_keys=4", + "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOK("scan --from=x1 --to=x2", "x1 : y1") + self.assertRunOK("scan --from=x2 --to=x4", "x2 : y2\nx3 : y3") + self.assertRunFAIL("scan --from=x4 --to=z") # No results => FAIL + self.assertRunFAIL("scan --from=x1 --to=z --max_keys=foo") + + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3") + + self.assertRunOK("delete x1", "OK") + self.assertRunOK("scan", "x2 : y2\nx3 : y3") + + self.assertRunOK("delete NonExistentKey", "OK") + # It is weird that GET and SCAN raise exception for + # non-existent key, while delete does not + + def dumpDb(self, params, dumpFile): + return 0 == run_err_null("./ldb dump %s > %s" % (params, dumpFile)) + + def loadDb(self, params, 
dumpFile): + return 0 == run_err_null("cat %s | ./ldb load %s" % (dumpFile, params)) + + def testStringBatchPut(self): + print "Running testStringBatchPut..." + self.assertRunOK("batchput x1 y1 --create_if_missing", "OK") + self.assertRunOK("scan", "x1 : y1") + self.assertRunOK("batchput x2 y2 x3 y3 \"x4 abc\" \"y4 xyz\"", "OK") + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 abc : y4 xyz") + self.assertRunFAIL("batchput") + self.assertRunFAIL("batchput k1") + self.assertRunFAIL("batchput k1 v1 k2") + + def testCountDelimDump(self): + print "Running testCountDelimDump..." + self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK") + self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK") + self.assertRunOK("dump --count_delim", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8") + self.assertRunOK("dump --count_delim=\".\"", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8") + self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK") + self.assertRunOK("dump --count_delim=\",\"", "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8") + + def testCountDelimIDump(self): + print "Running testCountDelimIDump..." + self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK") + self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK") + self.assertRunOK("dump --count_delim", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8") + self.assertRunOK("dump --count_delim=\".\"", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8") + self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK") + self.assertRunOK("dump --count_delim=\",\"", "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8") + + def testInvalidCmdLines(self): + print "Running testInvalidCmdLines..." 
+ # db not specified + self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing") + # No param called he + self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing") + # max_keys is not applicable for put + self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing") + # hex has invalid boolean value + + def testHexPutGet(self): + print "Running testHexPutGet..." + self.assertRunOK("put a1 b1 --create_if_missing", "OK") + self.assertRunOK("scan", "a1 : b1") + self.assertRunOK("scan --hex", "0x6131 : 0x6231") + self.assertRunFAIL("put --hex 6132 6232") + self.assertRunOK("put --hex 0x6132 0x6232", "OK") + self.assertRunOK("scan --hex", "0x6131 : 0x6231\n0x6132 : 0x6232") + self.assertRunOK("scan", "a1 : b1\na2 : b2") + self.assertRunOK("get a1", "b1") + self.assertRunOK("get --hex 0x6131", "0x6231") + self.assertRunOK("get a2", "b2") + self.assertRunOK("get --hex 0x6132", "0x6232") + self.assertRunOK("get --key_hex 0x6132", "b2") + self.assertRunOK("get --key_hex --value_hex 0x6132", "0x6232") + self.assertRunOK("get --value_hex a2", "0x6232") + self.assertRunOK("scan --key_hex --value_hex", + "0x6131 : 0x6231\n0x6132 : 0x6232") + self.assertRunOK("scan --hex --from=0x6131 --to=0x6133", + "0x6131 : 0x6231\n0x6132 : 0x6232") + self.assertRunOK("scan --hex --from=0x6131 --to=0x6132", + "0x6131 : 0x6231") + self.assertRunOK("scan --key_hex", "0x6131 : b1\n0x6132 : b2") + self.assertRunOK("scan --value_hex", "a1 : 0x6231\na2 : 0x6232") + self.assertRunOK("batchput --hex 0x6133 0x6233 0x6134 0x6234", "OK") + self.assertRunOK("scan", "a1 : b1\na2 : b2\na3 : b3\na4 : b4") + self.assertRunOK("delete --hex 0x6133", "OK") + self.assertRunOK("scan", "a1 : b1\na2 : b2\na4 : b4") + + def testTtlPutGet(self): + print "Running testTtlPutGet..." 
+ self.assertRunOK("put a1 b1 --ttl --create_if_missing", "OK") + self.assertRunOK("scan ", "a1 : b1", True) + self.assertRunOK("dump --ttl ", "a1 ==> b1", True) + self.assertRunOK("scan --hex --ttl", "0x6131 : 0x6231") + self.assertRunOK("get a1", "b1", True) + self.assertRunOK("get --ttl a1", "b1") + self.assertRunOK("put a3 b3 --create_if_missing", "OK") + # fails because timstamp's length is greater than value's + self.assertRunFAIL("get --ttl a3") + + def testInvalidCmdLines(self): + print "Running testInvalidCmdLines..." + # db not specified + self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing") + # No param called he + self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing") + # max_keys is not applicable for put + self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing") + # hex has invalid boolean value + self.assertRunFAIL("put 0x6133 0x6233 --hex=Boo --create_if_missing") + + def testDumpLoad(self): + print "Running testDumpLoad..." + self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4", + "OK") + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + + # Dump and load without any additional params specified + dumpFilePath = os.path.join(self.TMP_DIR, "dump1") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump1") + self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + # Dump and load in hex + dumpFilePath = os.path.join(self.TMP_DIR, "dump2") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump2") + self.assertTrue(self.dumpDb("--db=%s --hex" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --hex --create_if_missing" % loadedDbPath, dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % 
loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + # Dump only a portion of the key range + dumpFilePath = os.path.join(self.TMP_DIR, "dump3") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump3") + self.assertTrue(self.dumpDb( + "--db=%s --from=x1 --to=x3" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2") + + # Dump upto max_keys rows + dumpFilePath = os.path.join(self.TMP_DIR, "dump4") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump4") + self.assertTrue(self.dumpDb( + "--db=%s --max_keys=3" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3") + + # Load into an existing db, create_if_missing is not specified + self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb("--db=%s" % loadedDbPath, dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + # Dump and load with WAL disabled + dumpFilePath = os.path.join(self.TMP_DIR, "dump5") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump5") + self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --disable_wal --create_if_missing" % loadedDbPath, + dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + # Dump and load with lots of extra params specified + extraParams = " ".join(["--bloom_bits=14", "--compression_type=bzip2", + "--block_size=1024", "--auto_compaction=true", + "--write_buffer_size=4194304", + "--file_size=2097152"]) + dumpFilePath = os.path.join(self.TMP_DIR, "dump6") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump6") + 
self.assertTrue(self.dumpDb( + "--db=%s %s" % (origDbPath, extraParams), dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s %s --create_if_missing" % (loadedDbPath, extraParams), + dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + # Dump with count_only + dumpFilePath = os.path.join(self.TMP_DIR, "dump7") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump7") + self.assertTrue(self.dumpDb( + "--db=%s --count_only" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath)) + # DB should have atleast one value for scan to work + self.assertRunOKFull("put --db=%s k1 v1" % loadedDbPath, "OK") + self.assertRunOKFull("scan --db=%s" % loadedDbPath, "k1 : v1") + + # Dump command fails because of typo in params + dumpFilePath = os.path.join(self.TMP_DIR, "dump8") + self.assertFalse(self.dumpDb( + "--db=%s --create_if_missing" % origDbPath, dumpFilePath)) + + def testMiscAdminTask(self): + print "Running testMiscAdminTask..." + # These tests need to be improved; for example with asserts about + # whether compaction or level reduction actually took place. 
+ self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4", + "OK") + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + + self.assertTrue(0 == run_err_null( + "./ldb compact --db=%s" % origDbPath)) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + self.assertTrue(0 == run_err_null( + "./ldb reduce_levels --db=%s --new_levels=2" % origDbPath)) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + self.assertTrue(0 == run_err_null( + "./ldb reduce_levels --db=%s --new_levels=3" % origDbPath)) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + self.assertTrue(0 == run_err_null( + "./ldb compact --db=%s --from=x1 --to=x3" % origDbPath)) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + self.assertTrue(0 == run_err_null( + "./ldb compact --db=%s --hex --from=0x6131 --to=0x6134" + % origDbPath)) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + #TODO(dilip): Not sure what should be passed to WAL.Currently corrupted. + self.assertTrue(0 == run_err_null( + "./ldb dump_wal --db=%s --walfile=%s --header" % ( + origDbPath, os.path.join(origDbPath, "LOG")))) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + +if __name__ == "__main__": + unittest.main() diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc new file mode 100644 index 00000000..b588b52d --- /dev/null +++ b/tools/reduce_levels_test.cc @@ -0,0 +1,197 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include "rocksdb/db.h" +#include "db/db_impl.h" +#include "db/version_set.h" +#include "util/logging.h" +#include "util/testutil.h" +#include "util/testharness.h" +#include "util/ldb_cmd.h" + +namespace rocksdb { + +class ReduceLevelTest { +public: + ReduceLevelTest() { + dbname_ = test::TmpDir() + "/db_reduce_levels_test"; + DestroyDB(dbname_, Options()); + db_ = nullptr; + } + + Status OpenDB(bool create_if_missing, int levels, + int mem_table_compact_level); + + Status Put(const std::string& k, const std::string& v) { + return db_->Put(WriteOptions(), k, v); + } + + std::string Get(const std::string& k) { + ReadOptions options; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + Status CompactMemTable() { + if (db_ == nullptr) { + return Status::InvalidArgument("DB not opened."); + } + DBImpl* db_impl = reinterpret_cast(db_); + return db_impl->TEST_FlushMemTable(); + } + + void CloseDB() { + if (db_ != nullptr) { + delete db_; + db_ = nullptr; + } + } + + bool ReduceLevels(int target_level); + + int FilesOnLevel(int level) { + std::string property; + ASSERT_TRUE( + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), + &property)); + return atoi(property.c_str()); + } + +private: + std::string dbname_; + DB* db_; +}; + +Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels, + int mem_table_compact_level) { + rocksdb::Options opt; + opt.num_levels = num_levels; + opt.create_if_missing = create_if_missing; + opt.max_mem_compaction_level = mem_table_compact_level; + rocksdb::Status st = rocksdb::DB::Open(opt, dbname_, &db_); + if (!st.ok()) { + fprintf(stderr, "Can't open the db:%s\n", st.ToString().c_str()); + } + return st; +} + +bool ReduceLevelTest::ReduceLevels(int target_level) { + std::vector args = rocksdb::ReduceDBLevelsCommand::PrepareArgs( + dbname_, target_level, false); + 
LDBCommand* level_reducer = LDBCommand::InitFromCmdLineArgs(args); + level_reducer->Run(); + bool is_succeed = level_reducer->GetExecuteState().IsSucceed(); + delete level_reducer; + return is_succeed; +} + +TEST(ReduceLevelTest, Last_Level) { + // create files on all levels; + ASSERT_OK(OpenDB(true, 4, 3)); + ASSERT_OK(Put("aaaa", "11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(3), 1); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(3)); + ASSERT_OK(OpenDB(true, 3, 1)); + ASSERT_EQ(FilesOnLevel(2), 1); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(2)); + ASSERT_OK(OpenDB(true, 2, 1)); + ASSERT_EQ(FilesOnLevel(1), 1); + CloseDB(); +} + +TEST(ReduceLevelTest, Top_Level) { + // create files on all levels; + ASSERT_OK(OpenDB(true, 5, 0)); + ASSERT_OK(Put("aaaa", "11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(0), 1); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(4)); + ASSERT_OK(OpenDB(true, 4, 0)); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(3)); + ASSERT_OK(OpenDB(true, 3, 0)); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(2)); + ASSERT_OK(OpenDB(true, 2, 0)); + CloseDB(); +} + +TEST(ReduceLevelTest, All_Levels) { + // create files on all levels; + ASSERT_OK(OpenDB(true, 5, 1)); + ASSERT_OK(Put("a", "a11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(1), 1); + CloseDB(); + + ASSERT_OK(OpenDB(true, 5, 2)); + ASSERT_OK(Put("b", "b11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(1), 1); + ASSERT_EQ(FilesOnLevel(2), 1); + CloseDB(); + + ASSERT_OK(OpenDB(true, 5, 3)); + ASSERT_OK(Put("c", "c11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(1), 1); + ASSERT_EQ(FilesOnLevel(2), 1); + ASSERT_EQ(FilesOnLevel(3), 1); + CloseDB(); + + ASSERT_OK(OpenDB(true, 5, 4)); + ASSERT_OK(Put("d", "d11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(1), 1); + ASSERT_EQ(FilesOnLevel(2), 1); + ASSERT_EQ(FilesOnLevel(3), 1); + ASSERT_EQ(FilesOnLevel(4), 1); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(4)); + 
ASSERT_OK(OpenDB(true, 4, 0)); + ASSERT_EQ("a11111", Get("a")); + ASSERT_EQ("b11111", Get("b")); + ASSERT_EQ("c11111", Get("c")); + ASSERT_EQ("d11111", Get("d")); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(3)); + ASSERT_OK(OpenDB(true, 3, 0)); + ASSERT_EQ("a11111", Get("a")); + ASSERT_EQ("b11111", Get("b")); + ASSERT_EQ("c11111", Get("c")); + ASSERT_EQ("d11111", Get("d")); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(2)); + ASSERT_OK(OpenDB(true, 2, 0)); + ASSERT_EQ("a11111", Get("a")); + ASSERT_EQ("b11111", Get("b")); + ASSERT_EQ("c11111", Get("c")); + ASSERT_EQ("d11111", Get("d")); + CloseDB(); +} + +} + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/tools/shell/DBClientProxy.cpp b/tools/shell/DBClientProxy.cpp new file mode 100644 index 00000000..93277ac1 --- /dev/null +++ b/tools/shell/DBClientProxy.cpp @@ -0,0 +1,271 @@ + +#include + +#include "DBClientProxy.h" + + +#include "thrift/lib/cpp/protocol/TBinaryProtocol.h" +#include "thrift/lib/cpp/transport/TSocket.h" +#include "thrift/lib/cpp/transport/TTransportUtils.h" + + + +using namespace std; +using namespace boost; +using namespace Tleveldb; +using namespace apache::thrift::protocol; +using namespace apache::thrift::transport; + +namespace rocksdb { + +DBClientProxy::DBClientProxy(const string & host, int port) : + host_(host), + port_(port), + dbToHandle_(), + dbClient_() { +} + +DBClientProxy::~DBClientProxy() { + cleanUp(); +} + + +void DBClientProxy::connect(void) { + cleanUp(); + printf("Connecting to %s:%d\n", host_.c_str(), port_); + try { + boost::shared_ptr socket(new TSocket(host_, port_)); + boost::shared_ptr transport(new TBufferedTransport(socket)); + boost::shared_ptr protocol(new TBinaryProtocol(transport)); + dbClient_.reset(new DBClient(protocol)); + + transport->open(); + } catch (const std::exception & e) { + dbClient_.reset(); + throw; + } +} + +void DBClientProxy::cleanUp(void) { + if(dbClient_.get()) { + for(map::iterator itor = 
dbToHandle_.begin(); + itor != dbToHandle_.end(); + ++itor) { + dbClient_->Close(itor->second, itor->first); + } + dbClient_.reset(); + } + dbToHandle_.clear(); +} + +void DBClientProxy::open(const string & db) { + if(!dbClient_.get()) { + printf("please connect() first\n"); + return; + } + + // printf("opening database : %s\n", db.c_str()); + // we use default DBOptions here + DBOptions opt; + DBHandle handle; + try { + dbClient_->Open(handle, db, opt); + } catch (const LeveldbException & e) { + printf("%s\n", e.message.c_str()); + if(kIOError == e.errorCode) { + printf("no such database : %s\n", db.c_str()); + return; + }else { + printf("Unknown error : %d\n", e.errorCode); + return; + } + } + + dbToHandle_[db] = handle; +} + + +bool DBClientProxy::create(const string & db) { + if(!dbClient_.get()) { + printf("please connect() first\n"); + return false; + } + + printf("creating database : %s\n", db.c_str()); + DBOptions opt; + opt.create_if_missing = true; + opt.error_if_exists = true; + DBHandle handle; + try { + dbClient_->Open(handle, db, opt); + }catch (const LeveldbException & e) { + printf("%s\n", e.message.c_str()); + printf("error code : %d\n", e.errorCode); + if(kNotFound == e.errorCode) { + printf("no such database : %s\n", db.c_str()); + return false;; + } else { + printf("Unknown error : %d\n", e.errorCode); + return false; + } + } + + dbToHandle_[db] = handle; + return true; +} + + +map::iterator +DBClientProxy::getHandle(const string & db) { + map::iterator itor = dbToHandle_.find(db); + if(dbToHandle_.end() == itor) { + open(db); + itor = dbToHandle_.find(db); + } + + return itor; +} + + +bool DBClientProxy::get(const string & db, + const string & key, + string & value) { + if(!dbClient_.get()) { + printf("please connect() first\n"); + return false; + } + + map::iterator itor = getHandle(db); + if(dbToHandle_.end() == itor) { + return false; + } + + ResultItem ret; + Slice k; + k.data = key; + k.size = key.size(); + // we use default values of 
options here + ReadOptions opt; + dbClient_->Get(ret, + itor->second, + k, + opt); + if(kOk == ret.status) { + value = ret.value.data; + return true; + } else if(kNotFound == ret.status) { + printf("no such key : %s\n", key.c_str()); + return false; + } else { + printf("get data error : %d\n", ret.status); + return false; + } +} + + + +bool DBClientProxy::put(const string & db, + const string & key, + const string & value) { + if(!dbClient_.get()) { + printf("please connect() first\n"); + return false; + } + + map::iterator itor = getHandle(db); + if(dbToHandle_.end() == itor) { + return false; + } + + kv temp; + temp.key.data = key; + temp.key.size = key.size(); + temp.value.data = value; + temp.value.size = value.size(); + WriteOptions opt; + opt.sync = true; + Code code; + code = dbClient_->Put(itor->second, + temp, + opt); + + + if(kOk == code) { + // printf("set value finished\n"); + return true; + } else { + printf("put data error : %d\n", code); + return false; + } +} + +bool DBClientProxy::scan(const string & db, + const string & start_key, + const string & end_key, + const string & limit, + vector > & kvs) { + if(!dbClient_.get()) { + printf("please connect() first\n"); + return false; + } + + int limitInt = -1; + limitInt = atoi(limit.c_str()); + if(limitInt <= 0) { + printf("Error while parse limit : %s\n", limit.c_str()); + return false; + } + + if(start_key > end_key) { + printf("empty range.\n"); + return false; + } + + map::iterator itor = getHandle(db); + if(dbToHandle_.end() == itor) { + return false; + } + + ResultIterator ret; + // we use the default values of options here + ReadOptions opt; + Slice k; + k.data = start_key; + k.size = start_key.size(); + dbClient_->NewIterator(ret, + itor->second, + opt, + seekToKey, + k); + Iterator it; + if(kOk == ret.status) { + it = ret.iterator; + } else { + printf("get iterator error : %d\n", ret.status); + return false; + } + + int idx = 0; + string ck = start_key; + while(idx < limitInt && ck < end_key) { 
+ ResultPair retPair; + dbClient_->GetNext(retPair, itor->second, it); + if(kOk == retPair.status) { + ++idx; + ck = retPair.keyvalue.key.data; + if (ck < end_key) { + kvs.push_back(make_pair(retPair.keyvalue.key.data, + retPair.keyvalue.value.data)); + } + } else if(kEnd == retPair.status) { + printf("not enough values\n"); + return true; + } else { + printf("GetNext() error : %d\n", retPair.status); + return false; + } + } + return true; +} + +} // namespace diff --git a/tools/shell/DBClientProxy.h b/tools/shell/DBClientProxy.h new file mode 100644 index 00000000..fba228b9 --- /dev/null +++ b/tools/shell/DBClientProxy.h @@ -0,0 +1,64 @@ + +#ifndef TOOLS_SHELL_DBCLIENTPROXY +#define TOOLS_SHELL_DBCLIENTPROXY + +#include +#include +#include +#include +#include + +#include "DB.h" + +/* + * class DBClientProxy maintains: + * 1. a connection to rocksdb service + * 2. a map from db names to opened db handles + * + * it's client codes' responsibility to catch all possible exceptions. + */ + +namespace rocksdb { + +class DBClientProxy : private boost::noncopyable { + public: + // connect to host_:port_ + void connect(void); + + // return true on success, false otherwise + bool get(const std::string & db, + const std::string & key, + std::string & value); + + // return true on success, false otherwise + bool put(const std::string & db, + const std::string & key, + const std::string & value); + + // return true on success, false otherwise + bool scan(const std::string & db, + const std::string & start_key, + const std::string & end_key, + const std::string & limit, + std::vector > & kvs); + + // return true on success, false otherwise + bool create(const std::string & db); + + DBClientProxy(const std::string & host, int port); + ~DBClientProxy(); + + private: + // some internal help functions + void cleanUp(void); + void open(const std::string & db); + std::map::iterator getHandle(const std::string & db); + + const std::string host_; + const int port_; + std::map 
dbToHandle_; + boost::shared_ptr dbClient_; +}; + +} // namespace +#endif diff --git a/tools/shell/LeveldbShell.cpp b/tools/shell/LeveldbShell.cpp new file mode 100644 index 00000000..e6274d3b --- /dev/null +++ b/tools/shell/LeveldbShell.cpp @@ -0,0 +1,8 @@ + + +#include "ShellContext.h" + +int main(int argc, char ** argv) { + ShellContext c(argc, argv); + c.run(); +} diff --git a/tools/shell/ShellContext.cpp b/tools/shell/ShellContext.cpp new file mode 100644 index 00000000..05a9bb81 --- /dev/null +++ b/tools/shell/ShellContext.cpp @@ -0,0 +1,104 @@ + +#include +#include + +#include "ShellContext.h" +#include "ShellState.h" + + + +#include "thrift/lib/cpp/protocol/TBinaryProtocol.h" +#include "thrift/lib/cpp/transport/TSocket.h" +#include "thrift/lib/cpp/transport/TTransportUtils.h" + + + +using namespace std; +using namespace boost; +using namespace Tleveldb; +using namespace rocksdb; +using namespace apache::thrift::protocol; +using namespace apache::thrift::transport; + +void ShellContext::changeState(ShellState * pState) { + pShellState_ = pState; +} + +void ShellContext::stop(void) { + exit_ = true; +} + +bool ShellContext::ParseInput(void) { + if(argc_ != 3) { + printf("leveldb_shell host port\n"); + return false; + } + + port_ = atoi(argv_[2]); + if(port_ <= 0) { + printf("Error while parse port : %s\n", argv_[2]); + return false; + } + + clientProxy_.reset(new DBClientProxy(argv_[1], port_)); + if(!clientProxy_.get()) { + return false; + } else { + return true; + } +} + +void ShellContext::connect(void) { + clientProxy_->connect(); +} + +void ShellContext::create(const string & db) { + if (clientProxy_->create(db)) { + printf("%s created\n", db.c_str()); + } +} + +void ShellContext::get(const string & db, + const string & key) { + string v; + if (clientProxy_->get(db, key, v)) { + printf("%s\n", v.c_str()); + } +} + +void ShellContext::put(const string & db, + const string & key, + const string & value) { + if (clientProxy_->put(db, key, value)) { + 
printf("(%s, %s) has been set\n", key.c_str(), value.c_str()); + } +} + +void ShellContext::scan(const string & db, + const string & start_key, + const string & end_key, + const string & limit) { + vector > kvs; + if (clientProxy_->scan(db, start_key, end_key, limit, kvs)) { + for(unsigned int i = 0; i < kvs.size(); ++i) { + printf("%d (%s, %s)\n", i, kvs[i].first.c_str(), kvs[i].second.c_str()); + } + } +} + +void ShellContext::run(void) { + while(!exit_) { + pShellState_->run(this); + } +} + +ShellContext::ShellContext(int argc, char ** argv) : + pShellState_(ShellStateStart::getInstance()), + exit_(false), + argc_(argc), + argv_(argv), + port_(-1), + clientProxy_() { +} + + diff --git a/tools/shell/ShellContext.h b/tools/shell/ShellContext.h new file mode 100644 index 00000000..5c2b9448 --- /dev/null +++ b/tools/shell/ShellContext.h @@ -0,0 +1,51 @@ +#ifndef TOOLS_SHELL_SHELLCONTEXT +#define TOOLS_SHELL_SHELLCONTEXT + +#include +#include +#include +#include + +#include "DB.h" +#include "DBClientProxy.h" + +class ShellState; + +class ShellContext : private boost::noncopyable { + public: + void changeState(ShellState * pState); + + void stop(void); + + bool ParseInput(void); + + void connect(void); + + void get(const std::string & db, + const std::string & key); + + void put(const std::string & db, + const std::string & key, + const std::string & value); + + void scan(const std::string & db, + const std::string & start_key, + const std::string & end_key, + const std::string & limit); + + void create(const std::string & db); + + void run(void); + + ShellContext(int argc, char ** argv); + + private: + ShellState * pShellState_; + bool exit_; + int argc_; + char ** argv_; + int port_; + boost::shared_ptr clientProxy_; +}; + +#endif diff --git a/tools/shell/ShellState.cpp b/tools/shell/ShellState.cpp new file mode 100644 index 00000000..057a337a --- /dev/null +++ b/tools/shell/ShellState.cpp @@ -0,0 +1,139 @@ +#include +#include +#include +#include + +#include 
"ShellState.h" +#include "ShellContext.h" +#include "transport/TTransportException.h" + +using namespace std; + +using namespace apache::thrift::transport; + +const char * PMT = ">> "; + + +void ShellStateStart::run(ShellContext * c) { + if(!c->ParseInput()) { + c->changeState(ShellStateStop::getInstance()); + } else { + c->changeState(ShellStateConnecting::getInstance()); + } +} + + +void ShellStateStop::run(ShellContext * c) { + c->stop(); +} + +void ShellStateConnecting::run(ShellContext * c) { + try { + c->connect(); + } catch (const TTransportException & e) { + cout << e.what() << endl; + c->changeState(ShellStateStop::getInstance()); + return; + } + + c->changeState(ShellStateConnected::getInstance()); +} + +void ShellStateConnected::unknownCmd(void) { + cout << "Unknown command!" << endl; + cout << "Use help to list all available commands" << endl; +} + +void ShellStateConnected::helpMsg(void) { + cout << "Currently supported commands:" << endl; + cout << "create db" << endl; + cout << "get db key" << endl; + cout << "scan db start_key end_key limit" << endl; + cout << "put db key value" << endl; + cout << "exit/quit" << endl; +} + +void ShellStateConnected::handleConError(ShellContext * c) { + cout << "Connection down" << endl; + cout << "Reconnect ? (y/n) :" << endl; + string s; + while(getline(cin, s)) { + if("y" == s) { + c->changeState(ShellStateConnecting::getInstance()); + break; + } else if("n" == s) { + c->changeState(ShellStateStop::getInstance()); + break; + } else { + cout << "Reconnect ? 
(y/n) :" << endl; + } + } +} + +void ShellStateConnected::run(ShellContext * c) { + string line; + cout << PMT; + getline(cin, line); + istringstream is(line); + vector params; + string param; + while(is >> param) { + params.push_back(param); + } + + // empty input line + if(params.empty()) + return; + + if("quit" == params[0] || "exit" == params[0]) { + c->changeState(ShellStateStop::getInstance()); + } else if("get" == params[0]) { + if(params.size() == 3) { + try { + c->get(params[1], params[2]); + } catch (const TTransportException & e) { + cout << e.what() << endl; + handleConError(c); + } + } else { + unknownCmd(); + } + } else if("create" == params[0]) { + if(params.size() == 2) { + try { + c->create(params[1]); + } catch (const TTransportException & e) { + cout << e.what() << endl; + handleConError(c); + } + } else { + unknownCmd(); + } + }else if("put" == params[0]) { + if(params.size() == 4) { + try { + c->put(params[1], params[2], params[3]); + } catch (const TTransportException & e) { + cout << e.what() << endl; + handleConError(c); + } + } else { + unknownCmd(); + } + } else if("scan" == params[0]) { + if(params.size() == 5) { + try { + c->scan(params[1], params[2], params[3], params[4]); + } catch (const TTransportException & e) { + cout << e.what() << endl; + handleConError(c); + } + } else { + unknownCmd(); + } + } else if("help" == params[0]) { + helpMsg(); + } else { + unknownCmd(); + } +} diff --git a/tools/shell/ShellState.h b/tools/shell/ShellState.h new file mode 100644 index 00000000..4027af20 --- /dev/null +++ b/tools/shell/ShellState.h @@ -0,0 +1,87 @@ + +#ifndef TOOLS_SHELL_SHELLSTATE +#define TOOLS_SHELL_SHELLSTATE + +class ShellContext; + +/* + * Currently, there are four types of state in total + * 1. start state: the first state the program enters + * 2. connecting state: the program try to connect to a rocksdb server, whose + * previous states could be "start" or "connected" states + * 3. 
connected states: the program has already connected to a server, and is + * processing user commands + * 4. stop state: the last state the program enters, do some cleaning up things + */ + +class ShellState { + public: + virtual void run(ShellContext *) = 0; + virtual ~ShellState() {} +}; + + +class ShellStateStart : public ShellState { + public: + static ShellStateStart * getInstance(void) { + static ShellStateStart instance; + return &instance; + } + + virtual void run(ShellContext *); + + private: + ShellStateStart() {} + virtual ~ShellStateStart() {} +}; + +class ShellStateStop : public ShellState { + public: + static ShellStateStop * getInstance(void) { + static ShellStateStop instance; + return &instance; + } + + virtual void run(ShellContext *); + + private: + ShellStateStop() {} + virtual ~ShellStateStop() {} + +}; + +class ShellStateConnecting : public ShellState { + public: + static ShellStateConnecting * getInstance(void) { + static ShellStateConnecting instance; + return &instance; + } + + virtual void run(ShellContext *); + + private: + ShellStateConnecting() {} + virtual ~ShellStateConnecting() {} + +}; + +class ShellStateConnected : public ShellState { + public: + static ShellStateConnected * getInstance(void) { + static ShellStateConnected instance; + return &instance; + } + + virtual void run(ShellContext *); + + private: + ShellStateConnected() {} + virtual ~ShellStateConnected() {} + + void unknownCmd(); + void handleConError(ShellContext *); + void helpMsg(); +}; + +#endif + diff --git a/tools/shell/test/DBClientProxyTest.cpp b/tools/shell/test/DBClientProxyTest.cpp new file mode 100644 index 00000000..3b64ffc5 --- /dev/null +++ b/tools/shell/test/DBClientProxyTest.cpp @@ -0,0 +1,182 @@ +/** + * Tests for DBClientProxy class for leveldb + * @author Bo Liu (newpoo.liu@gmail.com) + * Copyright 2012 Facebook + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "server_options.h" + + 
+#include "../DBClientProxy.h" +using namespace rocksdb; + + +using namespace apache::thrift; +using namespace apache::thrift::protocol; +using namespace apache::thrift::transport; +using boost::shared_ptr; +using namespace Tleveldb; +using namespace std; + + + +extern "C" void startServer(int argc, char**argv); +extern "C" void stopServer(int port); +extern ServerOptions server_options; + +static const string db1("db1"); + + +static void testDBClientProxy(DBClientProxy & dbcp) { + bool flag; + const int NOK = 100; + const int BUFSIZE = 16; + int testcase = 0; + + vector keys, values; + vector > kvs, correctKvs; + string k, v; + + for(int i = 0; i < NOK; ++i) { + char bufKey[BUFSIZE]; + char bufValue[BUFSIZE]; + snprintf(bufKey, BUFSIZE, "key%d", i); + snprintf(bufValue, BUFSIZE, "value%d", i); + keys.push_back(bufKey); + values.push_back(bufValue); + correctKvs.push_back((make_pair(string(bufKey), string(bufValue)))); + } + + sort(correctKvs.begin(), correctKvs.end()); + + + // can not do get(), put(), scan() or create() before connected. 
+ flag = dbcp.get(db1, keys[0], v); + ASSERT_TRUE(false == flag); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + flag = dbcp.put(db1, keys[0], keys[1]); + ASSERT_TRUE(false == flag); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + flag = dbcp.scan(db1, "a", "w", "100", kvs); + ASSERT_TRUE(false == flag); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + flag = dbcp.create(db1); + ASSERT_TRUE(false == flag); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + + dbcp.connect(); + + // create a database + flag = dbcp.create(db1); + ASSERT_TRUE(true == flag); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + + // no such key + flag = dbcp.get(db1, keys[0], v); + ASSERT_TRUE(false == flag); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + + + // scan() success with empty returned key-value pairs + kvs.clear(); + flag = dbcp.scan(db1, "a", "w", "100", kvs); + ASSERT_TRUE(true == flag); + ASSERT_TRUE(kvs.empty()); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + + + // put() + for(int i = 0; i < NOK; ++i) { + flag = dbcp.put(db1, keys[i], values[i]); + ASSERT_TRUE(true == flag); + } + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + + + // scan all of key-value pairs + kvs.clear(); + flag = dbcp.scan(db1, "a", "w", "100", kvs); + ASSERT_TRUE(true == flag); + ASSERT_TRUE(kvs == correctKvs); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + + + // scan the first 20 key-value pairs + { + kvs.clear(); + flag = dbcp.scan(db1, "a", "w", "20", kvs); + ASSERT_TRUE(true == flag); + vector > tkvs(correctKvs.begin(), correctKvs.begin() + 20); + ASSERT_TRUE(kvs == tkvs); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + } + + // scan key[10] to key[50] + { + kvs.clear(); + flag = dbcp.scan(db1, 
correctKvs[10].first, correctKvs[50].first, "100", kvs); + ASSERT_TRUE(true == flag); + + vector > tkvs(correctKvs.begin() + 10, correctKvs.begin() + 50); + ASSERT_TRUE(kvs == tkvs); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + } + + // scan "key10" to "key40" by limit constraint + { + kvs.clear(); + flag = dbcp.scan(db1, correctKvs[10].first.c_str(), "w", "30", kvs); + ASSERT_TRUE(true == flag); + vector > tkvs(correctKvs.begin() + 10, correctKvs.begin() + 40); + ASSERT_TRUE(kvs == tkvs); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + } + + + // get() + flag = dbcp.get(db1, "unknownKey", v); + ASSERT_TRUE(false == flag); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + + flag = dbcp.get(db1, keys[0], v); + ASSERT_TRUE(true == flag); + ASSERT_TRUE(v == values[0]); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); +} + + + +static void cleanupDir(std::string dir) { + // remove old data, if any + char* cleanup = new char[100]; + snprintf(cleanup, 100, "rm -rf %s", dir.c_str()); + system(cleanup); +} + +int main(int argc, char **argv) { + // create a server + startServer(argc, argv); + printf("Server thread created.\n"); + + // give some time to the server to initialize itself + while (server_options.getPort() == 0) { + sleep(1); + } + + cleanupDir(server_options.getDataDirectory(db1)); + + DBClientProxy dbcp("localhost", server_options.getPort()); + testDBClientProxy(dbcp); +} + diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc new file mode 100644 index 00000000..90388955 --- /dev/null +++ b/tools/sst_dump.cc @@ -0,0 +1,261 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// + +#include +#include +#include + +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/table.h" +#include "table/block.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "util/ldb_cmd.h" +#include "util/random.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class SstFileReader { + public: + explicit SstFileReader(const std::string& file_name, + bool verify_checksum, + bool output_hex); + + Status ReadSequential(bool print_kv, + uint64_t read_num, + bool has_from, + const std::string& from_key, + bool has_to, + const std::string& to_key); + + uint64_t GetReadNumber() { return read_num_; } + +private: + std::string file_name_; + uint64_t read_num_; + bool verify_checksum_; + bool output_hex_; + EnvOptions soptions_; +}; + +SstFileReader::SstFileReader(const std::string& file_path, + bool verify_checksum, + bool output_hex) + :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum), + output_hex_(output_hex) { + std::cout << "Process " << file_path << "\n"; +} + +Status SstFileReader::ReadSequential(bool print_kv, + uint64_t read_num, + bool has_from, + const std::string& from_key, + bool has_to, + const std::string& to_key) +{ + unique_ptr table_reader; + InternalKeyComparator internal_comparator_(BytewiseComparator()); + Options table_options; + table_options.comparator = &internal_comparator_; + unique_ptr file; + Status s = table_options.env->NewRandomAccessFile(file_name_, &file, + soptions_); + if(!s.ok()) { + return s; + } + uint64_t file_size; + table_options.env->GetFileSize(file_name_, &file_size); + unique_ptr table_factory; + s = table_options.table_factory->GetTableReader(table_options, soptions_, + std::move(file), file_size, + &table_reader); + if(!s.ok()) { + return s; + } + + Iterator* iter = 
table_reader->NewIterator(ReadOptions(verify_checksum_, + false)); + uint64_t i = 0; + if (has_from) { + InternalKey ikey(from_key, kMaxSequenceNumber, kValueTypeForSeek); + iter->Seek(ikey.Encode()); + } else { + iter->SeekToFirst(); + } + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + Slice value = iter->value(); + ++i; + if (read_num > 0 && i > read_num) + break; + + ParsedInternalKey ikey; + if (!ParseInternalKey(key, &ikey)) { + std::cerr << "Internal Key [" + << key.ToString(true /* in hex*/) + << "] parse error!\n"; + continue; + } + + // If end marker was specified, we stop before it + if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) { + break; + } + + if (print_kv) { + std::cout << ikey.DebugString(output_hex_) + << " => " + << value.ToString(output_hex_) << "\n"; + } + + } + + read_num_ += i; + + Status ret = iter->status(); + delete iter; + return ret; +} + +} // namespace rocksdb + +static void print_help() { + fprintf(stderr, + "sst_dump [--command=check|scan] [--verify_checksum] " + "--file=data_dir_OR_sst_file" + " [--output_hex]" + " [--input_key_hex]" + " [--from=]" + " [--to=]" + " [--read_num=NUM]\n"); +} + +string HexToString(const string& str) { + string parsed; + if (str[0] != '0' || str[1] != 'x') { + fprintf(stderr, "Invalid hex input %s. 
Must start with 0x\n", + str.c_str()); + throw "Invalid hex input"; + } + + for (unsigned int i = 2; i < str.length();) { + int c; + sscanf(str.c_str() + i, "%2X", &c); + parsed.push_back(c); + i += 2; + } + return parsed; +} + +int main(int argc, char** argv) { + + const char* dir_or_file = nullptr; + uint64_t read_num = -1; + std::string command; + + char junk; + uint64_t n; + bool verify_checksum = false; + bool output_hex = false; + bool input_key_hex = false; + bool has_from = false; + bool has_to = false; + std::string from_key; + std::string to_key; + for (int i = 1; i < argc; i++) + { + if (strncmp(argv[i], "--file=", 7) == 0) { + dir_or_file = argv[i] + 7; + } else if (strcmp(argv[i], "--output_hex") == 0) { + output_hex = true; + } else if (strcmp(argv[i], "--input_key_hex") == 0) { + input_key_hex = true; + } else if (sscanf(argv[i], + "--read_num=%lu%c", + (unsigned long*)&n, &junk) == 1) { + read_num = n; + } else if (strcmp(argv[i], "--verify_checksum") == 0) { + verify_checksum = true; + } else if (strncmp(argv[i], "--command=", 10) == 0) { + command = argv[i] + 10; + } else if (strncmp(argv[i], "--from=", 7) == 0) { + from_key = argv[i] + 7; + has_from = true; + } else if (strncmp(argv[i], "--to=", 5) == 0) { + to_key = argv[i] + 5; + has_to = true; + }else { + print_help(); + exit(1); + } + } + + + if (input_key_hex) { + if (has_from) { + from_key = HexToString(from_key); + } + if (has_to) { + to_key = HexToString(to_key); + } + } + + if(dir_or_file == nullptr) { + print_help(); + exit(1); + } + + std::vector filenames; + rocksdb::Env* env = rocksdb::Env::Default(); + rocksdb::Status st = env->GetChildren(dir_or_file, &filenames); + bool dir = true; + if (!st.ok()) { + filenames.clear(); + filenames.push_back(dir_or_file); + dir = false; + } + + std::cout << "from [" << rocksdb::Slice(from_key).ToString(true) + << "] to [" << rocksdb::Slice(to_key).ToString(true) << "]\n"; + + uint64_t total_read = 0; + for (size_t i = 0; i < filenames.size(); i++) 
{ + std::string filename = filenames.at(i); + if (filename.length() <= 4 || + filename.rfind(".sst") != filename.length() - 4) { + //ignore + continue; + } + if(dir) { + filename = std::string(dir_or_file) + "/" + filename; + } + rocksdb::SstFileReader reader(filename, verify_checksum, + output_hex); + rocksdb::Status st; + // scan all files in give file path. + if (command == "" || command == "scan" || command == "check") { + st = reader.ReadSequential(command != "check", + read_num > 0 ? (read_num - total_read) : + read_num, + has_from, from_key, has_to, to_key); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), + st.ToString().c_str()); + } + total_read += reader.GetReadNumber(); + if (read_num > 0 && total_read > read_num) { + break; + } + } + } +} diff --git a/util/arena_impl.cc b/util/arena_impl.cc new file mode 100644 index 00000000..5125e236 --- /dev/null +++ b/util/arena_impl.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "util/arena_impl.h" +#include + +namespace rocksdb { + +const size_t ArenaImpl::kMinBlockSize = 4096; +const size_t ArenaImpl::kMaxBlockSize = 2 << 30; +static const int kAlignUnit = sizeof(void*); + +size_t OptimizeBlockSize(size_t block_size) { + // Make sure block_size is in optimal range + block_size = std::max(ArenaImpl::kMinBlockSize, block_size); + block_size = std::min(ArenaImpl::kMaxBlockSize, block_size); + + // make sure block_size is the multiple of kAlignUnit + if (block_size % kAlignUnit != 0) { + block_size = (1 + block_size / kAlignUnit) * kAlignUnit; + } + + return block_size; +} + +ArenaImpl::ArenaImpl(size_t block_size) + : kBlockSize(OptimizeBlockSize(block_size)) { + assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize && + kBlockSize % kAlignUnit == 0); +} + +ArenaImpl::~ArenaImpl() { + for (const auto& block : blocks_) { + delete[] block; + } +} + +char* ArenaImpl::AllocateFallback(size_t bytes, bool aligned) { + if (bytes > kBlockSize / 4) { + // Object is more than a quarter of our block size. Allocate it separately + // to avoid wasting too much space in leftover bytes. + return AllocateNewBlock(bytes); + } + + // We waste the remaining space in the current block. + auto block_head = AllocateNewBlock(kBlockSize); + alloc_bytes_remaining_ = kBlockSize - bytes; + + if (aligned) { + aligned_alloc_ptr_ = block_head + bytes; + unaligned_alloc_ptr_ = block_head + kBlockSize; + return block_head; + } else { + aligned_alloc_ptr_ = block_head; + unaligned_alloc_ptr_ = block_head + kBlockSize - bytes; + return unaligned_alloc_ptr_; + } +} + +char* ArenaImpl::AllocateAligned(size_t bytes) { + assert((kAlignUnit & (kAlignUnit - 1)) == + 0); // Pointer size should be a power of 2 + size_t current_mod = + reinterpret_cast(aligned_alloc_ptr_) & (kAlignUnit - 1); + size_t slop = (current_mod == 0 ? 
0 : kAlignUnit - current_mod); + size_t needed = bytes + slop; + char* result; + if (needed <= alloc_bytes_remaining_) { + result = aligned_alloc_ptr_ + slop; + aligned_alloc_ptr_ += needed; + alloc_bytes_remaining_ -= needed; + } else { + // AllocateFallback always returned aligned memory + result = AllocateFallback(bytes, true /* aligned */); + } + assert((reinterpret_cast(result) & (kAlignUnit - 1)) == 0); + return result; +} + +char* ArenaImpl::AllocateNewBlock(size_t block_bytes) { + char* block = new char[block_bytes]; + blocks_memory_ += block_bytes; + blocks_.push_back(block); + return block; +} + +} // namespace rocksdb diff --git a/util/arena_impl.h b/util/arena_impl.h new file mode 100644 index 00000000..538385cc --- /dev/null +++ b/util/arena_impl.h @@ -0,0 +1,93 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// ArenaImpl is an implementation of Arena class. For a request of small size, +// it allocates a block with pre-defined block size. For a request of big +// size, it uses malloc to directly get the requested size. 
+ +#pragma once +#include +#include +#include +#include +#include "rocksdb/arena.h" + +namespace rocksdb { + +class ArenaImpl : public Arena { + public: + // No copying allowed + ArenaImpl(const ArenaImpl&) = delete; + void operator=(const ArenaImpl&) = delete; + + static const size_t kMinBlockSize; + static const size_t kMaxBlockSize; + + explicit ArenaImpl(size_t block_size = kMinBlockSize); + virtual ~ArenaImpl(); + + virtual char* Allocate(size_t bytes) override; + + virtual char* AllocateAligned(size_t bytes) override; + + // Returns an estimate of the total memory usage of data allocated + // by the arena (exclude the space allocated but not yet used for future + // allocations). + virtual const size_t ApproximateMemoryUsage() { + return blocks_memory_ + blocks_.capacity() * sizeof(char*) - + alloc_bytes_remaining_; + } + + virtual const size_t MemoryAllocatedBytes() override { + return blocks_memory_; + } + + private: + // Number of bytes allocated in one block + const size_t kBlockSize; + // Array of new[] allocated memory blocks + typedef std::vector Blocks; + Blocks blocks_; + + // Stats for current active block. + // For each block, we allocate aligned memory chucks from one end and + // allocate unaligned memory chucks from the other end. Otherwise the + // memory waste for alignment will be higher if we allocate both types of + // memory from one direction. + char* unaligned_alloc_ptr_ = nullptr; + char* aligned_alloc_ptr_ = nullptr; + // How many bytes left in currently active block? + size_t alloc_bytes_remaining_ = 0; + + char* AllocateFallback(size_t bytes, bool aligned); + char* AllocateNewBlock(size_t block_bytes); + + // Bytes of memory in blocks allocated so far + size_t blocks_memory_ = 0; +}; + +inline char* ArenaImpl::Allocate(size_t bytes) { + // The semantics of what to return are a bit messy if we allow + // 0-byte allocations, so we disallow them here (we don't need + // them for our internal use). 
+ assert(bytes > 0); + if (bytes <= alloc_bytes_remaining_) { + unaligned_alloc_ptr_ -= bytes; + alloc_bytes_remaining_ -= bytes; + return unaligned_alloc_ptr_; + } + return AllocateFallback(bytes, false /* unaligned */); +} + +// check and adjust the block_size so that the return value is +// 1. in the range of [kMinBlockSize, kMaxBlockSize]. +// 2. the multiple of align unit. +extern size_t OptimizeBlockSize(size_t block_size); + +} // namespace rocksdb diff --git a/util/arena_test.cc b/util/arena_test.cc new file mode 100644 index 00000000..ca6dfc99 --- /dev/null +++ b/util/arena_test.cc @@ -0,0 +1,137 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/arena_impl.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace rocksdb { + +class ArenaImplTest { }; + +TEST(ArenaImplTest, Empty) { + ArenaImpl arena0; +} + +TEST(ArenaImplTest, MemoryAllocatedBytes) { + const int N = 17; + size_t req_sz; //requested size + size_t bsz = 8192; // block size + size_t expected_memory_allocated; + + ArenaImpl arena_impl(bsz); + + // requested size > quarter of a block: + // allocate requested size separately + req_sz = 3001; + for (int i = 0; i < N; i++) { + arena_impl.Allocate(req_sz); + } + expected_memory_allocated = req_sz * N; + ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated); + + // requested size < quarter of a block: + // allocate a block with the default size, then try to use unused part + // of the block. 
So one new block will be allocated for the first + // Allocate(99) call. All the remaining calls won't lead to new allocation. + req_sz = 99; + for (int i = 0; i < N; i++) { + arena_impl.Allocate(req_sz); + } + expected_memory_allocated += bsz; + ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated); + + // requested size > quarter of a block: + // allocate requested size separately + req_sz = 99999999; + for (int i = 0; i < N; i++) { + arena_impl.Allocate(req_sz); + } + expected_memory_allocated += req_sz * N; + ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated); +} + +// Make sure we didn't count the allocate but not used memory space in +// Arena::ApproximateMemoryUsage() +TEST(ArenaImplTest, ApproximateMemoryUsageTest) { + const size_t kBlockSize = 4096; + const size_t kEntrySize = kBlockSize / 8; + const size_t kZero = 0; + ArenaImpl arena(kBlockSize); + ASSERT_EQ(kZero, arena.ApproximateMemoryUsage()); + + auto num_blocks = kBlockSize / kEntrySize; + + // first allocation + arena.AllocateAligned(kEntrySize); + auto mem_usage = arena.MemoryAllocatedBytes(); + ASSERT_EQ(mem_usage, kBlockSize); + auto usage = arena.ApproximateMemoryUsage(); + ASSERT_LT(usage, mem_usage); + for (size_t i = 1; i < num_blocks; ++i) { + arena.AllocateAligned(kEntrySize); + ASSERT_EQ(mem_usage, arena.MemoryAllocatedBytes()); + ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize); + usage = arena.ApproximateMemoryUsage(); + } + ASSERT_GT(usage, mem_usage); +} + +TEST(ArenaImplTest, Simple) { + std::vector> allocated; + ArenaImpl arena_impl; + const int N = 100000; + size_t bytes = 0; + Random rnd(301); + for (int i = 0; i < N; i++) { + size_t s; + if (i % (N / 10) == 0) { + s = i; + } else { + s = rnd.OneIn(4000) + ? rnd.Uniform(6000) + : (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); + } + if (s == 0) { + // Our arena disallows size 0 allocations. 
+ s = 1; + } + char* r; + if (rnd.OneIn(10)) { + r = arena_impl.AllocateAligned(s); + } else { + r = arena_impl.Allocate(s); + } + + for (unsigned int b = 0; b < s; b++) { + // Fill the "i"th allocation with a known bit pattern + r[b] = i % 256; + } + bytes += s; + allocated.push_back(std::make_pair(s, r)); + ASSERT_GE(arena_impl.ApproximateMemoryUsage(), bytes); + if (i > N / 10) { + ASSERT_LE(arena_impl.ApproximateMemoryUsage(), bytes * 1.10); + } + } + for (unsigned int i = 0; i < allocated.size(); i++) { + size_t num_bytes = allocated[i].first; + const char* p = allocated[i].second; + for (unsigned int b = 0; b < num_bytes; b++) { + // Check the "i"th allocation for the known bit pattern + ASSERT_EQ(int(p[b]) & 0xff, (int)(i % 256)); + } + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/auto_roll_logger.cc b/util/auto_roll_logger.cc new file mode 100644 index 00000000..95f2fae0 --- /dev/null +++ b/util/auto_roll_logger.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include "util/auto_roll_logger.h" +#include "util/mutexlock.h" + +using namespace std; + +namespace rocksdb { + +// -- AutoRollLogger +Status AutoRollLogger::ResetLogger() { + status_ = env_->NewLogger(log_fname_, &logger_); + + if (!status_.ok()) { + return status_; + } + + if (logger_->GetLogFileSize() == + (size_t)Logger::DO_NOT_SUPPORT_GET_LOG_FILE_SIZE) { + status_ = Status::NotSupported( + "The underlying logger doesn't support GetLogFileSize()"); + } + if (status_.ok()) { + cached_now = static_cast(env_->NowMicros() * 1e-6); + ctime_ = cached_now; + cached_now_access_count = 0; + } + + return status_; +} + +void AutoRollLogger::RollLogFile() { + std::string old_fname = OldInfoLogFileName( + dbname_, env_->NowMicros(), db_absolute_path_, db_log_dir_); + env_->RenameFile(log_fname_, old_fname); +} + +void AutoRollLogger::Logv(const char* format, va_list ap) { + assert(GetStatus().ok()); + + std::shared_ptr logger; + { + MutexLock l(&mutex_); + if ((kLogFileTimeToRoll > 0 && LogExpired()) || + (kMaxLogFileSize > 0 && logger_->GetLogFileSize() >= kMaxLogFileSize)) { + RollLogFile(); + ResetLogger(); + } + + // pin down the current logger_ instance before releasing the mutex. + logger = logger_; + } + + // Another thread could have put a new Logger instance into logger_ by now. + // However, since logger is still hanging on to the previous instance + // (reference count is not zero), we don't have to worry about it being + // deleted while we are accessing it. + // Note that logv itself is not mutex protected to allow maximum concurrency, + // as thread safety should have been handled by the underlying logger. 
+ logger->Logv(format, ap); +} + +bool AutoRollLogger::LogExpired() { + if (cached_now_access_count >= call_NowMicros_every_N_records_) { + cached_now = static_cast(env_->NowMicros() * 1e-6); + cached_now_access_count = 0; + } + + ++cached_now_access_count; + return cached_now >= ctime_ + kLogFileTimeToRoll; +} + +Status CreateLoggerFromOptions( + const std::string& dbname, + const std::string& db_log_dir, + Env* env, + const Options& options, + std::shared_ptr* logger) { + std::string db_absolute_path; + env->GetAbsolutePath(dbname, &db_absolute_path); + std::string fname = InfoLogFileName(dbname, db_absolute_path, db_log_dir); + + // Currently we only support roll by time-to-roll and log size + if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) { + AutoRollLogger* result = new AutoRollLogger( + env, dbname, db_log_dir, + options.max_log_file_size, + options.log_file_time_to_roll); + Status s = result->GetStatus(); + if (!s.ok()) { + delete result; + } else { + logger->reset(result); + } + return s; + } else { + // Open a log file in the same directory as the db + env->CreateDir(dbname); // In case it does not exist + env->RenameFile(fname, OldInfoLogFileName(dbname, env->NowMicros(), + db_absolute_path, db_log_dir)); + return env->NewLogger(fname, logger); + } +} + +} // namespace rocksdb diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h new file mode 100644 index 00000000..db70f158 --- /dev/null +++ b/util/auto_roll_logger.h @@ -0,0 +1,90 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. 
+ +#pragma once +#include "db/filename.h" +#include "port/port.h" +#include "util/posix_logger.h" + +namespace rocksdb { + +// Rolls the log file by size and/or time +class AutoRollLogger : public Logger { + public: + AutoRollLogger(Env* env, const std::string& dbname, + const std::string& db_log_dir, + size_t log_max_size, + size_t log_file_time_to_roll): + dbname_(dbname), + db_log_dir_(db_log_dir), + env_(env), + status_(Status::OK()), + kMaxLogFileSize(log_max_size), + kLogFileTimeToRoll(log_file_time_to_roll), + cached_now(static_cast(env_->NowMicros() * 1e-6)), + ctime_(cached_now), + cached_now_access_count(0), + call_NowMicros_every_N_records_(100), + mutex_() { + env->GetAbsolutePath(dbname, &db_absolute_path_); + log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); + RollLogFile(); + ResetLogger(); + } + + void Logv(const char* format, va_list ap); + + // check if the logger has encountered any problem. + Status GetStatus() { + return status_; + } + + size_t GetLogFileSize() const { + return logger_->GetLogFileSize(); + } + + virtual ~AutoRollLogger() { + } + + void SetCallNowMicrosEveryNRecords(uint64_t call_NowMicros_every_N_records) { + call_NowMicros_every_N_records_ = call_NowMicros_every_N_records; + } + + private: + + bool LogExpired(); + Status ResetLogger(); + void RollLogFile(); + + std::string log_fname_; // Current active info log's file name. 
+ std::string dbname_; + std::string db_log_dir_; + std::string db_absolute_path_; + Env* env_; + std::shared_ptr logger_; + // current status of the logger + Status status_; + const size_t kMaxLogFileSize; + const size_t kLogFileTimeToRoll; + // to avoid frequent env->NowMicros() calls, we cached the current time + uint64_t cached_now; + uint64_t ctime_; + uint64_t cached_now_access_count; + uint64_t call_NowMicros_every_N_records_; + port::Mutex mutex_; +}; + +// Facade to craete logger automatically +Status CreateLoggerFromOptions( + const std::string& dbname, + const std::string& db_log_dir, + Env* env, + const Options& options, + std::shared_ptr* logger); + +} // namespace rocksdb diff --git a/util/auto_roll_logger_test.cc b/util/auto_roll_logger_test.cc new file mode 100755 index 00000000..2fd2c51f --- /dev/null +++ b/util/auto_roll_logger_test.cc @@ -0,0 +1,262 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include +#include +#include "util/testharness.h" +#include "util/auto_roll_logger.h" +#include "rocksdb/db.h" +#include +#include +#include + +using namespace std; + +namespace rocksdb { + +class AutoRollLoggerTest { + public: + static void InitTestDb() { + string deleteCmd = "rm -rf " + kTestDir; + ASSERT_TRUE(system(deleteCmd.c_str()) == 0); + Env::Default()->CreateDir(kTestDir); + } + + void RollLogFileBySizeTest(AutoRollLogger* logger, + size_t log_max_size, + const string& log_message); + uint64_t RollLogFileByTimeTest(AutoRollLogger* logger, + size_t time, + const string& log_message); + + static const string kSampleMessage; + static const string kTestDir; + static const string kLogFile; + static Env* env; +}; + +const string AutoRollLoggerTest::kSampleMessage( + "this is the message to be written to the log file!!"); +const string AutoRollLoggerTest::kTestDir( + test::TmpDir() + "/db_log_test"); +const string AutoRollLoggerTest::kLogFile( + test::TmpDir() + "/db_log_test/LOG"); +Env* AutoRollLoggerTest::env = Env::Default(); + +// In this test we only want to Log some simple log message with +// no format. LogMessage() provides such a simple interface and +// avoids the [format-security] warning which occurs when you +// call Log(logger, log_message) directly. 
+void LogMessage(Logger* logger, const char* message) {
+  Log(logger, "%s", message);
+}
+
+// Stores the st_ctime that stat() reports for `fname` in *file_ctime.
+// If stat() fails (for example because the file does not exist),
+// *file_ctime is set to 0.
+void GetFileCreateTime(const std::string& fname, uint64_t* file_ctime) {
+  struct stat s;
+  if (stat(fname.c_str(), &s) != 0) {
+    *file_ctime = (uint64_t)0;
+    // `s` was not filled in; returning here keeps the assignment below from
+    // overwriting the 0 with an indeterminate value.
+    return;
+  }
+  *file_ctime = static_cast<uint64_t>(s.st_ctime);
+}
+
+void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger,
+                                               size_t log_max_size,
+                                               const string& log_message) {
+  // measure the size of each message, which is supposed
+  // to be equal or greater than log_message.size()
+  LogMessage(logger, log_message.c_str());
+  size_t message_size = logger->GetLogFileSize();
+  size_t current_log_size = message_size;
+
+  // Test the cases when the log file will not be rolled.
+  while (current_log_size + message_size < log_max_size) {
+    LogMessage(logger, log_message.c_str());
+    current_log_size += message_size;
+    ASSERT_EQ(current_log_size, logger->GetLogFileSize());
+  }
+
+  // Now the log file will be rolled
+  LogMessage(logger, log_message.c_str());
+  // Since rotation is checked before actual logging, we need to
+  // trigger the rotation by logging another message.
+  LogMessage(logger, log_message.c_str());
+
+  ASSERT_TRUE(message_size == logger->GetLogFileSize());
+}
+
+uint64_t AutoRollLoggerTest::RollLogFileByTimeTest(
+    AutoRollLogger* logger, size_t time, const string& log_message) {
+  uint64_t expected_create_time;
+  uint64_t actual_create_time;
+  uint64_t total_log_size;
+  ASSERT_OK(env->GetFileSize(kLogFile, &total_log_size));
+  GetFileCreateTime(kLogFile, &expected_create_time);
+  logger->SetCallNowMicrosEveryNRecords(0);
+
+  // -- Write to the log for several times, which is supposed
+  // to be finished before time.
+ for (int i = 0; i < 10; ++i) { + LogMessage(logger, log_message.c_str()); + ASSERT_OK(logger->GetStatus()); + // Make sure we always write to the same log file (by + // checking the create time); + GetFileCreateTime(kLogFile, &actual_create_time); + + // Also make sure the log size is increasing. + ASSERT_EQ(expected_create_time, actual_create_time); + ASSERT_GT(logger->GetLogFileSize(), total_log_size); + total_log_size = logger->GetLogFileSize(); + } + + // -- Make the log file expire + sleep(time); + LogMessage(logger, log_message.c_str()); + + // At this time, the new log file should be created. + GetFileCreateTime(kLogFile, &actual_create_time); + ASSERT_GT(actual_create_time, expected_create_time); + ASSERT_LT(logger->GetLogFileSize(), total_log_size); + expected_create_time = actual_create_time; + + return expected_create_time; +} + +TEST(AutoRollLoggerTest, RollLogFileBySize) { + InitTestDb(); + size_t log_max_size = 1024 * 5; + + AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0); + + RollLogFileBySizeTest(&logger, log_max_size, + kSampleMessage + ":RollLogFileBySize"); + +} + +TEST(AutoRollLoggerTest, RollLogFileByTime) { + size_t time = 1; + size_t log_size = 1024 * 5; + + InitTestDb(); + // -- Test the existence of file during the server restart. + ASSERT_TRUE(!env->FileExists(kLogFile)); + AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 1); + ASSERT_TRUE(env->FileExists(kLogFile)); + + RollLogFileByTimeTest(&logger, time, kSampleMessage + ":RollLogFileByTime"); +} + +TEST(AutoRollLoggerTest, + OpenLogFilesMultipleTimesWithOptionLog_max_size) { + // If only 'log_max_size' options is specified, then every time + // when rocksdb is restarted, a new empty log file will be created. + InitTestDb(); + // WORKAROUND: + // avoid complier's complaint of "comparison between signed + // and unsigned integer expressions" because literal 0 is + // treated as "singed". 
+ size_t kZero = 0; + size_t log_size = 1024; + + AutoRollLogger* logger = new AutoRollLogger( + Env::Default(), kTestDir, "", log_size, 0); + + LogMessage(logger, kSampleMessage.c_str()); + ASSERT_GT(logger->GetLogFileSize(), kZero); + delete logger; + + // reopens the log file and an empty log file will be created. + logger = new AutoRollLogger( + Env::Default(), kTestDir, "", log_size, 0); + ASSERT_EQ(logger->GetLogFileSize(), kZero); + delete logger; +} + +TEST(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) { + size_t time = 1, log_max_size = 1024 * 5; + + InitTestDb(); + + AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, time); + + // Test the ability to roll by size + RollLogFileBySizeTest( + &logger, log_max_size, + kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); + + // Test the ability to roll by Time + RollLogFileByTimeTest( &logger, time, + kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); +} + +TEST(AutoRollLoggerTest, CreateLoggerFromOptions) { + Options options; + shared_ptr logger; + + // Normal logger + ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger)); + ASSERT_TRUE(dynamic_cast(logger.get())); + + // Only roll by size + InitTestDb(); + options.max_log_file_size = 1024; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger)); + AutoRollLogger* auto_roll_logger = + dynamic_cast(logger.get()); + ASSERT_TRUE(auto_roll_logger); + RollLogFileBySizeTest( + auto_roll_logger, options.max_log_file_size, + kSampleMessage + ":CreateLoggerFromOptions - size"); + + // Only roll by Time + InitTestDb(); + options.max_log_file_size = 0; + options.log_file_time_to_roll = 1; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger)); + auto_roll_logger = + dynamic_cast(logger.get()); + RollLogFileByTimeTest( + auto_roll_logger, options.log_file_time_to_roll, + kSampleMessage + ":CreateLoggerFromOptions - time"); + + // roll by both Time and size + InitTestDb(); + 
options.max_log_file_size = 1024 * 5; + options.log_file_time_to_roll = 1; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger)); + auto_roll_logger = + dynamic_cast(logger.get()); + RollLogFileBySizeTest( + auto_roll_logger, options.max_log_file_size, + kSampleMessage + ":CreateLoggerFromOptions - both"); + RollLogFileByTimeTest( + auto_roll_logger, options.log_file_time_to_roll, + kSampleMessage + ":CreateLoggerFromOptions - both"); +} + +int OldLogFileCount(const string& dir) { + std::vector files; + Env::Default()->GetChildren(dir, &files); + int log_file_count = 0; + + for (std::vector::iterator it = files.begin(); + it != files.end(); ++it) { + uint64_t create_time; + FileType type; + if (!ParseFileName(*it, &create_time, &type)) { + continue; + } + if (type == kInfoLogFile && create_time > 0) { + ++log_file_count; + } + } + + return log_file_count; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/autovector.h b/util/autovector.h new file mode 100644 index 00000000..9998e295 --- /dev/null +++ b/util/autovector.h @@ -0,0 +1,329 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include +#include +#include +#include +#include + +namespace rocksdb { + +// A vector that leverages pre-allocated stack-based array to achieve better +// performance for array with small amount of items. +// +// The interface resembles that of vector, but with less features since we aim +// to solve the problem that we have in hand, rather than implementing a +// full-fledged generic container. 
+// +// Currently we don't support: +// * reserve()/shrink_to_fit()/resize() +// If used correctly, in most cases, people should not touch the +// underlying vector at all. +// * random insert()/erase(), please only use push_back()/pop_back(). +// * No move/swap operations. Each autovector instance has a +// stack-allocated array and if we want support move/swap operations, we +// need to copy the arrays other than just swapping the pointers. In this +// case we'll just explicitly forbid these operations since they may +// lead users to make false assumption by thinking they are inexpensive +// operations. +// +// Naming style of public methods almost follows that of the STL's. +template +class autovector { + public: + // General STL-style container member types. + typedef T value_type; + typedef typename std::vector::difference_type difference_type; + typedef typename std::vector::size_type size_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + + // This class is the base for regular/const iterator + template + class iterator_impl { + public: + // -- iterator traits + typedef iterator_impl self_type; + typedef TValueType value_type; + typedef TValueType& reference; + typedef TValueType* pointer; + typedef typename TAutoVector::difference_type difference_type; + typedef std::random_access_iterator_tag iterator_category; + + iterator_impl(TAutoVector* vect, size_t index) + : vect_(vect) + , index_(index) { + }; + iterator_impl(const iterator_impl&) = default; + ~iterator_impl() { } + iterator_impl& operator=(const iterator_impl&) = default; + + // -- Advancement + // iterator++ + self_type& operator++() { + ++index_; + return *this; + } + + // ++iterator + self_type operator++(int) { + auto old = *this; + ++index_; + return old; + } + + // iterator-- + self_type& operator--() { + --index_; + return *this; + } + + // --iterator + self_type operator--(int) { + 
auto old = *this; + --index_; + return old; + } + + self_type operator-(difference_type len) { + return self_type(vect_, index_ - len); + } + + difference_type operator-(const self_type& other) { + assert(vect_ == other.vect_); + return index_ - other.index_; + } + + self_type operator+(difference_type len) { + return self_type(vect_, index_ + len); + } + + self_type& operator+=(difference_type len) { + index_ += len; + return *this; + } + + self_type& operator-=(difference_type len) { + index_ -= len; + return *this; + } + + // -- Reference + reference operator*() { + assert(vect_->size() >= index_); + return (*vect_)[index_]; + } + pointer operator->() { + assert(vect_->size() >= index_); + return &(*vect_)[index_]; + } + + // -- Logical Operators + bool operator==(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ == other.index_; + } + + bool operator!=(const self_type& other) const { + return !(*this == other); + } + + bool operator>(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ > other.index_; + } + + bool operator<(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ < other.index_; + } + + bool operator>=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ >= other.index_; + } + + bool operator<=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ <= other.index_; + } + + private: + TAutoVector* vect_ = nullptr; + size_t index_ = 0; + }; + + typedef iterator_impl iterator; + typedef iterator_impl const_iterator; + typedef std::reverse_iterator reverse_iterator; + typedef std::reverse_iterator const_reverse_iterator; + + autovector() = default; + ~autovector() = default; + + // -- Immutable operations + // Indicate if all data resides in in-stack data structure. + bool only_in_stack() const { + // If no element was inserted at all, the vector's capacity will be `0`. 
+    return vect_.capacity() == 0;
+  }
+
+  size_type size() const {
+    return num_stack_items_ + vect_.size();
+  }
+
+  bool empty() const {
+    return size() == 0;
+  }
+
+  // will not check boundary
+  const_reference operator[](size_type n) const {
+    return n < kSize ? values_[n] : vect_[n - kSize];
+  }
+
+  reference operator[](size_type n) {
+    return n < kSize ? values_[n] : vect_[n - kSize];
+  }
+
+  // will check boundary
+  const_reference at(size_type n) const {
+    if (n >= size()) {
+      throw std::out_of_range("autovector: index out of range");
+    }
+    return (*this)[n];
+  }
+
+  reference at(size_type n) {
+    if (n >= size()) {
+      throw std::out_of_range("autovector: index out of range");
+    }
+    return (*this)[n];
+  }
+
+  reference front() {
+    assert(!empty());
+    return *begin();
+  }
+
+  const_reference front() const {
+    assert(!empty());
+    return *begin();
+  }
+
+  reference back() {
+    assert(!empty());
+    return *(end() - 1);
+  }
+
+  const_reference back() const {
+    assert(!empty());
+    return *(end() - 1);
+  }
+
+  // -- Mutable Operations
+  // Appends `item` by moving it: into the in-object array while fewer than
+  // kSize items are stored there, otherwise into the overflow vector.
+  void push_back(T&& item) {
+    if (num_stack_items_ < kSize) {
+      values_[num_stack_items_++] = std::move(item);
+    } else {
+      // `item` is a named rvalue reference and therefore an lvalue here;
+      // without std::move this branch would copy instead of move.
+      vect_.push_back(std::move(item));
+    }
+  }
+
+  // Appends a copy of `item`.
+  void push_back(const T& item) {
+    push_back(value_type(item));
+  }
+
+  // Constructs a value_type from `args` and appends it. The arguments are
+  // forwarded so that rvalue arguments are moved rather than copied.
+  template <class... Args>
+  void emplace_back(Args&&... args) {
+    push_back(value_type(std::forward<Args>(args)...));
+  }
+
+  void pop_back() {
+    assert(!empty());
+    if (!vect_.empty()) {
+      vect_.pop_back();
+    } else {
+      --num_stack_items_;
+    }
+  }
+
+  void clear() {
+    num_stack_items_ = 0;
+    vect_.clear();
+  }
+
+  // -- Copy and Assignment
+  autovector& assign(const autovector& other);
+
+  autovector(const autovector& other) {
+    assign(other);
+  }
+
+  autovector& operator=(const autovector& other) {
+    return assign(other);
+  }
+
+  // move operations are disallowed since it is very hard to make sure both
+  // autovectors are allocated from the same function stack.
+ autovector& operator=(autovector&& other) = delete; + autovector(autovector&& other) = delete; + + // -- Iterator Operations + iterator begin() { + return iterator(this, 0); + } + + const_iterator begin() const { + return const_iterator(this, 0); + } + + iterator end() { + return iterator(this, this->size()); + } + + const_iterator end() const { + return const_iterator(this, this->size()); + } + + reverse_iterator rbegin() { + return reverse_iterator(end()); + } + + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + + reverse_iterator rend() { + return reverse_iterator(begin()); + } + + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + private: + size_type num_stack_items_ = 0; // current number of items + value_type values_[kSize]; // the first `kSize` items + // used only if there are more than `kSize` items. + std::vector vect_; +}; + +template +autovector& autovector::assign(const autovector& other) { + // copy the internal vector + vect_.assign(other.vect_.begin(), other.vect_.end()); + + // copy array + num_stack_items_ = other.num_stack_items_; + std::copy(other.values_, other.values_ + num_stack_items_, values_); + + return *this; +} + +} // rocksdb diff --git a/util/autovector_test.cc b/util/autovector_test.cc new file mode 100644 index 00000000..eb244aab --- /dev/null +++ b/util/autovector_test.cc @@ -0,0 +1,290 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include +#include + +#include "rocksdb/env.h" +#include "util/autovector.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +using namespace std; + +class AutoVectorTest { }; + +const unsigned long kSize = 8; +TEST(AutoVectorTest, PushBackAndPopBack) { + autovector vec; + ASSERT_TRUE(vec.empty()); + ASSERT_EQ(0ul, vec.size()); + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.push_back(i); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + ASSERT_TRUE(vec.only_in_stack()); + } else { + ASSERT_TRUE(!vec.only_in_stack()); + } + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i]); + ASSERT_EQ(i, vec.at(i)); + } + + size_t size = vec.size(); + while (size != 0) { + vec.pop_back(); + // will always be in heap + ASSERT_TRUE(!vec.only_in_stack()); + ASSERT_EQ(--size, vec.size()); + } + + ASSERT_TRUE(vec.empty()); +} + +TEST(AutoVectorTest, EmplaceBack) { + typedef std::pair ValueType; + autovector vec; + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.emplace_back(i, std::to_string(i + 123)); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + ASSERT_TRUE(vec.only_in_stack()); + } else { + ASSERT_TRUE(!vec.only_in_stack()); + } + + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i].first); + ASSERT_EQ(std::to_string(i + 123), vec[i].second); + } + + vec.clear(); + ASSERT_TRUE(vec.empty()); + ASSERT_TRUE(!vec.only_in_stack()); +} + +void AssertEqual( + const autovector& a, const autovector& b) { + ASSERT_EQ(a.size(), b.size()); + ASSERT_EQ(a.empty(), b.empty()); + ASSERT_EQ(a.only_in_stack(), b.only_in_stack()); + for (size_t i = 0; i < a.size(); ++i) { + ASSERT_EQ(a[i], b[i]); + } +} + +TEST(AutoVectorTest, CopyAndAssignment) { + // Test both heap-allocated and stack-allocated cases. 
+ for (auto size : { kSize / 2, kSize * 1000 }) { + autovector vec; + for (size_t i = 0; i < size; ++i) { + vec.push_back(i); + } + + { + autovector other; + other = vec; + AssertEqual(other, vec); + } + + { + autovector other(vec); + AssertEqual(other, vec); + } + } +} + +TEST(AutoVectorTest, Iterators) { + autovector vec; + for (size_t i = 0; i < kSize * 1000; ++i) { + vec.push_back(std::to_string(i)); + } + + // basic operator test + ASSERT_EQ(vec.front(), *vec.begin()); + ASSERT_EQ(vec.back(), *(vec.end() - 1)); + ASSERT_TRUE(vec.begin() < vec.end()); + + // non-const iterator + size_t index = 0; + for (const auto& item : vec) { + ASSERT_EQ(vec[index++], item); + } + + index = vec.size() - 1; + for (auto pos = vec.rbegin(); pos != vec.rend(); ++pos) { + ASSERT_EQ(vec[index--], *pos); + } + + // const iterator + const auto& cvec = vec; + index = 0; + for (const auto& item : cvec) { + ASSERT_EQ(cvec[index++], item); + } + + index = vec.size() - 1; + for (auto pos = cvec.rbegin(); pos != cvec.rend(); ++pos) { + ASSERT_EQ(cvec[index--], *pos); + } + + // forward and backward + auto pos = vec.begin(); + while (pos != vec.end()) { + auto old_val = *pos; + auto old = pos++; + // HACK: make sure -> works + ASSERT_TRUE(!old->empty()); + ASSERT_EQ(old_val, *old); + ASSERT_TRUE(pos == vec.end() || old_val != *pos); + } + + pos = vec.begin(); + for (size_t i = 0; i < vec.size(); i += 2) { + // Cannot use ASSERT_EQ since that macro depends on iostream serialization + ASSERT_TRUE(pos + 2 - 2 == pos); + pos += 2; + ASSERT_TRUE(pos >= vec.begin()); + ASSERT_TRUE(pos <= vec.end()); + + size_t diff = static_cast(pos - vec.begin()); + ASSERT_EQ(i + 2, diff); + } +} + +vector GetTestKeys(size_t size) { + vector keys; + keys.resize(size); + + int index = 0; + for (auto& key : keys) { + key = "item-" + to_string(index++); + } + return keys; +} + +template +void BenchmarkVectorCreationAndInsertion( + string name, size_t ops, size_t item_size, + const std::vector& items) { + auto env 
= Env::Default(); + + int index = 0; + auto start_time = env->NowNanos(); + auto ops_remaining = ops; + while(ops_remaining--) { + TVector v; + for (size_t i = 0; i < item_size; ++i) { + v.push_back(items[index++]); + } + } + auto elapsed = env->NowNanos() - start_time; + cout << "created " << ops << " " << name << " instances:\n\t" + << "each was inserted with " << item_size << " elements\n\t" + << "total time elapsed: " << elapsed << " (ns)" << endl; +} + +template +size_t BenchmarkSequenceAccess(string name, size_t ops, size_t elem_size) { + TVector v; + for (const auto& item : GetTestKeys(elem_size)) { + v.push_back(item); + } + auto env = Env::Default(); + + auto ops_remaining = ops; + auto start_time = env->NowNanos(); + size_t total = 0; + while (ops_remaining--) { + auto end = v.end(); + for (auto pos = v.begin(); pos != end; ++pos) { + total += pos->size(); + } + } + auto elapsed = env->NowNanos() - start_time; + cout << "performed " << ops << " sequence access against " << name << "\n\t" + << "size: " << elem_size << "\n\t" + << "total time elapsed: " << elapsed << " (ns)" << endl; + // HACK avoid compiler's optimization to ignore total + return total; +} + +// This test case only reports the performance between std::vector +// and autovector. We chose string for comparison because in most +// o our use cases we used std::vector. +TEST(AutoVectorTest, PerfBench) { + // We run same operations for kOps times in order to get a more fair result. + size_t kOps = 100000; + + // Creation and insertion test + // Test the case when there is: + // * no element inserted: internal array of std::vector may not really get + // initialize. + // * one element inserted: internal array of std::vector must have + // initialized. + // * kSize elements inserted. This shows the most time we'll spend if we + // keep everything in stack. + // * 2 * kSize elements inserted. The internal vector of + // autovector must have been initialized. 
+ cout << "=====================================================" << endl; + cout << "Creation and Insertion Test (value type: std::string)" << endl; + cout << "=====================================================" << endl; + + // pre-generated unique keys + auto string_keys = GetTestKeys(kOps * 2 * kSize); + for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) { + BenchmarkVectorCreationAndInsertion>( + "vector", kOps, insertions, string_keys + ); + BenchmarkVectorCreationAndInsertion>( + "autovector", kOps, insertions, string_keys + ); + cout << "-----------------------------------" << endl; + } + + cout << "=====================================================" << endl; + cout << "Creation and Insertion Test (value type: uint64_t)" << endl; + cout << "=====================================================" << endl; + + // pre-generated unique keys + vector int_keys(kOps * 2 * kSize); + for (size_t i = 0; i < kOps * 2 * kSize; ++i) { + int_keys[i] = i; + } + for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) { + BenchmarkVectorCreationAndInsertion>( + "vector", kOps, insertions, int_keys + ); + BenchmarkVectorCreationAndInsertion>( + "autovector", kOps, insertions, int_keys + ); + cout << "-----------------------------------" << endl; + } + + // Sequence Access Test + cout << "=====================================================" << endl; + cout << "Sequence Access Test" << endl; + cout << "=====================================================" << endl; + for (auto elem_size : { kSize / 2, kSize, 2 * kSize }) { + BenchmarkSequenceAccess>( + "vector", kOps, elem_size + ); + BenchmarkSequenceAccess>( + "autovector", kOps, elem_size + ); + cout << "-----------------------------------" << endl; + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/bit_set.h b/util/bit_set.h new file mode 100644 index 00000000..01727060 --- /dev/null +++ b/util/bit_set.h @@ -0,0 
+1,83 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include <cassert>
+#include <cstddef>
+#include <vector>
+
+namespace rocksdb {
+
+// Fixed-size set of bits stored in a vector of unsigned long words.
+// The number of bits is set at construction and never changes.
+class BitSet {
+ public:
+  /**
+   * Create a bit set of numBits, with the bits set to either true or false.
+   */
+  explicit BitSet(size_t numBits, bool initial=false)
+    : numBits_(numBits),
+      data_(numWords(), initial ? ~0UL : 0UL) {
+  }
+
+  /**
+   * Set bit b to 1. Requires b < size().
+   */
+  void set(size_t b) {
+    assert(b < numBits_);
+    data_[word(b)] |= wordOffsetMask(b);
+  }
+
+  /**
+   * Set bit b to 0. Requires b < size().
+   */
+  void reset(size_t b) {
+    assert(b < numBits_);
+    data_[word(b)] &= ~wordOffsetMask(b);
+  }
+
+  /**
+   * Get a bit. Requires b < size().
+   */
+  bool test(size_t b) const {
+    assert(b < numBits_);
+    return (data_[word(b)] & wordOffsetMask(b)) != 0;
+  }
+
+  /**
+   * Return the size of the BitSet, in bits.
+   */
+  size_t size() const {
+    return numBits_;
+  }
+
+ private:
+
+  // Number of words needed to hold numBits_ bits.
+  inline size_t numWords() const {
+    if (numBits_ == 0) return 0;
+    return 1 + (numBits_-1) / (8*sizeof(unsigned long));
+  }
+  // Index of the word that holds bit b. The helpers take size_t so that an
+  // index coming from set()/reset()/test() is never narrowed to int.
+  inline static size_t word(size_t b) {
+    return b / (8*sizeof(unsigned long));
+  }
+  // Position of bit b within its word.
+  inline static size_t wordOffset(size_t b) {
+    return b % (8*sizeof(unsigned long));
+  }
+  // Mask with only the position of bit b set.
+  inline static unsigned long wordOffsetMask(size_t b) {
+    return 1UL << wordOffset(b);
+  }
+
+  // numBits_ is declared before data_ because data_'s initializer calls
+  // numWords(), which reads numBits_.
+  size_t numBits_;
+  std::vector<unsigned long> data_;
+};
+
+}  // namespace rocksdb
diff --git a/util/blob_store.cc b/util/blob_store.cc
new file mode 100644
index 00000000..9f067128
--- /dev/null
+++ b/util/blob_store.cc
@@ -0,0 +1,264 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/blob_store.h"
+
+namespace rocksdb {
+
+using namespace std;
+
+// BlobChunk
+bool BlobChunk::ImmediatelyBefore(const BlobChunk& chunk) const {
+  // overlapping!?
+  assert(!Overlap(chunk));
+  // size == 0 is a marker, not a block
+  return size != 0 &&
+    bucket_id == chunk.bucket_id &&
+    offset + size == chunk.offset;
+}
+
+bool BlobChunk::Overlap(const BlobChunk &chunk) const {
+  return size != 0 && chunk.size != 0 && bucket_id == chunk.bucket_id &&
+    ((offset >= chunk.offset && offset < chunk.offset + chunk.size) ||
+     (chunk.offset >= offset && chunk.offset < offset + size));
+}
+
+// Blob
+string Blob::ToString() const {
+  string ret;
+  for (auto chunk : chunks) {
+    PutFixed32(&ret, chunk.bucket_id);
+    PutFixed32(&ret, chunk.offset);
+    PutFixed32(&ret, chunk.size);
+  }
+  return ret;
+}
+
+// Parses the encoding written by ToString(): a sequence of fixed32 values,
+// three per chunk (bucket_id, offset, size). If the last chunk is cut short
+// on a fixed32 boundary its missing fields stay 0; leftover bytes that do
+// not form a whole fixed32 are ignored.
+Blob::Blob(const std::string& blob) {
+  const size_t kWordSize = sizeof(uint32_t);
+  size_t i = 0;
+  // Loop only while a whole fixed32 remains. With an `i < blob.size()`
+  // condition, 1-3 trailing bytes would never be consumed by the inner loop
+  // and this loop would append chunks forever.
+  while (i + kWordSize <= blob.size()) {
+    uint32_t t[3] = {0};
+    for (int j = 0; j < 3 && i + kWordSize <= blob.size();
+         ++j, i += kWordSize) {
+      t[j] = DecodeFixed32(blob.data() + i);
+    }
+    chunks.push_back(BlobChunk(t[0], t[1], t[2]));
+  }
+}
+
+// FreeList
+Status FreeList::Free(const Blob& blob) {
+  // add it back to the free list
+  for (auto chunk : blob.chunks) {
+    free_blocks_ += chunk.size;
+    if (fifo_free_chunks_.size() &&
+        fifo_free_chunks_.back().ImmediatelyBefore(chunk)) {
+      fifo_free_chunks_.back().size += chunk.size;
+    } else {
+      fifo_free_chunks_.push_back(chunk);
+    }
+  }
+
+  return Status::OK();
+}
+
+Status FreeList::Allocate(uint32_t blocks, Blob* blob) {
+  if (free_blocks_ < blocks) {
+    return Status::Incomplete("");
+  }
+
+  blob->chunks.clear();
+  free_blocks_ -= blocks;
+
+  while (blocks > 0) {
+    assert(fifo_free_chunks_.size() > 0);
+    auto& front = fifo_free_chunks_.front();
+    if (front.size > blocks) {
+      blob->chunks.push_back(BlobChunk(front.bucket_id, front.offset, blocks));
+      front.offset += blocks;
+      front.size -= blocks;
+      blocks = 0;
+    } else {
+
blob->chunks.push_back(front);
+      blocks -= front.size;
+      fifo_free_chunks_.pop_front();
+    }
+  }
+  assert(blocks == 0);
+
+  return Status::OK();
+}
+
+bool FreeList::Overlap(const Blob &blob) const {
+  for (auto chunk : blob.chunks) {
+    for (auto itr = fifo_free_chunks_.begin();
+         itr != fifo_free_chunks_.end();
+         ++itr) {
+      if (itr->Overlap(chunk)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// BlobStore
+BlobStore::BlobStore(const string& directory,
+                     uint64_t block_size,
+                     uint32_t blocks_per_bucket,
+                     uint32_t max_buckets,
+                     Env* env) :
+    directory_(directory),
+    block_size_(block_size),
+    blocks_per_bucket_(blocks_per_bucket),
+    env_(env),
+    max_buckets_(max_buckets) {
+  env_->CreateDirIfMissing(directory_);
+
+  storage_options_.use_mmap_writes = false;
+  storage_options_.use_mmap_reads = false;
+
+  buckets_size_ = 0;
+  buckets_ = new unique_ptr[max_buckets_];
+
+  CreateNewBucket();
+}
+
+BlobStore::~BlobStore() {
+  // TODO we don't care about recovery for now
+  delete [] buckets_;
+}
+
+// Allocates enough blocks for `value`, records them in *blob and writes the
+// data chunk by chunk. The unused tail of a chunk is filled with zeros.
+// If any write fails, the blob is handed to Delete() and the failing status
+// is returned.
+Status BlobStore::Put(const Slice& value, Blob* blob) {
+  // convert size to number of blocks
+  Status s = Allocate((value.size() + block_size_ - 1) / block_size_, blob);
+  if (!s.ok()) {
+    return s;
+  }
+  auto size_left = (uint64_t) value.size();
+
+  uint64_t offset = 0; // in bytes, not blocks
+  for (auto chunk : blob->chunks) {
+    uint64_t write_size = min(chunk.size * block_size_, size_left);
+    assert(chunk.bucket_id < buckets_size_);
+    s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_,
+                                               Slice(value.data() + offset,
+                                                     write_size));
+    if (!s.ok()) {
+      Delete(*blob);
+      return s;
+    }
+    offset += write_size;
+    size_left -= write_size;
+    if (write_size < chunk.size * block_size_) {
+      // if we have any space left in the block, fill it up with zeros
+      string zero_string(chunk.size * block_size_ - write_size, 0);
+      s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_ +
+                                                    write_size,
+                                                 Slice(zero_string));
+      if (!s.ok()) {
+        // Get() relies on this zero padding to find the end of the value,
+        // so a failed padding write must fail the Put as well.
+        Delete(*blob);
+        return s;
+      }
+    }
+  }
+
+  if (size_left > 0) {
+    Delete(*blob);
+    return Status::IOError("Tried to write more data than fits in the blob");
+  }
+
+  return Status::OK();
+}
+
+Status BlobStore::Get(const Blob& blob,
+                      string* value) const {
+  {
+    // assert that it doesn't overlap with free list
+    // it will get compiled out for release
+    MutexLock l(&free_list_mutex_);
+    assert(!free_list_.Overlap(blob));
+  }
+
+  value->resize(blob.Size() * block_size_);
+
+  uint64_t offset = 0; // in bytes, not blocks
+  for (auto chunk : blob.chunks) {
+    Slice result;
+    assert(chunk.bucket_id < buckets_size_);
+    Status s;
+    s = buckets_[chunk.bucket_id].get()->Read(chunk.offset * block_size_,
+                                              chunk.size * block_size_,
+                                              &result,
+                                              &value->at(offset));
+    if (!s.ok() || result.size() < chunk.size * block_size_) {
+      value->clear();
+      return Status::IOError("Could not read in from file");
+    }
+    offset += chunk.size * block_size_;
+  }
+
+  // Drop the zero padding that Put() wrote after the value.
+  // NOTE(review): this cuts at the FIRST '\0' in the buffer, so a value that
+  // itself contains a '\0' byte comes back truncated.
+  value->erase(find(value->begin(), value->end(), '\0'), value->end());
+
+  return Status::OK();
+}
+
+Status BlobStore::Delete(const Blob& blob) {
+  MutexLock l(&free_list_mutex_);
+  return free_list_.Free(blob);
+}
+
+Status BlobStore::Sync() {
+  for (size_t i = 0; i < buckets_size_; ++i) {
+    Status s = buckets_[i].get()->Sync();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+
+Status BlobStore::Allocate(uint32_t blocks, Blob* blob) {
+  MutexLock l(&free_list_mutex_);
+  Status s;
+
+  s = free_list_.Allocate(blocks, blob);
+  if (!s.ok()) {
+    s = CreateNewBucket();
+    if (!s.ok()) {
+      return s;
+    }
+    s = free_list_.Allocate(blocks, blob);
+  }
+
+  return s;
+}
+
+// Opens the file "<directory_>/<id>.bs" as the next bucket and adds all of
+// its blocks to the free list. Fails once max_buckets_ buckets exist.
+// called with free_list_mutex_ held
+Status BlobStore::CreateNewBucket() {
+  MutexLock l(&buckets_mutex_);
+
+  if (buckets_size_ >= max_buckets_) {
+    return Status::IOError("Max size exceeded\n");
+  }
+
+  int new_bucket_id = buckets_size_;
+
+  // Build the path as a string: formatting into a fixed 200-byte buffer
+  // with sprintf would overflow for a long directory_ name.
+  string fname = directory_ + "/" + std::to_string(new_bucket_id) + ".bs";
+
+  Status s = env_->NewRandomRWFile(string(fname),
+
&buckets_[new_bucket_id], + storage_options_); + if (!s.ok()) { + return s; + } + + // whether Allocate succeeds or not, does not affect the overall correctness + // of this function - calling Allocate is really optional + // (also, tmpfs does not support allocate) + buckets_[new_bucket_id].get()->Allocate(0, block_size_ * blocks_per_bucket_); + + buckets_size_ = new_bucket_id + 1; + + return free_list_.Free(Blob(new_bucket_id, 0, blocks_per_bucket_)); +} + +} // namespace rocksdb diff --git a/util/blob_store.h b/util/blob_store.h new file mode 100644 index 00000000..0a81d01d --- /dev/null +++ b/util/blob_store.h @@ -0,0 +1,161 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "port/port.h" +#include "util/mutexlock.h" +#include "util/coding.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace rocksdb { + +struct BlobChunk { + uint32_t bucket_id; + uint32_t offset; // in blocks + uint32_t size; // in blocks + BlobChunk() {} + BlobChunk(uint32_t bucket_id, uint32_t offset, uint32_t size) : + bucket_id(bucket_id), offset(offset), size(size) {} + + // returns true if it's immediately before chunk + bool ImmediatelyBefore(const BlobChunk& chunk) const; + // returns true if chunks overlap + bool Overlap(const BlobChunk &chunk) const; +}; + +// We represent each Blob as a string in format: +// bucket_id offset size|bucket_id offset size... +// The string can be used to reference the Blob stored on external +// device/file +// Not thread-safe! 
+struct Blob { + // Generates the string + std::string ToString() const; + // Parses the previously generated string + explicit Blob(const std::string& blob); + // Creates unfragmented Blob + Blob(uint32_t bucket_id, uint32_t offset, uint32_t size) { + SetOneChunk(bucket_id, offset, size); + } + Blob() {} + + void SetOneChunk(uint32_t bucket_id, uint32_t offset, uint32_t size) { + chunks.clear(); + chunks.push_back(BlobChunk(bucket_id, offset, size)); + } + + uint32_t Size() const { // in blocks + uint32_t ret = 0; + for (auto chunk : chunks) { + ret += chunk.size; + } + assert(ret > 0); + return ret; + } + + // bucket_id, offset, size + std::vector chunks; +}; + +// Keeps a list of free chunks +// NOT thread-safe. Externally synchronized +class FreeList { + public: + FreeList() : + free_blocks_(0) {} + ~FreeList() {} + + // Allocates a a blob. Stores the allocated blob in + // 'blob'. Returns non-OK status if it failed to allocate. + // Thread-safe + Status Allocate(uint32_t blocks, Blob* blob); + // Frees the blob for reuse. Thread-safe + Status Free(const Blob& blob); + + // returns true if blob is overlapping with any of the + // chunks stored in free list + bool Overlap(const Blob &blob) const; + + private: + std::deque fifo_free_chunks_; + uint32_t free_blocks_; + mutable port::Mutex mutex_; +}; + +// thread-safe +class BlobStore { + public: + // directory - wherever the blobs should be stored. It will be created + // if missing + // block_size - self explanatory + // blocks_per_bucket - how many blocks we want to keep in one bucket. + // Bucket is a device or a file that we use to store the blobs. + // If we don't have enough blocks to allocate a new blob, we will + // try to create a new file or device. 
+ // max_buckets - maximum number of buckets BlobStore will create + // BlobStore max size in bytes is + // max_buckets * blocks_per_bucket * block_size + // env - env for creating new files + BlobStore(const std::string& directory, + uint64_t block_size, + uint32_t blocks_per_bucket, + uint32_t max_buckets, + Env* env); + ~BlobStore(); + + // Allocates space for value.size bytes (rounded up to be multiple of + // block size) and writes value.size bytes from value.data to a backing store. + // Sets Blob blob that can than be used for addressing the + // stored value. Returns non-OK status on error. + Status Put(const Slice& value, Blob* blob); + // Value needs to have enough space to store all the loaded stuff. + // This function is thread safe! + Status Get(const Blob& blob, std::string* value) const; + // Frees the blob for reuse, but does not delete the data + // on the backing store. + Status Delete(const Blob& blob); + // Sync all opened files that are modified + Status Sync(); + + private: + const std::string directory_; + // block_size_ is uint64_t because when we multiply with + // blocks_size_ we want the result to be uint64_t or + // we risk overflowing + const uint64_t block_size_; + const uint32_t blocks_per_bucket_; + Env* env_; + EnvOptions storage_options_; + // protected by free_list_mutex_ + FreeList free_list_; + // free_list_mutex_ is locked BEFORE buckets_mutex_ + mutable port::Mutex free_list_mutex_; + // protected by buckets_mutex_ + // array of buckets + unique_ptr* buckets_; + // number of buckets in the array + uint32_t buckets_size_; + uint32_t max_buckets_; + mutable port::Mutex buckets_mutex_; + + // Calls FreeList allocate. 
If free list can't allocate + // new blob, creates new bucket and tries again + // Thread-safe + Status Allocate(uint32_t blocks, Blob* blob); + + // Creates a new backing store and adds all the blocks + // from the new backing store to the free list + Status CreateNewBucket(); +}; + +} // namespace rocksdb diff --git a/util/blob_store_test.cc b/util/blob_store_test.cc new file mode 100644 index 00000000..f199f5dd --- /dev/null +++ b/util/blob_store_test.cc @@ -0,0 +1,200 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/blob_store.h" + +#include "util/testharness.h" +#include "util/testutil.h" +#include "util/random.h" + +#include +#include + +namespace rocksdb { + +using namespace std; + +class BlobStoreTest { }; + +TEST(BlobStoreTest, RangeParseTest) { + Blob e; + for (int i = 0; i < 5; ++i) { + e.chunks.push_back(BlobChunk(rand(), rand(), rand())); + } + string x = e.ToString(); + Blob nx(x); + + ASSERT_EQ(nx.ToString(), x); +} + +// make sure we're reusing the freed space +TEST(BlobStoreTest, SanityTest) { + const uint64_t block_size = 10; + const uint32_t blocks_per_file = 20; + Random random(5); + + BlobStore blob_store(test::TmpDir() + "/blob_store_test", + block_size, + blocks_per_file, + 1000, + Env::Default()); + + string buf; + + // put string of size 170 + test::RandomString(&random, 170, &buf); + Blob r1; + ASSERT_OK(blob_store.Put(Slice(buf), &r1)); + // use the first file + for (size_t i = 0; i < r1.chunks.size(); ++i) { + ASSERT_EQ(r1.chunks[0].bucket_id, 0u); + } + + // put string of size 30 + test::RandomString(&random, 30, &buf); + Blob r2; + ASSERT_OK(blob_store.Put(Slice(buf), &r2)); + // use the first file + for (size_t i = 0; i < r2.chunks.size(); ++i) { + ASSERT_EQ(r2.chunks[0].bucket_id, 0u); + } + + // delete blob of size 170 + 
ASSERT_OK(blob_store.Delete(r1)); + + // put a string of size 100 + test::RandomString(&random, 100, &buf); + Blob r3; + ASSERT_OK(blob_store.Put(Slice(buf), &r3)); + // use the first file + for (size_t i = 0; i < r3.chunks.size(); ++i) { + ASSERT_EQ(r3.chunks[0].bucket_id, 0u); + } + + // put a string of size 70 + test::RandomString(&random, 70, &buf); + Blob r4; + ASSERT_OK(blob_store.Put(Slice(buf), &r4)); + // use the first file + for (size_t i = 0; i < r4.chunks.size(); ++i) { + ASSERT_EQ(r4.chunks[0].bucket_id, 0u); + } + + // put a string of size 5 + test::RandomString(&random, 5, &buf); + Blob r5; + ASSERT_OK(blob_store.Put(Slice(buf), &r5)); + // now you get to use the second file + for (size_t i = 0; i < r5.chunks.size(); ++i) { + ASSERT_EQ(r5.chunks[0].bucket_id, 1u); + } +} + +TEST(BlobStoreTest, FragmentedChunksTest) { + const uint64_t block_size = 10; + const uint32_t blocks_per_file = 20; + Random random(5); + + BlobStore blob_store(test::TmpDir() + "/blob_store_test", + block_size, + blocks_per_file, + 1000, + Env::Default()); + + string buf; + + vector r(4); + + // put 4 strings of size 50 + for (int k = 0; k < 4; ++k) { + test::RandomString(&random, 50, &buf); + ASSERT_OK(blob_store.Put(Slice(buf), &r[k])); + // use the first file + for (size_t i = 0; i < r[k].chunks.size(); ++i) { + ASSERT_EQ(r[k].chunks[0].bucket_id, 0u); + } + } + + // delete the first and third + ASSERT_OK(blob_store.Delete(r[0])); + ASSERT_OK(blob_store.Delete(r[2])); + + // put string of size 100. 
it should reuse space that we deleting + // by deleting first and third strings of size 50 + test::RandomString(&random, 100, &buf); + Blob r2; + ASSERT_OK(blob_store.Put(Slice(buf), &r2)); + // use the first file + for (size_t i = 0; i < r2.chunks.size(); ++i) { + ASSERT_EQ(r2.chunks[0].bucket_id, 0u); + } +} + +TEST(BlobStoreTest, CreateAndStoreTest) { + const uint64_t block_size = 10; + const uint32_t blocks_per_file = 1000; + const int max_blurb_size = 300; + Random random(5); + + BlobStore blob_store(test::TmpDir() + "/blob_store_test", + block_size, + blocks_per_file, + 10000, + Env::Default()); + vector> ranges; + + for (int i = 0; i < 2000; ++i) { + int decision = rand() % 5; + if (decision <= 2 || ranges.size() == 0) { + string buf; + int size_blocks = (rand() % max_blurb_size + 1); + int string_size = size_blocks * block_size - (rand() % block_size); + test::RandomString(&random, string_size, &buf); + Blob r; + ASSERT_OK(blob_store.Put(Slice(buf), &r)); + ranges.push_back(make_pair(r, buf)); + } else if (decision == 3) { + int ti = rand() % ranges.size(); + string out_buf; + ASSERT_OK(blob_store.Get(ranges[ti].first, &out_buf)); + ASSERT_EQ(ranges[ti].second, out_buf); + } else { + int ti = rand() % ranges.size(); + ASSERT_OK(blob_store.Delete(ranges[ti].first)); + ranges.erase(ranges.begin() + ti); + } + } + ASSERT_OK(blob_store.Sync()); +} + +TEST(BlobStoreTest, MaxSizeTest) { + const uint64_t block_size = 10; + const uint32_t blocks_per_file = 100; + const int max_buckets = 10; + Random random(5); + + BlobStore blob_store(test::TmpDir() + "/blob_store_test", + block_size, + blocks_per_file, + max_buckets, + Env::Default()); + string buf; + for (int i = 0; i < max_buckets; ++i) { + test::RandomString(&random, 1000, &buf); + Blob r; + ASSERT_OK(blob_store.Put(Slice(buf), &r)); + } + + test::RandomString(&random, 1000, &buf); + Blob r; + // should fail because max size + Status s = blob_store.Put(Slice(buf), &r); + ASSERT_EQ(s.ok(), false); +} + +} // 
namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/bloom.cc b/util/bloom.cc new file mode 100644 index 00000000..78ae04a2 --- /dev/null +++ b/util/bloom.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/filter_policy.h" + +#include "rocksdb/slice.h" +#include "util/hash.h" + +namespace rocksdb { + +namespace { +static uint32_t BloomHash(const Slice& key) { + return Hash(key.data(), key.size(), 0xbc9f1d34); +} + +class BloomFilterPolicy : public FilterPolicy { + private: + size_t bits_per_key_; + size_t k_; + uint32_t (*hash_func_)(const Slice& key); + + void initialize() { + // We intentionally round down to reduce probing cost a little bit + k_ = static_cast(bits_per_key_ * 0.69); // 0.69 =~ ln(2) + if (k_ < 1) k_ = 1; + if (k_ > 30) k_ = 30; + } + + public: + explicit BloomFilterPolicy(int bits_per_key, + uint32_t (*hash_func)(const Slice& key)) + : bits_per_key_(bits_per_key), hash_func_(hash_func) { + initialize(); + } + explicit BloomFilterPolicy(int bits_per_key) + : bits_per_key_(bits_per_key) { + hash_func_ = BloomHash; + initialize(); + } + + virtual const char* Name() const { + return "rocksdb.BuiltinBloomFilter"; + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + // Compute bloom filter size (in both bits and bytes) + size_t bits = n * bits_per_key_; + + // For small n, we can see a very high false positive rate. 
Fix it + // by enforcing a minimum bloom filter length. + if (bits < 64) bits = 64; + + size_t bytes = (bits + 7) / 8; + bits = bytes * 8; + + const size_t init_size = dst->size(); + dst->resize(init_size + bytes, 0); + dst->push_back(static_cast(k_)); // Remember # of probes in filter + char* array = &(*dst)[init_size]; + for (size_t i = 0; i < (size_t)n; i++) { + // Use double-hashing to generate a sequence of hash values. + // See analysis in [Kirsch,Mitzenmacher 2006]. + uint32_t h = hash_func_(keys[i]); + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (size_t j = 0; j < k_; j++) { + const uint32_t bitpos = h % bits; + array[bitpos/8] |= (1 << (bitpos % 8)); + h += delta; + } + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const { + const size_t len = bloom_filter.size(); + if (len < 2) return false; + + const char* array = bloom_filter.data(); + const size_t bits = (len - 1) * 8; + + // Use the encoded k so that we can read filters generated by + // bloom filters created using different parameters. + const size_t k = array[len-1]; + if (k > 30) { + // Reserved for potentially new encodings for short bloom filters. + // Consider it a match. + return true; + } + + uint32_t h = hash_func_(key); + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (size_t j = 0; j < k; j++) { + const uint32_t bitpos = h % bits; + if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false; + h += delta; + } + return true; + } +}; +} + +const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) { + return new BloomFilterPolicy(bits_per_key); +} + +} // namespace rocksdb diff --git a/util/bloom_test.cc b/util/bloom_test.cc new file mode 100644 index 00000000..9dbd5d2c --- /dev/null +++ b/util/bloom_test.cc @@ -0,0 +1,164 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/filter_policy.h" + +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +static const int kVerbose = 1; + +static Slice Key(int i, char* buffer) { + memcpy(buffer, &i, sizeof(i)); + return Slice(buffer, sizeof(i)); +} + +class BloomTest { + private: + const FilterPolicy* policy_; + std::string filter_; + std::vector keys_; + + public: + BloomTest() : policy_(NewBloomFilterPolicy(10)) { } + + ~BloomTest() { + delete policy_; + } + + void Reset() { + keys_.clear(); + filter_.clear(); + } + + void Add(const Slice& s) { + keys_.push_back(s.ToString()); + } + + void Build() { + std::vector key_slices; + for (size_t i = 0; i < keys_.size(); i++) { + key_slices.push_back(Slice(keys_[i])); + } + filter_.clear(); + policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_); + keys_.clear(); + if (kVerbose >= 2) DumpFilter(); + } + + size_t FilterSize() const { + return filter_.size(); + } + + void DumpFilter() { + fprintf(stderr, "F("); + for (size_t i = 0; i+1 < filter_.size(); i++) { + const unsigned int c = static_cast(filter_[i]); + for (int j = 0; j < 8; j++) { + fprintf(stderr, "%c", (c & (1 <KeyMayMatch(s, filter_); + } + + double FalsePositiveRate() { + char buffer[sizeof(int)]; + int result = 0; + for (int i = 0; i < 10000; i++) { + if (Matches(Key(i + 1000000000, buffer))) { + result++; + } + } + return result / 10000.0; + } +}; + +TEST(BloomTest, EmptyFilter) { + ASSERT_TRUE(! Matches("hello")); + ASSERT_TRUE(! 
Matches("world")); +} + +TEST(BloomTest, Small) { + Add("hello"); + Add("world"); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + ASSERT_TRUE(! Matches("x")); + ASSERT_TRUE(! Matches("foo")); +} + +static int NextLength(int length) { + if (length < 10) { + length += 1; + } else if (length < 100) { + length += 10; + } else if (length < 1000) { + length += 100; + } else { + length += 1000; + } + return length; +} + +TEST(BloomTest, VaryingLengths) { + char buffer[sizeof(int)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + + for (int length = 1; length <= 10000; length = NextLength(length)) { + Reset(); + for (int i = 0; i < length; i++) { + Add(Key(i, buffer)); + } + Build(); + + ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 40)) << length; + + // All added keys must match + for (int i = 0; i < length; i++) { + ASSERT_TRUE(Matches(Key(i, buffer))) + << "Length " << length << "; key " << i; + } + + // Check false positive rate + double rate = FalsePositiveRate(); + if (kVerbose >= 1) { + fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n", + rate*100.0, length, static_cast(FilterSize())); + } + ASSERT_LE(rate, 0.02); // Must not be over 2% + if (rate > 0.0125) mediocre_filters++; // Allowed, but not too often + else good_filters++; + } + if (kVerbose >= 1) { + fprintf(stderr, "Filters: %d good, %d mediocre\n", + good_filters, mediocre_filters); + } + ASSERT_LE(mediocre_filters, good_filters/5); +} + +// Different bits-per-byte + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/build_version.h b/util/build_version.h new file mode 100644 index 00000000..3d2ed291 --- /dev/null +++ b/util/build_version.h @@ -0,0 +1,13 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +// these variables tell us about the git config and time +extern const char* rocksdb_build_git_sha; + +// these variables tell us when the compilation occurred +extern const char* rocksdb_build_compile_time; +extern const char* rocksdb_build_compile_date; + diff --git a/util/cache.cc b/util/cache.cc new file mode 100644 index 00000000..8fa03626 --- /dev/null +++ b/util/cache.cc @@ -0,0 +1,434 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include +#include + +#include "rocksdb/cache.h" +#include "port/port.h" +#include "util/hash.h" +#include "util/mutexlock.h" + +namespace rocksdb { + +Cache::~Cache() { +} + +namespace { + +// LRU cache implementation + +// An entry is a variable length heap-allocated structure. Entries +// are kept in a circular doubly linked list ordered by access time. +struct LRUHandle { + void* value; + void (*deleter)(const Slice&, void* value); + LRUHandle* next_hash; + LRUHandle* next; + LRUHandle* prev; + size_t charge; // TODO(opt): Only allow uint32_t? 
+ size_t key_length; + uint32_t refs; + uint32_t hash; // Hash of key(); used for fast sharding and comparisons + char key_data[1]; // Beginning of key + + Slice key() const { + // For cheaper lookups, we allow a temporary Handle object + // to store a pointer to a key in "value". + if (next == this) { + return *(reinterpret_cast(value)); + } else { + return Slice(key_data, key_length); + } + } +}; + +// We provide our own simple hash table since it removes a whole bunch +// of porting hacks and is also faster than some of the built-in hash +// table implementations in some of the compiler/runtime combinations +// we have tested. E.g., readrandom speeds up by ~5% over the g++ +// 4.4.3's builtin hashtable. +class HandleTable { + public: + HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); } + ~HandleTable() { delete[] list_; } + + LRUHandle* Lookup(const Slice& key, uint32_t hash) { + return *FindPointer(key, hash); + } + + LRUHandle* Insert(LRUHandle* h) { + LRUHandle** ptr = FindPointer(h->key(), h->hash); + LRUHandle* old = *ptr; + h->next_hash = (old == nullptr ? nullptr : old->next_hash); + *ptr = h; + if (old == nullptr) { + ++elems_; + if (elems_ > length_) { + // Since each cache entry is fairly large, we aim for a small + // average linked list length (<= 1). + Resize(); + } + } + return old; + } + + LRUHandle* Remove(const Slice& key, uint32_t hash) { + LRUHandle** ptr = FindPointer(key, hash); + LRUHandle* result = *ptr; + if (result != nullptr) { + *ptr = result->next_hash; + --elems_; + } + return result; + } + + private: + // The table consists of an array of buckets where each bucket is + // a linked list of cache entries that hash into the bucket. + uint32_t length_; + uint32_t elems_; + LRUHandle** list_; + + // Return a pointer to slot that points to a cache entry that + // matches key/hash. If there is no such cache entry, return a + // pointer to the trailing slot in the corresponding linked list. 
+ LRUHandle** FindPointer(const Slice& key, uint32_t hash) { + LRUHandle** ptr = &list_[hash & (length_ - 1)]; + while (*ptr != nullptr && + ((*ptr)->hash != hash || key != (*ptr)->key())) { + ptr = &(*ptr)->next_hash; + } + return ptr; + } + + void Resize() { + uint32_t new_length = 16; + while (new_length < elems_ * 1.5) { + new_length *= 2; + } + LRUHandle** new_list = new LRUHandle*[new_length]; + memset(new_list, 0, sizeof(new_list[0]) * new_length); + uint32_t count = 0; + for (uint32_t i = 0; i < length_; i++) { + LRUHandle* h = list_[i]; + while (h != nullptr) { + LRUHandle* next = h->next_hash; + uint32_t hash = h->hash; + LRUHandle** ptr = &new_list[hash & (new_length - 1)]; + h->next_hash = *ptr; + *ptr = h; + h = next; + count++; + } + } + assert(elems_ == count); + delete[] list_; + list_ = new_list; + length_ = new_length; + } +}; + +// A single shard of sharded cache. +class LRUCache { + public: + LRUCache(); + ~LRUCache(); + + // Separate from constructor so caller can easily make an array of LRUCache + void SetCapacity(size_t capacity) { capacity_ = capacity; } + void SetRemoveScanCountLimit(size_t remove_scan_count_limit) { + remove_scan_count_limit_ = remove_scan_count_limit; + } + + // Like Cache methods, but with an extra "hash" parameter. + Cache::Handle* Insert(const Slice& key, uint32_t hash, + void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)); + Cache::Handle* Lookup(const Slice& key, uint32_t hash); + void Release(Cache::Handle* handle); + void Erase(const Slice& key, uint32_t hash); + + private: + void LRU_Remove(LRUHandle* e); + void LRU_Append(LRUHandle* e); + // Just reduce the reference count by 1. + // Return true if last reference + bool Unref(LRUHandle* e); + // Call deleter and free + void FreeEntry(LRUHandle* e); + + // Initialized before use. + size_t capacity_; + uint32_t remove_scan_count_limit_; + + // mutex_ protects the following state. 
+ port::Mutex mutex_; + size_t usage_; + + // Dummy head of LRU list. + // lru.prev is newest entry, lru.next is oldest entry. + LRUHandle lru_; + + HandleTable table_; +}; + +LRUCache::LRUCache() + : usage_(0) { + // Make empty circular linked list + lru_.next = &lru_; + lru_.prev = &lru_; +} + +LRUCache::~LRUCache() { + for (LRUHandle* e = lru_.next; e != &lru_; ) { + LRUHandle* next = e->next; + assert(e->refs == 1); // Error if caller has an unreleased handle + if (Unref(e)) { + FreeEntry(e); + } + e = next; + } +} + +bool LRUCache::Unref(LRUHandle* e) { + assert(e->refs > 0); + e->refs--; + return e->refs == 0; +} + +void LRUCache::FreeEntry(LRUHandle* e) { + assert(e->refs == 0); + (*e->deleter)(e->key(), e->value); + free(e); +} + +void LRUCache::LRU_Remove(LRUHandle* e) { + e->next->prev = e->prev; + e->prev->next = e->next; + usage_ -= e->charge; +} + +void LRUCache::LRU_Append(LRUHandle* e) { + // Make "e" newest entry by inserting just before lru_ + e->next = &lru_; + e->prev = lru_.prev; + e->prev->next = e; + e->next->prev = e; + usage_ += e->charge; +} + +Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) { + MutexLock l(&mutex_); + LRUHandle* e = table_.Lookup(key, hash); + if (e != nullptr) { + e->refs++; + LRU_Remove(e); + LRU_Append(e); + } + return reinterpret_cast(e); +} + +void LRUCache::Release(Cache::Handle* handle) { + LRUHandle* e = reinterpret_cast(handle); + bool last_reference = false; + { + MutexLock l(&mutex_); + last_reference = Unref(e); + } + if (last_reference) { + FreeEntry(e); + } +} + +Cache::Handle* LRUCache::Insert( + const Slice& key, uint32_t hash, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) { + + LRUHandle* e = reinterpret_cast( + malloc(sizeof(LRUHandle)-1 + key.size())); + std::vector last_reference_list; + last_reference_list.reserve(1); + + e->value = value; + e->deleter = deleter; + e->charge = charge; + e->key_length = key.size(); + e->hash = hash; + e->refs = 2; // One 
from LRUCache, one for the returned handle + memcpy(e->key_data, key.data(), key.size()); + + { + MutexLock l(&mutex_); + + LRU_Append(e); + + LRUHandle* old = table_.Insert(e); + if (old != nullptr) { + LRU_Remove(old); + if (Unref(old)) { + last_reference_list.push_back(old); + } + } + + if (remove_scan_count_limit_ > 0) { + // Try to free the space by evicting the entries that are only + // referenced by the cache first. + LRUHandle* cur = lru_.next; + for (unsigned int scanCount = 0; + usage_ > capacity_ && cur != &lru_ + && scanCount < remove_scan_count_limit_; scanCount++) { + LRUHandle* next = cur->next; + if (cur->refs <= 1) { + LRU_Remove(cur); + table_.Remove(cur->key(), cur->hash); + if (Unref(cur)) { + last_reference_list.push_back(cur); + } + } + cur = next; + } + } + + // Free the space following strict LRU policy until enough space + // is freed. + while (usage_ > capacity_ && lru_.next != &lru_) { + LRUHandle* old = lru_.next; + LRU_Remove(old); + table_.Remove(old->key(), old->hash); + if (Unref(old)) { + last_reference_list.push_back(old); + } + } + } + + // we free the entries here outside of mutex for + // performance reasons + for (auto entry : last_reference_list) { + FreeEntry(entry); + } + + return reinterpret_cast(e); +} + +void LRUCache::Erase(const Slice& key, uint32_t hash) { + LRUHandle* e; + bool last_reference = false; + { + MutexLock l(&mutex_); + e = table_.Remove(key, hash); + if (e != nullptr) { + LRU_Remove(e); + last_reference = Unref(e); + } + } + // mutex not held here + // last_reference will only be true if e != nullptr + if (last_reference) { + FreeEntry(e); + } +} + +static int kNumShardBits = 4; // default values, can be overridden +static int kRemoveScanCountLimit = 0; // default values, can be overridden + +class ShardedLRUCache : public Cache { + private: + LRUCache* shard_; + port::Mutex id_mutex_; + uint64_t last_id_; + int numShardBits; + size_t capacity_; + + static inline uint32_t HashSlice(const Slice& s) { + 
return Hash(s.data(), s.size(), 0); + } + + uint32_t Shard(uint32_t hash) { + // Note, hash >> 32 yields hash in gcc, not the zero we expect! + return (numShardBits > 0) ? (hash >> (32 - numShardBits)) : 0; + } + + void init(size_t capacity, int numbits, int removeScanCountLimit) { + numShardBits = numbits; + capacity_ = capacity; + int numShards = 1 << numShardBits; + shard_ = new LRUCache[numShards]; + const size_t per_shard = (capacity + (numShards - 1)) / numShards; + for (int s = 0; s < numShards; s++) { + shard_[s].SetCapacity(per_shard); + shard_[s].SetRemoveScanCountLimit(removeScanCountLimit); + } + } + + public: + explicit ShardedLRUCache(size_t capacity) + : last_id_(0) { + init(capacity, kNumShardBits, kRemoveScanCountLimit); + } + ShardedLRUCache(size_t capacity, int numShardBits, + int removeScanCountLimit) + : last_id_(0) { + init(capacity, numShardBits, removeScanCountLimit); + } + virtual ~ShardedLRUCache() { + delete[] shard_; + } + virtual Handle* Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) { + const uint32_t hash = HashSlice(key); + return shard_[Shard(hash)].Insert(key, hash, value, charge, deleter); + } + virtual Handle* Lookup(const Slice& key) { + const uint32_t hash = HashSlice(key); + return shard_[Shard(hash)].Lookup(key, hash); + } + virtual void Release(Handle* handle) { + LRUHandle* h = reinterpret_cast(handle); + shard_[Shard(h->hash)].Release(handle); + } + virtual void Erase(const Slice& key) { + const uint32_t hash = HashSlice(key); + shard_[Shard(hash)].Erase(key, hash); + } + virtual void* Value(Handle* handle) { + return reinterpret_cast(handle)->value; + } + virtual uint64_t NewId() { + MutexLock l(&id_mutex_); + return ++(last_id_); + } + virtual size_t GetCapacity() { + return capacity_; + } +}; + +} // end anonymous namespace + +shared_ptr NewLRUCache(size_t capacity) { + return NewLRUCache(capacity, kNumShardBits); +} + +shared_ptr NewLRUCache(size_t capacity, int 
numShardBits) { + return NewLRUCache(capacity, numShardBits, kRemoveScanCountLimit); +} + +shared_ptr NewLRUCache(size_t capacity, int numShardBits, + int removeScanCountLimit) { + if (numShardBits >= 20) { + return nullptr; // the cache cannot be sharded into too many fine pieces + } + return std::make_shared(capacity, + numShardBits, + removeScanCountLimit); +} + +} // namespace rocksdb diff --git a/util/cache_test.cc b/util/cache_test.cc new file mode 100644 index 00000000..87ab9138 --- /dev/null +++ b/util/cache_test.cc @@ -0,0 +1,391 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/cache.h" + +#include +#include +#include +#include "util/coding.h" +#include "util/testharness.h" + +namespace rocksdb { + +// Conversions between numeric keys/values and the types expected by Cache. 
+static std::string EncodeKey(int k) { + std::string result; + PutFixed32(&result, k); + return result; +} +static int DecodeKey(const Slice& k) { + assert(k.size() == 4); + return DecodeFixed32(k.data()); +} +static void* EncodeValue(uintptr_t v) { return reinterpret_cast(v); } +static int DecodeValue(void* v) { return reinterpret_cast(v); } + +class CacheTest { + public: + static CacheTest* current_; + + static void Deleter(const Slice& key, void* v) { + current_->deleted_keys_.push_back(DecodeKey(key)); + current_->deleted_values_.push_back(DecodeValue(v)); + } + + static const int kCacheSize = 1000; + static const int kNumShardBits = 4; + static const int kRemoveScanCountLimit = 16; + + static const int kCacheSize2 = 100; + static const int kNumShardBits2 = 2; + static const int kRemoveScanCountLimit2 = 200; + + std::vector deleted_keys_; + std::vector deleted_values_; + shared_ptr cache_; + shared_ptr cache2_; + + CacheTest() : + cache_(NewLRUCache(kCacheSize, kNumShardBits, kRemoveScanCountLimit)), + cache2_(NewLRUCache(kCacheSize2, kNumShardBits2, + kRemoveScanCountLimit2)) { + current_ = this; + } + + ~CacheTest() { + } + + int Lookup(shared_ptr cache, int key) { + Cache::Handle* handle = cache->Lookup(EncodeKey(key)); + const int r = (handle == nullptr) ? 
-1 : DecodeValue(cache->Value(handle)); + if (handle != nullptr) { + cache->Release(handle); + } + return r; + } + + void Insert(shared_ptr cache, int key, int value, int charge = 1) { + cache->Release(cache->Insert(EncodeKey(key), EncodeValue(value), charge, + &CacheTest::Deleter)); + } + + void Erase(shared_ptr cache, int key) { + cache->Erase(EncodeKey(key)); + } + + + int Lookup(int key) { + return Lookup(cache_, key); + } + + void Insert(int key, int value, int charge = 1) { + Insert(cache_, key, value, charge); + } + + void Erase(int key) { + Erase(cache_, key); + } + + int Lookup2(int key) { + return Lookup(cache2_, key); + } + + void Insert2(int key, int value, int charge = 1) { + Insert(cache2_, key, value, charge); + } + + void Erase2(int key) { + Erase(cache2_, key); + } +}; +CacheTest* CacheTest::current_; + +TEST(CacheTest, HitAndMiss) { + ASSERT_EQ(-1, Lookup(100)); + + Insert(100, 101); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(-1, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(200, 201); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(100, 102); + ASSERT_EQ(102, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); +} + +TEST(CacheTest, Erase) { + Erase(200); + ASSERT_EQ(0U, deleted_keys_.size()); + + Insert(100, 101); + Insert(200, 201); + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1U, deleted_keys_.size()); +} + +TEST(CacheTest, EntriesArePinned) { + Insert(100, 101); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); + + Insert(100, 102); + Cache::Handle* h2 = 
cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); + ASSERT_EQ(0U, deleted_keys_.size()); + + cache_->Release(h1); + ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(1U, deleted_keys_.size()); + + cache_->Release(h2); + ASSERT_EQ(2U, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[1]); + ASSERT_EQ(102, deleted_values_[1]); +} + +TEST(CacheTest, EvictionPolicy) { + Insert(100, 101); + Insert(200, 201); + + // Frequently used entry must be kept around + for (int i = 0; i < kCacheSize + 100; i++) { + Insert(1000+i, 2000+i); + ASSERT_EQ(2000+i, Lookup(1000+i)); + ASSERT_EQ(101, Lookup(100)); + } + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(-1, Lookup(200)); +} + +TEST(CacheTest, EvictionPolicyRef) { + Insert(100, 101); + Insert(101, 102); + Insert(102, 103); + Insert(103, 104); + Insert(200, 101); + Insert(201, 102); + Insert(202, 103); + Insert(203, 104); + Cache::Handle* h201 = cache_->Lookup(EncodeKey(200)); + Cache::Handle* h202 = cache_->Lookup(EncodeKey(201)); + Cache::Handle* h203 = cache_->Lookup(EncodeKey(202)); + Cache::Handle* h204 = cache_->Lookup(EncodeKey(203)); + Insert(300, 101); + Insert(301, 102); + Insert(302, 103); + Insert(303, 104); + + // Insert entries much more than Cache capacity + for (int i = 0; i < kCacheSize + 100; i++) { + Insert(1000 + i, 2000 + i); + } + + // Check whether the entries inserted in the beginning + // are evicted. Ones without extra ref are evicted and + // those with are not. 
+ ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(-1, Lookup(101)); + ASSERT_EQ(-1, Lookup(102)); + ASSERT_EQ(-1, Lookup(103)); + + ASSERT_EQ(-1, Lookup(300)); + ASSERT_EQ(-1, Lookup(301)); + ASSERT_EQ(-1, Lookup(302)); + ASSERT_EQ(-1, Lookup(303)); + + ASSERT_EQ(101, Lookup(200)); + ASSERT_EQ(102, Lookup(201)); + ASSERT_EQ(103, Lookup(202)); + ASSERT_EQ(104, Lookup(203)); + + // Cleaning up all the handles + cache_->Release(h201); + cache_->Release(h202); + cache_->Release(h203); + cache_->Release(h204); +} + +TEST(CacheTest, EvictionPolicyRef2) { + std::vector handles; + + Insert(100, 101); + // Insert entries much more than Cache capacity + for (int i = 0; i < kCacheSize + 100; i++) { + Insert(1000 + i, 2000 + i); + if (i < kCacheSize ) { + handles.push_back(cache_->Lookup(EncodeKey(1000 + i))); + } + } + + // Make sure referenced keys are also possible to be deleted + // if there are not sufficient non-referenced keys + for (int i = 0; i < 5; i++) { + ASSERT_EQ(-1, Lookup(1000 + i)); + } + + for (int i = kCacheSize; i < kCacheSize + 100; i++) { + ASSERT_EQ(2000 + i, Lookup(1000 + i)); + } + ASSERT_EQ(-1, Lookup(100)); + + // Cleaning up all the handles + while (handles.size() > 0) { + cache_->Release(handles.back()); + handles.pop_back(); + } +} + +TEST(CacheTest, EvictionPolicyRefLargeScanLimit) { + std::vector handles2; + + // Cache2 has a cache RemoveScanCountLimit higher than cache size + // so it would trigger a boundary condition. + + // Populate the cache with 10 more keys than its size. + // Reference all keys except one close to the end. 
+ for (int i = 0; i < kCacheSize2 + 10; i++) { + Insert2(1000 + i, 2000+i); + if (i != kCacheSize2 ) { + handles2.push_back(cache2_->Lookup(EncodeKey(1000 + i))); + } + } + + // Make sure referenced keys are also possible to be deleted + // if there are not sufficient non-referenced keys + for (int i = 0; i < 3; i++) { + ASSERT_EQ(-1, Lookup2(1000 + i)); + } + // The non-referenced value is deleted even if it's accessed + // recently. + ASSERT_EQ(-1, Lookup2(1000 + kCacheSize2)); + // Other values recently accessed are not deleted since they + // are referenced. + for (int i = kCacheSize2 - 10; i < kCacheSize2 + 10; i++) { + if (i != kCacheSize2) { + ASSERT_EQ(2000 + i, Lookup2(1000 + i)); + } + } + + // Cleaning up all the handles + while (handles2.size() > 0) { + cache2_->Release(handles2.back()); + handles2.pop_back(); + } +} + + + +TEST(CacheTest, HeavyEntries) { + // Add a bunch of light and heavy entries and then count the combined + // size of items still in the cache, which must be approximately the + // same as the total capacity. + const int kLight = 1; + const int kHeavy = 10; + int added = 0; + int index = 0; + while (added < 2*kCacheSize) { + const int weight = (index & 1) ? kLight : kHeavy; + Insert(index, 1000+index, weight); + added += weight; + index++; + } + + int cached_weight = 0; + for (int i = 0; i < index; i++) { + const int weight = (i & 1 ? 
kLight : kHeavy); + int r = Lookup(i); + if (r >= 0) { + cached_weight += weight; + ASSERT_EQ(1000+i, r); + } + } + ASSERT_LE(cached_weight, kCacheSize + kCacheSize/10); +} + +TEST(CacheTest, NewId) { + uint64_t a = cache_->NewId(); + uint64_t b = cache_->NewId(); + ASSERT_NE(a, b); +} + + +class Value { + private: + int v_; + public: + explicit Value(int v) : v_(v) { } + + ~Value() { std::cout << v_ << " is destructed\n"; } +}; + +void deleter(const Slice& key, void* value) { + delete (Value *)value; +} + + +TEST(CacheTest, BadEviction) { + int n = 10; + + // a LRUCache with n entries and one shard only + std::shared_ptr cache = NewLRUCache(n, 0); + + std::vector handles(n+1); + + // Insert n+1 entries, but not releasing. + for (int i = 0; i < n+1; i++) { + std::string key = std::to_string(i+1); + handles[i] = cache->Insert(key, new Value(i+1), 1, &deleter); + } + + // Guess what's in the cache now? + for (int i = 0; i < n+1; i++) { + std::string key = std::to_string(i+1); + auto h = cache->Lookup(key); + std::cout << key << (h?" found\n":" not found\n"); + // Only the first entry should be missing + ASSERT_TRUE(h || i == 0); + if (h) cache->Release(h); + } + + for (int i = 0; i < n+1; i++) { + cache->Release(handles[i]); + } + std::cout << "Poor entries\n"; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/coding.cc b/util/coding.cc new file mode 100644 index 00000000..ce67fa48 --- /dev/null +++ b/util/coding.cc @@ -0,0 +1,329 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#include "util/coding.h" + +#include + +namespace rocksdb { + +void EncodeFixed32(char* buf, uint32_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; +#endif +} + +void EncodeFixed64(char* buf, uint64_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + buf[4] = (value >> 32) & 0xff; + buf[5] = (value >> 40) & 0xff; + buf[6] = (value >> 48) & 0xff; + buf[7] = (value >> 56) & 0xff; +#endif +} + +void PutFixed32(std::string* dst, uint32_t value) { + char buf[sizeof(value)]; + EncodeFixed32(buf, value); + dst->append(buf, sizeof(buf)); +} + +void PutFixed64(std::string* dst, uint64_t value) { + char buf[sizeof(value)]; + EncodeFixed64(buf, value); + dst->append(buf, sizeof(buf)); +} + +char* EncodeVarint32(char* dst, uint32_t v) { + // Operate on characters as unsigneds + unsigned char* ptr = reinterpret_cast(dst); + static const int B = 128; + if (v < (1<<7)) { + *(ptr++) = v; + } else if (v < (1<<14)) { + *(ptr++) = v | B; + *(ptr++) = v>>7; + } else if (v < (1<<21)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = v>>14; + } else if (v < (1<<28)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = v>>21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = (v>>21) | B; + *(ptr++) = v>>28; + } + return reinterpret_cast(ptr); +} + +void PutVarint32(std::string* dst, uint32_t v) { + char buf[5]; + char* ptr = EncodeVarint32(buf, v); + dst->append(buf, ptr - buf); +} + +char* EncodeVarint64(char* dst, uint64_t v) { + static const unsigned int B = 128; + unsigned char* ptr = reinterpret_cast(dst); + while (v 
>= B) { + *(ptr++) = (v & (B-1)) | B; + v >>= 7; + } + *(ptr++) = static_cast(v); + return reinterpret_cast(ptr); +} + +void PutVarint64(std::string* dst, uint64_t v) { + char buf[10]; + char* ptr = EncodeVarint64(buf, v); + dst->append(buf, ptr - buf); +} + +void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { + PutVarint32(dst, value.size()); + dst->append(value.data(), value.size()); +} + +void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts) { + uint32_t total_bytes = 0; + for (int i = 0; i < slice_parts.num_parts; ++i) { + total_bytes += slice_parts.parts[i].size(); + } + PutVarint32(dst, total_bytes); + for (int i = 0; i < slice_parts.num_parts; ++i) { + dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size()); + } +} + +int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +const char* GetVarint32PtrFallback(const char* p, + const char* limit, + uint32_t* value) { + uint32_t result = 0; + for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) { + uint32_t byte = *(reinterpret_cast(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast(p); + } + } + return nullptr; +} + +bool GetVarint32(Slice* input, uint32_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint32Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { + uint64_t result = 0; + for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { + uint64_t byte = *(reinterpret_cast(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return 
reinterpret_cast(p); + } + } + return nullptr; +} + +bool GetVarint64(Slice* input, uint64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint64Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +const char* GetLengthPrefixedSlice(const char* p, const char* limit, + Slice* result) { + uint32_t len; + p = GetVarint32Ptr(p, limit, &len); + if (p == nullptr) return nullptr; + if (p + len > limit) return nullptr; + *result = Slice(p, len); + return p + len; +} + +bool GetLengthPrefixedSlice(Slice* input, Slice* result) { + uint32_t len; + if (GetVarint32(input, &len) && + input->size() >= len) { + *result = Slice(input->data(), len); + input->remove_prefix(len); + return true; + } else { + return false; + } +} + +Slice GetLengthPrefixedSlice(const char* data) { + uint32_t len; + const char* p = data; + p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted + return Slice(p, len); +} + +Slice GetSliceUntil(Slice* slice, char delimiter) { + uint32_t len; + for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) { + // nothing + } + + Slice ret(slice->data(), len); + slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0)); + return ret; +} + +void BitStreamPutInt(char* dst, size_t dstlen, size_t offset, + uint32_t bits, uint64_t value) { + assert((offset + bits + 7)/8 <= dstlen); + assert(bits <= 64); + + unsigned char* ptr = reinterpret_cast(dst); + + size_t byteOffset = offset / 8; + size_t bitOffset = offset % 8; + + // This prevents unused variable warnings when compiling. +#ifndef NDEBUG + // Store truncated value. 
+ uint64_t origValue = (bits < 64)?(value & (((uint64_t)1 << bits) - 1)):value; + uint32_t origBits = bits; +#endif + + while (bits > 0) { + size_t bitsToGet = std::min(bits, 8 - bitOffset); + unsigned char mask = ((1 << bitsToGet) - 1); + + ptr[byteOffset] = (ptr[byteOffset] & ~(mask << bitOffset)) + + ((value & mask) << bitOffset); + + value >>= bitsToGet; + byteOffset += 1; + bitOffset = 0; + bits -= bitsToGet; + } + + assert(origValue == BitStreamGetInt(dst, dstlen, offset, origBits)); +} + +uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset, + uint32_t bits) { + assert((offset + bits + 7)/8 <= srclen); + assert(bits <= 64); + + const unsigned char* ptr = reinterpret_cast(src); + + uint64_t result = 0; + + size_t byteOffset = offset / 8; + size_t bitOffset = offset % 8; + size_t shift = 0; + + while (bits > 0) { + size_t bitsToGet = std::min(bits, 8 - bitOffset); + unsigned char mask = ((1 << bitsToGet) - 1); + + result += (uint64_t)((ptr[byteOffset] >> bitOffset) & mask) << shift; + + shift += bitsToGet; + byteOffset += 1; + bitOffset = 0; + bits -= bitsToGet; + } + + return result; +} + +void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits, + uint64_t value) { + assert((offset + bits + 7)/8 <= dst->size()); + + const size_t kTmpBufLen = sizeof(value) + 1; + char tmpBuf[kTmpBufLen]; + + // Number of bytes of tmpBuf being used + const size_t kUsedBytes = (offset%8 + bits)/8; + + // Copy relevant parts of dst to tmpBuf + for (size_t idx = 0; idx <= kUsedBytes; ++idx) { + tmpBuf[idx] = (*dst)[offset/8 + idx]; + } + + BitStreamPutInt(tmpBuf, kTmpBufLen, offset%8, bits, value); + + // Copy tmpBuf back to dst + for (size_t idx = 0; idx <= kUsedBytes; ++idx) { + (*dst)[offset/8 + idx] = tmpBuf[idx]; + } + + // Do the check here too as we are working with a buffer. 
+ assert(((bits < 64)?(value & (((uint64_t)1 << bits) - 1)):value) == + BitStreamGetInt(dst, offset, bits)); +} + +uint64_t BitStreamGetInt(const std::string* src, size_t offset, + uint32_t bits) { + return BitStreamGetInt(src->data(), src->size(), offset, bits); +} + +uint64_t BitStreamGetInt(const Slice* src, size_t offset, + uint32_t bits) { + return BitStreamGetInt(src->data(), src->size(), offset, bits); +} + +} // namespace rocksdb diff --git a/util/coding.h b/util/coding.h new file mode 100644 index 00000000..4477dc79 --- /dev/null +++ b/util/coding.h @@ -0,0 +1,139 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Endian-neutral encoding: +// * Fixed-length numbers are encoded with least-significant byte first +// * In addition we support variable length "varint" encoding +// * Strings are encoded prefixed by their length in varint format + +#pragma once +#include +#include +#include +#include "port/port.h" + +namespace rocksdb { + +// The maximum length of a varint in bytes for 32 and 64 bits respectively. +const unsigned int kMaxVarint32Length = 5; +const unsigned int kMaxVarint64Length = 10; + +// Standard Put... 
routines append to a string +extern void PutFixed32(std::string* dst, uint32_t value); +extern void PutFixed64(std::string* dst, uint64_t value); +extern void PutVarint32(std::string* dst, uint32_t value); +extern void PutVarint64(std::string* dst, uint64_t value); +extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value); +extern void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts); + +// Standard Get... routines parse a value from the beginning of a Slice +// and advance the slice past the parsed value. +extern bool GetVarint32(Slice* input, uint32_t* value); +extern bool GetVarint64(Slice* input, uint64_t* value); +extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); +extern Slice GetLengthPrefixedSlice(const char* data); + +extern Slice GetSliceUntil(Slice* slice, char delimiter); + +// Pointer-based variants of GetVarint... These either store a value +// in *v and return a pointer just past the parsed value, or return +// nullptr on error. These routines only look at bytes in the range +// [p..limit-1] +extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v); +extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v); + +// Returns the length of the varint32 or varint64 encoding of "v" +extern int VarintLength(uint64_t v); + +// Lower-level versions of Put... that write directly into a character buffer +// REQUIRES: dst has enough space for the value being written +extern void EncodeFixed32(char* dst, uint32_t value); +extern void EncodeFixed64(char* dst, uint64_t value); + +// Lower-level versions of Put... that write directly into a character buffer +// and return a pointer just past the last byte written. +// REQUIRES: dst has enough space for the value being written +extern char* EncodeVarint32(char* dst, uint32_t value); +extern char* EncodeVarint64(char* dst, uint64_t value); + +// Lower-level versions of Get... 
that read directly from a character buffer +// without any bounds checking. + +inline uint32_t DecodeFixed32(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint32_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + return ((static_cast(static_cast(ptr[0]))) + | (static_cast(static_cast(ptr[1])) << 8) + | (static_cast(static_cast(ptr[2])) << 16) + | (static_cast(static_cast(ptr[3])) << 24)); + } +} + +inline uint64_t DecodeFixed64(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint64_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + uint64_t lo = DecodeFixed32(ptr); + uint64_t hi = DecodeFixed32(ptr + 4); + return (hi << 32) | lo; + } +} + +// Internal routine for use by fallback path of GetVarint32Ptr +extern const char* GetVarint32PtrFallback(const char* p, + const char* limit, + uint32_t* value); +inline const char* GetVarint32Ptr(const char* p, + const char* limit, + uint32_t* value) { + if (p < limit) { + uint32_t result = *(reinterpret_cast(p)); + if ((result & 128) == 0) { + *value = result; + return p + 1; + } + } + return GetVarint32PtrFallback(p, limit, value); +} + +// Writes an unsigned integer with bits number of bits with its least +// significant bit at offset. +// Bits are numbered from 0 to 7 in the first byte, 8 to 15 in the second and +// so on. +// value is truncated to the bits number of least significant bits. +// REQUIRES: (offset+bits+7)/8 <= dstlen +// REQUIRES: bits <= 64 +extern void BitStreamPutInt(char* dst, size_t dstlen, size_t offset, + uint32_t bits, uint64_t value); + +// Reads an unsigned integer with bits number of bits with its least +// significant bit at offset. +// Bits are numbered in the same way as ByteStreamPutInt(). 
+// REQUIRES: (offset+bits+7)/8 <= srclen +// REQUIRES: bits <= 64 +extern uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset, + uint32_t bits); + +// Convenience functions +extern void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits, + uint64_t value); +extern uint64_t BitStreamGetInt(const std::string* src, size_t offset, + uint32_t bits); +extern uint64_t BitStreamGetInt(const Slice* src, size_t offset, + uint32_t bits); + +} // namespace rocksdb diff --git a/util/coding_test.cc b/util/coding_test.cc new file mode 100644 index 00000000..fb061323 --- /dev/null +++ b/util/coding_test.cc @@ -0,0 +1,296 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "util/coding.h" + +#include "util/testharness.h" + +namespace rocksdb { + +class Coding { }; + +TEST(Coding, Fixed32) { + std::string s; + for (uint32_t v = 0; v < 100000; v++) { + PutFixed32(&s, v); + } + + const char* p = s.data(); + for (uint32_t v = 0; v < 100000; v++) { + uint32_t actual = DecodeFixed32(p); + ASSERT_EQ(v, actual); + p += sizeof(uint32_t); + } +} + +TEST(Coding, Fixed64) { + std::string s; + for (int power = 0; power <= 63; power++) { + uint64_t v = static_cast(1) << power; + PutFixed64(&s, v - 1); + PutFixed64(&s, v + 0); + PutFixed64(&s, v + 1); + } + + const char* p = s.data(); + for (int power = 0; power <= 63; power++) { + uint64_t v = static_cast(1) << power; + uint64_t actual; + actual = DecodeFixed64(p); + ASSERT_EQ(v-1, actual); + p += sizeof(uint64_t); + + actual = DecodeFixed64(p); + ASSERT_EQ(v+0, actual); + p += sizeof(uint64_t); + + actual = DecodeFixed64(p); + ASSERT_EQ(v+1, actual); + p += sizeof(uint64_t); + } +} + +// Test that encoding routines generate little-endian encodings +TEST(Coding, EncodingOutput) { + std::string dst; + PutFixed32(&dst, 0x04030201); + ASSERT_EQ(4U, dst.size()); + ASSERT_EQ(0x01, static_cast(dst[0])); + ASSERT_EQ(0x02, static_cast(dst[1])); + ASSERT_EQ(0x03, static_cast(dst[2])); + ASSERT_EQ(0x04, static_cast(dst[3])); + + dst.clear(); + PutFixed64(&dst, 0x0807060504030201ull); + ASSERT_EQ(8U, dst.size()); + ASSERT_EQ(0x01, static_cast(dst[0])); + ASSERT_EQ(0x02, static_cast(dst[1])); + ASSERT_EQ(0x03, static_cast(dst[2])); + ASSERT_EQ(0x04, static_cast(dst[3])); + ASSERT_EQ(0x05, static_cast(dst[4])); + ASSERT_EQ(0x06, static_cast(dst[5])); + ASSERT_EQ(0x07, static_cast(dst[6])); + ASSERT_EQ(0x08, static_cast(dst[7])); +} + +TEST(Coding, Varint32) { + std::string s; + for (uint32_t i = 0; i < (32 * 32); i++) { + uint32_t v = (i / 32) << (i % 32); + PutVarint32(&s, v); + } + + const char* p = s.data(); + const char* limit = p + s.size(); + for (uint32_t i = 0; i < (32 * 32); i++) { + 
uint32_t expected = (i / 32) << (i % 32); + uint32_t actual; + const char* start = p; + p = GetVarint32Ptr(p, limit, &actual); + ASSERT_TRUE(p != nullptr); + ASSERT_EQ(expected, actual); + ASSERT_EQ(VarintLength(actual), p - start); + } + ASSERT_EQ(p, s.data() + s.size()); +} + +TEST(Coding, Varint64) { + // Construct the list of values to check + std::vector values; + // Some special values + values.push_back(0); + values.push_back(100); + values.push_back(~static_cast(0)); + values.push_back(~static_cast(0) - 1); + for (uint32_t k = 0; k < 64; k++) { + // Test values near powers of two + const uint64_t power = 1ull << k; + values.push_back(power); + values.push_back(power-1); + values.push_back(power+1); + }; + + std::string s; + for (unsigned int i = 0; i < values.size(); i++) { + PutVarint64(&s, values[i]); + } + + const char* p = s.data(); + const char* limit = p + s.size(); + for (unsigned int i = 0; i < values.size(); i++) { + ASSERT_TRUE(p < limit); + uint64_t actual; + const char* start = p; + p = GetVarint64Ptr(p, limit, &actual); + ASSERT_TRUE(p != nullptr); + ASSERT_EQ(values[i], actual); + ASSERT_EQ(VarintLength(actual), p - start); + } + ASSERT_EQ(p, limit); + +} + +TEST(Coding, Varint32Overflow) { + uint32_t result; + std::string input("\x81\x82\x83\x84\x85\x11"); + ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result) + == nullptr); +} + +TEST(Coding, Varint32Truncation) { + uint32_t large_value = (1u << 31) + 100; + std::string s; + PutVarint32(&s, large_value); + uint32_t result; + for (unsigned int len = 0; len < s.size() - 1; len++) { + ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == nullptr); + } + ASSERT_TRUE( + GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != nullptr); + ASSERT_EQ(large_value, result); +} + +TEST(Coding, Varint64Overflow) { + uint64_t result; + std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11"); + ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + 
input.size(), &result) + == nullptr); +} + +TEST(Coding, Varint64Truncation) { + uint64_t large_value = (1ull << 63) + 100ull; + std::string s; + PutVarint64(&s, large_value); + uint64_t result; + for (unsigned int len = 0; len < s.size() - 1; len++) { + ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == nullptr); + } + ASSERT_TRUE( + GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != nullptr); + ASSERT_EQ(large_value, result); +} + +TEST(Coding, Strings) { + std::string s; + PutLengthPrefixedSlice(&s, Slice("")); + PutLengthPrefixedSlice(&s, Slice("foo")); + PutLengthPrefixedSlice(&s, Slice("bar")); + PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x'))); + + Slice input(s); + Slice v; + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("foo", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("bar", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ(std::string(200, 'x'), v.ToString()); + ASSERT_EQ("", input.ToString()); +} + +TEST(Coding, BitStream) { + const int kNumBytes = 10; + char bytes[kNumBytes+1]; + for (int i = 0; i < kNumBytes + 1; ++i) { + bytes[i] = '\0'; + } + + // Simple byte aligned test. 
+ for (int i = 0; i < kNumBytes; ++i) { + BitStreamPutInt(bytes, kNumBytes, i*8, 8, 255-i); + + ASSERT_EQ((unsigned char)bytes[i], (unsigned char)(255-i)); + } + for (int i = 0; i < kNumBytes; ++i) { + ASSERT_EQ(BitStreamGetInt(bytes, kNumBytes, i*8, 8), (uint32_t)(255-i)); + } + ASSERT_EQ(bytes[kNumBytes], '\0'); + + // Write and read back at strange offsets + for (int i = 0; i < kNumBytes + 1; ++i) { + bytes[i] = '\0'; + } + for (int i = 0; i < kNumBytes; ++i) { + BitStreamPutInt(bytes, kNumBytes, i*5+1, 4, (i * 7) % (1 << 4)); + } + for (int i = 0; i < kNumBytes; ++i) { + ASSERT_EQ(BitStreamGetInt(bytes, kNumBytes, i*5+1, 4), + (uint32_t)((i * 7) % (1 << 4))); + } + ASSERT_EQ(bytes[kNumBytes], '\0'); + + // Create 11011011 as a bit pattern + for (int i = 0; i < kNumBytes + 1; ++i) { + bytes[i] = '\0'; + } + for (int i = 0; i < kNumBytes; ++i) { + BitStreamPutInt(bytes, kNumBytes, i*8, 2, 3); + BitStreamPutInt(bytes, kNumBytes, i*8+3, 2, 3); + BitStreamPutInt(bytes, kNumBytes, i*8+6, 2, 3); + + ASSERT_EQ((unsigned char)bytes[i], + (unsigned char)(3 + (3 << 3) + (3 << 6))); + } + ASSERT_EQ(bytes[kNumBytes], '\0'); + + + // Test large values + for (int i = 0; i < kNumBytes + 1; ++i) { + bytes[i] = '\0'; + } + BitStreamPutInt(bytes, kNumBytes, 0, 64, (uint64_t)(-1)); + for (int i = 0; i < 64/8; ++i) { + ASSERT_EQ((unsigned char)bytes[i], + (unsigned char)(255)); + } + ASSERT_EQ(bytes[64/8], '\0'); + + +} + +TEST(Coding, BitStreamConvenienceFuncs) { + std::string bytes(1, '\0'); + + // Check that independent changes to byte are preserved. 
+ BitStreamPutInt(&bytes, 0, 2, 3); + BitStreamPutInt(&bytes, 3, 2, 3); + BitStreamPutInt(&bytes, 6, 2, 3); + ASSERT_EQ((unsigned char)bytes[0], (unsigned char)(3 + (3 << 3) + (3 << 6))); + ASSERT_EQ(BitStreamGetInt(&bytes, 0, 2), 3u); + ASSERT_EQ(BitStreamGetInt(&bytes, 3, 2), 3u); + ASSERT_EQ(BitStreamGetInt(&bytes, 6, 2), 3u); + Slice slice(bytes); + ASSERT_EQ(BitStreamGetInt(&slice, 0, 2), 3u); + ASSERT_EQ(BitStreamGetInt(&slice, 3, 2), 3u); + ASSERT_EQ(BitStreamGetInt(&slice, 6, 2), 3u); + + // Test overlapping crossing over byte boundaries + bytes = std::string(2, '\0'); + BitStreamPutInt(&bytes, 6, 4, 15); + ASSERT_EQ((unsigned char)bytes[0], 3 << 6); + ASSERT_EQ((unsigned char)bytes[1], 3); + ASSERT_EQ(BitStreamGetInt(&bytes, 6, 4), 15u); + slice = Slice(bytes); + ASSERT_EQ(BitStreamGetInt(&slice, 6, 4), 15u); + + // Test 64-bit number + bytes = std::string(64/8, '\0'); + BitStreamPutInt(&bytes, 0, 64, (uint64_t)(-1)); + ASSERT_EQ(BitStreamGetInt(&bytes, 0, 64), (uint64_t)(-1)); + slice = Slice(bytes); + ASSERT_EQ(BitStreamGetInt(&slice, 0, 64), (uint64_t)(-1)); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/comparator.cc b/util/comparator.cc new file mode 100644 index 00000000..adeacac0 --- /dev/null +++ b/util/comparator.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include +#include "rocksdb/comparator.h" +#include "rocksdb/slice.h" +#include "port/port.h" +#include "util/logging.h" + +namespace rocksdb { + +Comparator::~Comparator() { } + +namespace { +class BytewiseComparatorImpl : public Comparator { + public: + BytewiseComparatorImpl() { } + + virtual const char* Name() const { + return "leveldb.BytewiseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return a.compare(b); + } + + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const { + // Find length of common prefix + size_t min_length = std::min(start->size(), limit.size()); + size_t diff_index = 0; + while ((diff_index < min_length) && + ((*start)[diff_index] == limit[diff_index])) { + diff_index++; + } + + if (diff_index >= min_length) { + // Do not shorten if one string is a prefix of the other + } else { + uint8_t diff_byte = static_cast((*start)[diff_index]); + if (diff_byte < static_cast(0xff) && + diff_byte + 1 < static_cast(limit[diff_index])) { + (*start)[diff_index]++; + start->resize(diff_index + 1); + assert(Compare(*start, limit) < 0); + } + } + } + + virtual void FindShortSuccessor(std::string* key) const { + // Find first character that can be incremented + size_t n = key->size(); + for (size_t i = 0; i < n; i++) { + const uint8_t byte = (*key)[i]; + if (byte != static_cast(0xff)) { + (*key)[i] = byte + 1; + key->resize(i+1); + return; + } + } + // *key is a run of 0xffs. Leave it alone. + } +}; +} // namespace + +static port::OnceType once = LEVELDB_ONCE_INIT; +static const Comparator* bytewise; + +static void InitModule() { + bytewise = new BytewiseComparatorImpl; +} + +const Comparator* BytewiseComparator() { + port::InitOnce(&once, InitModule); + return bytewise; +} + +} // namespace rocksdb diff --git a/util/crc32c.cc b/util/crc32c.cc new file mode 100644 index 00000000..bca955a0 --- /dev/null +++ b/util/crc32c.cc @@ -0,0 +1,393 @@ +// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A portable implementation of crc32c, optimized to handle +// four bytes at a time. + +#include "util/crc32c.h" + +#include +#ifdef __SSE4_2__ +#include +#endif +#include "util/coding.h" + +namespace rocksdb { +namespace crc32c { + +static const uint32_t table0_[256] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, + 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, + 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, + 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, + 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, + 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, + 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, + 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, + 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 
0x72966096, + 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, + 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, + 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, + 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, + 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, + 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, + 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, + 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, + 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, + 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, + 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, + 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, + 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, + 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 +}; +static const uint32_t 
table1_[256] = { + 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, + 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, + 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, + 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, + 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, + 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, + 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, + 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, + 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, + 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, + 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, + 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, + 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, + 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, + 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, + 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, + 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, + 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, + 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, + 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, + 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, + 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, + 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, + 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, + 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, + 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, + 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, + 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, + 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, + 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, + 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, + 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, + 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, + 0x1798a91c, 0x043a316b, 
0x30dd99f2, 0x237f0185, + 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, + 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, + 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, + 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, + 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, + 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, + 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, + 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, + 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, + 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, + 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, + 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, + 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, + 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, + 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, + 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, + 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, + 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, + 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, + 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 +}; +static const uint32_t table2_[256] = { + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, + 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, + 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, + 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, + 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, + 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, + 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, + 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, + 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, + 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, + 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, + 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, + 0xa9b7b85b, 0x0cf62a25, 
0xe6d8ea56, 0x43997828, + 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, + 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, + 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, + 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, + 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, + 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, + 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, + 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, + 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, + 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, + 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, + 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, + 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, + 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, + 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, + 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, + 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, + 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, + 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, + 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, + 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, + 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, + 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, + 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, + 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, + 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, + 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, + 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, + 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, + 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, + 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, + 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, + 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 
0x4bdee6d8, 0xee9f74a6, + 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, + 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, + 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, + 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, + 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, + 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, + 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, + 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 +}; +static const uint32_t table3_[256] = { + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, + 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, + 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, + 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, + 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, + 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, + 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, + 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, + 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, + 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, + 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, + 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, + 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, + 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, + 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, + 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, + 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, + 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, + 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, + 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, + 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, + 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, + 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, + 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, + 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, + 0xeec5cba2, 0x3380611a, 
0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, + 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, + 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, + 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, + 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, + 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, + 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, + 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, + 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, + 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, + 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, + 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, + 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, + 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, + 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, + 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, + 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, + 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, + 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, + 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, + 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, + 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, + 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, + 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, + 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, + 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, + 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, + 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, + 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 +}; + +// Used to fetch a naturally-aligned 32-bit word in little endian byte-order +static inline uint32_t LE_LOAD32(const uint8_t *p) { + return DecodeFixed32(reinterpret_cast(p)); +} + +static inline uint64_t LE_LOAD64(const uint8_t *p) { + return 
DecodeFixed64(reinterpret_cast(p)); +} + +static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) { + uint32_t c = *l ^ LE_LOAD32(*p); + *p += 4; + *l = table3_[c & 0xff] ^ + table2_[(c >> 8) & 0xff] ^ + table1_[(c >> 16) & 0xff] ^ + table0_[c >> 24]; + // DO it twice. + c = *l ^ LE_LOAD32(*p); + *p += 4; + *l = table3_[c & 0xff] ^ + table2_[(c >> 8) & 0xff] ^ + table1_[(c >> 16) & 0xff] ^ + table0_[c >> 24]; +} + +static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) { + #ifdef __SSE4_2__ + *l = _mm_crc32_u64(*l, LE_LOAD64(*p)); + *p += 8; + #else + Slow_CRC32(l, p); + #endif +} + +// Detect if SS42 or not. +static bool isSSE42() { + #ifdef __GNUC__ + uint32_t c_; + uint32_t d_; + __asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx"); + return c_ & (1U << 20); // copied from CpuId.h in Folly. + #else + return false; + #endif +} + +typedef void (*Function)(uint64_t*, uint8_t const**); +static Function func = nullptr; + +static inline Function Choose_CRC32() { + return isSSE42() ? Fast_CRC32 : Slow_CRC32; +} + +static inline void CRC32(uint64_t* l, uint8_t const **p) { + if (func != nullptr) { + return func(l, p); + } + func = Choose_CRC32(); + func(l, p); +} + +uint32_t Extend(uint32_t crc, const char* buf, size_t size) { + const uint8_t *p = reinterpret_cast(buf); + const uint8_t *e = p + size; + uint64_t l = crc ^ 0xffffffffu; + +// Align n to (1 << m) byte boundary +#define ALIGN(n, m) ((n + ((1 << m) - 1)) & ~((1 << m) - 1)) + +#define STEP1 do { \ + int c = (l & 0xff) ^ *p++; \ + l = table0_[c] ^ (l >> 8); \ +} while (0) + + + // Point x at first 16-byte aligned byte in string. This might be + // just past the end of the string. 
+ const uintptr_t pval = reinterpret_cast(p); + const uint8_t* x = reinterpret_cast(ALIGN(pval, 4)); + if (x <= e) { + // Process bytes until finished or p is 16-byte aligned + while (p != x) { + STEP1; + } + } + // Process bytes 16 at a time + while ((e-p) >= 16) { + CRC32(&l, &p); + CRC32(&l, &p); + } + // Process bytes 8 at a time + while ((e-p) >= 8) { + CRC32(&l, &p); + } + // Process the last few bytes + while (p != e) { + STEP1; + } +#undef STEP1 +#undef ALIGN + return l ^ 0xffffffffu; +} + +} // namespace crc32c +} // namespace rocksdb diff --git a/util/crc32c.h b/util/crc32c.h new file mode 100644 index 00000000..e5e6e143 --- /dev/null +++ b/util/crc32c.h @@ -0,0 +1,46 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +namespace rocksdb { +namespace crc32c { + +// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the +// crc32c of some string A. Extend() is often used to maintain the +// crc32c of a stream of data. +extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n); + +// Return the crc32c of data[0,n-1] +inline uint32_t Value(const char* data, size_t n) { + return Extend(0, data, n); +} + +static const uint32_t kMaskDelta = 0xa282ead8ul; + +// Return a masked representation of crc. +// +// Motivation: it is problematic to compute the CRC of a string that +// contains embedded CRCs. Therefore we recommend that CRCs stored +// somewhere (e.g., in files) should be masked before being stored. 
+inline uint32_t Mask(uint32_t crc) { + // Rotate right by 15 bits and add a constant. + return ((crc >> 15) | (crc << 17)) + kMaskDelta; +} + +// Return the crc whose masked representation is masked_crc. +inline uint32_t Unmask(uint32_t masked_crc) { + uint32_t rot = masked_crc - kMaskDelta; + return ((rot >> 17) | (rot << 15)); +} + +} // namespace crc32c +} // namespace rocksdb diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc new file mode 100644 index 00000000..300c9d3c --- /dev/null +++ b/util/crc32c_test.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/crc32c.h" +#include "util/testharness.h" + +namespace rocksdb { +namespace crc32c { + +class CRC { }; + +TEST(CRC, StandardResults) { + // From rfc3720 section B.4. 
+ char buf[32]; + + memset(buf, 0, sizeof(buf)); + ASSERT_EQ(0x8a9136aaU, Value(buf, sizeof(buf))); + + memset(buf, 0xff, sizeof(buf)); + ASSERT_EQ(0x62a8ab43U, Value(buf, sizeof(buf))); + + for (int i = 0; i < 32; i++) { + buf[i] = i; + } + ASSERT_EQ(0x46dd794eU, Value(buf, sizeof(buf))); + + for (int i = 0; i < 32; i++) { + buf[i] = 31 - i; + } + ASSERT_EQ(0x113fdb5cU, Value(buf, sizeof(buf))); + + unsigned char data[48] = { + 0x01, 0xc0, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x14, + 0x00, 0x00, 0x00, 0x18, + 0x28, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + }; + ASSERT_EQ(0xd9963a56, Value(reinterpret_cast(data), sizeof(data))); +} + +TEST(CRC, Values) { + ASSERT_NE(Value("a", 1), Value("foo", 3)); +} + +TEST(CRC, Extend) { + ASSERT_EQ(Value("hello world", 11), + Extend(Value("hello ", 6), "world", 5)); +} + +TEST(CRC, Mask) { + uint32_t crc = Value("foo", 3); + ASSERT_NE(crc, Mask(crc)); + ASSERT_NE(crc, Mask(Mask(crc))); + ASSERT_EQ(crc, Unmask(Mask(crc))); + ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc))))); +} + +} // namespace crc32c +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/env.cc b/util/env.cc new file mode 100644 index 00000000..bd19d48e --- /dev/null +++ b/util/env.cc @@ -0,0 +1,142 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/env.h" +#include "rocksdb/options.h" + +namespace rocksdb { + +Env::~Env() { +} + +SequentialFile::~SequentialFile() { +} + +RandomAccessFile::~RandomAccessFile() { +} + +WritableFile::~WritableFile() { +} + +Logger::~Logger() { +} + +FileLock::~FileLock() { +} + +void LogFlush(Logger *info_log) { + if (info_log) { + info_log->Flush(); + } +} + +void Log(Logger* info_log, const char* format, ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->Logv(format, ap); + va_end(ap); + } +} + +void LogFlush(const shared_ptr& info_log) { + if (info_log) { + info_log->Flush(); + } +} + +void Log(const shared_ptr& info_log, const char* format, ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->Logv(format, ap); + va_end(ap); + } +} + +static Status DoWriteStringToFile(Env* env, const Slice& data, + const std::string& fname, + bool should_sync) { + unique_ptr file; + EnvOptions soptions; + Status s = env->NewWritableFile(fname, &file, soptions); + if (!s.ok()) { + return s; + } + s = file->Append(data); + if (s.ok() && should_sync) { + s = file->Sync(); + } + if (!s.ok()) { + env->DeleteFile(fname); + } + return s; +} + +Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname) { + return DoWriteStringToFile(env, data, fname, false); +} + +Status WriteStringToFileSync(Env* env, const Slice& data, + const std::string& fname) { + return DoWriteStringToFile(env, data, fname, true); +} + +Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { + EnvOptions soptions; + data->clear(); + unique_ptr file; + Status s = env->NewSequentialFile(fname, &file, soptions); + if (!s.ok()) { + return s; + } + static const int kBufferSize = 8192; + char* space = new char[kBufferSize]; + while (true) { + Slice fragment; + s = file->Read(kBufferSize, &fragment, space); + if (!s.ok()) { + break; + } + data->append(fragment.data(), fragment.size()); + if (fragment.empty()) { + break; + 
} + } + delete[] space; + return s; +} + +EnvWrapper::~EnvWrapper() { +} + +namespace { // anonymous namespace + +void AssignEnvOptions(EnvOptions* env_options, const Options& options) { + env_options->use_os_buffer = options.allow_os_buffer; + env_options->use_mmap_reads = options.allow_mmap_reads; + env_options->use_mmap_writes = options.allow_mmap_writes; + env_options->set_fd_cloexec = options.is_fd_close_on_exec; + env_options->bytes_per_sync = options.bytes_per_sync; +} + +} + +EnvOptions::EnvOptions(const Options& options) { + AssignEnvOptions(this, options); +} + +EnvOptions::EnvOptions() { + Options options; + AssignEnvOptions(this, options); +} + + +} // namespace rocksdb diff --git a/util/env_hdfs.cc b/util/env_hdfs.cc new file mode 100644 index 00000000..0f8fe0d1 --- /dev/null +++ b/util/env_hdfs.cc @@ -0,0 +1,517 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#ifdef USE_HDFS +#ifndef ROCKSDB_HDFS_FILE_C +#define ROCKSDB_HDFS_FILE_C + +#include +#include +#include +#include +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "hdfs/hdfs.h" +#include "hdfs/env_hdfs.h" + +// +// This file defines an HDFS environment for rocksdb. It uses the libhdfs +// api to access HDFS. All HDFS files created by one instance of rocksdb +// will reside on the same HDFS cluster. +// + +namespace rocksdb { + +namespace { + +// Log error message +static Status IOError(const std::string& context, int err_number) { + return Status::IOError(context, strerror(err_number)); +} + +// assume that there is one global logger for now. It is not thread-safe, +// but need not be because the logger is initialized at db-open time. +static Logger* mylog = nullptr; + +// Used for reading a file from HDFS. 
It implements both sequential-read +// access methods as well as random read access methods. +class HdfsReadableFile: virtual public SequentialFile, virtual public RandomAccessFile { + private: + hdfsFS fileSys_; + std::string filename_; + hdfsFile hfile_; + + public: + HdfsReadableFile(hdfsFS fileSys, const std::string& fname) + : fileSys_(fileSys), filename_(fname), hfile_(nullptr) { + Log(mylog, "[hdfs] HdfsReadableFile opening file %s\n", + filename_.c_str()); + hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_RDONLY, 0, 0, 0); + Log(mylog, "[hdfs] HdfsReadableFile opened file %s hfile_=0x%p\n", + filename_.c_str(), hfile_); + } + + virtual ~HdfsReadableFile() { + Log(mylog, "[hdfs] HdfsReadableFile closing file %s\n", + filename_.c_str()); + hdfsCloseFile(fileSys_, hfile_); + Log(mylog, "[hdfs] HdfsReadableFile closed file %s\n", + filename_.c_str()); + hfile_ = nullptr; + } + + bool isValid() { + return hfile_ != nullptr; + } + + // sequential access, read data at current offset in file + virtual Status Read(size_t n, Slice* result, char* scratch) { + Status s; + Log(mylog, "[hdfs] HdfsReadableFile reading %s %ld\n", + filename_.c_str(), n); + size_t bytes_read = hdfsRead(fileSys_, hfile_, scratch, (tSize)n); + Log(mylog, "[hdfs] HdfsReadableFile read %s\n", filename_.c_str()); + *result = Slice(scratch, bytes_read); + if (bytes_read < n) { + if (feof()) { + // We leave status as ok if we hit the end of the file + } else { + // A partial read with an error: return a non-ok status + s = IOError(filename_, errno); + } + } + return s; + } + + // random access, read data from specified offset in file + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + Log(mylog, "[hdfs] HdfsReadableFile preading %s\n", filename_.c_str()); + ssize_t bytes_read = hdfsPread(fileSys_, hfile_, offset, + (void*)scratch, (tSize)n); + Log(mylog, "[hdfs] HdfsReadableFile pread %s\n", filename_.c_str()); + *result = Slice(scratch, 
(bytes_read < 0) ? 0 : bytes_read); + if (bytes_read < 0) { + // An error: return a non-ok status + s = IOError(filename_, errno); + } + return s; + } + + virtual Status Skip(uint64_t n) { + Log(mylog, "[hdfs] HdfsReadableFile skip %s\n", filename_.c_str()); + // get current offset from file + tOffset current = hdfsTell(fileSys_, hfile_); + if (current < 0) { + return IOError(filename_, errno); + } + // seek to new offset in file + tOffset newoffset = current + n; + int val = hdfsSeek(fileSys_, hfile_, newoffset); + if (val < 0) { + return IOError(filename_, errno); + } + return Status::OK(); + } + + private: + + // returns true if we are at the end of file, false otherwise + bool feof() { + Log(mylog, "[hdfs] HdfsReadableFile feof %s\n", filename_.c_str()); + if (hdfsTell(fileSys_, hfile_) == fileSize()) { + return true; + } + return false; + } + + // the current size of the file + tOffset fileSize() { + Log(mylog, "[hdfs] HdfsReadableFile fileSize %s\n", filename_.c_str()); + hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, filename_.c_str()); + tOffset size = 0L; + if (pFileInfo != nullptr) { + size = pFileInfo->mSize; + hdfsFreeFileInfo(pFileInfo, 1); + } else { + throw rocksdb::HdfsFatalException("fileSize on unknown file " + + filename_); + } + return size; + } +}; + +// Appends to an existing file in HDFS. 
+class HdfsWritableFile: public WritableFile { + private: + hdfsFS fileSys_; + std::string filename_; + hdfsFile hfile_; + + public: + HdfsWritableFile(hdfsFS fileSys, const std::string& fname) + : fileSys_(fileSys), filename_(fname) , hfile_(nullptr) { + Log(mylog, "[hdfs] HdfsWritableFile opening %s\n", filename_.c_str()); + hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_WRONLY, 0, 0, 0); + Log(mylog, "[hdfs] HdfsWritableFile opened %s\n", filename_.c_str()); + assert(hfile_ != nullptr); + } + virtual ~HdfsWritableFile() { + if (hfile_ != nullptr) { + Log(mylog, "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str()); + hdfsCloseFile(fileSys_, hfile_); + Log(mylog, "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str()); + hfile_ = nullptr; + } + } + + // If the file was successfully created, then this returns true. + // Otherwise returns false. + bool isValid() { + return hfile_ != nullptr; + } + + // The name of the file, mostly needed for debug logging. + const std::string& getName() { + return filename_; + } + + virtual Status Append(const Slice& data) { + Log(mylog, "[hdfs] HdfsWritableFile Append %s\n", filename_.c_str()); + const char* src = data.data(); + size_t left = data.size(); + size_t ret = hdfsWrite(fileSys_, hfile_, src, left); + Log(mylog, "[hdfs] HdfsWritableFile Appended %s\n", filename_.c_str()); + if (ret != left) { + return IOError(filename_, errno); + } + return Status::OK(); + } + + virtual Status Flush() { + return Status::OK(); + } + + virtual Status Sync() { + Status s; + Log(mylog, "[hdfs] HdfsWritableFile Sync %s\n", filename_.c_str()); + if (hdfsFlush(fileSys_, hfile_) == -1) { + return IOError(filename_, errno); + } + if (hdfsSync(fileSys_, hfile_) == -1) { + return IOError(filename_, errno); + } + Log(mylog, "[hdfs] HdfsWritableFile Synced %s\n", filename_.c_str()); + return Status::OK(); + } + + // This is used by HdfsLogger to write data to the debug log file + virtual Status Append(const char* src, size_t size) { + if 
(hdfsWrite(fileSys_, hfile_, src, size) != (tSize)size) { + return IOError(filename_, errno); + } + return Status::OK(); + } + + virtual Status Close() { + Log(mylog, "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str()); + if (hdfsCloseFile(fileSys_, hfile_) != 0) { + return IOError(filename_, errno); + } + Log(mylog, "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str()); + hfile_ = nullptr; + return Status::OK(); + } +}; + +// The object that implements the debug logs to reside in HDFS. +class HdfsLogger : public Logger { + private: + HdfsWritableFile* file_; + uint64_t (*gettid_)(); // Return the thread id for the current thread + + public: + HdfsLogger(HdfsWritableFile* f, uint64_t (*gettid)()) + : file_(f), gettid_(gettid) { + Log(mylog, "[hdfs] HdfsLogger opened %s\n", + file_->getName().c_str()); + } + + virtual ~HdfsLogger() { + Log(mylog, "[hdfs] HdfsLogger closed %s\n", + file_->getName().c_str()); + delete file_; + if (mylog != nullptr && mylog == this) { + mylog = nullptr; + } + } + + virtual void Logv(const char* format, va_list ap) { + const uint64_t thread_id = (*gettid_)(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. 
+ char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + struct timeval now_tv; + gettimeofday(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + p += snprintf(p, limit - p, + "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.tm_year + 1900, + t.tm_mon + 1, + t.tm_mday, + t.tm_hour, + t.tm_min, + t.tm_sec, + static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + file_->Append(base, p-base); + file_->Flush(); + if (base != buffer) { + delete[] base; + } + break; + } + } +}; + +} // namespace + +// Finally, the hdfs environment + +// open a file for sequential reading +Status HdfsEnv::NewSequentialFile(const std::string& fname, + SequentialFile** result) { + HdfsReadableFile* f = new HdfsReadableFile(fileSys_, fname); + if (f == nullptr) { + *result = nullptr; + return IOError(fname, errno); + } + *result = dynamic_cast(f); + return Status::OK(); +} + +// open a file for random reading +Status HdfsEnv::NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result) { + HdfsReadableFile* f = new HdfsReadableFile(fileSys_, fname); + if (f == nullptr) { + *result = nullptr; + return IOError(fname, errno); + } + *result = dynamic_cast(f); + return Status::OK(); +} + +// create a new file for writing +Status HdfsEnv::NewWritableFile(const std::string& fname, + 
WritableFile** result) { + Status s; + HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname); + if (f == nullptr || !f->isValid()) { + *result = nullptr; + return IOError(fname, errno); + } + *result = dynamic_cast(f); + return Status::OK(); +} + +Status HdfsEnv::NewRandomRWFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + return Status::NotSupported("NewRandomRWFile not supported on HdfsEnv"); +} + +bool HdfsEnv::FileExists(const std::string& fname) { + int value = hdfsExists(fileSys_, fname.c_str()); + if (value == 0) { + return true; + } + return false; +} + +Status HdfsEnv::GetChildren(const std::string& path, + std::vector* result) { + int value = hdfsExists(fileSys_, path.c_str()); + switch (value) { + case 0: { + int numEntries = 0; + hdfsFileInfo* pHdfsFileInfo = 0; + pHdfsFileInfo = hdfsListDirectory(fileSys_, path.c_str(), &numEntries); + if (numEntries >= 0) { + for(int i = 0; i < numEntries; i++) { + char* pathname = pHdfsFileInfo[i].mName; + char* filename = rindex(pathname, '/'); + if (filename != nullptr) { + result->push_back(filename+1); + } + } + if (pHdfsFileInfo != nullptr) { + hdfsFreeFileInfo(pHdfsFileInfo, numEntries); + } + } else { + // numEntries < 0 indicates error + Log(mylog, "hdfsListDirectory call failed with error "); + throw HdfsFatalException("hdfsListDirectory call failed negative error.\n"); + } + break; + } + case 1: // directory does not exist, exit + break; + default: // anything else should be an error + Log(mylog, "hdfsListDirectory call failed with error "); + throw HdfsFatalException("hdfsListDirectory call failed with error.\n"); + } + return Status::OK(); +} + +Status HdfsEnv::DeleteFile(const std::string& fname) { + if (hdfsDelete(fileSys_, fname.c_str()) == 0) { + return Status::OK(); + } + return IOError(fname, errno); +}; + +Status HdfsEnv::CreateDir(const std::string& name) { + if (hdfsCreateDirectory(fileSys_, name.c_str()) == 0) { + return Status::OK(); + } + return 
IOError(name, errno); +}; + +Status HdfsEnv::CreateDirIfMissing(const std::string& name) { + const int value = hdfsExists(fileSys_, name.c_str()); + // Not atomic. state might change b/w hdfsExists and CreateDir. + if (value == 0) { + return Status::OK(); + } else { + return CreateDir(name); + } +}; + +Status HdfsEnv::DeleteDir(const std::string& name) { + return DeleteFile(name); +}; + +Status HdfsEnv::GetFileSize(const std::string& fname, uint64_t* size) { + *size = 0L; + hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, fname.c_str()); + if (pFileInfo != nullptr) { + *size = pFileInfo->mSize; + hdfsFreeFileInfo(pFileInfo, 1); + return Status::OK(); + } + return IOError(fname, errno); +} + +Status HdfsEnv::GetFileModificationTime(const std::string& fname, + uint64_t* time) { + hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, fname.c_str()); + if (pFileInfo != nullptr) { + *time = static_cast(pFileInfo->mLastMod); + hdfsFreeFileInfo(pFileInfo, 1); + return Status::OK(); + } + return IOError(fname, errno); + +} + +// The rename is not atomic. HDFS does not allow a renaming if the +// target already exists. So, we delete the target before attemting the +// rename. 
+Status HdfsEnv::RenameFile(const std::string& src, const std::string& target) { + hdfsDelete(fileSys_, target.c_str()); + if (hdfsRename(fileSys_, src.c_str(), target.c_str()) == 0) { + return Status::OK(); + } + return IOError(src, errno); +} + +Status HdfsEnv::LockFile(const std::string& fname, FileLock** lock) { + // there isn's a very good way to atomically check and create + // a file via libhdfs + *lock = nullptr; + return Status::OK(); +} + +Status HdfsEnv::UnlockFile(FileLock* lock) { + return Status::OK(); +} + +Status HdfsEnv::NewLogger(const std::string& fname, + shared_ptr* result) { + HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname); + if (f == nullptr || !f->isValid()) { + *result = nullptr; + return IOError(fname, errno); + } + HdfsLogger* h = new HdfsLogger(f, &HdfsEnv::gettid); + *result = h; + if (mylog == nullptr) { + // mylog = h; // uncomment this for detailed logging + } + return Status::OK(); +} + +} // namespace rocksdb + +#endif // ROCKSDB_HDFS_FILE_C + +#else // USE_HDFS + +// dummy placeholders used when HDFS is not available +#include "rocksdb/env.h" +#include "hdfs/env_hdfs.h" +namespace rocksdb { + Status HdfsEnv::NewSequentialFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + return Status::NotSupported("Not compiled with hdfs support"); + } +} + +#endif diff --git a/util/env_posix.cc b/util/env_posix.cc new file mode 100644 index 00000000..2be524e9 --- /dev/null +++ b/util/env_posix.cc @@ -0,0 +1,1511 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef OS_LINUX +#include +#endif +#include +#include +#include +#include +#if defined(OS_LINUX) +#include +#include +#endif +#if defined(LEVELDB_PLATFORM_ANDROID) +#include +#endif +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "port/port.h" +#include "util/coding.h" +#include "util/logging.h" +#include "util/posix_logger.h" +#include "util/random.h" +#include + +// Get nano time for mach systems +#ifdef __MACH__ +#include +#include +#endif + +#if !defined(TMPFS_MAGIC) +#define TMPFS_MAGIC 0x01021994 +#endif +#if !defined(XFS_SUPER_MAGIC) +#define XFS_SUPER_MAGIC 0x58465342 +#endif +#if !defined(EXT4_SUPER_MAGIC) +#define EXT4_SUPER_MAGIC 0xEF53 +#endif + +// For non linux platform, the following macros are used only as place +// holder. +#ifndef OS_LINUX +#define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */ +#define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */ +#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */ +#define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */ +#define POSIX_FADV_DONTNEED 4 /* [MC1] dont need these pages */ +#endif + +// This is only set from db_stress.cc and for testing only. +// If non-zero, kill at various points in source code with probability 1/this +int rocksdb_kill_odds = 0; + +namespace rocksdb { + +namespace { + +// A wrapper for fadvise, if the platform doesn't support fadvise, +// it will simply return Status::NotSupport. +int Fadvise(int fd, off_t offset, size_t len, int advice) { +#ifdef OS_LINUX + return posix_fadvise(fd, offset, len, advice); +#else + return 0; // simply do nothing. 
+#endif +} + +// list of pathnames that are locked +static std::set lockedFiles; +static port::Mutex mutex_lockedFiles; + +static Status IOError(const std::string& context, int err_number) { + return Status::IOError(context, strerror(err_number)); +} + +#ifdef NDEBUG +// empty in release build +#define TEST_KILL_RANDOM(rocksdb_kill_odds) +#else + +// Kill the process with probablity 1/odds for testing. +static void TestKillRandom(int odds, const std::string& srcfile, + int srcline) { + time_t curtime = time(nullptr); + Random r((uint32_t)curtime); + + assert(odds > 0); + bool crash = r.OneIn(odds); + if (crash) { + fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline); + fflush(stdout); + kill(getpid(), SIGTERM); + } +} + +// To avoid crashing always at some frequently executed codepaths (during +// kill random test), use this factor to reduce odds +#define REDUCE_ODDS 2 +#define REDUCE_ODDS2 4 + +#define TEST_KILL_RANDOM(rocksdb_kill_odds) { \ + if (rocksdb_kill_odds > 0) { \ + TestKillRandom(rocksdb_kill_odds, __FILE__, __LINE__); \ + } \ +} + +#endif + +#if defined(OS_LINUX) +namespace { + static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) { + if (max_size < kMaxVarint64Length*3) { + return 0; + } + + struct stat buf; + int result = fstat(fd, &buf); + if (result == -1) { + return 0; + } + + long version = 0; + result = ioctl(fd, FS_IOC_GETVERSION, &version); + if (result == -1) { + return 0; + } + uint64_t uversion = (uint64_t)version; + + char* rid = id; + rid = EncodeVarint64(rid, buf.st_dev); + rid = EncodeVarint64(rid, buf.st_ino); + rid = EncodeVarint64(rid, uversion); + assert(rid >= id); + return static_cast(rid-id); + } +} +#endif + +class PosixSequentialFile: public SequentialFile { + private: + std::string filename_; + FILE* file_; + int fd_; + bool use_os_buffer_; + + public: + PosixSequentialFile(const std::string& fname, FILE* f, + const EnvOptions& options) + : filename_(fname), file_(f), fd_(fileno(f)), + 
use_os_buffer_(options.use_os_buffer) { + } + virtual ~PosixSequentialFile() { fclose(file_); } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + Status s; + size_t r = fread_unlocked(scratch, 1, n, file_); + *result = Slice(scratch, r); + if (r < n) { + if (feof(file_)) { + // We leave status as ok if we hit the end of the file + } else { + // A partial read with an error: return a non-ok status + s = IOError(filename_, errno); + } + } + if (!use_os_buffer_) { + // we need to fadvise away the entire range of pages because + // we do not want readahead pages to be cached. + Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages + } + return s; + } + + virtual Status Skip(uint64_t n) { + if (fseek(file_, n, SEEK_CUR)) { + return IOError(filename_, errno); + } + return Status::OK(); + } + + virtual Status InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + return Status::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return Status::OK(); + } + return IOError(filename_, errno); +#endif + } +}; + +// pread() based random-access +class PosixRandomAccessFile: public RandomAccessFile { + private: + std::string filename_; + int fd_; + bool use_os_buffer_; + + public: + PosixRandomAccessFile(const std::string& fname, int fd, + const EnvOptions& options) + : filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) { + assert(!options.use_mmap_reads); + } + virtual ~PosixRandomAccessFile() { close(fd_); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + ssize_t r = pread(fd_, scratch, n, static_cast(offset)); + *result = Slice(scratch, (r < 0) ? 0 : r); + if (r < 0) { + // An error: return a non-ok status + s = IOError(filename_, errno); + } + if (!use_os_buffer_) { + // we need to fadvise away the entire range of pages because + // we do not want readahead pages to be cached. 
+ Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages + } + return s; + } + +#ifdef OS_LINUX + virtual size_t GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(fd_, id, max_size); + } +#endif + + virtual void Hint(AccessPattern pattern) { + switch(pattern) { + case NORMAL: + Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL); + break; + case RANDOM: + Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM); + break; + case SEQUENTIAL: + Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); + break; + case WILLNEED: + Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED); + break; + case DONTNEED: + Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); + break; + default: + assert(false); + break; + } + } + + virtual Status InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + return Status::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return Status::OK(); + } + return IOError(filename_, errno); +#endif + } +}; + +// mmap() based random-access +class PosixMmapReadableFile: public RandomAccessFile { + private: + int fd_; + std::string filename_; + void* mmapped_region_; + size_t length_; + + public: + // base[0,length-1] contains the mmapped contents of the file. 
+ PosixMmapReadableFile(const int fd, const std::string& fname, + void* base, size_t length, + const EnvOptions& options) + : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) { + fd_ = fd_ + 0; // suppress the warning for used variables + assert(options.use_mmap_reads); + assert(options.use_os_buffer); + } + virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + if (offset + n > length_) { + *result = Slice(); + s = IOError(filename_, EINVAL); + } else { + *result = Slice(reinterpret_cast(mmapped_region_) + offset, n); + } + return s; + } + virtual Status InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + return Status::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return Status::OK(); + } + return IOError(filename_, errno); +#endif + } +}; + +// We preallocate up to an extra megabyte and use memcpy to append new +// data to the file. This is safe since we either properly close the +// file before reading from it, or for log files, the reading code +// knows enough to skip zero suffixes. +class PosixMmapFile : public WritableFile { + private: + std::string filename_; + int fd_; + size_t page_size_; + size_t map_size_; // How much extra memory to map at a time + char* base_; // The mapped region + char* limit_; // Limit of the mapped region + char* dst_; // Where to write next (in range [base_,limit_]) + char* last_sync_; // Where have we synced up to + uint64_t file_offset_; // Offset of base_ in file + + // Have we done an munmap of unsynced data? 
+ bool pending_sync_; + + // Roundup x to a multiple of y + static size_t Roundup(size_t x, size_t y) { + return ((x + y - 1) / y) * y; + } + + size_t TruncateToPageBoundary(size_t s) { + s -= (s & (page_size_ - 1)); + assert((s % page_size_) == 0); + return s; + } + + bool UnmapCurrentRegion() { + bool result = true; + TEST_KILL_RANDOM(rocksdb_kill_odds); + if (base_ != nullptr) { + if (last_sync_ < limit_) { + // Defer syncing this data until next Sync() call, if any + pending_sync_ = true; + } + if (munmap(base_, limit_ - base_) != 0) { + result = false; + } + file_offset_ += limit_ - base_; + base_ = nullptr; + limit_ = nullptr; + last_sync_ = nullptr; + dst_ = nullptr; + + // Increase the amount we map the next time, but capped at 1MB + if (map_size_ < (1<<20)) { + map_size_ *= 2; + } + } + return result; + } + + Status MapNewRegion() { +#ifdef ROCKSDB_FALLOCATE_PRESENT + assert(base_ == nullptr); + + TEST_KILL_RANDOM(rocksdb_kill_odds); + int alloc_status = posix_fallocate(fd_, file_offset_, map_size_); + if (alloc_status != 0) { + return Status::IOError("Error allocating space to file : " + filename_ + + "Error : " + strerror(alloc_status)); + } + + TEST_KILL_RANDOM(rocksdb_kill_odds); + void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, + fd_, file_offset_); + if (ptr == MAP_FAILED) { + return Status::IOError("MMap failed on " + filename_); + } + + TEST_KILL_RANDOM(rocksdb_kill_odds); + + base_ = reinterpret_cast(ptr); + limit_ = base_ + map_size_; + dst_ = base_; + last_sync_ = base_; + return Status::OK(); +#else + return Status::NotSupported("This platform doesn't support fallocate()"); +#endif + } + + public: + PosixMmapFile(const std::string& fname, int fd, size_t page_size, + const EnvOptions& options) + : filename_(fname), + fd_(fd), + page_size_(page_size), + map_size_(Roundup(65536, page_size)), + base_(nullptr), + limit_(nullptr), + dst_(nullptr), + last_sync_(nullptr), + file_offset_(0), + pending_sync_(false) { + 
assert((page_size & (page_size - 1)) == 0); + assert(options.use_mmap_writes); + } + + + ~PosixMmapFile() { + if (fd_ >= 0) { + PosixMmapFile::Close(); + } + } + + virtual Status Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS); + PrepareWrite(GetFileSize(), left); + while (left > 0) { + assert(base_ <= dst_); + assert(dst_ <= limit_); + size_t avail = limit_ - dst_; + if (avail == 0) { + if (UnmapCurrentRegion()) { + Status s = MapNewRegion(); + if (!s.ok()) { + return s; + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + } + } + + size_t n = (left <= avail) ? left : avail; + memcpy(dst_, src, n); + dst_ += n; + src += n; + left -= n; + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + return Status::OK(); + } + + virtual Status Close() { + Status s; + size_t unused = limit_ - dst_; + + TEST_KILL_RANDOM(rocksdb_kill_odds); + + if (!UnmapCurrentRegion()) { + s = IOError(filename_, errno); + } else if (unused > 0) { + // Trim the extra space at the end of the file + if (ftruncate(fd_, file_offset_ - unused) < 0) { + s = IOError(filename_, errno); + } + } + + TEST_KILL_RANDOM(rocksdb_kill_odds); + + if (close(fd_) < 0) { + if (s.ok()) { + s = IOError(filename_, errno); + } + } + + fd_ = -1; + base_ = nullptr; + limit_ = nullptr; + return s; + } + + virtual Status Flush() { + TEST_KILL_RANDOM(rocksdb_kill_odds); + return Status::OK(); + } + + virtual Status Sync() { + Status s; + + if (pending_sync_) { + // Some unmapped data was not synced + TEST_KILL_RANDOM(rocksdb_kill_odds); + pending_sync_ = false; + if (fdatasync(fd_) < 0) { + s = IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS); + } + + if (dst_ > last_sync_) { + // Find the beginnings of the pages that contain the first and last + // bytes to be synced. 
+ size_t p1 = TruncateToPageBoundary(last_sync_ - base_); + size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); + last_sync_ = dst_; + TEST_KILL_RANDOM(rocksdb_kill_odds); + if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { + s = IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + } + + return s; + } + + /** + * Flush data as well as metadata to stable storage. + */ + virtual Status Fsync() { + if (pending_sync_) { + // Some unmapped data was not synced + TEST_KILL_RANDOM(rocksdb_kill_odds); + pending_sync_ = false; + if (fsync(fd_) < 0) { + return IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + } + // This invocation to Sync will not issue the call to + // fdatasync because pending_sync_ has already been cleared. + return Sync(); + } + + /** + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. + */ + virtual uint64_t GetFileSize() { + size_t used = dst_ - base_; + return file_offset_ + used; + } + + virtual Status InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + return Status::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return Status::OK(); + } + return IOError(filename_, errno); +#endif + } + +#ifdef ROCKSDB_FALLOCATE_PRESENT + virtual Status Allocate(off_t offset, off_t len) { + TEST_KILL_RANDOM(rocksdb_kill_odds); + if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } + } +#endif +}; + +// Use posix write to write data to a file. 
+class PosixWritableFile : public WritableFile { + private: + const std::string filename_; + int fd_; + size_t cursize_; // current size of cached data in buf_ + size_t capacity_; // max size of buf_ + unique_ptr buf_; // a buffer to cache writes + uint64_t filesize_; + bool pending_sync_; + bool pending_fsync_; + uint64_t last_sync_size_; + uint64_t bytes_per_sync_; + + public: + PosixWritableFile(const std::string& fname, int fd, size_t capacity, + const EnvOptions& options) : + filename_(fname), + fd_(fd), + cursize_(0), + capacity_(capacity), + buf_(new char[capacity]), + filesize_(0), + pending_sync_(false), + pending_fsync_(false), + last_sync_size_(0), + bytes_per_sync_(options.bytes_per_sync) { + assert(!options.use_mmap_writes); + } + + ~PosixWritableFile() { + if (fd_ >= 0) { + PosixWritableFile::Close(); + } + } + + virtual Status Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + Status s; + pending_sync_ = true; + pending_fsync_ = true; + + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); + + PrepareWrite(GetFileSize(), left); + // if there is no space in the cache, then flush + if (cursize_ + left > capacity_) { + s = Flush(); + if (!s.ok()) { + return s; + } + // Increase the buffer size, but capped at 1MB + if (capacity_ < (1<<20)) { + capacity_ *= 2; + buf_.reset(new char[capacity_]); + } + assert(cursize_ == 0); + } + + // if the write fits into the cache, then write to cache + // otherwise do a write() syscall to write to OS buffers. 
+ if (cursize_ + left <= capacity_) { + memcpy(buf_.get()+cursize_, src, left); + cursize_ += left; + } else { + while (left != 0) { + ssize_t done = write(fd_, src, left); + if (done < 0) { + return IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + + left -= done; + src += done; + } + } + filesize_ += data.size(); + return Status::OK(); + } + + virtual Status Close() { + Status s; + s = Flush(); // flush cache to OS + if (!s.ok()) { + } + + TEST_KILL_RANDOM(rocksdb_kill_odds); + + if (close(fd_) < 0) { + if (s.ok()) { + s = IOError(filename_, errno); + } + } + fd_ = -1; + return s; + } + + // write out the cached data to the OS cache + virtual Status Flush() { + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); + size_t left = cursize_; + char* src = buf_.get(); + while (left != 0) { + ssize_t done = write(fd_, src, left); + if (done < 0) { + return IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); + left -= done; + src += done; + } + cursize_ = 0; + + // sync OS cache to disk for every bytes_per_sync_ + // TODO: give log file and sst file different options (log + // files could be potentially cached in OS for their whole + // life time, thus we might not want to flush at all). 
+ if (bytes_per_sync_ && + filesize_ - last_sync_size_ >= bytes_per_sync_) { + RangeSync(last_sync_size_, filesize_ - last_sync_size_); + last_sync_size_ = filesize_; + } + + return Status::OK(); + } + + virtual Status Sync() { + TEST_KILL_RANDOM(rocksdb_kill_odds); + if (pending_sync_ && fdatasync(fd_) < 0) { + return IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + pending_sync_ = false; + return Status::OK(); + } + + virtual Status Fsync() { + TEST_KILL_RANDOM(rocksdb_kill_odds); + if (pending_fsync_ && fsync(fd_) < 0) { + return IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + pending_fsync_ = false; + pending_sync_ = false; + return Status::OK(); + } + + virtual uint64_t GetFileSize() { + return filesize_; + } + + virtual Status InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + return Status::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return Status::OK(); + } + return IOError(filename_, errno); +#endif + } + +#ifdef ROCKSDB_FALLOCATE_PRESENT + virtual Status Allocate(off_t offset, off_t len) { + TEST_KILL_RANDOM(rocksdb_kill_odds); + if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } + } + + virtual Status RangeSync(off64_t offset, off64_t nbytes) { + if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } + } + virtual size_t GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(fd_, id, max_size); + } +#endif +}; + +class PosixRandomRWFile : public RandomRWFile { + private: + const std::string filename_; + int fd_; + bool pending_sync_; + bool pending_fsync_; + + public: + PosixRandomRWFile(const std::string& fname, int fd, + const EnvOptions& options) : + filename_(fname), + fd_(fd), + pending_sync_(false), + pending_fsync_(false) { + 
assert(!options.use_mmap_writes && !options.use_mmap_reads); + } + + ~PosixRandomRWFile() { + if (fd_ >= 0) { + Close(); + } + } + + virtual Status Write(uint64_t offset, const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + Status s; + pending_sync_ = true; + pending_fsync_ = true; + + while (left != 0) { + ssize_t done = pwrite(fd_, src, left, offset); + if (done < 0) { + return IOError(filename_, errno); + } + + left -= done; + src += done; + offset += done; + } + + return Status::OK(); + } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + ssize_t r = pread(fd_, scratch, n, static_cast(offset)); + *result = Slice(scratch, (r < 0) ? 0 : r); + if (r < 0) { + s = IOError(filename_, errno); + } + return s; + } + + virtual Status Close() { + Status s = Status::OK(); + if (fd_ >= 0 && close(fd_) < 0) { + s = IOError(filename_, errno); + } + fd_ = -1; + return s; + } + + virtual Status Sync() { + if (pending_sync_ && fdatasync(fd_) < 0) { + return IOError(filename_, errno); + } + pending_sync_ = false; + return Status::OK(); + } + + virtual Status Fsync() { + if (pending_fsync_ && fsync(fd_) < 0) { + return IOError(filename_, errno); + } + pending_fsync_ = false; + pending_sync_ = false; + return Status::OK(); + } + +#ifdef ROCKSDB_FALLOCATE_PRESENT + virtual Status Allocate(off_t offset, off_t len) { + if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } + } +#endif +}; + +static int LockOrUnlock(const std::string& fname, int fd, bool lock) { + mutex_lockedFiles.Lock(); + if (lock) { + // If it already exists in the lockedFiles set, then it is already locked, + // and fail this lock attempt. Otherwise, insert it into lockedFiles. + // This check is needed because fcntl() does not detect lock conflict + // if the fcntl is issued by the same thread that earlier acquired + // this lock. 
+ if (lockedFiles.insert(fname).second == false) { + mutex_lockedFiles.Unlock(); + errno = ENOLCK; + return -1; + } + } else { + // If we are unlocking, then verify that we had locked it earlier, + // it should already exist in lockedFiles. Remove it from lockedFiles. + if (lockedFiles.erase(fname) != 1) { + mutex_lockedFiles.Unlock(); + errno = ENOLCK; + return -1; + } + } + errno = 0; + struct flock f; + memset(&f, 0, sizeof(f)); + f.l_type = (lock ? F_WRLCK : F_UNLCK); + f.l_whence = SEEK_SET; + f.l_start = 0; + f.l_len = 0; // Lock/unlock entire file + int value = fcntl(fd, F_SETLK, &f); + if (value == -1 && lock) { + // if there is an error in locking, then remove the pathname from lockedfiles + lockedFiles.erase(fname); + } + mutex_lockedFiles.Unlock(); + return value; +} + +class PosixFileLock : public FileLock { + public: + int fd_; + std::string filename; +}; + + +namespace { +void PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + exit(1); + } +} +} + +class PosixEnv : public Env { + public: + PosixEnv(); + + virtual ~PosixEnv(){ + for (const auto tid : threads_to_join_) { + pthread_join(tid, nullptr); + } + } + + void SetFD_CLOEXEC(int fd, const EnvOptions* options) { + if ((options == nullptr || options->set_fd_cloexec) && fd > 0) { + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); + } + } + + virtual Status NewSequentialFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + result->reset(); + FILE* f = fopen(fname.c_str(), "r"); + if (f == nullptr) { + *result = nullptr; + return IOError(fname, errno); + } else { + int fd = fileno(f); + SetFD_CLOEXEC(fd, &options); + result->reset(new PosixSequentialFile(fname, f, options)); + return Status::OK(); + } + } + + virtual Status NewRandomAccessFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + result->reset(); + Status s; + int fd = open(fname.c_str(), 
O_RDONLY); + SetFD_CLOEXEC(fd, &options); + if (fd < 0) { + s = IOError(fname, errno); + } else if (options.use_mmap_reads && sizeof(void*) >= 8) { + // Use of mmap for random reads has been removed because it + // kills performance when storage is fast. + // Use mmap when virtual address-space is plentiful. + uint64_t size; + s = GetFileSize(fname, &size); + if (s.ok()) { + void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0); + if (base != MAP_FAILED) { + result->reset(new PosixMmapReadableFile(fd, fname, base, + size, options)); + } else { + s = IOError(fname, errno); + } + } + close(fd); + } else { + result->reset(new PosixRandomAccessFile(fname, fd, options)); + } + return s; + } + + virtual Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + result->reset(); + Status s; + const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + if (fd < 0) { + s = IOError(fname, errno); + } else { + SetFD_CLOEXEC(fd, &options); + if (options.use_mmap_writes) { + if (!checkedDiskForMmap_) { + // this will be executed once in the program's lifetime. + // do not use mmapWrite on non ext-3/xfs/tmpfs systems. 
+ if (!SupportsFastAllocate(fname)) { + forceMmapOff = true; + } + checkedDiskForMmap_ = true; + } + } + if (options.use_mmap_writes && !forceMmapOff) { + result->reset(new PosixMmapFile(fname, fd, page_size_, options)); + } else { + // disable mmap writes + EnvOptions no_mmap_writes_options = options; + no_mmap_writes_options.use_mmap_writes = false; + + result->reset( + new PosixWritableFile(fname, fd, 65536, no_mmap_writes_options) + ); + } + } + return s; + } + + virtual Status NewRandomRWFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + result->reset(); + Status s; + const int fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644); + if (fd < 0) { + s = IOError(fname, errno); + } else { + SetFD_CLOEXEC(fd, &options); + // no support for mmap yet + if (options.use_mmap_writes || options.use_mmap_reads) { + return Status::NotSupported("No support for mmap read/write yet"); + } + result->reset(new PosixRandomRWFile(fname, fd, options)); + } + return s; + } + + virtual bool FileExists(const std::string& fname) { + return access(fname.c_str(), F_OK) == 0; + } + + virtual Status GetChildren(const std::string& dir, + std::vector* result) { + result->clear(); + DIR* d = opendir(dir.c_str()); + if (d == nullptr) { + return IOError(dir, errno); + } + struct dirent* entry; + while ((entry = readdir(d)) != nullptr) { + result->push_back(entry->d_name); + } + closedir(d); + return Status::OK(); + } + + virtual Status DeleteFile(const std::string& fname) { + Status result; + if (unlink(fname.c_str()) != 0) { + result = IOError(fname, errno); + } + return result; + }; + + virtual Status CreateDir(const std::string& name) { + Status result; + if (mkdir(name.c_str(), 0755) != 0) { + result = IOError(name, errno); + } + return result; + }; + + virtual Status CreateDirIfMissing(const std::string& name) { + Status result; + if (mkdir(name.c_str(), 0755) != 0) { + if (errno != EEXIST) { + result = IOError(name, errno); + } else if (!DirExists(name)) { 
// Check that name is actually a + // directory. + // Message is taken from mkdir + result = Status::IOError("`"+name+"' exists but is not a directory"); + } + } + return result; + }; + + virtual Status DeleteDir(const std::string& name) { + Status result; + if (rmdir(name.c_str()) != 0) { + result = IOError(name, errno); + } + return result; + }; + + virtual Status GetFileSize(const std::string& fname, uint64_t* size) { + Status s; + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + *size = 0; + s = IOError(fname, errno); + } else { + *size = sbuf.st_size; + } + return s; + } + + virtual Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) { + struct stat s; + if (stat(fname.c_str(), &s) !=0) { + return IOError(fname, errno); + } + *file_mtime = static_cast(s.st_mtime); + return Status::OK(); + } + virtual Status RenameFile(const std::string& src, const std::string& target) { + Status result; + if (rename(src.c_str(), target.c_str()) != 0) { + result = IOError(src, errno); + } + return result; + } + + virtual Status LockFile(const std::string& fname, FileLock** lock) { + *lock = nullptr; + Status result; + int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); + if (fd < 0) { + result = IOError(fname, errno); + } else if (LockOrUnlock(fname, fd, true) == -1) { + result = IOError("lock " + fname, errno); + close(fd); + } else { + SetFD_CLOEXEC(fd, nullptr); + PosixFileLock* my_lock = new PosixFileLock; + my_lock->fd_ = fd; + my_lock->filename = fname; + *lock = my_lock; + } + return result; + } + + virtual Status UnlockFile(FileLock* lock) { + PosixFileLock* my_lock = reinterpret_cast(lock); + Status result; + if (LockOrUnlock(my_lock->filename, my_lock->fd_, false) == -1) { + result = IOError("unlock", errno); + } + close(my_lock->fd_); + delete my_lock; + return result; + } + + virtual void Schedule(void (*function)(void*), void* arg, Priority pri = LOW); + + virtual void StartThread(void (*function)(void* arg), void* arg); + 
+ virtual Status GetTestDirectory(std::string* result) { + const char* env = getenv("TEST_TMPDIR"); + if (env && env[0] != '\0') { + *result = env; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid())); + *result = buf; + } + // Directory may already exist + CreateDir(*result); + return Status::OK(); + } + + static uint64_t gettid() { + pthread_t tid = pthread_self(); + uint64_t thread_id = 0; + memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); + return thread_id; + } + + virtual Status NewLogger(const std::string& fname, + shared_ptr* result) { + FILE* f = fopen(fname.c_str(), "w"); + if (f == nullptr) { + result->reset(); + return IOError(fname, errno); + } else { + int fd = fileno(f); + SetFD_CLOEXEC(fd, nullptr); + result->reset(new PosixLogger(f, &PosixEnv::gettid, this)); + return Status::OK(); + } + } + + virtual uint64_t NowMicros() { + struct timeval tv; + // TODO(kailiu) MAC DON'T HAVE THIS + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; + } + + virtual uint64_t NowNanos() { +#ifdef OS_LINUX + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#elif __MACH__ + clock_serv_t cclock; + mach_timespec_t ts; + host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); + clock_get_time(cclock, &ts); + mach_port_deallocate(mach_task_self(), cclock); +#endif + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; + } + + virtual void SleepForMicroseconds(int micros) { + usleep(micros); + } + + virtual Status GetHostName(char* name, uint64_t len) { + int ret = gethostname(name, len); + if (ret < 0) { + if (errno == EFAULT || errno == EINVAL) + return Status::InvalidArgument(strerror(errno)); + else + return IOError("GetHostName", errno); + } + return Status::OK(); + } + + virtual Status GetCurrentTime(int64_t* unix_time) { + time_t ret = time(nullptr); + if (ret == (time_t) -1) { + return 
IOError("GetCurrentTime", errno); + } + *unix_time = (int64_t) ret; + return Status::OK(); + } + + virtual Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) { + if (db_path.find('/') == 0) { + *output_path = db_path; + return Status::OK(); + } + + char the_path[256]; + char* ret = getcwd(the_path, 256); + if (ret == nullptr) { + return Status::IOError(strerror(errno)); + } + + *output_path = ret; + return Status::OK(); + } + + // Allow increasing the number of worker threads. + virtual void SetBackgroundThreads(int num, Priority pri) { + assert(pri >= Priority::LOW && pri <= Priority::HIGH); + thread_pools_[pri].SetBackgroundThreads(num); + } + + virtual std::string TimeToString(uint64_t secondsSince1970) { + const time_t seconds = (time_t)secondsSince1970; + struct tm t; + int maxsize = 64; + std::string dummy; + dummy.reserve(maxsize); + dummy.resize(maxsize); + char* p = &dummy[0]; + localtime_r(&seconds, &t); + snprintf(p, maxsize, + "%04d/%02d/%02d-%02d:%02d:%02d ", + t.tm_year + 1900, + t.tm_mon + 1, + t.tm_mday, + t.tm_hour, + t.tm_min, + t.tm_sec); + return dummy; + } + + private: + bool checkedDiskForMmap_; + bool forceMmapOff; // do we override Env options? + + + // Returns true iff the named directory exists and is a directory. 
+ virtual bool DirExists(const std::string& dname) { + struct stat statbuf; + if (stat(dname.c_str(), &statbuf) == 0) { + return S_ISDIR(statbuf.st_mode); + } + return false; // stat() failed return false + } + + bool SupportsFastAllocate(const std::string& path) { +#ifdef ROCKSDB_FALLOCATE_PRESENT + struct statfs s; + if (statfs(path.c_str(), &s)){ + return false; + } + switch (s.f_type) { + case EXT4_SUPER_MAGIC: + return true; + case XFS_SUPER_MAGIC: + return true; + case TMPFS_MAGIC: + return true; + default: + return false; + } +#else + return false; +#endif + } + + size_t page_size_; + + + class ThreadPool { + public: + + ThreadPool() : + total_threads_limit_(1), + bgthreads_(0), + queue_(), + exit_all_threads_(false) { + PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); + PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, nullptr)); + } + + ~ThreadPool() { + PthreadCall("lock", pthread_mutex_lock(&mu_)); + assert(!exit_all_threads_); + exit_all_threads_ = true; + PthreadCall("signalall", pthread_cond_broadcast(&bgsignal_)); + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + for (const auto tid : bgthreads_) { + pthread_join(tid, nullptr); + } + } + + void BGThread() { + while (true) { + // Wait until there is an item that is ready to run + PthreadCall("lock", pthread_mutex_lock(&mu_)); + while (queue_.empty() && !exit_all_threads_) { + PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_)); + } + if (exit_all_threads_) { // mechanism to let BG threads exit safely + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + break; + } + void (*function)(void*) = queue_.front().function; + void* arg = queue_.front().arg; + queue_.pop_front(); + + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + (*function)(arg); + } + } + + static void* BGThreadWrapper(void* arg) { + reinterpret_cast(arg)->BGThread(); + return nullptr; + } + + void SetBackgroundThreads(int num) { + PthreadCall("lock", pthread_mutex_lock(&mu_)); + if (num > 
total_threads_limit_) { + total_threads_limit_ = num; + } + assert(total_threads_limit_ > 0); + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + } + + void Schedule(void (*function)(void*), void* arg) { + PthreadCall("lock", pthread_mutex_lock(&mu_)); + + if (exit_all_threads_) { + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + return; + } + // Start background thread if necessary + while ((int)bgthreads_.size() < total_threads_limit_) { + pthread_t t; + PthreadCall( + "create thread", + pthread_create(&t, + nullptr, + &ThreadPool::BGThreadWrapper, + this)); + fprintf(stdout, + "Created bg thread 0x%lx\n", + (unsigned long)t); + + // Set the thread name to aid debugging +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + char name_buf[16]; + snprintf(name_buf, sizeof name_buf, "rocksdb:bg%zu", bgthreads_.size()); + name_buf[sizeof name_buf - 1] = '\0'; + pthread_setname_np(t, name_buf); +#endif +#endif + + bgthreads_.push_back(t); + } + + // Add to priority queue + queue_.push_back(BGItem()); + queue_.back().function = function; + queue_.back().arg = arg; + + // always wake up at least one waiting thread. 
+ PthreadCall("signal", pthread_cond_signal(&bgsignal_)); + + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + } + + private: + // Entry per Schedule() call + struct BGItem { void* arg; void (*function)(void*); }; + typedef std::deque BGQueue; + + pthread_mutex_t mu_; + pthread_cond_t bgsignal_; + int total_threads_limit_; + std::vector bgthreads_; + BGQueue queue_; + bool exit_all_threads_; + }; + + std::vector thread_pools_; + + pthread_mutex_t mu_; + std::vector threads_to_join_; + +}; + +PosixEnv::PosixEnv() : checkedDiskForMmap_(false), + forceMmapOff(false), + page_size_(getpagesize()), + thread_pools_(Priority::TOTAL) { + PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); +} + +void PosixEnv::Schedule(void (*function)(void*), void* arg, Priority pri) { + assert(pri >= Priority::LOW && pri <= Priority::HIGH); + thread_pools_[pri].Schedule(function, arg); +} + +namespace { +struct StartThreadState { + void (*user_function)(void*); + void* arg; +}; +} +static void* StartThreadWrapper(void* arg) { + StartThreadState* state = reinterpret_cast(arg); + state->user_function(state->arg); + delete state; + return nullptr; +} + +void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { + pthread_t t; + StartThreadState* state = new StartThreadState; + state->user_function = function; + state->arg = arg; + PthreadCall("start thread", + pthread_create(&t, nullptr, &StartThreadWrapper, state)); + PthreadCall("lock", pthread_mutex_lock(&mu_)); + threads_to_join_.push_back(t); + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); +} + +} // namespace + +std::string Env::GenerateUniqueId() { + std::string uuid_file = "/proc/sys/kernel/random/uuid"; + if (FileExists(uuid_file)) { + std::string uuid; + Status s = ReadFileToString(this, uuid_file, &uuid); + if (s.ok()) { + return uuid; + } + } + // Could not read uuid_file - generate uuid using "nanos-random" + Random64 r(time(nullptr)); + uint64_t random_uuid_portion = + 
r.Uniform(std::numeric_limits::max()); + uint64_t nanos_uuid_portion = NowNanos(); + char uuid2[200]; + snprintf(uuid2, + 200, + "%lx-%lx", + (unsigned long)nanos_uuid_portion, + (unsigned long)random_uuid_portion); + return uuid2; +} + +Env* Env::Default() { + static PosixEnv default_env; + return &default_env; +} + +} // namespace rocksdb diff --git a/util/env_test.cc b/util/env_test.cc new file mode 100644 index 00000000..828b49a0 --- /dev/null +++ b/util/env_test.cc @@ -0,0 +1,397 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ + +#include +#include + +#include "rocksdb/env.h" +#include "port/port.h" +#include "util/coding.h" +#include "util/mutexlock.h" +#include "util/testharness.h" + +namespace rocksdb { + +static const int kDelayMicros = 100000; + +class EnvPosixTest { + private: + port::Mutex mu_; + std::string events_; + + public: + Env* env_; + EnvPosixTest() : env_(Env::Default()) { } +}; + +static void SetBool(void* ptr) { + reinterpret_cast(ptr)->NoBarrier_Store(ptr); +} + +TEST(EnvPosixTest, RunImmediately) { + port::AtomicPointer called (nullptr); + env_->Schedule(&SetBool, &called); + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_TRUE(called.NoBarrier_Load() != nullptr); +} + +TEST(EnvPosixTest, RunMany) { + port::AtomicPointer last_id (nullptr); + + struct CB { + port::AtomicPointer* last_id_ptr; // Pointer to shared slot + uintptr_t id; // Order# for the execution of this callback + + CB(port::AtomicPointer* p, int i) : last_id_ptr(p), id(i) { } + + static void Run(void* v) { + CB* cb = reinterpret_cast(v); + void* cur = cb->last_id_ptr->NoBarrier_Load(); + ASSERT_EQ(cb->id-1, reinterpret_cast(cur)); + cb->last_id_ptr->Release_Store(reinterpret_cast(cb->id)); + } + }; + + // Schedule in different order than start time + CB cb1(&last_id, 1); + CB cb2(&last_id, 2); + CB cb3(&last_id, 3); + CB cb4(&last_id, 4); + env_->Schedule(&CB::Run, &cb1); + env_->Schedule(&CB::Run, &cb2); + env_->Schedule(&CB::Run, &cb3); + env_->Schedule(&CB::Run, &cb4); + + Env::Default()->SleepForMicroseconds(kDelayMicros); + void* cur = last_id.Acquire_Load(); + ASSERT_EQ(4U, reinterpret_cast(cur)); +} + +struct State { + port::Mutex mu; + int val; + int num_running; +}; + +static void ThreadBody(void* arg) { + State* s = reinterpret_cast(arg); + s->mu.Lock(); + s->val += 1; + s->num_running -= 1; + s->mu.Unlock(); +} + +TEST(EnvPosixTest, StartThread) { + State state; + state.val = 0; + state.num_running = 3; + for (int i = 0; i < 3; i++) { + env_->StartThread(&ThreadBody, &state); 
+ } + while (true) { + state.mu.Lock(); + int num = state.num_running; + state.mu.Unlock(); + if (num == 0) { + break; + } + Env::Default()->SleepForMicroseconds(kDelayMicros); + } + ASSERT_EQ(state.val, 3); +} + +TEST(EnvPosixTest, TwoPools) { + + class CB { + public: + CB(const std::string& pool_name, int pool_size) + : mu_(), + num_running_(0), + num_finished_(0), + pool_size_(pool_size), + pool_name_(pool_name) { } + + static void Run(void* v) { + CB* cb = reinterpret_cast(v); + cb->Run(); + } + + void Run() { + { + MutexLock l(&mu_); + num_running_++; + std::cout << "Pool " << pool_name_ << ": " + << num_running_ << " running threads.\n"; + // make sure we don't have more than pool_size_ jobs running. + ASSERT_LE(num_running_, pool_size_); + } + + // sleep for 1 sec + Env::Default()->SleepForMicroseconds(1000000); + + { + MutexLock l(&mu_); + num_running_--; + num_finished_++; + } + } + + int NumFinished() { + MutexLock l(&mu_); + return num_finished_; + } + + private: + port::Mutex mu_; + int num_running_; + int num_finished_; + int pool_size_; + std::string pool_name_; + }; + + const int kLowPoolSize = 2; + const int kHighPoolSize = 4; + const int kJobs = 8; + + CB low_pool_job("low", kLowPoolSize); + CB high_pool_job("high", kHighPoolSize); + + env_->SetBackgroundThreads(kLowPoolSize); + env_->SetBackgroundThreads(kHighPoolSize, Env::Priority::HIGH); + + // schedule same number of jobs in each pool + for (int i = 0; i < kJobs; i++) { + env_->Schedule(&CB::Run, &low_pool_job); + env_->Schedule(&CB::Run, &high_pool_job, Env::Priority::HIGH); + } + + // wait for all jobs to finish + while (low_pool_job.NumFinished() < kJobs || + high_pool_job.NumFinished() < kJobs) { + env_->SleepForMicroseconds(kDelayMicros); + } +} + +bool IsSingleVarint(const std::string& s) { + Slice slice(s); + + uint64_t v; + if (!GetVarint64(&slice, &v)) { + return false; + } + + return slice.size() == 0; +} + +#ifdef OS_LINUX +bool IsUniqueIDValid(const std::string& s) { + return 
!s.empty() && !IsSingleVarint(s); +} + +const size_t MAX_ID_SIZE = 100; +char temp_id[MAX_ID_SIZE]; + +// Only works in linux platforms +TEST(EnvPosixTest, RandomAccessUniqueID) { + // Create file. + const EnvOptions soptions; + std::string fname = test::TmpDir() + "/" + "testfile"; + unique_ptr wfile; + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + + unique_ptr file; + + // Get Unique ID + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); + ASSERT_TRUE(id_size > 0); + std::string unique_id1(temp_id, id_size); + ASSERT_TRUE(IsUniqueIDValid(unique_id1)); + + // Get Unique ID again + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); + ASSERT_TRUE(id_size > 0); + std::string unique_id2(temp_id, id_size); + ASSERT_TRUE(IsUniqueIDValid(unique_id2)); + + // Get Unique ID again after waiting some time. + env_->SleepForMicroseconds(1000000); + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); + ASSERT_TRUE(id_size > 0); + std::string unique_id3(temp_id, id_size); + ASSERT_TRUE(IsUniqueIDValid(unique_id3)); + + // Check IDs are the same. + ASSERT_EQ(unique_id1, unique_id2); + ASSERT_EQ(unique_id2, unique_id3); + + // Delete the file + env_->DeleteFile(fname); +} + +// Returns true if any of the strings in ss are the prefix of another string. +bool HasPrefix(const std::unordered_set& ss) { + for (const std::string& s: ss) { + if (s.empty()) { + return true; + } + for (size_t i = 1; i < s.size(); ++i) { + if (ss.count(s.substr(0, i)) != 0) { + return true; + } + } + } + return false; +} + +// Only works in linux platforms +TEST(EnvPosixTest, RandomAccessUniqueIDConcurrent) { + // Check whether a bunch of concurrently existing files have unique IDs. 
+ const EnvOptions soptions; + + // Create the files + std::vector fnames; + for (int i = 0; i < 1000; ++i) { + fnames.push_back(test::TmpDir() + "/" + "testfile" + std::to_string(i)); + + // Create file. + unique_ptr wfile; + ASSERT_OK(env_->NewWritableFile(fnames[i], &wfile, soptions)); + } + + // Collect and check whether the IDs are unique. + std::unordered_set ids; + for (const std::string fname: fnames) { + unique_ptr file; + std::string unique_id; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); + ASSERT_TRUE(id_size > 0); + unique_id = std::string(temp_id, id_size); + ASSERT_TRUE(IsUniqueIDValid(unique_id)); + + ASSERT_TRUE(ids.count(unique_id) == 0); + ids.insert(unique_id); + } + + // Delete the files + for (const std::string fname: fnames) { + ASSERT_OK(env_->DeleteFile(fname)); + } + + ASSERT_TRUE(!HasPrefix(ids)); +} + +// Only works in linux platforms +TEST(EnvPosixTest, RandomAccessUniqueIDDeletes) { + const EnvOptions soptions; + std::string fname = test::TmpDir() + "/" + "testfile"; + + // Check that after file is deleted we don't get same ID again in a new file. + std::unordered_set ids; + for (int i = 0; i < 1000; ++i) { + // Create file. 
+ { + unique_ptr wfile; + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + } + + // Get Unique ID + std::string unique_id; + { + unique_ptr file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); + ASSERT_TRUE(id_size > 0); + unique_id = std::string(temp_id, id_size); + } + + ASSERT_TRUE(IsUniqueIDValid(unique_id)); + ASSERT_TRUE(ids.count(unique_id) == 0); + ids.insert(unique_id); + + // Delete the file + ASSERT_OK(env_->DeleteFile(fname)); + } + + ASSERT_TRUE(!HasPrefix(ids)); +} + +// Only works in linux platforms +TEST(EnvPosixTest, InvalidateCache) { + const EnvOptions soptions; + std::string fname = test::TmpDir() + "/" + "testfile"; + + // Create file. + { + unique_ptr wfile; + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + ASSERT_OK(wfile.get()->Append(Slice("Hello world"))); + ASSERT_OK(wfile.get()->InvalidateCache(0, 0)); + ASSERT_OK(wfile.get()->Close()); + } + + // Random Read + { + unique_ptr file; + char scratch[100]; + Slice result; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + ASSERT_OK(file.get()->Read(0, 11, &result, scratch)); + ASSERT_EQ(memcmp(scratch, "Hello world", 11), 0); + ASSERT_OK(file.get()->InvalidateCache(0, 11)); + ASSERT_OK(file.get()->InvalidateCache(0, 0)); + } + + // Sequential Read + { + unique_ptr file; + char scratch[100]; + Slice result; + ASSERT_OK(env_->NewSequentialFile(fname, &file, soptions)); + ASSERT_OK(file.get()->Read(11, &result, scratch)); + ASSERT_EQ(memcmp(scratch, "Hello world", 11), 0); + ASSERT_OK(file.get()->InvalidateCache(0, 11)); + ASSERT_OK(file.get()->InvalidateCache(0, 0)); + } + // Delete the file + ASSERT_OK(env_->DeleteFile(fname)); +} +#endif + +TEST(EnvPosixTest, PosixRandomRWFileTest) { + EnvOptions soptions; + soptions.use_mmap_writes = soptions.use_mmap_reads = false; + std::string fname = test::TmpDir() + "/" + "testfile"; + + unique_ptr file; + 
ASSERT_OK(env_->NewRandomRWFile(fname, &file, soptions)); + // If you run the unit test on tmpfs, then tmpfs might not + // support fallocate. It is still better to trigger that + // code-path instead of eliminating it completely. + file.get()->Allocate(0, 10*1024*1024); + ASSERT_OK(file.get()->Write(100, Slice("Hello world"))); + ASSERT_OK(file.get()->Write(105, Slice("Hello world"))); + ASSERT_OK(file.get()->Sync()); + ASSERT_OK(file.get()->Fsync()); + char scratch[100]; + Slice result; + ASSERT_OK(file.get()->Read(100, 16, &result, scratch)); + ASSERT_EQ(result.compare("HelloHello world"), 0); + ASSERT_OK(file.get()->Close()); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/filelock_test.cc b/util/filelock_test.cc new file mode 100644 index 00000000..a9e30a5d --- /dev/null +++ b/util/filelock_test.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "rocksdb/status.h" +#include "rocksdb/env.h" + +#include +#include "util/coding.h" +#include "util/testharness.h" + +namespace rocksdb { + +class LockTest { + public: + static LockTest* current_; + std::string file_; + rocksdb::Env* env_; + + LockTest() : file_(test::TmpDir() + "/db_testlock_file"), + env_(rocksdb::Env::Default()) { + current_ = this; + } + + ~LockTest() { + } + + Status LockFile(FileLock** db_lock) { + return env_->LockFile(file_, db_lock); + } + + Status UnlockFile(FileLock* db_lock) { + return env_->UnlockFile(db_lock); + } +}; +LockTest* LockTest::current_; + +TEST(LockTest, LockBySameThread) { + FileLock* lock1; + FileLock* lock2; + + // acquire a lock on a file + ASSERT_OK(LockFile(&lock1)); + + // re-acquire the lock on the same file. 
This should fail. + ASSERT_TRUE(LockFile(&lock2).IsIOError()); + + // release the lock + ASSERT_OK(UnlockFile(lock1)); + +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/filter_policy.cc b/util/filter_policy.cc new file mode 100644 index 00000000..e950b75f --- /dev/null +++ b/util/filter_policy.cc @@ -0,0 +1,16 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/filter_policy.h" + +namespace rocksdb { + +FilterPolicy::~FilterPolicy() { } + +} // namespace rocksdb diff --git a/util/hash.cc b/util/hash.cc new file mode 100644 index 00000000..6f0e9cc9 --- /dev/null +++ b/util/hash.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include "util/coding.h" +#include "util/hash.h" + +namespace rocksdb { + +uint32_t Hash(const char* data, size_t n, uint32_t seed) { + // Similar to murmur hash + const uint32_t m = 0xc6a4a793; + const uint32_t r = 24; + const char* limit = data + n; + uint32_t h = seed ^ (n * m); + + // Pick up four bytes at a time + while (data + 4 <= limit) { + uint32_t w = DecodeFixed32(data); + data += 4; + h += w; + h *= m; + h ^= (h >> 16); + } + + // Pick up remaining bytes + switch (limit - data) { + case 3: + h += data[2] << 16; + // fall through + case 2: + h += data[1] << 8; + // fall through + case 1: + h += data[0]; + h *= m; + h ^= (h >> r); + break; + } + return h; +} + + +} // namespace rocksdb diff --git a/util/hash.h b/util/hash.h new file mode 100644 index 00000000..c9eb659a --- /dev/null +++ b/util/hash.h @@ -0,0 +1,20 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Simple hash function used for internal data structures + +#pragma once +#include +#include + +namespace rocksdb { + +extern uint32_t Hash(const char* data, size_t n, uint32_t seed); + +} diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc new file mode 100644 index 00000000..c669769e --- /dev/null +++ b/util/hash_skiplist_rep.cc @@ -0,0 +1,313 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#include "util/hash_skiplist_rep.h" + +#include "rocksdb/memtablerep.h" +#include "rocksdb/arena.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "port/port.h" +#include "port/atomic_pointer.h" +#include "util/murmurhash.h" +#include "db/skiplist.h" + +namespace rocksdb { +namespace { + +class HashSkipListRep : public MemTableRep { + public: + HashSkipListRep(MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform* transform, size_t bucket_size); + + virtual void Insert(const char* key) override; + + virtual bool Contains(const char* key) const override; + + virtual size_t ApproximateMemoryUsage() override; + + virtual ~HashSkipListRep(); + + virtual std::shared_ptr GetIterator() override; + + virtual std::shared_ptr GetIterator( + const Slice& slice) override; + + virtual std::shared_ptr GetPrefixIterator( + const Slice& prefix) override; + + virtual std::shared_ptr GetDynamicPrefixIterator() + override; + + private: + friend class DynamicIterator; + typedef SkipList Bucket; + + size_t bucket_size_; + + // Maps slices (which are transformed user keys) to buckets of keys sharing + // the same transform. + port::AtomicPointer* buckets_; + + // The user-supplied transform whose domain is the user keys. + const SliceTransform* transform_; + + MemTableRep::KeyComparator& compare_; + // immutable after construction + Arena* const arena_; + + inline size_t GetHash(const Slice& slice) const { + return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_; + } + inline Bucket* GetBucket(size_t i) const { + return static_cast(buckets_[i].Acquire_Load()); + } + inline Bucket* GetBucket(const Slice& slice) const { + return GetBucket(GetHash(slice)); + } + // Get a bucket from buckets_. If the bucket hasn't been initialized yet, + // initialize it before returning. 
+ Bucket* GetInitializedBucket(const Slice& transformed); + + class Iterator : public MemTableRep::Iterator { + public: + explicit Iterator(Bucket* list, bool own_list = true) + : list_(list), + iter_(list), + own_list_(own_list) {} + + virtual ~Iterator() { + // if we own the list, we should also delete it + if (own_list_) { + assert(list_ != nullptr); + delete list_; + } + } + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const { + return list_ != nullptr && iter_.Valid(); + } + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const { + assert(Valid()); + return iter_.key(); + } + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() { + assert(Valid()); + iter_.Next(); + } + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() { + assert(Valid()); + iter_.Prev(); + } + + // Advance to the first entry with a key >= target + virtual void Seek(const char* target) { + if (list_ != nullptr) { + iter_.Seek(target); + } + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() { + if (list_ != nullptr) { + iter_.SeekToFirst(); + } + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() { + if (list_ != nullptr) { + iter_.SeekToLast(); + } + } + protected: + void Reset(Bucket* list) { + if (own_list_) { + assert(list_ != nullptr); + delete list_; + } + list_ = list; + iter_.SetList(list); + own_list_ = false; + } + private: + // if list_ is nullptr, we should NEVER call any methods on iter_ + // if list_ is nullptr, this Iterator is not Valid() + Bucket* list_; + Bucket::Iterator iter_; + // here we track if we own list_. If we own it, we are also + // responsible for it's cleaning. 
This is a poor man's shared_ptr + bool own_list_; + }; + + class DynamicIterator : public HashSkipListRep::Iterator { + public: + explicit DynamicIterator(const HashSkipListRep& memtable_rep) + : HashSkipListRep::Iterator(nullptr, false), + memtable_rep_(memtable_rep) {} + + // Advance to the first entry with a key >= target + virtual void Seek(const char* target) { + auto transformed = memtable_rep_.transform_->Transform( + memtable_rep_.UserKey(target)); + Reset(memtable_rep_.GetBucket(transformed)); + HashSkipListRep::Iterator::Seek(target); + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + private: + // the underlying memtable + const HashSkipListRep& memtable_rep_; + }; + + class EmptyIterator : public MemTableRep::Iterator { + // This is used when there wasn't a bucket. It is cheaper than + // instantiating an empty bucket over which to iterate. 
+ public: + EmptyIterator() { } + virtual bool Valid() const { + return false; + } + virtual const char* key() const { + assert(false); + return nullptr; + } + virtual void Next() { } + virtual void Prev() { } + virtual void Seek(const char* target) { } + virtual void SeekToFirst() { } + virtual void SeekToLast() { } + private: + }; + + std::shared_ptr empty_iterator_; +}; + +HashSkipListRep::HashSkipListRep(MemTableRep::KeyComparator& compare, + Arena* arena, const SliceTransform* transform, size_t bucket_size) + : bucket_size_(bucket_size), + transform_(transform), + compare_(compare), + arena_(arena), + empty_iterator_(std::make_shared()) { + + buckets_ = new port::AtomicPointer[bucket_size]; + + for (size_t i = 0; i < bucket_size_; ++i) { + buckets_[i].NoBarrier_Store(nullptr); + } +} + +HashSkipListRep::~HashSkipListRep() { + delete[] buckets_; +} + +HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket( + const Slice& transformed) { + size_t hash = GetHash(transformed); + auto bucket = GetBucket(hash); + if (bucket == nullptr) { + auto addr = arena_->AllocateAligned(sizeof(Bucket)); + bucket = new (addr) Bucket(compare_, arena_); + buckets_[hash].Release_Store(static_cast(bucket)); + } + return bucket; +} + +void HashSkipListRep::Insert(const char* key) { + assert(!Contains(key)); + auto transformed = transform_->Transform(UserKey(key)); + auto bucket = GetInitializedBucket(transformed); + bucket->Insert(key); +} + +bool HashSkipListRep::Contains(const char* key) const { + auto transformed = transform_->Transform(UserKey(key)); + auto bucket = GetBucket(transformed); + if (bucket == nullptr) { + return false; + } + return bucket->Contains(key); +} + +size_t HashSkipListRep::ApproximateMemoryUsage() { + return sizeof(buckets_); +} + +std::shared_ptr HashSkipListRep::GetIterator() { + auto list = new Bucket(compare_, arena_); + for (size_t i = 0; i < bucket_size_; ++i) { + auto bucket = GetBucket(i); + if (bucket != nullptr) { + Bucket::Iterator 
itr(bucket); + for (itr.SeekToFirst(); itr.Valid(); itr.Next()) { + list->Insert(itr.key()); + } + } + } + return std::make_shared(list); +} + +std::shared_ptr HashSkipListRep::GetPrefixIterator( + const Slice& prefix) { + auto bucket = GetBucket(prefix); + if (bucket == nullptr) { + return empty_iterator_; + } + return std::make_shared(bucket, false); +} + +std::shared_ptr HashSkipListRep::GetIterator( + const Slice& slice) { + return GetPrefixIterator(transform_->Transform(slice)); +} + +std::shared_ptr + HashSkipListRep::GetDynamicPrefixIterator() { + return std::make_shared(*this); +} + +} // anon namespace + +std::shared_ptr +HashSkipListRepFactory::CreateMemTableRep(MemTableRep::KeyComparator &compare, + Arena *arena) { + return std::make_shared(compare, arena, transform_, + bucket_count_); +} + +MemTableRepFactory* NewHashSkipListRepFactory( + const SliceTransform* transform, size_t bucket_count) { + return new HashSkipListRepFactory(transform, bucket_count); +} + +} // namespace rocksdb diff --git a/util/hash_skiplist_rep.h b/util/hash_skiplist_rep.h new file mode 100644 index 00000000..b946cf05 --- /dev/null +++ b/util/hash_skiplist_rep.h @@ -0,0 +1,38 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include "rocksdb/slice_transform.h" +#include "rocksdb/memtablerep.h" + +namespace rocksdb { + +class HashSkipListRepFactory : public MemTableRepFactory { + public: + explicit HashSkipListRepFactory(const SliceTransform* transform, + size_t bucket_count = 1000000) + : transform_(transform), + bucket_count_(bucket_count) { } + + virtual ~HashSkipListRepFactory() { delete transform_; } + + virtual std::shared_ptr CreateMemTableRep( + MemTableRep::KeyComparator& compare, Arena* arena) override; + + virtual const char* Name() const override { + return "HashSkipListRepFactory"; + } + + const SliceTransform* GetTransform() { return transform_; } + + private: + const SliceTransform* transform_; + const size_t bucket_count_; +}; + +} diff --git a/util/histogram.cc b/util/histogram.cc new file mode 100644 index 00000000..e8399801 --- /dev/null +++ b/util/histogram.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/histogram.h" + +#include +#include +#include +#include "port/port.h" + +namespace rocksdb { + +HistogramBucketMapper::HistogramBucketMapper() : + // Add newer bucket index here. + // Should be alwyas added in sorted order. 
+ bucketValues_({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45, + 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450, + 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, + 3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000, + 16000, 18000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000, + 70000, 80000, 90000, 100000, 120000, 140000, 160000, 180000, 200000, + 250000, 300000, 350000, 400000, 450000, 500000, 600000, 700000, 800000, + 900000, 1000000, 1200000, 1400000, 1600000, 1800000, 2000000, 2500000, + 3000000, 3500000, 4000000, 4500000, 5000000, 6000000, 7000000, 8000000, + 9000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000, + 25000000, 30000000, 35000000, 40000000, 45000000, 50000000, 60000000, + 70000000, 80000000, 90000000, 100000000, 120000000, 140000000, 160000000, + 180000000, 200000000, 250000000, 300000000, 350000000, 400000000, + 450000000, 500000000, 600000000, 700000000, 800000000, 900000000, + 1000000000}), + maxBucketValue_(bucketValues_.back()), + minBucketValue_(bucketValues_.front()) { + for (size_t i =0; i < bucketValues_.size(); ++i) { + valueIndexMap_[bucketValues_[i]] = i; + } +} + +const size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { + if (value >= maxBucketValue_) { + return bucketValues_.size() - 1; + } else if ( value >= minBucketValue_ ) { + std::map::const_iterator lowerBound = + valueIndexMap_.lower_bound(value); + if (lowerBound != valueIndexMap_.end()) { + return lowerBound->second; + } else { + return 0; + } + } else { + return 0; + } +} + +namespace { + const HistogramBucketMapper bucketMapper; +} + + +HistogramImpl::HistogramImpl() : + min_(bucketMapper.LastValue()), + max_(0), + num_(0), + sum_(0), + sum_squares_(0), + buckets_(std::vector(bucketMapper.BucketCount(), 0)) {} + +void HistogramImpl::Clear() { + min_ = bucketMapper.LastValue(); + max_ = 0; + num_ = 0; + sum_ = 0; + sum_squares_ = 
0; + buckets_.resize(bucketMapper.BucketCount(), 0); +} + +void HistogramImpl::Add(uint64_t value) { + const size_t index = bucketMapper.IndexForValue(value); + buckets_[index] += 1; + if (min_ > value) min_ = value; + if (max_ < value) max_ = value; + num_++; + sum_ += value; + sum_squares_ += (value * value); +} + +void HistogramImpl::Merge(const HistogramImpl& other) { + if (other.min_ < min_) min_ = other.min_; + if (other.max_ > max_) max_ = other.max_; + num_ += other.num_; + sum_ += other.sum_; + sum_squares_ += other.sum_squares_; + for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) { + buckets_[b] += other.buckets_[b]; + } +} + +double HistogramImpl::Median() const { + return Percentile(50.0); +} + +double HistogramImpl::Percentile(double p) const { + double threshold = num_ * (p / 100.0); + double sum = 0; + for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) { + sum += buckets_[b]; + if (sum >= threshold) { + // Scale linearly within this bucket + double left_point = (b == 0) ? 
0 : bucketMapper.BucketLimit(b-1); + double right_point = bucketMapper.BucketLimit(b); + double left_sum = sum - buckets_[b]; + double right_sum = sum; + double pos = 0; + double right_left_diff = right_sum - left_sum; + if (right_left_diff != 0) { + pos = (threshold - left_sum) / (right_sum - left_sum); + } + double r = left_point + (right_point - left_point) * pos; + if (r < min_) r = min_; + if (r > max_) r = max_; + return r; + } + } + return max_; +} + +double HistogramImpl::Average() const { + if (num_ == 0.0) return 0; + return sum_ / num_; +} + +double HistogramImpl::StandardDeviation() const { + if (num_ == 0.0) return 0; + double variance = (sum_squares_ * num_ - sum_ * sum_) / (num_ * num_); + return sqrt(variance); +} + +std::string HistogramImpl::ToString() const { + std::string r; + char buf[200]; + snprintf(buf, sizeof(buf), + "Count: %.0f Average: %.4f StdDev: %.2f\n", + num_, Average(), StandardDeviation()); + r.append(buf); + snprintf(buf, sizeof(buf), + "Min: %.4f Median: %.4f Max: %.4f\n", + (num_ == 0.0 ? 0.0 : min_), Median(), max_); + r.append(buf); + snprintf(buf, sizeof(buf), + "Percentiles: " + "P50: %.2f P75: %.2f P99: %.2f P99.9: %.2f P99.99: %.2f\n", + Percentile(50), Percentile(75), Percentile(99), Percentile(99.9), + Percentile(99.99)); + r.append(buf); + r.append("------------------------------------------------------\n"); + const double mult = 100.0 / num_; + double sum = 0; + for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) { + if (buckets_[b] <= 0.0) continue; + sum += buckets_[b]; + snprintf(buf, sizeof(buf), + "[ %7lu, %7lu ) %8lu %7.3f%% %7.3f%% ", + // left + (unsigned long)((b == 0) ? 0 : bucketMapper.BucketLimit(b-1)), + (unsigned long)bucketMapper.BucketLimit(b), // right + (unsigned long)buckets_[b], // count + (mult * buckets_[b]), // percentage + (mult * sum)); // cumulative percentage + r.append(buf); + + // Add hash marks based on percentage; 20 marks for 100%. 
+ int marks = static_cast(20*(buckets_[b] / num_) + 0.5); + r.append(marks, '#'); + r.push_back('\n'); + } + return r; +} + +void HistogramImpl::Data(HistogramData * const data) const { + assert(data); + data->median = Median(); + data->percentile95 = Percentile(95); + data->percentile99 = Percentile(99); + data->average = Average(); + data->standard_deviation = StandardDeviation(); +} + +} // namespace levedb diff --git a/util/histogram.h b/util/histogram.h new file mode 100644 index 00000000..c01594da --- /dev/null +++ b/util/histogram.h @@ -0,0 +1,79 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/statistics.h" + +#include +#include +#include +#include + +namespace rocksdb { + +class HistogramBucketMapper { + public: + + HistogramBucketMapper(); + + // converts a value to the bucket index. + const size_t IndexForValue(const uint64_t value) const; + // number of buckets required. 
+ + const size_t BucketCount() const { + return bucketValues_.size(); + } + + uint64_t LastValue() const { + return maxBucketValue_; + } + + uint64_t FirstValue() const { + return minBucketValue_; + } + + uint64_t BucketLimit(const uint64_t bucketNumber) const { + assert(bucketNumber < BucketCount()); + return bucketValues_[bucketNumber]; + } + + private: + const std::vector bucketValues_; + const uint64_t maxBucketValue_; + const uint64_t minBucketValue_; + std::map valueIndexMap_; +}; + +class HistogramImpl { + public: + HistogramImpl(); + virtual ~HistogramImpl() {} + virtual void Clear(); + virtual void Add(uint64_t value); + void Merge(const HistogramImpl& other); + + virtual std::string ToString() const; + + virtual double Median() const; + virtual double Percentile(double p) const; + virtual double Average() const; + virtual double StandardDeviation() const; + virtual void Data(HistogramData * const data) const; + + private: + double min_; + double max_; + double num_; + double sum_; + double sum_squares_; + std::vector buckets_; + +}; + +} // namespace rocksdb diff --git a/util/histogram_test.cc b/util/histogram_test.cc new file mode 100644 index 00000000..065f9579 --- /dev/null +++ b/util/histogram_test.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include "util/histogram.h" + +#include "util/testharness.h" + +namespace rocksdb { + +class HistogramTest { }; + +TEST(HistogramTest, BasicOperation) { + + HistogramImpl histogram; + for (uint64_t i = 1; i <= 100; i++) { + histogram.Add(i); + } + + { + double median = histogram.Median(); + // ASSERT_LE(median, 50); + ASSERT_GT(median, 0); + } + + { + double percentile100 = histogram.Percentile(100.0); + ASSERT_LE(percentile100, 100.0); + ASSERT_GT(percentile100, 0.0); + double percentile99 = histogram.Percentile(99.0); + double percentile85 = histogram.Percentile(85.0); + ASSERT_LE(percentile99, 99.0); + ASSERT_TRUE(percentile99 >= percentile85); + } + + ASSERT_EQ(histogram.Average(), 50.5); // avg is acurately caluclated. +} + +TEST(HistogramTest, EmptyHistogram) { + HistogramImpl histogram; + ASSERT_EQ(histogram.Median(), 0.0); + ASSERT_EQ(histogram.Percentile(85.0), 0.0); + ASSERT_EQ(histogram.Average(), 0.0); +} + +TEST(HistogramTest, ClearHistogram) { + HistogramImpl histogram; + for (uint64_t i = 1; i <= 100; i++) { + histogram.Add(i); + } + histogram.Clear(); + ASSERT_EQ(histogram.Median(), 0); + ASSERT_EQ(histogram.Percentile(85.0), 0); + ASSERT_EQ(histogram.Average(), 0); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc new file mode 100644 index 00000000..65ecd61a --- /dev/null +++ b/util/ldb_cmd.cc @@ -0,0 +1,1764 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include "util/ldb_cmd.h" + +#include "db/dbformat.h" +#include "db/db_impl.h" +#include "db/log_reader.h" +#include "db/filename.h" +#include "db/write_batch_internal.h" +#include "rocksdb/write_batch.h" +#include "util/coding.h" + +#include +#include +#include +#include +#include + +namespace rocksdb { + +using namespace std; + +const string LDBCommand::ARG_DB = "db"; +const string LDBCommand::ARG_HEX = "hex"; +const string LDBCommand::ARG_KEY_HEX = "key_hex"; +const string LDBCommand::ARG_VALUE_HEX = "value_hex"; +const string LDBCommand::ARG_TTL = "ttl"; +const string LDBCommand::ARG_TTL_START = "start_time"; +const string LDBCommand::ARG_TTL_END = "end_time"; +const string LDBCommand::ARG_TIMESTAMP = "timestamp"; +const string LDBCommand::ARG_FROM = "from"; +const string LDBCommand::ARG_TO = "to"; +const string LDBCommand::ARG_MAX_KEYS = "max_keys"; +const string LDBCommand::ARG_BLOOM_BITS = "bloom_bits"; +const string LDBCommand::ARG_COMPRESSION_TYPE = "compression_type"; +const string LDBCommand::ARG_BLOCK_SIZE = "block_size"; +const string LDBCommand::ARG_AUTO_COMPACTION = "auto_compaction"; +const string LDBCommand::ARG_WRITE_BUFFER_SIZE = "write_buffer_size"; +const string LDBCommand::ARG_FILE_SIZE = "file_size"; +const string LDBCommand::ARG_CREATE_IF_MISSING = "create_if_missing"; + +const char* LDBCommand::DELIM = " ==> "; + +LDBCommand* LDBCommand::InitFromCmdLineArgs( + int argc, + char** argv, + const Options& options +) { + vector args; + for (int i = 1; i < argc; i++) { + args.push_back(argv[i]); + } + return InitFromCmdLineArgs(args, options); +} + +/** + * Parse the command-line arguments and create the appropriate LDBCommand2 + * instance. + * The command line arguments must be in the following format: + * ./ldb --db=PATH_TO_DB [--commonOpt1=commonOpt1Val] .. + * COMMAND ... [-cmdSpecificOpt1=cmdSpecificOpt1Val] .. + * This is similar to the command line format used by HBaseClientTool. + * Command name is not included in args. 
+ * Returns nullptr if the command-line cannot be parsed. + */ +LDBCommand* LDBCommand::InitFromCmdLineArgs( + const vector& args, + const Options& options +) { + // --x=y command line arguments are added as x->y map entries. + map option_map; + + // Command-line arguments of the form --hex end up in this array as hex + vector flags; + + // Everything other than option_map and flags. Represents commands + // and their parameters. For eg: put key1 value1 go into this vector. + vector cmdTokens; + + const string OPTION_PREFIX = "--"; + + for (const auto& arg : args) { + if (arg[0] == '-' && arg[1] == '-'){ + vector splits = stringSplit(arg, '='); + if (splits.size() == 2) { + string optionKey = splits[0].substr(OPTION_PREFIX.size()); + option_map[optionKey] = splits[1]; + } else { + string optionKey = splits[0].substr(OPTION_PREFIX.size()); + flags.push_back(optionKey); + } + } else { + cmdTokens.push_back(arg); + } + } + + if (cmdTokens.size() < 1) { + fprintf(stderr, "Command not specified!"); + return nullptr; + } + + string cmd = cmdTokens[0]; + vector cmdParams(cmdTokens.begin()+1, cmdTokens.end()); + LDBCommand* command = LDBCommand::SelectCommand( + cmd, + cmdParams, + option_map, + flags + ); + + if (command) { + command->SetOptions(options); + } + return command; +} + +LDBCommand* LDBCommand::SelectCommand( + const std::string& cmd, + const vector& cmdParams, + const map& option_map, + const vector& flags + ) { + + if (cmd == GetCommand::Name()) { + return new GetCommand(cmdParams, option_map, flags); + } else if (cmd == PutCommand::Name()) { + return new PutCommand(cmdParams, option_map, flags); + } else if (cmd == BatchPutCommand::Name()) { + return new BatchPutCommand(cmdParams, option_map, flags); + } else if (cmd == ScanCommand::Name()) { + return new ScanCommand(cmdParams, option_map, flags); + } else if (cmd == DeleteCommand::Name()) { + return new DeleteCommand(cmdParams, option_map, flags); + } else if (cmd == ApproxSizeCommand::Name()) { + return 
new ApproxSizeCommand(cmdParams, option_map, flags); + } else if (cmd == DBQuerierCommand::Name()) { + return new DBQuerierCommand(cmdParams, option_map, flags); + } else if (cmd == CompactorCommand::Name()) { + return new CompactorCommand(cmdParams, option_map, flags); + } else if (cmd == WALDumperCommand::Name()) { + return new WALDumperCommand(cmdParams, option_map, flags); + } else if (cmd == ReduceDBLevelsCommand::Name()) { + return new ReduceDBLevelsCommand(cmdParams, option_map, flags); + } else if (cmd == ChangeCompactionStyleCommand::Name()) { + return new ChangeCompactionStyleCommand(cmdParams, option_map, flags); + } else if (cmd == DBDumperCommand::Name()) { + return new DBDumperCommand(cmdParams, option_map, flags); + } else if (cmd == DBLoaderCommand::Name()) { + return new DBLoaderCommand(cmdParams, option_map, flags); + } else if (cmd == ManifestDumpCommand::Name()) { + return new ManifestDumpCommand(cmdParams, option_map, flags); + } else if (cmd == InternalDumpCommand::Name()) { + return new InternalDumpCommand(cmdParams, option_map, flags); + } + return nullptr; +} + + +/** + * Parses the specific integer option and fills in the value. + * Returns true if the option is found. + * Returns false if the option is not found or if there is an error parsing the + * value. If there is an error, the specified exec_state is also + * updated. + */ +bool LDBCommand::ParseIntOption(const map& options, + const string& option, int& value, + LDBCommandExecuteResult& exec_state) { + + map::const_iterator itr = option_map_.find(option); + if (itr != option_map_.end()) { + try { + value = stoi(itr->second); + return true; + } catch(const invalid_argument&) { + exec_state = LDBCommandExecuteResult::FAILED(option + + " has an invalid value."); + } catch(const out_of_range&) { + exec_state = LDBCommandExecuteResult::FAILED(option + + " has a value out-of-range."); + } + } + return false; +} + +/** + * Parses the specified option and fills in the value. 
+ * Returns true if the option is found. + * Returns false otherwise. + */ +bool LDBCommand::ParseStringOption(const map& options, + const string& option, string* value) { + auto itr = option_map_.find(option); + if (itr != option_map_.end()) { + *value = itr->second; + return true; + } + return false; +} + +Options LDBCommand::PrepareOptionsForOpenDB() { + + Options opt = options_; + opt.create_if_missing = false; + + map::const_iterator itr; + + int bits; + if (ParseIntOption(option_map_, ARG_BLOOM_BITS, bits, exec_state_)) { + if (bits > 0) { + opt.filter_policy = NewBloomFilterPolicy(bits); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOOM_BITS + + " must be > 0."); + } + } + + int block_size; + if (ParseIntOption(option_map_, ARG_BLOCK_SIZE, block_size, exec_state_)) { + if (block_size > 0) { + opt.block_size = block_size; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOCK_SIZE + + " must be > 0."); + } + } + + itr = option_map_.find(ARG_AUTO_COMPACTION); + if (itr != option_map_.end()) { + opt.disable_auto_compactions = ! StringToBool(itr->second); + } + + itr = option_map_.find(ARG_COMPRESSION_TYPE); + if (itr != option_map_.end()) { + string comp = itr->second; + if (comp == "no") { + opt.compression = kNoCompression; + } else if (comp == "snappy") { + opt.compression = kSnappyCompression; + } else if (comp == "zlib") { + opt.compression = kZlibCompression; + } else if (comp == "bzip2") { + opt.compression = kBZip2Compression; + } else { + // Unknown compression. 
+ exec_state_ = LDBCommandExecuteResult::FAILED( + "Unknown compression level: " + comp); + } + } + + int write_buffer_size; + if (ParseIntOption(option_map_, ARG_WRITE_BUFFER_SIZE, write_buffer_size, + exec_state_)) { + if (write_buffer_size > 0) { + opt.write_buffer_size = write_buffer_size; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_WRITE_BUFFER_SIZE + + " must be > 0."); + } + } + + int file_size; + if (ParseIntOption(option_map_, ARG_FILE_SIZE, file_size, exec_state_)) { + if (file_size > 0) { + opt.target_file_size_base = file_size; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_FILE_SIZE + + " must be > 0."); + } + } + + return opt; +} + +bool LDBCommand::ParseKeyValue(const string& line, string* key, string* value, + bool is_key_hex, bool is_value_hex) { + size_t pos = line.find(DELIM); + if (pos != string::npos) { + *key = line.substr(0, pos); + *value = line.substr(pos + strlen(DELIM)); + if (is_key_hex) { + *key = HexToString(*key); + } + if (is_value_hex) { + *value = HexToString(*value); + } + return true; + } else { + return false; + } +} + +/** + * Make sure that ONLY the command-line options and flags expected by this + * command are specified on the command-line. Extraneous options are usually + * the result of user error. + * Returns true if all checks pass. Else returns false, and prints an + * appropriate error msg to stderr. 
+ */ +bool LDBCommand::ValidateCmdLineOptions() { + + for (map::const_iterator itr = option_map_.begin(); + itr != option_map_.end(); itr++) { + if (find(valid_cmd_line_options_.begin(), + valid_cmd_line_options_.end(), itr->first) == + valid_cmd_line_options_.end()) { + fprintf(stderr, "Invalid command-line option %s\n", itr->first.c_str()); + return false; + } + } + + for (vector::const_iterator itr = flags_.begin(); + itr != flags_.end(); itr++) { + if (find(valid_cmd_line_options_.begin(), + valid_cmd_line_options_.end(), *itr) == + valid_cmd_line_options_.end()) { + fprintf(stderr, "Invalid command-line flag %s\n", itr->c_str()); + return false; + } + } + + if (!NoDBOpen() && option_map_.find(ARG_DB) == option_map_.end()) { + fprintf(stderr, "%s must be specified\n", ARG_DB.c_str()); + return false; + } + + return true; +} + +CompactorCommand::CompactorCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_FROM, ARG_TO, ARG_HEX, ARG_KEY_HEX, + ARG_VALUE_HEX, ARG_TTL})), + null_from_(true), null_to_(true) { + + map::const_iterator itr = options.find(ARG_FROM); + if (itr != options.end()) { + null_from_ = false; + from_ = itr->second; + } + + itr = options.find(ARG_TO); + if (itr != options.end()) { + null_to_ = false; + to_ = itr->second; + } + + if (is_key_hex_) { + if (!null_from_) { + from_ = HexToString(from_); + } + if (!null_to_) { + to_ = HexToString(to_); + } + } +} + +void CompactorCommand::Help(string& ret) { + ret.append(" "); + ret.append(CompactorCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append("\n"); +} + +void CompactorCommand::DoCommand() { + + Slice* begin = nullptr; + Slice* end = nullptr; + if (!null_from_) { + begin = new Slice(from_); + } + if (!null_to_) { + end = new Slice(to_); + } + + db_->CompactRange(begin, end); + exec_state_ = LDBCommandExecuteResult::SUCCEED(""); + + delete begin; + delete end; +} + +const string 
DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal"; +const string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load"; +const string DBLoaderCommand::ARG_COMPACT = "compact"; + +DBLoaderCommand::DBLoaderCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_FROM, ARG_TO, ARG_CREATE_IF_MISSING, + ARG_DISABLE_WAL, ARG_BULK_LOAD, + ARG_COMPACT})), + create_if_missing_(false), disable_wal_(false), bulk_load_(false), + compact_(false) { + + create_if_missing_ = IsFlagPresent(flags, ARG_CREATE_IF_MISSING); + disable_wal_ = IsFlagPresent(flags, ARG_DISABLE_WAL); + bulk_load_ = IsFlagPresent(flags, ARG_BULK_LOAD); + compact_ = IsFlagPresent(flags, ARG_COMPACT); +} + +void DBLoaderCommand::Help(string& ret) { + ret.append(" "); + ret.append(DBLoaderCommand::Name()); + ret.append(" [--" + ARG_CREATE_IF_MISSING + "]"); + ret.append(" [--" + ARG_DISABLE_WAL + "]"); + ret.append(" [--" + ARG_BULK_LOAD + "]"); + ret.append(" [--" + ARG_COMPACT + "]"); + ret.append("\n"); +} + +Options DBLoaderCommand::PrepareOptionsForOpenDB() { + Options opt = LDBCommand::PrepareOptionsForOpenDB(); + opt.create_if_missing = create_if_missing_; + if (bulk_load_) { + opt.PrepareForBulkLoad(); + } + return opt; +} + +void DBLoaderCommand::DoCommand() { + if (!db_) { + return; + } + + WriteOptions write_options; + if (disable_wal_) { + write_options.disableWAL = true; + } + + int bad_lines = 0; + string line; + while (getline(cin, line, '\n')) { + string key; + string value; + if (ParseKeyValue(line, &key, &value, is_key_hex_, is_value_hex_)) { + db_->Put(write_options, Slice(key), Slice(value)); + } else if (0 == line.find("Keys in range:")) { + // ignore this line + } else if (0 == line.find("Created bg thread 0x")) { + // ignore this line + } else { + bad_lines ++; + } + } + + if (bad_lines > 0) { + cout << "Warning: " << bad_lines << " bad lines ignored." 
<< endl; + } + if (compact_) { + db_->CompactRange(nullptr, nullptr); + } +} + +// ---------------------------------------------------------------------------- + +const string ManifestDumpCommand::ARG_VERBOSE = "verbose"; +const string ManifestDumpCommand::ARG_PATH = "path"; + +void ManifestDumpCommand::Help(string& ret) { + ret.append(" "); + ret.append(ManifestDumpCommand::Name()); + ret.append(" [--" + ARG_VERBOSE + "]"); + ret.append(" [--" + ARG_PATH + "=]"); + ret.append("\n"); +} + +ManifestDumpCommand::ManifestDumpCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX})), + verbose_(false), + path_("") +{ + verbose_ = IsFlagPresent(flags, ARG_VERBOSE); + + map::const_iterator itr = options.find(ARG_PATH); + if (itr != options.end()) { + path_ = itr->second; + if (path_.empty()) { + exec_state_ = LDBCommandExecuteResult::FAILED("--path: missing pathname"); + } + } +} + +void ManifestDumpCommand::DoCommand() { + + std::string manifestfile; + + if (!path_.empty()) { + manifestfile = path_; + } else { + bool found = false; + // We need to find the manifest file by searching the directory + // containing the db for files of the form MANIFEST_[0-9]+ + DIR* d = opendir(db_path_.c_str()); + if (d == nullptr) { + exec_state_ = LDBCommandExecuteResult::FAILED( + db_path_ + " is not a directory"); + return; + } + struct dirent* entry; + while ((entry = readdir(d)) != nullptr) { + unsigned int match; + unsigned long long num; + if (sscanf(entry->d_name, + "MANIFEST-%ln%ln", + (unsigned long*)&num, + (unsigned long*)&match) + && match == strlen(entry->d_name)) { + if (!found) { + manifestfile = db_path_ + "/" + std::string(entry->d_name); + found = true; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Multiple MANIFEST files found; use --path to select one"); + return; + } + } + } + closedir(d); + } + + if (verbose_) { + printf("Processing 
Manifest file %s\n", manifestfile.c_str()); + } + + Options options; + EnvOptions sopt; + std::string file(manifestfile); + std::string dbname("dummy"); + TableCache* tc = new TableCache(dbname, &options, sopt, 10); + const InternalKeyComparator* cmp = + new InternalKeyComparator(options.comparator); + + VersionSet* versions = new VersionSet(dbname, &options, sopt, tc, cmp); + Status s = versions->DumpManifest(options, file, verbose_, is_key_hex_); + if (!s.ok()) { + printf("Error in processing file %s %s\n", manifestfile.c_str(), + s.ToString().c_str()); + } + if (verbose_) { + printf("Processing Manifest file %s done\n", manifestfile.c_str()); + } +} + +// ---------------------------------------------------------------------------- + +string ReadableTime(int unixtime) { + char time_buffer [80]; + time_t rawtime = unixtime; + struct tm * timeinfo = localtime(&rawtime); + strftime(time_buffer, 80, "%c", timeinfo); + return string(time_buffer); +} + +// This function only called when it's the sane case of >1 buckets in time-range +// Also called only when timekv falls between ttl_start and ttl_end provided +void IncBucketCounts(vector& bucket_counts, int ttl_start, + int time_range, int bucket_size, int timekv, int num_buckets) { + assert(time_range > 0 && timekv >= ttl_start && bucket_size > 0 && + timekv < (ttl_start + time_range) && num_buckets > 1); + int bucket = (timekv - ttl_start) / bucket_size; + bucket_counts[bucket]++; +} + +void PrintBucketCounts(const vector& bucket_counts, int ttl_start, + int ttl_end, int bucket_size, int num_buckets) { + int time_point = ttl_start; + for(int i = 0; i < num_buckets - 1; i++, time_point += bucket_size) { + fprintf(stdout, "Keys in range %s to %s : %lu\n", + ReadableTime(time_point).c_str(), + ReadableTime(time_point + bucket_size).c_str(), + (unsigned long)bucket_counts[i]); + } + fprintf(stdout, "Keys in range %s to %s : %lu\n", + ReadableTime(time_point).c_str(), + ReadableTime(ttl_end).c_str(), + (unsigned 
long)bucket_counts[num_buckets - 1]); +} + +const string InternalDumpCommand::ARG_COUNT_ONLY = "count_only"; +const string InternalDumpCommand::ARG_COUNT_DELIM = "count_delim"; +const string InternalDumpCommand::ARG_STATS = "stats"; +const string InternalDumpCommand::ARG_INPUT_KEY_HEX = "input_key_hex"; + +InternalDumpCommand::InternalDumpCommand(const vector& params, + const map& options, + const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions({ ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_FROM, ARG_TO, ARG_MAX_KEYS, + ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS, + ARG_INPUT_KEY_HEX})), + has_from_(false), + has_to_(false), + max_keys_(-1), + delim_("."), + count_only_(false), + count_delim_(false), + print_stats_(false), + is_input_key_hex_(false) { + + has_from_ = ParseStringOption(options, ARG_FROM, &from_); + has_to_ = ParseStringOption(options, ARG_TO, &to_); + + ParseIntOption(options, ARG_MAX_KEYS, max_keys_, exec_state_); + map::const_iterator itr = options.find(ARG_COUNT_DELIM); + if (itr != options.end()) { + delim_ = itr->second; + count_delim_ = true; + // fprintf(stdout,"delim = %c\n",delim_[0]); + } else { + count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM); + delim_="."; + } + + print_stats_ = IsFlagPresent(flags, ARG_STATS); + count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY); + is_input_key_hex_ = IsFlagPresent(flags, ARG_INPUT_KEY_HEX); + + if (is_input_key_hex_) { + if (has_from_) { + from_ = HexToString(from_); + } + if (has_to_) { + to_ = HexToString(to_); + } + } +} + +void InternalDumpCommand::Help(string& ret) { + ret.append(" "); + ret.append(InternalDumpCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_INPUT_KEY_HEX + "]"); + ret.append(" [--" + ARG_MAX_KEYS + "=]"); + ret.append(" [--" + ARG_COUNT_ONLY + "]"); + ret.append(" [--" + ARG_COUNT_DELIM + "=]"); + ret.append(" [--" + ARG_STATS + "]"); + ret.append("\n"); +} + +void InternalDumpCommand::DoCommand() { + if (!db_) { + return; 
+ } + + if (print_stats_) { + string stats; + if (db_->GetProperty("rocksdb.stats", &stats)) { + fprintf(stdout, "%s\n", stats.c_str()); + } + } + + // Cast as DBImpl to get internal iterator + DBImpl* idb = dynamic_cast(db_); + if (!idb) { + exec_state_ = LDBCommandExecuteResult::FAILED("DB is not DBImpl"); + return; + } + string rtype1,rtype2,row,val; + rtype2 = ""; + uint64_t c=0; + uint64_t s1=0,s2=0; + // Setup internal key iterator + auto iter = unique_ptr(idb->TEST_NewInternalIterator()); + Status st = iter->status(); + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error:" + + st.ToString()); + } + + if (has_from_) { + InternalKey ikey(from_, kMaxSequenceNumber, kValueTypeForSeek); + iter->Seek(ikey.Encode()); + } else { + iter->SeekToFirst(); + } + + long long count = 0; + for (; iter->Valid(); iter->Next()) { + ParsedInternalKey ikey; + if (!ParseInternalKey(iter->key(), &ikey)) { + fprintf(stderr, "Internal Key [%s] parse error!\n", + iter->key().ToString(true /* in hex*/).data()); + // TODO: add error counter + continue; + } + + // If end marker was specified, we stop before it + if (has_to_ && options_.comparator->Compare(ikey.user_key, to_) >= 0) { + break; + } + + ++count; + int k; + if (count_delim_) { + rtype1 = ""; + s1=0; + row = iter->key().ToString(); + val = iter->value().ToString(); + for(k=0;row[k]!='\x01' && row[k]!='\0';k++) + s1++; + for(k=0;val[k]!='\x01' && val[k]!='\0';k++) + s1++; + for(int j=0;row[j]!=delim_[0] && row[j]!='\0' && row[j]!='\x01';j++) + rtype1+=row[j]; + if(rtype2.compare("") && rtype2.compare(rtype1)!=0) { + fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(), + (long long)c,(long long)s2); + c=1; + s2=s1; + rtype2 = rtype1; + } else { + c++; + s2+=s1; + rtype2=rtype1; + } + } + + if (!count_only_ && !count_delim_) { + string key = ikey.DebugString(is_key_hex_); + string value = iter->value().ToString(is_value_hex_); + std::cout << key << " => " << value << "\n"; + } + + // 
Terminate if maximum number of keys have been dumped + if (max_keys_ > 0 && count >= max_keys_) break; + } + if(count_delim_) { + fprintf(stdout,"%s => count:%lld\tsize:%lld\n", rtype2.c_str(), + (long long)c,(long long)s2); + } else + fprintf(stdout, "Internal keys in range: %lld\n", (long long) count); +} + + +const string DBDumperCommand::ARG_COUNT_ONLY = "count_only"; +const string DBDumperCommand::ARG_COUNT_DELIM = "count_delim"; +const string DBDumperCommand::ARG_STATS = "stats"; +const string DBDumperCommand::ARG_TTL_BUCKET = "bucket"; + +DBDumperCommand::DBDumperCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, + ARG_VALUE_HEX, ARG_FROM, ARG_TO, + ARG_MAX_KEYS, ARG_COUNT_ONLY, + ARG_COUNT_DELIM, ARG_STATS, ARG_TTL_START, + ARG_TTL_END, ARG_TTL_BUCKET, + ARG_TIMESTAMP})), + null_from_(true), + null_to_(true), + max_keys_(-1), + count_only_(false), + count_delim_(false), + print_stats_(false) { + + map::const_iterator itr = options.find(ARG_FROM); + if (itr != options.end()) { + null_from_ = false; + from_ = itr->second; + } + + itr = options.find(ARG_TO); + if (itr != options.end()) { + null_to_ = false; + to_ = itr->second; + } + + itr = options.find(ARG_MAX_KEYS); + if (itr != options.end()) { + try { + max_keys_ = stoi(itr->second); + } catch(const invalid_argument&) { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + + " has an invalid value"); + } catch(const out_of_range&) { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + + " has a value out-of-range"); + } + } + itr = options.find(ARG_COUNT_DELIM); + if (itr != options.end()) { + delim_ = itr->second; + count_delim_ = true; + } else { + count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM); + delim_="."; + } + + print_stats_ = IsFlagPresent(flags, ARG_STATS); + count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY); + + if (is_key_hex_) { + if (!null_from_) { + from_ 
= HexToString(from_); + } + if (!null_to_) { + to_ = HexToString(to_); + } + } +} + +void DBDumperCommand::Help(string& ret) { + ret.append(" "); + ret.append(DBDumperCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_TTL + "]"); + ret.append(" [--" + ARG_MAX_KEYS + "=]"); + ret.append(" [--" + ARG_TIMESTAMP + "]"); + ret.append(" [--" + ARG_COUNT_ONLY + "]"); + ret.append(" [--" + ARG_COUNT_DELIM + "=]"); + ret.append(" [--" + ARG_STATS + "]"); + ret.append(" [--" + ARG_TTL_BUCKET + "=]"); + ret.append(" [--" + ARG_TTL_START + "=:- is inclusive]"); + ret.append(" [--" + ARG_TTL_END + "=:- is exclusive]"); + ret.append("\n"); +} + +void DBDumperCommand::DoCommand() { + if (!db_) { + return; + } + // Parse command line args + uint64_t count = 0; + if (print_stats_) { + string stats; + if (db_->GetProperty("rocksdb.stats", &stats)) { + fprintf(stdout, "%s\n", stats.c_str()); + } + } + + // Setup key iterator + Iterator* iter = db_->NewIterator(ReadOptions()); + Status st = iter->status(); + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error." 
+ + st.ToString()); + } + + if (!null_from_) { + iter->Seek(from_); + } else { + iter->SeekToFirst(); + } + + int max_keys = max_keys_; + int ttl_start; + if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) { + ttl_start = DBWithTTL::kMinTimestamp; // TTL introduction time + } + int ttl_end; + if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) { + ttl_end = DBWithTTL::kMaxTimestamp; // Max time allowed by TTL feature + } + if (ttl_end < ttl_start) { + fprintf(stderr, "Error: End time can't be less than start time\n"); + delete iter; + return; + } + int time_range = ttl_end - ttl_start; + int bucket_size; + if (!ParseIntOption(option_map_, ARG_TTL_BUCKET, bucket_size, exec_state_) || + bucket_size <= 0) { + bucket_size = time_range; // Will have just 1 bucket by default + } + //cretaing variables for row count of each type + string rtype1,rtype2,row,val; + rtype2 = ""; + uint64_t c=0; + uint64_t s1=0,s2=0; + + // At this point, bucket_size=0 => time_range=0 + uint64_t num_buckets = (bucket_size >= time_range) ? 
1 : + ((time_range + bucket_size - 1) / bucket_size); + vector bucket_counts(num_buckets, 0); + if (is_db_ttl_ && !count_only_ && timestamp_ && !count_delim_) { + fprintf(stdout, "Dumping key-values from %s to %s\n", + ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str()); + } + + for (; iter->Valid(); iter->Next()) { + int rawtime = 0; + // If end marker was specified, we stop before it + if (!null_to_ && (iter->key().ToString() >= to_)) + break; + // Terminate if maximum number of keys have been dumped + if (max_keys == 0) + break; + if (is_db_ttl_) { + TtlIterator* it_ttl = dynamic_cast(iter); + assert(it_ttl); + rawtime = it_ttl->timestamp(); + if (rawtime < ttl_start || rawtime >= ttl_end) { + continue; + } + } + if (max_keys > 0) { + --max_keys; + } + if (is_db_ttl_ && num_buckets > 1) { + IncBucketCounts(bucket_counts, ttl_start, time_range, bucket_size, + rawtime, num_buckets); + } + ++count; + if (count_delim_) { + rtype1 = ""; + row = iter->key().ToString(); + val = iter->value().ToString(); + s1 = row.size()+val.size(); + for(int j=0;row[j]!=delim_[0] && row[j]!='\0';j++) + rtype1+=row[j]; + if(rtype2.compare("") && rtype2.compare(rtype1)!=0) { + fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(), + (long long )c,(long long)s2); + c=1; + s2=s1; + rtype2 = rtype1; + } else { + c++; + s2+=s1; + rtype2=rtype1; + } + + } + + + + if (!count_only_ && !count_delim_) { + if (is_db_ttl_ && timestamp_) { + fprintf(stdout, "%s ", ReadableTime(rawtime).c_str()); + } + string str = PrintKeyValue(iter->key().ToString(), + iter->value().ToString(), is_key_hex_, + is_value_hex_); + fprintf(stdout, "%s\n", str.c_str()); + } + } + + if (num_buckets > 1 && is_db_ttl_) { + PrintBucketCounts(bucket_counts, ttl_start, ttl_end, bucket_size, + num_buckets); + } else if(count_delim_) { + fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(), + (long long )c,(long long)s2); + } else { + fprintf(stdout, "Keys in range: %lld\n", (long long) count); + } 
+ // Clean up + delete iter; +} + +const string ReduceDBLevelsCommand::ARG_NEW_LEVELS = "new_levels"; +const string ReduceDBLevelsCommand::ARG_PRINT_OLD_LEVELS = "print_old_levels"; + +ReduceDBLevelsCommand::ReduceDBLevelsCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_NEW_LEVELS, ARG_PRINT_OLD_LEVELS})), + old_levels_(1 << 16), + new_levels_(-1), + print_old_levels_(false) { + + + ParseIntOption(option_map_, ARG_NEW_LEVELS, new_levels_, exec_state_); + print_old_levels_ = IsFlagPresent(flags, ARG_PRINT_OLD_LEVELS); + + if(new_levels_ <= 0) { + exec_state_ = LDBCommandExecuteResult::FAILED( + " Use --" + ARG_NEW_LEVELS + " to specify a new level number\n"); + } +} + +vector ReduceDBLevelsCommand::PrepareArgs(const string& db_path, + int new_levels, bool print_old_level) { + vector ret; + ret.push_back("reduce_levels"); + ret.push_back("--" + ARG_DB + "=" + db_path); + ret.push_back("--" + ARG_NEW_LEVELS + "=" + to_string(new_levels)); + if(print_old_level) { + ret.push_back("--" + ARG_PRINT_OLD_LEVELS); + } + return ret; +} + +void ReduceDBLevelsCommand::Help(string& ret) { + ret.append(" "); + ret.append(ReduceDBLevelsCommand::Name()); + ret.append(" --" + ARG_NEW_LEVELS + "="); + ret.append(" [--" + ARG_PRINT_OLD_LEVELS + "]"); + ret.append("\n"); +} + +Options ReduceDBLevelsCommand::PrepareOptionsForOpenDB() { + Options opt = LDBCommand::PrepareOptionsForOpenDB(); + opt.num_levels = old_levels_; + opt.max_bytes_for_level_multiplier_additional.resize(opt.num_levels, 1); + // Disable size compaction + opt.max_bytes_for_level_base = 1UL << 50; + opt.max_bytes_for_level_multiplier = 1; + opt.max_mem_compaction_level = 0; + return opt; +} + +Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, + int* levels) { + EnvOptions soptions; + TableCache tc(db_path_, &opt, soptions, 10); + const InternalKeyComparator cmp(opt.comparator); + VersionSet versions(db_path_, &opt, 
soptions, &tc, &cmp); + // We rely the VersionSet::Recover to tell us the internal data structures + // in the db. And the Recover() should never do any change + // (like LogAndApply) to the manifest file. + Status st = versions.Recover(); + if (!st.ok()) { + return st; + } + int max = -1; + for (int i = 0; i < versions.NumberLevels(); i++) { + if (versions.current()->NumLevelFiles(i)) { + max = i; + } + } + + *levels = max + 1; + return st; +} + +void ReduceDBLevelsCommand::DoCommand() { + if (new_levels_ <= 1) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Invalid number of levels.\n"); + return; + } + + Status st; + Options opt = PrepareOptionsForOpenDB(); + int old_level_num = -1; + st = GetOldNumOfLevels(opt, &old_level_num); + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + return; + } + + if (print_old_levels_) { + fprintf(stdout, "The old number of levels in use is %d\n", old_level_num); + } + + if (old_level_num <= new_levels_) { + return; + } + + old_levels_ = old_level_num; + + OpenDB(); + if (!db_) { + return; + } + // Compact the whole DB to put all files to the highest level. + fprintf(stdout, "Compacting the db...\n"); + db_->CompactRange(nullptr, nullptr); + CloseDB(); + + EnvOptions soptions; + TableCache tc(db_path_, &opt, soptions, 10); + const InternalKeyComparator cmp(opt.comparator); + VersionSet versions(db_path_, &opt, soptions, &tc, &cmp); + // We rely the VersionSet::Recover to tell us the internal data structures + // in the db. And the Recover() should never do any change (like LogAndApply) + // to the manifest file. 
+ st = versions.Recover(); + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + return; + } + + port::Mutex mu; + mu.Lock(); + st = versions.ReduceNumberOfLevels(new_levels_, &mu); + mu.Unlock(); + + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + return; + } +} + +const string ChangeCompactionStyleCommand::ARG_OLD_COMPACTION_STYLE = + "old_compaction_style"; +const string ChangeCompactionStyleCommand::ARG_NEW_COMPACTION_STYLE = + "new_compaction_style"; + +ChangeCompactionStyleCommand::ChangeCompactionStyleCommand( + const vector& params, const map& options, + const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_OLD_COMPACTION_STYLE, + ARG_NEW_COMPACTION_STYLE})), + old_compaction_style_(-1), + new_compaction_style_(-1) { + + ParseIntOption(option_map_, ARG_OLD_COMPACTION_STYLE, old_compaction_style_, + exec_state_); + if (old_compaction_style_ != kCompactionStyleLevel && + old_compaction_style_ != kCompactionStyleUniversal) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Use --" + ARG_OLD_COMPACTION_STYLE + " to specify old compaction " + + "style. Check ldb help for proper compaction style value.\n"); + return; + } + + ParseIntOption(option_map_, ARG_NEW_COMPACTION_STYLE, new_compaction_style_, + exec_state_); + if (new_compaction_style_ != kCompactionStyleLevel && + new_compaction_style_ != kCompactionStyleUniversal) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Use --" + ARG_NEW_COMPACTION_STYLE + " to specify new compaction " + + "style. Check ldb help for proper compaction style value.\n"); + return; + } + + if (new_compaction_style_ == old_compaction_style_) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Old compaction style is the same as new compaction style. 
" + "Nothing to do.\n"); + return; + } + + if (old_compaction_style_ == kCompactionStyleUniversal && + new_compaction_style_ == kCompactionStyleLevel) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Convert from universal compaction to level compaction. " + "Nothing to do.\n"); + return; + } +} + +void ChangeCompactionStyleCommand::Help(string& ret) { + ret.append(" "); + ret.append(ChangeCompactionStyleCommand::Name()); + ret.append(" --" + ARG_OLD_COMPACTION_STYLE + "="); + ret.append(" --" + ARG_NEW_COMPACTION_STYLE + "="); + ret.append("\n"); +} + +Options ChangeCompactionStyleCommand::PrepareOptionsForOpenDB() { + Options opt = LDBCommand::PrepareOptionsForOpenDB(); + + if (old_compaction_style_ == kCompactionStyleLevel && + new_compaction_style_ == kCompactionStyleUniversal) { + // In order to convert from level compaction to universal compaction, we + // need to compact all data into a single file and move it to level 0. + opt.disable_auto_compactions = true; + opt.target_file_size_base = INT_MAX; + opt.target_file_size_multiplier = 1; + opt.max_bytes_for_level_base = INT_MAX; + opt.max_bytes_for_level_multiplier = 1; + } + + return opt; +} + +void ChangeCompactionStyleCommand::DoCommand() { + // print db stats before we have made any change + std::string property; + std::string files_per_level; + for (int i = 0; i < db_->NumberLevels(); i++) { + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(i), + &property); + + // format print string + char buf[100]; + snprintf(buf, sizeof(buf), "%s%s", (i ? 
"," : ""), property.c_str()); + files_per_level += buf; + } + fprintf(stdout, "files per level before compaction: %s\n", + files_per_level.c_str()); + + // manual compact into a single file and move the file to level 0 + db_->CompactRange(nullptr, nullptr, + true /* reduce level */, + 0 /* reduce to level 0 */); + + // verify compaction result + files_per_level = ""; + int num_files = 0; + for (int i = 0; i < db_->NumberLevels(); i++) { + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(i), + &property); + + // format print string + char buf[100]; + snprintf(buf, sizeof(buf), "%s%s", (i ? "," : ""), property.c_str()); + files_per_level += buf; + + num_files = atoi(property.c_str()); + + // level 0 should have only 1 file + if (i == 0 && num_files != 1) { + exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at " + "level 0 after compaction is " + std::to_string(num_files) + + ", not 1.\n"); + return; + } + // other levels should have no file + if (i > 0 && num_files != 0) { + exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at " + "level " + std::to_string(i) + " after compaction is " + + std::to_string(num_files) + ", not 0.\n"); + return; + } + } + + fprintf(stdout, "files per level after compaction: %s\n", + files_per_level.c_str()); +} + +class InMemoryHandler : public WriteBatch::Handler { + public: + InMemoryHandler(stringstream& row, bool print_values) : Handler(),row_(row) { + print_values_ = print_values; + } + + void commonPutMerge(const Slice& key, const Slice& value) { + string k = LDBCommand::StringToHex(key.ToString()); + if (print_values_) { + string v = LDBCommand::StringToHex(value.ToString()); + row_ << k << " : "; + row_ << v << " "; + } else { + row_ << k << " "; + } + } + + virtual void Put(const Slice& key, const Slice& value) { + row_ << "PUT : "; + commonPutMerge(key, value); + } + + virtual void Merge(const Slice& key, const Slice& value) { + row_ << "MERGE : "; + commonPutMerge(key, value); + } 
+ + virtual void Delete(const Slice& key) { + row_ <<",DELETE : "; + row_ << LDBCommand::StringToHex(key.ToString()) << " "; + } + + virtual ~InMemoryHandler() { }; + + private: + stringstream & row_; + bool print_values_; +}; + +const string WALDumperCommand::ARG_WAL_FILE = "walfile"; +const string WALDumperCommand::ARG_PRINT_VALUE = "print_value"; +const string WALDumperCommand::ARG_PRINT_HEADER = "header"; + +WALDumperCommand::WALDumperCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions( + {ARG_WAL_FILE, ARG_PRINT_HEADER, ARG_PRINT_VALUE})), + print_header_(false), print_values_(false) { + + wal_file_.clear(); + + map::const_iterator itr = options.find(ARG_WAL_FILE); + if (itr != options.end()) { + wal_file_ = itr->second; + } + + + print_header_ = IsFlagPresent(flags, ARG_PRINT_HEADER); + print_values_ = IsFlagPresent(flags, ARG_PRINT_VALUE); + if (wal_file_.empty()) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Argument " + ARG_WAL_FILE + " must be specified."); + } +} + +void WALDumperCommand::Help(string& ret) { + ret.append(" "); + ret.append(WALDumperCommand::Name()); + ret.append(" --" + ARG_WAL_FILE + "="); + ret.append(" [--" + ARG_PRINT_HEADER + "] "); + ret.append(" [--" + ARG_PRINT_VALUE + "] "); + ret.append("\n"); +} + +void WALDumperCommand::DoCommand() { + struct StdErrReporter : public log::Reader::Reporter { + virtual void Corruption(size_t bytes, const Status& s) { + cerr<<"Corruption detected in log file "< file; + Env* env_ = Env::Default(); + EnvOptions soptions; + Status status = env_->NewSequentialFile(wal_file_, &file, soptions); + if (!status.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED("Failed to open WAL file " + + status.ToString()); + } else { + StdErrReporter reporter; + log::Reader reader(move(file), &reporter, true, 0); + string scratch; + WriteBatch batch; + Slice record; + stringstream row; + if (print_header_) { + 
cout<<"Sequence,Count,ByteSize,Physical Offset,Key(s)"; + if (print_values_) { + cout << " : value "; + } + cout << "\n"; + } + while(reader.ReadRecord(&record, &scratch)) { + row.str(""); + if (record.size() < 12) { + reporter.Corruption( + record.size(), Status::Corruption("log record too small")); + } else { + WriteBatchInternal::SetContents(&batch, record); + row<& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_TTL, ARG_HEX, + ARG_KEY_HEX, + ARG_VALUE_HEX})) { + + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::FAILED( + " must be specified for the get command"); + } else { + key_ = params.at(0); + } + + if (is_key_hex_) { + key_ = HexToString(key_); + } +} + +void GetCommand::Help(string& ret) { + ret.append(" "); + ret.append(GetCommand::Name()); + ret.append(" "); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); +} + +void GetCommand::DoCommand() { + string value; + Status st = db_->Get(ReadOptions(), key_, &value); + if (st.ok()) { + fprintf(stdout, "%s\n", + (is_value_hex_ ? 
StringToHex(value) : value).c_str()); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } +} + + +ApproxSizeCommand::ApproxSizeCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_FROM, ARG_TO})) { + + if (options.find(ARG_FROM) != options.end()) { + start_key_ = options.find(ARG_FROM)->second; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_FROM + + " must be specified for approxsize command"); + return; + } + + if (options.find(ARG_TO) != options.end()) { + end_key_ = options.find(ARG_TO)->second; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_TO + + " must be specified for approxsize command"); + return; + } + + if (is_key_hex_) { + start_key_ = HexToString(start_key_); + end_key_ = HexToString(end_key_); + } +} + +void ApproxSizeCommand::Help(string& ret) { + ret.append(" "); + ret.append(ApproxSizeCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append("\n"); +} + +void ApproxSizeCommand::DoCommand() { + + Range ranges[1]; + ranges[0] = Range(start_key_, end_key_); + uint64_t sizes[1]; + db_->GetApproximateSizes(ranges, 1, sizes); + fprintf(stdout, "%lu\n", (unsigned long)sizes[0]); + /* Weird that GetApproximateSizes() returns void, although documentation + * says that it returns a Status object. 
+ if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } + */ +} + + +BatchPutCommand::BatchPutCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_CREATE_IF_MISSING})) { + + if (params.size() < 2) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "At least one pair must be specified batchput."); + } else if (params.size() % 2 != 0) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Equal number of s and s must be specified for batchput."); + } else { + for (size_t i = 0; i < params.size(); i += 2) { + string key = params.at(i); + string value = params.at(i+1); + key_values_.push_back(pair( + is_key_hex_ ? HexToString(key) : key, + is_value_hex_ ? HexToString(value) : value)); + } + } +} + +void BatchPutCommand::Help(string& ret) { + ret.append(" "); + ret.append(BatchPutCommand::Name()); + ret.append(" [ ] [..]"); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); +} + +void BatchPutCommand::DoCommand() { + WriteBatch batch; + + for (vector>::const_iterator itr + = key_values_.begin(); itr != key_values_.end(); itr++) { + batch.Put(itr->first, itr->second); + } + Status st = db_->Write(WriteOptions(), &batch); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } +} + +Options BatchPutCommand::PrepareOptionsForOpenDB() { + Options opt = LDBCommand::PrepareOptionsForOpenDB(); + opt.create_if_missing = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING); + return opt; +} + + +ScanCommand::ScanCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_TO, + ARG_VALUE_HEX, ARG_FROM, ARG_TIMESTAMP, + ARG_MAX_KEYS, ARG_TTL_START, ARG_TTL_END})), + start_key_specified_(false), + end_key_specified_(false), + max_keys_scanned_(-1) { 
+ + map::const_iterator itr = options.find(ARG_FROM); + if (itr != options.end()) { + start_key_ = itr->second; + if (is_key_hex_) { + start_key_ = HexToString(start_key_); + } + start_key_specified_ = true; + } + itr = options.find(ARG_TO); + if (itr != options.end()) { + end_key_ = itr->second; + if (is_key_hex_) { + end_key_ = HexToString(end_key_); + } + end_key_specified_ = true; + } + + itr = options.find(ARG_MAX_KEYS); + if (itr != options.end()) { + try { + max_keys_scanned_ = stoi(itr->second); + } catch(const invalid_argument&) { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + + " has an invalid value"); + } catch(const out_of_range&) { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + + " has a value out-of-range"); + } + } +} + +void ScanCommand::Help(string& ret) { + ret.append(" "); + ret.append(ScanCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_TTL + "]"); + ret.append(" [--" + ARG_TIMESTAMP + "]"); + ret.append(" [--" + ARG_MAX_KEYS + "=q] "); + ret.append(" [--" + ARG_TTL_START + "=:- is inclusive]"); + ret.append(" [--" + ARG_TTL_END + "=:- is exclusive]"); + ret.append("\n"); +} + +void ScanCommand::DoCommand() { + + int num_keys_scanned = 0; + Iterator* it = db_->NewIterator(ReadOptions()); + if (start_key_specified_) { + it->Seek(start_key_); + } else { + it->SeekToFirst(); + } + int ttl_start; + if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) { + ttl_start = DBWithTTL::kMinTimestamp; // TTL introduction time + } + int ttl_end; + if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) { + ttl_end = DBWithTTL::kMaxTimestamp; // Max time allowed by TTL feature + } + if (ttl_end < ttl_start) { + fprintf(stderr, "Error: End time can't be less than start time\n"); + delete it; + return; + } + if (is_db_ttl_ && timestamp_) { + fprintf(stdout, "Scanning key-values from %s to %s\n", + ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str()); + } + 
for ( ; + it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_); + it->Next()) { + string key = it->key().ToString(); + if (is_db_ttl_) { + TtlIterator* it_ttl = dynamic_cast(it); + assert(it_ttl); + int rawtime = it_ttl->timestamp(); + if (rawtime < ttl_start || rawtime >= ttl_end) { + continue; + } + if (timestamp_) { + fprintf(stdout, "%s ", ReadableTime(rawtime).c_str()); + } + } + string value = it->value().ToString(); + fprintf(stdout, "%s : %s\n", + (is_key_hex_ ? StringToHex(key) : key).c_str(), + (is_value_hex_ ? StringToHex(value) : value).c_str() + ); + num_keys_scanned++; + if (max_keys_scanned_ >= 0 && num_keys_scanned >= max_keys_scanned_) { + break; + } + } + if (!it->status().ok()) { // Check for any errors found during the scan + exec_state_ = LDBCommandExecuteResult::FAILED(it->status().ToString()); + } + delete it; +} + + +DeleteCommand::DeleteCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) { + + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "KEY must be specified for the delete command"); + } else { + key_ = params.at(0); + if (is_key_hex_) { + key_ = HexToString(key_); + } + } +} + +void DeleteCommand::Help(string& ret) { + ret.append(" "); + ret.append(DeleteCommand::Name() + " "); + ret.append("\n"); +} + +void DeleteCommand::DoCommand() { + Status st = db_->Delete(WriteOptions(), key_); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } +} + + +PutCommand::PutCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_CREATE_IF_MISSING})) { + + if (params.size() != 2) { + exec_state_ = LDBCommandExecuteResult::FAILED( + " and must be specified for the put command"); + } else { 
+ key_ = params.at(0); + value_ = params.at(1); + } + + if (is_key_hex_) { + key_ = HexToString(key_); + } + + if (is_value_hex_) { + value_ = HexToString(value_); + } +} + +void PutCommand::Help(string& ret) { + ret.append(" "); + ret.append(PutCommand::Name()); + ret.append(" "); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); +} + +void PutCommand::DoCommand() { + Status st = db_->Put(WriteOptions(), key_, value_); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } +} + +Options PutCommand::PrepareOptionsForOpenDB() { + Options opt = LDBCommand::PrepareOptionsForOpenDB(); + opt.create_if_missing = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING); + return opt; +} + + +const char* DBQuerierCommand::HELP_CMD = "help"; +const char* DBQuerierCommand::GET_CMD = "get"; +const char* DBQuerierCommand::PUT_CMD = "put"; +const char* DBQuerierCommand::DELETE_CMD = "delete"; + +DBQuerierCommand::DBQuerierCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, + ARG_VALUE_HEX})) { + +} + +void DBQuerierCommand::Help(string& ret) { + ret.append(" "); + ret.append(DBQuerierCommand::Name()); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); + ret.append(" Starts a REPL shell. 
Type help for list of available " + "commands."); + ret.append("\n"); +} + +void DBQuerierCommand::DoCommand() { + if (!db_) { + return; + } + + ReadOptions read_options; + WriteOptions write_options; + + string line; + string key; + string value; + while (getline(cin, line, '\n')) { + + // Parse line into vector + vector tokens; + size_t pos = 0; + while (true) { + size_t pos2 = line.find(' ', pos); + if (pos2 == string::npos) { + break; + } + tokens.push_back(line.substr(pos, pos2-pos)); + pos = pos2 + 1; + } + tokens.push_back(line.substr(pos)); + + const string& cmd = tokens[0]; + + if (cmd == HELP_CMD) { + fprintf(stdout, + "get \n" + "put \n" + "delete \n"); + } else if (cmd == DELETE_CMD && tokens.size() == 2) { + key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]); + db_->Delete(write_options, Slice(key)); + fprintf(stdout, "Successfully deleted %s\n", tokens[1].c_str()); + } else if (cmd == PUT_CMD && tokens.size() == 3) { + key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]); + value = (is_value_hex_ ? HexToString(tokens[2]) : tokens[2]); + db_->Put(write_options, Slice(key), Slice(value)); + fprintf(stdout, "Successfully put %s %s\n", + tokens[1].c_str(), tokens[2].c_str()); + } else if (cmd == GET_CMD && tokens.size() == 2) { + key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]); + if (db_->Get(read_options, Slice(key), &value).ok()) { + fprintf(stdout, "%s\n", PrintKeyValue(key, value, + is_key_hex_, is_value_hex_).c_str()); + } else { + fprintf(stdout, "Not found %s\n", tokens[1].c_str()); + } + } else { + fprintf(stdout, "Unknown command %s\n", line.c_str()); + } + } +} + + +} diff --git a/util/ldb_cmd.h b/util/ldb_cmd.h new file mode 100644 index 00000000..022f5fad --- /dev/null +++ b/util/ldb_cmd.h @@ -0,0 +1,689 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +#include +#include +#include +#include +#include +#include + +#include "db/version_set.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice.h" +#include "util/logging.h" +#include "util/ldb_cmd_execute_result.h" +#include "util/string_util.h" +#include "utilities/utility_db.h" +#include "utilities/ttl/db_ttl.h" + +using std::string; +using std::map; +using std::vector; +using std::ostringstream; + +namespace rocksdb { + +class LDBCommand { +public: + + // Command-line arguments + static const string ARG_DB; + static const string ARG_HEX; + static const string ARG_KEY_HEX; + static const string ARG_VALUE_HEX; + static const string ARG_TTL; + static const string ARG_TTL_START; + static const string ARG_TTL_END; + static const string ARG_TIMESTAMP; + static const string ARG_FROM; + static const string ARG_TO; + static const string ARG_MAX_KEYS; + static const string ARG_BLOOM_BITS; + static const string ARG_COMPRESSION_TYPE; + static const string ARG_BLOCK_SIZE; + static const string ARG_AUTO_COMPACTION; + static const string ARG_WRITE_BUFFER_SIZE; + static const string ARG_FILE_SIZE; + static const string ARG_CREATE_IF_MISSING; + + static LDBCommand* InitFromCmdLineArgs( + const vector& args, + const Options& options = Options() + ); + + static LDBCommand* InitFromCmdLineArgs( + int argc, + char** argv, + const Options& options = Options() + ); + + bool ValidateCmdLineOptions(); + + virtual Options PrepareOptionsForOpenDB(); + + virtual void SetOptions(Options options) { + options_ = options; + } + + virtual bool NoDBOpen() { + return false; + } + + virtual ~LDBCommand() { + if (db_ != nullptr) { + delete db_; + db_ = nullptr; + } + } + + /* Run the command, and return the execute result. 
*/ + void Run() { + if (!exec_state_.IsNotStarted()) { + return; + } + + if (db_ == nullptr && !NoDBOpen()) { + OpenDB(); + if (!exec_state_.IsNotStarted()) { + return; + } + } + + DoCommand(); + if (exec_state_.IsNotStarted()) { + exec_state_ = LDBCommandExecuteResult::SUCCEED(""); + } + + if (db_ != nullptr) { + CloseDB (); + } + } + + virtual void DoCommand() = 0; + + LDBCommandExecuteResult GetExecuteState() { + return exec_state_; + } + + void ClearPreviousRunState() { + exec_state_.Reset(); + } + + static string HexToString(const string& str) { + string parsed; + if (str[0] != '0' || str[1] != 'x') { + fprintf(stderr, "Invalid hex input %s. Must start with 0x\n", + str.c_str()); + throw "Invalid hex input"; + } + + for (unsigned int i = 2; i < str.length();) { + int c; + sscanf(str.c_str() + i, "%2X", &c); + parsed.push_back(c); + i += 2; + } + return parsed; + } + + static string StringToHex(const string& str) { + string result = "0x"; + char buf[10]; + for (size_t i = 0; i < str.length(); i++) { + snprintf(buf, 10, "%02X", (unsigned char)str[i]); + result += buf; + } + return result; + } + + static const char* DELIM; + +protected: + + LDBCommandExecuteResult exec_state_; + string db_path_; + DB* db_; + StackableDB* sdb_; + + /** + * true implies that this command can work if the db is opened in read-only + * mode. + */ + bool is_read_only_; + + /** If true, the key is input/output as hex in get/put/scan/delete etc. */ + bool is_key_hex_; + + /** If true, the value is input/output as hex in get/put/scan/delete etc. */ + bool is_value_hex_; + + /** If true, the value is treated as timestamp suffixed */ + bool is_db_ttl_; + + // If true, the kvs are output with their insert/modify timestamp in a ttl db + bool timestamp_; + + /** + * Map of options passed on the command-line. + */ + const map option_map_; + + /** + * Flags passed on the command-line. 
+ */ + const vector flags_; + + /** List of command-line options valid for this command */ + const vector valid_cmd_line_options_; + + bool ParseKeyValue(const string& line, string* key, string* value, + bool is_key_hex, bool is_value_hex); + + LDBCommand(const map& options, const vector& flags, + bool is_read_only, const vector& valid_cmd_line_options) : + db_(nullptr), + is_read_only_(is_read_only), + is_key_hex_(false), + is_value_hex_(false), + is_db_ttl_(false), + timestamp_(false), + option_map_(options), + flags_(flags), + valid_cmd_line_options_(valid_cmd_line_options) { + + map::const_iterator itr = options.find(ARG_DB); + if (itr != options.end()) { + db_path_ = itr->second; + } + + is_key_hex_ = IsKeyHex(options, flags); + is_value_hex_ = IsValueHex(options, flags); + is_db_ttl_ = IsFlagPresent(flags, ARG_TTL); + timestamp_ = IsFlagPresent(flags, ARG_TIMESTAMP); + } + + void OpenDB() { + Options opt = PrepareOptionsForOpenDB(); + if (!exec_state_.IsNotStarted()) { + return; + } + // Open the DB. + Status st; + if (is_db_ttl_) { + if (is_read_only_) { + st = UtilityDB::OpenTtlDB(opt, db_path_, &sdb_, 0, true); + } else { + st = UtilityDB::OpenTtlDB(opt, db_path_, &sdb_); + } + db_ = sdb_; + } else if (is_read_only_) { + st = DB::OpenForReadOnly(opt, db_path_, &db_); + } else { + st = DB::Open(opt, db_path_, &db_); + } + if (!st.ok()) { + string msg = st.ToString(); + exec_state_ = LDBCommandExecuteResult::FAILED(msg); + } + + options_ = opt; + } + + void CloseDB () { + if (db_ != nullptr) { + delete db_; + db_ = nullptr; + } + } + + static string PrintKeyValue(const string& key, const string& value, + bool is_key_hex, bool is_value_hex) { + string result; + result.append(is_key_hex ? StringToHex(key) : key); + result.append(DELIM); + result.append(is_value_hex ? 
StringToHex(value) : value); + return result; + } + + static string PrintKeyValue(const string& key, const string& value, + bool is_hex) { + return PrintKeyValue(key, value, is_hex, is_hex); + } + + /** + * Return true if the specified flag is present in the specified flags vector + */ + static bool IsFlagPresent(const vector& flags, const string& flag) { + return (std::find(flags.begin(), flags.end(), flag) != flags.end()); + } + + static string HelpRangeCmdArgs() { + ostringstream str_stream; + str_stream << " "; + str_stream << "[--" << ARG_FROM << "] "; + str_stream << "[--" << ARG_TO << "] "; + return str_stream.str(); + } + + /** + * A helper function that returns a list of command line options + * used by this command. It includes the common options and the ones + * passed in. + */ + vector BuildCmdLineOptions(vector options) { + vector ret = {ARG_DB, ARG_BLOOM_BITS, ARG_BLOCK_SIZE, + ARG_AUTO_COMPACTION, ARG_COMPRESSION_TYPE, + ARG_WRITE_BUFFER_SIZE, ARG_FILE_SIZE}; + ret.insert(ret.end(), options.begin(), options.end()); + return ret; + } + + bool ParseIntOption(const map& options, const string& option, + int& value, LDBCommandExecuteResult& exec_state); + + bool ParseStringOption(const map& options, + const string& option, string* value); + + Options options_; + +private: + + /** + * Interpret command line options and flags to determine if the key + * should be input/output in hex. + */ + bool IsKeyHex(const map& options, + const vector& flags) { + return (IsFlagPresent(flags, ARG_HEX) || + IsFlagPresent(flags, ARG_KEY_HEX) || + ParseBooleanOption(options, ARG_HEX, false) || + ParseBooleanOption(options, ARG_KEY_HEX, false)); + } + + /** + * Interpret command line options and flags to determine if the value + * should be input/output in hex. 
+ */ + bool IsValueHex(const map& options, + const vector& flags) { + return (IsFlagPresent(flags, ARG_HEX) || + IsFlagPresent(flags, ARG_VALUE_HEX) || + ParseBooleanOption(options, ARG_HEX, false) || + ParseBooleanOption(options, ARG_VALUE_HEX, false)); + } + + /** + * Returns the value of the specified option as a boolean. + * default_val is used if the option is not found in options. + * Throws an exception if the value of the option is not + * "true" or "false" (case insensitive). + */ + bool ParseBooleanOption(const map& options, + const string& option, bool default_val) { + + map::const_iterator itr = options.find(option); + if (itr != options.end()) { + string option_val = itr->second; + return StringToBool(itr->second); + } + return default_val; + } + + /** + * Converts val to a boolean. + * val must be either true or false (case insensitive). + * Otherwise an exception is thrown. + */ + bool StringToBool(string val) { + std::transform(val.begin(), val.end(), val.begin(), ::tolower); + if (val == "true") { + return true; + } else if (val == "false") { + return false; + } else { + throw "Invalid value for boolean argument"; + } + } + + static LDBCommand* SelectCommand( + const string& cmd, + const vector& cmdParams, + const map& option_map, + const vector& flags + ); + +}; + +class CompactorCommand: public LDBCommand { +public: + static string Name() { return "compact"; } + + CompactorCommand(const vector& params, + const map& options, const vector& flags); + + static void Help(string& ret); + + virtual void DoCommand(); + +private: + bool null_from_; + string from_; + bool null_to_; + string to_; +}; + +class DBDumperCommand: public LDBCommand { +public: + static string Name() { return "dump"; } + + DBDumperCommand(const vector& params, + const map& options, const vector& flags); + + static void Help(string& ret); + + virtual void DoCommand(); + +private: + bool null_from_; + string from_; + bool null_to_; + string to_; + int max_keys_; + string delim_; + 
bool count_only_; + bool count_delim_; + bool print_stats_; + + static const string ARG_COUNT_ONLY; + static const string ARG_COUNT_DELIM; + static const string ARG_STATS; + static const string ARG_TTL_BUCKET; +}; + +class InternalDumpCommand: public LDBCommand { +public: + static string Name() { return "idump"; } + + InternalDumpCommand(const vector& params, + const map& options, + const vector& flags); + + static void Help(string& ret); + + virtual void DoCommand(); + +private: + bool has_from_; + string from_; + bool has_to_; + string to_; + int max_keys_; + string delim_; + bool count_only_; + bool count_delim_; + bool print_stats_; + bool is_input_key_hex_; + + static const string ARG_DELIM; + static const string ARG_COUNT_ONLY; + static const string ARG_COUNT_DELIM; + static const string ARG_STATS; + static const string ARG_INPUT_KEY_HEX; +}; + +class DBLoaderCommand: public LDBCommand { +public: + static string Name() { return "load"; } + + DBLoaderCommand(string& db_name, vector& args); + + DBLoaderCommand(const vector& params, + const map& options, const vector& flags); + + static void Help(string& ret); + virtual void DoCommand(); + + virtual Options PrepareOptionsForOpenDB(); + +private: + bool create_if_missing_; + bool disable_wal_; + bool bulk_load_; + bool compact_; + + static const string ARG_DISABLE_WAL; + static const string ARG_BULK_LOAD; + static const string ARG_COMPACT; +}; + +class ManifestDumpCommand: public LDBCommand { +public: + static string Name() { return "manifest_dump"; } + + ManifestDumpCommand(const vector& params, + const map& options, const vector& flags); + + static void Help(string& ret); + virtual void DoCommand(); + + virtual bool NoDBOpen() { + return true; + } + +private: + bool verbose_; + string path_; + + static const string ARG_VERBOSE; + static const string ARG_PATH; +}; + +class ReduceDBLevelsCommand : public LDBCommand { +public: + static string Name() { return "reduce_levels"; } + + ReduceDBLevelsCommand(const 
vector& params, + const map& options, const vector& flags); + + virtual Options PrepareOptionsForOpenDB(); + + virtual void DoCommand(); + + virtual bool NoDBOpen() { + return true; + } + + static void Help(string& msg); + + static vector PrepareArgs(const string& db_path, int new_levels, + bool print_old_level = false); + +private: + int old_levels_; + int new_levels_; + bool print_old_levels_; + + static const string ARG_NEW_LEVELS; + static const string ARG_PRINT_OLD_LEVELS; + + Status GetOldNumOfLevels(Options& opt, int* levels); +}; + +class ChangeCompactionStyleCommand : public LDBCommand { +public: + static string Name() { return "change_compaction_style"; } + + ChangeCompactionStyleCommand(const vector& params, + const map& options, const vector& flags); + + virtual Options PrepareOptionsForOpenDB(); + + virtual void DoCommand(); + + static void Help(string& msg); + +private: + int old_compaction_style_; + int new_compaction_style_; + + static const string ARG_OLD_COMPACTION_STYLE; + static const string ARG_NEW_COMPACTION_STYLE; +}; + +class WALDumperCommand : public LDBCommand { +public: + static string Name() { return "dump_wal"; } + + WALDumperCommand(const vector& params, + const map& options, const vector& flags); + + virtual bool NoDBOpen() { + return true; + } + + static void Help(string& ret); + virtual void DoCommand(); + +private: + bool print_header_; + string wal_file_; + bool print_values_; + + static const string ARG_WAL_FILE; + static const string ARG_PRINT_HEADER; + static const string ARG_PRINT_VALUE; +}; + + +class GetCommand : public LDBCommand { +public: + static string Name() { return "get"; } + + GetCommand(const vector& params, const map& options, + const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + +private: + string key_; +}; + +class ApproxSizeCommand : public LDBCommand { +public: + static string Name() { return "approxsize"; } + + ApproxSizeCommand(const vector& params, + const map& options, 
const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + +private: + string start_key_; + string end_key_; +}; + +class BatchPutCommand : public LDBCommand { +public: + static string Name() { return "batchput"; } + + BatchPutCommand(const vector& params, + const map& options, const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + + virtual Options PrepareOptionsForOpenDB(); + +private: + /** + * The key-values to be inserted. + */ + vector> key_values_; +}; + +class ScanCommand : public LDBCommand { +public: + static string Name() { return "scan"; } + + ScanCommand(const vector& params, const map& options, + const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + +private: + string start_key_; + string end_key_; + bool start_key_specified_; + bool end_key_specified_; + int max_keys_scanned_; +}; + +class DeleteCommand : public LDBCommand { +public: + static string Name() { return "delete"; } + + DeleteCommand(const vector& params, + const map& options, const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + +private: + string key_; +}; + +class PutCommand : public LDBCommand { +public: + static string Name() { return "put"; } + + PutCommand(const vector& params, const map& options, + const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + + virtual Options PrepareOptionsForOpenDB(); + +private: + string key_; + string value_; +}; + +/** + * Command that starts up a REPL shell that allows + * get/put/delete. 
+ */ +class DBQuerierCommand: public LDBCommand { +public: + static string Name() { return "query"; } + + DBQuerierCommand(const vector& params, + const map& options, const vector& flags); + + static void Help(string& ret); + + virtual void DoCommand(); + +private: + static const char* HELP_CMD; + static const char* GET_CMD; + static const char* PUT_CMD; + static const char* DELETE_CMD; +}; + +} // namespace rocksdb diff --git a/util/ldb_cmd_execute_result.h b/util/ldb_cmd_execute_result.h new file mode 100644 index 00000000..b9121b2b --- /dev/null +++ b/util/ldb_cmd_execute_result.h @@ -0,0 +1,76 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once + +namespace rocksdb { + +class LDBCommandExecuteResult { +public: + enum State { + EXEC_NOT_STARTED = 0, EXEC_SUCCEED = 1, EXEC_FAILED = 2, + }; + + LDBCommandExecuteResult() { + state_ = EXEC_NOT_STARTED; + message_ = ""; + } + + LDBCommandExecuteResult(State state, std::string& msg) { + state_ = state; + message_ = msg; + } + + std::string ToString() { + std::string ret; + switch (state_) { + case EXEC_SUCCEED: + break; + case EXEC_FAILED: + ret.append("Failed: "); + break; + case EXEC_NOT_STARTED: + ret.append("Not started: "); + } + if (!message_.empty()) { + ret.append(message_); + } + return ret; + } + + void Reset() { + state_ = EXEC_NOT_STARTED; + message_ = ""; + } + + bool IsSucceed() { + return state_ == EXEC_SUCCEED; + } + + bool IsNotStarted() { + return state_ == EXEC_NOT_STARTED; + } + + bool IsFailed() { + return state_ == EXEC_FAILED; + } + + static LDBCommandExecuteResult SUCCEED(std::string msg) { + return LDBCommandExecuteResult(EXEC_SUCCEED, msg); + } + + static LDBCommandExecuteResult FAILED(std::string msg) { + return 
LDBCommandExecuteResult(EXEC_FAILED, msg); + } + +private: + State state_; + std::string message_; + + bool operator==(const LDBCommandExecuteResult&); + bool operator!=(const LDBCommandExecuteResult&); +}; + +} diff --git a/util/ldb_tool.cc b/util/ldb_tool.cc new file mode 100644 index 00000000..2dbbbf8e --- /dev/null +++ b/util/ldb_tool.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "rocksdb/ldb_tool.h" +#include "util/ldb_cmd.h" + +namespace rocksdb { + +class LDBCommandRunner { +public: + + static void PrintHelp(const char* exec_name) { + string ret; + + ret.append("ldb - LevelDB Tool"); + ret.append("\n\n"); + ret.append("commands MUST specify --" + LDBCommand::ARG_DB + + "= when necessary\n"); + ret.append("\n"); + ret.append("The following optional parameters control if keys/values are " + "input/output as hex or as plain strings:\n"); + ret.append(" --" + LDBCommand::ARG_KEY_HEX + + " : Keys are input/output as hex\n"); + ret.append(" --" + LDBCommand::ARG_VALUE_HEX + + " : Values are input/output as hex\n"); + ret.append(" --" + LDBCommand::ARG_HEX + + " : Both keys and values are input/output as hex\n"); + ret.append("\n"); + + ret.append("The following optional parameters control the database " + "internals:\n"); + ret.append(" --" + LDBCommand::ARG_TTL + + " with 'put','get','scan','dump','query','batchput'" + " : DB supports ttl and value is internally timestamp-suffixed\n"); + ret.append(" --" + LDBCommand::ARG_BLOOM_BITS + "=\n"); + ret.append(" --" + LDBCommand::ARG_COMPRESSION_TYPE + + "=\n"); + ret.append(" --" + LDBCommand::ARG_BLOCK_SIZE + + "=\n"); + ret.append(" --" + LDBCommand::ARG_AUTO_COMPACTION + "=\n"); + ret.append(" --" + LDBCommand::ARG_WRITE_BUFFER_SIZE + + 
"=\n"); + ret.append(" --" + LDBCommand::ARG_FILE_SIZE + "=\n"); + + ret.append("\n\n"); + ret.append("Data Access Commands:\n"); + PutCommand::Help(ret); + GetCommand::Help(ret); + BatchPutCommand::Help(ret); + ScanCommand::Help(ret); + DeleteCommand::Help(ret); + DBQuerierCommand::Help(ret); + ApproxSizeCommand::Help(ret); + + ret.append("\n\n"); + ret.append("Admin Commands:\n"); + WALDumperCommand::Help(ret); + CompactorCommand::Help(ret); + ReduceDBLevelsCommand::Help(ret); + ChangeCompactionStyleCommand::Help(ret); + DBDumperCommand::Help(ret); + DBLoaderCommand::Help(ret); + ManifestDumpCommand::Help(ret); + InternalDumpCommand::Help(ret); + + fprintf(stderr, "%s\n", ret.c_str()); + } + + static void RunCommand(int argc, char** argv, Options options) { + if (argc <= 2) { + PrintHelp(argv[0]); + exit(1); + } + + LDBCommand* cmdObj = LDBCommand::InitFromCmdLineArgs(argc, argv, options); + if (cmdObj == nullptr) { + fprintf(stderr, "Unknown command\n"); + PrintHelp(argv[0]); + exit(1); + } + + if (!cmdObj->ValidateCmdLineOptions()) { + exit(1); + } + + cmdObj->Run(); + LDBCommandExecuteResult ret = cmdObj->GetExecuteState(); + fprintf(stderr, "%s\n", ret.ToString().c_str()); + delete cmdObj; + + exit(ret.IsFailed()); + } + +}; + + +void LDBTool::Run(int argc, char** argv, Options options) { + LDBCommandRunner::RunCommand(argc, argv, options); +} +} // namespace rocksdb + diff --git a/util/logging.cc b/util/logging.cc new file mode 100644 index 00000000..69734134 --- /dev/null +++ b/util/logging.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/logging.h" + +#include +#include +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/slice.h" + +namespace rocksdb { + +void AppendNumberTo(std::string* str, uint64_t num) { + char buf[30]; + snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num); + str->append(buf); +} + +void AppendEscapedStringTo(std::string* str, const Slice& value) { + for (size_t i = 0; i < value.size(); i++) { + char c = value[i]; + if (c >= ' ' && c <= '~') { + str->push_back(c); + } else { + char buf[10]; + snprintf(buf, sizeof(buf), "\\x%02x", + static_cast(c) & 0xff); + str->append(buf); + } + } +} + +std::string NumberToString(uint64_t num) { + std::string r; + AppendNumberTo(&r, num); + return r; +} + +std::string EscapeString(const Slice& value) { + std::string r; + AppendEscapedStringTo(&r, value); + return r; +} + +bool ConsumeChar(Slice* in, char c) { + if (!in->empty() && (*in)[0] == c) { + in->remove_prefix(1); + return true; + } else { + return false; + } +} + +bool ConsumeDecimalNumber(Slice* in, uint64_t* val) { + uint64_t v = 0; + int digits = 0; + while (!in->empty()) { + char c = (*in)[0]; + if (c >= '0' && c <= '9') { + ++digits; + const unsigned int delta = (c - '0'); + static const uint64_t kMaxUint64 = ~static_cast(0); + if (v > kMaxUint64/10 || + (v == kMaxUint64/10 && delta > kMaxUint64%10)) { + // Overflow + return false; + } + v = (v * 10) + delta; + in->remove_prefix(1); + } else { + break; + } + } + *val = v; + return (digits > 0); +} + +} // namespace rocksdb diff --git a/util/logging.h b/util/logging.h new file mode 100644 index 00000000..411c83be --- /dev/null +++ b/util/logging.h @@ -0,0 +1,48 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Must not be included from any .h files to avoid polluting the namespace +// with macros. + +#pragma once +#include +#include +#include +#include "port/port.h" + +namespace rocksdb { + +class Slice; +class WritableFile; + +// Append a human-readable printout of "num" to *str +extern void AppendNumberTo(std::string* str, uint64_t num); + +// Append a human-readable printout of "value" to *str. +// Escapes any non-printable characters found in "value". +extern void AppendEscapedStringTo(std::string* str, const Slice& value); + +// Return a human-readable printout of "num" +extern std::string NumberToString(uint64_t num); + +// Return a human-readable version of "value". +// Escapes any non-printable characters found in "value". +extern std::string EscapeString(const Slice& value); + +// If *in starts with "c", advances *in past the first character and +// returns true. Otherwise, returns false. +extern bool ConsumeChar(Slice* in, char c); + +// Parse a human-readable number from "*in" into *value. On success, +// advances "*in" past the consumed number and sets "*val" to the +// numeric value. Otherwise, returns false and leaves *in in an +// unspecified state. +extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val); + +} // namespace rocksdb diff --git a/util/manual_compaction_test.cc b/util/manual_compaction_test.cc new file mode 100644 index 00000000..dd615f05 --- /dev/null +++ b/util/manual_compaction_test.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Test for issue 178: a manual compaction causes deleted data to reappear. +#include +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/slice.h" +#include "rocksdb/write_batch.h" +#include "util/testharness.h" + +using namespace rocksdb; + +namespace { + +const int kNumKeys = 1100000; + +std::string Key1(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "my_key_%d", i); + return buf; +} + +std::string Key2(int i) { + return Key1(i) + "_xxx"; +} + +class ManualCompactionTest { + public: + ManualCompactionTest() { + // Get rid of any state from an old run. + dbname_ = rocksdb::test::TmpDir() + "/rocksdb_cbug_test"; + DestroyDB(dbname_, rocksdb::Options()); + } + + std::string dbname_; +}; + +class DestroyAllCompactionFilter : public CompactionFilter { + public: + DestroyAllCompactionFilter() {} + + virtual bool Filter(int level, + const Slice& key, + const Slice& existing_value, + std::string* new_value, + bool* value_changed) const { + return existing_value.ToString() == "destroy"; + } + + virtual const char* Name() const { + return "DestroyAllCompactionFilter"; + } +}; + +TEST(ManualCompactionTest, CompactTouchesAllKeys) { + for (int iter = 0; iter < 2; ++iter) { + DB* db; + Options options; + if (iter == 0) { // level compaction + options.num_levels = 3; + options.compaction_style = kCompactionStyleLevel; + } else { // universal compaction + options.compaction_style = kCompactionStyleUniversal; + } + options.create_if_missing = true; + options.compression = rocksdb::kNoCompression; + options.compaction_filter = new DestroyAllCompactionFilter(); + ASSERT_OK(DB::Open(options, dbname_, &db)); + + db->Put(WriteOptions(), Slice("key1"), Slice("destroy")); + 
db->Put(WriteOptions(), Slice("key2"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key3"), Slice("value3")); + db->Put(WriteOptions(), Slice("key4"), Slice("destroy")); + + Slice key4("key4"); + db->CompactRange(nullptr, &key4); + Iterator* itr = db->NewIterator(ReadOptions()); + itr->SeekToFirst(); + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ("key3", itr->key().ToString()); + itr->Next(); + ASSERT_TRUE(!itr->Valid()); + delete itr; + + delete options.compaction_filter; + delete db; + DestroyDB(dbname_, options); + } +} + +TEST(ManualCompactionTest, Test) { + + // Open database. Disable compression since it affects the creation + // of layers and the code below is trying to test against a very + // specific scenario. + rocksdb::DB* db; + rocksdb::Options db_options; + db_options.create_if_missing = true; + db_options.compression = rocksdb::kNoCompression; + ASSERT_OK(rocksdb::DB::Open(db_options, dbname_, &db)); + + // create first key range + rocksdb::WriteBatch batch; + for (int i = 0; i < kNumKeys; i++) { + batch.Put(Key1(i), "value for range 1 key"); + } + ASSERT_OK(db->Write(rocksdb::WriteOptions(), &batch)); + + // create second key range + batch.Clear(); + for (int i = 0; i < kNumKeys; i++) { + batch.Put(Key2(i), "value for range 2 key"); + } + ASSERT_OK(db->Write(rocksdb::WriteOptions(), &batch)); + + // delete second key range + batch.Clear(); + for (int i = 0; i < kNumKeys; i++) { + batch.Delete(Key2(i)); + } + ASSERT_OK(db->Write(rocksdb::WriteOptions(), &batch)); + + // compact database + std::string start_key = Key1(0); + std::string end_key = Key1(kNumKeys - 1); + rocksdb::Slice least(start_key.data(), start_key.size()); + rocksdb::Slice greatest(end_key.data(), end_key.size()); + + // commenting out the line below causes the example to work correctly + db->CompactRange(&least, &greatest); + + // count the keys + rocksdb::Iterator* iter = db->NewIterator(rocksdb::ReadOptions()); + int num_keys = 0; + for (iter->SeekToFirst(); iter->Valid(); 
iter->Next()) { + num_keys++; + } + delete iter; + ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys"; + + // close database + delete db; + DestroyDB(dbname_, rocksdb::Options()); +} + +} // anonymous namespace + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/murmurhash.cc b/util/murmurhash.cc new file mode 100644 index 00000000..d9d8b706 --- /dev/null +++ b/util/murmurhash.cc @@ -0,0 +1,183 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +/* + Murmurhash from http://sites.google.com/site/murmurhash/ + + All code is released to the public domain. For business purposes, Murmurhash is + under the MIT license. +*/ +#include "murmurhash.h" + +#if defined(__x86_64__) + +// ------------------------------------------------------------------- +// +// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment +// and endian-ness issues if used across multiple platforms. 
+// +// 64-bit hash for 64-bit platforms + +uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ) +{ + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + + while(data != end) + { + uint64_t k = *data++; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= ((uint64_t)data2[6]) << 48; + case 6: h ^= ((uint64_t)data2[5]) << 40; + case 5: h ^= ((uint64_t)data2[4]) << 32; + case 4: h ^= ((uint64_t)data2[3]) << 24; + case 3: h ^= ((uint64_t)data2[2]) << 16; + case 2: h ^= ((uint64_t)data2[1]) << 8; + case 1: h ^= ((uint64_t)data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + +#elif defined(__i386__) + +// ------------------------------------------------------------------- +// +// Note - This code makes a few assumptions about how your machine behaves - +// +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 +// +// And it has a few limitations - +// +// 1. It will not work incrementally. +// 2. It will not produce the same results on little-endian and big-endian +// machines. + +unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. 
+ + const unsigned int m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + unsigned int h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k = *(unsigned int *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +#else + +// ------------------------------------------------------------------- +// +// Same as MurmurHash2, but endian- and alignment-neutral. +// Half the speed though, alas. + +unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed ) +{ + const unsigned int m = 0x5bd1e995; + const int r = 24; + + unsigned int h = seed ^ len; + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k; + + k = data[0]; + k |= data[1] << 8; + k |= data[2] << 16; + k |= data[3] << 24; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +#endif diff --git a/util/murmurhash.h b/util/murmurhash.h new file mode 100644 index 00000000..9707e563 --- /dev/null +++ b/util/murmurhash.h @@ -0,0 +1,33 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +/* + Murmurhash from http://sites.google.com/site/murmurhash/ + + All code is released to the public domain. For business purposes, Murmurhash is + under the MIT license. +*/ +#pragma once +#include + +#if defined(__x86_64__) +#define MURMUR_HASH MurmurHash64A +uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHash64A +typedef uint64_t murmur_t; + +#elif defined(__i386__) +#define MURMUR_HASH MurmurHash2 +unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHash2 +typedef unsigned int murmur_t; + +#else +#define MURMUR_HASH MurmurHashNeutral2 +unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHashNeutral2 +typedef unsigned int murmur_t; + +#endif diff --git a/util/mutexlock.h b/util/mutexlock.h new file mode 100644 index 00000000..0f4e5c8b --- /dev/null +++ b/util/mutexlock.h @@ -0,0 +1,78 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "port/port.h" + +namespace rocksdb { + +// Helper class that locks a mutex on construction and unlocks the mutex when +// the destructor of the MutexLock object is invoked. +// +// Typical usage: +// +// void MyClass::MyMethod() { +// MutexLock l(&mu_); // mu_ is an instance variable +// ... some complex code, possibly with multiple return paths ... 
+// } + +class MutexLock { + public: + explicit MutexLock(port::Mutex *mu) : mu_(mu) { + this->mu_->Lock(); + } + ~MutexLock() { this->mu_->Unlock(); } + + private: + port::Mutex *const mu_; + // No copying allowed + MutexLock(const MutexLock&); + void operator=(const MutexLock&); +}; + +// +// Acquire a ReadLock on the specified RWMutex. +// The Lock will be automatically released when the +// object goes out of scope. +// +class ReadLock { + public: + explicit ReadLock(port::RWMutex *mu) : mu_(mu) { + this->mu_->ReadLock(); + } + ~ReadLock() { this->mu_->Unlock(); } + + private: + port::RWMutex *const mu_; + // No copying allowed + ReadLock(const ReadLock&); + void operator=(const ReadLock&); +}; + + +// +// Acquire a WriteLock on the specified RWMutex. +// The Lock will be automatically released when the +// object goes out of scope. +// +class WriteLock { + public: + explicit WriteLock(port::RWMutex *mu) : mu_(mu) { + this->mu_->WriteLock(); + } + ~WriteLock() { this->mu_->Unlock(); } + + private: + port::RWMutex *const mu_; + // No copying allowed + WriteLock(const WriteLock&); + void operator=(const WriteLock&); +}; + +} // namespace rocksdb diff --git a/util/options.cc b/util/options.cc new file mode 100644 index 00000000..64cabc8c --- /dev/null +++ b/util/options.cc @@ -0,0 +1,337 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/options.h" + +#include + +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/merge_operator.h" +#include "table/block_based_table_factory.h" + +namespace rocksdb { + +Options::Options() + : comparator(BytewiseComparator()), + merge_operator(nullptr), + compaction_filter(nullptr), + compaction_filter_factory( + std::shared_ptr( + new DefaultCompactionFilterFactory())), + create_if_missing(false), + error_if_exists(false), + paranoid_checks(false), + env(Env::Default()), + info_log(nullptr), + write_buffer_size(4<<20), + max_write_buffer_number(2), + min_write_buffer_number_to_merge(1), + max_open_files(1000), + block_cache(nullptr), + block_cache_compressed(nullptr), + block_size(4096), + block_restart_interval(16), + compression(kSnappyCompression), + filter_policy(nullptr), + prefix_extractor(nullptr), + whole_key_filtering(true), + num_levels(7), + level0_file_num_compaction_trigger(4), + level0_slowdown_writes_trigger(8), + level0_stop_writes_trigger(12), + max_mem_compaction_level(2), + target_file_size_base(2 * 1048576), + target_file_size_multiplier(1), + max_bytes_for_level_base(10 * 1048576), + max_bytes_for_level_multiplier(10), + max_bytes_for_level_multiplier_additional(num_levels, 1), + expanded_compaction_factor(25), + source_compaction_factor(1), + max_grandparent_overlap_factor(10), + disableDataSync(false), + use_fsync(false), + db_stats_log_interval(1800), + db_log_dir(""), + wal_dir(""), + disable_seek_compaction(false), + delete_obsolete_files_period_micros(6 * 60 * 60 * 1000000UL), + max_background_compactions(1), + max_background_flushes(0), + max_log_file_size(0), + log_file_time_to_roll(0), + keep_log_file_num(1000), + soft_rate_limit(0.0), + hard_rate_limit(0.0), + rate_limit_delay_max_milliseconds(1000), + max_manifest_file_size(std::numeric_limits::max()), + no_block_cache(false), + 
table_cache_numshardbits(4), + table_cache_remove_scan_count_limit(16), + arena_block_size(0), + disable_auto_compactions(false), + WAL_ttl_seconds(0), + WAL_size_limit_MB(0), + manifest_preallocation_size(4 * 1024 * 1024), + purge_redundant_kvs_while_flush(true), + allow_os_buffer(true), + allow_mmap_reads(false), + allow_mmap_writes(true), + is_fd_close_on_exec(true), + skip_log_error_on_recovery(false), + stats_dump_period_sec(3600), + block_size_deviation (10), + advise_random_on_open(true), + access_hint_on_compaction_start(NORMAL), + use_adaptive_mutex(false), + bytes_per_sync(0), + compaction_style(kCompactionStyleLevel), + filter_deletes(false), + max_sequential_skip_in_iterations(8), + memtable_factory(std::shared_ptr(new SkipListFactory)), + table_factory( + std::shared_ptr(new BlockBasedTableFactory())), + inplace_update_support(false), + inplace_update_num_locks(10000), + max_successive_merges(0) { + assert(memtable_factory.get() != nullptr); +} + +static const char* const access_hints[] = { + "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" +}; + +void +Options::Dump(Logger* log) const +{ + Log(log," Options.comparator: %s", comparator->Name()); + Log(log," Options.merge_operator: %s", + merge_operator? merge_operator->Name() : "None"); + Log(log," Options.compaction_filter: %s", + compaction_filter? 
compaction_filter->Name() : "None"); + Log(log," Options.compaction_filter_factory: %s", + compaction_filter_factory->Name()); + Log(log," Options.memtable_factory: %s", + memtable_factory->Name()); + Log(log," Options.table_factory: %s", table_factory->Name()); + Log(log," Options.error_if_exists: %d", error_if_exists); + Log(log," Options.create_if_missing: %d", create_if_missing); + Log(log," Options.paranoid_checks: %d", paranoid_checks); + Log(log," Options.env: %p", env); + Log(log," Options.info_log: %p", info_log.get()); + Log(log," Options.write_buffer_size: %zd", write_buffer_size); + Log(log," Options.max_write_buffer_number: %d", max_write_buffer_number); + Log(log," Options.max_open_files: %d", max_open_files); + Log(log," Options.block_cache: %p", block_cache.get()); + Log(log," Options.block_cache_compressed: %p", + block_cache_compressed.get()); + if (block_cache) { + Log(log," Options.block_cache_size: %zd", + block_cache->GetCapacity()); + } + if (block_cache_compressed) { + Log(log,"Options.block_cache_compressed_size: %zd", + block_cache_compressed->GetCapacity()); + } + Log(log," Options.block_size: %zd", block_size); + Log(log," Options.block_restart_interval: %d", block_restart_interval); + if (!compression_per_level.empty()) { + for (unsigned int i = 0; i < compression_per_level.size(); i++) { + Log(log," Options.compression[%d]: %d", + i, compression_per_level[i]); + } + } else { + Log(log," Options.compression: %d", compression); + } + Log(log," Options.filter_policy: %s", + filter_policy == nullptr ? "nullptr" : filter_policy->Name()); + Log(log," Options.prefix_extractor: %s", + prefix_extractor == nullptr ? 
"nullptr" : prefix_extractor->Name()); + Log(log," Options.whole_key_filtering: %d", whole_key_filtering); + Log(log," Options.num_levels: %d", num_levels); + Log(log," Options.disableDataSync: %d", disableDataSync); + Log(log," Options.use_fsync: %d", use_fsync); + Log(log," Options.max_log_file_size: %ld", max_log_file_size); + Log(log,"Options.max_manifest_file_size: %lu", + (unsigned long)max_manifest_file_size); + Log(log," Options.log_file_time_to_roll: %ld", log_file_time_to_roll); + Log(log," Options.keep_log_file_num: %ld", keep_log_file_num); + Log(log," Options.db_stats_log_interval: %d", + db_stats_log_interval); + Log(log," Options.allow_os_buffer: %d", allow_os_buffer); + Log(log," Options.allow_mmap_reads: %d", allow_mmap_reads); + Log(log," Options.allow_mmap_writes: %d", allow_mmap_writes); + Log(log," Options.min_write_buffer_number_to_merge: %d", + min_write_buffer_number_to_merge); + Log(log," Options.purge_redundant_kvs_while_flush: %d", + purge_redundant_kvs_while_flush); + Log(log," Options.compression_opts.window_bits: %d", + compression_opts.window_bits); + Log(log," Options.compression_opts.level: %d", + compression_opts.level); + Log(log," Options.compression_opts.strategy: %d", + compression_opts.strategy); + Log(log," Options.level0_file_num_compaction_trigger: %d", + level0_file_num_compaction_trigger); + Log(log," Options.level0_slowdown_writes_trigger: %d", + level0_slowdown_writes_trigger); + Log(log," Options.level0_stop_writes_trigger: %d", + level0_stop_writes_trigger); + Log(log," Options.max_mem_compaction_level: %d", + max_mem_compaction_level); + Log(log," Options.target_file_size_base: %d", + target_file_size_base); + Log(log," Options.target_file_size_multiplier: %d", + target_file_size_multiplier); + Log(log," Options.max_bytes_for_level_base: %lu", + (unsigned long)max_bytes_for_level_base); + Log(log," Options.max_bytes_for_level_multiplier: %d", + max_bytes_for_level_multiplier); + for (int i = 0; i < num_levels; i++) { 
+ Log(log,"Options.max_bytes_for_level_multiplier_addtl[%d]: %d", + i, max_bytes_for_level_multiplier_additional[i]); + } + Log(log," Options.max_sequential_skip_in_iterations: %lu", + (unsigned long)max_sequential_skip_in_iterations); + Log(log," Options.expanded_compaction_factor: %d", + expanded_compaction_factor); + Log(log," Options.source_compaction_factor: %d", + source_compaction_factor); + Log(log," Options.max_grandparent_overlap_factor: %d", + max_grandparent_overlap_factor); + Log(log," Options.db_log_dir: %s", + db_log_dir.c_str()); + Log(log," Options.wal_dir: %s", + wal_dir.c_str()); + Log(log," Options.disable_seek_compaction: %d", + disable_seek_compaction); + Log(log," Options.no_block_cache: %d", + no_block_cache); + Log(log," Options.table_cache_numshardbits: %d", + table_cache_numshardbits); + Log(log," Options.table_cache_remove_scan_count_limit: %d", + table_cache_remove_scan_count_limit); + Log(log," Options.arena_block_size: %ld", + arena_block_size); + Log(log," Options.delete_obsolete_files_period_micros: %lu", + (unsigned long)delete_obsolete_files_period_micros); + Log(log," Options.max_background_compactions: %d", + max_background_compactions); + Log(log," Options.max_background_flushes: %d", + max_background_flushes); + Log(log," Options.soft_rate_limit: %.2f", + soft_rate_limit); + Log(log," Options.hard_rate_limit: %.2f", + hard_rate_limit); + Log(log," Options.rate_limit_delay_max_milliseconds: %u", + rate_limit_delay_max_milliseconds); + Log(log," Options.disable_auto_compactions: %d", + disable_auto_compactions); + Log(log," Options.WAL_ttl_seconds: %lu", + (unsigned long)WAL_ttl_seconds); + Log(log," Options.WAL_size_limit_MB: %lu", + (unsigned long)WAL_size_limit_MB); + Log(log," Options.manifest_preallocation_size: %ld", + manifest_preallocation_size); + Log(log," Options.purge_redundant_kvs_while_flush: %d", + purge_redundant_kvs_while_flush); + Log(log," Options.allow_os_buffer: %d", + allow_os_buffer); + Log(log," 
Options.allow_mmap_reads: %d", + allow_mmap_reads); + Log(log," Options.allow_mmap_writes: %d", + allow_mmap_writes); + Log(log," Options.is_fd_close_on_exec: %d", + is_fd_close_on_exec); + Log(log," Options.skip_log_error_on_recovery: %d", + skip_log_error_on_recovery); + Log(log," Options.stats_dump_period_sec: %u", + stats_dump_period_sec); + Log(log," Options.block_size_deviation: %d", + block_size_deviation); + Log(log," Options.advise_random_on_open: %d", + advise_random_on_open); + Log(log," Options.access_hint_on_compaction_start: %s", + access_hints[access_hint_on_compaction_start]); + Log(log," Options.use_adaptive_mutex: %d", + use_adaptive_mutex); + Log(log," Options.bytes_per_sync: %lu", + (unsigned long)bytes_per_sync); + Log(log," Options.filter_deletes: %d", + filter_deletes); + Log(log," Options.compaction_style: %d", + compaction_style); + Log(log," Options.compaction_options_universal.size_ratio: %u", + compaction_options_universal.size_ratio); + Log(log,"Options.compaction_options_universal.min_merge_width: %u", + compaction_options_universal.min_merge_width); + Log(log,"Options.compaction_options_universal.max_merge_width: %u", + compaction_options_universal.max_merge_width); + Log(log,"Options.compaction_options_universal." 
+ "max_size_amplification_percent: %u", + compaction_options_universal.max_size_amplification_percent); + Log(log, + "Options.compaction_options_universal.compression_size_percent: %u", + compaction_options_universal.compression_size_percent); + std::string collector_names; + for (auto collector : table_properties_collectors) { + collector_names.append(collector->Name()); + collector_names.append("; "); + } + Log(log, " Options.table_properties_collectors: %s", + collector_names.c_str()); + Log(log, " Options.inplace_update_support: %d", + inplace_update_support); + Log(log, " Options.inplace_update_num_locks: %zd", + inplace_update_num_locks); + Log(log, " Options.max_successive_merges: %zd", + max_successive_merges); +} // Options::Dump + +// +// The goal of this method is to create a configuration that +// allows an application to write all files into L0 and +// then do a single compaction to output all files into L1. +Options* +Options::PrepareForBulkLoad() +{ + // never slowdown ingest. + level0_file_num_compaction_trigger = (1<<30); + level0_slowdown_writes_trigger = (1<<30); + level0_stop_writes_trigger = (1<<30); + + // no auto compactions please. The application should issue a + // manual compaction after all data is loaded into L0. + disable_auto_compactions = true; + disable_seek_compaction = true; + disableDataSync = true; + + // A manual compaction run should pick all files in L0 in + // a single compaction run. + source_compaction_factor = (1<<30); + + // It is better to have only 2 levels, otherwise a manual + // compaction would compact at every possible level, thereby + // increasing the total time needed for compactions. + num_levels = 2; + + // Prevent a memtable flush to automatically promote files + // to L1. This is helpful so that all files that are + // input to the manual compaction are all at L0. + max_background_compactions = 2; + + // The compaction would create large files in L1. 
+ target_file_size_base = 256 * 1024 * 1024; + return this; +} + +} // namespace rocksdb diff --git a/util/perf_context.cc b/util/perf_context.cc new file mode 100644 index 00000000..1e8ddfb5 --- /dev/null +++ b/util/perf_context.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "util/perf_context_imp.h" + +namespace rocksdb { + +// by default, enable counts only +PerfLevel perf_level = kEnableCount; + +void SetPerfLevel(PerfLevel level) { perf_level = level; } + +void PerfContext::Reset() { + user_key_comparison_count = 0; + block_cache_hit_count = 0; + block_read_count = 0; + block_read_byte = 0; + block_read_time = 0; + block_checksum_time = 0; + block_decompress_time = 0; + internal_key_skipped_count = 0; + internal_delete_skipped_count = 0; + wal_write_time = 0; +} + +__thread PerfContext perf_context; + +} diff --git a/util/perf_context_imp.h b/util/perf_context_imp.h new file mode 100644 index 00000000..f7818e69 --- /dev/null +++ b/util/perf_context_imp.h @@ -0,0 +1,34 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#pragma once +#include "rocksdb/perf_context.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +extern enum PerfLevel perf_level; + +inline void StartPerfTimer(StopWatchNano* timer) { + if (perf_level >= PerfLevel::kEnableTime) { + timer->Start(); + } +} + +inline void BumpPerfCount(uint64_t* count, uint64_t delta = 1) { + if (perf_level >= PerfLevel::kEnableCount) { + *count += delta; + } +} + +inline void BumpPerfTime(uint64_t* time, + StopWatchNano* timer, + bool reset = true) { + if (perf_level >= PerfLevel::kEnableTime) { + *time += timer->ElapsedNanos(reset); + } +} + +} diff --git a/util/posix_logger.h b/util/posix_logger.h new file mode 100644 index 00000000..8f7463c9 --- /dev/null +++ b/util/posix_logger.h @@ -0,0 +1,154 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. 
+ +#pragma once +#include +#include +#include +#include +#include +#include +#ifdef OS_LINUX +#include +#endif +#include "rocksdb/env.h" +#include + +namespace rocksdb { + +const int kDebugLogChunkSize = 128 * 1024; + +class PosixLogger : public Logger { + private: + FILE* file_; + uint64_t (*gettid_)(); // Return the thread id for the current thread + std::atomic_size_t log_size_; + int fd_; + const static uint64_t flush_every_seconds_ = 5; + std::atomic_uint_fast64_t last_flush_micros_; + Env* env_; + bool flush_pending_; + public: + PosixLogger(FILE* f, uint64_t (*gettid)(), Env* env) : + file_(f), gettid_(gettid), log_size_(0), fd_(fileno(f)), + last_flush_micros_(0), env_(env), flush_pending_(false) { } + virtual ~PosixLogger() { + fclose(file_); + } + virtual void Flush() { + if (flush_pending_) { + flush_pending_ = false; + fflush(file_); + } + last_flush_micros_ = env_->NowMicros(); + } + virtual void Logv(const char* format, va_list ap) { + const uint64_t thread_id = (*gettid_)(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. 
+ char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + struct timeval now_tv; + gettimeofday(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + p += snprintf(p, limit - p, + "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.tm_year + 1900, + t.tm_mon + 1, + t.tm_mday, + t.tm_hour, + t.tm_min, + t.tm_sec, + static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + const size_t write_size = p - base; + +#ifdef ROCKSDB_FALLOCATE_PRESENT + // If this write would cross a boundary of kDebugLogChunkSize + // space, pre-allocate more space to avoid overly large + // allocations from filesystem allocsize options. 
+ const size_t log_size = log_size_; + const int last_allocation_chunk = + ((kDebugLogChunkSize - 1 + log_size) / kDebugLogChunkSize); + const int desired_allocation_chunk = + ((kDebugLogChunkSize - 1 + log_size + write_size) / + kDebugLogChunkSize); + if (last_allocation_chunk != desired_allocation_chunk) { + fallocate(fd_, FALLOC_FL_KEEP_SIZE, 0, + desired_allocation_chunk * kDebugLogChunkSize); + } +#endif + + size_t sz = fwrite(base, 1, write_size, file_); + flush_pending_ = true; + assert(sz == write_size); + if (sz > 0) { + log_size_ += write_size; + } + uint64_t now_micros = static_cast(now_tv.tv_sec) * 1000000 + + now_tv.tv_usec; + if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + flush_pending_ = false; + fflush(file_); + last_flush_micros_ = now_micros; + } + if (base != buffer) { + delete[] base; + } + break; + } + } + size_t GetLogFileSize() const { + return log_size_; + } +}; + +} // namespace rocksdb diff --git a/util/random.h b/util/random.h new file mode 100644 index 00000000..e5b33150 --- /dev/null +++ b/util/random.h @@ -0,0 +1,90 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +namespace rocksdb { + +// A very simple random number generator. Not especially good at +// generating truly random bits, but good enough for our needs in this +// package. 
+class Random { + private: + uint32_t seed_; + public: + explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { } + uint32_t Next() { + static const uint32_t M = 2147483647L; // 2^31-1 + static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0 + // We are computing + // seed_ = (seed_ * A) % M, where M = 2^31-1 + // + // seed_ must not be zero or M, or else all subsequent computed values + // will be zero or M respectively. For all other values, seed_ will end + // up cycling through every number in [1,M-1] + uint64_t product = seed_ * A; + + // Compute (product % M) using the fact that ((x << 31) % M) == x. + seed_ = static_cast((product >> 31) + (product & M)); + // The first reduction may overflow by 1 bit, so we may need to + // repeat. mod == M is not possible; using > allows the faster + // sign-bit-based test. + if (seed_ > M) { + seed_ -= M; + } + return seed_; + } + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint32_t Uniform(int n) { return Next() % n; } + + // Randomly returns true ~"1/n" of the time, and false otherwise. + // REQUIRES: n > 0 + bool OneIn(int n) { return (Next() % n) == 0; } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. + uint32_t Skewed(int max_log) { + return Uniform(1 << Uniform(max_log + 1)); + } +}; + +// A simple 64bit random number generator based on std::mt19937_64 +class Random64 { + private: + std::mt19937_64 generator_; + + public: + explicit Random64(uint64_t s) : generator_(s) { } + + // Generates the next random number + uint64_t Next() { return generator_(); } + + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint64_t Uniform(uint64_t n) { + return std::uniform_int_distribution(0, n - 1)(generator_); + } + + // Randomly returns true ~"1/n" of the time, and false otherwise. 
+ // REQUIRES: n > 0 + bool OneIn(uint64_t n) { return Uniform(n) == 0; } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. + // NOTE(review): the literal 1 below is an int, so the shift is evaluated in + // int width even though the result is uint64_t; a large max_log would + // overflow it. Consider uint64_t(1) << ... -- TODO confirm the range of + // max_log that callers pass. + uint64_t Skewed(int max_log) { + return Uniform(1 << Uniform(max_log + 1)); + } +}; + +} // namespace rocksdb diff --git a/util/signal_test.cc b/util/signal_test.cc new file mode 100644 index 00000000..bffc298d --- /dev/null +++ b/util/signal_test.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "util/stack_trace.h" +#include + +void f0() { + char *p = nullptr; + *p = 10; /* SIGSEGV here!! */ +} + +void f1() { + f0(); +} + +void f2() { + f1(); +} + +void f3() { + f2(); +} + +int main() { + rocksdb::InstallStackTraceHandler(); + + f3(); + + return 0; +} diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc new file mode 100644 index 00000000..955d754b --- /dev/null +++ b/util/skiplistrep.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "rocksdb/memtablerep.h" +#include "db/memtable.h" +#include "db/skiplist.h" + +namespace rocksdb { +namespace { +class SkipListRep : public MemTableRep { + SkipList skip_list_; +public: + explicit SkipListRep(MemTableRep::KeyComparator& compare, Arena* arena) + : skip_list_(compare, arena) { +} + + // Insert key into the list. + // REQUIRES: nothing that compares equal to key is currently in the list. 
+ virtual void Insert(const char* key) override { + skip_list_.Insert(key); + } + + // Returns true iff an entry that compares equal to key is in the list. + virtual bool Contains(const char* key) const override { + return skip_list_.Contains(key); + } + + virtual size_t ApproximateMemoryUsage() override { + // All memory is allocated through arena; nothing to report here + return 0; + } + + virtual ~SkipListRep() override { } + + // Iteration over the contents of a skip list + class Iterator : public MemTableRep::Iterator { + SkipList::Iterator iter_; + public: + // Initialize an iterator over the specified list. + // The returned iterator is not valid. + explicit Iterator( + const SkipList* list + ) : iter_(list) { } + + virtual ~Iterator() override { } + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const override { + return iter_.Valid(); + } + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const override { + return iter_.key(); + } + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() override { + iter_.Next(); + } + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() override { + iter_.Prev(); + } + + // Advance to the first entry with a key >= target + virtual void Seek(const char* target) override { + iter_.Seek(target); + } + + // Position at the first entry in list. + // Final state of iterator is Valid() iff list is not empty. + virtual void SeekToFirst() override { + iter_.SeekToFirst(); + } + + // Position at the last entry in list. + // Final state of iterator is Valid() iff list is not empty. 
+ virtual void SeekToLast() override { + iter_.SeekToLast(); + } + }; + + // Unhide default implementations of GetIterator + using MemTableRep::GetIterator; + + virtual std::shared_ptr GetIterator() override { + return std::make_shared(&skip_list_); + } +}; +} + +std::shared_ptr SkipListFactory::CreateMemTableRep ( + MemTableRep::KeyComparator& compare, Arena* arena) { + return std::shared_ptr(new SkipListRep(compare, arena)); +} + +} // namespace rocksdb diff --git a/util/slice.cc b/util/slice.cc new file mode 100644 index 00000000..55f561f0 --- /dev/null +++ b/util/slice.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/slice_transform.h" +#include "rocksdb/slice.h" + +namespace rocksdb { + +namespace { + +class FixedPrefixTransform : public SliceTransform { + private: + size_t prefix_len_; + + public: + explicit FixedPrefixTransform(size_t prefix_len) : prefix_len_(prefix_len) { } + + virtual const char* Name() const { + return "rocksdb.FixedPrefix"; + } + + virtual Slice Transform(const Slice& src) const { + assert(InDomain(src)); + return Slice(src.data(), prefix_len_); + } + + virtual bool InDomain(const Slice& src) const { + return (src.size() >= prefix_len_); + } + + virtual bool InRange(const Slice& dst) const { + return (dst.size() == prefix_len_); + } +}; + +class NoopTransform : public SliceTransform { + public: + explicit NoopTransform() { } + + virtual const char* Name() const { + return "rocksdb.Noop"; + } + + virtual Slice Transform(const Slice& src) const { + return src; + } + + virtual bool InDomain(const Slice& src) const { + return true; + } + + virtual bool InRange(const Slice& dst) const { + return true; + } +}; + +} + +const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) { + return new FixedPrefixTransform(prefix_len); +} + +const SliceTransform* NewNoopTransform() { + return new NoopTransform; +} + +} // namespace rocksdb diff --git a/util/stack_trace.h b/util/stack_trace.h new file mode 100644 index 00000000..3b06e1df --- /dev/null +++ b/util/stack_trace.h @@ -0,0 +1,17 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +namespace rocksdb { + +// Install a signal handler to print callstack on the following signals: +// SIGILL SIGSEGV SIGBUS SIGABRT +// Currently supports linux only. No-op otherwise. 
+void InstallStackTraceHandler(); + +// Prints stack, skips skip_first_frames frames +void PrintStack(int first_frames_to_skip = 0); + +} // namespace rocksdb diff --git a/util/statistics.cc b/util/statistics.cc new file mode 100644 index 00000000..5f7a5ba4 --- /dev/null +++ b/util/statistics.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "rocksdb/statistics.h" +#include + +namespace rocksdb { + +namespace { +// a buffer size used for temp string buffers +const int kBufferSize = 200; + +std::string HistogramToString ( + Statistics* dbstats, + const Histograms& histogram_type, + const std::string& name) { + + char buffer[kBufferSize]; + HistogramData histogramData; + dbstats->histogramData(histogram_type, &histogramData); + snprintf( + buffer, + kBufferSize, + "%s statistics Percentiles :=> 50 : %f 95 : %f 99 : %f\n", + name.c_str(), + histogramData.median, + histogramData.percentile95, + histogramData.percentile99 + ); + return std::string(buffer); +}; + +std::string TickerToString ( + Statistics* dbstats, + const Tickers& ticker, + const std::string& name) { + + char buffer[kBufferSize]; + snprintf(buffer, kBufferSize, "%s COUNT : %ld\n", + name.c_str(), dbstats->getTickerCount(ticker)); + return std::string(buffer); +}; +} // namespace + +std::string Statistics::ToString() { + std::string res; + res.reserve(20000); + for (const auto& t : TickersNameMap) { + res.append(TickerToString(this, t.first, t.second)); + } + for (const auto& h : HistogramsNameMap) { + res.append(HistogramToString(this, h.first, h.second)); + } + res.shrink_to_fit(); + return res; +} + +} // namespace rocksdb diff --git a/util/statistics_imp.h b/util/statistics_imp.h new file mode 100644 index 00000000..0dc8884c --- 
/dev/null +++ b/util/statistics_imp.h @@ -0,0 +1,32 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +#include "rocksdb/statistics.h" + +namespace rocksdb { + +// Utility functions +inline void RecordTick(Statistics* statistics, + Tickers ticker, + uint64_t count = 1) { + assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX); + assert(TickersNameMap.size() == TICKER_ENUM_MAX); + if (statistics) { + statistics->recordTick(ticker, count); + } +} + +inline void SetTickerCount(Statistics* statistics, + Tickers ticker, + uint64_t count) { + assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX); + assert(TickersNameMap.size() == TICKER_ENUM_MAX); + if (statistics) { + statistics->setTickerCount(ticker, count); + } +} + +} diff --git a/util/stats_logger.h b/util/stats_logger.h new file mode 100644 index 00000000..f0b45404 --- /dev/null +++ b/util/stats_logger.h @@ -0,0 +1,26 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#pragma once + +namespace rocksdb { + +class StatsLogger { + + public: + + virtual void Log_Deploy_Stats(const std::string& db_version, + const std::string& machine_info, + const std::string& data_dir, + const uint64_t data_size, + const uint32_t file_number, + const std::string& data_size_per_level, + const std::string& file_number_per_level, + const int64_t& ts_unix) = 0; + virtual ~StatsLogger() {} + +}; + +} diff --git a/util/status.cc b/util/status.cc new file mode 100644 index 00000000..69060a7c --- /dev/null +++ b/util/status.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include "port/port.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +const char* Status::CopyState(const char* state) { + uint32_t size; + memcpy(&size, state, sizeof(size)); + char* result = new char[size + 4]; + memcpy(result, state, size + 4); + return result; +} + +Status::Status(Code code, const Slice& msg, const Slice& msg2) : + code_(code) { + assert(code != kOk); + const uint32_t len1 = msg.size(); + const uint32_t len2 = msg2.size(); + const uint32_t size = len1 + (len2 ? 
(2 + len2) : 0); + char* result = new char[size + 4]; + memcpy(result, &size, sizeof(size)); + memcpy(result + 4, msg.data(), len1); + if (len2) { + result[4 + len1] = ':'; + result[5 + len1] = ' '; + memcpy(result + 6 + len1, msg2.data(), len2); + } + state_ = result; +} + +std::string Status::ToString() const { + char tmp[30]; + const char* type; + switch (code_) { + case kOk: + return "OK"; + case kNotFound: + type = "NotFound: "; + break; + case kCorruption: + type = "Corruption: "; + break; + case kNotSupported: + type = "Not implemented: "; + break; + case kInvalidArgument: + type = "Invalid argument: "; + break; + case kIOError: + type = "IO error: "; + break; + case kMergeInProgress: + type = "Merge In Progress: "; + break; + default: + snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", + static_cast(code())); + type = tmp; + break; + } + std::string result(type); + if (state_ != nullptr) { + uint32_t length; + memcpy(&length, state_, sizeof(length)); + result.append(state_ + 4, length); + } + return result; +} + +} // namespace rocksdb diff --git a/util/stl_wrappers.h b/util/stl_wrappers.h new file mode 100644 index 00000000..b4c14b4b --- /dev/null +++ b/util/stl_wrappers.h @@ -0,0 +1,32 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#pragma once + +#include "util/murmurhash.h" +#include "util/coding.h" + +#include "rocksdb/memtablerep.h" +#include "rocksdb/slice.h" + +namespace rocksdb { +namespace stl_wrappers { + class Base { + protected: + const MemTableRep::KeyComparator& compare_; + explicit Base(const MemTableRep::KeyComparator& compare) + : compare_(compare) { } + }; + + struct Compare : private Base { + explicit Compare(const MemTableRep::KeyComparator& compare) + : Base(compare) { } + inline bool operator()(const char* a, const char* b) const { + return compare_(a, b) < 0; + } + }; + +} +} diff --git a/util/stop_watch.h b/util/stop_watch.h new file mode 100644 index 00000000..6325a744 --- /dev/null +++ b/util/stop_watch.h @@ -0,0 +1,71 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +#include "rocksdb/env.h" +#include "util/statistics_imp.h" + +namespace rocksdb { +// Auto-scoped. +// Records the statistic into the corresponding histogram. +class StopWatch { + public: + explicit StopWatch( + Env * const env, + Statistics* statistics = nullptr, + const Histograms histogram_name = DB_GET, + bool auto_start = true) : + env_(env), + start_time_((!auto_start && !statistics) ? 
0 : env->NowMicros()), + statistics_(statistics), + histogram_name_(histogram_name) {} + + + + uint64_t ElapsedMicros() { + return env_->NowMicros() - start_time_; + } + + ~StopWatch() { + if (statistics_) { + statistics_->measureTime(histogram_name_, ElapsedMicros()); + } + } + + private: + Env* const env_; + const uint64_t start_time_; + Statistics* statistics_; + const Histograms histogram_name_; + +}; + +// a nano second precision stopwatch +class StopWatchNano { + public: + explicit StopWatchNano(Env* const env, bool auto_start = false) + : env_(env), start_(0) { + if (auto_start) { + Start(); + } + } + + void Start() { start_ = env_->NowNanos(); } + + uint64_t ElapsedNanos(bool reset = false) { + auto now = env_->NowNanos(); + auto elapsed = now - start_; + if (reset) { + start_ = now; + } + return elapsed; + } + + private: + Env* const env_; + uint64_t start_; +}; + +} // namespace rocksdb diff --git a/util/string_util.cc b/util/string_util.cc new file mode 100644 index 00000000..33f84d9b --- /dev/null +++ b/util/string_util.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include +#include +#include + +namespace rocksdb { + +using namespace std; +using std::string; +using std::vector; +using std::stringstream; + +vector stringSplit(string arg, char delim) { + vector splits; + stringstream ss(arg); + string item; + while(getline(ss, item, delim)) { + splits.push_back(item); + } + return splits; +} +} diff --git a/util/string_util.h b/util/string_util.h new file mode 100644 index 00000000..7dfd68aa --- /dev/null +++ b/util/string_util.h @@ -0,0 +1,11 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +namespace rocksdb { + +extern std::vector stringSplit(std::string arg, char delim); + +} diff --git a/util/testharness.cc b/util/testharness.cc new file mode 100644 index 00000000..85716cda --- /dev/null +++ b/util/testharness.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "util/testharness.h" + +#include +#include +#include +#include + +namespace rocksdb { +namespace test { + +namespace { +struct Test { + const char* base; + const char* name; + void (*func)(); +}; +std::vector* tests; +} + +bool RegisterTest(const char* base, const char* name, void (*func)()) { + if (tests == nullptr) { + tests = new std::vector; + } + Test t; + t.base = base; + t.name = name; + t.func = func; + tests->push_back(t); + return true; +} + +int RunAllTests() { + const char* matcher = getenv("ROCKSDB_TESTS"); + + int num = 0; + if (tests != nullptr) { + for (unsigned int i = 0; i < tests->size(); i++) { + const Test& t = (*tests)[i]; + if (matcher != nullptr) { + std::string name = t.base; + name.push_back('.'); + name.append(t.name); + if (strstr(name.c_str(), matcher) == nullptr) { + continue; + } + } + fprintf(stderr, "==== Test %s.%s\n", t.base, t.name); + (*t.func)(); + ++num; + } + } + fprintf(stderr, "==== PASSED %d tests\n", num); + return 0; +} + +std::string TmpDir() { + std::string dir; + Status s = Env::Default()->GetTestDirectory(&dir); + ASSERT_TRUE(s.ok()) << s.ToString(); + return dir; +} + +int RandomSeed() { + const char* env = getenv("TEST_RANDOM_SEED"); + int result = (env != nullptr ? atoi(env) : 301); + if (result <= 0) { + result = 301; + } + return result; +} + +} // namespace test +} // namespace rocksdb diff --git a/util/testharness.h b/util/testharness.h new file mode 100644 index 00000000..f1591781 --- /dev/null +++ b/util/testharness.h @@ -0,0 +1,142 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "util/random.h" +#include "util/stack_trace.h" + +namespace rocksdb { +namespace test { + +// Run some of the tests registered by the TEST() macro. If the +// environment variable "ROCKSDB_TESTS" is not set, runs all tests. +// Otherwise, runs only the tests whose name contains the value of +// "ROCKSDB_TESTS" as a substring. E.g., suppose the tests are: +// TEST(Foo, Hello) { ... } +// TEST(Foo, World) { ... } +// ROCKSDB_TESTS=Hello will run the first test +// ROCKSDB_TESTS=o will run both tests +// ROCKSDB_TESTS=Junk will run no tests +// +// Returns 0 if all tests pass. +// Dies or returns a non-zero value if some test fails. +extern int RunAllTests(); + +// Return the directory to use for temporary storage. +extern std::string TmpDir(); + +// Return a randomization seed for this run. Typically returns the +// same number on repeated invocations of this binary, but automated +// runs may be able to vary the seed. +extern int RandomSeed(); + +// An instance of Tester is allocated to hold temporary state during +// the execution of an assertion. +class Tester { + private: + bool ok_; + const char* fname_; + int line_; + std::stringstream ss_; + + public: + Tester(const char* f, int l) + : ok_(true), fname_(f), line_(l) { + } + + ~Tester() { + if (!ok_) { + fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str()); + PrintStack(2); + exit(1); + } + } + + Tester& Is(bool b, const char* msg) { + if (!b) { + ss_ << " Assertion failure " << msg; + ok_ = false; + } + return *this; + } + + Tester& IsOk(const Status& s) { + if (!s.ok()) { + ss_ << " " << s.ToString(); + ok_ = false; + } + return *this; + } + +#define BINARY_OP(name,op) \ + template \ + Tester& name(const X& x, const Y& y) { \ + if (! 
(x op y)) { \ + ss_ << " failed: " << x << (" " #op " ") << y; \ + ok_ = false; \ + } \ + return *this; \ + } + + BINARY_OP(IsEq, ==) + BINARY_OP(IsNe, !=) + BINARY_OP(IsGe, >=) + BINARY_OP(IsGt, >) + BINARY_OP(IsLe, <=) + BINARY_OP(IsLt, <) +#undef BINARY_OP + + // Attach the specified value to the error message if an error has occurred + template + Tester& operator<<(const V& value) { + if (!ok_) { + ss_ << " " << value; + } + return *this; + } +}; + +#define ASSERT_TRUE(c) ::rocksdb::test::Tester(__FILE__, __LINE__).Is((c), #c) +#define ASSERT_OK(s) ::rocksdb::test::Tester(__FILE__, __LINE__).IsOk((s)) +#define ASSERT_EQ(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsEq((a),(b)) +#define ASSERT_NE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsNe((a),(b)) +#define ASSERT_GE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsGe((a),(b)) +#define ASSERT_GT(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsGt((a),(b)) +#define ASSERT_LE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsLe((a),(b)) +#define ASSERT_LT(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsLt((a),(b)) + +#define TCONCAT(a,b) TCONCAT1(a,b) +#define TCONCAT1(a,b) a##b + +#define TEST(base,name) \ +class TCONCAT(_Test_,name) : public base { \ + public: \ + void _Run(); \ + static void _RunIt() { \ + TCONCAT(_Test_,name) t; \ + t._Run(); \ + } \ +}; \ +bool TCONCAT(_Test_ignored_,name) = \ + ::rocksdb::test::RegisterTest(#base, #name, &TCONCAT(_Test_,name)::_RunIt); \ +void TCONCAT(_Test_,name)::_Run() + +// Register the specified test. Typically not used directly, but +// invoked via the macro expansion of TEST. +extern bool RegisterTest(const char* base, const char* name, void (*func)()); + + +} // namespace test +} // namespace rocksdb diff --git a/util/testutil.cc b/util/testutil.cc new file mode 100644 index 00000000..13e781e6 --- /dev/null +++ b/util/testutil.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/testutil.h" + +#include "util/random.h" + +namespace rocksdb { +namespace test { + +Slice RandomString(Random* rnd, int len, std::string* dst) { + dst->resize(len); + for (int i = 0; i < len; i++) { + (*dst)[i] = static_cast(' ' + rnd->Uniform(95)); // ' ' .. '~' + } + return Slice(*dst); +} + +std::string RandomKey(Random* rnd, int len) { + // Make sure to generate a wide variety of characters so we + // test the boundary conditions for short-key optimizations. + static const char kTestChars[] = { + '\0', '\1', 'a', 'b', 'c', 'd', 'e', '\xfd', '\xfe', '\xff' + }; + std::string result; + for (int i = 0; i < len; i++) { + result += kTestChars[rnd->Uniform(sizeof(kTestChars))]; + } + return result; +} + + +extern Slice CompressibleString(Random* rnd, double compressed_fraction, + int len, std::string* dst) { + int raw = static_cast(len * compressed_fraction); + if (raw < 1) raw = 1; + std::string raw_data; + RandomString(rnd, raw, &raw_data); + + // Duplicate the random data until we have filled "len" bytes + dst->clear(); + while (dst->size() < (unsigned int)len) { + dst->append(raw_data); + } + dst->resize(len); + return Slice(*dst); +} + +} // namespace test +} // namespace rocksdb diff --git a/util/testutil.h b/util/testutil.h new file mode 100644 index 00000000..c73210fe --- /dev/null +++ b/util/testutil.h @@ -0,0 +1,55 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "util/random.h" + +namespace rocksdb { +namespace test { + +// Store in *dst a random string of length "len" and return a Slice that +// references the generated data. +extern Slice RandomString(Random* rnd, int len, std::string* dst); + +// Return a random key with the specified length that may contain interesting +// characters (e.g. \x00, \xff, etc.). +extern std::string RandomKey(Random* rnd, int len); + +// Store in *dst a string of length "len" that will compress to +// "N*compressed_fraction" bytes and return a Slice that references +// the generated data. +extern Slice CompressibleString(Random* rnd, double compressed_fraction, + int len, std::string* dst); + +// A wrapper that allows injection of errors. 
+class ErrorEnv : public EnvWrapper { + public: + bool writable_file_error_; + int num_writable_file_errors_; + + ErrorEnv() : EnvWrapper(Env::Default()), + writable_file_error_(false), + num_writable_file_errors_(0) { } + + virtual Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions) { + result->reset(); + if (writable_file_error_) { + ++num_writable_file_errors_; + return Status::IOError(fname, "fake error"); + } + return target()->NewWritableFile(fname, result, soptions); + } +}; + +} // namespace test +} // namespace rocksdb diff --git a/util/vectorrep.cc b/util/vectorrep.cc new file mode 100644 index 00000000..8d3ccc9d --- /dev/null +++ b/util/vectorrep.cc @@ -0,0 +1,249 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "rocksdb/memtablerep.h" + +#include +#include +#include +#include +#include + +#include "rocksdb/arena.h" +#include "port/port.h" +#include "util/mutexlock.h" +#include "util/stl_wrappers.h" + +namespace rocksdb { +namespace { + +using namespace stl_wrappers; + +class VectorRep : public MemTableRep { + public: + VectorRep(const KeyComparator& compare, Arena* arena, size_t count); + + // Insert key into the collection. (The caller will pack key and value into a + // single buffer and pass that in as the parameter to Insert) + // REQUIRES: nothing that compares equal to key is currently in the + // collection. + virtual void Insert(const char* key) override; + + // Returns true iff an entry that compares equal to key is in the collection. 
+ virtual bool Contains(const char* key) const override; + + virtual void MarkReadOnly() override; + + virtual size_t ApproximateMemoryUsage() override; + + virtual ~VectorRep() override { } + + class Iterator : public MemTableRep::Iterator { + class VectorRep* vrep_; + std::shared_ptr> bucket_; + typename std::vector::const_iterator mutable cit_; + const KeyComparator& compare_; + bool mutable sorted_; + void DoSort() const; + public: + explicit Iterator(class VectorRep* vrep, + std::shared_ptr> bucket, + const KeyComparator& compare); + + // Initialize an iterator over the specified collection. + // The returned iterator is not valid. + // explicit Iterator(const MemTableRep* collection); + virtual ~Iterator() override { }; + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const override; + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const override; + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() override; + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() override; + + // Advance to the first entry with a key >= target + virtual void Seek(const char* target) override; + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() override; + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() override; + }; + + // Unhide default implementations of GetIterator() + using MemTableRep::GetIterator; + + // Return an iterator over the keys in this representation. 
+ virtual std::shared_ptr GetIterator() override; + + private: + friend class Iterator; + typedef std::vector Bucket; + std::shared_ptr bucket_; + mutable port::RWMutex rwlock_; + bool immutable_; + bool sorted_; + const KeyComparator& compare_; +}; + +void VectorRep::Insert(const char* key) { + assert(!Contains(key)); + WriteLock l(&rwlock_); + assert(!immutable_); + bucket_->push_back(key); +} + +// Returns true iff an entry that compares equal to key is in the collection. +bool VectorRep::Contains(const char* key) const { + ReadLock l(&rwlock_); + return std::find(bucket_->begin(), bucket_->end(), key) != bucket_->end(); +} + +void VectorRep::MarkReadOnly() { + WriteLock l(&rwlock_); + immutable_ = true; +} + +size_t VectorRep::ApproximateMemoryUsage() { + return + sizeof(bucket_) + sizeof(*bucket_) + + bucket_->size() * + sizeof( + std::remove_reference::type::value_type + ); +} + +VectorRep::VectorRep(const KeyComparator& compare, Arena* arena, size_t count) + : bucket_(new Bucket()), + immutable_(false), + sorted_(false), + compare_(compare) { bucket_.get()->reserve(count); } + +VectorRep::Iterator::Iterator(class VectorRep* vrep, + std::shared_ptr> bucket, + const KeyComparator& compare) +: vrep_(vrep), + bucket_(bucket), + cit_(bucket_->end()), + compare_(compare), + sorted_(false) { } + +void VectorRep::Iterator::DoSort() const { + // vrep is non-null means that we are working on an immutable memtable + if (!sorted_ && vrep_ != nullptr) { + WriteLock l(&vrep_->rwlock_); + if (!vrep_->sorted_) { + std::sort(bucket_->begin(), bucket_->end(), Compare(compare_)); + cit_ = bucket_->begin(); + vrep_->sorted_ = true; + } + sorted_ = true; + } + if (!sorted_) { + std::sort(bucket_->begin(), bucket_->end(), Compare(compare_)); + cit_ = bucket_->begin(); + sorted_ = true; + } + assert(sorted_); + assert(vrep_ == nullptr || vrep_->sorted_); +} + +// Returns true iff the iterator is positioned at a valid node. 
+bool VectorRep::Iterator::Valid() const { + DoSort(); + return cit_ != bucket_->end(); +} + +// Returns the key at the current position. +// REQUIRES: Valid() +const char* VectorRep::Iterator::key() const { + assert(Valid()); + return *cit_; +} + +// Advances to the next position. +// REQUIRES: Valid() +void VectorRep::Iterator::Next() { + assert(Valid()); + if (cit_ == bucket_->end()) { + return; + } + ++cit_; +} + +// Advances to the previous position. +// REQUIRES: Valid() +void VectorRep::Iterator::Prev() { + assert(Valid()); + if (cit_ == bucket_->begin()) { + // If you try to go back from the first element, the iterator should be + // invalidated. So we set it to past-the-end. This means that you can + // treat the container circularly. + cit_ = bucket_->end(); + } else { + --cit_; + } +} + +// Advance to the first entry with a key >= target +void VectorRep::Iterator::Seek(const char* target) { + DoSort(); + // Do binary search to find first value not less than the target + cit_ = std::equal_range(bucket_->begin(), + bucket_->end(), + target, + [this] (const char* a, const char* b) { + return compare_(a, b) < 0; + }).first; +} + +// Position at the first entry in collection. +// Final state of iterator is Valid() iff collection is not empty. +void VectorRep::Iterator::SeekToFirst() { + DoSort(); + cit_ = bucket_->begin(); +} + +// Position at the last entry in collection. +// Final state of iterator is Valid() iff collection is not empty. +void VectorRep::Iterator::SeekToLast() { + DoSort(); + cit_ = bucket_->end(); + if (bucket_->size() != 0) { + --cit_; + } +} + +std::shared_ptr VectorRep::GetIterator() { + ReadLock l(&rwlock_); + // Do not sort here. The sorting would be done the first time + // a Seek is performed on the iterator. 
+ if (immutable_) { + return std::make_shared(this, bucket_, compare_); + } else { + std::shared_ptr tmp; + tmp.reset(new Bucket(*bucket_)); // make a copy + return std::make_shared(nullptr, tmp, compare_); + } +} +} // anon namespace + +std::shared_ptr VectorRepFactory::CreateMemTableRep( + MemTableRep::KeyComparator& compare, Arena* arena) { + return std::make_shared(compare, arena, count_); +} +} // namespace rocksdb diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc new file mode 100644 index 00000000..26bdd254 --- /dev/null +++ b/utilities/backupable/backupable_db.cc @@ -0,0 +1,912 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "utilities/backupable_db.h" +#include "db/filename.h" +#include "util/coding.h" +#include "rocksdb/transaction_log.h" + +#define __STDC_FORMAT_MACROS + +#include +#include +#include +#include +#include +#include +#include + +namespace rocksdb { + +// -------- BackupEngine class --------- +class BackupEngine { + public: + BackupEngine(Env* db_env, const BackupableDBOptions& options); + ~BackupEngine(); + Status CreateNewBackup(DB* db, bool flush_before_backup = false); + Status PurgeOldBackups(uint32_t num_backups_to_keep); + Status DeleteBackup(BackupID backup_id); + void StopBackup() { + stop_backup_.store(true, std::memory_order_release); + } + + void GetBackupInfo(std::vector* backup_info); + Status RestoreDBFromBackup(BackupID backup_id, const std::string &db_dir, + const std::string &wal_dir); + Status RestoreDBFromLatestBackup(const std::string &db_dir, + const std::string &wal_dir) { + return RestoreDBFromBackup(latest_backup_id_, db_dir, wal_dir); + } + + void DeleteBackupsNewerThan(uint64_t sequence_number); + + private: + class BackupMeta { + public: + BackupMeta(const std::string& meta_filename, + std::unordered_map* file_refs, Env* env) + : timestamp_(0), size_(0), meta_filename_(meta_filename), + file_refs_(file_refs), env_(env) {} + + ~BackupMeta() {} + + void RecordTimestamp() { + env_->GetCurrentTime(&timestamp_); + } + int64_t GetTimestamp() const { + return timestamp_; + } + uint64_t GetSize() const { + return size_; + } + void SetSequenceNumber(uint64_t sequence_number) { + sequence_number_ = sequence_number; + } + uint64_t GetSequenceNumber() { + return sequence_number_; + } + + void AddFile(const std::string& filename, uint64_t size); + void Delete(); + + bool Empty() { + return files_.empty(); + } + + const std::vector& GetFiles() { + return files_; + } + + Status LoadFromFile(const std::string& backup_dir); + Status StoreToFile(bool sync); + + private: + int64_t timestamp_; + // sequence number is only approximate, should not be used +
// by clients + uint64_t sequence_number_; + uint64_t size_; + std::string const meta_filename_; + // files with relative paths (without "/" prefix!!) + std::vector files_; + std::unordered_map* file_refs_; + Env* env_; + + static const size_t max_backup_meta_file_size_ = 10 * 1024 * 1024; // 10MB + }; // BackupMeta + + inline std::string GetAbsolutePath( + const std::string &relative_path = "") const { + assert(relative_path.size() == 0 || relative_path[0] != '/'); + return options_.backup_dir + "/" + relative_path; + } + inline std::string GetPrivateDirRel() const { + return "private"; + } + inline std::string GetPrivateFileRel(BackupID backup_id, + bool tmp = false, + const std::string& file = "") const { + assert(file.size() == 0 || file[0] != '/'); + return GetPrivateDirRel() + "/" + std::to_string(backup_id) + + (tmp ? ".tmp" : "") + "/" + file; + } + inline std::string GetSharedFileRel(const std::string& file = "", + bool tmp = false) const { + assert(file.size() == 0 || file[0] != '/'); + return "shared/" + file + (tmp ? ".tmp" : ""); + } + inline std::string GetLatestBackupFile(bool tmp = false) const { + return GetAbsolutePath(std::string("LATEST_BACKUP") + (tmp ? 
".tmp" : "")); + } + inline std::string GetBackupMetaDir() const { + return GetAbsolutePath("meta"); + } + inline std::string GetBackupMetaFile(BackupID backup_id) const { + return GetBackupMetaDir() + "/" + std::to_string(backup_id); + } + + Status GetLatestBackupFileContents(uint32_t* latest_backup); + Status PutLatestBackupFileContents(uint32_t latest_backup); + // if size_limit == 0, there is no size limit, copy everything + Status CopyFile(const std::string& src, + const std::string& dst, + Env* src_env, + Env* dst_env, + bool sync, + uint64_t* size = nullptr, + uint64_t size_limit = 0); + // if size_limit == 0, there is no size limit, copy everything + Status BackupFile(BackupID backup_id, + BackupMeta* backup, + bool shared, + const std::string& src_dir, + const std::string& src_fname, // starts with "/" + uint64_t size_limit = 0); + // Will delete all the files we don't need anymore + // If full_scan == true, it will do the full scan of files/ directory + // and delete all the files that are not referenced from backuped_file_refs_ + void GarbageCollection(bool full_scan); + + // backup state data + BackupID latest_backup_id_; + std::map backups_; + std::unordered_map backuped_file_refs_; + std::vector obsolete_backups_; + std::atomic stop_backup_; + + // options data + BackupableDBOptions options_; + Env* db_env_; + Env* backup_env_; + + static const size_t copy_file_buffer_size_ = 5 * 1024 * 1024LL; // 5MB +}; + +BackupEngine::BackupEngine(Env* db_env, const BackupableDBOptions& options) + : stop_backup_(false), + options_(options), + db_env_(db_env), + backup_env_(options.backup_env != nullptr ? 
options.backup_env + : db_env_) { + + // create all the dirs we need + backup_env_->CreateDirIfMissing(GetAbsolutePath()); + if (options_.share_table_files) { + backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel())); + } + backup_env_->CreateDirIfMissing(GetAbsolutePath(GetPrivateDirRel())); + backup_env_->CreateDirIfMissing(GetBackupMetaDir()); + + std::vector backup_meta_files; + backup_env_->GetChildren(GetBackupMetaDir(), &backup_meta_files); + // create backups_ structure + for (auto& file : backup_meta_files) { + BackupID backup_id = 0; + sscanf(file.c_str(), "%u", &backup_id); + if (backup_id == 0 || file != std::to_string(backup_id)) { + // invalid file name, delete that + backup_env_->DeleteFile(GetBackupMetaDir() + "/" + file); + continue; + } + assert(backups_.find(backup_id) == backups_.end()); + backups_.insert(std::make_pair( + backup_id, BackupMeta(GetBackupMetaFile(backup_id), + &backuped_file_refs_, backup_env_))); + } + + if (options_.destroy_old_data) { // Destory old data + for (auto& backup : backups_) { + backup.second.Delete(); + obsolete_backups_.push_back(backup.first); + } + backups_.clear(); + // start from beginning + latest_backup_id_ = 0; + // GarbageCollection() will do the actual deletion + } else { // Load data from storage + // load the backups if any + for (auto& backup : backups_) { + Status s = backup.second.LoadFromFile(options_.backup_dir); + if (!s.ok()) { + Log(options_.info_log, "Backup %u corrupted - deleting -- %s", + backup.first, s.ToString().c_str()); + backup.second.Delete(); + obsolete_backups_.push_back(backup.first); + } + } + // delete obsolete backups from the structure + for (auto ob : obsolete_backups_) { + backups_.erase(ob); + } + + Status s = GetLatestBackupFileContents(&latest_backup_id_); + // If latest backup file is corrupted or non-existent + // set latest backup as the biggest backup we have + // or 0 if we have no backups + if (!s.ok() || + backups_.find(latest_backup_id_) == 
backups_.end()) { + auto itr = backups_.end(); + latest_backup_id_ = (itr == backups_.begin()) ? 0 : (--itr)->first; + } + } + + // delete any backups that claim to be later than latest + for (auto itr = backups_.upper_bound(latest_backup_id_); + itr != backups_.end();) { + itr->second.Delete(); + obsolete_backups_.push_back(itr->first); + itr = backups_.erase(itr); + } + + PutLatestBackupFileContents(latest_backup_id_); // Ignore errors + GarbageCollection(true); + Log(options_.info_log, + "Initialized BackupEngine, the latest backup is %u.", + latest_backup_id_); +} + +BackupEngine::~BackupEngine() { + LogFlush(options_.info_log); +} + +void BackupEngine::DeleteBackupsNewerThan(uint64_t sequence_number) { + for (auto backup : backups_) { + if (backup.second.GetSequenceNumber() > sequence_number) { + Log(options_.info_log, + "Deleting backup %u because sequence number (%" PRIu64 + ") is newer than %" PRIu64 "", + backup.first, backup.second.GetSequenceNumber(), sequence_number); + backup.second.Delete(); + obsolete_backups_.push_back(backup.first); + } + } + for (auto ob : obsolete_backups_) { + backups_.erase(backups_.find(ob)); + } + auto itr = backups_.end(); + latest_backup_id_ = (itr == backups_.begin()) ? 
0 : (--itr)->first; + PutLatestBackupFileContents(latest_backup_id_); // Ignore errors + GarbageCollection(false); +} + +Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) { + Status s; + std::vector live_files; + VectorLogPtr live_wal_files; + uint64_t manifest_file_size = 0; + uint64_t sequence_number = db->GetLatestSequenceNumber(); + + s = db->DisableFileDeletions(); + if (s.ok()) { + // this will return live_files prefixed with "/" + s = db->GetLiveFiles(live_files, &manifest_file_size, flush_before_backup); + } + // if we didn't flush before backup, we need to also get WAL files + if (s.ok() && !flush_before_backup) { + // returns file names prefixed with "/" + s = db->GetSortedWalFiles(live_wal_files); + } + if (!s.ok()) { + db->EnableFileDeletions(); + return s; + } + + BackupID new_backup_id = latest_backup_id_ + 1; + assert(backups_.find(new_backup_id) == backups_.end()); + auto ret = backups_.insert(std::make_pair( + new_backup_id, BackupMeta(GetBackupMetaFile(new_backup_id), + &backuped_file_refs_, backup_env_))); + assert(ret.second == true); + auto& new_backup = ret.first->second; + new_backup.RecordTimestamp(); + new_backup.SetSequenceNumber(sequence_number); + + Log(options_.info_log, "Started the backup process -- creating backup %u", + new_backup_id); + + // create temporary private dir + s = backup_env_->CreateDir( + GetAbsolutePath(GetPrivateFileRel(new_backup_id, true))); + + // copy live_files + for (size_t i = 0; s.ok() && i < live_files.size(); ++i) { + uint64_t number; + FileType type; + bool ok = ParseFileName(live_files[i], &number, &type); + if (!ok) { + assert(false); + return Status::Corruption("Can't parse file name. 
This is very bad"); + } + // we should only get sst, manifest and current files here + assert(type == kTableFile || + type == kDescriptorFile || + type == kCurrentFile); + + // rules: + // * if it's kTableFile, than it's shared + // * if it's kDescriptorFile, limit the size to manifest_file_size + s = BackupFile(new_backup_id, + &new_backup, + options_.share_table_files && type == kTableFile, + db->GetName(), /* src_dir */ + live_files[i], /* src_fname */ + (type == kDescriptorFile) ? manifest_file_size : 0); + } + + // copy WAL files + for (size_t i = 0; s.ok() && i < live_wal_files.size(); ++i) { + if (live_wal_files[i]->Type() == kAliveLogFile) { + // we only care about live log files + // copy the file into backup_dir/files// + s = BackupFile(new_backup_id, + &new_backup, + false, /* not shared */ + db->GetOptions().wal_dir, + live_wal_files[i]->PathName()); + } + } + + // we copied all the files, enable file deletions + db->EnableFileDeletions(); + + if (s.ok()) { + // move tmp private backup to real backup folder + s = backup_env_->RenameFile( + GetAbsolutePath(GetPrivateFileRel(new_backup_id, true)), // tmp + GetAbsolutePath(GetPrivateFileRel(new_backup_id, false))); + } + + if (s.ok()) { + // persist the backup metadata on the disk + s = new_backup.StoreToFile(options_.sync); + } + if (s.ok()) { + // install the newly created backup meta! (atomic) + s = PutLatestBackupFileContents(new_backup_id); + } + if (!s.ok()) { + // clean all the files we might have created + Log(options_.info_log, "Backup failed -- %s", s.ToString().c_str()); + backups_.erase(new_backup_id); + GarbageCollection(true); + return s; + } + + // here we know that we succeeded and installed the new backup + // in the LATEST_BACKUP file + latest_backup_id_ = new_backup_id; + Log(options_.info_log, "Backup DONE. 
All is good"); + return s; +} + +Status BackupEngine::PurgeOldBackups(uint32_t num_backups_to_keep) { + Log(options_.info_log, "Purging old backups, keeping %u", + num_backups_to_keep); + while (num_backups_to_keep < backups_.size()) { + Log(options_.info_log, "Deleting backup %u", backups_.begin()->first); + backups_.begin()->second.Delete(); + obsolete_backups_.push_back(backups_.begin()->first); + backups_.erase(backups_.begin()); + } + GarbageCollection(false); + return Status::OK(); +} + +Status BackupEngine::DeleteBackup(BackupID backup_id) { + Log(options_.info_log, "Deleting backup %u", backup_id); + auto backup = backups_.find(backup_id); + if (backup == backups_.end()) { + return Status::NotFound("Backup not found"); + } + backup->second.Delete(); + obsolete_backups_.push_back(backup_id); + backups_.erase(backup); + GarbageCollection(false); + return Status::OK(); +} + +void BackupEngine::GetBackupInfo(std::vector* backup_info) { + backup_info->reserve(backups_.size()); + for (auto& backup : backups_) { + if (!backup.second.Empty()) { + backup_info->push_back(BackupInfo( + backup.first, backup.second.GetTimestamp(), backup.second.GetSize())); + } + } +} + +Status BackupEngine::RestoreDBFromBackup(BackupID backup_id, + const std::string &db_dir, + const std::string &wal_dir) { + auto backup_itr = backups_.find(backup_id); + if (backup_itr == backups_.end()) { + return Status::NotFound("Backup not found"); + } + auto& backup = backup_itr->second; + if (backup.Empty()) { + return Status::NotFound("Backup not found"); + } + + Log(options_.info_log, "Restoring backup id %u\n", backup_id); + + // just in case. Ignore errors + db_env_->CreateDirIfMissing(db_dir); + db_env_->CreateDirIfMissing(wal_dir); + + // delete log files that might have been already in wal_dir. 
+ // This is important since they might get replayed to the restored DB, + // which will then differ from the backuped DB + std::vector delete_children; + db_env_->GetChildren(wal_dir, &delete_children); // ignore errors + for (auto f : delete_children) { + db_env_->DeleteFile(wal_dir + "/" + f); // ignore errors + } + // Also delete all the db_dir children. This is not so important + // because obsolete files will be deleted by DBImpl::PurgeObsoleteFiles() + delete_children.clear(); + db_env_->GetChildren(db_dir, &delete_children); // ignore errors + for (auto f : delete_children) { + db_env_->DeleteFile(db_dir + "/" + f); // ignore errors + } + + Status s; + for (auto& file : backup.GetFiles()) { + std::string dst; + // 1. extract the filename + size_t slash = file.find_last_of('/'); + // file will either be shared/ or private// + assert(slash != std::string::npos); + dst = file.substr(slash + 1); + + // 2. find the filetype + uint64_t number; + FileType type; + bool ok = ParseFileName(dst, &number, &type); + if (!ok) { + return Status::Corruption("Backup corrupted"); + } + // 3. Construct the final path + // kLogFile lives in wal_dir and all the rest live in db_dir + dst = ((type == kLogFile) ? wal_dir : db_dir) + + "/" + dst; + + Log(options_.info_log, "Restoring %s to %s\n", file.c_str(), dst.c_str()); + s = CopyFile(GetAbsolutePath(file), dst, backup_env_, db_env_, false); + if (!s.ok()) { + break; + } + } + + Log(options_.info_log, "Restoring done -- %s\n", s.ToString().c_str()); + return s; +} + +// latest backup id is an ASCII representation of latest backup id +Status BackupEngine::GetLatestBackupFileContents(uint32_t* latest_backup) { + Status s; + unique_ptr file; + s = backup_env_->NewSequentialFile(GetLatestBackupFile(), + &file, + EnvOptions()); + if (!s.ok()) { + return s; + } + + char buf[11]; + Slice data; + s = file->Read(10, &data, buf); + if (!s.ok() || data.size() == 0) { + return s.ok() ? 
Status::Corruption("Latest backup file corrupted") : s;
+  }
+  buf[data.size()] = 0;
+
+  *latest_backup = 0;
+  sscanf(data.data(), "%u", latest_backup);
+  if (backup_env_->FileExists(GetBackupMetaFile(*latest_backup)) == false) {
+    s = Status::Corruption("Latest backup file corrupted");
+  }
+  // report the corruption detected above instead of masking it with OK;
+  // the constructor already falls back to the biggest known backup id
+  // whenever this function returns a non-OK status
+  return s;
+}
+
+// Writes `latest_backup` (ASCII decimal followed by '\n') as the new
+// contents of the LATEST_BACKUP file.
+//
+// this operation HAS to be atomic
+// we should *never* do something like 1. delete file, 2. write new file
+// We write to a tmp file and then atomically rename
+//
+// Returns the first error reported by NewWritableFile, Append, Sync, Close
+// or RenameFile; OK if every step succeeded.
+Status BackupEngine::PutLatestBackupFileContents(uint32_t latest_backup) {
+  Status s;
+  unique_ptr<WritableFile> file;
+  EnvOptions env_options;
+  env_options.use_mmap_writes = false;
+  s = backup_env_->NewWritableFile(GetLatestBackupFile(true),
+                                   &file,
+                                   env_options);
+  if (!s.ok()) {
+    backup_env_->DeleteFile(GetLatestBackupFile(true));
+    return s;
+  }
+
+  // a uint32_t prints as up to 10 decimal digits; together with '\n' and the
+  // terminating NUL that is 12 bytes, so the buffer must be larger than 10
+  char file_contents[16];
+  int len = snprintf(file_contents, sizeof(file_contents), "%u\n",
+                     latest_backup);
+  s = file->Append(Slice(file_contents, len));
+  if (s.ok() && options_.sync) {
+    // keep the result: if the sync failed we must not rename the tmp file
+    // over the real one below
+    s = file->Sync();
+  }
+  if (s.ok()) {
+    s = file->Close();
+  }
+  if (s.ok()) {
+    // atomically replace real file with new tmp
+    s = backup_env_->RenameFile(GetLatestBackupFile(true),
+                                GetLatestBackupFile(false));
+  }
+  return s;
+}
+
+Status BackupEngine::CopyFile(const std::string& src,
+                              const std::string& dst,
+                              Env* src_env,
+                              Env* dst_env,
+                              bool sync,
+                              uint64_t* size,
+                              uint64_t size_limit) {
+  Status s;
+  unique_ptr<WritableFile> dst_file;
+  unique_ptr<SequentialFile> src_file;
+  EnvOptions env_options;
+  env_options.use_mmap_writes = false;
+  if (size != nullptr) {
+    *size = 0;
+  }
+
+  // Check if size limit is set. 
if not, set it to very big number
+  if (size_limit == 0) {
+    size_limit = std::numeric_limits<uint64_t>::max();
+  }
+
+  s = src_env->NewSequentialFile(src, &src_file, env_options);
+  if (s.ok()) {
+    s = dst_env->NewWritableFile(dst, &dst_file, env_options);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  unique_ptr<char[]> buf(new char[copy_file_buffer_size_]);
+  Slice data;
+
+  // copy chunk by chunk until the source is exhausted (empty read), the
+  // size limit is used up, or an error occurs
+  do {
+    // StopBackup() raises this flag; abandon the copy as soon as it is seen
+    if (stop_backup_.load(std::memory_order_acquire)) {
+      return Status::Incomplete("Backup stopped");
+    }
+    size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ?
+      copy_file_buffer_size_ : size_limit;
+    s = src_file->Read(buffer_to_read, &data, buf.get());
+    if (s.ok()) {
+      // only account for (and write out) a chunk that was actually read;
+      // what `data` holds after a failed Read is not something we rely on
+      size_limit -= data.size();
+      if (size != nullptr) {
+        *size += data.size();
+      }
+      s = dst_file->Append(data);
+    }
+  } while (s.ok() && data.size() > 0 && size_limit > 0);
+
+  if (s.ok() && sync) {
+    s = dst_file->Sync();
+  }
+
+  return s;
+}
+
+// Copies one DB file into the backup and records it in `backup`.
+// Shared files go to the shared directory and are not copied again if they
+// are already present; all other files go to the private directory of
+// `backup_id`. if size_limit == 0, there is no size limit.
+// src_fname will always start with "/"
+Status BackupEngine::BackupFile(BackupID backup_id,
+                                BackupMeta* backup,
+                                bool shared,
+                                const std::string& src_dir,
+                                const std::string& src_fname,
+                                uint64_t size_limit) {
+
+  assert(src_fname.size() > 0 && src_fname[0] == '/');
+  std::string dst_relative = src_fname.substr(1);
+  std::string dst_relative_tmp;
+  if (shared) {
+    dst_relative_tmp = GetSharedFileRel(dst_relative, true);
+    dst_relative = GetSharedFileRel(dst_relative, false);
+  } else {
+    dst_relative_tmp = GetPrivateFileRel(backup_id, true, dst_relative);
+    dst_relative = GetPrivateFileRel(backup_id, false, dst_relative);
+  }
+  std::string dst_path = GetAbsolutePath(dst_relative);
+  std::string dst_path_tmp = GetAbsolutePath(dst_relative_tmp);
+  Status s;
+  // start from a defined value: the GetFileSize error below is ignored, and
+  // `size` is handed to BackupMeta::AddFile afterwards
+  uint64_t size = 0;
+
+  // if it's shared, we also need to check if it exists -- if it does,
+  // no need to copy it again
+  if (shared && backup_env_->FileExists(dst_path)) {
+    backup_env_->GetFileSize(dst_path, &size); // Ignore error
+    Log(options_.info_log, "%s already present", src_fname.c_str());
+  } else {
+    
Log(options_.info_log, "Copying %s", src_fname.c_str()); + s = CopyFile(src_dir + src_fname, + dst_path_tmp, + db_env_, + backup_env_, + options_.sync, + &size, + size_limit); + if (s.ok() && shared) { + s = backup_env_->RenameFile(dst_path_tmp, dst_path); + } + } + if (s.ok()) { + backup->AddFile(dst_relative, size); + } + return s; +} + +void BackupEngine::GarbageCollection(bool full_scan) { + Log(options_.info_log, "Starting garbage collection"); + std::vector to_delete; + for (auto& itr : backuped_file_refs_) { + if (itr.second == 0) { + Status s = backup_env_->DeleteFile(GetAbsolutePath(itr.first)); + Log(options_.info_log, "Deleting %s -- %s", itr.first.c_str(), + s.ToString().c_str()); + to_delete.push_back(itr.first); + } + } + for (auto& td : to_delete) { + backuped_file_refs_.erase(td); + } + if (!full_scan) { + // take care of private dirs -- if full_scan == true, then full_scan will + // take care of them + for (auto backup_id : obsolete_backups_) { + std::string private_dir = GetPrivateFileRel(backup_id); + Status s = backup_env_->DeleteDir(GetAbsolutePath(private_dir)); + Log(options_.info_log, "Deleting private dir %s -- %s", + private_dir.c_str(), s.ToString().c_str()); + } + } + obsolete_backups_.clear(); + + if (full_scan) { + Log(options_.info_log, "Starting full scan garbage collection"); + // delete obsolete shared files + std::vector shared_children; + backup_env_->GetChildren(GetAbsolutePath(GetSharedFileRel()), + &shared_children); + for (auto& child : shared_children) { + std::string rel_fname = GetSharedFileRel(child); + // if it's not refcounted, delete it + if (backuped_file_refs_.find(rel_fname) == backuped_file_refs_.end()) { + // this might be a directory, but DeleteFile will just fail in that + // case, so we're good + Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname)); + if (s.ok()) { + Log(options_.info_log, "Deleted %s", rel_fname.c_str()); + } + } + } + + // delete obsolete private files + std::vector 
private_children; + backup_env_->GetChildren(GetAbsolutePath(GetPrivateDirRel()), + &private_children); + for (auto& child : private_children) { + BackupID backup_id = 0; + bool tmp_dir = child.find(".tmp") != std::string::npos; + sscanf(child.c_str(), "%u", &backup_id); + if (!tmp_dir && // if it's tmp_dir, delete it + (backup_id == 0 || backups_.find(backup_id) != backups_.end())) { + // it's either not a number or it's still alive. continue + continue; + } + // here we have to delete the dir and all its children + std::string full_private_path = + GetAbsolutePath(GetPrivateFileRel(backup_id, tmp_dir)); + std::vector subchildren; + backup_env_->GetChildren(full_private_path, &subchildren); + for (auto& subchild : subchildren) { + Status s = backup_env_->DeleteFile(full_private_path + subchild); + if (s.ok()) { + Log(options_.info_log, "Deleted %s", + (full_private_path + subchild).c_str()); + } + } + // finally delete the private dir + Status s = backup_env_->DeleteDir(full_private_path); + Log(options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(), + s.ToString().c_str()); + } + } +} + +// ------- BackupMeta class -------- + +void BackupEngine::BackupMeta::AddFile(const std::string& filename, + uint64_t size) { + size_ += size; + files_.push_back(filename); + auto itr = file_refs_->find(filename); + if (itr == file_refs_->end()) { + file_refs_->insert(std::make_pair(filename, 1)); + } else { + ++itr->second; // increase refcount if already present + } +} + +void BackupEngine::BackupMeta::Delete() { + for (auto& file : files_) { + auto itr = file_refs_->find(file); + assert(itr != file_refs_->end()); + --(itr->second); // decrease refcount + } + files_.clear(); + // delete meta file + env_->DeleteFile(meta_filename_); + timestamp_ = 0; +} + +// each backup meta file is of the format: +// +// +// +// +// +// ... +// TODO: maybe add checksum? 
+// Loads this backup's metadata from meta_filename_.
+// The parser below expects three decimal header lines (timestamp, sequence
+// number, number of files) followed by one relative file path per line.
+// Every listed file is looked up under `backup_dir` to obtain its size and,
+// if all lookups succeed, registered through AddFile().
+//
+// Returns a non-OK status if the meta file cannot be opened or read, is too
+// big, has a malformed header, or if the size of a listed file cannot be
+// obtained. In all of these cases no file has been added to this object.
+Status BackupEngine::BackupMeta::LoadFromFile(const std::string& backup_dir) {
+  assert(Empty());
+  Status s;
+  unique_ptr<SequentialFile> backup_meta_file;
+  s = env_->NewSequentialFile(meta_filename_, &backup_meta_file, EnvOptions());
+  if (!s.ok()) {
+    return s;
+  }
+
+  // one extra byte so that the contents can be NUL-terminated for sscanf
+  unique_ptr<char[]> buf(new char[max_backup_meta_file_size_ + 1]);
+  Slice data;
+  s = backup_meta_file->Read(max_backup_meta_file_size_, &data, buf.get());
+
+  if (!s.ok() || data.size() == max_backup_meta_file_size_) {
+    return s.ok() ? Status::IOError("File size too big") : s;
+  }
+  buf[data.size()] = 0;
+
+  uint32_t num_files = 0;
+  int bytes_read = 0;
+  // Each header field is "<number>\n". Reject the file when a field does not
+  // parse or its line is truncated, instead of advancing by a stale
+  // bytes_read and running past the end of `data`.
+  if (sscanf(data.data(), "%" PRId64 "%n", &timestamp_, &bytes_read) != 1 ||
+      static_cast<size_t>(bytes_read) >= data.size()) {
+    return Status::Corruption("Backup meta file corrupted");
+  }
+  data.remove_prefix(bytes_read + 1); // +1 for '\n'
+  if (sscanf(data.data(), "%" PRIu64 "%n", &sequence_number_,
+             &bytes_read) != 1 ||
+      static_cast<size_t>(bytes_read) >= data.size()) {
+    return Status::Corruption("Backup meta file corrupted");
+  }
+  data.remove_prefix(bytes_read + 1); // +1 for '\n'
+  if (sscanf(data.data(), "%u%n", &num_files, &bytes_read) != 1 ||
+      static_cast<size_t>(bytes_read) >= data.size()) {
+    return Status::Corruption("Backup meta file corrupted");
+  }
+  data.remove_prefix(bytes_read + 1); // +1 for '\n'
+
+  // collect first, add later: files are only registered (and refcounted)
+  // once every size lookup has succeeded
+  std::vector<std::pair<std::string, uint64_t>> files;
+
+  for (uint32_t i = 0; s.ok() && i < num_files; ++i) {
+    std::string filename = GetSliceUntil(&data, '\n').ToString();
+    uint64_t size = 0;
+    s = env_->GetFileSize(backup_dir + "/" + filename, &size);
+    files.push_back(std::make_pair(filename, size));
+  }
+
+  if (s.ok()) {
+    for (const auto& file : files) {
+      AddFile(file.first, file.second);
+    }
+  }
+
+  return s;
+}
+
+Status BackupEngine::BackupMeta::StoreToFile(bool sync) {
+  Status s;
+  unique_ptr<WritableFile> backup_meta_file;
+  EnvOptions env_options;
+  env_options.use_mmap_writes = false;
+  s = env_->NewWritableFile(meta_filename_ + ".tmp", &backup_meta_file,
+                            env_options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  unique_ptr<char[]> buf(new char[max_backup_meta_file_size_]);
+  int len = 0, buf_size = max_backup_meta_file_size_;
+  len += snprintf(buf.get(), buf_size, "%" PRId64 "\n", timestamp_);
+  len += snprintf(buf.get() + len, buf_size - len, "%" PRIu64 "\n",
+                  sequence_number_);
+  len += snprintf(buf.get() + len, buf_size - len, "%zu\n", files_.size());
+  for (size_t i = 0; 
i < files_.size(); ++i) { + len += snprintf(buf.get() + len, buf_size - len, "%s\n", files_[i].c_str()); + } + + s = backup_meta_file->Append(Slice(buf.get(), (size_t)len)); + if (s.ok() && sync) { + s = backup_meta_file->Sync(); + } + if (s.ok()) { + s = backup_meta_file->Close(); + } + if (s.ok()) { + s = env_->RenameFile(meta_filename_ + ".tmp", meta_filename_); + } + return s; +} + +// --- BackupableDB methods -------- + +BackupableDB::BackupableDB(DB* db, const BackupableDBOptions& options) + : StackableDB(db), backup_engine_(new BackupEngine(db->GetEnv(), options)) { + if (options.share_table_files) { + backup_engine_->DeleteBackupsNewerThan(GetLatestSequenceNumber()); + } +} + +BackupableDB::~BackupableDB() { + delete backup_engine_; +} + +Status BackupableDB::CreateNewBackup(bool flush_before_backup) { + return backup_engine_->CreateNewBackup(this, flush_before_backup); +} + +void BackupableDB::GetBackupInfo(std::vector* backup_info) { + backup_engine_->GetBackupInfo(backup_info); +} + +Status BackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) { + return backup_engine_->PurgeOldBackups(num_backups_to_keep); +} + +Status BackupableDB::DeleteBackup(BackupID backup_id) { + return backup_engine_->DeleteBackup(backup_id); +} + +void BackupableDB::StopBackup() { + backup_engine_->StopBackup(); +} + +// --- RestoreBackupableDB methods ------ + +RestoreBackupableDB::RestoreBackupableDB(Env* db_env, + const BackupableDBOptions& options) + : backup_engine_(new BackupEngine(db_env, options)) {} + +RestoreBackupableDB::~RestoreBackupableDB() { + delete backup_engine_; +} + +void +RestoreBackupableDB::GetBackupInfo(std::vector* backup_info) { + backup_engine_->GetBackupInfo(backup_info); +} + +Status RestoreBackupableDB::RestoreDBFromBackup(BackupID backup_id, + const std::string& db_dir, + const std::string& wal_dir) { + return backup_engine_->RestoreDBFromBackup(backup_id, db_dir, wal_dir); +} + +Status +RestoreBackupableDB::RestoreDBFromLatestBackup(const 
std::string& db_dir, + const std::string& wal_dir) { + return backup_engine_->RestoreDBFromLatestBackup(db_dir, wal_dir); +} + +Status RestoreBackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) { + return backup_engine_->PurgeOldBackups(num_backups_to_keep); +} + +Status RestoreBackupableDB::DeleteBackup(BackupID backup_id) { + return backup_engine_->DeleteBackup(backup_id); +} + +} // namespace rocksdb diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc new file mode 100644 index 00000000..de240558 --- /dev/null +++ b/utilities/backupable/backupable_db_test.cc @@ -0,0 +1,702 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/types.h" +#include "rocksdb/transaction_log.h" +#include "utilities/utility_db.h" +#include "utilities/backupable_db.h" +#include "util/testharness.h" +#include "util/random.h" +#include "util/testutil.h" +#include "util/auto_roll_logger.h" + +#include +#include + +namespace rocksdb { + +namespace { + +using std::unique_ptr; + +class DummyDB : public StackableDB { + public: + /* implicit */ + DummyDB(const Options& options, const std::string& dbname) + : StackableDB(nullptr), options_(options), dbname_(dbname), + deletions_enabled_(true), sequence_number_(0) {} + + virtual SequenceNumber GetLatestSequenceNumber() const { + return ++sequence_number_; + } + + virtual const std::string& GetName() const override { + return dbname_; + } + + virtual Env* GetEnv() const override { + return options_.env; + } + + virtual const Options& GetOptions() const override { + return options_; + } + + virtual Status EnableFileDeletions(bool force) override { + ASSERT_TRUE(!deletions_enabled_); + deletions_enabled_ = true; + return Status::OK(); + } + + virtual Status DisableFileDeletions() override { + ASSERT_TRUE(deletions_enabled_); + deletions_enabled_ = false; + return Status::OK(); + } + + virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, + bool flush_memtable = true) override { + ASSERT_TRUE(!deletions_enabled_); + vec = live_files_; + *mfs = 100; + return Status::OK(); + } + + class DummyLogFile : public LogFile { + public: + /* implicit */ + DummyLogFile(const std::string& path, bool alive = true) + : path_(path), alive_(alive) {} + + virtual std::string PathName() const override { + return path_; + } + + virtual uint64_t LogNumber() const { + // what business do you have calling this method? + ASSERT_TRUE(false); + return 0; + } + + virtual WalFileType Type() const override { + return alive_ ? 
kAliveLogFile : kArchivedLogFile;
+    }
+
+    virtual SequenceNumber StartSequence() const {
+      // backupabledb should not need this method
+      ASSERT_TRUE(false);
+      return 0;
+    }
+
+    virtual uint64_t SizeFileBytes() const {
+      // backupabledb should not need this method
+      ASSERT_TRUE(false);
+      return 0;
+    }
+
+   private:
+    std::string path_;
+    bool alive_;
+  }; // DummyLogFile
+
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
+    ASSERT_TRUE(!deletions_enabled_);
+    files.resize(wal_files_.size());
+    for (size_t i = 0; i < files.size(); ++i) {
+      files[i].reset(
+          new DummyLogFile(wal_files_[i].first, wal_files_[i].second));
+    }
+    return Status::OK();
+  }
+
+  std::vector<std::string> live_files_;
+  // pair<path, alive>
+  std::vector<std::pair<std::string, bool>> wal_files_;
+ private:
+  Options options_;
+  std::string dbname_;
+  bool deletions_enabled_;
+  mutable SequenceNumber sequence_number_;
+}; // DummyDB
+
+// Env wrapper used by the tests to observe and restrict file access:
+//  * records the name of every file opened through NewSequentialFile()
+//  * can hand out DummySequentialFile objects instead of real files
+//  * fails NewWritableFile() once a configurable budget is used up
+class TestEnv : public EnvWrapper {
+ public:
+  explicit TestEnv(Env* t) : EnvWrapper(t) {}
+
+  // Sequential file of 200 pseudo-random bytes (fixed seed) that never
+  // touches the file system.
+  class DummySequentialFile : public SequentialFile {
+   public:
+    DummySequentialFile() : SequentialFile(), rnd_(5) {}
+    // Produces up to n of the remaining bytes in scratch.
+    virtual Status Read(size_t n, Slice* result, char* scratch) {
+      size_t read_size = (n > size_left) ? size_left : n;
+      for (size_t i = 0; i < read_size; ++i) {
+        scratch[i] = rnd_.Next() & 255;
+      }
+      *result = Slice(scratch, read_size);
+      size_left -= read_size;
+      return Status::OK();
+    }
+
+    // Drops n of the remaining bytes, clamping at the end of the file.
+    // (The branches used to be swapped: skipping past the end underflowed
+    // size_left and every other skip emptied the file.)
+    virtual Status Skip(uint64_t n) {
+      size_left = (n > size_left) ? 0 : size_left - n;
+      return Status::OK();
+    }
+   private:
+    size_t size_left = 200;
+    Random rnd_;
+  };
+
+  // Records f as opened, then returns either a dummy file or the real one.
+  Status NewSequentialFile(const std::string& f,
+                           unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) {
+    opened_files_.push_back(f);
+    if (dummy_sequential_file_) {
+      r->reset(new TestEnv::DummySequentialFile());
+      return Status::OK();
+    } else {
+      return EnvWrapper::NewSequentialFile(f, r, options);
+    }
+  }
+
+  // Consumes one unit of the written-files budget per call; fails with
+  // IOError once the budget is exhausted.
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) {
+    if (limit_written_files_ <= 0) {
+      return Status::IOError("Sorry, can't do this");
+    }
+    limit_written_files_--;
+    return EnvWrapper::NewWritableFile(f, r, options);
+  }
+
+  // Asserts that the set of opened files equals should_have_opened.
+  // Note: sorts both the argument and the recorded list in place.
+  void AssertOpenedFiles(std::vector<std::string>& should_have_opened) {
+    sort(should_have_opened.begin(), should_have_opened.end());
+    sort(opened_files_.begin(), opened_files_.end());
+    ASSERT_TRUE(opened_files_ == should_have_opened);
+  }
+
+  void ClearOpenedFiles() {
+    opened_files_.clear();
+  }
+
+  void SetLimitWrittenFiles(uint64_t limit) {
+    limit_written_files_ = limit;
+  }
+
+  void SetDummySequentialFile(bool dummy_sequential_file) {
+    dummy_sequential_file_ = dummy_sequential_file;
+  }
+
+ private:
+  bool dummy_sequential_file_ = false;
+  std::vector<std::string> opened_files_;
+  uint64_t limit_written_files_ = 1000000;
+}; // TestEnv
+
+class FileManager : public EnvWrapper {
+ public:
+  explicit FileManager(Env* t) : EnvWrapper(t), rnd_(5) {}
+
+  Status DeleteRandomFileInDir(const std::string dir) {
+    std::vector<std::string> children;
+    GetChildren(dir, &children);
+    if (children.size() <= 2) { // . and ..
+      return Status::NotFound("");
+    }
+    while (true) {
+      int i = rnd_.Next() % children.size();
+      if (children[i] != "." 
&& children[i] != "..") { + return DeleteFile(dir + "/" + children[i]); + } + } + // should never get here + assert(false); + return Status::NotFound(""); + } + + Status CorruptFile(const std::string& fname, uint64_t bytes_to_corrupt) { + uint64_t size; + Status s = GetFileSize(fname, &size); + if (!s.ok()) { + return s; + } + unique_ptr file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + s = NewRandomRWFile(fname, &file, env_options); + if (!s.ok()) { + return s; + } + + for (uint64_t i = 0; s.ok() && i < bytes_to_corrupt; ++i) { + std::string tmp; + // write one random byte to a random position + s = file->Write(rnd_.Next() % size, test::RandomString(&rnd_, 1, &tmp)); + } + return s; + } + + Status WriteToFile(const std::string& fname, const std::string& data) { + unique_ptr file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + Status s = EnvWrapper::NewWritableFile(fname, &file, env_options); + if (!s.ok()) { + return s; + } + return file->Append(Slice(data)); + } + private: + Random rnd_; +}; // FileManager + +// utility functions +static void FillDB(DB* db, int from, int to) { + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value = "testvalue" + std::to_string(i); + + ASSERT_OK(db->Put(WriteOptions(), Slice(key), Slice(value))); + } +} + +static void AssertExists(DB* db, int from, int to) { + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value; + Status s = db->Get(ReadOptions(), Slice(key), &value); + ASSERT_EQ(value, "testvalue" + std::to_string(i)); + } +} + +static void AssertEmpty(DB* db, int from, int to) { + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value = "testvalue" + std::to_string(i); + + Status s = db->Get(ReadOptions(), Slice(key), &value); + ASSERT_TRUE(s.IsNotFound()); + } +} + +class BackupableDBTest { + public: + BackupableDBTest() { + // set 
up files + dbname_ = test::TmpDir() + "/backupable_db"; + backupdir_ = test::TmpDir() + "/backupable_db_backup"; + + // set up envs + env_ = Env::Default(); + test_db_env_.reset(new TestEnv(env_)); + test_backup_env_.reset(new TestEnv(env_)); + file_manager_.reset(new FileManager(env_)); + + // set up db options + options_.create_if_missing = true; + options_.paranoid_checks = true; + options_.write_buffer_size = 1 << 17; // 128KB + options_.env = test_db_env_.get(); + options_.wal_dir = dbname_; + // set up backup db options + CreateLoggerFromOptions(dbname_, backupdir_, env_, + Options(), &logger_); + backupable_options_.reset(new BackupableDBOptions( + backupdir_, test_backup_env_.get(), true, logger_.get(), true)); + + // delete old files in db + DestroyDB(dbname_, Options()); + } + + DB* OpenDB() { + DB* db; + ASSERT_OK(DB::Open(options_, dbname_, &db)); + return db; + } + + void OpenBackupableDB(bool destroy_old_data = false, bool dummy = false, + bool share_table_files = true) { + // reset all the defaults + test_backup_env_->SetLimitWrittenFiles(1000000); + test_db_env_->SetLimitWrittenFiles(1000000); + test_db_env_->SetDummySequentialFile(dummy); + + DB* db; + if (dummy) { + dummy_db_ = new DummyDB(options_, dbname_); + db = dummy_db_; + } else { + ASSERT_OK(DB::Open(options_, dbname_, &db)); + } + backupable_options_->destroy_old_data = destroy_old_data; + backupable_options_->share_table_files = share_table_files; + db_.reset(new BackupableDB(db, *backupable_options_)); + } + + void CloseBackupableDB() { + db_.reset(nullptr); + } + + void OpenRestoreDB() { + backupable_options_->destroy_old_data = false; + restore_db_.reset( + new RestoreBackupableDB(test_db_env_.get(), *backupable_options_)); + } + + void CloseRestoreDB() { + restore_db_.reset(nullptr); + } + + // restores backup backup_id and asserts the existence of + // [start_exist, end_exist> and not-existence of + // [end_exist, end> + // + // if backup_id == 0, it means restore from latest + // 
if end == 0, don't check AssertEmpty + void AssertBackupConsistency(BackupID backup_id, uint32_t start_exist, + uint32_t end_exist, uint32_t end = 0) { + bool opened_restore = false; + if (restore_db_.get() == nullptr) { + opened_restore = true; + OpenRestoreDB(); + } + if (backup_id > 0) { + ASSERT_OK(restore_db_->RestoreDBFromBackup(backup_id, dbname_, dbname_)); + } else { + ASSERT_OK(restore_db_->RestoreDBFromLatestBackup(dbname_, dbname_)); + } + DB* db = OpenDB(); + AssertExists(db, start_exist, end_exist); + if (end != 0) { + AssertEmpty(db, end_exist, end); + } + delete db; + if (opened_restore) { + CloseRestoreDB(); + } + } + + // files + std::string dbname_; + std::string backupdir_; + + // envs + Env* env_; + unique_ptr test_db_env_; + unique_ptr test_backup_env_; + unique_ptr file_manager_; + + // all the dbs! + DummyDB* dummy_db_; // BackupableDB owns dummy_db_ + unique_ptr db_; + unique_ptr restore_db_; + + // options + Options options_; + unique_ptr backupable_options_; + std::shared_ptr logger_; +}; // BackupableDBTest + +void AppendPath(const std::string& path, std::vector& v) { + for (auto& f : v) { + f = path + f; + } +} + +// this will make sure that backup does not copy the same file twice +TEST(BackupableDBTest, NoDoubleCopy) { + OpenBackupableDB(true, true); + + // should write 5 DB files + LATEST_BACKUP + one meta file + test_backup_env_->SetLimitWrittenFiles(7); + test_db_env_->ClearOpenedFiles(); + test_db_env_->SetLimitWrittenFiles(0); + dummy_db_->live_files_ = { "/00010.sst", "/00011.sst", + "/CURRENT", "/MANIFEST-01" }; + dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}}; + ASSERT_OK(db_->CreateNewBackup(false)); + std::vector should_have_openened = dummy_db_->live_files_; + should_have_openened.push_back("/00011.log"); + AppendPath(dbname_, should_have_openened); + test_db_env_->AssertOpenedFiles(should_have_openened); + + // should write 4 new DB files + LATEST_BACKUP + one meta file + // should not write/copy 
00010.sst, since it's already there! + test_backup_env_->SetLimitWrittenFiles(6); + test_db_env_->ClearOpenedFiles(); + dummy_db_->live_files_ = { "/00010.sst", "/00015.sst", + "/CURRENT", "/MANIFEST-01" }; + dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}}; + ASSERT_OK(db_->CreateNewBackup(false)); + // should not open 00010.sst - it's already there + should_have_openened = { "/00015.sst", "/CURRENT", + "/MANIFEST-01", "/00011.log" }; + AppendPath(dbname_, should_have_openened); + test_db_env_->AssertOpenedFiles(should_have_openened); + + ASSERT_OK(db_->DeleteBackup(1)); + ASSERT_EQ(true, + test_backup_env_->FileExists(backupdir_ + "/shared/00010.sst")); + // 00011.sst was only in backup 1, should be deleted + ASSERT_EQ(false, + test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst")); + ASSERT_EQ(true, + test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst")); + + // MANIFEST file size should be only 100 + uint64_t size; + test_backup_env_->GetFileSize(backupdir_ + "/private/2/MANIFEST-01", &size); + ASSERT_EQ(100UL, size); + test_backup_env_->GetFileSize(backupdir_ + "/shared/00015.sst", &size); + ASSERT_EQ(200UL, size); + + CloseBackupableDB(); +} + +// test various kind of corruptions that may happen: +// 1. Not able to write a file for backup - that backup should fail, +// everything else should work +// 2. Corrupted/deleted LATEST_BACKUP - everything should work fine +// 3. Corrupted backup meta file or missing backuped file - we should +// not be able to open that backup, but all other backups should be +// fine +TEST(BackupableDBTest, CorruptionsTest) { + const int keys_iteration = 5000; + Random rnd(6); + Status s; + + OpenBackupableDB(true); + // create five backups + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); + } + + // ---------- case 1. 
- fail a write ----------- + // try creating backup 6, but fail a write + FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6); + test_backup_env_->SetLimitWrittenFiles(2); + // should fail + s = db_->CreateNewBackup(!!(rnd.Next() % 2)); + ASSERT_TRUE(!s.ok()); + test_backup_env_->SetLimitWrittenFiles(1000000); + // latest backup should have all the keys + CloseBackupableDB(); + AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6); + + // ---------- case 2. - corrupt/delete latest backup ----------- + ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/LATEST_BACKUP", 2)); + AssertBackupConsistency(0, 0, keys_iteration * 5); + ASSERT_OK(file_manager_->DeleteFile(backupdir_ + "/LATEST_BACKUP")); + AssertBackupConsistency(0, 0, keys_iteration * 5); + // create backup 6, point LATEST_BACKUP to 5 + OpenBackupableDB(); + FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6); + ASSERT_OK(db_->CreateNewBackup(false)); + CloseBackupableDB(); + ASSERT_OK(file_manager_->WriteToFile(backupdir_ + "/LATEST_BACKUP", "5")); + AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6); + // assert that all 6 data is gone! + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/6") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/6") == false); + + // --------- case 3. 
corrupted backup meta or missing backuped file ---- + ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/5", 3)); + // since 5 meta is now corrupted, latest backup should be 4 + AssertBackupConsistency(0, 0, keys_iteration * 4, keys_iteration * 5); + OpenRestoreDB(); + s = restore_db_->RestoreDBFromBackup(5, dbname_, dbname_); + ASSERT_TRUE(!s.ok()); + CloseRestoreDB(); + ASSERT_OK(file_manager_->DeleteRandomFileInDir(backupdir_ + "/private/4")); + // 4 is corrupted, 3 is the latest backup now + AssertBackupConsistency(0, 0, keys_iteration * 3, keys_iteration * 5); + OpenRestoreDB(); + s = restore_db_->RestoreDBFromBackup(4, dbname_, dbname_); + CloseRestoreDB(); + ASSERT_TRUE(!s.ok()); + + // new backup should be 4! + OpenBackupableDB(); + FillDB(db_.get(), keys_iteration * 3, keys_iteration * 4); + ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); + CloseBackupableDB(); + AssertBackupConsistency(4, 0, keys_iteration * 4, keys_iteration * 5); +} + +// open DB, write, close DB, backup, restore, repeat +TEST(BackupableDBTest, OfflineIntegrationTest) { + // has to be a big number, so that it triggers the memtable flush + const int keys_iteration = 5000; + const int max_key = keys_iteration * 4 + 10; + // first iter -- flush before backup + // second iter -- don't flush before backup + for (int iter = 0; iter < 2; ++iter) { + // delete old data + DestroyDB(dbname_, Options()); + bool destroy_data = true; + + // every iteration -- + // 1. insert new data in the DB + // 2. backup the DB + // 3. destroy the db + // 4. 
restore the db, check everything is still there + for (int i = 0; i < 5; ++i) { + // in last iteration, put smaller amount of data, + int fill_up_to = std::min(keys_iteration * (i + 1), max_key); + // ---- insert new data and back up ---- + OpenBackupableDB(destroy_data); + destroy_data = false; + FillDB(db_.get(), keys_iteration * i, fill_up_to); + ASSERT_OK(db_->CreateNewBackup(iter == 0)); + CloseBackupableDB(); + DestroyDB(dbname_, Options()); + + // ---- make sure it's empty ---- + DB* db = OpenDB(); + AssertEmpty(db, 0, fill_up_to); + delete db; + + // ---- restore the DB ---- + OpenRestoreDB(); + if (i >= 3) { // test purge old backups + // when i == 4, purge to only 1 backup + // when i == 3, purge to 2 backups + ASSERT_OK(restore_db_->PurgeOldBackups(5 - i)); + } + // ---- make sure the data is there --- + AssertBackupConsistency(0, 0, fill_up_to, max_key); + CloseRestoreDB(); + } + } +} + +// open DB, write, backup, write, backup, close, restore +TEST(BackupableDBTest, OnlineIntegrationTest) { + // has to be a big number, so that it triggers the memtable flush + const int keys_iteration = 5000; + const int max_key = keys_iteration * 4 + 10; + Random rnd(7); + // delete old data + DestroyDB(dbname_, Options()); + + OpenBackupableDB(true); + // write some data, backup, repeat + for (int i = 0; i < 5; ++i) { + if (i == 4) { + // delete backup number 2, online delete! 
+ OpenRestoreDB(); + ASSERT_OK(restore_db_->DeleteBackup(2)); + CloseRestoreDB(); + } + // in last iteration, put smaller amount of data, + // so that backups can share sst files + int fill_up_to = std::min(keys_iteration * (i + 1), max_key); + FillDB(db_.get(), keys_iteration * i, fill_up_to); + // we should get consistent results with flush_before_backup + // set to both true and false + ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); + } + // close and destroy + CloseBackupableDB(); + DestroyDB(dbname_, Options()); + + // ---- make sure it's empty ---- + DB* db = OpenDB(); + AssertEmpty(db, 0, max_key); + delete db; + + // ---- restore every backup and verify all the data is there ---- + OpenRestoreDB(); + for (int i = 1; i <= 5; ++i) { + if (i == 2) { + // we deleted backup 2 + Status s = restore_db_->RestoreDBFromBackup(2, dbname_, dbname_); + ASSERT_TRUE(!s.ok()); + } else { + int fill_up_to = std::min(keys_iteration * i, max_key); + AssertBackupConsistency(i, 0, fill_up_to, max_key); + } + } + + // delete some backups -- this should leave only backups 3 and 5 alive + ASSERT_OK(restore_db_->DeleteBackup(4)); + ASSERT_OK(restore_db_->PurgeOldBackups(2)); + + std::vector backup_info; + restore_db_->GetBackupInfo(&backup_info); + ASSERT_EQ(2UL, backup_info.size()); + + // check backup 3 + AssertBackupConsistency(3, 0, 3 * keys_iteration, max_key); + // check backup 5 + AssertBackupConsistency(5, 0, max_key); + + CloseRestoreDB(); +} + +TEST(BackupableDBTest, DeleteNewerBackups) { + // create backups 1, 2, 3, 4, 5 + OpenBackupableDB(true); + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), 100 * i, 100 * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(!!(i % 2))); + } + CloseBackupableDB(); + + // backup 3 is fine + AssertBackupConsistency(3, 0, 300, 500); + // this should delete backups 4 and 5 + OpenBackupableDB(); + CloseBackupableDB(); + // backups 4 and 5 don't exist + OpenRestoreDB(); + Status s = restore_db_->RestoreDBFromBackup(4, dbname_, dbname_); + 
ASSERT_TRUE(s.IsNotFound()); + s = restore_db_->RestoreDBFromBackup(5, dbname_, dbname_); + ASSERT_TRUE(s.IsNotFound()); + CloseRestoreDB(); +} + +TEST(BackupableDBTest, NoShareTableFiles) { + const int keys_iteration = 5000; + OpenBackupableDB(true, false, false); + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(!!(i % 2))); + } + CloseBackupableDB(); + + for (int i = 0; i < 5; ++i) { + AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1), + keys_iteration * 6); + } +} + +TEST(BackupableDBTest, DeleteTmpFiles) { + OpenBackupableDB(); + CloseBackupableDB(); + std::string shared_tmp = backupdir_ + "/shared/00006.sst.tmp"; + std::string private_tmp_dir = backupdir_ + "/private/10.tmp"; + std::string private_tmp_file = private_tmp_dir + "/00003.sst"; + file_manager_->WriteToFile(shared_tmp, "tmp"); + file_manager_->CreateDir(private_tmp_dir); + file_manager_->WriteToFile(private_tmp_file, "tmp"); + ASSERT_EQ(true, file_manager_->FileExists(private_tmp_dir)); + OpenBackupableDB(); + CloseBackupableDB(); + ASSERT_EQ(false, file_manager_->FileExists(shared_tmp)); + ASSERT_EQ(false, file_manager_->FileExists(private_tmp_file)); + ASSERT_EQ(false, file_manager_->FileExists(private_tmp_dir)); +} + +} // anon namespace + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/utilities/merge_operators.h b/utilities/merge_operators.h new file mode 100644 index 00000000..fdf06645 --- /dev/null +++ b/utilities/merge_operators.h @@ -0,0 +1,45 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#ifndef MERGE_OPERATORS_H +#define MERGE_OPERATORS_H + +#include +#include + +#include "rocksdb/merge_operator.h" + +namespace rocksdb { + +// Collection of static factory functions for the available merge operators. +class MergeOperators { + public: + // One factory per operator; each returns the operator as a std::shared_ptr. + static std::shared_ptr CreatePutOperator(); + static std::shared_ptr CreateUInt64AddOperator(); + static std::shared_ptr CreateStringAppendOperator(); + static std::shared_ptr CreateStringAppendTESTOperator(); + + // Will return a different merge operator depending on the string. + // Recognized ids: "put", "uint64add", "stringappend", "stringappendtest". + // Any other string (including the empty one) yields nullptr. + // TODO: Hook the "name" up to the actual Name() of the MergeOperators? + static std::shared_ptr CreateFromStringId( + const std::string& name) { + if (name == "put") { + return CreatePutOperator(); + } else if ( name == "uint64add") { + return CreateUInt64AddOperator(); + } else if (name == "stringappend") { + return CreateStringAppendOperator(); + } else if (name == "stringappendtest") { + return CreateStringAppendTESTOperator(); + } else { + // Empty or unknown, just return nullptr + return nullptr; + } + } + +}; + +} // namespace rocksdb + +#endif diff --git a/utilities/merge_operators/put.cc b/utilities/merge_operators/put.cc new file mode 100644 index 00000000..e77449d3 --- /dev/null +++ b/utilities/merge_operators/put.cc @@ -0,0 +1,54 @@ +#include +#include "rocksdb/slice.h" +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" + +using namespace rocksdb; + +namespace { // anonymous namespace + +// A merge operator that mimics Put semantics +// Since this merge-operator will not be used in production, +// it is implemented as a non-associative merge operator to illustrate the +// new interface and for testing purposes. (That is, we inherit from +// the MergeOperator class rather than the AssociativeMergeOperator +// which would be simpler in this case). +// +// From the client-perspective, semantics are the same.
+class PutOperator : public MergeOperator { + public: + virtual bool FullMerge(const Slice& key, + const Slice* existing_value, + const std::deque& operand_sequence, + std::string* new_value, + Logger* logger) const override { + // Put basically only looks at the current/latest value + assert(!operand_sequence.empty()); + assert(new_value != nullptr); + new_value->assign(operand_sequence.back()); + return true; + } + + virtual bool PartialMerge(const Slice& key, + const Slice& left_operand, + const Slice& right_operand, + std::string* new_value, + Logger* logger) const override { + new_value->assign(right_operand.data(), right_operand.size()); + return true; + } + + virtual const char* Name() const override { + return "PutOperator"; + } +}; + +} // end of anonymous namespace + +namespace rocksdb { + +std::shared_ptr MergeOperators::CreatePutOperator() { + return std::make_shared(); +} + +} diff --git a/utilities/merge_operators/string_append/stringappend.cc b/utilities/merge_operators/string_append/stringappend.cc new file mode 100644 index 00000000..38cd22eb --- /dev/null +++ b/utilities/merge_operators/string_append/stringappend.cc @@ -0,0 +1,60 @@ +/** + * A MergeOperator for rocksdb that implements string append. + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#include "stringappend.h" + +#include +#include + +#include "rocksdb/slice.h" +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" + +namespace rocksdb { + +// Constructor: also specify the delimiter character. +StringAppendOperator::StringAppendOperator(char delim_char) + : delim_(delim_char) { +} + +// Implementation for the merge operation (concatenates two strings) +bool StringAppendOperator::Merge(const Slice& key, + const Slice* existing_value, + const Slice& value, + std::string* new_value, + Logger* logger) const { + + // Clear the *new_value for writing. 
+ assert(new_value); + new_value->clear(); + + if (!existing_value) { + // No existing_value. Set *new_value = value + new_value->assign(value.data(),value.size()); + } else { + // Generic append (existing_value != null). + // Reserve *new_value to correct size, and apply concatenation. + new_value->reserve(existing_value->size() + 1 + value.size()); + new_value->assign(existing_value->data(),existing_value->size()); + new_value->append(1,delim_); + new_value->append(value.data(), value.size()); + } + + return true; +} + +const char* StringAppendOperator::Name() const { + return "StringAppendOperator"; +} + +std::shared_ptr MergeOperators::CreateStringAppendOperator() { + return std::make_shared(','); +} + +} // namespace rocksdb + + + diff --git a/utilities/merge_operators/string_append/stringappend.h b/utilities/merge_operators/string_append/stringappend.h new file mode 100644 index 00000000..ca5b97ec --- /dev/null +++ b/utilities/merge_operators/string_append/stringappend.h @@ -0,0 +1,31 @@ +/** + * A MergeOperator for rocksdb that implements string append. 
+ * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#pragma once +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace rocksdb { + +class StringAppendOperator : public AssociativeMergeOperator { + public: + StringAppendOperator(char delim_char); /// Constructor: specify delimiter + + virtual bool Merge(const Slice& key, + const Slice* existing_value, + const Slice& value, + std::string* new_value, + Logger* logger) const override; + + virtual const char* Name() const override; + + private: + char delim_; // The delimiter is inserted between elements + +}; + +} // namespace rocksdb + diff --git a/utilities/merge_operators/string_append/stringappend2.cc b/utilities/merge_operators/string_append/stringappend2.cc new file mode 100644 index 00000000..e153a388 --- /dev/null +++ b/utilities/merge_operators/string_append/stringappend2.cc @@ -0,0 +1,104 @@ +/** + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#include "stringappend2.h" + +#include +#include + +#include "rocksdb/slice.h" +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" + +namespace rocksdb { + +// Constructor: also specify the delimiter character. +StringAppendTESTOperator::StringAppendTESTOperator(char delim_char) + : delim_(delim_char) { +} + +// Implementation for the merge operation (concatenates two strings) +bool StringAppendTESTOperator::FullMerge( + const Slice& key, + const Slice* existing_value, + const std::deque& operands, + std::string* new_value, + Logger* logger) const { + + // Clear the *new_value for writing. + assert(new_value); + new_value->clear(); + + // Compute the space needed for the final result. 
+ int numBytes = 0; + for(auto it = operands.begin(); it != operands.end(); ++it) { + numBytes += it->size() + 1; // Plus 1 for the delimiter + } + + // Only print the delimiter after the first entry has been printed + bool printDelim = false; + + // Prepend the *existing_value if one exists. + if (existing_value) { + new_value->reserve(numBytes + existing_value->size()); + new_value->append(existing_value->data(), existing_value->size()); + printDelim = true; + } else if (numBytes) { + new_value->reserve(numBytes-1); // Minus 1 since we have one less delimiter + } + + // Concatenate the sequence of strings (and add a delimiter between each) + for(auto it = operands.begin(); it != operands.end(); ++it) { + if (printDelim) { + new_value->append(1,delim_); + } + new_value->append(*it); + printDelim = true; + } + + return true; +} + +bool StringAppendTESTOperator::PartialMerge(const Slice& key, + const Slice& left_operand, + const Slice& right_operand, + std::string* new_value, + Logger* logger) const { + return false; +} + +// A version of PartialMerge that actually performs "partial merging". +// Use this to simulate the exact behaviour of the StringAppendOperator. +bool StringAppendTESTOperator::_AssocPartialMerge(const Slice& key, + const Slice& left_operand, + const Slice& right_operand, + std::string* new_value, + Logger* logger) const { + // Clear the *new_value for writing. + assert(new_value); + new_value->clear(); + + // Generic append + // Reserve correct size for *new_value, and apply concatenation. 
+ new_value->reserve(left_operand.size() + 1 + right_operand.size()); + new_value->assign(left_operand.data(), left_operand.size()); + new_value->append(1,delim_); + new_value->append(right_operand.data(), right_operand.size()); + + return true; +} + +const char* StringAppendTESTOperator::Name() const { + return "StringAppendTESTOperator"; +} + + +std::shared_ptr +MergeOperators::CreateStringAppendTESTOperator() { + return std::make_shared(','); +} + +} // namespace rocksdb + diff --git a/utilities/merge_operators/string_append/stringappend2.h b/utilities/merge_operators/string_append/stringappend2.h new file mode 100644 index 00000000..01a4be4d --- /dev/null +++ b/utilities/merge_operators/string_append/stringappend2.h @@ -0,0 +1,51 @@ +/** + * A TEST MergeOperator for rocksdb that implements string append. + * It is built using the MergeOperator interface rather than the simpler + * AssociativeMergeOperator interface. This is useful for testing/benchmarking. + * While the two operators are semantically the same, all production code + * should use the StringAppendOperator defined in stringappend.{h,cc}. The + * operator defined in the present file is primarily for testing. + * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#pragma once +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace rocksdb { + +class StringAppendTESTOperator : public MergeOperator { + public: + + StringAppendTESTOperator(char delim_char); /// Constructor with delimiter + + virtual bool FullMerge(const Slice& key, + const Slice* existing_value, + const std::deque& operand_sequence, + std::string* new_value, + Logger* logger) const override; + + virtual bool PartialMerge(const Slice& key, + const Slice& left_operand, + const Slice& right_operand, + std::string* new_value, + Logger* logger) const override; + + virtual const char* Name() const override; + + private: + // A version of PartialMerge that actually performs "partial merging". 
+ // Use this to simulate the exact behaviour of the StringAppendOperator. + bool _AssocPartialMerge(const Slice& key, + const Slice& left_operand, + const Slice& right_operand, + std::string* new_value, + Logger* logger) const; + + char delim_; // The delimiter is inserted between elements + +}; + +} // namespace rocksdb diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc new file mode 100644 index 00000000..81af6462 --- /dev/null +++ b/utilities/merge_operators/string_append/stringappend_test.cc @@ -0,0 +1,593 @@ +/** + * An persistent map : key -> (list of strings), using rocksdb merge. + * This file is a test-harness / use-case for the StringAppendOperator. + * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook, Inc. +*/ + +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/string_append/stringappend.h" +#include "utilities/merge_operators/string_append/stringappend2.h" +#include "utilities/ttl/db_ttl.h" +#include "util/testharness.h" +#include "util/random.h" + +using namespace rocksdb; + +namespace rocksdb { + +// Path to the database on file system +const std::string kDbName = "/tmp/mergetestdb"; + +// OpenDb opens a (possibly new) rocksdb database with a StringAppendOperator +std::shared_ptr OpenNormalDb(char delim_char) { + DB* db; + Options options; + options.create_if_missing = true; + options.merge_operator.reset(new StringAppendOperator(delim_char)); + ASSERT_OK(DB::Open(options, kDbName, &db)); + return std::shared_ptr(db); +} + +// Open a TtlDB with a non-associative StringAppendTESTOperator +std::shared_ptr OpenTtlDb(char delim_char) { + StackableDB* db; + Options options; + options.create_if_missing = true; + options.merge_operator.reset(new StringAppendTESTOperator(delim_char)); + ASSERT_OK(UtilityDB::OpenTtlDB(options, kDbName, &db, 
123456)); + return std::shared_ptr(db); +} + +/// StringLists represents a set of string-lists, each with a key-index. +/// Supports Append(list, string) and Get(list) +class StringLists { + public: + + //Constructor: specifies the rocksdb db + /* implicit */ + StringLists(std::shared_ptr db) + : db_(db), + merge_option_(), + get_option_() { + assert(db); + } + + // Append string val onto the list defined by key; return true on success + bool Append(const std::string& key, const std::string& val){ + Slice valSlice(val.data(), val.size()); + auto s = db_->Merge(merge_option_, key, valSlice); + + if (s.ok()) { + return true; + } else { + std::cerr << "ERROR " << s.ToString() << std::endl; + return false; + } + } + + // Returns the list of strings associated with key (or "" if does not exist) + bool Get(const std::string& key, std::string* const result){ + assert(result != nullptr); // we should have a place to store the result + auto s = db_->Get(get_option_, key, result); + + if (s.ok()) { + return true; + } + + // Either key does not exist, or there is some error. + *result = ""; // Always return empty string (just for convention) + + //NotFound is okay; just return empty (similar to std::map) + //But network or db errors, etc, should fail the test (or at least yell) + if (!s.IsNotFound()) { + std::cerr << "ERROR " << s.ToString() << std::endl; + } + + // Always return false if s.ok() was not true + return false; + } + + + private: + std::shared_ptr db_; + WriteOptions merge_option_; + ReadOptions get_option_; + +}; + + +// The class for unit-testing +class StringAppendOperatorTest { + public: + StringAppendOperatorTest() { + DestroyDB(kDbName, Options()); // Start each test with a fresh DB + } + + typedef std::shared_ptr (* OpenFuncPtr)(char); + + // Allows user to open databases with different configurations. + // e.g.: Can open a DB or a TtlDB, etc. 
+ static void SetOpenDbFunction(OpenFuncPtr func) { + OpenDb = func; + } + + protected: + static OpenFuncPtr OpenDb; +}; +StringAppendOperatorTest::OpenFuncPtr StringAppendOperatorTest::OpenDb = nullptr; + +// THE TEST CASES BEGIN HERE + +TEST(StringAppendOperatorTest, IteratorTest) { + auto db_ = OpenDb(','); + StringLists slists(db_); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + slists.Append("k2", "a1"); + slists.Append("k2", "a2"); + slists.Append("k2", "a3"); + + std::string res; + std::unique_ptr it(db_->NewIterator(ReadOptions())); + std::string k1("k1"); + std::string k2("k2"); + bool first = true; + for (it->Seek(k1); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "v1,v2,v3"); + first = false; + } else { + ASSERT_EQ(res, "a1,a2,a3"); + } + } + slists.Append("k2", "a4"); + slists.Append("k1", "v4"); + + // Snapshot should still be the same. Should ignore a4 and v4. + first = true; + for (it->Seek(k1); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "v1,v2,v3"); + first = false; + } else { + ASSERT_EQ(res, "a1,a2,a3"); + } + } + + + // Should release the snapshot and be aware of the new stuff now + it.reset(db_->NewIterator(ReadOptions())); + first = true; + for (it->Seek(k1); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "v1,v2,v3,v4"); + first = false; + } else { + ASSERT_EQ(res, "a1,a2,a3,a4"); + } + } + + // start from k2 this time. 
+ for (it->Seek(k2); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "v1,v2,v3,v4"); + first = false; + } else { + ASSERT_EQ(res, "a1,a2,a3,a4"); + } + } + + slists.Append("k3", "g1"); + + it.reset(db_->NewIterator(ReadOptions())); + first = true; + std::string k3("k3"); + for(it->Seek(k2); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "a1,a2,a3,a4"); + first = false; + } else { + ASSERT_EQ(res, "g1"); + } + } + for(it->Seek(k3); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + // should not be hit + ASSERT_EQ(res, "a1,a2,a3,a4"); + first = false; + } else { + ASSERT_EQ(res, "g1"); + } + } + +} + +TEST(StringAppendOperatorTest, SimpleTest) { + auto db = OpenDb(','); + StringLists slists(db); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + std::string res; + bool status = slists.Get("k1", &res); + + ASSERT_TRUE(status); + ASSERT_EQ(res, "v1,v2,v3"); +} + +TEST(StringAppendOperatorTest, SimpleDelimiterTest) { + auto db = OpenDb('|'); + StringLists slists(db); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + std::string res; + slists.Get("k1", &res); + ASSERT_EQ(res, "v1|v2|v3"); +} + +TEST(StringAppendOperatorTest, OneValueNoDelimiterTest) { + auto db = OpenDb('!'); + StringLists slists(db); + + slists.Append("random_key", "single_val"); + + std::string res; + slists.Get("random_key", &res); + ASSERT_EQ(res, "single_val"); +} + +TEST(StringAppendOperatorTest, VariousKeys) { + auto db = OpenDb('\n'); + StringLists slists(db); + + slists.Append("c", "asdasd"); + slists.Append("a", "x"); + slists.Append("b", "y"); + slists.Append("a", "t"); + slists.Append("a", "r"); + slists.Append("b", "2"); + slists.Append("c", "asdasd"); + + std::string a, b, c; + bool sa, sb, sc; + sa = slists.Get("a", &a); + sb = slists.Get("b", &b); + sc = slists.Get("c", &c); + + ASSERT_TRUE(sa 
&& sb && sc); // All three keys should have been found + + ASSERT_EQ(a, "x\nt\nr"); + ASSERT_EQ(b, "y\n2"); + ASSERT_EQ(c, "asdasd\nasdasd"); +} + +// Generate semi random keys/words from a small distribution. +TEST(StringAppendOperatorTest, RandomMixGetAppend) { + auto db = OpenDb(' '); + StringLists slists(db); + + // Generate a list of random keys and values + const int kWordCount = 15; + std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf", "342839", + "dsuha", "mabuais", "sadajsid", "jf9834hf", "2d9j89", + "dj9823jd", "a", "dk02ed2dh", "$(jd4h984$(*", "mabz"}; + const int kKeyCount = 6; + std::string keys[] = {"dhaiusdhu", "denidw", "daisda", "keykey", "muki", + "shzassdianmd"}; + + // Will store a local copy of all data in order to verify correctness + std::map parallel_copy; + + // Generate a bunch of random queries (Append and Get)! + enum query_t { APPEND_OP, GET_OP, NUM_OPS }; + Random randomGen(1337); //deterministic seed; always get same results! + + const int kNumQueries = 30; + for (int q=0; q 0) { + parallel_copy[key] += " " + word; + } else { + parallel_copy[key] = word; + } + + } else if (query == GET_OP) { + // Assumes that a non-existent key just returns + std::string res; + slists.Get(key, &res); + ASSERT_EQ(res, parallel_copy[key]); + } + + } + +} + +TEST(StringAppendOperatorTest, BIGRandomMixGetAppend) { + auto db = OpenDb(' '); + StringLists slists(db); + + // Generate a list of random keys and values + const int kWordCount = 15; + std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf", "342839", + "dsuha", "mabuais", "sadajsid", "jf9834hf", "2d9j89", + "dj9823jd", "a", "dk02ed2dh", "$(jd4h984$(*", "mabz"}; + const int kKeyCount = 6; + std::string keys[] = {"dhaiusdhu", "denidw", "daisda", "keykey", "muki", + "shzassdianmd"}; + + // Will store a local copy of all data in order to verify correctness + std::map parallel_copy; + + // Generate a bunch of random queries (Append and Get)! 
+ enum query_t { APPEND_OP, GET_OP, NUM_OPS }; + Random randomGen(9138204); // deterministic seed + + const int kNumQueries = 1000; + for (int q=0; q 0) { + parallel_copy[key] += " " + word; + } else { + parallel_copy[key] = word; + } + + } else if (query == GET_OP) { + // Assumes that a non-existent key just returns + std::string res; + slists.Get(key, &res); + ASSERT_EQ(res, parallel_copy[key]); + } + + } + +} + + +TEST(StringAppendOperatorTest, PersistentVariousKeys) { + // Perform the following operations in limited scope + { + auto db = OpenDb('\n'); + StringLists slists(db); + + slists.Append("c", "asdasd"); + slists.Append("a", "x"); + slists.Append("b", "y"); + slists.Append("a", "t"); + slists.Append("a", "r"); + slists.Append("b", "2"); + slists.Append("c", "asdasd"); + + std::string a, b, c; + slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + + ASSERT_EQ(a, "x\nt\nr"); + ASSERT_EQ(b, "y\n2"); + ASSERT_EQ(c, "asdasd\nasdasd"); + } + + // Reopen the database (the previous changes should persist / be remembered) + { + auto db = OpenDb('\n'); + StringLists slists(db); + + slists.Append("c", "bbnagnagsx"); + slists.Append("a", "sa"); + slists.Append("b", "df"); + slists.Append("a", "gh"); + slists.Append("a", "jk"); + slists.Append("b", "l;"); + slists.Append("c", "rogosh"); + + // The previous changes should be on disk (L0) + // The most recent changes should be in memory (MemTable) + // Hence, this will test both Get() paths. + std::string a, b, c; + slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + + ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk"); + ASSERT_EQ(b, "y\n2\ndf\nl;"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh"); + } + + // Reopen the database (the previous changes should persist / be remembered) + { + auto db = OpenDb('\n'); + StringLists slists(db); + + // All changes should be on disk. 
This will test VersionSet Get() + std::string a, b, c; + slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + + ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk"); + ASSERT_EQ(b, "y\n2\ndf\nl;"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh"); + } +} + +TEST(StringAppendOperatorTest, PersistentFlushAndCompaction) { + // Perform the following operations in limited scope + { + auto db = OpenDb('\n'); + StringLists slists(db); + std::string a, b, c; + bool success; + + // Append, Flush, Get + slists.Append("c", "asdasd"); + db->Flush(rocksdb::FlushOptions()); + success = slists.Get("c", &c); + ASSERT_TRUE(success); + ASSERT_EQ(c, "asdasd"); + + // Append, Flush, Append, Get + slists.Append("a", "x"); + slists.Append("b", "y"); + db->Flush(rocksdb::FlushOptions()); + slists.Append("a", "t"); + slists.Append("a", "r"); + slists.Append("b", "2"); + + success = slists.Get("a", &a); + assert(success == true); + ASSERT_EQ(a, "x\nt\nr"); + + success = slists.Get("b", &b); + assert(success == true); + ASSERT_EQ(b, "y\n2"); + + // Append, Get + success = slists.Append("c", "asdasd"); + assert(success); + success = slists.Append("b", "monkey"); + assert(success); + + // I omit the "assert(success)" checks here. 
+ slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + + ASSERT_EQ(a, "x\nt\nr"); + ASSERT_EQ(b, "y\n2\nmonkey"); + ASSERT_EQ(c, "asdasd\nasdasd"); + } + + // Reopen the database (the previous changes should persist / be remembered) + { + auto db = OpenDb('\n'); + StringLists slists(db); + std::string a, b, c; + + // Get (Quick check for persistence of previous database) + slists.Get("a", &a); + ASSERT_EQ(a, "x\nt\nr"); + + //Append, Compact, Get + slists.Append("c", "bbnagnagsx"); + slists.Append("a", "sa"); + slists.Append("b", "df"); + db->CompactRange(nullptr, nullptr); + slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + ASSERT_EQ(a, "x\nt\nr\nsa"); + ASSERT_EQ(b, "y\n2\nmonkey\ndf"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx"); + + // Append, Get + slists.Append("a", "gh"); + slists.Append("a", "jk"); + slists.Append("b", "l;"); + slists.Append("c", "rogosh"); + slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk"); + ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh"); + + // Compact, Get + db->CompactRange(nullptr, nullptr); + ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk"); + ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh"); + + // Append, Flush, Compact, Get + slists.Append("b", "afcg"); + db->Flush(rocksdb::FlushOptions()); + db->CompactRange(nullptr, nullptr); + slists.Get("b", &b); + ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;\nafcg"); + } +} + +TEST(StringAppendOperatorTest, SimpleTestNullDelimiter) { + auto db = OpenDb('\0'); + StringLists slists(db); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + std::string res; + bool status = slists.Get("k1", &res); + ASSERT_TRUE(status); + + // Construct the desired string. Default constructor doesn't like '\0' chars. + std::string checker("v1,v2,v3"); // Verify that the string is right size. 
+ checker[2] = '\0'; // Use null delimiter instead of comma. + checker[5] = '\0'; + assert(checker.size() == 8); // Verify it is still the correct size + + // Check that the rocksdb result string matches the desired string + assert(res.size() == checker.size()); + ASSERT_EQ(res, checker); +} + +} // namespace rocksdb + +int main(int arc, char** argv) { + // Run with regular database + { + fprintf(stderr, "Running tests with regular db and operator.\n"); + StringAppendOperatorTest::SetOpenDbFunction(&OpenNormalDb); + rocksdb::test::RunAllTests(); + } + + // Run with TTL + { + fprintf(stderr, "Running tests with ttl db and generic operator.\n"); + StringAppendOperatorTest::SetOpenDbFunction(&OpenTtlDb); + rocksdb::test::RunAllTests(); + } + + return 0; +} diff --git a/utilities/merge_operators/uint64add.cc b/utilities/merge_operators/uint64add.cc new file mode 100644 index 00000000..9d78651e --- /dev/null +++ b/utilities/merge_operators/uint64add.cc @@ -0,0 +1,65 @@ +#include +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "util/coding.h" +#include "utilities/merge_operators.h" + +using namespace rocksdb; + +namespace { // anonymous namespace + +// A 'model' merge operator with uint64 addition semantics +// Implemented as an AssociativeMergeOperator for simplicity and example. 
+class UInt64AddOperator : public AssociativeMergeOperator { + public: + virtual bool Merge(const Slice& key, + const Slice* existing_value, + const Slice& value, + std::string* new_value, + Logger* logger) const override { + uint64_t orig_value = 0; + if (existing_value){ + orig_value = DecodeInteger(*existing_value, logger); + } + uint64_t operand = DecodeInteger(value, logger); + + assert(new_value); + new_value->clear(); + PutFixed64(new_value, orig_value + operand); + + return true; // Return true always since corruption will be treated as 0 + } + + virtual const char* Name() const override { + return "UInt64AddOperator"; + } + + private: + // Takes the string and decodes it into a uint64_t + // On error, prints a message and returns 0 + uint64_t DecodeInteger(const Slice& value, Logger* logger) const { + uint64_t result = 0; + + if (value.size() == sizeof(uint64_t)) { + result = DecodeFixed64(value.data()); + } else if (logger != nullptr) { + // If value is corrupted, treat it as 0 + Log(logger, "uint64 value corruption, size: %zu > %zu", + value.size(), sizeof(uint64_t)); + } + + return result; + } + +}; + +} + +namespace rocksdb { + +std::shared_ptr MergeOperators::CreateUInt64AddOperator() { + return std::make_shared(); +} + +} diff --git a/utilities/redis/README b/utilities/redis/README new file mode 100644 index 00000000..8b17bc05 --- /dev/null +++ b/utilities/redis/README @@ -0,0 +1,14 @@ +This folder defines a REDIS-style interface for Rocksdb. +Right now it is written as a simple tag-on in the rocksdb::RedisLists class. +It implements Redis Lists, and supports only the "non-blocking operations". + +Internally, the set of lists are stored in a rocksdb database, mapping keys to +values. Each "value" is the list itself, storing a sequence of "elements". +Each element is stored as a 32-bit-integer, followed by a sequence of bytes. +The 32-bit-integer represents the length of the element (that is, the number +of bytes that follow). 
And then that many bytes follow. + + +NOTE: This README file may be old. See the actual redis_lists.cc file for +definitive details on the implementation. There should be a header at the top +of that file, explaining a bit of the implementation details. diff --git a/utilities/redis/redis_list_exception.h b/utilities/redis/redis_list_exception.h new file mode 100644 index 00000000..d409095a --- /dev/null +++ b/utilities/redis/redis_list_exception.h @@ -0,0 +1,20 @@ +/** + * A simple structure for exceptions in RedisLists. + * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#pragma once +#include + +namespace rocksdb { + +class RedisListException: public std::exception { + public: + const char* what() const throw() { + return "Invalid operation or corrupt data in Redis List."; + } +}; + +} // namespace rocksdb diff --git a/utilities/redis/redis_list_iterator.h b/utilities/redis/redis_list_iterator.h new file mode 100644 index 00000000..d57f8ac9 --- /dev/null +++ b/utilities/redis/redis_list_iterator.h @@ -0,0 +1,308 @@ +/** + * RedisListIterator: + * An abstraction over the "list" concept (e.g.: for redis lists). + * Provides functionality to read, traverse, edit, and write these lists. + * + * Upon construction, the RedisListIterator is given a block of list data. + * Internally, it stores a pointer to the data and a pointer to current item. + * It also stores a "result" list that will be mutated over time. + * + * Traversal and mutation are done by "forward iteration". + * The Push() and Skip() methods will advance the iterator to the next item. + * However, Push() will also "write the current item to the result". + * Skip() will simply move to next item, causing current item to be dropped. + * + * Upon completion, the result (accessible by WriteResult()) will be saved. + * All "skipped" items will be gone; all "pushed" items will remain. 
+ * + * @throws Any of the operations may throw a RedisListException if an invalid + * operation is performed or if the data is found to be corrupt. + * + * @notes By default, if WriteResult() is called part-way through iteration, + * it will automatically advance the iterator to the end, and keep + * all items that haven't been traversed yet. This may be subject + * to review. + * + * @notes Can access the "current" item via GetCurrent(), and other + * list-specific information such as Length(). + * + * @notes The internal representation is subject to change at any time. Presently, + * the list is represented as follows: + * - 32-bit integer header: the number of items in the list + * - For each item: + * - 32-bit int (n): the number of bytes representing this item + * - n bytes of data: the actual data. + * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#pragma once + +#include + +#include "redis_list_exception.h" +#include "rocksdb/slice.h" +#include "util/coding.h" + +namespace rocksdb { + +/// An abstraction over the "list" concept. +/// All operations may throw a RedisListException +class RedisListIterator { + public: + /// Construct a redis-list-iterator based on data. + /// If the data is non-empty, it must be formatted according to @notes above. + /// + /// If the data is valid, we can assume the following invariant(s): + /// a) length_, num_bytes_ are set correctly. + /// b) cur_byte_ always refers to the start of the current element, + /// just before the bytes that specify element length. + /// c) cur_elem_ is always the index of the current element. + /// d) cur_elem_length_ is always the number of bytes in current element, + /// excluding the 4-byte header itself. + /// e) result_ will always contain data_[0..cur_byte_) and a header + /// f) Whenever corrupt data is encountered or an invalid operation is + /// attempted, a RedisListException will immediately be thrown. 
+ RedisListIterator(const std::string& list_data) + : data_(list_data.data()), + num_bytes_(list_data.size()), + cur_byte_(0), + cur_elem_(0), + cur_elem_length_(0), + length_(0), + result_() { + + // Initialize the result_ (reserve enough space for header) + InitializeResult(); + + // Parse the data only if it is not empty. + if (num_bytes_ == 0) { + return; + } + + // If non-empty, but less than 4 bytes, data must be corrupt + if (num_bytes_ < sizeof(length_)) { + ThrowError("Corrupt header."); // Will break control flow + } + + // Good. The first bytes specify the number of elements + length_ = DecodeFixed32(data_); + cur_byte_ = sizeof(length_); + + // If we have at least one element, point to that element. + // Also, read the first integer of the element (specifying the size), + // if possible. + if (length_ > 0) { + if (cur_byte_ + sizeof(cur_elem_length_) <= num_bytes_) { + cur_elem_length_ = DecodeFixed32(data_+cur_byte_); + } else { + ThrowError("Corrupt data for first element."); + } + } + + // At this point, we are fully set-up. + // The invariants described in the header should now be true. + } + + /// Reserve some space for the result_. + /// Equivalent to result_.reserve(bytes). + void Reserve(int bytes) { + result_.reserve(bytes); + } + + /// Go to next element in data file. + /// Also writes the current element to result_. + RedisListIterator& Push() { + WriteCurrentElement(); + MoveNext(); + return *this; + } + + /// Go to next element in data file. + /// Drops/skips the current element. It will not be written to result_. + RedisListIterator& Skip() { + MoveNext(); + --length_; // One less item + --cur_elem_; // We moved one forward, but index did not change + return *this; + } + + /// Insert elem into the result_ (just BEFORE the current element / byte) + /// Note: if Done() (i.e.: iterator points to end), this will append elem. 
+ void InsertElement(const Slice& elem) { + // Ensure we are in a valid state + CheckErrors(); + + const int kOrigSize = result_.size(); + result_.resize(kOrigSize + SizeOf(elem)); + EncodeFixed32(result_.data() + kOrigSize, elem.size()); + memcpy(result_.data() + kOrigSize + sizeof(uint32_t), + elem.data(), + elem.size()); + ++length_; + ++cur_elem_; + } + + /// Access the current element, and save the result into *curElem + void GetCurrent(Slice* curElem) { + // Ensure we are in a valid state + CheckErrors(); + + // Ensure that we are not past the last element. + if (Done()) { + ThrowError("Invalid dereferencing."); + } + + // Dereference the element + *curElem = Slice(data_+cur_byte_+sizeof(cur_elem_length_), + cur_elem_length_); + } + + // Number of elements + int Length() const { + return length_; + } + + // Number of bytes in the final representation (i.e: WriteResult().size()) + int Size() const { + // result_ holds the currently written data + // data_[cur_byte..num_bytes-1] is the remainder of the data + return result_.size() + (num_bytes_ - cur_byte_); + } + + // Reached the end? + bool Done() const { + return cur_byte_ >= num_bytes_ || cur_elem_ >= length_; + } + + /// Returns a string representing the final, edited, data. + /// Assumes that all bytes of data_ in the range [0,cur_byte_) have been read + /// and that result_ contains this data. + /// The rest of the data must still be written. + /// So, this method ADVANCES THE ITERATOR TO THE END before writing. + Slice WriteResult() { + CheckErrors(); + + // The header should currently be filled with dummy data (0's) + // Correctly update the header. + // Note, this is safe since result_ is a vector (guaranteed contiguous) + EncodeFixed32(&result_[0],length_); + + // Append the remainder of the data to the result. 
+ result_.insert(result_.end(),data_+cur_byte_, data_ +num_bytes_); + + // Seek to end of file + cur_byte_ = num_bytes_; + cur_elem_ = length_; + cur_elem_length_ = 0; + + // Return the result + return Slice(result_.data(),result_.size()); + } + + public: // Static public functions + + /// An upper-bound on the amount of bytes needed to store this element. + /// This is used to hide representation information from the client. + /// E.G. This can be used to compute the bytes we want to Reserve(). + static uint32_t SizeOf(const Slice& elem) { + // [Integer Length . Data] + return sizeof(uint32_t) + elem.size(); + } + + private: // Private functions + + /// Initializes the result_ string. + /// It will fill the first few bytes with 0's so that there is + /// enough space for header information when we need to write later. + /// Currently, "header information" means: the length (number of elements) + /// Assumes that result_ is empty to begin with + void InitializeResult() { + assert(result_.empty()); // Should always be true. + result_.resize(sizeof(uint32_t),0); // Put a block of 0's as the header + } + + /// Go to the next element (used in Push() and Skip()) + void MoveNext() { + CheckErrors(); + + // Check to make sure we are not already in a finished state + if (Done()) { + ThrowError("Attempting to iterate past end of list."); + } + + // Move forward one element. + cur_byte_ += sizeof(cur_elem_length_) + cur_elem_length_; + ++cur_elem_; + + // If we are at the end, finish + if (Done()) { + cur_elem_length_ = 0; + return; + } + + // Otherwise, we should be able to read the new element's length + if (cur_byte_ + sizeof(cur_elem_length_) > num_bytes_) { + ThrowError("Corrupt element data."); + } + + // Set the new element's length + cur_elem_length_ = DecodeFixed32(data_+cur_byte_); + + return; + } + + /// Append the current element (pointed to by cur_byte_) to result_ + /// Assumes result_ has already been reserved appropriately. 
+ void WriteCurrentElement() { + // First verify that the iterator is still valid. + CheckErrors(); + if (Done()) { + ThrowError("Attempting to write invalid element."); + } + + // Append the cur element. + result_.insert(result_.end(), + data_+cur_byte_, + data_+cur_byte_+ sizeof(uint32_t) + cur_elem_length_); + } + + /// Will ThrowError() if necessary. + /// Checks for common/ubiquitous errors that can arise after most operations. + /// This method should be called before any reading operation. + /// If this function succeeds, then we are guaranteed to be in a valid state. + /// Other member functions should check for errors and ThrowError() also + /// if an error occurs that is specific to it even while in a valid state. + void CheckErrors() { + // Check if any crazy thing has happened recently + if ((cur_elem_ > length_) || // Bad index + (cur_byte_ > num_bytes_) || // No more bytes + (cur_byte_ + cur_elem_length_ > num_bytes_) || // Item too large + (cur_byte_ == num_bytes_ && cur_elem_ != length_) || // Too many items + (cur_elem_ == length_ && cur_byte_ != num_bytes_)) { // Too many bytes + ThrowError("Corrupt data."); + } + } + + /// Will throw an exception based on the passed-in message. + /// This function is guaranteed to STOP THE CONTROL-FLOW. + /// (i.e.: you do not have to call "return" after calling ThrowError) + void ThrowError(const char* const msg = NULL) { + // TODO: For now we ignore the msg parameter. This can be expanded later. 
+ throw RedisListException(); + } + + private: + const char* const data_; // A pointer to the data (the first byte) + const uint32_t num_bytes_; // The number of bytes in this list + + uint32_t cur_byte_; // The current byte being read + uint32_t cur_elem_; // The current element being read + uint32_t cur_elem_length_; // The number of bytes in current element + + uint32_t length_; // The number of elements in this list + std::vector result_; // The output data +}; + +} // namespace rocksdb diff --git a/utilities/redis/redis_lists.cc b/utilities/redis/redis_lists.cc new file mode 100644 index 00000000..50c544a3 --- /dev/null +++ b/utilities/redis/redis_lists.cc @@ -0,0 +1,551 @@ +/** + * A (persistent) Redis API built using the rocksdb backend. + * Implements Redis Lists as described on: http://redis.io/commands#list + * + * @throws All functions may throw a RedisListException on error/corruption. + * + * @notes Internally, the set of lists is stored in a rocksdb database, + * mapping keys to values. Each "value" is the list itself, storing + * some kind of internal representation of the data. All the + * representation details are handled by the RedisListIterator class. + * The present file should be oblivious to the representation details, + * handling only the client (Redis) API, and the calls to rocksdb. + * + * @TODO Presently, all operations take at least O(NV) time where + * N is the number of elements in the list, and V is the average + * number of bytes per value in the list. So maybe, with merge operator + * we can improve this to an optimal O(V) amortized time, since we + * wouldn't have to read and re-write the entire list. 
+ * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#include "redis_lists.h" + +#include +#include +#include + +#include "rocksdb/slice.h" +#include "util/coding.h" + +namespace rocksdb +{ + +/// Constructors + +RedisLists::RedisLists(const std::string& db_path, + Options options, bool destructive) + : put_option_(), + get_option_() { + + // Store the name of the database + db_name_ = db_path; + + // If destructive, destroy the DB before re-opening it. + if (destructive) { + DestroyDB(db_name_, Options()); + } + + // Now open and deal with the db + DB* db; + Status s = DB::Open(options, db_name_, &db); + if (!s.ok()) { + std::cerr << "ERROR " << s.ToString() << std::endl; + assert(false); + } + + db_ = std::unique_ptr(db); +} + + +/// Accessors + +// Number of elements in the list associated with key +// : throws RedisListException +int RedisLists::Length(const std::string& key) { + // Extract the string data representing the list. + std::string data; + db_->Get(get_option_, key, &data); + + // Return the length + RedisListIterator it(data); + return it.Length(); +} + +// Get the element at the specified index in the (list: key) +// Returns false (and leaves *result unmodified) on out-of-bounds +// : throws RedisListException +bool RedisLists::Index(const std::string& key, int32_t index, + std::string* result) { + // Extract the string data representing the list. + std::string data; + db_->Get(get_option_, key, &data); + + // Handle REDIS negative indices (from the end); fast iff Length() takes O(1) + if (index < 0) { + index = Length(key) - (-index); //replace (-i) with (N-i). + } + + // Iterate through the list until the desired index is found. 
+ int curIndex = 0; + RedisListIterator it(data); + while(curIndex < index && !it.Done()) { + ++curIndex; + it.Skip(); + } + + // If we actually found the index + if (curIndex == index && !it.Done()) { + Slice elem; + it.GetCurrent(&elem); + if (result != NULL) { + *result = elem.ToString(); + } + + return true; + } else { + return false; + } +} + +// Return a truncated version of the list. +// First, negative values for first/last are interpreted as "end of list". +// So, if first == -1, then it is re-set to index: (Length(key) - 1) +// Then, return exactly those indices i such that first <= i <= last. +// : throws RedisListException +std::vector RedisLists::Range(const std::string& key, + int32_t first, int32_t last) { + // Extract the string data representing the list. + std::string data; + db_->Get(get_option_, key, &data); + + // Handle negative bounds (-1 means last element, etc.) + int listLen = Length(key); + if (first < 0) { + first = listLen - (-first); // Replace (-x) with (N-x) + } + if (last < 0) { + last = listLen - (-last); + } + + // Verify bounds (and truncate the range so that it is valid) + first = std::max(first, 0); + last = std::min(last, listLen-1); + int len = std::max(last-first+1, 0); + + // Initialize the resulting list + std::vector result(len); + + // Traverse the list and update the vector + int curIdx = 0; + Slice elem; + for (RedisListIterator it(data); !it.Done() && curIdx<=last; it.Skip()) { + if (first <= curIdx && curIdx <= last) { + it.GetCurrent(&elem); + result[curIdx-first].assign(elem.data(),elem.size()); + } + + ++curIdx; + } + + // Return the result. Might be empty + return result; +} + +// Print the (list: key) out to stdout. For debugging mostly. Public for now. +void RedisLists::Print(const std::string& key) { + // Extract the string data representing the list. 
+ std::string data; + db_->Get(get_option_, key, &data); + + // Iterate through the list and print the items + Slice elem; + for (RedisListIterator it(data); !it.Done(); it.Skip()) { + it.GetCurrent(&elem); + std::cout << "ITEM " << elem.ToString() << std::endl; + } + + //Now print the byte data + RedisListIterator it(data); + std::cout << "==Printing data==" << std::endl; + std::cout << data.size() << std::endl; + std::cout << it.Size() << " " << it.Length() << std::endl; + Slice result = it.WriteResult(); + std::cout << result.data() << std::endl; + if (true) { + std::cout << "size: " << result.size() << std::endl; + const char* val = result.data(); + for(int i=0; i<(int)result.size(); ++i) { + std::cout << (int)val[i] << " " << (val[i]>=32?val[i]:' ') << std::endl; + } + std::cout << std::endl; + } +} + +/// Insert/Update Functions +/// Note: The "real" insert function is private. See below. + +// InsertBefore and InsertAfter are simply wrappers around the Insert function. +int RedisLists::InsertBefore(const std::string& key, const std::string& pivot, + const std::string& value) { + return Insert(key, pivot, value, false); +} + +int RedisLists::InsertAfter(const std::string& key, const std::string& pivot, + const std::string& value) { + return Insert(key, pivot, value, true); +} + +// Prepend value onto beginning of (list: key) +// : throws RedisListException +int RedisLists::PushLeft(const std::string& key, const std::string& value) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Construct the result + RedisListIterator it(data); + it.Reserve(it.Size() + it.SizeOf(value)); + it.InsertElement(value); + + // Push the data back to the db and return the length + db_->Put(put_option_, key, it.WriteResult()); + return it.Length(); +} + +// Append value onto end of (list: key) +// TODO: Make this O(1) time. Might require MergeOperator. 
+// : throws RedisListException +int RedisLists::PushRight(const std::string& key, const std::string& value) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Create an iterator to the data and seek to the end. + RedisListIterator it(data); + it.Reserve(it.Size() + it.SizeOf(value)); + while (!it.Done()) { + it.Push(); // Write each element as we go + } + + // Insert the new element at the current position (the end) + it.InsertElement(value); + + // Push it back to the db, and return length + db_->Put(put_option_, key, it.WriteResult()); + return it.Length(); +} + +// Set (list: key)[idx] = val. Return true on success, false on fail. +// : throws RedisListException +bool RedisLists::Set(const std::string& key, int32_t index, + const std::string& value) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Handle negative index for REDIS (meaning -index from end of list) + if (index < 0) { + index = Length(key) - (-index); + } + + // Iterate through the list until we find the element we want + int curIndex = 0; + RedisListIterator it(data); + it.Reserve(it.Size() + it.SizeOf(value)); // Over-estimate is fine + while(curIndex < index && !it.Done()) { + it.Push(); + ++curIndex; + } + + // If not found, return false (this occurs when index was invalid) + if (it.Done() || curIndex != index) { + return false; + } + + // Write the new element value, and drop the previous element value + it.InsertElement(value); + it.Skip(); + + // Write the data to the database + // Check status, since it needs to return true/false guarantee + Status s = db_->Put(put_option_, key, it.WriteResult()); + + // Success + return s.ok(); +} + +/// Delete / Remove / Pop functions + +// Trim (list: key) so that it will only contain the indices from start..stop +// Invalid indices will not generate an error, just empty, +// or the portion of the list that fits in this interval +// : throws RedisListException 
+bool RedisLists::Trim(const std::string& key, int32_t start, int32_t stop) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Handle negative indices in REDIS + int listLen = Length(key); + if (start < 0) { + start = listLen - (-start); + } + if (stop < 0) { + stop = listLen - (-stop); + } + + // Truncate bounds to only fit in the list + start = std::max(start, 0); + stop = std::min(stop, listLen-1); + + // Construct an iterator for the list. Drop all undesired elements. + int curIndex = 0; + RedisListIterator it(data); + it.Reserve(it.Size()); // Over-estimate + while(!it.Done()) { + // If not within the range, just skip the item (drop it). + // Otherwise, continue as usual. + if (start <= curIndex && curIndex <= stop) { + it.Push(); + } else { + it.Skip(); + } + + // Increment the current index + ++curIndex; + } + + // Write the (possibly empty) result to the database + Status s = db_->Put(put_option_, key, it.WriteResult()); + + // Return true as long as the write succeeded + return s.ok(); +} + +// Return and remove the first element in the list (or "" if empty) +// : throws RedisListException +bool RedisLists::PopLeft(const std::string& key, std::string* result) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Point to first element in the list (if it exists), and get its value/size + RedisListIterator it(data); + if (it.Length() > 0) { // Proceed only if list is non-empty + Slice elem; + it.GetCurrent(&elem); // Store the value of the first element + it.Reserve(it.Size() - it.SizeOf(elem)); + it.Skip(); // DROP the first item and move to next + + // Update the db + db_->Put(put_option_, key, it.WriteResult()); + + // Return the value + if (result != NULL) { + *result = elem.ToString(); + } + return true; + } else { + return false; + } +} + +// Remove and return the last element in the list (or "" if empty) +// TODO: Make this O(1). Might require MergeOperator. 
+// : throws RedisListException +bool RedisLists::PopRight(const std::string& key, std::string* result) { + // Extract the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Construct an iterator to the data and move to last element + RedisListIterator it(data); + it.Reserve(it.Size()); + int len = it.Length(); + int curIndex = 0; + while(curIndex < (len-1) && !it.Done()) { + it.Push(); + ++curIndex; + } + + // Extract and drop/skip the last element + if (curIndex == len-1) { + assert(!it.Done()); // Sanity check. Should not have ended here. + + // Extract and pop the element + Slice elem; + it.GetCurrent(&elem); // Save value of element. + it.Skip(); // Skip the element + + // Write the result to the database + db_->Put(put_option_, key, it.WriteResult()); + + // Return the value + if (result != NULL) { + *result = elem.ToString(); + } + return true; + } else { + // Must have been an empty list + assert(it.Done() && len==0 && curIndex == 0); + return false; + } +} + +// Remove the (first or last) "num" occurrences of value in (list: key) +// : throws RedisListException +int RedisLists::Remove(const std::string& key, int32_t num, + const std::string& value) { + // Negative num ==> RemoveLast; Positive num ==> Remove First + if (num < 0) { + return RemoveLast(key, -num, value); + } else if (num > 0) { + return RemoveFirst(key, num, value); + } else { + return RemoveFirst(key, Length(key), value); + } +} + +// Remove the first "num" occurrences of value in (list: key). 
+// : throws RedisListException +int RedisLists::RemoveFirst(const std::string& key, int32_t num, + const std::string& value) { + // Ensure that the number is positive + assert(num >= 0); + + // Extract the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Traverse the list, appending all but the desired occurrences of value + int numSkipped = 0; // Keep track of the number of times value is seen + Slice elem; + RedisListIterator it(data); + it.Reserve(it.Size()); + while (!it.Done()) { + it.GetCurrent(&elem); + + if (elem == value && numSkipped < num) { + // Drop this item if desired + it.Skip(); + ++numSkipped; + } else { + // Otherwise keep the item and proceed as normal + it.Push(); + } + } + + // Put the result back to the database + db_->Put(put_option_, key, it.WriteResult()); + + // Return the number of elements removed + return numSkipped; +} + + +// Remove the last "num" occurrences of value in (list: key). +// TODO: I traverse the list 2x. Make faster. Might require MergeOperator. +// : throws RedisListException +int RedisLists::RemoveLast(const std::string& key, int32_t num, + const std::string& value) { + // Ensure that the number is positive + assert(num >= 0); + + // Extract the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Temporary variable to hold the "current element" in the blocks below + Slice elem; + + // Count the total number of occurrences of value + int totalOccs = 0; + for (RedisListIterator it(data); !it.Done(); it.Skip()) { + it.GetCurrent(&elem); + if (elem == value) { + ++totalOccs; + } + } + + // Construct an iterator to the data. Reserve enough space for the result. + RedisListIterator it(data); + int bytesRemoved = std::min(num,totalOccs)*it.SizeOf(value); + it.Reserve(it.Size() - bytesRemoved); + + // Traverse the list, appending all but the desired occurrences of value. 
+ // Note: "Drop the last k occurrences" is equivalent to + // "keep only the first n-k occurrences", where n is total occurrences. + int numKept = 0; // Keep track of the number of times value is kept + while(!it.Done()) { + it.GetCurrent(&elem); + + // If we are within the deletion range and equal to value, drop it. + // Otherwise, append/keep/push it. + if (elem == value) { + if (numKept < totalOccs - num) { + it.Push(); + ++numKept; + } else { + it.Skip(); + } + } else { + // Always append the others + it.Push(); + } + } + + // Put the result back to the database + db_->Put(put_option_, key, it.WriteResult()); + + // Return the number of elements removed + return totalOccs - numKept; +} + +/// Private functions + +// Insert element value into (list: key), right before/after +// the first occurrence of pivot +// : throws RedisListException +int RedisLists::Insert(const std::string& key, const std::string& pivot, + const std::string& value, bool insert_after) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Construct an iterator to the data and reserve enough space for result. + RedisListIterator it(data); + it.Reserve(it.Size() + it.SizeOf(value)); + + // Iterate through the list until we find the element we want + Slice elem; + bool found = false; + while(!it.Done() && !found) { + it.GetCurrent(&elem); + + // When we find the element, insert the element and mark found + if (elem == pivot) { // Found it! 
+ found = true; + if (insert_after == true) { // Skip one more, if inserting after it + it.Push(); + } + it.InsertElement(value); + } else { + it.Push(); + } + + } + + // Put the data (string) into the database + if (found) { + db_->Put(put_option_, key, it.WriteResult()); + } + + // Returns the new (possibly unchanged) length of the list + return it.Length(); +} + + +} diff --git a/utilities/redis/redis_lists.h b/utilities/redis/redis_lists.h new file mode 100644 index 00000000..8c149bc4 --- /dev/null +++ b/utilities/redis/redis_lists.h @@ -0,0 +1,106 @@ +/** + * A (persistent) Redis API built using the rocksdb backend. + * Implements Redis Lists as described on: http://redis.io/commands#list + * + * @throws All functions may throw a RedisListException + * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#pragma once + +#include +#include "rocksdb/db.h" +#include "redis_list_iterator.h" +#include "redis_list_exception.h" + +namespace rocksdb { + +/// The Redis functionality (see http://redis.io/commands#list) +/// All functions may THROW a RedisListException +class RedisLists { + public: // Constructors / Destructors + /// Construct a new RedisLists database, with name/path of db. + /// Will clear the database on open iff destructive is true (default false). + /// Otherwise, it will restore saved changes. + /// May throw RedisListException + RedisLists(const std::string& db_path, + Options options, bool destructive = false); + + public: // Accessors + /// The number of items in (list: key) + int Length(const std::string& key); + + /// Search the list for the (index)'th item (0-based) in (list:key) + /// A negative index indicates: "from end-of-list" + /// If index is within range: return true, and return the value in *result. 
+ /// If (index < -length OR index>=length), then index is out of range: + /// return false (and *result is left unchanged) + /// May throw RedisListException + bool Index(const std::string& key, int32_t index, + std::string* result); + + /// Return (list: key)[first..last] (inclusive) + /// May throw RedisListException + std::vector Range(const std::string& key, + int32_t first, int32_t last); + + /// Prints the entire (list: key), for debugging. + void Print(const std::string& key); + + public: // Insert/Update + /// Insert value before/after pivot in (list: key). Return the length. + /// May throw RedisListException + int InsertBefore(const std::string& key, const std::string& pivot, + const std::string& value); + int InsertAfter(const std::string& key, const std::string& pivot, + const std::string& value); + + /// Push / Insert value at beginning/end of the list. Return the length. + /// May throw RedisListException + int PushLeft(const std::string& key, const std::string& value); + int PushRight(const std::string& key, const std::string& value); + + /// Set (list: key)[idx] = val. Return true on success, false on fail + /// May throw RedisListException + bool Set(const std::string& key, int32_t index, const std::string& value); + + public: // Delete / Remove / Pop / Trim + /// Trim (list: key) so that it will only contain the indices from start..stop + /// Returns true on success + /// May throw RedisListException + bool Trim(const std::string& key, int32_t start, int32_t stop); + + /// If list is empty, return false and leave *result unchanged. + /// Else, remove the first/last elem, store it in *result, and return true + bool PopLeft(const std::string& key, std::string* result); // First + bool PopRight(const std::string& key, std::string* result); // Last + + /// Remove the first (or last) num occurrences of value from the list (key) + /// Return the number of elements removed. 
+ /// May throw RedisListException + int Remove(const std::string& key, int32_t num, + const std::string& value); + int RemoveFirst(const std::string& key, int32_t num, + const std::string& value); + int RemoveLast(const std::string& key, int32_t num, + const std::string& value); + + private: // Private Functions + /// Calls InsertBefore or InsertAfter + int Insert(const std::string& key, const std::string& pivot, + const std::string& value, bool insert_after); + private: + std::string db_name_; // The actual database name/path + WriteOptions put_option_; + ReadOptions get_option_; + + /// The backend rocksdb database. + /// Map : key --> list + /// where a list is a sequence of elements + /// and an element is a 4-byte integer (n), followed by n bytes of data + std::unique_ptr db_; +}; + +} // namespace rocksdb diff --git a/utilities/redis/redis_lists_test.cc b/utilities/redis/redis_lists_test.cc new file mode 100644 index 00000000..0600e0e5 --- /dev/null +++ b/utilities/redis/redis_lists_test.cc @@ -0,0 +1,875 @@ +/** + * A test harness for the Redis API built on rocksdb. + * + * USAGE: Build with: "make redis_test" (in rocksdb directory). 
+ * Run unit tests with: "./redis_test" + * Manual/Interactive user testing: "./redis_test -m" + * Manual user testing + restart database: "./redis_test -m -d" + * + * TODO: Add LARGE random test cases to verify efficiency and scalability + * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + + +#include +#include + +#include "redis_lists.h" +#include "util/testharness.h" +#include "util/random.h" + +using namespace rocksdb; +using namespace std; + +namespace rocksdb { + +class RedisListsTest { + public: + static const string kDefaultDbName; + static Options options; + + RedisListsTest() { + options.create_if_missing = true; + } +}; + +const string RedisListsTest::kDefaultDbName = "/tmp/redisdefaultdb/"; +Options RedisListsTest::options = Options(); + +// operator== and operator<< are defined below for vectors (lists) +// Needed for ASSERT_EQ + +void AssertListEq(const std::vector& result, + const std::vector& expected_result) { + ASSERT_EQ(result.size(), expected_result.size()); + for (size_t i = 0; i < result.size(); ++i) { + ASSERT_EQ(result[i], expected_result[i]); + } +} + +// PushRight, Length, Index, Range +TEST(RedisListsTest, SimpleTest) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Simple PushRight (should return the new length each time) + ASSERT_EQ(redis.PushRight("k1", "v1"), 1); + ASSERT_EQ(redis.PushRight("k1", "v2"), 2); + ASSERT_EQ(redis.PushRight("k1", "v3"), 3); + + // Check Length and Index() functions + ASSERT_EQ(redis.Length("k1"), 3); // Check length + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "v1"); // Check valid indices + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "v2"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "v3"); + + // Check range function and vectors + std::vector result = redis.Range("k1", 0, 2); // Get the list + std::vector 
expected_result(3); + expected_result[0] = "v1"; + expected_result[1] = "v2"; + expected_result[2] = "v3"; + AssertListEq(result, expected_result); +} + +// PushLeft, Length, Index, Range +TEST(RedisListsTest, SimpleTest2) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Simple PushRight + ASSERT_EQ(redis.PushLeft("k1", "v3"), 1); + ASSERT_EQ(redis.PushLeft("k1", "v2"), 2); + ASSERT_EQ(redis.PushLeft("k1", "v1"), 3); + + // Check Length and Index() functions + ASSERT_EQ(redis.Length("k1"), 3); // Check length + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "v1"); // Check valid indices + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "v2"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "v3"); + + // Check range function and vectors + std::vector result = redis.Range("k1", 0, 2); // Get the list + std::vector expected_result(3); + expected_result[0] = "v1"; + expected_result[1] = "v2"; + expected_result[2] = "v3"; + AssertListEq(result, expected_result); +} + +// Exhaustive test of the Index() function +TEST(RedisListsTest, IndexTest) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Empty Index check (return empty and should not crash or edit tempv) + tempv = "yo"; + ASSERT_TRUE(!redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "yo"); + ASSERT_TRUE(!redis.Index("fda", 3, &tempv)); + ASSERT_EQ(tempv, "yo"); + ASSERT_TRUE(!redis.Index("random", -12391, &tempv)); + ASSERT_EQ(tempv, "yo"); + + // Simple Pushes (will yield: [v6, v4, v4, v1, v2, v3] + redis.PushRight("k1", "v1"); + redis.PushRight("k1", "v2"); + redis.PushRight("k1", "v3"); + redis.PushLeft("k1", "v4"); + redis.PushLeft("k1", "v4"); + redis.PushLeft("k1", "v6"); + + // Simple, non-negative indices + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, 
"v6"); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "v4"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "v4"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "v1"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "v2"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "v3"); + + // Negative indices + ASSERT_TRUE(redis.Index("k1", -6, &tempv)); + ASSERT_EQ(tempv, "v6"); + ASSERT_TRUE(redis.Index("k1", -5, &tempv)); + ASSERT_EQ(tempv, "v4"); + ASSERT_TRUE(redis.Index("k1", -4, &tempv)); + ASSERT_EQ(tempv, "v4"); + ASSERT_TRUE(redis.Index("k1", -3, &tempv)); + ASSERT_EQ(tempv, "v1"); + ASSERT_TRUE(redis.Index("k1", -2, &tempv)); + ASSERT_EQ(tempv, "v2"); + ASSERT_TRUE(redis.Index("k1", -1, &tempv)); + ASSERT_EQ(tempv, "v3"); + + // Out of bounds (return empty, no crash) + ASSERT_TRUE(!redis.Index("k1", 6, &tempv)); + ASSERT_TRUE(!redis.Index("k1", 123219, &tempv)); + ASSERT_TRUE(!redis.Index("k1", -7, &tempv)); + ASSERT_TRUE(!redis.Index("k1", -129, &tempv)); +} + + +// Exhaustive test of the Range() function +TEST(RedisListsTest, RangeTest) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Simple Pushes (will yield: [v6, v4, v4, v1, v2, v3]) + redis.PushRight("k1", "v1"); + redis.PushRight("k1", "v2"); + redis.PushRight("k1", "v3"); + redis.PushLeft("k1", "v4"); + redis.PushLeft("k1", "v4"); + redis.PushLeft("k1", "v6"); + + // Sanity check (check the length; make sure it's 6) + ASSERT_EQ(redis.Length("k1"), 6); + + // Simple range + std::vector res = redis.Range("k1", 1, 4); + ASSERT_EQ((int)res.size(), 4); + ASSERT_EQ(res[0], "v4"); + ASSERT_EQ(res[1], "v4"); + ASSERT_EQ(res[2], "v1"); + ASSERT_EQ(res[3], "v2"); + + // Negative indices (i.e.: measured from the end) + res = redis.Range("k1", 2, -1); + ASSERT_EQ((int)res.size(), 4); + ASSERT_EQ(res[0], "v4"); + ASSERT_EQ(res[1], "v1"); + 
ASSERT_EQ(res[2], "v2"); + ASSERT_EQ(res[3], "v3"); + + res = redis.Range("k1", -6, -4); + ASSERT_EQ((int)res.size(), 3); + ASSERT_EQ(res[0], "v6"); + ASSERT_EQ(res[1], "v4"); + ASSERT_EQ(res[2], "v4"); + + res = redis.Range("k1", -1, 5); + ASSERT_EQ((int)res.size(), 1); + ASSERT_EQ(res[0], "v3"); + + // Partial / Broken indices + res = redis.Range("k1", -3, 1000000); + ASSERT_EQ((int)res.size(), 3); + ASSERT_EQ(res[0], "v1"); + ASSERT_EQ(res[1], "v2"); + ASSERT_EQ(res[2], "v3"); + + res = redis.Range("k1", -1000000, 1); + ASSERT_EQ((int)res.size(), 2); + ASSERT_EQ(res[0], "v6"); + ASSERT_EQ(res[1], "v4"); + + // Invalid indices + res = redis.Range("k1", 7, 9); + ASSERT_EQ((int)res.size(), 0); + + res = redis.Range("k1", -8, -7); + ASSERT_EQ((int)res.size(), 0); + + res = redis.Range("k1", 3, 2); + ASSERT_EQ((int)res.size(), 0); + + res = redis.Range("k1", 5, -2); + ASSERT_EQ((int)res.size(), 0); + + // Range matches Index + res = redis.Range("k1", -6, -4); + ASSERT_TRUE(redis.Index("k1", -6, &tempv)); + ASSERT_EQ(tempv, res[0]); + ASSERT_TRUE(redis.Index("k1", -5, &tempv)); + ASSERT_EQ(tempv, res[1]); + ASSERT_TRUE(redis.Index("k1", -4, &tempv)); + ASSERT_EQ(tempv, res[2]); + + // Last check + res = redis.Range("k1", 0, -6); + ASSERT_EQ((int)res.size(), 1); + ASSERT_EQ(res[0], "v6"); +} + +// Exhaustive test for InsertBefore(), and InsertAfter() +TEST(RedisListsTest, InsertTest) { + RedisLists redis(kDefaultDbName, options, true); + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Insert on empty list (return 0, and do not crash) + ASSERT_EQ(redis.InsertBefore("k1", "non-exist", "a"), 0); + ASSERT_EQ(redis.InsertAfter("k1", "other-non-exist", "c"), 0); + ASSERT_EQ(redis.Length("k1"), 0); + + // Push some preliminary stuff [g, f, e, d, c, b, a] + redis.PushLeft("k1", "a"); + redis.PushLeft("k1", "b"); + redis.PushLeft("k1", "c"); + redis.PushLeft("k1", "d"); + redis.PushLeft("k1", "e"); + redis.PushLeft("k1", "f"); + redis.PushLeft("k1", 
"g"); + ASSERT_EQ(redis.Length("k1"), 7); + + // Test InsertBefore + int newLength = redis.InsertBefore("k1", "e", "hello"); + ASSERT_EQ(newLength, 8); + ASSERT_EQ(redis.Length("k1"), newLength); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "f"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "e"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "hello"); + + // Test InsertAfter + newLength = redis.InsertAfter("k1", "c", "bye"); + ASSERT_EQ(newLength, 9); + ASSERT_EQ(redis.Length("k1"), newLength); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "bye"); + + // Test bad value on InsertBefore + newLength = redis.InsertBefore("k1", "yo", "x"); + ASSERT_EQ(newLength, 9); + ASSERT_EQ(redis.Length("k1"), newLength); + + // Test bad value on InsertAfter + newLength = redis.InsertAfter("k1", "xxxx", "y"); + ASSERT_EQ(newLength, 9); + ASSERT_EQ(redis.Length("k1"), newLength); + + // Test InsertBefore beginning + newLength = redis.InsertBefore("k1", "g", "begggggggggggggggg"); + ASSERT_EQ(newLength, 10); + ASSERT_EQ(redis.Length("k1"), newLength); + + // Test InsertAfter end + newLength = redis.InsertAfter("k1", "a", "enddd"); + ASSERT_EQ(newLength, 11); + ASSERT_EQ(redis.Length("k1"), newLength); + + // Make sure nothing weird happened. 
+ ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "begggggggggggggggg"); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "g"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "f"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "hello"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "e"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "d"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "c"); + ASSERT_TRUE(redis.Index("k1", 7, &tempv)); + ASSERT_EQ(tempv, "bye"); + ASSERT_TRUE(redis.Index("k1", 8, &tempv)); + ASSERT_EQ(tempv, "b"); + ASSERT_TRUE(redis.Index("k1", 9, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_TRUE(redis.Index("k1", 10, &tempv)); + ASSERT_EQ(tempv, "enddd"); +} + +// Exhaustive test of Set function +TEST(RedisListsTest, SetTest) { + RedisLists redis(kDefaultDbName, options, true); + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Set on empty list (return false, and do not crash) + ASSERT_EQ(redis.Set("k1", 7, "a"), false); + ASSERT_EQ(redis.Set("k1", 0, "a"), false); + ASSERT_EQ(redis.Set("k1", -49, "cx"), false); + ASSERT_EQ(redis.Length("k1"), 0); + + // Push some preliminary stuff [g, f, e, d, c, b, a] + redis.PushLeft("k1", "a"); + redis.PushLeft("k1", "b"); + redis.PushLeft("k1", "c"); + redis.PushLeft("k1", "d"); + redis.PushLeft("k1", "e"); + redis.PushLeft("k1", "f"); + redis.PushLeft("k1", "g"); + ASSERT_EQ(redis.Length("k1"), 7); + + // Test Regular Set + ASSERT_TRUE(redis.Set("k1", 0, "0")); + ASSERT_TRUE(redis.Set("k1", 3, "3")); + ASSERT_TRUE(redis.Set("k1", 6, "6")); + ASSERT_TRUE(redis.Set("k1", 2, "2")); + ASSERT_TRUE(redis.Set("k1", 5, "5")); + ASSERT_TRUE(redis.Set("k1", 1, "1")); + ASSERT_TRUE(redis.Set("k1", 4, "4")); + + ASSERT_EQ(redis.Length("k1"), 7); // Size should not change + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "0"); + ASSERT_TRUE(redis.Index("k1", 1, 
&tempv)); + ASSERT_EQ(tempv, "1"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "2"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "3"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "4"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "5"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "6"); + + // Set with negative indices + ASSERT_TRUE(redis.Set("k1", -7, "a")); + ASSERT_TRUE(redis.Set("k1", -4, "d")); + ASSERT_TRUE(redis.Set("k1", -1, "g")); + ASSERT_TRUE(redis.Set("k1", -5, "c")); + ASSERT_TRUE(redis.Set("k1", -2, "f")); + ASSERT_TRUE(redis.Set("k1", -6, "b")); + ASSERT_TRUE(redis.Set("k1", -3, "e")); + + ASSERT_EQ(redis.Length("k1"), 7); // Size should not change + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "b"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "c"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "d"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "e"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "f"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "g"); + + // Bad indices (just out-of-bounds / off-by-one check) + ASSERT_EQ(redis.Set("k1", -8, "off-by-one in negative index"), false); + ASSERT_EQ(redis.Set("k1", 7, "off-by-one-error in positive index"), false); + ASSERT_EQ(redis.Set("k1", 43892, "big random index should fail"), false); + ASSERT_EQ(redis.Set("k1", -21391, "large negative index should fail"), false); + + // One last check (to make sure nothing weird happened) + ASSERT_EQ(redis.Length("k1"), 7); // Size should not change + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "b"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "c"); + ASSERT_TRUE(redis.Index("k1", 3, 
&tempv)); + ASSERT_EQ(tempv, "d"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "e"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "f"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "g"); +} + +// Testing Insert, Push, and Set, in a mixed environment +TEST(RedisListsTest, InsertPushSetTest) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // A series of pushes and insertions + // Will result in [newbegin, z, a, aftera, x, newend] + // Also, check the return value sometimes (should return length) + int lengthCheck; + lengthCheck = redis.PushLeft("k1", "a"); + ASSERT_EQ(lengthCheck, 1); + redis.PushLeft("k1", "z"); + redis.PushRight("k1", "x"); + lengthCheck = redis.InsertAfter("k1", "a", "aftera"); + ASSERT_EQ(lengthCheck , 4); + redis.InsertBefore("k1", "z", "newbegin"); // InsertBefore beginning of list + redis.InsertAfter("k1", "x", "newend"); // InsertAfter end of list + + // Check + std::vector res = redis.Range("k1", 0, -1); // Get the list + ASSERT_EQ((int)res.size(), 6); + ASSERT_EQ(res[0], "newbegin"); + ASSERT_EQ(res[5], "newend"); + ASSERT_EQ(res[3], "aftera"); + + // Testing duplicate values/pivots (multiple occurrences of 'a') + ASSERT_TRUE(redis.Set("k1", 0, "a")); // [a, z, a, aftera, x, newend] + redis.InsertAfter("k1", "a", "happy"); // [a, happy, z, a, aftera, ...] + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "happy"); + redis.InsertBefore("k1", "a", "sad"); // [sad, a, happy, z, a, aftera, ...] + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "sad"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "happy"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "aftera"); + redis.InsertAfter("k1", "a", "zz"); // [sad, a, zz, happy, z, a, aftera, ...] 
+ ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "zz"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "aftera"); + ASSERT_TRUE(redis.Set("k1", 1, "nota")); // [sad, nota, zz, happy, z, a, ...] + redis.InsertBefore("k1", "a", "ba"); // [sad, nota, zz, happy, z, ba, a, ...] + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "ba"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "a"); + + // We currently have: [sad, nota, zz, happy, z, ba, a, aftera, x, newend] + // redis.Print("k1"); // manually check + + // Test Inserting before/after non-existent values + lengthCheck = redis.Length("k1"); // Ensure that the length doesn't change + ASSERT_EQ(lengthCheck, 10); + ASSERT_EQ(redis.InsertBefore("k1", "non-exist", "randval"), lengthCheck); + ASSERT_EQ(redis.InsertAfter("k1", "nothing", "a"), lengthCheck); + ASSERT_EQ(redis.InsertAfter("randKey", "randVal", "ranValue"), 0); // Empty + ASSERT_EQ(redis.Length("k1"), lengthCheck); // The length should not change + + // Simply Test the Set() function + redis.Set("k1", 5, "ba2"); + redis.InsertBefore("k1", "ba2", "beforeba2"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "beforeba2"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "ba2"); + ASSERT_TRUE(redis.Index("k1", 7, &tempv)); + ASSERT_EQ(tempv, "a"); + + // We have: [sad, nota, zz, happy, z, beforeba2, ba2, a, aftera, x, newend] + + // Set() with negative indices + redis.Set("k1", -1, "endprank"); + ASSERT_TRUE(!redis.Index("k1", 11, &tempv)); + ASSERT_TRUE(redis.Index("k1", 10, &tempv)); + ASSERT_EQ(tempv, "endprank"); // Ensure Set worked correctly + redis.Set("k1", -11, "t"); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "t"); + + // Test out of bounds Set + ASSERT_EQ(redis.Set("k1", -12, "ssd"), false); + 
ASSERT_EQ(redis.Set("k1", 11, "sasd"), false); + ASSERT_EQ(redis.Set("k1", 1200, "big"), false); +} + +// Testing Trim, Pop +TEST(RedisListsTest, TrimPopTest) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // A series of pushes and insertions + // Will result in [newbegin, z, a, aftera, x, newend] + redis.PushLeft("k1", "a"); + redis.PushLeft("k1", "z"); + redis.PushRight("k1", "x"); + redis.InsertBefore("k1", "z", "newbegin"); // InsertBefore start of list + redis.InsertAfter("k1", "x", "newend"); // InsertAfter end of list + redis.InsertAfter("k1", "a", "aftera"); + + // Simple PopLeft/Right test + ASSERT_TRUE(redis.PopLeft("k1", &tempv)); + ASSERT_EQ(tempv, "newbegin"); + ASSERT_EQ(redis.Length("k1"), 5); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.PopRight("k1", &tempv)); + ASSERT_EQ(tempv, "newend"); + ASSERT_EQ(redis.Length("k1"), 4); + ASSERT_TRUE(redis.Index("k1", -1, &tempv)); + ASSERT_EQ(tempv, "x"); + + // Now have: [z, a, aftera, x] + + // Test Trim + ASSERT_TRUE(redis.Trim("k1", 0, -1)); // [z, a, aftera, x] (do nothing) + ASSERT_EQ(redis.Length("k1"), 4); + ASSERT_TRUE(redis.Trim("k1", 0, 2)); // [z, a, aftera] + ASSERT_EQ(redis.Length("k1"), 3); + ASSERT_TRUE(redis.Index("k1", -1, &tempv)); + ASSERT_EQ(tempv, "aftera"); + ASSERT_TRUE(redis.Trim("k1", 1, 1)); // [a] + ASSERT_EQ(redis.Length("k1"), 1); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "a"); + + // Test out of bounds (empty) trim + ASSERT_TRUE(redis.Trim("k1", 1, 0)); + ASSERT_EQ(redis.Length("k1"), 0); + + // Popping with empty list (return empty without error) + ASSERT_TRUE(!redis.PopLeft("k1", &tempv)); + ASSERT_TRUE(!redis.PopRight("k1", &tempv)); + ASSERT_TRUE(redis.Trim("k1", 0, 5)); + + // Exhaustive Trim test (negative and invalid indices) + // Will start in [newbegin, z, a, aftera, x, newend] + redis.PushLeft("k1", "a"); + 
redis.PushLeft("k1", "z"); + redis.PushRight("k1", "x"); + redis.InsertBefore("k1", "z", "newbegin"); // InsertBefore start of list + redis.InsertAfter("k1", "x", "newend"); // InsertAfter end of list + redis.InsertAfter("k1", "a", "aftera"); + ASSERT_TRUE(redis.Trim("k1", -6, -1)); // Should do nothing + ASSERT_EQ(redis.Length("k1"), 6); + ASSERT_TRUE(redis.Trim("k1", 1, -2)); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "x"); + ASSERT_EQ(redis.Length("k1"), 4); + ASSERT_TRUE(redis.Trim("k1", -3, -2)); + ASSERT_EQ(redis.Length("k1"), 2); +} + +// Testing Remove, RemoveFirst, RemoveLast +TEST(RedisListsTest, RemoveTest) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // A series of pushes and insertions + // Will result in [newbegin, z, a, aftera, x, newend, a, a] + redis.PushLeft("k1", "a"); + redis.PushLeft("k1", "z"); + redis.PushRight("k1", "x"); + redis.InsertBefore("k1", "z", "newbegin"); // InsertBefore start of list + redis.InsertAfter("k1", "x", "newend"); // InsertAfter end of list + redis.InsertAfter("k1", "a", "aftera"); + redis.PushRight("k1", "a"); + redis.PushRight("k1", "a"); + + // Verify + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "newbegin"); + ASSERT_TRUE(redis.Index("k1", -1, &tempv)); + ASSERT_EQ(tempv, "a"); + + // Check RemoveFirst (Remove the first two 'a') + // Results in [newbegin, z, aftera, x, newend, a] + int numRemoved = redis.Remove("k1", 2, "a"); + ASSERT_EQ(numRemoved, 2); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "newbegin"); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "newend"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_EQ(redis.Length("k1"), 6); + + // Repopulate some stuff + // 
Results in: [x, x, x, x, x, newbegin, z, x, aftera, x, newend, a, x] + redis.PushLeft("k1", "x"); + redis.PushLeft("k1", "x"); + redis.PushLeft("k1", "x"); + redis.PushLeft("k1", "x"); + redis.PushLeft("k1", "x"); + redis.PushRight("k1", "x"); + redis.InsertAfter("k1", "z", "x"); + + // Test removal from end + numRemoved = redis.Remove("k1", -2, "x"); + ASSERT_EQ(numRemoved, 2); + ASSERT_TRUE(redis.Index("k1", 8, &tempv)); + ASSERT_EQ(tempv, "aftera"); + ASSERT_TRUE(redis.Index("k1", 9, &tempv)); + ASSERT_EQ(tempv, "newend"); + ASSERT_TRUE(redis.Index("k1", 10, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_TRUE(!redis.Index("k1", 11, &tempv)); + numRemoved = redis.Remove("k1", -2, "x"); + ASSERT_EQ(numRemoved, 2); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "newbegin"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "aftera"); + + // We now have: [x, x, x, x, newbegin, z, aftera, newend, a] + ASSERT_EQ(redis.Length("k1"), 9); + ASSERT_TRUE(redis.Index("k1", -1, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "x"); + + // Test over-shooting (removing more than there exists) + numRemoved = redis.Remove("k1", -9000, "x"); + ASSERT_EQ(numRemoved , 4); // Only really removed 4 + ASSERT_EQ(redis.Length("k1"), 5); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "newbegin"); + numRemoved = redis.Remove("k1", 1, "x"); + ASSERT_EQ(numRemoved, 0); + + // Try removing ALL! + numRemoved = redis.Remove("k1", 0, "newbegin"); // REMOVE 0 will remove all! 
+ ASSERT_EQ(numRemoved, 1); + + // Removal from an empty-list + ASSERT_TRUE(redis.Trim("k1", 1, 0)); + numRemoved = redis.Remove("k1", 1, "z"); + ASSERT_EQ(numRemoved, 0); +} + + +// Test Multiple keys and Persistence +TEST(RedisListsTest, PersistenceMultiKeyTest) { + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Block one: populate a single key in the database + { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + // A series of pushes and insertions + // Will result in [newbegin, z, a, aftera, x, newend, a, a] + redis.PushLeft("k1", "a"); + redis.PushLeft("k1", "z"); + redis.PushRight("k1", "x"); + redis.InsertBefore("k1", "z", "newbegin"); // InsertBefore start of list + redis.InsertAfter("k1", "x", "newend"); // InsertAfter end of list + redis.InsertAfter("k1", "a", "aftera"); + redis.PushRight("k1", "a"); + redis.PushRight("k1", "a"); + + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "aftera"); + } + + // Block two: make sure changes were saved and add some other key + { + RedisLists redis(kDefaultDbName, options, false); // Persistent, non-destructive + + // Check + ASSERT_EQ(redis.Length("k1"), 8); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "aftera"); + + redis.PushRight("k2", "randomkey"); + redis.PushLeft("k2", "sas"); + + redis.PopLeft("k1", &tempv); + } + + // Block three: Verify the changes from block 2 + { + RedisLists redis(kDefaultDbName, options, false); // Persistent, non-destructive + + // Check + ASSERT_EQ(redis.Length("k1"), 7); + ASSERT_EQ(redis.Length("k2"), 2); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.Index("k2", -2, &tempv)); + ASSERT_EQ(tempv, "sas"); + } +} + +/// THE manual REDIS TEST begins here +/// THIS WILL ONLY OCCUR IF YOU RUN: ./redis_test -m + +void MakeUpper(std::string* const s) { + int len = s->length(); + for(int i=0; i + } +} + +/// Allows the user to enter in REDIS commands into the 
command-line. +/// This is useful for manual / interacticve testing / debugging. +/// Use destructive=true to clean the database before use. +/// Use destructive=false to remember the previous state (i.e.: persistent) +/// Should be called from main function. +int manual_redis_test(bool destructive){ + RedisLists redis(RedisListsTest::kDefaultDbName, + RedisListsTest::options, + destructive); + + // TODO: Right now, please use spaces to separate each word. + // In actual redis, you can use quotes to specify compound values + // Example: RPUSH mylist "this is a compound value" + + std::string command; + while(true) { + cin >> command; + MakeUpper(&command); + + if (command == "LINSERT") { + std::string k, t, p, v; + cin >> k >> t >> p >> v; + MakeUpper(&t); + if (t=="BEFORE") { + std::cout << redis.InsertBefore(k, p, v) << std::endl; + } else if (t=="AFTER") { + std::cout << redis.InsertAfter(k, p, v) << std::endl; + } + } else if (command == "LPUSH") { + std::string k, v; + std::cin >> k >> v; + redis.PushLeft(k, v); + } else if (command == "RPUSH") { + std::string k, v; + std::cin >> k >> v; + redis.PushRight(k, v); + } else if (command == "LPOP") { + std::string k; + std::cin >> k; + string res; + redis.PopLeft(k, &res); + std::cout << res << std::endl; + } else if (command == "RPOP") { + std::string k; + std::cin >> k; + string res; + redis.PopRight(k, &res); + std::cout << res << std::endl; + } else if (command == "LREM") { + std::string k; + int amt; + std::string v; + + std::cin >> k >> amt >> v; + std::cout << redis.Remove(k, amt, v) << std::endl; + } else if (command == "LLEN") { + std::string k; + std::cin >> k; + std::cout << redis.Length(k) << std::endl; + } else if (command == "LRANGE") { + std::string k; + int i, j; + std::cin >> k >> i >> j; + std::vector res = redis.Range(k, i, j); + for (auto it = res.begin(); it != res.end(); ++it) { + std::cout << " " << (*it); + } + std::cout << std::endl; + } else if (command == "LTRIM") { + std::string k; + int 
i, j; + std::cin >> k >> i >> j; + redis.Trim(k, i, j); + } else if (command == "LSET") { + std::string k; + int idx; + std::string v; + cin >> k >> idx >> v; + redis.Set(k, idx, v); + } else if (command == "LINDEX") { + std::string k; + int idx; + std::cin >> k >> idx; + string res; + redis.Index(k, idx, &res); + std::cout << res << std::endl; + } else if (command == "PRINT") { // Added by Deon + std::string k; + cin >> k; + redis.Print(k); + } else if (command == "QUIT") { + return 0; + } else { + std::cout << "unknown command: " << command << std::endl; + } + } +} + +} // namespace rocksdb + + +// USAGE: "./redis_test" for default (unit tests) +// "./redis_test -m" for manual testing (redis command api) +// "./redis_test -m -d" for destructive manual test (erase db before use) + + +// Check for "want" argument in the argument list +bool found_arg(int argc, char* argv[], const char* want){ + for(int i=1; icompaction_filter) { + options->compaction_filter = + new TtlCompactionFilter(ttl, options->compaction_filter); + } else { + options->compaction_filter_factory = + std::shared_ptr(new TtlCompactionFilterFactory( + ttl, options->compaction_filter_factory)); + } + + if (options->merge_operator) { + options->merge_operator.reset( + new TtlMergeOperator(options->merge_operator)); + } +} + +// Open the db inside DBWithTTL because options needs pointer to its ttl +DBWithTTL::DBWithTTL(DB* db) : StackableDB(db) {} + +DBWithTTL::~DBWithTTL() { + delete GetOptions().compaction_filter; +} + +Status UtilityDB::OpenTtlDB( + const Options& options, + const std::string& dbname, + StackableDB** dbptr, + int32_t ttl, + bool read_only) { + Status st; + Options options_to_open = options; + DBWithTTL::SanitizeOptions(ttl, &options_to_open); + DB* db; + + if (read_only) { + st = DB::OpenForReadOnly(options_to_open, dbname, &db); + } else { + st = DB::Open(options_to_open, dbname, &db); + } + if (st.ok()) { + *dbptr = new DBWithTTL(db); + } else { + delete db; + } + return st; +} + 
+// Gives back the current time +Status DBWithTTL::GetCurrentTime(int64_t& curtime) { + return Env::Default()->GetCurrentTime(&curtime); +} + +// Appends the current timestamp to the string. +// Returns false if could not get the current_time, true if append succeeds +Status DBWithTTL::AppendTS(const Slice& val, std::string& val_with_ts) { + val_with_ts.reserve(kTSLength + val.size()); + char ts_string[kTSLength]; + int64_t curtime; + Status st = GetCurrentTime(curtime); + if (!st.ok()) { + return st; + } + EncodeFixed32(ts_string, (int32_t)curtime); + val_with_ts.append(val.data(), val.size()); + val_with_ts.append(ts_string, kTSLength); + return st; +} + +// Returns corruption if the length of the string is lesser than timestamp, or +// timestamp refers to a time lesser than ttl-feature release time +Status DBWithTTL::SanityCheckTimestamp(const Slice& str) { + if (str.size() < kTSLength) { + return Status::Corruption("Error: value's length less than timestamp's\n"); + } + // Checks that TS is not lesser than kMinTimestamp + // Gaurds against corruption & normal database opened incorrectly in ttl mode + int32_t timestamp_value = + DecodeFixed32(str.data() + str.size() - kTSLength); + if (timestamp_value < kMinTimestamp){ + return Status::Corruption("Error: Timestamp < ttl feature release time!\n"); + } + return Status::OK(); +} + +// Checks if the string is stale or not according to TTl provided +bool DBWithTTL::IsStale(const Slice& value, int32_t ttl) { + if (ttl <= 0) { // Data is fresh if TTL is non-positive + return false; + } + int64_t curtime; + if (!GetCurrentTime(curtime).ok()) { + return false; // Treat the data as fresh if could not get current time + } + int32_t timestamp_value = + DecodeFixed32(value.data() + value.size() - kTSLength); + return (timestamp_value + ttl) < curtime; +} + +// Strips the TS from the end of the string +Status DBWithTTL::StripTS(std::string* str) { + Status st; + if (str->length() < kTSLength) { + return 
Status::Corruption("Bad timestamp in key-value"); + } + // Erasing characters which hold the TS + str->erase(str->length() - kTSLength, kTSLength); + return st; +} + +Status DBWithTTL::Put(const WriteOptions& opt, const Slice& key, + const Slice& val) { + WriteBatch batch; + batch.Put(key, val); + return Write(opt, &batch); +} + +Status DBWithTTL::Get(const ReadOptions& options, + const Slice& key, + std::string* value) { + Status st = db_->Get(options, key, value); + if (!st.ok()) { + return st; + } + st = SanityCheckTimestamp(*value); + if (!st.ok()) { + return st; + } + return StripTS(value); +} + +std::vector DBWithTTL::MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) { + return std::vector(keys.size(), + Status::NotSupported("MultiGet not\ + supported with TTL")); +} + +bool DBWithTTL::KeyMayExist(const ReadOptions& options, + const Slice& key, + std::string* value, + bool* value_found) { + bool ret = db_->KeyMayExist(options, key, value, value_found); + if (ret && value != nullptr && value_found != nullptr && *value_found) { + if (!SanityCheckTimestamp(*value).ok() || !StripTS(value).ok()) { + return false; + } + } + return ret; +} + +Status DBWithTTL::Merge(const WriteOptions& opt, + const Slice& key, + const Slice& value) { + WriteBatch batch; + batch.Merge(key, value); + return Write(opt, &batch); +} + +Status DBWithTTL::Write(const WriteOptions& opts, WriteBatch* updates) { + class Handler : public WriteBatch::Handler { + public: + WriteBatch updates_ttl; + Status batch_rewrite_status; + virtual void Put(const Slice& key, const Slice& value) { + std::string value_with_ts; + Status st = AppendTS(value, value_with_ts); + if (!st.ok()) { + batch_rewrite_status = st; + } else { + updates_ttl.Put(key, value_with_ts); + } + } + virtual void Merge(const Slice& key, const Slice& value) { + std::string value_with_ts; + Status st = AppendTS(value, value_with_ts); + if (!st.ok()) { + batch_rewrite_status = st; + } else { + 
updates_ttl.Merge(key, value_with_ts); + } + } + virtual void Delete(const Slice& key) { + updates_ttl.Delete(key); + } + virtual void LogData(const Slice& blob) { + updates_ttl.PutLogData(blob); + } + }; + Handler handler; + updates->Iterate(&handler); + if (!handler.batch_rewrite_status.ok()) { + return handler.batch_rewrite_status; + } else { + return db_->Write(opts, &(handler.updates_ttl)); + } +} + +Iterator* DBWithTTL::NewIterator(const ReadOptions& opts) { + return new TtlIterator(db_->NewIterator(opts)); +} + +void DBWithTTL::TEST_Destroy_DBWithTtl() { + ((DBImpl*) db_)->TEST_Destroy_DBImpl(); +} + +} // namespace rocksdb diff --git a/utilities/ttl/db_ttl.h b/utilities/ttl/db_ttl.h new file mode 100644 index 00000000..2fdc664e --- /dev/null +++ b/utilities/ttl/db_ttl.h @@ -0,0 +1,315 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/merge_operator.h" +#include "utilities/utility_db.h" +#include "db/db_impl.h" + +namespace rocksdb { + +class DBWithTTL : public StackableDB { + public: + static void SanitizeOptions(int32_t ttl, Options* options); + + explicit DBWithTTL(DB* db); + + virtual ~DBWithTTL(); + + virtual Status Put(const WriteOptions& o, const Slice& key, + const Slice& val) override; + + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value) override; + + virtual std::vector MultiGet( + const ReadOptions& options, const std::vector& keys, + std::vector* values) override; + + virtual bool KeyMayExist(const ReadOptions& options, + const Slice& key, + std::string* value, + bool* value_found = nullptr) override; + + virtual Status Merge(const WriteOptions& options, const Slice& key, + const Slice& value) override; + + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + + virtual Iterator* NewIterator(const ReadOptions& opts) override; + + // Simulate a db crash, no elegant closing of database. 
+ void TEST_Destroy_DBWithTtl(); + + virtual DB* GetBaseDB() { + return db_; + } + + static bool IsStale(const Slice& value, int32_t ttl); + + static Status AppendTS(const Slice& val, std::string& val_with_ts); + + static Status SanityCheckTimestamp(const Slice& str); + + static Status StripTS(std::string* str); + + static Status GetCurrentTime(int64_t& curtime); + + static const uint32_t kTSLength = sizeof(int32_t); // size of timestamp + + static const int32_t kMinTimestamp = 1368146402; // 05/09/2013:5:40PM GMT-8 + + static const int32_t kMaxTimestamp = 2147483647; // 01/18/2038:7:14PM GMT-8 +}; + +class TtlIterator : public Iterator { + + public: + explicit TtlIterator(Iterator* iter) + : iter_(iter) { + assert(iter_); + } + + ~TtlIterator() { + delete iter_; + } + + bool Valid() const { + return iter_->Valid(); + } + + void SeekToFirst() { + iter_->SeekToFirst(); + } + + void SeekToLast() { + iter_->SeekToLast(); + } + + void Seek(const Slice& target) { + iter_->Seek(target); + } + + void Next() { + iter_->Next(); + } + + void Prev() { + iter_->Prev(); + } + + Slice key() const { + return iter_->key(); + } + + int32_t timestamp() const { + return DecodeFixed32( + iter_->value().data() + iter_->value().size() - DBWithTTL::kTSLength); + } + + Slice value() const { + //TODO: handle timestamp corruption like in general iterator semantics + assert(DBWithTTL::SanityCheckTimestamp(iter_->value()).ok()); + Slice trimmed_value = iter_->value(); + trimmed_value.size_ -= DBWithTTL::kTSLength; + return trimmed_value; + } + + Status status() const { + return iter_->status(); + } + + private: + Iterator* iter_; +}; + +class TtlCompactionFilter : public CompactionFilter { + + public: + TtlCompactionFilter( + int32_t ttl, + const CompactionFilter* user_comp_filter, + std::unique_ptr + user_comp_filter_from_factory = nullptr) + : ttl_(ttl), + user_comp_filter_(user_comp_filter), + user_comp_filter_from_factory_(std::move(user_comp_filter_from_factory)) { + // Unlike the merge 
operator, compaction filter is necessary for TTL, hence + // this would be called even if user doesn't specify any compaction-filter + if (!user_comp_filter_) { + user_comp_filter_ = user_comp_filter_from_factory_.get(); + } + } + + virtual bool Filter(int level, + const Slice& key, + const Slice& old_val, + std::string* new_val, + bool* value_changed) const override { + if (DBWithTTL::IsStale(old_val, ttl_)) { + return true; + } + if (user_comp_filter_ == nullptr) { + return false; + } + assert(old_val.size() >= DBWithTTL::kTSLength); + Slice old_val_without_ts(old_val.data(), + old_val.size() - DBWithTTL::kTSLength); + if (user_comp_filter_->Filter(level, key, old_val_without_ts, new_val, + value_changed)) { + return true; + } + if (*value_changed) { + new_val->append(old_val.data() + old_val.size() - DBWithTTL::kTSLength, + DBWithTTL::kTSLength); + } + return false; + } + + virtual const char* Name() const override { + return "Delete By TTL"; + } + + private: + int32_t ttl_; + const CompactionFilter* user_comp_filter_; + std::unique_ptr user_comp_filter_from_factory_; +}; + +class TtlCompactionFilterFactory : public CompactionFilterFactory { + public: + TtlCompactionFilterFactory( + int32_t ttl, + std::shared_ptr comp_filter_factory) + : ttl_(ttl), + user_comp_filter_factory_(comp_filter_factory) { } + + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) { + return std::unique_ptr( + new TtlCompactionFilter( + ttl_, + nullptr, + std::move(user_comp_filter_factory_->CreateCompactionFilter(context)) + ) + ); + } + + virtual const char* Name() const override { + return "TtlCompactionFilterFactory"; + } + + private: + int32_t ttl_; + std::shared_ptr user_comp_filter_factory_; +}; + +class TtlMergeOperator : public MergeOperator { + + public: + explicit TtlMergeOperator(const std::shared_ptr merge_op) + : user_merge_op_(merge_op) { + assert(merge_op); + } + + virtual bool FullMerge(const Slice& key, + const Slice* 
existing_value, + const std::deque& operands, + std::string* new_value, + Logger* logger) const override { + const uint32_t ts_len = DBWithTTL::kTSLength; + if (existing_value && existing_value->size() < ts_len) { + Log(logger, "Error: Could not remove timestamp from existing value."); + return false; + } + + // Extract time-stamp from each operand to be passed to user_merge_op_ + std::deque operands_without_ts; + for (const auto &operand : operands) { + if (operand.size() < ts_len) { + Log(logger, "Error: Could not remove timestamp from operand value."); + return false; + } + operands_without_ts.push_back(operand.substr(0, operand.size() - ts_len)); + } + + // Apply the user merge operator (store result in *new_value) + bool good = true; + if (existing_value) { + Slice existing_value_without_ts(existing_value->data(), + existing_value->size() - ts_len); + good = user_merge_op_->FullMerge(key, &existing_value_without_ts, + operands_without_ts, new_value, logger); + } else { + good = user_merge_op_->FullMerge(key, nullptr, operands_without_ts, + new_value, logger); + } + + // Return false if the user merge operator returned false + if (!good) { + return false; + } + + // Augment the *new_value with the ttl time-stamp + int64_t curtime; + if (!DBWithTTL::GetCurrentTime(curtime).ok()) { + Log(logger, "Error: Could not get current time to be attached internally " + "to the new value."); + return false; + } else { + char ts_string[ts_len]; + EncodeFixed32(ts_string, (int32_t)curtime); + new_value->append(ts_string, ts_len); + return true; + } + } + + virtual bool PartialMerge(const Slice& key, + const Slice& left_operand, + const Slice& right_operand, + std::string* new_value, + Logger* logger) const override { + const uint32_t ts_len = DBWithTTL::kTSLength; + + if (left_operand.size() < ts_len || right_operand.size() < ts_len) { + Log(logger, "Error: Could not remove timestamp from value."); + return false; + } + + // Apply the user partial-merge operator (store result 
in *new_value) + assert(new_value); + Slice left_without_ts(left_operand.data(), left_operand.size() - ts_len); + Slice right_without_ts(right_operand.data(), right_operand.size() - ts_len); + if (!user_merge_op_->PartialMerge(key, left_without_ts, right_without_ts, + new_value, logger)) { + return false; + } + + // Augment the *new_value with the ttl time-stamp + int64_t curtime; + if (!DBWithTTL::GetCurrentTime(curtime).ok()) { + Log(logger, "Error: Could not get current time to be attached internally " + "to the new value."); + return false; + } else { + char ts_string[ts_len]; + EncodeFixed32(ts_string, (int32_t)curtime); + new_value->append(ts_string, ts_len); + return true; + } + + } + + virtual const char* Name() const override { + return "Merge By TTL"; + } + + private: + std::shared_ptr user_merge_op_; +}; + +} diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc new file mode 100644 index 00000000..8804d893 --- /dev/null +++ b/utilities/ttl/ttl_test.cc @@ -0,0 +1,505 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include "rocksdb/compaction_filter.h" +#include "utilities/utility_db.h" +#include "util/testharness.h" +#include "util/logging.h" +#include +#include + +namespace rocksdb { + +namespace { + +typedef std::map KVMap; + +enum BatchOperation { + PUT = 0, + DELETE = 1 +}; + +} + +class TtlTest { + public: + TtlTest() { + dbname_ = test::TmpDir() + "/db_ttl"; + options_.create_if_missing = true; + // ensure that compaction is kicked in to always strip timestamp from kvs + options_.max_grandparent_overlap_factor = 0; + // compaction should take place always from level0 for determinism + options_.max_mem_compaction_level = 0; + db_ttl_ = nullptr; + DestroyDB(dbname_, Options()); + } + + ~TtlTest() { + CloseTtl(); + DestroyDB(dbname_, Options()); + } + + // Open database with TTL support when TTL not provided with db_ttl_ pointer + void OpenTtl() { + assert(db_ttl_ == nullptr); // db should be closed before opening again + ASSERT_OK(UtilityDB::OpenTtlDB(options_, dbname_, &db_ttl_)); + } + + // Open database with TTL support when TTL provided with db_ttl_ pointer + void OpenTtl(int32_t ttl) { + assert(db_ttl_ == nullptr); + ASSERT_OK(UtilityDB::OpenTtlDB(options_, dbname_, &db_ttl_, ttl)); + } + + // Open with TestFilter compaction filter + void OpenTtlWithTestCompaction(int32_t ttl) { + options_.compaction_filter_factory = + std::shared_ptr( + new TestFilterFactory(kSampleSize_, kNewValue_)); + OpenTtl(ttl); + } + + // Open database with TTL support in read_only mode + void OpenReadOnlyTtl(int32_t ttl) { + assert(db_ttl_ == nullptr); + ASSERT_OK(UtilityDB::OpenTtlDB(options_, dbname_, &db_ttl_, ttl, true)); + } + + void CloseTtl() { + delete db_ttl_; + db_ttl_ = nullptr; + } + + // Populates and returns a kv-map + void MakeKVMap(int64_t num_entries) { + kvmap_.clear(); + int digits = 1; + for (int dummy = num_entries; dummy /= 10 ; ++digits); + int digits_in_i = 1; + for (int64_t i = 0; i < num_entries; i++) { + std::string key = "key"; + std::string value = 
"value"; + if (i % 10 == 0) { + digits_in_i++; + } + for(int j = digits_in_i; j < digits; j++) { + key.append("0"); + value.append("0"); + } + AppendNumberTo(&key, i); + AppendNumberTo(&value, i); + kvmap_[key] = value; + } + ASSERT_EQ((int)kvmap_.size(), num_entries);//check all insertions done + } + + // Makes a write-batch with key-vals from kvmap_ and 'Write''s it + void MakePutWriteBatch(const BatchOperation* batch_ops, int num_ops) { + assert(num_ops <= (int)kvmap_.size()); + static WriteOptions wopts; + static FlushOptions flush_opts; + WriteBatch batch; + kv_it_ = kvmap_.begin(); + for (int i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, kv_it_++) { + switch (batch_ops[i]) { + case PUT: + batch.Put(kv_it_->first, kv_it_->second); + break; + case DELETE: + batch.Delete(kv_it_->first); + break; + default: + assert(false); + } + } + db_ttl_->Write(wopts, &batch); + db_ttl_->Flush(flush_opts); + } + + // Puts num_entries starting from start_pos_map from kvmap_ into the database + void PutValues(int start_pos_map, int num_entries, bool flush = true) { + assert(db_ttl_); + ASSERT_LE(start_pos_map + num_entries, (int)kvmap_.size()); + static WriteOptions wopts; + static FlushOptions flush_opts; + kv_it_ = kvmap_.begin(); + advance(kv_it_, start_pos_map); + for (int i = 0; kv_it_ != kvmap_.end() && i < num_entries; i++, kv_it_++) { + ASSERT_OK(db_ttl_->Put(wopts, kv_it_->first, kv_it_->second)); + } + // Put a mock kv at the end because CompactionFilter doesn't delete last key + ASSERT_OK(db_ttl_->Put(wopts, "keymock", "valuemock")); + if (flush) { + db_ttl_->Flush(flush_opts); + } + } + + // Runs a manual compaction + void ManualCompact() { + db_ttl_->CompactRange(nullptr, nullptr); + } + + // checks the whole kvmap_ to return correct values using KeyMayExist + void SimpleKeyMayExistCheck() { + static ReadOptions ropts; + bool value_found; + std::string val; + for(auto &kv : kvmap_) { + bool ret = db_ttl_->KeyMayExist(ropts, kv.first, &val, &value_found); + if 
(ret == false || value_found == false) { + fprintf(stderr, "KeyMayExist could not find key=%s in the database but" + " should have\n", kv.first.c_str()); + assert(false); + } else if (val.compare(kv.second) != 0) { + fprintf(stderr, " value for key=%s present in database is %s but" + " should be %s\n", kv.first.c_str(), val.c_str(), + kv.second.c_str()); + assert(false); + } + } + } + + // Sleeps for slp_tim then runs a manual compaction + // Checks span starting from st_pos from kvmap_ in the db and + // Gets should return true if check is true and false otherwise + // Also checks that value that we got is the same as inserted; and =kNewValue + // if test_compaction_change is true + void SleepCompactCheck(int slp_tim, int st_pos, int span, bool check = true, + bool test_compaction_change = false) { + assert(db_ttl_); + sleep(slp_tim); + ManualCompact(); + static ReadOptions ropts; + kv_it_ = kvmap_.begin(); + advance(kv_it_, st_pos); + std::string v; + for (int i = 0; kv_it_ != kvmap_.end() && i < span; i++, kv_it_++) { + Status s = db_ttl_->Get(ropts, kv_it_->first, &v); + if (s.ok() != check) { + fprintf(stderr, "key=%s ", kv_it_->first.c_str()); + if (!s.ok()) { + fprintf(stderr, "is absent from db but was expected to be present\n"); + } else { + fprintf(stderr, "is present in db but was expected to be absent\n"); + } + assert(false); + } else if (s.ok()) { + if (test_compaction_change && v.compare(kNewValue_) != 0) { + fprintf(stderr, " value for key=%s present in database is %s but " + " should be %s\n", kv_it_->first.c_str(), v.c_str(), + kNewValue_.c_str()); + assert(false); + } else if (!test_compaction_change && v.compare(kv_it_->second) !=0) { + fprintf(stderr, " value for key=%s present in database is %s but " + " should be %s\n", kv_it_->first.c_str(), v.c_str(), + kv_it_->second.c_str()); + assert(false); + } + } + } + } + + // Similar as SleepCompactCheck but uses TtlIterator to read from db + void SleepCompactCheckIter(int slp, int st_pos, int span, 
bool check=true) { + assert(db_ttl_); + sleep(slp); + ManualCompact(); + static ReadOptions ropts; + Iterator *dbiter = db_ttl_->NewIterator(ropts); + kv_it_ = kvmap_.begin(); + advance(kv_it_, st_pos); + + dbiter->Seek(kv_it_->first); + if (!check) { + if (dbiter->Valid()) { + ASSERT_NE(dbiter->value().compare(kv_it_->second), 0); + } + } else { // dbiter should have found out kvmap_[st_pos] + for (int i = st_pos; + kv_it_ != kvmap_.end() && i < st_pos + span; + i++, kv_it_++) { + ASSERT_TRUE(dbiter->Valid()); + ASSERT_EQ(dbiter->value().compare(kv_it_->second), 0); + dbiter->Next(); + } + } + delete dbiter; + } + + class TestFilter : public CompactionFilter { + public: + TestFilter(const int64_t kSampleSize, const std::string kNewValue) + : kSampleSize_(kSampleSize), + kNewValue_(kNewValue) { + } + + // Works on keys of the form "key" + // Drops key if number at the end of key is in [0, kSampleSize_/3), + // Keeps key if it is in [kSampleSize_/3, 2*kSampleSize_/3), + // Change value if it is in [2*kSampleSize_/3, kSampleSize_) + // Eg. kSampleSize_=6. Drop:key0-1...Keep:key2-3...Change:key4-5... 
+ virtual bool Filter(int level, const Slice& key, + const Slice& value, std::string* new_value, + bool* value_changed) const override { + assert(new_value != nullptr); + + std::string search_str = "0123456789"; + std::string key_string = key.ToString(); + size_t pos = key_string.find_first_of(search_str); + int num_key_end; + if (pos != std::string::npos) { + num_key_end = stoi(key_string.substr(pos, key.size() - pos)); + } else { + return false; // Keep keys not matching the format "key" + } + + int partition = kSampleSize_ / 3; + if (num_key_end < partition) { + return true; + } else if (num_key_end < partition * 2) { + return false; + } else { + *new_value = kNewValue_; + *value_changed = true; + return false; + } + } + + virtual const char* Name() const override { + return "TestFilter"; + } + + private: + const int64_t kSampleSize_; + const std::string kNewValue_; + }; + + class TestFilterFactory : public CompactionFilterFactory { + public: + TestFilterFactory(const int64_t kSampleSize, const std::string kNewValue) + : kSampleSize_(kSampleSize), + kNewValue_(kNewValue) { + } + + virtual std::unique_ptr + CreateCompactionFilter( + const CompactionFilter::Context& context) override { + return std::unique_ptr( + new TestFilter(kSampleSize_, kNewValue_)); + } + + virtual const char* Name() const override { + return "TestFilterFactory"; + } + + private: + const int64_t kSampleSize_; + const std::string kNewValue_; + }; + + + // Choose carefully so that Put, Gets & Compaction complete in 1 second buffer + const int64_t kSampleSize_ = 100; + + private: + std::string dbname_; + StackableDB* db_ttl_; + Options options_; + KVMap kvmap_; + KVMap::iterator kv_it_; + const std::string kNewValue_ = "new_value"; + unique_ptr test_comp_filter_; +}; // class TtlTest + +// If TTL is non positive or not provided, the behaviour is TTL = infinity +// This test opens the db 3 times with such default behavior and inserts a +// bunch of kvs each time. 
All kvs should accumulate in the db till the end +// Partitions the sample-size provided into 3 sets over boundary1 and boundary2 +TEST(TtlTest, NoEffect) { + MakeKVMap(kSampleSize_); + int boundary1 = kSampleSize_ / 3; + int boundary2 = 2 * boundary1; + + OpenTtl(); + PutValues(0, boundary1); //T=0: Set1 never deleted + SleepCompactCheck(1, 0, boundary1); //T=1: Set1 still there + CloseTtl(); + + OpenTtl(0); + PutValues(boundary1, boundary2 - boundary1); //T=1: Set2 never deleted + SleepCompactCheck(1, 0, boundary2); //T=2: Sets1 & 2 still there + CloseTtl(); + + OpenTtl(-1); + PutValues(boundary2, kSampleSize_ - boundary2); //T=3: Set3 never deleted + SleepCompactCheck(1, 0, kSampleSize_, true); //T=4: Sets 1,2,3 still there + CloseTtl(); +} + +// Puts a set of values and checks its presence using Get during ttl +TEST(TtlTest, PresentDuringTTL) { + MakeKVMap(kSampleSize_); + + OpenTtl(2); // T=0:Open the db with ttl = 2 + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=2 + SleepCompactCheck(1, 0, kSampleSize_, true); // T=1:Set1 should still be there + CloseTtl(); +} + +// Puts a set of values and checks its absence using Get after ttl +TEST(TtlTest, AbsentAfterTTL) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); // T=0:Open the db with ttl = 2 + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=2 + SleepCompactCheck(2, 0, kSampleSize_, false); // T=2:Set1 should not be there + CloseTtl(); +} + +// Resets the timestamp of a set of kvs by updating them and checks that they +// are not deleted according to the old timestamp +TEST(TtlTest, ResetTimestamp) { + MakeKVMap(kSampleSize_); + + OpenTtl(3); + PutValues(0, kSampleSize_); // T=0: Insert Set1. Delete at t=3 + sleep(2); // T=2 + PutValues(0, kSampleSize_); // T=2: Insert Set1. 
Delete at t=5 + SleepCompactCheck(2, 0, kSampleSize_); // T=4: Set1 should still be there + CloseTtl(); +} + +// Similar to PresentDuringTTL but uses Iterator +TEST(TtlTest, IterPresentDuringTTL) { + MakeKVMap(kSampleSize_); + + OpenTtl(2); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=2 + SleepCompactCheckIter(1, 0, kSampleSize_); // T=1: Set should be there + CloseTtl(); +} + +// Similar to AbsentAfterTTL but uses Iterator +TEST(TtlTest, IterAbsentAfterTTL) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1 + SleepCompactCheckIter(2, 0, kSampleSize_, false); // T=2: Should not be there + CloseTtl(); +} + +// Checks presence while opening the same db more than once with the same ttl +// Note: The second open will open the same db +TEST(TtlTest, MultiOpenSamePresent) { + MakeKVMap(kSampleSize_); + + OpenTtl(2); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=2 + CloseTtl(); + + OpenTtl(2); // T=0. Delete at t=2 + SleepCompactCheck(1, 0, kSampleSize_); // T=1: Set should be there + CloseTtl(); +} + +// Checks absence while opening the same db more than once with the same ttl +// Note: The second open will open the same db +TEST(TtlTest, MultiOpenSameAbsent) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1 + CloseTtl(); + + OpenTtl(1); // T=0.Delete at t=1 + SleepCompactCheck(2, 0, kSampleSize_, false); // T=2: Set should not be there + CloseTtl(); +} + +// Checks presence while opening the same db more than once with bigger ttl +TEST(TtlTest, MultiOpenDifferent) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); + PutValues(0, kSampleSize_); // T=0: Insert. 
Delete at t=1 + CloseTtl(); + + OpenTtl(3); // T=0: Set deleted at t=3 + SleepCompactCheck(2, 0, kSampleSize_); // T=2: Set should be there + CloseTtl(); +} + +// Checks presence during ttl in read_only mode +TEST(TtlTest, ReadOnlyPresentForever) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); // T=0:Open the db normally + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=1 + CloseTtl(); + + OpenReadOnlyTtl(1); + SleepCompactCheck(2, 0, kSampleSize_); // T=2:Set1 should still be there + CloseTtl(); +} + +// Checks whether WriteBatch works well with TTL +// Puts all kvs in kvmap_ in a batch and writes first, then deletes first half +TEST(TtlTest, WriteBatchTest) { + MakeKVMap(kSampleSize_); + BatchOperation batch_ops[kSampleSize_]; + for (int i = 0; i < kSampleSize_; i++) { + batch_ops[i] = PUT; + } + + OpenTtl(2); + MakePutWriteBatch(batch_ops, kSampleSize_); + for (int i = 0; i < kSampleSize_ / 2; i++) { + batch_ops[i] = DELETE; + } + MakePutWriteBatch(batch_ops, kSampleSize_ / 2); + SleepCompactCheck(0, 0, kSampleSize_ / 2, false); + SleepCompactCheck(0, kSampleSize_ / 2, kSampleSize_ - kSampleSize_ / 2); + CloseTtl(); +} + +// Checks user's compaction filter for correctness with TTL logic +TEST(TtlTest, CompactionFilter) { + MakeKVMap(kSampleSize_); + + OpenTtlWithTestCompaction(1); + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=1 + // T=2: TTL logic takes precedence over TestFilter:-Set1 should not be there + SleepCompactCheck(2, 0, kSampleSize_, false); + CloseTtl(); + + OpenTtlWithTestCompaction(3); + PutValues(0, kSampleSize_); // T=0:Insert Set1. 
+ int partition = kSampleSize_ / 3; + SleepCompactCheck(1, 0, partition, false); // Part dropped + SleepCompactCheck(0, partition, partition); // Part kept + SleepCompactCheck(0, 2 * partition, partition, true, true); // Part changed + CloseTtl(); +} + +// Insert some key-values which KeyMayExist should be able to get and check that +// values returned are fine +TEST(TtlTest, KeyMayExist) { + MakeKVMap(kSampleSize_); + + OpenTtl(); + PutValues(0, kSampleSize_, false); + + SimpleKeyMayExistCheck(); + + CloseTtl(); +} + +} // namespace rocksdb + +// A black-box test for the ttl wrapper around rocksdb +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +}